I wrote a program that grabs the weather news from Sina and stores it locally; to keep page access fast, the images in each article are saved locally as well. The program is as follows:

package vnet.com.weather1;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import vnet.com.update.Getdata;

/**
 * Scrapes news items from the Sina weather-news page with regular expressions.
 * Page: http://weather.news.sina.com.cn/weather/news/index.html
 */
public class Newlist {

    private static final Log log = LogFactory.getLog(Newlist.class);

    /**
     * Test entry point.
     */
    public static void main(String args[]) {
        Newlist n = new Newlist();
        String[] k = n.getNewList();
        for (int i = 0; i < k.length; i++) {
            System.out.println(k[i].replace("href=\"", "href=\"newinfo2.jsp?url="));
        }
        String[] m = n.getNewinfo("news/2008/1119/35261.html");
        for (int l = 0; l < m.length; l++) {
            System.out.println(m[l]);
        }
    }

    /**
     * Returns the article content for a URL as a String[].
     * Images in the article are downloaded locally, and the image URLs in the
     * text are rewritten to the local file names.
     * @param url
     * @return
     */
    public String[] getNewinfo(String url) {
        String URL = "http://weather.news.sina.com.cn/" + url;
        // 30 means "collect up to 30 strings matching the given pattern";
        // if only 10 are found, the trailing array slots stay null.
        String[] s = analysis("<p>(.*?)</p>", getContent(URL), 30);
        for (int i = 0; i < s.length; i++) {
            Pattern sp = Pattern.compile("src=\"(.*?)\"");
            Matcher matcher = sp.matcher(s[i]);
            if (matcher.find()) {
                String imageurl = analysis("src=\"(.*?)\"", s[i], 1)[0];
                if (!imageurl.startsWith("http://")) {
                    imageurl = "http://weather.news.sina.com.cn/" + imageurl;
                }
                System.out.println("Article has an image: " + imageurl);
                String content = getContent(imageurl);
                String[] images = imageurl.split("/");
                String imagename = images[images.length - 1];
                System.out.println("Image name: " + imagename);
                try {
                    File fwl = new File(imagename);
                    PrintWriter outl = new PrintWriter(fwl);
                    outl.println(content);
                    outl.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                System.out.println("s[i]: " + s[i]);
                // Rewrite the image URL in the text to the local file name.
                s[i] = s[i].replace(analysis("src=\"(.*?)\"", s[i], 1)[0], imagename);
            }
        }
        return s;
    }

    public String[] getNewList() {
        String url = "http://weather.news.sina.com.cn/weather/news/index.html";
        return getNewList(getContent(url));
    }
    private String[] getNewList(String content) {
        String[] s = analysis("<li>(.*?)</li>", content, 50);
        return s;
    }

    private String[] analysis(String pattern, String match, int i) {
        Pattern sp = Pattern.compile(pattern);
        Matcher matcher = sp.matcher(match);
        String[] content = new String[i];
        for (int i1 = 0; matcher.find(); i1++) {
            content[i1] = matcher.group(1);
        }
        // The block below trims the trailing null entries from the array.
        int l = 0;
        for (int k = 0; k < content.length; k++) {
            if (content[k] == null) {
                l = k;
                break;
            }
        }
        String[] content2;
        if (l != 0) {
            content2 = new String[l];
            for (int n = 0; n < l; n++) {
                content2[n] = content[n];
            }
            return content2;
        } else {
            return content;
        }
    }

    /**
     * Fetches the page content for a URL (first attempt, left commented out).
     * @param strUrl
     * @return
    private String getContent(String strUrl) {
        try {
            URLConnection uc = new URL(strUrl).openConnection();
            // Spoof the User-Agent header so the request looks like it was
            // submitted from a browser.
            uc.setRequestProperty("User-Agent",
                    "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
            System.out.println("-----------------------------------------");
            System.out.println("Content-Length: " + uc.getContentLength());
            System.out.println("Set-Cookie: " + uc.getHeaderField("Set-Cookie"));
            System.out.println("-----------------------------------------");
            // Dump the response headers.
            System.out.println("Header" + uc.getHeaderFields().toString());
            System.out.println("-----------------------------------------");
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(uc.getInputStream(), "gb2312"));
            String s = "";
            StringBuffer sb = new StringBuffer();
            while ((s = br.readLine()) != null) {
                sb.append(s + "\r\n");
            }
            System.out.println("Length: " + sb.toString().length());
            return sb.toString();
        } catch (Exception e) {
            return "error open url" + strUrl;
        }
    }
    */
    public static String getContent(String strUrl) {
        URLConnection uc = null;
        String all_content = null;
        try {
            all_content = new String();
            URL url = new URL(strUrl);
            uc = url.openConnection();
            // Spoof the User-Agent header so the request looks like it was
            // submitted from a browser.
            uc.setRequestProperty("User-Agent",
                    "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
            System.out.println("-----------------------------------------");
            System.out.println("Content-Length: " + uc.getContentLength());
            System.out.println("Set-Cookie: " + uc.getHeaderField("Set-Cookie"));
            System.out.println("-----------------------------------------");
            // Dump the response headers.
            System.out.println("Header" + uc.getHeaderFields().toString());
            System.out.println("-----------------------------------------");
            if (uc == null)
                return null;
            InputStream ins = uc.getInputStream();
            ByteArrayOutputStream outputstream = new ByteArrayOutputStream();
            byte[] str_b = new byte[1024];
            int i = -1;
            while ((i = ins.read(str_b)) > 0) {
                outputstream.write(str_b, 0, i);
            }
            all_content = outputstream.toString();
        } catch (Exception e) {
            e.printStackTrace();
            log.error("Error fetching page content");
        } finally {
            uc = null;
        }
        System.out.println(all_content.length());
        return all_content;
    }
}
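A side note on the helper: the fixed-size array in analysis() is what forces the null-trimming pass inside it. Collecting matches into a List avoids that entirely; the variant below is a sketch of that cleanup (the method name analysisAll is mine, not from the original code):

    // Collects every match, so there is no fixed maximum and no null trimming.
    private String[] analysisAll(String pattern, String match) {
        java.util.List<String> found = new java.util.ArrayList<String>();
        Matcher matcher = Pattern.compile(pattern).matcher(match);
        while (matcher.find()) {
            found.add(matcher.group(1));
        }
        return found.toArray(new String[found.size()]);
    }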
The problem now: images do not download intact. With either getContent method above, the size of a downloaded image never matches the Content-Length reported in the response header, i.e. the image's real size, so the image cannot be previewed. Repeated tests show each method always downloads the same number of bytes, so re-downloading does not help. The string produced by toString is shorter than the real image, yet the file written from it is larger than the image data: the data grows during storage and cannot be recovered. The news text itself comes through fine, so the culprit is character decoding applied to binary data. The fix is to write the image to a file straight from the byte stream, with no String conversion:

public int saveImage(String strUrl) {
    URLConnection uc = null;
    try {
        URL url = new URL(strUrl);
        uc = url.openConnection();
        uc.setRequestProperty("User-Agent",
                "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
        if (uc == null)
            return 0;
        InputStream ins = uc.getInputStream();
        byte[] str_b = new byte[1024];
        int byteRead = 0;
        String[] images = strUrl.split("/");
        String imagename = images[images.length - 1];
        File fwl = new File(imagename);
        // Needs java.io.FileOutputStream in the imports.
        FileOutputStream fos = new FileOutputStream(fwl);
        // Copy the raw bytes to disk with no character conversion in between.
        while ((byteRead = ins.read(str_b)) > 0) {
            fos.write(str_b, 0, byteRead);
        }
        fos.flush();
        fos.close();
    } catch (Exception e) {
        e.printStackTrace();
        log.error("Error fetching image");
    } finally {
        uc = null;
    }
    return 1;
}
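With saveImage in place, the image block inside getNewinfo can drop the PrintWriter round-trip. The rewiring below is a sketch of that substitution, not code from the original post:

                String imageurl = analysis("src=\"(.*?)\"", s[i], 1)[0];
                String fullurl = imageurl.startsWith("http://")
                        ? imageurl
                        : "http://weather.news.sina.com.cn/" + imageurl;
                saveImage(fullurl);  // raw byte copy, no String round-trip
                String[] images = fullurl.split("/");
                String imagename = images[images.length - 1];
                // Point the article text at the local copy.
                s[i] = s[i].replace(imageurl, imagename);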
Method two: first read the result page in as a stream, then write a regular expression to strip out the unwanted content, and finally save the result as an XML file or write it straight into a database, loading it from there when needed.
This code only prints the raw HTML source of the page; to extract content, adapt the regular expression in public static String regex() yourself.

package rssTest;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class MyRSS {

    /**
     * Fetches the HTML source of the search-result page.
     */
    public static String getHtmlSource(String url) {
        StringBuffer codeBuffer = null;
        BufferedReader in = null;
        try {
            URLConnection uc = new URL(url).openConnection();
            /**
             * Some sites only accept requests submitted from a browser, to stop
             * clients from reading pages directly. Spoofing the User-Agent
             * header in the HTTP request works around that; this line does it.
             */
            uc.setRequestProperty("User-Agent",
                    "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
            // Read the URL stream.
            in = new BufferedReader(new InputStreamReader(uc.getInputStream(), "gb2312"));
            codeBuffer = new StringBuffer();
            String tempCode = "";
            // Copy the buffered lines into the result string.
            while ((tempCode = in.readLine()) != null) {
                codeBuffer.append(tempCode).append("\n");
            }
            in.close();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return codeBuffer.toString();
    }

    /**
     * The regular expression.
     */
    public static String regex() {
        String googleRegex = "<div class=g>(.*?)href=\"(.*?)\"(.*?)\">(.*?)</a>(.*?)<div class=std>(.*?)<br>";
        return googleRegex;
    }

    /**
     * Test helper: search Google for a keyword and extract the wanted parts.
     */
    public static List<String> GetNews() {
        List<String> newsList = new ArrayList<String>();
        String allHtmlSource = MyRSS.getHtmlSource(
                "http://www.google.cn/search?complete=1&hl=zh-CN&newwindow=1&client=aff-os-maxthon&hs=SUZ&q=见龙卸甲&meta=&aq=f");
        Pattern pattern = Pattern.compile(regex());
        Matcher matcher = pattern.matcher(allHtmlSource);
        while (matcher.find()) {
            String urlLink = matcher.group(2);
            String title = matcher.group(4);
            title = title.replaceAll("<font color=CC0033>", "");
            title = title.replaceAll("</font>", "");
            title = title.replaceAll("<b>...</b>", "");
            String content = matcher.group(6);
            content = content.replaceAll("<font color=CC0033>", "");
            content = content.replaceAll("</font>", "");
            content = content.replaceAll("<b>...</b>", "");
            newsList.add(urlLink);
            newsList.add(title);
            newsList.add(content);
        }
        return newsList;
    }

    /**
     * main method.
     */
    public static void main(String[] args) {
        System.out.println(MyRSS
                .getHtmlSource("http://main.house.sina.com.cn/news/zckb/index.html"));
    }
}
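The description above mentions saving the extracted results as an XML file, which MyRSS does not show. Below is a minimal sketch of that step, assuming the flat three-strings-per-item layout produced by GetNews() (link, title, content); the class name NewsXmlWriter and the file name are mine, not from the original post:

import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.List;

public class NewsXmlWriter {
    // Writes the flat list from GetNews() (link, title, content, repeated)
    // into a simple XML file. Escaping is minimal; real content may need more.
    public static void save(List<String> newsList, String fileName) throws Exception {
        Writer out = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8");
        out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<news>\n");
        for (int i = 0; i + 2 < newsList.size(); i += 3) {
            out.write("  <item>\n");
            out.write("    <link>" + escape(newsList.get(i)) + "</link>\n");
            out.write("    <title>" + escape(newsList.get(i + 1)) + "</title>\n");
            out.write("    <content>" + escape(newsList.get(i + 2)) + "</content>\n");
            out.write("  </item>\n");
        }
        out.write("</news>\n");
        out.close();
    }

    private static String escape(String s) {
        return s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
    }
}

Usage would be NewsXmlWriter.save(MyRSS.GetNews(), "news.xml"), with "news.xml" being an arbitrary output path.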
Method three: automatic news scraping from JSP.
package com.news.spider;
import java.io.File;
import java.io.FileFilter;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.db.DBAccess;
public class SpiderNewsServer {

    public static void main(String[] args) throws Exception {
        // Entry page to scrape.
        String endPointUrl = "http://cn.china.cn/zixun/";
        // Current date.
        Calendar calendar = Calendar.getInstance();
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        String DateNews = sdf.format(calendar.getTime());

        /********************
         * Collect second-level URLs -- start.
         * URL pattern: "http://cn.china.cn/article/"
         */
        List listNewsType = new ArrayList();
        // Fetch the entry page's HTML.
        WebHtml webHtml = new WebHtml();
        String htmlDocuemtnt1 = webHtml.getWebHtml(endPointUrl);
        if (htmlDocuemtnt1 == null || htmlDocuemtnt1.length() == 0) {
            return;
        }
        String strTemp1 = "http://cn.china.cn/article/";
        String strTemp2 = "</li>";
        int stopIndex = 0;
        int startIndex = 0;
        int dd = 0;
        while (true) {
            dd++;
            startIndex = htmlDocuemtnt1.indexOf(strTemp1, stopIndex);
            System.out.println("==========" + startIndex);
            stopIndex = htmlDocuemtnt1.indexOf(strTemp2, startIndex);
            System.out.println("==========---------" + stopIndex);
            if (startIndex != -1 && stopIndex != -1) {
                String companyType = htmlDocuemtnt1.substring(startIndex, stopIndex);
                System.out.println("@@@@@--------" + companyType);
                System.out.println("@@@@@--------" + companyType.indexOf("\""));
                companyType = companyType.substring(0, companyType.indexOf("\""));
                System.out.println("#####--------" + companyType);
                listNewsType.add(companyType);
            }
            if (dd > 10) {
                break;
            }
            if (stopIndex == -1 || startIndex == -1) {
                break;
            }
        }
        System.out.println("listCompanyType=====" + listNewsType.size());
        /**
         * Collect second-level URLs -- end.
         ********************/

        /********************
         * Scrape page content -- start.
         */
        String title = "";
        String hometext = "";
        String bodytext = "";
        String keywords = "";
        String counter = "221";
        String cdate = "";
        int begainIndex = 0;  // start index of the searched substring
        int endIndex = 0;     // end index of the searched substring
        String begainStr;     // search start marker
        String endStr;        // search end marker
        for (int rows = 1; rows < listNewsType.size(); rows++) {
            String strNewsDetail = listNewsType.get(rows).toString();
            System.out.println("strNewsDetail=====" + strNewsDetail);
            if (strNewsDetail != null && strNewsDetail.length() > 0) {
                WebHtml newsListHtml = new WebHtml();
                String htmlDocuemtntCom = newsListHtml.getWebHtml(strNewsDetail);
                System.out.println("$$$$$------" + htmlDocuemtntCom);
                if (htmlDocuemtntCom == null || htmlDocuemtntCom.length() == 0) {
                    return;
                }
                // Extract the date.
                int dateBegainIndex = htmlDocuemtntCom.indexOf("<div>时间:");
                System.out.println("%%%%%--" + dateBegainIndex);
                String newTime = htmlDocuemtntCom.substring(dateBegainIndex, dateBegainIndex + 20);
                System.out.println("^^^^^^^^^^^^^^^---" + newTime);
                String newTimeM = newTime.substring(newTime.lastIndexOf("-") + 1, newTime.lastIndexOf("-") + 3);
                String dateM = DateNews.substring(DateNews.lastIndexOf("-") + 1);
                System.out.println("^^^^^^^^^^^^^^^---" + newTimeM);
                System.out.println("^^^^^^^^^^^^^^^---" + dateM);
                // Strings must be compared with equals(), not ==.
                if (newTimeM.equals(dateM)) {
                    // Extract the news title.
                    begainStr = "<div class=\"divCon bg008 \">";
                    endStr = "<div>时间:";
                    begainIndex = htmlDocuemtntCom.indexOf(begainStr, 0);
                    System.out.println("&&&&&&------" + begainIndex);
                    endIndex = htmlDocuemtntCom.indexOf(endStr, 0);
                    System.out.println("&&&&&&------" + endIndex);
                    if (begainIndex != -1 && endIndex != -1) {
                        title = htmlDocuemtntCom.substring(begainIndex, endIndex).trim();
                        title = title.substring(title.indexOf("<h1>") + 4, title.indexOf("</h1>"));
                        title = title.replace("'", "");
                        title = title.replace(";", "");
                        title = title.replace(" ", "");
                    }
                    // Extract the news body.
                    begainStr = "<div class=\"divCon bg008 \">";
                    endStr = "<!-- page begin -->";
                    begainIndex = htmlDocuemtntCom.indexOf(begainStr, 0);
                    endIndex = htmlDocuemtntCom.indexOf(endStr, 0);
                    if (begainIndex != -1 && endIndex != -1) {
                        bodytext = htmlDocuemtntCom.substring(begainIndex, endIndex).trim();
                        if (bodytext.indexOf("<p>") > 0
                                && bodytext.indexOf("</p>") > bodytext.indexOf("<p>")
                                && bodytext.indexOf("</p>") > 0)
                            bodytext = bodytext.substring(bodytext.indexOf("<p>") + 3, bodytext.indexOf("</p>"));
                        bodytext = bodytext.replace(" ", "");
                        bodytext = bodytext.replace("<br>", "");
                        bodytext = bodytext.replace("\n", "<br>");
                        bodytext = bodytext.replace("'", "");
                        bodytext = bodytext.replace(";", "");
                    }
                    // Summary.
                    if (bodytext.length() > 40)
                        hometext = bodytext.substring(0, 40) + "......";
                    else {
                        hometext = bodytext + "......";
                    }
                    // View count (randomized).
                    String str = String.valueOf(Math.random());
                    counter = str.substring(str.lastIndexOf(".") + 1, 5);
                    Calendar cal = Calendar.getInstance();
                    cal.setTime(new Date());
                    cdate = cal.getTimeInMillis() + "";
                    cdate = cdate.substring(0, 10);
                } else {
                    continue;
                }
            }
            System.out.println("-------------------------" + title);
            System.out.println("-------------------------" + cdate);
            System.out.println("-------------------------" + hometext);
            System.out.println("-------------------------" + bodytext);
            System.out.println("-------------------------" + keywords);
            System.out.println("-------------------------" + counter);
            /*
            String str = "INSERT INTO ecim_stories(uid,title,created,published,hostname,hometext,bodytext,keywords,counter,topicid,ihome,notifypub,story_type,topicdisplay,topicalign,comments,rating,votes,description) ";
            str += "VALUE (1,'" + title + "'," + cdate + "," + cdate + ",'125.122.83.177','" + hometext + "','" + bodytext + "','" + keywords + "'," + counter + ",1,0,1,'admin',0,'R',0,0,0,'')";
            DBAccess db = new DBAccess();
            if (db.executeUpdate(str) > 0) {
                System.out.println("------------------------- success!");
            } else {
                System.out.println("------------------------- failure!");
            }
            */
        }
        /**
         * Scrape page content -- end.
         ********************/
    }
}
package com.news.spider;
import java.net.URL;
import java.net.URLConnection;
import java.io.BufferedReader;
import java.io.InputStreamReader;
public class WebHtml {
    /**
     * Fetches the web HTML content for the given url.
     * @param url
     */
    public String getWebHtml(String url) {
        try {
            URL myURL = new URL(url);
            URLConnection conn = myURL.openConnection();
            BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream()));
            String line = null;
            StringBuffer document = new StringBuffer("");
            while ((line = reader.readLine()) != null) {
                document.append(line + "\n");
            }
            reader.close();
            String resultDocument = new String(document);
            return resultDocument;
        } catch (Exception e) {
        }
        return "";
    }
}
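Note that, unlike the earlier readers, WebHtml does not pass a charset to InputStreamReader, so the bytes are decoded with the platform default; for gb2312 pages like the ones targeted above, markers such as "<div>时间:" may then fail to match. A one-line fix, assuming the target pages really are gb2312-encoded:

            // Decode explicitly instead of relying on the platform default charset.
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), "gb2312"));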
Source: [Gjava Rencai], http://www.gjrencai.com. Please credit the source and URL when reposting.