新浪天气预报新闻Java抓取程序

    技术2022-07-06  284

    我做了个程序把新浪上的天气新闻抓过来存到本地,考虑访问速度问题,新闻中的图片也要保存到本地。程序如下:

    package vnet.com.weather1;

    import java.io.BufferedReader;import java.io.ByteArrayOutputStream;import java.io.File;import java.io.FileWriter;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.PrintWriter;import java.net.URL;import java.net.URLConnection;import java.util.regex.Matcher;import java.util.regex.Pattern;

    import org.apache.commons.logging.Log;import org.apache.commons.logging.LogFactory;

    import vnet.com.update.Getdata;/** * 正则方式抓取新浪天气新闻上的新闻 * 地址http://weather.news.sina.com.cn/weather/news/index.html * @param args */public class Newlist {    private static final Log log = LogFactory.getLog(Newlist.class);    /**     * 测试     * @param args     */    public  static void main(String args[]){        Newlist n=new Newlist();        String[] k=n.getNewList();        for (int i=0;i<k.length;i++){        System.out.println(k[i].replace("href=/"", "href=/"newinfo2.jsp?url="));        }        String[] m=n.getNewinfo("news/2008/1119/35261.html");        for (int l=0;l<m.length;l++){                    System.out.println(m[l]);            }            }    /**     * 由url地址获得新闻内容string[]     * 新闻中的图片下载到本地,文中新闻地址改成本地地址     * @param url     * @return     */    public String[] getNewinfo(String url){        String URL="http://weather.news.sina.com.cn/"+url;        //30是指取30段满足给出的正则条件的字符串,如果只找出10个,那数组后面的全为null        String[] s = analysis("<p>(.*?)</p>" , getContent(URL) , 30);        for (int i=0;i<s.length;i++){            Pattern sp = Pattern.compile("src=/"(.*?)/"");            Matcher matcher = sp.matcher(s[i]);            if (matcher.find()){                                 String imageurl=analysis("src=/"(.*?)/"" , s[i] , 1)[0];                 if(!imageurl.startsWith("http://")){                     imageurl="http://weather.news.sina.com.cn/"+imageurl;                  }                System.out.println("新闻有图片:"+imageurl);                String content=getContent(imageurl);                  String[] images=imageurl.split("/");                  String imagename=images[images.length-1];                  System.out.println("图片名:"+imagename);                                           try {            File fwl = new File(imagename);             PrintWriter outl = new PrintWriter(fwl);            outl.println(content);            outl.close();            } catch (IOException e) {                // TODO Auto-generated catch block                
e.printStackTrace();            }            System.out.println("s[i]:"+s[i]);            //修改文件图片地址            s[i]=s[i].replace(analysis("src=/"(.*?)/"" , s[i] , 1)[0], imagename);            }        }                return s;    }    public  String[] getNewList(){        String url="http://weather.news.sina.com.cn/weather/news/index.html";        return getNewList(getContent(url));           }

        private  String[] getNewList(String content ){        //String[] s = analysis("align=/"center/" valign=/"top/"><img src=/"../images/a(.*?).gif/" width=/"70/" height=/"65/"></td>" , content , 50);            String[] s = analysis("<li>(.*?)</li>" , content , 50);                return s;    }    private String[] analysis(String pattern, String match , int i){        Pattern sp = Pattern.compile(pattern);        Matcher matcher = sp.matcher(match);        String[] content = new String[i];        for (int i1 = 0; matcher.find(); i1++){                    content[i1] = matcher.group(1);               }        //下面一段是为了剔除为空的串        int l=0;        for (int k=0;k<content.length;k++){            if (content[k]==null){                l=k;                break;            }        }        String[] content2;        if (l!=0){            content2=new String[l];            for (int n=0;n<l;n++){                content2[n]=content[n];            }             return content2;        }else{            return content;            }           }    /**     * 由地址获取网页内容     * @param strUrl     * @return    private String getContent(String strUrl){        try{            //URL url = new URL(strUrl);                //BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream()));            URLConnection uc = new URL(strUrl).openConnection();                //通过修改http头的User-Agent来伪装成是通过浏览器提交的请求              uc.setRequestProperty("User-Agent",                                      "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");                                System.out.println("-----------------------------------------");                System.out.println("Content-Length:     "+uc.getContentLength());                System.out.println("Set-Cookie:     "+uc.getHeaderField("Set-Cookie"));                System.out.println("-----------------------------------------");               //获取文件头信息              
System.out.println("Header"+uc.getHeaderFields().toString());              System.out.println("-----------------------------------------");              BufferedReader br=new BufferedReader(new InputStreamReader(uc.getInputStream(), "gb2312"));             String s = "";            StringBuffer sb=new StringBuffer();            while((s = br.readLine())!=null){                sb.append(s+"/r/n");            }            System.out.println("长度+"+sb.toString().length());                        return sb.toString();        }catch(Exception e){            return "error open url" + strUrl;        }    }    */

        public static  String getContent (String strUrl){        URLConnection uc = null;        String all_content=null;

               try {               all_content =new  String();               URL url = new URL(strUrl);

                   uc = url.openConnection();               uc.setRequestProperty("User-Agent",                                      "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");                                System.out.println("-----------------------------------------");                System.out.println("Content-Length:     "+uc.getContentLength());                System.out.println("Set-Cookie:     "+uc.getHeaderField("Set-Cookie"));                System.out.println("-----------------------------------------");               //获取文件头信息              System.out.println("Header"+uc.getHeaderFields().toString());              System.out.println("-----------------------------------------");                 if (uc == null)                   return null;

                   InputStream ins = uc.getInputStream();                ByteArrayOutputStream outputstream = new ByteArrayOutputStream();               byte[] str_b = new byte[1024];                   int i = -1;                   while ((i=ins.read(str_b)) > 0) {                    outputstream.write(str_b,0,i);                   }                   all_content = outputstream.toString();                  // System.out.println(all_content);

               } catch (Exception e) {               e.printStackTrace();               log.error("获取网页内容出错");           }finally{               uc = null;           }                    // return new String(all_content.getBytes("ISO8859-1"));           System.out.println(all_content.length());           return all_content;       }      }

    现在的问题是:图片下载不全,我用后面两种getContent方法下图片,下来的图片大小都和文件头里获得的Content-Length,也就是图片的实际大小不符,预览不了。   而且反复测试,两种方法每次下来的东西大小是固定的,所以重复下载没有用? 测试toString后length大小比图片实际的小,而生成的图片比图片数据大。下载后存储过程中图片数据增加了!   图片数据流toString过程中数据大小发生了改变,还原不回来。其它新闻内容没有问题。估计是图片的编码格式等的问题。在图片数据流读过来时直接生成图片就可以了。public  int saveImage (String strUrl){        URLConnection uc = null;    try {               URL url = new URL(strUrl);               uc = url.openConnection();               uc.setRequestProperty("User-Agent",                                      "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");                   //uc.setReadTimeout(30000);         //获取图片长度            //System.out.println("Content-Length:     "+uc.getContentLength());           //获取文件头信息           //System.out.println("Header"+uc.getHeaderFields().toString());                        if (uc == null)                   return 0;               InputStream ins = uc.getInputStream();                  byte[] str_b = new byte[1024];                          int byteRead=0;                                           String[] images=strUrl.split("/");        String imagename=images[images.length-1];              File fwl = new File(imagename);              FileOutputStream fos= new FileOutputStream(fwl);                   while ((byteRead=ins.read(str_b)) > 0) {                      fos.write(str_b,0,byteRead);                     };                    fos.flush();                    fos.close();           } catch (Exception e) {               e.printStackTrace();               log.error("获取网页内容出错");           }finally{               uc = null;           }           return 1;       }

     

     

     

    方法二:首先把搜索后的页面用流读取出来,再写个正则,去除不要的内容,再把最后的结果存成xml格式文件、或者直接存入数据库,用的时候再调用

    本代码只是显示html页的源码内容,如果需要抽取内容请自行改写public static String regex()中的正则式

    package rssTest;

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.net.HttpURLConnection;
    import java.net.MalformedURLException;
    import java.net.URL;
    import java.net.URLConnection;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class MyRSS
    {
        /**
         * Downloads the HTML source of a page.
         *
         * @param url absolute URL of the page
         * @return the page source decoded as gb2312, each line terminated by
         *         "\n"; whatever was read before a failure (possibly an empty
         *         string) if the download fails
         */
        public static String getHtmlSource(String url)
        {
            // Created up front so that a failed connection yields "" rather
            // than a NullPointerException at the return statement.
            StringBuffer codeBuffer = new StringBuffer();
            BufferedReader in = null;
            try
            {
                URLConnection uc = new URL(url).openConnection();

                // Override the User-Agent header so the request looks like a
                // browser's, for sites that refuse other clients.
                uc.setRequestProperty("User-Agent",
                        "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");

                in = new BufferedReader(new InputStreamReader(uc
                        .getInputStream(), "gb2312"));
                String tempCode;
                while ((tempCode = in.readLine()) != null)
                {
                    codeBuffer.append(tempCode).append("\n");
                }
            }
            catch (MalformedURLException e)
            {
                e.printStackTrace();
            }
            catch (IOException e)
            {
                e.printStackTrace();
            }
            finally
            {
                if (in != null)
                {
                    try
                    {
                        in.close();
                    }
                    catch (IOException ignored)
                    {
                        // Nothing useful can be done if close fails.
                    }
                }
            }
            return codeBuffer.toString();
        }

        /**
         * Returns the regular expression applied to the search result page.
         * GetNews reads group 2 as the link, group 4 as the title and group 6
         * as the summary.
         */
        public static String regex()
        {
            String googleRegex = "<div class=g>(.*?)href=\"(.*?)\"(.*?)\">(.*?)</a>(.*?)<div class=std>(.*?)<br>";
            return googleRegex;
        }

        /**
         * Test helper: runs a fixed keyword search and extracts the results.
         *
         * @return a flat list holding link, title and summary for each result,
         *         in that order
         */
        public static List<String> GetNews()
        {
            List<String> newsList = new ArrayList<String>();
            String allHtmlSource = MyRSS
                    .getHtmlSource("http://www.google.cn/search?complete=1&hl=zh-CN&newwindow=1&client=aff-os-maxthon&hs=SUZ&q=见龙卸甲&meta=&aq=f");
            Pattern pattern = Pattern.compile(regex());
            Matcher matcher = pattern.matcher(allHtmlSource);

            while (matcher.find())
            {
                String urlLink = matcher.group(2);
                String title = stripHighlight(matcher.group(4));
                String content = stripHighlight(matcher.group(6));

                newsList.add(urlLink);
                newsList.add(title);
                newsList.add(content);
            }
            return newsList;
        }

        /** Removes the highlight and ellipsis markup from a result fragment. */
        private static String stripHighlight(String text)
        {
            // Literal replace: with replaceAll the dots in "<b>...</b>" would
            // be regex wildcards and match any three characters.
            return text.replace("<font color=CC0033>", "")
                    .replace("</font>", "")
                    .replace("<b>...</b>", "");
        }

        /**
         * Prints the HTML source of a sample page.
         */
        public static void main(String[] args)
        {
            System.out
                    .println(MyRSS
                            .getHtmlSource("http://main.house.sina.com.cn/news/zckb/index.html"));
        }
    }

     

    方法三:JSP 自动抓取新闻

    package com.news.spider;

    import java.io.File;import java.io.FileFilter;import java.text.SimpleDateFormat;import java.util.ArrayList;import java.util.Calendar;import java.util.Date;import java.util.List;import java.util.regex.Matcher;import java.util.regex.Pattern;

    import com.db.DBAccess;

    public class SpiderNewsServer {public static void main(String[] args) throws Exception{

       //设置抓取信息的首页面   String endPointUrl = "http://cn.china.cn/zixun/";   //获得当前时间   Calendar calendar=Calendar.getInstance();      SimpleDateFormat sdf=new SimpleDateFormat("yyyy-MM-dd");      String DateNews = sdf.format(calendar.getTime());          /********************   * 抓取二级URl 开始   * url匹配类型:"http://cn.china.cn/article/"   */   List listNewsType = new ArrayList();   //取入口页面html   WebHtml webHtml = new WebHtml();   String htmlDocuemtnt1 = webHtml.getWebHtml(endPointUrl);   if(htmlDocuemtnt1 == null || htmlDocuemtnt1.length() == 0){    return;   }   String strTemp1 = "http://cn.china.cn/article/";   String strTemp2 = "</li>";   int stopIndex=0;   int startIndex=0;   int dd=0;   while(true){    dd++;    startIndex = htmlDocuemtnt1.indexOf(strTemp1, stopIndex);       System.out.println("=========="+startIndex);    stopIndex= htmlDocuemtnt1.indexOf(strTemp2, startIndex);    System.out.println("==========---------"+stopIndex);    if(startIndex!=-1 && stopIndex!=-1){     String companyType=htmlDocuemtnt1.substring(startIndex,stopIndex);     System.out.println("@@@@@--------"+companyType);     System.out.println("@@@@@--------"+companyType.indexOf("/""));     companyType=companyType.substring(0,companyType.indexOf("/""));     System.out.println("#####--------"+companyType);     listNewsType.add(companyType);    }    if(dd>10){     break;    }    if(stopIndex==-1 || startIndex==-1){     break;    }   }   System.out.println("listCompanyType====="+listNewsType.size());   /**   * 抓取二级URl 结束   ********************/       /********************   * 抓取页面内容 开始   */   String title="";     String hometext="";     String bodytext="";     String keywords="";     String counter = "221";     String cdate= "";     int begainIndex=0;//检索字符串的起点索引   int endIndex=0;//检索字符串的终点索引   String begainStr;//检索开始字符串           String endStr;//检索结束字符串     for (int rows = 1; rows < listNewsType.size(); rows++) {    String strNewsDetail = listNewsType.get(rows).toString();     
System.out.println("strNewsDetail====="+strNewsDetail);    if(strNewsDetail != null && strNewsDetail.length() > 0){     WebHtml newsListHtml = new WebHtml();     String htmlDocuemtntCom = newsListHtml.getWebHtml(strNewsDetail);     System.out.println("$$$$$------"+htmlDocuemtntCom);         if(htmlDocuemtntCom == null || htmlDocuemtntCom.length() == 0){      return;     }     //截取时间          int dateBegainIndex = htmlDocuemtntCom.indexOf("<div>时间:");     System.out.println("%%%%%--"+dateBegainIndex);     String newTime = htmlDocuemtntCom.substring(dateBegainIndex,dateBegainIndex+20);     System.out.println("^^^^^^^^^^^^^^^---"+newTime);     String newTimeM = newTime.substring(newTime.lastIndexOf("-")+1,newTime.lastIndexOf("-")+3);      String dateM = DateNews.substring(DateNews.lastIndexOf("-")+1);     System.out.println("^^^^^^^^^^^^^^^---"+newTimeM);     System.out.println("^^^^^^^^^^^^^^^---"+dateM);     if(newTimeM == dateM || newTimeM.equals(dateM)){      //检索新闻标题      begainStr="<div class=/"divCon bg008 /">";              endStr="<div>时间:";           begainIndex=htmlDocuemtntCom.indexOf(begainStr,0);      System.out.println("&&&&&&------"+begainIndex);      endIndex=htmlDocuemtntCom.indexOf(endStr,0);      System.out.println("&&&&&&------"+endIndex);      if(begainIndex!=-1 && endIndex!=-1){       title = htmlDocuemtntCom.substring(begainIndex,endIndex).trim();       title = title.substring(title.indexOf("<h1>")+4,title.indexOf("</h1>"));       title = title.replace("'", "");       title = title.replace(";", "");       title = title.replace(" ", "");      }           //检索新闻内容      begainStr="<div class=/"divCon bg008 /">";              endStr="<!-- page begin -->";      begainIndex=htmlDocuemtntCom.indexOf(begainStr,0);      endIndex=htmlDocuemtntCom.indexOf(endStr,0);      if(begainIndex!=-1 && endIndex!=-1){       bodytext = htmlDocuemtntCom.substring(begainIndex,endIndex).trim();       if(bodytext.indexOf("<p>")>0 && 
bodytext.indexOf("</p>")>bodytext.indexOf("<p>") && bodytext.indexOf("</p>")>0)        bodytext = bodytext.substring(bodytext.indexOf("<p>")+3,bodytext.indexOf("</p>"));       bodytext=bodytext.replace(" ", "");       bodytext=bodytext.replace("<br>", "");       bodytext=bodytext.replace("/n", "<br>");       bodytext=bodytext.replace("'", "");       bodytext=bodytext.replace(";", "");      }      //简介      if(bodytext.length()>40)       hometext = bodytext.substring(0,40)+"......";      else{       hometext = bodytext+"......";      }      //浏览量      String str = String.valueOf(Math.random());      counter = str.substring(str.lastIndexOf(".")+1,5);           Calendar cal = Calendar.getInstance();      cal.setTime(new Date());      cdate = cal.getTimeInMillis()+"";      cdate = cdate.substring(0,10);     }else{      continue;     }    }    System.out.println("-------------------------"+title);    System.out.println("-------------------------"+cdate);    System.out.println("-------------------------"+cdate);    System.out.println("-------------------------"+hometext);    System.out.println("-------------------------"+bodytext);    System.out.println("-------------------------"+keywords);    System.out.println("-------------------------"+counter);    /*String str = "INSERT INTO ecim_stories(uid,title,created,published,hostname,hometext,bodytext,keywords,counter,topicid,ihome,notifypub,story_type,topicdisplay,topicalign,comments,rating,votes,description) ";    str += "VALUE (1,'"+title+"',"+cdate+","+cdate+",'125.122.83.177','"+hometext+"','"+bodytext+"','"+keywords+"',"+counter+",1,0,1,'admin',0,'R',0,0,0,'')";    DBAccess db = new DBAccess();;    if(db.executeUpdate(str)>0) {     System.out.println("-------------------------成功!!!!!!!!!!");    }else {     System.out.println("-------------------------失败!!!!!!!!!!");    }*/   }     /**   * 抓取页面内容 结束   ********************/}} 

     

    package com.news.spider;

    import java.net.URL;import java.net.URLConnection;import java.io.BufferedReader;import java.io.InputStreamReader;

    public class WebHtml {

        /**
         * Fetches the document at the given URL and returns it as text.
         * The response is read line by line and every line is terminated with
         * "\n" in the result.
         *
         * @param url absolute URL of the page
         * @return the page content, or an empty string if the URL is malformed
         *         or the page cannot be read
         */
        public String getWebHtml(String url) {
            BufferedReader reader = null;
            try {
                URLConnection conn = new URL(url).openConnection();
                // NOTE(review): no charset is given, so the platform default is
                // used to decode the page; confirm it matches the target site.
                reader = new BufferedReader(new InputStreamReader(conn.getInputStream()));
                StringBuffer document = new StringBuffer();
                String line;
                while ((line = reader.readLine()) != null) {
                    document.append(line).append("\n");
                }
                return document.toString();
            } catch (Exception e) {
                // Best effort: report the failure and let the caller see "".
                System.err.println("getWebHtml failed for " + url + ": " + e);
                return "";
            } finally {
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (java.io.IOException ignored) {
                        // Nothing useful can be done if close fails.
                    }
                }
            }
        }
    }

     

    出处:【Gjava人才】 网址: http://www.gjrencai.com 转载时请注明出处和网址

    最新回复(0)