public List<String[]> extractItems(String url, String encoding, String gp, String itp) throws MalformedURLException, UnsupportedEncodingException, IOException ...{ gp = sanifyPattern(gp); itp = sanifyPattern(itp);
Pattern globalPattern = Pattern.compile(gp, Pattern.CASE_INSENSITIVE | Pattern.DOTALL | Pattern.UNIX_LINES); Pattern itemPattern = Pattern.compile(itp, Pattern.CASE_INSENSITIVE | Pattern.DOTALL | Pattern.UNIX_LINES);
String html = source(url, encoding); Matcher matcher = globalPattern.matcher(html); List<String[]> items = new ArrayList<String[]>(); if (matcher != null && matcher.find()) ...{ String global = matcher.group(1); Matcher itm = itemPattern.matcher(global); while (itm != null && itm.find()) ...{ List<String> groups = new ArrayList<String>(); for (int i = 1; i <= itm.groupCount(); i++) ...{ groups.add(itm.group(i)); } items.add(groups.toArray(new String[groups.size()])); } } return items; }
为了方便地抽取网页中的某些信息,我用 Java 的正则表达式写了一个程序:它可以抽取网页中的一些信息,并通过 dom4j 将结果写成 XML。该程序实现了对新闻、MP3 等模板比较固定的网页的信息抽取工作。
先用一个全局的 pattern(globalPattern)选出包含有用信息的块,然后通过 itemPattern 在该块中重复地抽取各个信息单位,并将这些提取出来的 items 写成一个 XML 文件。 用 dom4j 写 XML 的程序:
public String asXml(String fmt) ...{ Document doc = DocumentHelper.createDocument(); // <list> Element root = doc.addElement("list"); List<Song> songs = getSongs(); for (Song song : songs) ...{ Element element = DocumentHelper.createElement("song"); element.addElement("title").setText(song.getTitle()); element.addElement("album").setText(song.getAlbum()); element.addElement("singer").setText(song.getSinger()); element.addElement("link").setText(song.getLink()); element.addElement("source").setText(song.getSource()); element.addElement("format").setText(song.getFormat()); element.addElement("megabyte").setText( String.valueOf(song.getMegabyte())); element.addElement("speed") .setText(String.valueOf(song.getSpeed())); root.add(element); } root.addAttribute("total", String.valueOf(songs.size())); return doc.asXML(); }
