信息抽取

    技术2022-05-11  68

     

    public List<String[]> extractItems(String url, String encoding, String gp,            String itp) throws MalformedURLException,            UnsupportedEncodingException, IOException ...{        gp = sanifyPattern(gp);        itp = sanifyPattern(itp);

            Pattern globalPattern = Pattern.compile(gp, Pattern.CASE_INSENSITIVE                | Pattern.DOTALL | Pattern.UNIX_LINES);        Pattern itemPattern = Pattern.compile(itp, Pattern.CASE_INSENSITIVE                | Pattern.DOTALL | Pattern.UNIX_LINES);

            String html = source(url, encoding);        Matcher matcher = globalPattern.matcher(html);        List<String[]> items = new ArrayList<String[]>();        if (matcher != null && matcher.find()) ...{            String global = matcher.group(1);            Matcher itm = itemPattern.matcher(global);            while (itm != null && itm.find()) ...{                List<String> groups = new ArrayList<String>();                for (int i = 1; i <= itm.groupCount(); i++) ...{                    groups.add(itm.group(i));                }                items.add(groups.toArray(new String[groups.size()]));            }        }        return items;    } 

       为了方便地抽取网页中的某些信息,我用Java里的正则表达式写了一个程序:它可以抽取网页中的一些信息,并通过dom4j写成XML。该程序实现了对新闻、MP3等模版比较固定的网页的信息抽取工作。

              先用一个全局的pattern(globalPattern)选出有用的信息块,然后通过itemPattern重复地抽取该信息块中的信息单位,并将这些提取出来的items写成一个XML文件。使用dom4j写XML的程序如下:

    public String asXml(String fmt) ...{        Document doc = DocumentHelper.createDocument();        // <list>        Element root = doc.addElement("list");        List<Song> songs = getSongs();        for (Song song : songs) ...{            Element element = DocumentHelper.createElement("song");            element.addElement("title").setText(song.getTitle());            element.addElement("album").setText(song.getAlbum());            element.addElement("singer").setText(song.getSinger());            element.addElement("link").setText(song.getLink());            element.addElement("source").setText(song.getSource());            element.addElement("format").setText(song.getFormat());            element.addElement("megabyte").setText(                    String.valueOf(song.getMegabyte()));            element.addElement("speed")                    .setText(String.valueOf(song.getSpeed()));            root.add(element);        }        root.addAttribute("total", String.valueOf(songs.size()));        return doc.asXML();    }

      


    最新回复(0)