Spider

    Technology · 2022-05-11 · 52 views

    import java.util.*;import java.net.*;import java.io.*;import javax.swing.text.*;import javax.swing.text.html.*;

    /** * That class implements a reusable spider. To use this * class you must have a class setup to recieve  * the information found by the spider. This class must  * implement the ISpiderReportable method. Written by  * Jeff Heaton. Jeff Heaton is the author of "Programming  * Spiders, Bots, and Aggregators" by Sybex. Jeff can be  * contacted through his web site at http://www.jeffheaton.com.  *  * @author Jeff Heaton(http://www.jeffheaton.com) * @version 1.0 */public class Spider {

      /**   * A collection of URL's that resulted in an error.   */  protected Collection workloadError = new ArrayList(3);

      /**   * A collection of URL's that are waiting to    * be processed.   */  protected Collection workloadWaiting = new ArrayList(3);

      /**   * A collection of URL's that were processed.   */  protected Collection workloadProcessed = new ArrayList(3);

      /**   * The class that the spider should report its   * URL's to.   */  protected ISpiderReportable report;

      /**   * A flag that indicates if this process   * should be canceled.   */  protected boolean cancel = false;

      /**   * The constructor.   *    * @param report A class that implements the ISpiderReportable interface,    * that will recieve information that the spider finds.   */  public Spider(ISpiderReportable report)  {    this.report = report;  }

      /**   * Get the URL's that resulted in an error.   *    * @return A collection of URL's.   */  public Collection getWorkloadError()  {    return workloadError;  }

      /**   * Get the URL's that were waiting to be processed.    * You should add one URL to this collection to    * begin the spider.   *    * @return A collection of URL's.   */  public Collection getWorkloadWaiting()  {    return workloadWaiting;  }

      /**   * Get the URL's that were processed by this spider.   *    * @return A collection of URL's.   */  public Collection getWorkloadProcessed()  {    return workloadProcessed;  }   

      /**   * Clear all of the workloads.   */  public void clear()  {    getWorkloadError().clear();    getWorkloadWaiting().clear();    getWorkloadProcessed().clear();  }

      /**   * Set a flag that will cause the begin   * method to return before it is done.   */  public void cancel()  {    cancel = true;  }

      /**   * Add a URL for processing.   *    * @param url   */  public void addURL(URL url)  {    if ( getWorkloadWaiting().contains(url) )      return;    if ( getWorkloadError().contains(url) )      return;    if ( getWorkloadProcessed().contains(url) )      return;    log("Adding to workload: " + url );    getWorkloadWaiting().add(url);  }

      /**   * Called internally to process a URL.   *    * @param url The URL to be processed.   */  public void processURL(URL url)  {    try {      log("Processing: " + url );      // get the URL's contents      URLConnection connection = url.openConnection();      if ( (connection.getContentType()!=null) &&           !connection.getContentType().toLowerCase().startsWith("text/") ) {        getWorkloadWaiting().remove(url);                getWorkloadProcessed().add(url);            log("Not processing because content type is: " + connection.getContentType() );        return;      }            // read the URL      InputStream is = connection.getInputStream();      Reader r = new InputStreamReader(is);      // parse the URL      HTMLEditorKit.Parser parse = new HTMLParse().getParser();      parse.parse(r,new Parser(url),true);    } catch ( IOException e ) {      getWorkloadWaiting().remove(url);      getWorkloadError().add(url);      log("Error: " + url );      report.spiderURLError(url);      return;    }    // mark URL as complete    getWorkloadWaiting().remove(url);            getWorkloadProcessed().add(url);    log("Complete: " + url );

      }

      /**   * Called to start the spider.   */  public void begin()  {    cancel = false;    while ( !getWorkloadWaiting().isEmpty() && !cancel ) {      Object list[] = getWorkloadWaiting().toArray();      for ( int i=0;(i<list.length)&&!cancel;i++ )        processURL((URL)list[i]);    }  }

     

    /** * A HTML parser callback used by this class to * detect links. *  * @author Jeff Heaton * @version 1.0 */  protected class Parser  extends HTMLEditorKit.ParserCallback {    protected URL base;

        public Parser(URL base)    {      this.base = base;    }

        public void handleSimpleTag(HTML.Tag t,                                MutableAttributeSet a,int pos)    {      String href = (String)a.getAttribute(HTML.Attribute.HREF);            if( (href==null) && (t==HTML.Tag.FRAME) )        href = (String)a.getAttribute(HTML.Attribute.SRC);              if ( href==null )        return;

          int i = href.indexOf('#');      if ( i!=-1 )        href = href.substring(0,i);

          if ( href.toLowerCase().startsWith("mailto:") ) {        report.spiderFoundEMail(href);        return;      }

          handleLink(base,href);    }

        public void handleStartTag(HTML.Tag t,                               MutableAttributeSet a,int pos)    {      handleSimpleTag(t,a,pos);// handle the same way

        }

        protected void handleLink(URL base,String str)    {      try {        URL url = new URL(base,str);                    if ( report.spiderFoundURL(base,url) )          addURL(url);      } catch ( MalformedURLException e ) {        log("Found malformed URL: " + str );      }    }

      }  

      /**   * Called internally to log information.   * This basic method just writes the log   * out to the stdout.   *    * @param entry The information to be written to the log.   */  public void log(String entry)  {    System.out.println( (new Date()) + ":" + entry );  }} 


    Latest replies (0)