Programming a Spider in Java 源码帖

Stella981
• 阅读 442
Programming a Spider in Java 源码帖
Listing 1: Finding the bad links (CheckLinks.java)
import java.awt.*;
import javax.swing.*;
import java.net.*;
import java.io.*;
/**
 * This example uses a Java spider to scan a Web site
 * and check for broken links. Written by Jeff Heaton.
 * Jeff Heaton is the author of "Programming Spiders,
 * Bots, and Aggregators" by Sybex. Jeff can be contacted
 * through his Web site at http://www.jeffheaton.com.
 *
 * <p>Threading: the spider runs on a background thread started from
 * the Begin button. All Swing component updates made from that thread
 * are posted to the event dispatch thread with
 * {@code SwingUtilities.invokeLater}.
 *
 * @author Jeff Heaton(http://www.jeffheaton.com)
 * @version 1.0
 */
public class CheckLinks extends javax.swing.JFrame implements
              Runnable,ISpiderReportable {
   /**
    * The constructor. Builds the fixed-position (null layout) controls
    * and registers the button listener. The frame is left hidden; the
    * caller is expected to make it visible.
    */
   public CheckLinks()
   {
     //{{INIT_CONTROLS
     setTitle("Find Broken Links");
     getContentPane().setLayout(null);
     setSize(405,288);
     setVisible(false);
     label1.setText("Enter a URL:");
     getContentPane().add(label1);
     label1.setBounds(12,12,84,12);
     begin.setText("Begin");
     begin.setActionCommand("Begin");
     getContentPane().add(begin);
     begin.setBounds(12,36,84,24);
     getContentPane().add(url);
     url.setBounds(108,36,288,24);
     errorScroll.setAutoscrolls(true);
     errorScroll.setHorizontalScrollBarPolicy(javax.swing.
                 ScrollPaneConstants.HORIZONTAL_SCROLLBAR_ALWAYS);
     errorScroll.setVerticalScrollBarPolicy(javax.swing.
                 ScrollPaneConstants.VERTICAL_SCROLLBAR_ALWAYS);
     errorScroll.setOpaque(true);
     getContentPane().add(errorScroll);
     errorScroll.setBounds(12,120,384,156);
     errors.setEditable(false);
     errorScroll.getViewport().add(errors);
     errors.setBounds(0,0,366,138);
     current.setText("Currently Processing: ");
     getContentPane().add(current);
     current.setBounds(12,72,384,12);
     goodLinksLabel.setText("Good Links: 0");
     getContentPane().add(goodLinksLabel);
     goodLinksLabel.setBounds(12,96,192,12);
     badLinksLabel.setText("Bad Links: 0");
     getContentPane().add(badLinksLabel);
     badLinksLabel.setBounds(216,96,96,12);
     //}}
     //{{INIT_MENUS
     //}}
     //{{REGISTER_LISTENERS
     SymAction lSymAction = new SymAction();
     begin.addActionListener(lSymAction);
     //}}
   }
   /**
    * Main method for the application. The frame is created and shown
    * on the event dispatch thread, and closing it ends the program.
    *
    * @param args Not used
    */
   static public void main(String args[])
   {
     SwingUtilities.invokeLater(new Runnable()
     {
       public void run()
       {
         CheckLinks frame = new CheckLinks();
         // Set here rather than in the constructor so that code which
         // embeds a CheckLinks frame does not get its JVM terminated.
         frame.setDefaultCloseOperation(javax.swing.JFrame.EXIT_ON_CLOSE);
         frame.setVisible(true);
       }
     });
   }
   /**
    * Add notifications. The first time the native peer is created the
    * frame is enlarged by its insets (and menu bar height, if any) so
    * that the content area keeps the size requested in the constructor.
    */
   public void addNotify()
   {
     // Record the size of the window prior to calling parent's
     // addNotify.
     Dimension size = getSize();
     super.addNotify();
     if ( frameSizeAdjusted )
       return;
     frameSizeAdjusted = true;
     // Adjust size of frame according to the insets and menu bar
     Insets insets = getInsets();
     javax.swing.JMenuBar menuBar = getRootPane().getJMenuBar();
     int menuBarHeight = 0;
     if ( menuBar != null )
       menuBarHeight = menuBar.getPreferredSize().height;
     setSize(insets.left + insets.right + size.width, insets.top +
                           insets.bottom + size.height + 
                           menuBarHeight);
   }
   // Used by addNotify
   boolean frameSizeAdjusted = false;
   //{{DECLARE_CONTROLS
   javax.swing.JLabel label1 = new javax.swing.JLabel();
   /**
    * The begin or cancel button
    */
   javax.swing.JButton begin = new javax.swing.JButton();
   /**
    * The URL being processed
    */
   javax.swing.JTextField url = new javax.swing.JTextField();
   /**
    * Scroll the errors.
    */
   javax.swing.JScrollPane errorScroll =
         new javax.swing.JScrollPane();
   /**
    * A place to store the errors created
    */
   javax.swing.JTextArea errors = new javax.swing.JTextArea();
   javax.swing.JLabel current = new javax.swing.JLabel();
   javax.swing.JLabel goodLinksLabel = new javax.swing.JLabel();
   javax.swing.JLabel badLinksLabel = new javax.swing.JLabel();
   //}}
   //{{DECLARE_MENUS
   //}}
   /**
    * The background spider thread. Non-null while a scan is in
    * progress; set and cleared on the event dispatch thread.
    */
   protected Thread backgroundThread;
   /**
    * The spider object being used. Assigned by the background thread
    * and read by the button handler, hence volatile.
    */
   protected volatile Spider spider;
   /**
    * The URL that the spider began with
    */
   protected URL base;
   /**
    * How many bad links have been found. Written by the spider thread
    * and read by the event dispatch thread, hence volatile.
    */
   protected volatile int badLinksCount = 0;
   /**
    * How many good links have been found. Written by the spider thread
    * and read by the event dispatch thread, hence volatile.
    */
   protected volatile int goodLinksCount = 0;

   /**
    * Internal class used to dispatch events
    * 
    * @author Jeff Heaton
    * @version 1.0
    */
   class SymAction implements java.awt.event.ActionListener {
     public void actionPerformed(java.awt.event.ActionEvent event)
     {
       Object object = event.getSource();
       if ( object == begin )
         begin_actionPerformed(event);
     }
   }
   /**
    * Called when the begin or cancel buttons are clicked. Starts a new
    * scan when none is running, otherwise asks the running spider to
    * stop.
    * 
    * @param event The event associated with the button.
    */
   void begin_actionPerformed(java.awt.event.ActionEvent event)
   {
     if ( backgroundThread==null ) {
       begin.setText("Cancel");
       // Reset the counters BEFORE the thread starts so the new scan
       // cannot have its first results wiped out.
       goodLinksCount=0;
       badLinksCount=0;
       backgroundThread = new Thread(this);
       backgroundThread.start();
     } else {
       // The thread may not have created its spider yet.
       Spider running = spider;
       if ( running!=null )
         running.cancel();
     }
   }
   /**
    * Perform the background thread operation: clear the error list,
    * crawl from the address typed into the URL field, and report
    * "Bad address." if that address cannot be parsed. Whatever the
    * outcome, the button is switched back to "Begin" and the
    * background-thread marker is cleared so a new scan can be started.
    */
   public void run()
   {
     try {
       // Queued ahead of any error message this scan may post, so the
       // clear always happens first on the event dispatch thread.
       SwingUtilities.invokeLater(new Runnable()
       {
         public void run()
         {
           errors.setText("");
         }
       });
       Spider crawler = new Spider(this);
       spider = crawler;
       crawler.clear();
       base = new URL(url.getText());
       crawler.addURL(base);
       crawler.begin();
     } catch ( MalformedURLException e ) {
       UpdateErrors err = new UpdateErrors();
       err.msg = "Bad address.";
       SwingUtilities.invokeLater(err);
     } finally {
       // Runs on success, cancel and failure alike; previously a bad
       // address left the UI permanently in "Cancel" mode.
       Runnable doLater = new Runnable()
       {
         public void run()
         {
           begin.setText("Begin");
           backgroundThread = null;
         }
       };
       SwingUtilities.invokeLater(doLater);
     }
   }
   /**
    * Called by the spider when a URL is found. It is here
    * that links are validated.
    * 
    * @param base The page that the link was found on.
    * @param url The actual link address.
    * @return True if the link is good and on the same host as the page
    * it was found on (so the spider should follow it), false otherwise.
    */
   public boolean spiderFoundURL(URL base,URL url)
   {
     String address = url.toString();
     // Show which link is being checked before the (slow) check runs.
     postStats(address);
     boolean good = checkLink(url);
     if ( good ) {
       goodLinksCount++;
     } else {
       UpdateErrors err = new UpdateErrors();
       err.msg = url+"(on page " + base + ")\n";
       SwingUtilities.invokeLater(err);
       badLinksCount++;
     }
     // Refresh again so the labels include this link's result; without
     // this the display lags one link behind and never shows the last.
     postStats(address);
     if ( !good )
       return false;
     return url.getHost().equalsIgnoreCase(base.getHost());
   }
   /**
    * Queue a refresh of the status labels on the event dispatch thread.
    *
    * @param msg The address to show as currently being processed.
    */
   private void postStats(String msg)
   {
     UpdateCurrentStats cs = new UpdateCurrentStats();
     cs.msg = msg;
     SwingUtilities.invokeLater(cs);
   }
   /**
    * Called when a URL error is found. Intentionally does nothing.
    * 
    * @param url The URL that resulted in an error.
    */
   public void spiderURLError(URL url)
   {
   }
   /**
    * Called internally to check whether a link is good. For HTTP(S)
    * links the response status must be in the 2xx-3xx range; merely
    * being able to connect is not enough, because a server answers
    * 404 for a missing page without any exception being thrown. For
    * other protocols the link is good if a connection can be opened.
    * 
    * @param url The link that is being checked.
    * @return True if the link was good, false otherwise.
    */
   protected boolean checkLink(URL url)
   {
     URLConnection connection = null;
     try {
       connection = url.openConnection();
       if ( connection instanceof HttpURLConnection ) {
         // getResponseCode() returns -1 for a non-HTTP reply.
         int status = ((HttpURLConnection)connection).getResponseCode();
         return (status>=HttpURLConnection.HTTP_OK) &&
                (status<HttpURLConnection.HTTP_BAD_REQUEST);
       }
       connection.connect();
       return true;
     } catch ( IOException e ) {
       return false;
     } finally {
       if ( connection instanceof HttpURLConnection )
         ((HttpURLConnection)connection).disconnect();
     }
   }
   /**
    * Called when the spider finds an e-mail address. Intentionally
    * does nothing.
    * 
    * @param email The email address the spider found.
    */
   public void spiderFoundEMail(String email)
   {
   }
   /**
    * Internal class used to update the error information
    * in a Thread-Safe way
    * 
    * @author Jeff Heaton
    * @version 1.0
    */
   class UpdateErrors implements Runnable {
     public String msg;
     public void run()
     {
       errors.append(msg);
     }
   }
   /**
    * Used to update the current status information
    * in a "Thread-Safe" way
    * 
    * @author Jeff Heaton
    * @version 1.0
    */
   class UpdateCurrentStats implements Runnable {
     public String msg;
     public void run()
     {
       current.setText("Currently Processing: " + msg );
       goodLinksLabel.setText("Good Links: " + goodLinksCount);
       badLinksLabel.setText("Bad Links: " + badLinksCount);
     }
   }
}
Listing 2: Reporting spider events (ISpiderReportable.java)
import java.net.*;
/**
 * Callback contract through which a spider reports what it finds.
 */
interface ISpiderReportable {
   /**
    * Reports a link discovered on a page.
    *
    * @param base The page that the link was found on.
    * @param url The actual link address.
    * @return True if the spider should queue the link for processing,
    * false if it should be ignored.
    */
   boolean spiderFoundURL(URL base,URL url);

   /**
    * Reports a URL that could not be processed.
    *
    * @param url The URL that resulted in an error.
    */
   void spiderURLError(URL url);

   /**
    * Reports an e-mail link discovered on a page.
    *
    * @param email The e-mail link as it appeared in the page.
    */
   void spiderFoundEMail(String email);
}
Listing 3: A reusable spider (Spider.java)
import java.util.*;
import java.net.*;
import java.io.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
/**
* That class implements a reusable spider
* 
* @author Jeff Heaton(http://www.jeffheaton.com)
* @version 1.0
*/
public class Spider {
   /**
    * A collection of URLs that resulted in an error
    */
   protected Collection workloadError = new ArrayList(3);
   /**
    * A collection of URLs that are waiting to be processed
    */
   protected Collection workloadWaiting = new ArrayList(3);
   /**
    * A collection of URLs that were processed
    */
   protected Collection workloadProcessed = new ArrayList(3);
   /**
    * The class that the spider should report its URLs to
    */
   protected ISpiderReportable report;
   /**
    * A flag that indicates whether this process
    * should be canceled
    */
   protected boolean cancel = false;
   /**
    * The constructor
    * 
    * @param report A class that implements the ISpiderReportable
    * interface, that will receive information that the
    * spider finds.
    */
   public Spider(ISpiderReportable report)
   {
     this.report = report;
   }
   /**
    * Get the URLs that resulted in an error.
    * 
    * @return A collection of URL's.
    */
   public Collection getWorkloadError()
   {
     return workloadError;
   }
   /**
    * Get the URLs that were waiting to be processed.
    * You should add one URL to this collection to
    * begin the spider.
    * 
    * @return A collection of URLs.
    */
   public Collection getWorkloadWaiting()
   {
     return workloadWaiting;
   }
   /**
    * Get the URLs that were processed by this spider.
    * 
    * @return A collection of URLs.
    */
   public Collection getWorkloadProcessed()
   {
     return workloadProcessed;
   }    
   /**
    * Clear all of the workloads.
    */
   public void clear()
   {
     getWorkloadError().clear();
     getWorkloadWaiting().clear();
     getWorkloadProcessed().clear();
   }
   /**
    * Set a flag that will cause the begin
    * method to return before it is done.
    */
   public void cancel()
   {
     cancel = true;
   }
   /**
    * Add a URL for processing.
    * 
    * @param url
    */
   public void addURL(URL url)
   {
     if ( getWorkloadWaiting().contains(url) )
       return;
     if ( getWorkloadError().contains(url) )
       return;
     if ( getWorkloadProcessed().contains(url) )
       return;
     log("Adding to workload: " + url );
     getWorkloadWaiting().add(url);
   }
   /**
    * Called internally to process a URL
    * 
    * @param url The URL to be processed.
    */
   public void processURL(URL url)
   {
     try {
       log("Processing: " + url );
       // get the URL's contents
       URLConnection connection = url.openConnection();
       if ( (connection.getContentType()!=null) &&
            !connection.getContentType().toLowerCase().s
                        tartsWith("text/") ) {
         getWorkloadWaiting().remove(url);
         getWorkloadProcessed().add(url);
         log("Not processing because content type is: " +
              connection.getContentType() );
         return;
       }
      
       // read the URL
       InputStream is = connection.getInputStream();
       Reader r = new InputStreamReader(is);
       // parse the URL
       HTMLEditorKit.Parser parse = new HTMLParse().getParser();
       parse.parse(r,new Parser(url),true);
     } catch ( IOException e ) {
       getWorkloadWaiting().remove(url);
       getWorkloadError().add(url);
       log("Error: " + url );
       report.spiderURLError(url);
       return;
     }
     // mark URL as complete
     getWorkloadWaiting().remove(url);
     getWorkloadProcessed().add(url);
     log("Complete: " + url );
   }
   /**
    * Called to start the spider
    */
   public void begin()
   {
     cancel = false;
     while ( !getWorkloadWaiting().isEmpty() && !cancel ) {
       Object list[] = getWorkloadWaiting().toArray();
       for ( int i=0;(i<list.length)&&!cancel;i++ )
         processURL((URL)list[i]);
     }
   }
/**
* A HTML parser callback used by this class to detect links
* 
* @author Jeff Heaton
* @version 1.0
*/
   protected class Parser
   extends HTMLEditorKit.ParserCallback {
     protected URL base;
     public Parser(URL base)
     {
       this.base = base;
     }
     public void handleSimpleTag(HTML.Tag t,
                                 MutableAttributeSet a,int pos)
     {
       String href = (String)a.getAttribute(HTML.Attribute.HREF);
      
       if( (href==null) && (t==HTML.Tag.FRAME) )
         href = (String)a.getAttribute(HTML.Attribute.SRC);
        
       if ( href==null )
         return;
       int i = href.indexOf('#');
       if ( i!=-1 )
         href = href.substring(0,i);
       if ( href.toLowerCase().startsWith("mailt") ) {
         report.spiderFoundEMail(href);
         return;
       }
       handleLink(base,href);
     }
     public void handleStartTag(HTML.Tag t,
                                MutableAttributeSet a,int pos)
     {
       handleSimpleTag(t,a,pos);     // handle the same way
     }
     protected void handleLink(URL base,String str)
     {
       try {
         URL url = new URL(base,str);
         if ( report.spiderFoundURL(base,url) )
           addURL(url);
       } catch ( MalformedURLException e ) {
         log("Found malformed URL: " + str );
       }
     }
   }
   /**
    * Called internally to log information
    * This basic method just writes the log
    * out to the stdout.
    * 
    * @param entry The information to be written to the log.
    */
   public void log(String entry)
   {
     System.out.println( (new Date()) + ":" + entry );
   }
}
Listing 4: Parsing HTML (HTMLParse.java)
import javax.swing.text.html.*;
/**
 * Subclass of HTMLEditorKit whose only purpose is to make the kit's
 * parser obtainable from outside the class through a public method.
 */
public class HTMLParse extends HTMLEditorKit {
   /**
    * Returns the parser supplied by the parent HTMLEditorKit.
    *
    * @return The HTML parser provided by HTMLEditorKit.
    */
   public HTMLEditorKit.Parser getParser()
   {
     HTMLEditorKit.Parser inherited = super.getParser();
     return inherited;
   }
}
点赞
收藏
评论区
推荐文章
blmius blmius
2年前
MySQL:[Err] 1292 - Incorrect datetime value: ‘0000-00-00 00:00:00‘ for column ‘CREATE_TIME‘ at row 1
文章目录问题用navicat导入数据时,报错:原因这是因为当前的MySQL不支持datetime为0的情况。解决修改sql\mode:sql\mode:SQLMode定义了MySQL应支持的SQL语法、数据校验等,这样可以更容易地在不同的环境中使用MySQL。全局s
Jacquelyn38 Jacquelyn38
2年前
2020年前端实用代码段,为你的工作保驾护航
有空的时候,自己总结了几个代码段,在开发中也经常使用,谢谢。1、使用解构获取json数据let jsonData  id: 1,status: "OK",data: 'a', 'b';let  id, status, data: number   jsonData;console.log(id, status, number )
皕杰报表之UUID
​在我们用皕杰报表工具设计填报报表时,如何在新增行里自动增加id呢?能新增整数排序id吗?目前可以在新增行里自动增加id,但只能用uuid函数增加UUID编码,不能新增整数排序id。uuid函数说明:获取一个UUID,可以在填报表中用来创建数据ID语法:uuid()或uuid(sep)参数说明:sep布尔值,生成的uuid中是否包含分隔符'',缺省为
Wesley13 Wesley13
2年前
Java获得今日零时零分零秒的时间(Date型)
publicDatezeroTime()throwsParseException{    DatetimenewDate();    SimpleDateFormatsimpnewSimpleDateFormat("yyyyMMdd00:00:00");    SimpleDateFormatsimp2newS
Wesley13 Wesley13
2年前
mysql设置时区
mysql设置时区mysql\_query("SETtime\_zone'8:00'")ordie('时区设置失败,请联系管理员!');中国在东8区所以加8方法二:selectcount(user\_id)asdevice,CONVERT\_TZ(FROM\_UNIXTIME(reg\_time),'08:00','0
Wesley13 Wesley13
2年前
00:Java简单了解
浅谈Java之概述Java是SUN(StanfordUniversityNetwork),斯坦福大学网络公司)1995年推出的一门高级编程语言。Java是一种面向Internet的编程语言。随着Java技术在web方面的不断成熟,已经成为Web应用程序的首选开发语言。Java是简单易学,完全面向对象,安全可靠,与平台无关的编程语言。
Stella981 Stella981
2年前
Django中Admin中的一些参数配置
设置在列表中显示的字段,id为django模型默认的主键list_display('id','name','sex','profession','email','qq','phone','status','create_time')设置在列表可编辑字段list_editable
Stella981 Stella981
2年前
Eclipse插件开发_学习_00_资源帖
一、官方资料 1.eclipseapi(https://www.oschina.net/action/GoToLink?urlhttp%3A%2F%2Fhelp.eclipse.org%2Fmars%2Findex.jsp%3Ftopic%3D%252Forg.eclipse.platform.doc.isv%252Fguide%2
Wesley13 Wesley13
2年前
MySQL部分从库上面因为大量的临时表tmp_table造成慢查询
背景描述Time:20190124T00:08:14.70572408:00User@Host:@Id:Schema:sentrymetaLast_errno:0Killed:0Query_time:0.315758Lock_
Python进阶者 Python进阶者
3个月前
Excel中这日期老是出来00:00:00,怎么用Pandas把这个去除
大家好,我是皮皮。一、前言前几天在Python白银交流群【上海新年人】问了一个Pandas数据筛选的问题。问题如下:这日期老是出来00:00:00,怎么把这个去除。二、实现过程后来【论草莓如何成为冻干莓】给了一个思路和代码如下:pd.toexcel之前把这