Programming a Spider in Java 源码帖

Programming a Spider in Java 源码帖

Listing 1: Finding the bad links (CheckLinks.java)

import java.awt.*;

import javax.swing.*;

import java.net.*;

import java.io.*;

/**
* This example uses a Java spider to scan a Web site
* and check for broken links. Written by Jeff Heaton.
* Jeff Heaton is the author of "Programming Spiders,
* Bots, and Aggregators" by Sybex. Jeff can be contacted
* through his Web site at http://www.jeffheaton.com.
*
* Threading: all Swing components are touched only on the
* event-dispatch thread (EDT). The spider runs on a background
* thread and hands UI updates to the EDT through
* SwingUtilities.invokeLater.
*
* @author Jeff Heaton(http://www.jeffheaton.com)
* @version 1.0
*/
public class CheckLinks extends javax.swing.JFrame implements Runnable,ISpiderReportable {

   /**
    * The constructor. Builds the (absolutely positioned) controls
    * and registers the button listener. The frame is left hidden;
    * the caller makes it visible.
    */
   public CheckLinks() {
     //{{INIT_CONTROLS
     setTitle("Find Broken Links");
     getContentPane().setLayout(null);
     setSize(405,288);
     setVisible(false);
     label1.setText("Enter a URL:");
     getContentPane().add(label1);
     label1.setBounds(12,12,84,12);
     begin.setText("Begin");
     begin.setActionCommand("Begin");
     getContentPane().add(begin);
     begin.setBounds(12,36,84,24);
     getContentPane().add(url);
     url.setBounds(108,36,288,24);
     errorScroll.setAutoscrolls(true);
     errorScroll.setHorizontalScrollBarPolicy(javax.swing.ScrollPaneConstants.HORIZONTAL_SCROLLBAR_ALWAYS);
     errorScroll.setVerticalScrollBarPolicy(javax.swing.ScrollPaneConstants.VERTICAL_SCROLLBAR_ALWAYS);
     errorScroll.setOpaque(true);
     getContentPane().add(errorScroll);
     errorScroll.setBounds(12,120,384,156);
     errors.setEditable(false);
     errorScroll.getViewport().add(errors);
     errors.setBounds(0,0,366,138);
     current.setText("Currently Processing: ");
     getContentPane().add(current);
     current.setBounds(12,72,384,12);
     goodLinksLabel.setText("Good Links: 0");
     getContentPane().add(goodLinksLabel);
     goodLinksLabel.setBounds(12,96,192,12);
     badLinksLabel.setText("Bad Links: 0");
     getContentPane().add(badLinksLabel);
     badLinksLabel.setBounds(216,96,96,12);
     //}}

     //{{INIT_MENUS
     //}}

     //{{REGISTER_LISTENERS
     SymAction lSymAction = new SymAction();
     begin.addActionListener(lSymAction);
     //}}
   }

   /**
    * Main method for the application. The frame is created and shown
    * on the event-dispatch thread, and closing it ends the program
    * (a JFrame is only hidden on close by default, which would leave
    * the JVM running with no window).
    *
    * @param args Not used
    */
   static public void main(String args[]){
     SwingUtilities.invokeLater(new Runnable(){
       public void run(){
         CheckLinks frame = new CheckLinks();
         frame.setDefaultCloseOperation(javax.swing.JFrame.EXIT_ON_CLOSE);
         frame.setVisible(true);
       }
     });
   }

   /**
    * Add notifications. The first time the native peer is created,
    * the frame is enlarged by its insets (and menu bar height, if any)
    * so that the size requested in the constructor becomes the size of
    * the content area.
    */
   public void addNotify(){
     // Record the size of the window prior to calling parent's
     // addNotify.
     Dimension size = getSize();
     super.addNotify();
     if ( frameSizeAdjusted ) {
       return;
     }
     frameSizeAdjusted = true;

     // Adjust size of frame according to the insets and menu bar
     Insets insets = getInsets();
     javax.swing.JMenuBar menuBar = getRootPane().getJMenuBar();
     int menuBarHeight = 0;
     if ( menuBar != null ) {
       menuBarHeight = menuBar.getPreferredSize().height;
     }
     setSize(insets.left + insets.right + size.width, insets.top + insets.bottom + size.height + menuBarHeight);
   }

   // Used by addNotify
   boolean frameSizeAdjusted = false;

   //{{DECLARE_CONTROLS
   javax.swing.JLabel label1 = new javax.swing.JLabel();

   /**
    * The begin or cancel button
    */
   javax.swing.JButton begin = new javax.swing.JButton();

   /**
    * The URL being processed
    */
   javax.swing.JTextField url = new javax.swing.JTextField();

   /**
    * Scroll the errors.
    */
   javax.swing.JScrollPane errorScroll = new javax.swing.JScrollPane();

   /**
    * A place to store the errors created
    */
   javax.swing.JTextArea errors = new javax.swing.JTextArea();

   javax.swing.JLabel current = new javax.swing.JLabel();

   javax.swing.JLabel goodLinksLabel = new javax.swing.JLabel();

   javax.swing.JLabel badLinksLabel = new javax.swing.JLabel();
   //}}

   //{{DECLARE_MENUS
   //}}

   /**
    * The background spider thread. Non-null while a scan is running.
    * Read and written on the event-dispatch thread only.
    */
   protected Thread backgroundThread;

   /**
    * The spider object being used. Assigned by the background thread
    * and read by the EDT (to cancel), hence volatile.
    */
   protected volatile Spider spider;

   /**
    * The URL that the spider began with
    */
   protected URL base;

   /**
    * How many bad links have been found. Written by the spider
    * thread, read by the EDT, hence volatile.
    */
   protected volatile int badLinksCount = 0;

   /**
    * How many good links have been found. Written by the spider
    * thread, read by the EDT, hence volatile.
    */
   protected volatile int goodLinksCount = 0;

   /**
    * Internal class used to dispatch events
    *
    * @author Jeff Heaton
    * @version 1.0
    */
   class SymAction implements java.awt.event.ActionListener {
     public void actionPerformed(java.awt.event.ActionEvent event){
       Object object = event.getSource();
       if ( object == begin ) {
         begin_actionPerformed(event);
       }
     }
   }

   /**
    * Called when the begin or cancel buttons are clicked. Starts a
    * scan when none is running; otherwise asks the running spider
    * to stop.
    *
    * @param event The event associated with the button.
    */
   void begin_actionPerformed(java.awt.event.ActionEvent event){
     if ( backgroundThread==null ) {
       // Reset the tallies BEFORE the worker starts so the reset can
       // never wipe out links the worker has already counted.
       goodLinksCount=0;
       badLinksCount=0;
       begin.setText("Cancel");
       backgroundThread = new Thread(this);
       backgroundThread.start();
     } else {
       // The worker may not have created its spider yet.
       Spider active = spider;
       if ( active != null ) {
         active.cancel();
       }
     }
   }

   /**
    * Perform the background thread operation: crawl from the address
    * typed into the URL field. Whatever the outcome (finished,
    * cancelled, or a malformed address), the button is switched back
    * to "Begin", the final tallies are shown and the frame becomes
    * ready for another scan.
    */
   public void run(){
     try {
       // Clear the old report on the EDT. invokeLater is FIFO, so this
       // runs before any error line queued by this scan.
       SwingUtilities.invokeLater(new Runnable(){
         public void run(){
           errors.setText("");
         }
       });
       spider = new Spider(this);
       spider.clear();
       base = new URL(url.getText());
       spider.addURL(base);
       spider.begin();
     } catch ( MalformedURLException e ) {
       UpdateErrors err = new UpdateErrors();
       err.msg = "Bad address.";
       SwingUtilities.invokeLater(err);
     } finally {
       // Always restore the idle state; previously a bad address left
       // the button stuck on "Cancel" with no way to start again.
       Runnable doLater = new Runnable(){
         public void run(){
           begin.setText("Begin");
           goodLinksLabel.setText("Good Links: " + goodLinksCount);
           badLinksLabel.setText("Bad Links: " + badLinksCount);
           backgroundThread=null;
         }
       };
       SwingUtilities.invokeLater(doLater);
     }
   }

   /**
    * Called by the spider when a URL is found. It is here
    * that links are validated.
    *
    * @param base The page that the link was found on.
    * @param url The actual link address.
    * @return True if the spider should go on to crawl the link:
    * the link is good and is on the same host as the page it was
    * found on. False for bad links and for links to other hosts.
    */
   public boolean spiderFoundURL(URL base,URL url){
     UpdateCurrentStats cs = new UpdateCurrentStats();
     cs.msg = url.toString();
     SwingUtilities.invokeLater(cs);

     if ( !checkLink(url) ) {
       UpdateErrors err = new UpdateErrors();
       err.msg = url+"(on page " + base + ")\n";
       SwingUtilities.invokeLater(err);
       badLinksCount++;
       return false;
     }

     goodLinksCount++;
     // Good link: only follow it when it stays on the same host.
     return url.getHost().equalsIgnoreCase(base.getHost());
   }

   /**
    * Called when a URL error is found. Intentionally does nothing.
    *
    * @param url The URL that resulted in an error.
    */
   public void spiderURLError(URL url){
   }

   /**
    * Called internally to check whether a link is good.
    * For HTTP(S) links the server's status code is examined, because
    * merely opening the connection succeeds even when the server
    * answers 404 or 500. Other protocols are considered good when a
    * connection can be opened.
    *
    * @param url The link that is being checked.
    * @return True if the link was good, false otherwise.
    */
   protected boolean checkLink(URL url){
     try {
       URLConnection connection = url.openConnection();
       if ( connection instanceof HttpURLConnection ) {
         HttpURLConnection http = (HttpURLConnection)connection;
         try {
           int status = http.getResponseCode();
           // -1 means the reply was not valid HTTP; 4xx/5xx are errors.
           return (status != -1) && (status < HttpURLConnection.HTTP_BAD_REQUEST);
         } finally {
           http.disconnect();
         }
       }
       connection.connect();
       return true;
     } catch ( IOException e ) {
       return false;
     }
   }

   /**
    * Called when the spider finds an e-mail address.
    * Intentionally does nothing.
    *
    * @param email The email address the spider found.
    */
   public void spiderFoundEMail(String email){
   }

   /**
    * Internal class used to update the error information
    * in a Thread-Safe way: run it on the event-dispatch thread
    * and it appends msg to the error text area.
    *
    * @author Jeff Heaton
    * @version 1.0
    */
   class UpdateErrors implements Runnable {
     public String msg;
     public void run(){
       errors.append(msg);
     }
   }

   /**
    * Used to update the current status information
    * in a "Thread-Safe" way: run it on the event-dispatch thread
    * and it shows msg as the current URL and refreshes both tallies.
    *
    * @author Jeff Heaton
    * @version 1.0
    */
   class UpdateCurrentStats implements Runnable {
     public String msg;
     public void run(){
       current.setText("Currently Processing: " + msg );
       goodLinksLabel.setText("Good Links: " + goodLinksCount);
       badLinksLabel.setText("Bad Links: " + badLinksCount);
     }
   }
}

Listing 2: Reporting spider events (ISpiderReportable.java)

import java.net.*;

/**
 * Callbacks through which a spider reports what it encounters
 * while crawling.
 */
interface ISpiderReportable {

   /**
    * Reports a link discovered on a page.
    *
    * @param base The page on which the link was found.
    * @param url The address the link points to.
    * @return True if the spider should queue the link for crawling,
    * false if it should not.
    */
   boolean spiderFoundURL(URL base,URL url);

   /**
    * Reports a URL that could not be read.
    *
    * @param url The URL that resulted in an error.
    */
   void spiderURLError(URL url);

   /**
    * Reports a link that the spider classified as an e-mail link.
    *
    * @param email The link text as found by the spider.
    */
   void spiderFoundEMail(String email);

}

Listing 3: A reusable spider (Spider.java)

import java.util.*;

import java.net.*;

import java.io.*;

import javax.swing.text.*;

import javax.swing.text.html.*;

/**
* This class implements a reusable spider. URLs are kept in three
* workloads (waiting, processed, error); begin() keeps processing the
* waiting workload until it is empty or cancel() is called. Every link
* found is reported to an ISpiderReportable, which decides whether the
* link is queued for crawling.
*
* @author Jeff Heaton(http://www.jeffheaton.com)
* @version 1.0
*/
public class Spider {

   /**
    * A collection of URLs that resulted in an error
    */
   protected Collection workloadError = new ArrayList(3);

   /**
    * A collection of URLs that are waiting to be processed
    */
   protected Collection workloadWaiting = new ArrayList(3);

   /**
    * A collection of URLs that were processed
    */
   protected Collection workloadProcessed = new ArrayList(3);

   /**
    * The class that the spider should report its URLs to
    */
   protected ISpiderReportable report;

   /**
    * A flag that indicates whether this process
    * should be canceled. Volatile because cancel() may be called
    * from a thread other than the one running begin().
    */
   protected volatile boolean cancel = false;

   /**
    * The constructor
    *
    * @param report A class that implements the ISpiderReportable
    * interface, that will receive information that the
    * spider finds.
    */
   public Spider(ISpiderReportable report){
     this.report = report;
   }

   /**
    * Get the URLs that resulted in an error.
    *
    * @return A collection of URL's.
    */
   public Collection getWorkloadError(){
     return workloadError;
   }

   /**
    * Get the URLs that were waiting to be processed.
    * You should add one URL to this collection to
    * begin the spider.
    *
    * @return A collection of URLs.
    */
   public Collection getWorkloadWaiting(){
     return workloadWaiting;
   }

   /**
    * Get the URLs that were processed by this spider.
    *
    * @return A collection of URLs.
    */
   public Collection getWorkloadProcessed(){
     return workloadProcessed;
   }

   /**
    * Clear all of the workloads.
    */
   public void clear(){
     getWorkloadError().clear();
     getWorkloadWaiting().clear();
     getWorkloadProcessed().clear();
   }

   /**
    * Set a flag that will cause the begin
    * method to return before it is done.
    */
   public void cancel(){
     cancel = true;
   }

   /**
    * Add a URL for processing. The URL is ignored when it is
    * already present in any of the three workloads.
    *
    * @param url The URL to queue in the waiting workload.
    */
   public void addURL(URL url){
     if ( getWorkloadWaiting().contains(url) ) {
       return;
     }
     if ( getWorkloadError().contains(url) ) {
       return;
     }
     if ( getWorkloadProcessed().contains(url) ) {
       return;
     }
     log("Adding to workload: " + url );
     getWorkloadWaiting().add(url);
   }

   /**
    * Called internally to process a URL. Text content is parsed as
    * HTML and every link found is reported; non-text content is marked
    * processed without being read. On an I/O failure the URL is moved
    * to the error workload and reported through spiderURLError.
    *
    * @param url The URL to be processed.
    */
   public void processURL(URL url){
     try {
       log("Processing: " + url );

       // get the URL's contents
       URLConnection connection = url.openConnection();
       String contentType = connection.getContentType();

       // regionMatches(ignoreCase) is locale-independent, unlike toLowerCase()
       if ( (contentType!=null) && !contentType.regionMatches(true,0,"text/",0,5) ) {
         getWorkloadWaiting().remove(url);
         getWorkloadProcessed().add(url);
         log("Not processing because content type is: " + contentType );
         // the body is never read, so give the connection back
         if ( connection instanceof HttpURLConnection ) {
           ((HttpURLConnection)connection).disconnect();
         }
         return;
       }

       // read the URL
       InputStream is = connection.getInputStream();
       try {
         Reader r = new InputStreamReader(is);

         // parse the URL
         HTMLEditorKit.Parser parse = new HTMLParse().getParser();
         parse.parse(r,new Parser(url),true);
       } finally {
         // release the stream whether or not parsing succeeded
         closeQuietly(is);
       }
     } catch ( IOException e ) {
       getWorkloadWaiting().remove(url);
       getWorkloadError().add(url);
       log("Error: " + url );
       report.spiderURLError(url);
       return;
     }

     // mark URL as complete
     getWorkloadWaiting().remove(url);
     getWorkloadProcessed().add(url);
     log("Complete: " + url );
   }

   /**
    * Close a stream, ignoring any failure: a problem while closing
    * must not turn a successfully parsed page into an error.
    *
    * @param in The stream to close.
    */
   private static void closeQuietly(InputStream in){
     try {
       in.close();
     } catch ( IOException ignored ) {
       // nothing useful can be done about a failed close
     }
   }

   /**
    * Called to start the spider. Returns when the waiting workload
    * is empty or after cancel() has been called. Any cancel request
    * made before this method is entered is discarded.
    */
   public void begin(){
     cancel = false;
     while ( !getWorkloadWaiting().isEmpty() && !cancel ) {
       // Work on a snapshot: processURL adds to and removes from the
       // waiting workload while it runs.
       Object list[] = getWorkloadWaiting().toArray();
       for ( int i=0;(i<list.length)&&!cancel;i++ ) {
         processURL((URL)list[i]);
       }
     }
   }

   /**
    * A HTML parser callback used by this class to detect links
    *
    * @author Jeff Heaton
    * @version 1.0
    */
   protected class Parser
   extends HTMLEditorKit.ParserCallback {

     /** The page being parsed; relative links are resolved against it. */
     protected URL base;

     public Parser(URL base){
       this.base = base;
     }

     /**
      * Extract the link carried by a tag, if any: its HREF attribute,
      * or the SRC attribute of a FRAME tag. Any "#fragment" is removed.
      * mailto: links go to spiderFoundEMail, everything else to
      * handleLink.
      */
     public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a,int pos){
       String href = (String)a.getAttribute(HTML.Attribute.HREF);

       if( (href==null) && (t==HTML.Tag.FRAME) ) {
         href = (String)a.getAttribute(HTML.Attribute.SRC);
       }

       if ( href==null ) {
         return;
       }

       int i = href.indexOf('#');
       if ( i!=-1 ) {
         href = href.substring(0,i);
       }

       // Match the full "mailto:" scheme, case-insensitively and without
       // depending on the default locale. A bare "mailt" prefix would
       // also swallow ordinary relative links such as "mailtest.html".
       if ( href.regionMatches(true,0,"mailto:",0,7) ) {
         report.spiderFoundEMail(href);
         return;
       }

       handleLink(base,href);
     }

     public void handleStartTag(HTML.Tag t, MutableAttributeSet a,int pos){
       handleSimpleTag(t,a,pos);     // handle the same way
     }

     /**
      * Resolve a link against the page it was found on, report it,
      * and queue it when the report object asks for that.
      *
      * @param base The page the link was found on.
      * @param str The link as written in the page.
      */
     protected void handleLink(URL base,String str){
       try {
         URL url = new URL(base,str);
         if ( report.spiderFoundURL(base,url) ) {
           addURL(url);
         }
       } catch ( MalformedURLException e ) {
         log("Found malformed URL: " + str );
       }
     }
   }

   /**
    * Called internally to log information
    * This basic method just writes the log
    * out to the stdout.
    *
    * @param entry The information to be written to the log.
    */
   public void log(String entry){
     System.out.println( (new Date()) + ":" + entry );
   }
}

Listing 4: Parsing HTML (HTMLParse.java)

import javax.swing.text.html.*;

/**
 * A thin HTMLEditorKit subclass whose only purpose is to widen the
 * visibility of getParser() to public, so code outside the
 * javax.swing.text.html package can obtain the kit's HTML parser.
 */
public class HTMLParse extends HTMLEditorKit {

   /**
    * Returns the parser supplied by HTMLEditorKit.
    *
    * @return The parser obtained from the superclass.
    */
   public HTMLEditorKit.Parser getParser(){
     HTMLEditorKit.Parser inherited = super.getParser();
     return inherited;
   }

}

Programming a Spider in Java 源码帖的更多相关文章

如何阅读Java源码阅读java的真实体会
刚才在论坛不经意间,看到有关源码阅读的帖子.回想自己前几年,阅读源码那种兴奋和成就感(1),不禁又有一种激动. 源码阅读,我觉得最核心有三点:技术基础+强烈的求知欲+耐心. 说到技术基础,我打个比 ...
Android反编译(一)之反编译JAVA源码
Android反编译(一) 之反编译JAVA源码 [目录] 1.工具 2.反编译步骤 3.实例 4.装X技巧 1.工具 1).dex反编译JAR工具 dex2jar http://code.go ...
如何阅读Java源码
刚才在论坛不经意间,看到有关源码阅读的帖子.回想自己前几年,阅读源码那种兴奋和成就感(1),不禁又有一种激动.源码阅读,我觉得最核心有三点:技术基础+强烈的求知欲+耐心. 说到技术基础,我打个比方吧, ...
Java 源码学习线路————_先JDK工具包集合_再core包，也就是String、StringBuffer等_Java IO类库
http://www.iteye.com/topic/1113732 原则网址 Java源码初接触如果你进行过一年左右的开发,喜欢用eclipse的debug功能.好了,你现在就有阅读源码的技术基础 ...
解密随机数生成器（二）——从java源码看线性同余算法
Random Java中的Random类生成的是伪随机数,使用的是48-bit的种子,然后调用一个linear congruential formula线性同余方程(Donald Knuth的编程艺术 ...
Java--Eclipse关联Java源码
打开Eclipse,Window->Preferences->Java 点Edit按钮后弹出: 点Source Attachment后弹出: 选择Java安装路径下的src.zip文件即可 ...
使用JDT.AST解析java源码
在做java源码的静态代码审计时,最基础的就是对java文件进行解析,从而获取到此java文件的相关信息: 在java文件中所存在的东西很多,很复杂,难以用相关的正则表达式去一一匹配.但是,eclip ...
[收藏] Java源码阅读的真实体会
收藏自http://www.iteye.com/topic/1113732 刚才在论坛不经意间,看到有关源码阅读的帖子.回想自己前几年,阅读源码那种兴奋和成就感(1),不禁又有一种激动. 源码阅读,我 ...
Java源码解读(一)——HashMap
HashMap作为常用的一种数据结构,阅读源码去了解其底层的实现是十分有必要的.在这里也分享自己阅读源码遇到的困难以及自己的思考. HashMap的源码介绍已经有许许多多的博客,这里只记录了一些我看源 ...

随机推荐

u-boot Makefile Source Test
一.概述笔者已经写了一篇实现目标文件与源码分开的makefile测试实验,但是觉得不够完美,没有更多的体现u-boot Makefile的工作原理和特点.所以,决定重新修订,使之更加充分的接近u-b ...
Memento:客户端瘦身
说是客户端瘦身,其实备忘录模式的本质让调用客户端职责减轻,将客户端的对于实现比如数据恢复之类细节的内容封装在操作类之中.其实面向对象的一重要方面就是划分清楚职责,这样可以减少改到造成的影响,便于扩展. ...
JMX示例
HelloJMXMBean.java package jmx; /** * Created by george on 14-8-21. */ public interface HelloJMXMBea ...
[BZOJ 3282] Tree 【LCT】
题目链接:BZOJ - 3282 题目分析这道题是裸的LCT,包含 Link , Cut 和询问两点之间的路径信息. 写代码时出现的错误:Access(x) 的循环中应该切断的是原来的 Son[x] ...
[BZOJ 3129] [Sdoi2013] 方程【容斥+组合数取模+中国剩余定理】
题目链接:BZOJ - 3129 题目分析使用隔板法的思想,如果没有任何限制条件,那么方案数就是 C(m - 1, n - 1). 如果有一个限制条件是 xi >= Ai ,那么我们就可以将 ...
ios7新特性3-Map Kit新特性
Map Kit 框架 (MapKit.framework) 包含了大量的改进以及为基于地图的程序提供了新特性.利用地图显示位置信息的应用现在可以使用Maps这个程序用到的3D地图,包括控制程序控制视线 ...
Features of Spring Web MVC
21.1.1 Features of Spring Web MVC Spring Web Flow Spring Web Flow (SWF) aims to be the best solution ...
如何快速使用ECharts绘制可视化图表
1.在ECharts官网,下载ECharts的源码和示例文件. 2.解压缩下载下来的Echars压缩包,找到doc\example\www\echartsjs目录,将里面的js文件全部取出来,放到项目 ...
bzoj1816
这道题不是很难,二分答案+判定即可注意在一套牌中Joker只能用一次 ..] of longint; mid,l,r,n,m,i,ans:longint; function check(x: ...
HDU-1969 Pie
http://acm.hdu.edu.cn/showproblem.php?pid=1969 Pie Time Limit: 5000/1000 MS (Java/Others) Memory ...

Programming a Spider in Java 源码帖

Programming a Spider in Java 源码帖的更多相关文章

随机推荐

热门专题