入口类
import java.awt.Dimension;import java.awt.Insets;import java.awt.event.ActionEvent;import java.awt.event.ActionListener;import java.io.IOException;import java.net.MalformedURLException;import java.net.URL;import java.net.URLConnection;import javax.swing.JButton;import javax.swing.JFrame;import javax.swing.JLabel;import javax.swing.JMenuBar;import javax.swing.JScrollPane;import javax.swing.JTextArea;import javax.swing.JTextField;import javax.swing.ScrollPaneConstants;import javax.swing.SwingUtilities;/** * Description 检查URL是否是合法的URL,入口类,直接运行该类,将需要分析的URL地址粘入文本框即可 * * @author wangxu * */public class CheckLinks extends JFrame implements Runnable, ISpiderReportable { // Used by addNotify boolean frameSizeAdjusted = false; JLabel label1 = new JLabel(); JButton begin = new JButton(); JTextField url = new JTextField(); JScrollPane errorScroll = new JScrollPane(); JTextArea errors = new JTextArea(); JLabel current = new JLabel(); JLabel goodLinksLabel = new JLabel(); JLabel badLinksLabel = new JLabel(); protected Thread backgroundThread; protected Spider spider; protected URL base; protected int badLinksCount = 0; protected int goodLinksCount = 0; private static final long serialVersionUID = 1L; public CheckLinks() { setTitle("Find Broken Links");// 设置JFrame的标题 getContentPane().setLayout(null);// 设置布局方式 setSize(405, 288); setVisible(true); label1.setText("Enter a URL:"); getContentPane().add(label1); label1.setBounds(12, 12, 84, 12); begin.setText("Begin"); begin.setActionCommand("Begin"); getContentPane().add(begin); begin.setBounds(12, 36, 84, 24);// 设置坐标和宽、高 getContentPane().add(url); url.setBounds(108, 36, 288, 24); errorScroll.setAutoscrolls(true);// 自动显示滚动条 errorScroll.setHorizontalScrollBarPolicy(ScrollPaneConstants.HORIZONTAL_SCROLLBAR_ALWAYS);// 水平方向始终显示 errorScroll.setVerticalScrollBarPolicy(ScrollPaneConstants.VERTICAL_SCROLLBAR_ALWAYS);// 垂直方向始终显示 errorScroll.setOpaque(true);// 设置不透明 getContentPane().add(errorScroll); errorScroll.setBounds(12, 120, 384, 156); 
errors.setEditable(false);// 设置不可编辑 errorScroll.getViewport().add(errors);// 将文本域添加进滚动条 errors.setBounds(0, 0, 366, 138); current.setText("Currently Processing: "); getContentPane().add(current);// 加入显示当前信息的JLabel current.setBounds(12, 72, 384, 12); goodLinksLabel.setText("Good Links: 0"); getContentPane().add(goodLinksLabel); goodLinksLabel.setBounds(12, 96, 192, 12); badLinksLabel.setText("Bad Links: 0"); getContentPane().add(badLinksLabel); badLinksLabel.setBounds(216, 96, 96, 12); SymAction lSymAction = new SymAction();// 实例化一个事件监听器 begin.addActionListener(lSymAction);// 注册监听 } static public void main(String args[]) { new CheckLinks();// 程序入口 } public void addNotify() { // Record the size of the window prior to calling parent's addNotify. Dimension size = getSize(); super.addNotify(); if (frameSizeAdjusted) return; frameSizeAdjusted = true; // Adjust size of frame according to the insets and menu bar Insets insets = getInsets(); JMenuBar menuBar = getRootPane().getJMenuBar(); int menuBarHeight = 0; if (menuBar != null) menuBarHeight = menuBar.getPreferredSize().height; setSize(insets.left + insets.right + size.width, insets.top + insets.bottom + size.height + menuBarHeight); } class SymAction implements ActionListener { public void actionPerformed(ActionEvent event) { Object object = event.getSource(); if (object == begin) begin_actionPerformed(event); } } void begin_actionPerformed(ActionEvent event) { if (backgroundThread == null) { begin.setText("Cancel"); backgroundThread = new Thread(this);// 用当前对象来实例化一个Thread对象 backgroundThread.start();// 启动线程,执行run方法 goodLinksCount = 0; badLinksCount = 0; } else { spider.cancel();// 设置标志位true } } @Override public void run() { try { errors.setText(""); spider = new Spider(this);// 用当前对象来实例化一个Spider对象,因为当前类实现了ISpiderReportable接口 spider.clear(); base = new URL(url.getText());// 取得需要搜索的URL地址 spider.addURL(base);//将URL地址加入spider spider.begin();//spider开始工作 Runnable doLater = new Runnable() { public void run() { 
begin.setText("Begin"); } }; // 导致 doRun.run() 在 AWT 事件指派线程上异步执行。在所有挂起的 AWT // 事件被处理后才发生。此方法应该在应用程序线程需要更新该 GUI时使用。在下面的示例中,invokeLater // 调用将事件指派线程上的 Runnable对象 doHelloWorld加入队列,然后输出一条信息。 SwingUtilities.invokeLater(doLater); backgroundThread = null;// 将后台线程重新置空,以便接受下一个URL } catch (MalformedURLException e) { UpdateErrors err = new UpdateErrors(); err.msg = "Bad address."; SwingUtilities.invokeLater(err); } } //检测两个URL地址是否属于同一主机,如果是返回true,否则false @Override public boolean spiderFoundURL(URL base, URL url) { UpdateCurrentStats cs = new UpdateCurrentStats(); cs.msg = url.toString();//将URL信息赋值给cs.msg,使用后台线程进行打印 SwingUtilities.invokeLater(cs); if (!checkLink(url)) { UpdateErrors err = new UpdateErrors(); err.msg = url + "(on page " + base + ")\n"; SwingUtilities.invokeLater(err); badLinksCount++; return false; } goodLinksCount++; if (!url.getHost().equalsIgnoreCase(base.getHost())) return false; else return true; } @Override public void spiderURLError(URL url) { System.out.println("没找到的URL:" + url); } protected boolean checkLink(URL url) { try { URLConnection connection = url.openConnection(); connection.connect(); return true; } catch (IOException e) { return false; } } public void spiderFoundEMail(String email) { System.out.println("获得Email:" + email); } class UpdateErrors implements Runnable { public String msg; public void run() { errors.append(msg); } } class UpdateCurrentStats implements Runnable { public String msg; public void run() { current.setText("Currently Processing: " + msg); goodLinksLabel.setText("Good Links: " + goodLinksCount); badLinksLabel.setText("Bad Links: " + badLinksCount); } }}
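import javax.swing.text.html.*;
/**
 * The Swing JEditorPane text component supports different kinds of content through a plug-in
 * mechanism called an EditorKit. Because HTML is a very popular content format, some support is
 * provided by default. HTMLEditorKit provides that default support for HTML version 3.2 (with
 * some extensions) and is migrating toward version 4.0. The &lt;applet&gt; tag is not supported,
 * but some support is provided for the &lt;object&gt; tag.
 *
 * NOTE: the remainder of this comment and the class it documented are missing from this excerpt.
 */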
import java.net.*;

/**
 * Callback interface through which a spider reports what it finds while crawling.
 */
public interface ISpiderReportable {

    /**
     * Called when a URL link has been found.
     *
     * @param base the page on which the link was found
     * @param url the link that was found
     * @return a decision about the link; presumably whether the spider should go on to process
     *     it -- confirm against the caller
     */
    boolean spiderFoundURL(URL base, URL url);

    /**
     * Called when a URL could not be processed.
     *
     * @param url the URL that caused the error
     */
    void spiderURLError(URL url);

    /**
     * Called when an e-mail link has been found.
     *
     * @param email the e-mail address that was found
     */
    void spiderFoundEMail(String email);
}
import java.util.*;import java.net.*;import java.io.*;import javax.swing.text.*;import javax.swing.text.html.*;public class Spider { // 装载错误的工作集 protected Collection workloadError = new ArrayList(3); // 等待工作集 protected Collection workloadWaiting = new ArrayList(3); // 已处理的工作集 protected Collection workloadProcessed = new ArrayList(3); protected ISpiderReportable report; protected boolean cancel = false; public Spider(ISpiderReportable report) { this.report = report; } public Collection getWorkloadError() { return workloadError; } public Collection getWorkloadWaiting() { return workloadWaiting; } public Collection getWorkloadProcessed() { return workloadProcessed; } public void clear() { getWorkloadError().clear(); getWorkloadWaiting().clear(); getWorkloadProcessed().clear(); } public void cancel() { cancel = true; } public void addURL(URL url) { if (getWorkloadWaiting().contains(url))// 如果等待的工作集中已经包含该URL,返回 return; if (getWorkloadError().contains(url))// 如果出错的工作集中已经包含该URL,返回 return; if (getWorkloadProcessed().contains(url))// 如果已处理的工作集中包含该URL,返回 return; log("Adding to workload: " + url); getWorkloadWaiting().add(url);// 将其加入等待的工作集中 } // 具体分析URL的方法 public void processURL(URL url) { try { log("Processing: " + url);// 控制台打印处理的URL地址 // get the URL's contents URLConnection connection = url.openConnection(); System.out.println(connection.getContentType() + "++++++++++++++++===="); if ((connection.getContentType() != null) && !connection.getContentType().toLowerCase().startsWith("text/")) { getWorkloadWaiting().remove(url); getWorkloadProcessed().add(url); log("Not processing because content type is: " + connection.getContentType()); return; } // read the URL InputStream is = connection.getInputStream(); Reader r = new InputStreamReader(is); // parse the URL HTMLEditorKit.Parser parse = new HTMLParse().getParser(); // Parse the given stream and drive the given callback with the // results of the parse. This method should be implemented to be // thread-safe. 
// 解析给定的流并通过解析的结果驱动给定的回调。该方法执行完之后,会调用给定的回调函数 parse.parse(r, new Parser(url), true); } catch (IOException e) {// 如果出错 getWorkloadWaiting().remove(url);// 从工作集中移除URL getWorkloadError().add(url);// 将出错的URL加入错误的工作集 log("Error: " + url); report.spiderURLError(url);// 报告该出错的URL return; } // mark URL as complete getWorkloadWaiting().remove(url); getWorkloadProcessed().add(url); log("Complete: " + url); } // 蜘蛛工作的方法,只要等待工作集不为空,并且标志位为false,那么一直从集合中取出URL public void begin() { cancel = false; while (!getWorkloadWaiting().isEmpty() && !cancel) { Object list[] = getWorkloadWaiting().toArray(); for (int i = 0; (i