linkextractor.java
字號:
//package org.apache.lucene.index;import java.io.*;import org.htmlparser.Node;import org.htmlparser.Parser;import org.htmlparser.tags.LinkTag;import org.htmlparser.util.ParserException;import java.util.*;import java.net.*;/** * LinkExtractor extracts all the links from the given webpage * and prints them on standard output. */public class LinkExtractor { private String location; private Parser parser; private static int b=0; private static int tID; private static int iNode; public final int DEEP=3; //遍歷的深度 public static Vector svecLink, svecOutlink; public static String hostName; public static boolean bl; public LinkExtractor(String location) { this.location = location; hostName=GetHostName(location); System.out.println("主機名稱是 "+hostName); bl=false; svecLink=new Vector(); svecOutlink=new Vector(); svecLink.add(location); } public void extractLinks(String loc) throws ParserException { System.out.println("Parsing "+loc+" for links..."); Vector vecTemp=new Vector(); try { this.parser = new Parser(loc); // Create the parser object parser.registerScanners(); // Register standard scanners (Very Important) bl=true; } catch (ParserException e) { bl=false; e.printStackTrace(); } String ss,str1; URL wwwurl; boolean byes; int a=0; b++; Node [] links = parser.extractAllNodesThatAre(LinkTag.class); for (int i = 0;i < links.length;i++) { if(bl) { byes=true; System.out.println("Total url is "+links.length+"This page has url "+i); LinkTag linkTag = (LinkTag)links[i]; str1=linkTag.getLink(); // System.out.println("the url is "+str1);&&!svecOutlink.contains(str1) if(str1.equals("")) continue; if(str1.charAt(str1.length()-1)=='/' ||str1.charAt(str1.length()-1)=='\\') str1=str1.substring(0,str1.length()-1); if(!svecLink.contains(str1)) { try { wwwurl=new URL(str1); wwwurl.getContent(); } catch(MalformedURLException e) { byes=false; } catch(IOException e) { byes=false; } if(GetHostName(str1).equals(hostName) && byes) { a++; tID++; svecLink.add(str1); vecTemp.add(str1); 
System.out.println("the url is "+str1); } else { svecOutlink.add(str1); } } } } String strNew; if(a>0&&b<=DEEP) { for(int i=0;i<vecTemp.size();i++) { strNew=(String)vecTemp.get(i); System.out.println("this is "+strNew); extractLinks(strNew); } } } boolean linkAttribute(String strLink) { return true; } static void printCol(Enumeration col) { String str; while(col.hasMoreElements()) { str=(String)col.nextElement(); System.out.println(str); } } public String GetHostName(String hostname) { URL aurl; String ss=" "; try { aurl=new URL(hostname); ss=aurl.getHost(); } catch(MalformedURLException e) { e.printStackTrace(); //return "null"; } return ss; } public static void main(String[] args) { /* if (args.length<0) { System.err.println("Syntax Error : Please provide the location(URL or file) to parse"); System.exit(-1); }*/ Vector allLink=new Vector(); String strNew,strall1,strall2,str; String ss="http://www.dlut.edu.cn/"; LinkExtractor linkExtractor = new LinkExtractor(ss); try { linkExtractor.extractLinks(ss); Enumeration col=svecLink.elements(); while(col.hasMoreElements()) { str=(String)col.nextElement(); System.out.println(str); } } catch (ParserException e) { e.printStackTrace(); } }}
快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -