?? spider.java
字號:
package com.microvois.luence;
import cvu.html.HTMLTokenizer;
import cvu.html.TagToken;
import cvu.html.TextToken;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Enumeration;
import java.net.URL;
import java.net.HttpURLConnection;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.StringReader;
import java.io.FileNotFoundException;
import java.security.Security;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.mozilla.intl.chardet.HtmlCharsetDetector;
import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
import java.io.*;
import com.microvois.luence.inputproxy;
public class Spider implements Runnable {
// Platform line separator.
private static String lineSep = System.getProperty("line.separator");
// Index directory; spiderURL() assigns it from Config.StrDataDirectroy.
private String indexDir;
// Queue of URLs waiting to be fetched; consumed by dequeueURL().
private ArrayList urls;
// Substrings a discovered link must contain (every one of them) to be followed; see indexURL().
private ArrayList include;
// Substrings that disqualify a discovered link; see indexURL().
private ArrayList exclude;
// Worker threads started by go().
private ArrayList threadList;
// When true, go() and indexURL() print progress messages.
private boolean verbose;
// Incremental-mode flag; in the visible code it only changes a verbose message in go().
private boolean incremental;
// Set to true by the constructor; not read in the visible part of the class.
private boolean groksHTTPS;
// URLs already queued through enqueueURL() (url -> Boolean.TRUE), so a link is queued only once.
private HashMap indexedURLs;
// Accepted MIME types (type -> Boolean.TRUE); spiderURL() seeds text/html and text/plain when empty.
private HashMap mimeTypes;
// Number of workers go() starts; dequeueURL() also adjusts it as workers go idle.
private int threads;
// Initialised to 1024 by the constructor; presumably a description size limit -- usage not visible here.
private int descSize;
// Counter consulted by indexURL() to trigger inputproxy.flush() every 100 entries.
private int nInputcount = 0;
// Running total of summary.body.length() over indexed pages; go() reports it in KB.
private int bytes;
// HTTP client built in the constructor on a MultiThreadedHttpConnectionManager.
private HttpClient httpclient;
// "Analyse the whole site" switch set via setLoopSpider(); not read in the visible part of the class.
private boolean isloopget = false;
// Arguments handed to the constructor; go() joins the workers only when this is non-null.
private String mainargs[] = null;
/**
 * Sets whether the whole site should be analysed and stored in the index.
 *
 * @param bvalue the new value of the whole-site flag
 */
public void setLoopSpider(boolean bvalue) {
    isloopget = bvalue;
}
/**
 * Queues {@code url} as a crawl entry point and starts the crawl via {@link #go()}.
 *
 * The index directory is taken from {@code Config.StrDataDirectroy}; when no
 * MIME types have been configured, {@code text/html} and {@code text/plain}
 * are registered as defaults.
 *
 * @param url the entry-point URL; must not be null or blank
 * @throws IllegalArgumentException if {@code url} is null/blank or no index
 *         directory is configured
 * @throws Exception anything propagated by {@link #go()}
 */
public void spiderURL(String url) throws Exception {
    inputproxy.flush();

    // Validate the argument itself. The previous check tested urls.size()
    // *after* adding the URL, so it could never fire and a null or blank
    // URL was silently queued.
    if (url == null || url.trim().length() == 0) {
        throw new IllegalArgumentException(
                "缺少需要的URL");
    }

    indexDir = Config.StrDataDirectroy;
    if (indexDir == null) {
        throw new IllegalArgumentException(
                "Missing required argument: -d [index dir]");
    }

    // The queue is otherwise only touched under the instance monitor
    // (dequeueURL/enqueueURL), and go() can return while workers are still
    // running, so take the same lock here.
    synchronized (this) {
        urls.add(url);
    }

    if (mimeTypes.size() == 0) {
        // add default MIME types
        mimeTypes.put("text/html", Boolean.TRUE);
        mimeTypes.put("text/plain", Boolean.TRUE);
    }

    go();
}
/**
 * Creates a spider with the default settings and no command-line arguments.
 */
public Spider() {
    this((String[]) null);
}
/**
 * Creates a spider, applies the default settings and, when arguments are
 * supplied, lets {@code parseArgs} process them.
 *
 * @param argv command-line style arguments, or {@code null} for none
 */
public Spider(String[] argv) {
    // Collections first, so that argument parsing finds them initialised.
    urls = new ArrayList();
    include = new ArrayList();
    exclude = new ArrayList();
    threadList = new ArrayList();
    indexedURLs = new HashMap();
    mimeTypes = new HashMap();

    // Scalar defaults.
    threads = 1;
    descSize = 1024;
    bytes = 0;
    verbose = false;
    incremental = false;
    groksHTTPS = true;

    mainargs = argv;
    if (argv != null) {
        parseArgs(argv);
    }

    // HTTP client on a multi-threaded connection manager, 30 s connection timeout.
    MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();
    httpclient = new HttpClient(manager);
    httpclient.getHttpConnectionManager().getParams().setConnectionTimeout(30000);
}
/**
 * Starts the worker threads that drain the URL queue.
 *
 * When the spider was constructed with arguments ({@code mainargs != null})
 * this method blocks until every worker has finished and then prints a
 * summary; otherwise it returns as soon as the workers are started.
 *
 * @throws Exception if the calling thread is interrupted while joining
 */
public void go() throws Exception {
    if (verbose) {
        print("Creating index in: " + indexDir);
        if (incremental) {
            print(" - using incremental mode");
        }
    }

    long start = System.currentTimeMillis();

    // Snapshot the worker count. The workers decrement 'threads' themselves
    // (see dequeueURL) as soon as they find the queue empty, so re-reading
    // the field in the loop condition could start fewer workers than
    // configured; the active count would then never reach zero and the
    // idle workers would wait forever.
    final int workerCount = threads;
    for (int i = 0; i < workerCount; i++) {
        Thread t = new Thread(this, "Spindle Spider Thread #" + (i + 1));
        t.start();
        threadList.add(t);
    }

    if (mainargs != null) {
        while (threadList.size() > 0) {
            Thread child = (Thread) threadList.remove(0);
            child.join();
        }

        // All workers have exited and left 'threads' at zero or below;
        // put the configured value back so a later go() starts workers again.
        threads = workerCount;

        long elapsed = System.currentTimeMillis() - start;
        print("Indexed " + indexedURLs.size() + " URLs (" + (bytes / 1024)
                + " KB) in " + (elapsed / 1000) + " seconds");
    }
}
/**
 * Worker loop: takes URLs from the queue until {@link #dequeueURL()} reports
 * that the crawl is finished, indexing each one, then flushes the input proxy.
 */
public void run() {
    try {
        String url;
        while ((url = dequeueURL()) != null) {
            try {
                indexURL(url);
            } catch (Exception e) {
                // A single failing page must not end the worker: it would
                // abandon the rest of the queue and, because it leaves
                // without notifying, could strand workers blocked in
                // dequeueURL().
                e.printStackTrace();
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        if (e instanceof InterruptedException) {
            // Keep the interrupt visible to whoever owns this thread.
            Thread.currentThread().interrupt();
        }
    }
    inputproxy.flush();
    // No 'threads--' here: dequeueURL() already removes this worker from the
    // active count before it waits or returns null, so decrementing again
    // counted the worker twice -- and did so outside the monitor.
}
/**
 * Returns the next queued URL, waiting while the queue is empty and other
 * workers are still active.
 *
 * {@code threads} is used as the count of active workers: a worker that finds
 * the queue empty decrements it and waits; when the count drops to zero the
 * remaining waiters are notified and {@code null} is returned.
 *
 * @return the next URL, or {@code null} when the queue is empty and no worker
 *         is active any more
 * @throws Exception if the thread is interrupted while waiting
 */
public synchronized String dequeueURL() throws Exception {
    for (;;) {
        if (!urls.isEmpty()) {
            return (String) urls.remove(0);
        }

        // Nothing to do: this worker stops counting as active.
        threads--;
        if (threads <= 0) {
            // Last active worker: release the waiters so they can finish too.
            notifyAll();
            return null;
        }

        wait();
        // Woken up: count as active again and re-check the queue.
        threads++;
    }
}
/**
 * Adds {@code url} to the queue unless it has been queued before, and wakes
 * any workers waiting for work.
 *
 * @param url the URL to queue
 */
public synchronized void enqueueURL(String url) {
    boolean alreadyQueued = indexedURLs.get(url) != null;
    if (alreadyQueued) {
        return;
    }
    urls.add(url);
    indexedURLs.put(url, Boolean.TRUE);
    notifyAll();
}
/**
 * Fetches {@code url} (up to three attempts), submits its title and
 * description to the input proxy, and queues every link found on the page
 * that passes the include/exclude filters.
 *
 * A link is followed only if it contains every {@code include} substring and
 * none of the {@code exclude} substrings.
 *
 * @param url the page to fetch and index
 * @throws Exception anything propagated by {@code loadURL} or {@code parseURLs}
 */
private void indexURL(String url) throws Exception {
    if (verbose) {
        print(" " + Thread.currentThread().getName() + ": Adding URL: "
                + url);
    }

    final int maxAttempts = 3;
    URLSummary summary = null;
    for (int attempt = 0; summary == null && attempt < maxAttempts; attempt++) {
        summary = loadURL(url);
    }
    if (summary == null || summary.body == null) {
        return;
    }

    String[] links = parseURLs(summary);
    System.out.println(summary.toString());

    synchronized (this) {
        bytes += summary.body.length();
        if (inputproxy.inputData(summary.title, summary.desc, url)) {
            System.out.println(url + " input search engine sucess ...");
            // nInputcount is not advanced anywhere else in the visible part
            // of this class, so the periodic flush could never fire. Count
            // successful submissions here and flush on every 100th one.
            nInputcount++;
            if (nInputcount % 100 == 0) {
                inputproxy.flush();
            }
        } else {
            System.out.println(url + " input search engine failed ...");
        }
    }

    if (links == null) {
        return;
    }
    for (int i = 0; i < links.length; i++) {
        String link = links[i];
        if (link == null) {
            // e.g. a tag without the expected attribute; nothing to follow
            continue;
        }

        // check against the include/exclude list
        boolean add = true;
        for (int x = 0; add && x < include.size(); x++) {
            String inc = (String) include.get(x);
            add = (link.indexOf(inc) != -1);
        }
        for (int x = 0; add && x < exclude.size(); x++) {
            String ex = (String) exclude.get(x);
            add = (link.indexOf(ex) == -1);
        }
        if (add) {
            enqueueURL(link);
        }
    }
}
// 解析頁面中的內容 。。。。。
private String[] parseURLs(URLSummary summary) throws Exception {
StringBuffer desc = new StringBuffer();
ArrayList urls = new ArrayList();
boolean isIgnoreText = false;
HTMLTokenizer ht = new HTMLTokenizer(new StringReader(summary.body));
for (Enumeration e = ht.getTokens(); e.hasMoreElements();) {
Object obj = e.nextElement();
if (obj instanceof TagToken) {
TagToken tag = (TagToken) obj;
String tagName = tag.getName().toLowerCase();
//System.out.println("tag="+tag.toString()+"::"+tagName);
String url = null;
if(tagName.equals("meta") )
{
// 將keywords , description 加入到關鍵字中。
//System.out.println(tag.getAttribute("name")+"::"+ tag.getAttribute("content"));
if(tag.getAttribute("name")!=null && "keywords".equalsIgnoreCase(tag.getAttribute("name")) && tag.getAttribute("content") !=null)
{
desc.append(tag.getAttribute("content"));
}
else if(tag.getAttribute("name")!=null && "description".equalsIgnoreCase(tag.getAttribute("name")) && tag.getAttribute("content") !=null)
{
desc.append(tag.getAttribute("content"));
}
}
//過濾到script 里的內容, style
if(tag.isEndTag()==false && tagName.equals("style"))
isIgnoreText= true;
else if(tag.isEndTag()==true && tagName.equals("style"))
{
isIgnoreText = false;
}
if(tag.isEndTag()==false && tagName.equals("script"))
isIgnoreText= true;
else if(tag.isEndTag()==true && tagName.equals("script"))
{
isIgnoreText = false;
}
else if (tagName.equals("a"))
{
url = tag.getAttributes().get("href");
}
else if (tagName.equals("frame"))
{
url = tag.getAttributes().get("src");
}
else if (tagName.equals("title") && e.hasMoreElements()
&& !tag.isEndTag())
{
obj = e.nextElement();
if (obj instanceof TextToken) {
TextToken title = (TextToken) obj;
summary.title = title.getText();
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -