?? fetcher.java
字號:
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.fetcher;
import java.io.IOException;
import java.io.File;
import java.util.Properties;
import net.nutch.pagedb.FetchListEntry;
import net.nutch.io.*;
import net.nutch.db.*;
import net.nutch.fs.*;
import net.nutch.util.*;
import net.nutch.protocol.*;
import net.nutch.parse.*;
import net.nutch.plugin.*;
import java.util.logging.*;
/**
* The fetcher. Most of the work is done by plugins.
*
* <p>
* Note by John Xing: As of 20041022, the option -noParsing was introduced.
* Without this option, the fetcher behaves the old way, i.e., it not only
* crawls but also parses content. With -noParsing, the fetcher only
* crawls; use ParseSegment.java to parse the fetched content afterwards.
* See FetcherOutput.java and ParseSegment.java for further details.
*/
public class Fetcher {
// Logger shared by the fetcher and its threads, obtained from LogFormatter
// under the name "net.nutch.fetcher.Fetcher".
public static final Logger LOG =
LogFormatter.getLogger("net.nutch.fetcher.Fetcher");
// At class-initialization time, read the boolean configuration property
// "fetcher.verbose" (default false) and, when it is set, call
// setLogLevel(Level.FINE).
// NOTE(review): setLogLevel is declared outside this view; presumably it
// adjusts LOG's level so the FINE messages in this class are emitted -- confirm.
static {
if (NutchConf.getBoolean("fetcher.verbose", false)) {
setLogLevel(Level.FINE);
}
}
// Fetch list consumed by the fetcher threads; each thread pulls its next
// entry with fetchList.next(...).
private ArrayFile.Reader fetchList; // the input
// Receives one FetcherOutput record per processed entry; also used as the
// lock object guarding appends in outputPage().
private ArrayFile.Writer fetcherWriter; // the output
// Receives the Content record for each processed entry.
private ArrayFile.Writer contentWriter;
// NOTE(review): presumably receive the ParseText / ParseData records when
// parsing is enabled -- the code that writes to them is outside this view.
private ArrayFile.Writer parseTextWriter;
private ArrayFile.Writer parseDataWriter;
private String name; // name of the segment
private long start; // start time of fetcher run
// The three counters below are updated by fetcher threads while holding
// the lock on the enclosing Fetcher instance.
private long bytes; // total bytes fetched
private int pages; // total pages fetched
private int errors; // total pages errored
// When false, pages are written without ParseText/ParseData (see handleFetch).
private boolean parsing = true; // whether do parsing
// Read from configuration property "fetcher.threads.fetch" (default 10);
// also compared against atCompletion to detect the last finishing thread.
private int threadCount = // max number of threads
NutchConf.getInt("fetcher.threads.fetch", 10);
// All threads (FetcherThread or thread started by it) belong to
// group "fetcher". Each FetcherThread is named as "fetcherXX",
// where XX is the order it's started.
private static final String THREAD_GROUP_NAME = "fetcher";
private ThreadGroup group = new ThreadGroup(THREAD_GROUP_NAME); // our group
// count of FetcherThreads that are through the loop and just about to return;
// incremented under the lock on the enclosing Fetcher instance
private int atCompletion = 0;
/********************************************
* Fetcher thread
********************************************/
private class FetcherThread extends Thread {
/**
 * Creates a fetcher thread with the given name as a member of the
 * enclosing fetcher's thread group.
 *
 * @param name the thread name
 */
public FetcherThread(String name) {
  super(group, name);
}
/**
 * Main loop of a fetcher thread.
 *
 * <p>Repeatedly reads the next entry from the shared fetch list and, if
 * the entry is marked for fetching, retrieves it through the protocol
 * plugin selected for its URL and hands the content to
 * {@code handleFetch()}. Entries not marked for fetching are recorded
 * with status SUCCESS and empty content. The loop ends when the fetch
 * list returns {@code null} or when a severe message has been logged.
 *
 * <p>Failures are logged via {@code logError()} and recorded as an empty
 * page: {@code ResourceGone} yields NOT_FOUND (no retry); every other
 * failure yields RETRY.
 *
 * <p>After the loop, the thread that brings {@code atCompletion} up to
 * {@code threadCount} asks the plugin repository to shut down. That
 * shutdown is best-effort: a failure is logged but never propagated.
 *
 * <p>NOTE(review): the fetch list is shared by all threads of this
 * fetcher; this method relies on {@code fetchList.next()} being safe to
 * call concurrently -- confirm in ArrayFile.Reader.
 */
public void run() {
  // Reused for every entry; never reassigned, hence never null.
  FetchListEntry fle = new FetchListEntry();

  while (true) {
    if (LogFormatter.hasLoggedSevere())   // something bad happened
      break;                              // exit

    String url = null;                    // stays null if reading the entry fails
    try {
      if (fetchList.next(fle) == null)    // no more entries
        break;

      url = fle.getPage().getURL().toString();

      if (!fle.getFetch()) {              // should we fetch this page?
        if (LOG.isLoggable(Level.FINE))
          LOG.fine("not fetching " + url);
        handleNoFetch(fle, FetcherOutput.SUCCESS);
        continue;
      }

      LOG.info("fetching " + url);        // fetch the page

      Protocol protocol = ProtocolFactory.getProtocol(url);
      Content content = protocol.getContent(url);

      handleFetch(url, fle, content);

      synchronized (Fetcher.this) {       // update status
        pages++;
        bytes += content.getContent().length;
        if ((pages % 100) == 0) {         // show status every 100pp
          status();
        }
      }
    } catch (ResourceGone e) {            // don't retry
      logError(url, fle, e);
      handleNoFetch(fle, FetcherOutput.NOT_FOUND);
      // ParseException is not caught here: handleFetch() deals with it.
    } catch (RetryLater e) {              // explicit retry
      logError(url, fle, e);
      handleNoFetch(fle, FetcherOutput.RETRY);
    } catch (ProtocolException e) {       // implicit retry
      logError(url, fle, e);
      handleNoFetch(fle, FetcherOutput.RETRY);
    } catch (Throwable t) {               // anything else, incl. unchecked
      logError(url, fle, t);              // retry?
      handleNoFetch(fle, FetcherOutput.RETRY);
    }
  }

  // Explicitly invoke shutDown() for all possible plugins.
  // Done by the FetcherThread that finishes last.
  synchronized (Fetcher.this) {
    atCompletion++;
    if (atCompletion == threadCount) {
      try {
        PluginRepository.getInstance().finalize();
      } catch (java.lang.Throwable t) {
        // Best-effort shutdown: report the failure instead of hiding it,
        // but do not let it escape the thread. Logged below SEVERE so it
        // does not trip LogFormatter.hasLoggedSevere().
        LOG.info("plugin shutdown failed with: " + t);
        LOG.log(Level.FINE, "stack", t);  // stack trace
      }
    }
  }
}
/**
 * Reports a failed fetch and counts it.
 *
 * <p>A one-line summary goes to the log at INFO, the full stack trace at
 * FINE, and the enclosing fetcher's error counter is incremented while
 * holding the fetcher's lock.
 *
 * @param url the URL whose fetch failed; may be null if the failure
 *            happened before the URL was read
 * @param fle the fetch list entry being processed (not used here)
 * @param t   the failure
 */
private void logError(String url, FetchListEntry fle, Throwable t) {
  final String summary = "fetch of " + url + " failed with: " + t;
  LOG.info(summary);
  LOG.log(Level.FINE, "stack", t);

  // record failure
  synchronized (Fetcher.this) {
    errors += 1;
  }
}
/**
 * Writes the output records for a page whose content was fetched.
 *
 * <p>With parsing disabled, the content is written with status SUCCESS
 * and no parse records. With parsing enabled, a parser is looked up by
 * content type and URL; on success the parse text and data are written
 * with status SUCCESS. If parsing throws ParseException, the content is
 * still written -- with status CANT_PARSE and empty parse records -- so
 * that it can be parsed again in a separate pass, possibly with a
 * better or alternative parser (20041026, xing).
 *
 * @param url     the URL that was fetched
 * @param fle     the fetch list entry for the URL
 * @param content the fetched content
 */
private void handleFetch(String url, FetchListEntry fle, Content content) {
  if (Fetcher.this.parsing) {
    try {
      Parser pageParser =
          ParserFactory.getParser(content.getContentType(), url);
      Parse parsed = pageParser.getParse(content);

      FetcherOutput parsedOutput = new FetcherOutput(
          fle, MD5Hash.digest(content.getContent()), FetcherOutput.SUCCESS);
      ParseText parsedText = new ParseText(parsed.getText());
      outputPage(parsedOutput, content, parsedText, parsed.getData());
    } catch (ParseException e) {
      LOG.info("fetch okay, but can't parse " + url + ", reason: "
               + e.getMessage());

      FetcherOutput unparsedOutput = new FetcherOutput(
          fle, MD5Hash.digest(content.getContent()), FetcherOutput.CANT_PARSE);
      ParseText emptyText = new ParseText("");
      ParseData emptyData =
          new ParseData("", new Outlink[0], new Properties());
      outputPage(unparsedOutput, content, emptyText, emptyData);
    }
  } else {
    // Crawl-only mode: store the raw content without any parse records.
    FetcherOutput rawOutput = new FetcherOutput(
        fle, MD5Hash.digest(content.getContent()), FetcherOutput.SUCCESS);
    outputPage(rawOutput, content, null, null);
  }
}
/**
 * Writes placeholder output records for an entry that produced no
 * content, either because it was not fetched or because the fetch failed.
 *
 * <p>The FetcherOutput carries the given status and the MD5 digest of
 * the URL string; the Content record has an empty body and content type.
 * When parsing is enabled, empty ParseText and ParseData records are
 * supplied as well; otherwise both are null.
 *
 * @param fle    the fetch list entry to record
 * @param status the FetcherOutput status code to store
 */
private void handleNoFetch(FetchListEntry fle, int status) {
  final String pageUrl = fle.getPage().getURL().toString();
  final MD5Hash urlHash = MD5Hash.digest(pageUrl);
  final boolean parseEnabled = Fetcher.this.parsing;

  final FetcherOutput output = new FetcherOutput(fle, urlHash, status);
  final Content emptyContent =
      new Content(pageUrl, pageUrl, new byte[0], "", new Properties());

  ParseText text = null;
  ParseData data = null;
  if (parseEnabled) {
    text = new ParseText("");
    data = new ParseData("", new Outlink[0], new Properties());
  }

  outputPage(output, emptyContent, text, data);
}
private void outputPage(FetcherOutput fo, Content content,
ParseText text, ParseData parseData) {
try {
synchronized (fetcherWriter) {
fetcherWriter.append(fo);
contentWriter.append(content);
if (Fetcher.this.parsing) {
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -