// segmentreader.java
// (web-page residue from the code viewer: "Font size:" control label)
package net.nutch.segment;
import java.io.*;
import java.text.DecimalFormat;
import java.text.SimpleDateFormat;
import java.util.Vector;
//import java.util.logging.Logger;
import org.apache.log4j.*;
import net.nutch.fetcher.FetcherOutput;
import net.nutch.io.ArrayFile;
import net.nutch.io.LongWritable;
import net.nutch.io.MapFile;
import net.nutch.fs.*;
import net.nutch.parse.ParseData;
import net.nutch.parse.ParseText;
import net.nutch.protocol.Content;
/**
* This class holds together all data readers for an existing segment.
* Some convenience methods are also provided, to read from the segment and
* to reposition the current pointer.
*
* @author Andrzej Bialecki &lt;ab@getopt.org&gt;
*/
public class SegmentReader {
/** Shared log4j logger, obtained under the name "segment". */
public static final Logger LOG = Logger.getLogger("segment");
/** Reader for the ParseText data; assigned by init() only when withParseText is true. */
public ArrayFile.Reader parseTextReader;
/** Reader for the ParseData data; assigned by init() only when withParseData is true. */
public ArrayFile.Reader parseDataReader;
// Disabled field, kept for reference: the time when fetching of this segment
// started, as recorded in fetcher output data.
// (Plain comment on purpose, so it is not picked up as documentation of the
// next live field.)
//public long started = 0L;
// Disabled field, kept for reference: the time when fetching of this segment
// finished, as recorded in fetcher output data.
//public long finished = 0L;
/**
 * Entry counter maintained by init(): it is advanced while init() seeks
 * through successive keys of the parse-text reader.
 */
public long size = 0L;
// NOTE(review): not referenced in the visible part of this file; presumably
// the current read position used by the seek/next methods - confirm.
private long key = -1L;
/** Segment directory, as passed to init(). */
public File segmentDir;
/** Filesystem used to access the segment, as passed to init(). */
public NutchFileSystem nfs;
/**
* Open a segment for reading. If the segment is corrupted, do not attempt to fix it.
* @param dir directory containing segment data
* @throws Exception
*/
public SegmentReader(File dir) throws Exception {
// Delegates to the five-argument constructor with a new LocalFileSystem,
// withParseText = true, withParseData = true and autoFix = false.
this(new LocalFileSystem(), dir, true, true, false);
}
/**
* Open a segment for reading. If segment is corrupted, do not attempt to fix it.
* @param nfs filesystem
* @param dir directory containing segment data
* @throws Exception
*/
public SegmentReader(NutchFileSystem nfs, File dir) throws Exception {
// Delegates to the five-argument constructor on the supplied filesystem with
// withParseText = true, withParseData = true and autoFix = false.
this(nfs, dir, true, true, false);
}
/**
* Open a segment for reading.
* @param dir directory containing segment data
* @param autoFix if true, and the segment is corrupted, attempt to
* fix errors and try to open it again. If the segment is corrupted, and
* autoFix is false, or it was not possible to correct errors, an Exception is
* thrown.
* @throws Exception
*/
public SegmentReader(File dir, boolean autoFix) throws Exception {
// Delegates to the five-argument constructor with a new LocalFileSystem,
// withParseText = true, withParseData = true; autoFix is passed through.
this(new LocalFileSystem(), dir, true, true, autoFix);
}
/**
* Open a segment for reading.
* @param nfs filesystem
* @param dir directory containing segment data
* @param autoFix if true, and the segment is corrupted, attempt to
* fix errors and try to open it again. If the segment is corrupted, and
* autoFix is false, or it was not possible to correct errors, an Exception is
* thrown.
* @throws Exception
*/
public SegmentReader(NutchFileSystem nfs, File dir, boolean autoFix) throws Exception {
// Delegates to the five-argument constructor on the supplied filesystem with
// withParseText = true, withParseData = true; autoFix is passed through.
this(nfs, dir, true, true, autoFix);
}
/**
* Open a segment for reading. When a segment is open, its total size is checked
* and cached in this class - however, only by actually reading entries one can
* be sure about the exact number of valid, non-corrupt entries.
*
* <p>If the segment was created with no-parse option (see {@link FetcherOutput#DIR_NAME_NP})
* then automatically withParseText and withParseData will be forced to false.</p>
*
* @param nfs NutchFileSystem to use
* @param dir directory containing segment data
* @param withParseText if true, read ParseText, otherwise ignore it
* @param withParseData if true, read ParseData, otherwise ignore it
* @param autoFix if true, and the segment is corrupt, try to automatically fix it.
* If this parameter is false, and the segment is corrupt, or fixing was unsuccessful,
* an Exception is thrown.
* @throws Exception
*/
public SegmentReader(NutchFileSystem nfs, File dir,
        boolean withParseText, boolean withParseData,
        boolean autoFix) throws Exception {
    try {
        init(nfs, dir, withParseText, withParseData);
    } catch (Exception e) {
        // The segment could not be opened. A repair is attempted only when the
        // caller asked for it; fixSegment() reports failure by returning false.
        boolean fixed = autoFix
                && fixSegment(nfs, dir, withParseText, withParseData, false);
        if (!fixed) {
            // Chain the original failure so the root cause is not lost.
            throw new Exception("Segment " + dir + " is corrupted.", e);
        }
        // The failed init() may already have advanced the entry counter;
        // restart from zero so the second attempt does not over-count.
        size = 0L;
        init(nfs, dir, withParseText, withParseData);
    }
}
/**
 * Tell whether a segment contains the regular fetcher output directory
 * ({@link FetcherOutput#DIR_NAME}) or the no-parse variant
 * ({@link FetcherOutput#DIR_NAME_NP}).
 *
 * @param nfs filesystem used to check for the directories
 * @param segdir segment directory
 * @return true if the {@link FetcherOutput#DIR_NAME} directory exists, false
 * if only the {@link FetcherOutput#DIR_NAME_NP} directory exists
 * @throws Exception if neither directory exists (or neither is a directory)
 */
public static boolean isParsedSegment(NutchFileSystem nfs, File segdir) throws Exception {
    // The regular directory is checked first, so it wins if both are present.
    File parsedDir = new File(segdir, FetcherOutput.DIR_NAME);
    if (nfs.exists(parsedDir) && nfs.isDirectory(parsedDir)) {
        return true;
    }
    File unparsedDir = new File(segdir, FetcherOutput.DIR_NAME_NP);
    if (nfs.exists(unparsedDir) && nfs.isDirectory(unparsedDir)) {
        return false;
    }
    throw new Exception("Missing or invalid '" + FetcherOutput.DIR_NAME + "' or '"
            + FetcherOutput.DIR_NAME_NP + "' directory in " + segdir);
}
/**
* Attempt to fix a partially corrupted segment. Currently this means just
* fixing broken MapFile's, using {@link MapFile#fix(NutchFileSystem, File, Class, Class, boolean)}
* method.
* @param nfs filesystem
* @param dir segment directory
* @param withParseText if true, fix parse_text, otherwise ignore it
* @param withParseData if true, fix parse_data, otherwise ignore it
* @param dryrun if true, only show what would be done without performing any actions
* @return true if the fix procedure ran to completion, false if any error occurred
*/
public static boolean fixSegment(NutchFileSystem nfs, File dir,
        boolean withParseText, boolean withParseData,
        boolean dryrun) {
    // Prefix that marks every log line produced by a dry run.
    String dr = dryrun ? "[DRY RUN] " : "";
    File parseData = new File(dir, ParseData.DIR_NAME);
    File parseText = new File(dir, ParseText.DIR_NAME);
    try {
        File fetcherOutput;
        if (isParsedSegment(nfs, dir)) {
            fetcherOutput = new File(dir, FetcherOutput.DIR_NAME);
        } else {
            // No-parse segment: the parse directories are not touched,
            // whatever the caller requested.
            fetcherOutput = new File(dir, FetcherOutput.DIR_NAME_NP);
            withParseText = false;
            withParseData = false;
        }
        // NOTE(review): MapFile.fix() presumably returns -1 when the file
        // needed no fixing - confirm against MapFile.
        long cnt = MapFile.fix(nfs, fetcherOutput, LongWritable.class, FetcherOutput.class, dryrun);
        if (cnt != -1) {
            LOG.info(dr + " - fixed " + fetcherOutput.getName());
        }
        if (withParseData) {
            cnt = MapFile.fix(nfs, parseData, LongWritable.class, ParseData.class, dryrun);
            if (cnt != -1) {
                LOG.info(dr + " - fixed " + parseData.getName());
            }
        }
        if (withParseText) {
            cnt = MapFile.fix(nfs, parseText, LongWritable.class, ParseText.class, dryrun);
            if (cnt != -1) {
                LOG.info(dr + " - fixed " + parseText.getName());
            }
        }
        LOG.info(dr + "Finished fixing " + dir.getName());
        return true;
    } catch (Throwable t) {
        // Best effort: any failure is reported as "not fixed". The throwable is
        // handed to the logger so the stack trace is kept, not only the message.
        LOG.warn(dr + "Unable to fix segment " + dir.getName() + ": " + t.getMessage(), t);
        return false;
    }
}
/**
 * Open the requested readers for a segment and count its entries.
 *
 * <p>The count is taken on the parse-text reader when it was requested,
 * otherwise on the parse-data reader. When neither reader was requested
 * there is nothing to count and {@link #size} is left untouched.</p>
 *
 * @param nfs filesystem to use
 * @param dir directory containing segment data
 * @param withParseText if true, open the ParseText reader
 * @param withParseData if true, open the ParseData reader
 * @throws Exception if a reader cannot be opened or reset
 */
private void init(NutchFileSystem nfs, File dir,
        boolean withParseText, boolean withParseData) throws Exception {
    segmentDir = dir;
    this.nfs = nfs;
    if (withParseText) {
        parseTextReader = new ArrayFile.Reader(nfs, new File(dir, ParseText.DIR_NAME).toString());
    }
    if (withParseData) {
        parseDataReader = new ArrayFile.Reader(nfs, new File(dir, ParseData.DIR_NAME).toString());
    }
    // Count on a reader that was actually opened. Dereferencing
    // parseTextReader unconditionally fails with a NullPointerException
    // whenever withParseText is false.
    ArrayFile.Reader countingReader = null;
    if (withParseText) {
        countingReader = parseTextReader;
    } else if (withParseData) {
        countingReader = parseDataReader;
    }
    if (countingReader == null) {
        return;
    }
    // count the number of valid entries.
    // XXX We assume that all other data files contain the
    // XXX same number of valid entries - which is not always
    // XXX true if Fetcher crashed in the middle of update.
    // XXX One should check for this later, when actually
    // XXX reading the entries.
    LongWritable w = new LongWritable();
    w.set(++size);
    try {
        while (countingReader.seek(w)) {
            w.set(++size);
        }
    } catch (Throwable eof) {
        // the file is truncated - probably due to a crashed fetcher.
        // Use just the part that we can...
        LOG.warn(" - data in segment " + dir + " is corrupt, using only " + size + " entries.");
    }
    // Rewind so that subsequent reads start from the beginning.
    countingReader.reset();
}
/**
* Get a specified entry from the segment. Note: even if some of the storage objects
* are null, but if respective readers are open a seek(n) operation will be performed
* anyway, to ensure that the whole entry is valid.
*
* @param n position of the entry
* @param fo storage for FetcherOutput data. Must not be null.
* @param co storage for Content data, or null.
* @param pt storage for ParseText data, or null.
* @param pd storage for ParseData data, or null.
* @return true if all requested data successfully read, false otherwise
* @throws IOException
*/
public synchronized boolean get(long n, ParseText pt, ParseData pd) throws IOException {
//XXX a trivial implementation would be to do the following:
//XXX seek(n);
//XXX return next(fo, co, pt, pd);
//XXX However, get(long, Writable) may be more optimized
boolean valid = true;
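// [Web-page residue from the code viewer, not part of the source; the listing
// is truncated at this point.]
// Keyboard shortcuts: copy code - Ctrl + C; search code - Ctrl + F;
// full-screen mode - F11; switch theme - Ctrl + Shift + D; show shortcuts - ?;
// increase font size - Ctrl + =; decrease font size - Ctrl + -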