?? segmentreader.java
字號:
if (parseTextReader != null) {
if (pt != null) {
if (parseTextReader.get(n, pt) == null) valid = false;
} else parseTextReader.seek(n);
}
if (parseDataReader != null) {
if (pd != null) {
if (parseDataReader.get(n, pd) == null) valid = false;
} else parseDataReader.seek(n);
}
key = n;
return valid;
}
// Scratch objects that receive the data when the caller passes null to next().
private ParseText _pt = new ParseText();
private ParseData _pd = new ParseData();

/**
 * Advance every open reader by one entry.
 *
 * <p>A reader that is open is always advanced, even when the caller passes
 * {@code null} for the matching storage object; in that case the entry is read
 * into an internal scratch object. This keeps all streams in step and lets the
 * whole entry be validated.
 *
 * <p>The current key is incremented on every call, whatever the outcome.
 *
 * @param pt receives the parse text, or {@code null} to discard it
 * @param pd receives the parse data, or {@code null} to discard it
 * @return {@code true} if every open reader returned a non-null entry,
 *         {@code false} if at least one of them returned {@code null}
 * @throws IOException if one of the underlying readers fails
 */
public synchronized boolean next(ParseText pt, ParseData pd) throws IOException {
  // Pick the caller's objects when given, the scratch objects otherwise.
  final ParseText textTarget = (pt != null) ? pt : _pt;
  final ParseData dataTarget = (pd != null) ? pd : _pd;

  boolean textOk = true;
  if (parseTextReader != null) {
    textOk = parseTextReader.next(textTarget) != null;
  }

  // The data reader is advanced even if the text reader already failed.
  boolean dataOk = true;
  if (parseDataReader != null) {
    dataOk = parseDataReader.next(dataTarget) != null;
  }

  key++;
  return textOk && dataOk;
}
/**
 * Position every open reader at entry {@code n} and record {@code n} as the
 * current key.
 *
 * @param n the position to seek to
 * @throws IOException if one of the underlying readers fails
 */
public synchronized void seek(long n) throws IOException {
  if (parseTextReader != null) {
    parseTextReader.seek(n);
  }
  if (parseDataReader != null) {
    parseDataReader.seek(n);
  }
  key = n;
}
/**
 * Return the current key position.
 *
 * <p>Synchronized like next() and seek(), which update the key while holding
 * the same lock: an unsynchronized read of a {@code long} field that is not
 * volatile may observe a stale or torn value.
 *
 * @return the key as last set by a positioning or read operation
 */
public synchronized long key() {
  return key;
}
/**
 * Reset every open reader.
 *
 * <p>NOTE(review): the current key is not modified here, so key() keeps its
 * previous value after a reset -- confirm that this is intended.
 *
 * @throws IOException if one of the underlying readers fails
 */
public synchronized void reset() throws IOException {
  if (parseTextReader != null) {
    parseTextReader.reset();
  }
  if (parseDataReader != null) {
    parseDataReader.reset();
  }
}
/**
 * Close every open reader.
 *
 * <p>Any exception raised while closing a reader is swallowed, so a failure
 * on the first reader does not prevent the second one from being closed and
 * nothing is ever propagated to the caller.
 */
public synchronized void close() {
  if (parseTextReader != null) {
    try {
      parseTextReader.close();
    } catch (Exception ignored) {
      // swallowed: closing continues with the next reader
    }
  }
  if (parseDataReader != null) {
    try {
      parseDataReader.close();
    } catch (Exception ignored) {
      // swallowed: nothing is reported to the caller
    }
  }
}
/**
* Dump the segment's content in human-readable format.
* @param sorted if true, sort segment entries by URL (ascending). If false,
* output entries in the order they occur in the segment.
* @param output where to dump to
* @throws Exception
*/
/****
public synchronized void dump(boolean sorted, PrintStream output) throws Exception {
reset();
FetcherOutput fo = new FetcherOutput();
Content co = new Content();
ParseData pd = new ParseData();
ParseText pt = new ParseText();
long recNo = 0L;
if (!sorted) {
while(next(pt, pd)) {
output.println("Recno:: " + recNo++);
output.println("FetcherOutput::\n" + fo.toString());
if (parseDataReader != null)
output.println("ParseData::\n" + pd.toString());
if (parseTextReader != null)
output.println("ParseText::\n" + pt.toString());
output.println("");
}
} else {
File unsortedFile = new File(segmentDir, ".unsorted");
File sortedFile = new File(segmentDir, ".sorted");
nfs.delete(unsortedFile);
nfs.delete(sortedFile);
SequenceFile.Writer seqWriter = new SequenceFile.Writer(nfs,
unsortedFile.toString(), UTF8.class, LongWritable.class);
FetchListEntry fle;
LongWritable rec = new LongWritable();
UTF8 url = new UTF8();
String urlString;
while (fetcherReader.next(fo) != null) {
fle = fo.getFetchListEntry();
urlString = fle.getPage().getURL().toString();
rec.set(recNo);
url.set(urlString);
seqWriter.append(url, rec);
recNo++;
}
seqWriter.close();
// sort the SequenceFile
long start = System.currentTimeMillis();
SequenceFile.Sorter sorter = new SequenceFile.Sorter(nfs,
new UTF8.Comparator(), LongWritable.class);
sorter.sort(unsortedFile.toString(), sortedFile.toString());
float localSecs = (System.currentTimeMillis() - start) / 1000.0f;
LOG.info(" - sorted: " + recNo + " entries in " + localSecs + "s, "
+ (recNo/localSecs) + " entries/s");
nfs.delete(unsortedFile);
SequenceFile.Reader seqReader = new SequenceFile.Reader(nfs, sortedFile.toString());
while (seqReader.next(url, rec)) {
recNo = rec.get();
get(recNo, fo, co, pt, pd);
output.println("Recno:: " + recNo++);
output.println("FetcherOutput::\n" + fo.toString());
if (contentReader != null)
output.println("Content::\n" + co.toString());
if (parseDataReader != null)
output.println("ParseData::\n" + pd.toString());
if (parseTextReader != null)
output.println("ParseText::\n" + pt.toString());
output.println("");
}
seqReader.close();
nfs.delete(sortedFile);
}
}
***/
/**
 * Command-line wrapper. Run without arguments to see usage help.
 *
 * <p>Recognized options: {@code -fix}, {@code -list}, {@code -noparsetext},
 * {@code -noparsedata}, {@code -dir <segments>}. The options {@code -dump},
 * {@code -dumpsort} and {@code -nocontent} are accepted but currently have no
 * effect: the dump call below is commented out and the content flag is not
 * passed to the reader. Every other non-null argument is taken as a segment
 * directory.
 *
 * <p>Each segment is opened, counted and closed again; a failure on one
 * segment is logged and the remaining segments are still processed.
 *
 * @param args command-line arguments
 * @throws Exception if argument parsing by the file system fails
 */
public static void main(String[] args) throws Exception {
  if (args.length == 0) {
    usage();
    return;
  }
  NutchFileSystem nfs = NutchFileSystem.parseArgs(args, 0);
  String segDir = null;
  Vector dirs = new Vector();
  boolean fix = false;
  boolean list = false;
  // dump / sorted / withContent are parsed so the options stay accepted,
  // but nothing below uses them while dump() is commented out.
  boolean dump = false;
  boolean sorted = false;
  boolean withParseText = true;
  boolean withParseData = true;
  boolean withContent = true;
  for (int i = 0; i < args.length; i++) {
    // Entries may be null -- presumably cleared by parseArgs above; skip them.
    if (args[i] == null) {
      continue;
    }
    if (args[i].equals("-noparsetext")) {
      withParseText = false;
    } else if (args[i].equals("-noparsedata")) {
      withParseData = false;
    } else if (args[i].equals("-nocontent")) {
      withContent = false;
    } else if (args[i].equals("-fix")) {
      fix = true;
    } else if (args[i].equals("-dump")) {
      dump = true;
    } else if (args[i].equals("-dumpsort")) {
      dump = true;
      sorted = true;
    } else if (args[i].equals("-list")) {
      list = true;
    } else if (args[i].equals("-dir")) {
      // "-dir" needs a value; without this check a trailing "-dir" would
      // fail with ArrayIndexOutOfBoundsException.
      if (i + 1 >= args.length) {
        LOG.warn("Missing argument for -dir option.");
        usage();
        return;
      }
      segDir = args[++i];
    } else {
      dirs.add(new File(args[i]));
    }
  }
  if (segDir != null) {
    File sDir = new File(segDir);
    if (!sDir.exists() || !sDir.isDirectory()) {
      LOG.warn("Invalid path: " + sDir);
    } else {
      // Every sub-directory of the -dir argument is treated as a segment.
      File[] files = sDir.listFiles(new FileFilter() {
        public boolean accept(File f) {
          return f.isDirectory();
        }
      });
      if (files != null && files.length > 0) {
        for (int i = 0; i < files.length; i++) {
          dirs.add(files[i]);
        }
      }
    }
  }
  if (dirs.size() == 0) {
    LOG.warn("No input segment dirs.");
    usage();
    return;
  }
  long total = 0L;
  int cnt = 0;
  DecimalFormat df = new DecimalFormat("########");
  df.setParseIntegerOnly(true);
  if (list) {
    // Header matches the two columns actually printed per segment.
    LOG.info("COUNT\tDIR NAME");
  }
  for (int i = 0; i < dirs.size(); i++) {
    File dir = (File) dirs.get(i);
    SegmentReader reader = null;
    try {
      reader = new SegmentReader(nfs, dir,
          withParseText, withParseData, fix);
      if (list) {
        LOG.info(df.format(reader.size) + "\t" + dir);
      }
      total += reader.size;
      cnt++;
      //if (dump) reader.dump(sorted, System.out);
    } catch (Throwable t) {
      // Include the segment and the exception itself: getMessage() alone
      // may be null and hides the exception type.
      LOG.error("Error reading segment " + dir + ": " + t);
    } finally {
      // Release the readers of this segment before opening the next one.
      if (reader != null) {
        reader.close();
      }
    }
  }
  if (list) {
    LOG.info("TOTAL: " + total + " entries in " + cnt + " segments.");
  }
}
/** Print the command-line synopsis and a description of every option to standard error. */
private static void usage() {
  System.err.println("SegmentReader [-fix] [-dump] [-dumpsort] [-list] [-nocontent] [-noparsedata] [-noparsetext] (-dir segments | seg1 seg2 ...)");
  System.err.println("\tNOTE: at least one segment dir name is required, or '-dir' option.");
  System.err.println("\t-fix\t\tautomatically fix corrupted segments");
  System.err.println("\t-dump\t\tdump segment data in human-readable format");
  System.err.println("\t-dumpsort\tdump segment data in human-readable format, sorted by URL");
  System.err.println("\t-list\t\tprint useful information about segments");
  System.err.println("\t-nocontent\tignore content data");
  System.err.println("\t-noparsedata\tignore parse_data data");
  // This entry was previously labelled "-nocontent" a second time.
  System.err.println("\t-noparsetext\tignore parse_text data");
  System.err.println("\t-dir segments\tdirectory containing multiple segments");
  System.err.println("\tseg1 seg2 ...\tsegment directories\n");
}
}
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -