// webdbreader.java
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.db;
import java.io.*;
import java.util.*;
import java.nio.channels.*;
import net.nutch.io.*;
import net.nutch.fs.*;
import net.nutch.util.*;
import net.nutch.pagedb.*;
import net.nutch.linkdb.*;
/**********************************************
* The WebDBReader implements all the read-only
* parts of accessing our web database.
* All the writing ones can be found in WebDBWriter.
*
* @author Mike Cafarella
**********************************************/
public class WebDBReader implements IWebDBReader {
static final Page[] PAGE_RECORDS = new Page[0];
static final Link[] LINK_RECORDS = new Link[0];
// filenames
static final String PAGES_BY_URL = "pagesByURL";
static final String PAGES_BY_MD5 = "pagesByMD5";
static final String LINKS_BY_URL = "linksByURL";
static final String LINKS_BY_MD5 = "linksByMD5";
static final String STATS_FILE = "stats";
NutchFileSystem nfs;
File dbDir, dbFile;
MapFile.Reader pagesByURL, pagesByMD5, linksByURL, linksByMD5;
long totalPages = 0, totalLinks = 0;
Vector mapReaders = null, setReaders = null;
FileInputStream dbReadLockData;
FileLock dbReadLock;
/**
 * Open a web db reader for the named directory.
 *
 * Takes a non-exclusive lock on "dbreadlock" inside dbDir, opens the four
 * table readers under dbDir/webdb and, when a stats file is present, loads
 * the page and link totals from it. If anything fails after the lock has
 * been obtained, the readers opened so far are closed and the lock is
 * released before the exception reaches the caller, so a failed open does
 * not leave the database locked.
 *
 * @param nfs   file system used for locking and for all file access
 * @param dbDir directory that holds the "webdb" directory and the lock file
 * @throws IOException if the lock cannot be obtained, a table cannot be
 *                     opened, or the stats file cannot be read
 */
public WebDBReader(NutchFileSystem nfs, File dbDir) throws IOException, FileNotFoundException {
    this.nfs = nfs;
    this.dbDir = dbDir;
    this.dbFile = new File(dbDir, "webdb");

    // Obtain read lock on db so writers don't try to
    // move it out from under us. This obtains a non-exclusive
    // lock on the directory that holds the dbs (old and new)
    File readLock = new File(dbDir, "dbreadlock");
    nfs.lock(readLock, true);

    boolean opened = false;
    try {
        this.pagesByURL = new MapFile.Reader(nfs, new File(dbFile, PAGES_BY_URL).getPath(), new UTF8.Comparator());
        this.pagesByMD5 = new MapFile.Reader(nfs, new File(dbFile, PAGES_BY_MD5).getPath(), new Page.Comparator());
        this.linksByURL = new MapFile.Reader(nfs, new File(dbFile, LINKS_BY_URL).getPath(), new Link.UrlComparator());
        this.linksByMD5 = new MapFile.Reader(nfs, new File(dbFile, LINKS_BY_MD5).getPath(), new Link.MD5Comparator());

        // Load in statistics
        File stats = new File(dbFile, STATS_FILE);
        if (nfs.exists(stats)) {
            DataInputStream in = new DataInputStream(nfs.open(stats));
            try {
                // The file starts with a one-byte version marker. Its value
                // is not used here, but it must be consumed so the two
                // longs are read from the right offset.
                in.read();
                this.totalPages = in.readLong();
                this.totalLinks = in.readLong();
            } finally {
                in.close();
            }
        }

        // Create vectors so we can GC readers used by
        // enum() calls. We do this so we can have multiple
        // simultaneous enum users. However, since we keep
        // a handle to each one, we're assuming that we don't
        // create too many before WebDBReader.close() is called.
        this.mapReaders = new Vector();
        this.setReaders = new Vector();

        opened = true;
    } finally {
        if (!opened) {
            // Undo the partial open. Failures in here are deliberately
            // ignored so that the exception which aborted the open is the
            // one the caller sees.
            MapFile.Reader[] partial = { pagesByURL, pagesByMD5, linksByURL, linksByMD5 };
            for (int i = 0; i < partial.length; i++) {
                if (partial[i] != null) {
                    try {
                        partial[i].close();
                    } catch (IOException ignored) {
                        // best-effort cleanup
                    }
                }
            }
            try {
                nfs.release(readLock);
            } catch (IOException ignored) {
                // best-effort cleanup
            }
        }
    }
}
/**
 * Shutdown: close the four table readers and every reader that was handed
 * out for an enumeration, then release the read lock.
 *
 * A reader that fails to close does not prevent the remaining readers from
 * being closed or the lock from being released. The first IOException
 * encountered is rethrown once everything has been attempted.
 *
 * @throws IOException the first failure hit while closing a reader or
 *                     releasing the lock
 */
public void close() throws IOException {
    IOException firstFailure = null;
    try {
        MapFile.Reader[] tables = { pagesByURL, pagesByMD5, linksByURL, linksByMD5 };
        for (int i = 0; i < tables.length; i++) {
            try {
                tables[i].close();
            } catch (IOException ie) {
                if (firstFailure == null) {
                    firstFailure = ie;
                }
            }
        }

        for (Enumeration e = mapReaders.elements(); e.hasMoreElements(); ) {
            MapFile.Reader tmp = (MapFile.Reader) e.nextElement();
            try {
                tmp.close();
            } catch (IOException ie) {
                if (firstFailure == null) {
                    firstFailure = ie;
                }
            }
        }

        for (Enumeration e = setReaders.elements(); e.hasMoreElements(); ) {
            SetFile.Reader tmp = (SetFile.Reader) e.nextElement();
            try {
                tmp.close();
            } catch (IOException ie) {
                if (firstFailure == null) {
                    firstFailure = ie;
                }
            }
        }
    } finally {
        // release the lock, even if something unexpected escaped above
        try {
            nfs.release(new File(dbDir, "dbreadlock"));
        } catch (IOException ie) {
            if (firstFailure == null) {
                firstFailure = ie;
            }
        }
    }

    if (firstFailure != null) {
        throw firstFailure;
    }
}
/**
 * Look up a Page in the URL-keyed table.
 *
 * The URL is wrapped in a UTF8 key and handed to the pagesByURL reader
 * together with a fresh Page; whatever that reader's get() returns is
 * passed back, cast to Page.
 * NOTE(review): presumably null when the URL is absent; confirm against
 * MapFile.Reader.get().
 *
 * @param url the URL to look up
 * @throws IOException if the underlying reader fails
 */
public Page getPage(String url) throws IOException {
    UTF8 key = new UTF8(url);
    Page holder = new Page();
    return (Page) pagesByURL.get(key, holder);
}
/**
 * Collect every Page whose content hash equals the given MD5.
 *
 * The MD5-keyed reader is positioned with a probe Page carrying the hash,
 * then read forward; reading stops at the first entry with a different
 * hash or when the reader has nothing more to give.
 *
 * @param md5 content hash to match
 * @return the matching Pages in the order they were read; empty if none
 * @throws IOException if the underlying reader fails
 */
public Page[] getPages(MD5Hash md5) throws IOException {
    Vector matches = new Vector(3);

    Page cursor = new Page();
    cursor.getMD5().set(md5);
    pagesByMD5.seek(cursor);

    for (;;) {
        if (!pagesByMD5.next(cursor, NullWritable.get())) {
            break;
        }
        if (cursor.getMD5().compareTo(md5) != 0) {
            break;
        }
        // Keep this one and read the next entry into a fresh object so
        // the stored Page is not overwritten.
        matches.add(cursor);
        cursor = new Page();
    }

    // Copy out of the vector into a typed array
    return (Page[]) matches.toArray(PAGE_RECORDS);
}
/**
 * Report whether at least one Page with the given content hash is in the
 * database, without returning the Page itself.
 *
 * @param md5 content hash to look for
 * @return true if the first entry read after seeking to the hash carries
 *         that same hash; false otherwise
 * @throws IOException if the underlying reader fails
 */
public boolean pageExists(MD5Hash md5) throws IOException {
    Page probe = new Page();
    probe.getMD5().set(md5);
    pagesByMD5.seek(probe);

    if (!pagesByMD5.next(probe, NullWritable.get())) {
        // nothing to read after the seek
        return false;
    }
    return probe.getMD5().compareTo(md5) == 0;
}
/**
 * Iterate through all the Pages, sorted by URL.
 *
 * Each call opens its own reader on the pagesByURL table, so several
 * enumerations can be in progress at once. The reader is remembered in
 * mapReaders and is closed by close().
 *
 * @return an Enumeration whose elements are Page objects
 * @throws IOException if the table cannot be opened
 */
public Enumeration pages() throws IOException {
    // Use the shared filename constant rather than repeating the literal.
    MapFile.Reader tmpReader = new MapFile.Reader(nfs, new File(dbFile, PAGES_BY_URL).getPath());
    mapReaders.add(tmpReader);
    return new TableEnumerator(tmpReader);
}
//
// The TableEnumerator walks every entry of a table (a MapFile) and
// hands out the Page value of each one. It always holds the next Page
// in advance, so hasMoreElements() is a simple null check.
//
class TableEnumerator implements Enumeration {
    MapFile.Reader reader;
    Page nextItem;

    /**
     * Remember the reader and read the first item ahead of time.
     */
    public TableEnumerator(MapFile.Reader reader) {
        this.reader = reader;
        this.nextItem = readAhead();
    }

    /**
     * There is more to return as long as an item is held in store.
     */
    public boolean hasMoreElements() {
        return (nextItem != null);
    }

    /**
     * Hand out the item held in store and read the following one so it is
     * ready for the next call.
     *
     * @throws NoSuchElementException if nothing is left
     */
    public Object nextElement() {
        if (nextItem == null) {
            throw new NoSuchElementException("PageDB Enumeration");
        }
        Page current = nextItem;
        this.nextItem = readAhead();
        return current;
    }

    /**
     * Read the next entry into a fresh Page.
     *
     * Returns null when the reader reports no further entry.
     * NOTE(review): an IOException is also turned into null, so a read
     * error is indistinguishable from the end of the table for callers.
     */
    private Page readAhead() {
        Page candidate = new Page();
        try {
            if (reader.next(new UTF8(), candidate)) {
                return candidate;
            }
            return null;
        } catch (IOException ie) {
            return null;
        }
    }
}
/**
 * Iterate through all the Pages, sorted by MD5.
 *
 * Each call opens its own SetFile reader on the pagesByMD5 table. The
 * reader is remembered in setReaders and is closed by close().
 *
 * @return an Enumeration whose elements are Page objects
 * @throws IOException if the table cannot be opened
 */
public Enumeration pagesByMD5() throws IOException {
    // Use the shared filename constant rather than repeating the literal.
    SetFile.Reader tmpReader = new SetFile.Reader(nfs, new File(dbFile, PAGES_BY_MD5).getPath());
    setReaders.add(tmpReader);
    return new IndexEnumerator(tmpReader);
}
/**
 * Return the number of pages we're dealing with.
 *
 * This is the totalPages value, which the constructor reads from the
 * stats file when that file exists and which otherwise stays at 0.
 * It is not recomputed from the tables here.
 *
 * @return the stored page total
 */
public long numPages() {
return totalPages;
}
//
// The IndexEnumerator goes through all the entries
// in the index (which is a SequenceFile).
//
class IndexEnumerator implements Enumeration {
SetFile.Reader reader;
Page nextItem;
/**
* Start the cursor and find the first item.
* Store it for later return.
*/
public IndexEnumerator(SetFile.Reader reader) {
this.reader = reader;
this.nextItem = new Page();
try {
if (! reader.next(nextItem)) {
this.nextItem = null;
}
} catch (IOException ie) {
this.nextItem = null;
}
}
/**
 * If there's no item left in store, we've hit the end.
 *
 * @return true while a read-ahead item is held in nextItem
 */
public boolean hasMoreElements() {
return (nextItem != null);
}
/**
* Set aside the item we have in store. Then retrieve
* another for the next time we're called. Finally, return
* the set-aside item.
*/
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -