?? webdbreader.java
字號:
public Object nextElement() {
    // Nothing was pre-fetched on the previous call: the enumeration is over.
    final Page current = nextItem;
    if (current == null) {
        throw new NoSuchElementException("PageDB Enumeration");
    }

    // Read one item ahead so that hasMoreElements() can answer without I/O.
    // The look-ahead object is installed before the read, exactly as a
    // fresh Page would be, and is discarded again if the read yields nothing.
    Page lookahead = new Page();
    this.nextItem = lookahead;
    boolean exhausted;
    try {
        exhausted = !reader.next(lookahead);
    } catch (IOException ie) {
        // A read failure is treated the same as reaching the end of the data.
        exhausted = true;
    }
    if (exhausted) {
        this.nextItem = null;
    }

    return current;
}
}
/**
 * Looks up every hyperlink that points TO the given URL.
 *
 * The URL-keyed link reader is positioned with a probe Link carrying
 * the requested URL, and entries are then read sequentially for as
 * long as their URL equals the requested one.  Note that this moves
 * the cursor of the shared linksByURL reader.
 *
 * @param url the URL to search for
 * @return the matching links, possibly an empty array
 * @throws IOException if the underlying reader fails
 */
public Link[] getLinks(UTF8 url) throws IOException {
    Vector matches = new Vector(3);

    // Build the probe key and move the reader to it.
    Link candidate = new Link();
    candidate.getURL().set(url);
    linksByURL.seek(candidate);

    // Stop at the end of the data or at the first entry for another URL.
    // Each accepted Link is kept in the result, so the next read needs a
    // fresh instance to fill.
    while (linksByURL.next(candidate, NullWritable.get())
           && url.equals(candidate.getURL())) {
        matches.add(candidate);
        candidate = new Link();
    }

    // Copy the collected links out into a typed array
    return (Link[]) matches.toArray(LINK_RECORDS);
}
/**
 * Looks up every link whose source ("from") ID is the given MD5 hash.
 *
 * The MD5-keyed link reader is positioned with a probe Link carrying
 * the requested hash, and entries are then read sequentially for as
 * long as their from-ID equals the requested one.  Note that this
 * moves the cursor of the shared linksByMD5 reader.
 *
 * @param md5 the source ID to search for
 * @return the matching links, possibly an empty array
 * @throws IOException if the underlying reader fails
 */
public Link[] getLinks(MD5Hash md5) throws IOException {
    Vector matches = new Vector(3);

    // Build the probe key and move the reader to it.
    Link candidate = new Link();
    candidate.getFromID().set(md5);
    linksByMD5.seek(candidate);

    // Stop at the end of the data or at the first entry with another ID.
    // Each accepted Link is kept in the result, so the next read needs a
    // fresh instance to fill.
    while (linksByMD5.next(candidate, NullWritable.get())
           && md5.equals(candidate.getFromID())) {
        matches.add(candidate);
        candidate = new Link();
    }

    // Copy the collected links out into a typed array
    return (Link[]) matches.toArray(LINK_RECORDS);
}
/**
 * Enumerates all links, reading them from the URL-keyed link file.
 *
 * The enumeration is backed directly by the shared linksByURL reader
 * rather than by a private copy.
 *
 * @return an Enumeration whose elements are Link objects
 */
public Enumeration links() {
    Enumeration byTargetUrl = new MapEnumerator(linksByURL);
    return byTargetUrl;
}
/**
 * Reports how many links the db holds, as recorded in totalLinks.
 *
 * @return the link count
 */
public long numLinks() {
    final long count = totalLinks;
    return count;
}
//
// Enumeration over the Link keys of a MapFile.Reader.
// This is the class handed out by links().
//
class MapEnumerator implements Enumeration {
    MapFile.Reader reader;
    Link nextItem;

    /**
     * Remembers the reader and pre-fetches the first item so that
     * hasMoreElements() can be answered immediately.
     */
    public MapEnumerator(MapFile.Reader reader) {
        this.reader = reader;
        advance();
    }

    /**
     * True for as long as a pre-fetched item is waiting to be returned.
     */
    public boolean hasMoreElements() {
        return nextItem != null;
    }

    /**
     * Returns the pre-fetched item and reads the following one
     * in preparation for the next call.
     *
     * @throws NoSuchElementException if the enumeration is exhausted
     */
    public Object nextElement() {
        final Link current = nextItem;
        if (current == null) {
            throw new NoSuchElementException("PageDB Enumeration");
        }
        advance();
        return current;
    }

    /**
     * Reads the next Link from the reader into nextItem.  nextItem is
     * left null when the reader has no more entries or when the read
     * fails with an IOException.
     */
    private void advance() {
        this.nextItem = new Link();
        boolean exhausted;
        try {
            exhausted = !reader.next(this.nextItem, NullWritable.get());
        } catch (IOException ie) {
            // A read failure is treated the same as reaching the end.
            exhausted = true;
        }
        if (exhausted) {
            this.nextItem = null;
        }
    }
}
/**
 * Command-line entry point with utilities for looking through the
 * contents of a webdb.
 *
 * Expected arguments: an optional filesystem selector, the db
 * directory, and then one command:
 * <ul>
 * <li><code>-pageurl url</code>: print the page stored for a URL</li>
 * <li><code>-pagemd5 md5</code>: print all pages with an MD5</li>
 * <li><code>-dumppageurl</code>: print every page from reader.pages()</li>
 * <li><code>-dumppagemd5</code>: print every page from reader.pagesByMD5()</li>
 * <li><code>-toppages k</code>: print the k highest-scoring pages,
 *     lowest score first; prints nothing when k is zero or negative</li>
 * <li><code>-linkurl url</code>: print the links pointing to a URL</li>
 * <li><code>-linkmd5 md5</code>: print the links from an MD5</li>
 * <li><code>-dumplinks</code>: print, per page, the URLs it links to</li>
 * <li><code>-stats</code>: print page and link counts</li>
 * </ul>
 *
 * The reader and the filesystem are closed before returning, also
 * when a command fails.
 *
 * @param argv the command-line arguments
 * @throws FileNotFoundException declared for callers; see IOException
 * @throws IOException if the db cannot be opened or read
 */
public static void main(String argv[]) throws FileNotFoundException, IOException {
    if (argv.length < 2) {
        System.out.println("Usage: java net.nutch.db.WebDBReader (-local | -ndfs <namenode:port>) <db> [-pageurl url] | [-pagemd5 md5] | [-dumppageurl] | [-dumppagemd5] | [-toppages <k>] | [-linkurl url] | [-linkmd5 md5] | [-dumplinks] | [-stats]");
        return;
    }

    // i is passed by value, so it is still 0 after parseArgs returns.
    // NOTE(review): presumably parseArgs removes the filesystem options
    // from argv so that the db path ends up at argv[0] -- confirm.
    int i = 0;
    NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i);
    try {
        File dbDir = new File(argv[i++]);
        WebDBReader reader = new WebDBReader(nfs, dbDir);
        try {
            String cmd = argv[i++];

            if ("-pageurl".equals(cmd)) {
                String url = argv[i++];
                System.out.println(reader.getPage(url.trim()));
            } else if ("-pagemd5".equals(cmd)) {
                MD5Hash md5 = new MD5Hash(argv[i++]);
                Page pages[] = reader.getPages(md5);
                System.out.println("Found " + pages.length + " pages.");
                for (int j = 0; j < pages.length; j++) {
                    System.out.println("Page " + j + ": " + pages[j]);
                }
            } else if ("-dumppageurl".equals(cmd)) {
                System.out.println(reader);
                System.out.println();
                int j = 1;
                for (Enumeration e = reader.pages(); e.hasMoreElements(); j++) {
                    Page page = (Page) e.nextElement();
                    System.out.println("Page " + j + ": " + page);
                    System.out.println();
                }
            } else if ("-dumppagemd5".equals(cmd)) {
                System.out.println(reader);
                System.out.println();
                int j = 1;
                for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); j++) {
                    Page page = (Page) e.nextElement();
                    System.out.println("Page " + j + ": " + page);
                    System.out.println();
                }
            } else if ("-toppages".equals(cmd)) {
                int topSize = Integer.parseInt(argv[i++]);

                // Sorted by ascending score, so first() is always the
                // weakest page currently held.
                SortedSet topSet = new TreeSet(new Comparator() {
                    public int compare(Object o1, Object o2) {
                        Page p1 = (Page) o1;
                        Page p2 = (Page) o2;
                        if (p1.getScore() < p2.getScore()) {
                            return -1;
                        } else if (p1.getScore() == p2.getScore()) {
                            // If two scores are equal, fall back to the
                            // regular Page comparison so that distinct
                            // pages with the same score are both kept.
                            // We don't want to uniquify by score!
                            return p1.compareTo(p2);
                        } else {
                            return 1;
                        }
                    }
                });

                // Find the top "topSize" elts.  With topSize <= 0 there is
                // nothing to collect; skipping the scan also avoids
                // dereferencing lowestPage before it was ever assigned.
                if (topSize > 0) {
                    Page lowestPage = null;
                    for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
                        Page curPage = (Page) e.nextElement();
                        if (topSet.size() < topSize) {
                            // Still filling up: accept unconditionally.
                            topSet.add(curPage);
                            lowestPage = (Page) topSet.first();
                        } else if (lowestPage.getScore() < curPage.getScore()) {
                            // Full: replace the weakest page with a better one.
                            topSet.remove(lowestPage);
                            topSet.add(curPage);
                            lowestPage = (Page) topSet.first();
                        }
                    }
                }

                // Print them out
                int j = 0;
                for (Iterator it = topSet.iterator(); it.hasNext(); j++) {
                    System.out.println("Page " + j + ": " + it.next());
                    System.out.println();
                }
            } else if ("-linkurl".equals(cmd)) {
                String url = argv[i++];
                Link links[] = reader.getLinks(new UTF8(url.trim()));
                System.out.println("Found " + links.length + " links.");
                for (int j = 0; j < links.length; j++) {
                    System.out.println("Link " + j + ": " + links[j]);
                }
            } else if ("-linkmd5".equals(cmd)) {
                MD5Hash fromID = new MD5Hash(argv[i++]);
                Link links[] = reader.getLinks(fromID);
                System.out.println("Found " + links.length + " links.");
                for (int j = 0; j < links.length; j++) {
                    System.out.println("Link " + j + ": " + links[j]);
                }
            } else if ("-dumplinks".equals(cmd)) {
                System.out.println(reader);
                System.out.println();
                // Walk all pages and look up the outgoing links of each
                // one by the page's MD5.
                Enumeration e = reader.pagesByMD5();
                while (e.hasMoreElements()) {
                    Page page = (Page) e.nextElement();
                    Link[] links = reader.getLinks(page.getMD5());
                    if (links.length > 0) {
                        System.out.println("from " + page.getURL());
                        for (int j = 0; j < links.length; j++) {
                            System.out.println(" to " + links[j].getURL());
                        }
                        System.out.println();
                    }
                }
            } else if ("-stats".equals(cmd)) {
                System.out.println("Stats for " + reader);
                System.out.println("-------------------------------");
                System.out.println("Number of pages: " + reader.numPages());
                System.out.println("Number of links: " + reader.numLinks());
            } else {
                System.out.println("Sorry, no command with name " + cmd);
            }
        } finally {
            reader.close();
        }
    } finally {
        // Runs even if the reader could not be opened or failed to close,
        // so the filesystem handle is never leaked.
        nfs.close();
    }
}
}
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -