?? jerichoextractorhtml.java
字號:
/* JerichoExtractorHTML * * Copyright (C) 2006 Olaf Freyer * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * $Id: JerichoExtractorHTML.java 4726 2006-11-15 17:57:11Z stack-sf $ */package org.archive.crawler.extractor;import java.util.ArrayList;import java.util.Collection;import java.util.Iterator;import java.util.LinkedList;import java.util.List;import java.util.logging.Level;import java.util.logging.Logger;import org.apache.commons.httpclient.URIException;import org.apache.commons.lang.StringEscapeUtils;import org.archive.crawler.datamodel.CoreAttributeConstants;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.datamodel.RobotsHonoringPolicy;import org.archive.net.UURI;import org.archive.net.UURIFactory;import org.archive.util.DevUtils;import org.archive.util.TextUtils;import au.id.jericho.lib.html.Attribute;import au.id.jericho.lib.html.Attributes;import au.id.jericho.lib.html.Element;import au.id.jericho.lib.html.FormControl;import au.id.jericho.lib.html.FormControlType;import au.id.jericho.lib.html.FormField;import au.id.jericho.lib.html.FormFields;import au.id.jericho.lib.html.HTMLElementName;import au.id.jericho.lib.html.Source;import au.id.jericho.lib.html.StartTagType;/** * Improved link-extraction from an HTML content-body using jericho-html parser. 
* This extractor extends ExtractorHTML and mimics its workflow - but has some * substantial differences when it comes to internal implementation. Instead * of heavily relying upon java regular expressions it uses a real html parser * library - namely Jericho HTML Parser (http://jerichohtml.sourceforge.net). * Using this parser it can better handle broken html (i.e. missing quotes) * and also offer improved extraction of HTML form URLs (not only extract * the action of a form, but also its default values). * Unfortunately this parser also has one major drawback - it has to read the * whole document into memory for parsing, thus has an inherent OOME risk. * This OOME risk can be reduced/eleminated by limiting the size of documents * to be parsed (i.e. using NotExceedsDocumentLengthTresholdDecideRule). * Also note that this extractor seems to have a lower overall memory * consumption compared to ExtractorHTML. (still to be confirmed on a larger * scale crawl) * * @author Olaf Freyer * @version $Date: 2006-11-15 17:57:11 +0000 (Wed, 15 Nov 2006) $ $Revision: 4726 $ */public class JerichoExtractorHTML extends ExtractorHTML implements CoreAttributeConstants { private static final long serialVersionUID = 1684681316546343615L; private Logger logger = Logger.getLogger(this.getClass().getName()); protected long numberOfFormsProcessed = 0; public JerichoExtractorHTML(String name) { this(name, "Jericho-HTML extractor. Extracts links from HTML " + "documents using Jericho HTML Parser. Offers same " + "basic functionality as ExtractorHTML but better " + "handles broken HTML and extraction of default " + "values from HTML forms. A word of warning: the used " + "parser, the Jericho HTML Parser, reads the whole " + "document into memory for " + "parsing - thus this extractor has an inherent OOME risk. " + "This OOME risk can be reduced/eleminated by limiting the " + "size of documents to be parsed (i.e. using " + "NotExceedsDocumentLengthTresholdDecideRule). 
"); } public JerichoExtractorHTML(String name, String description) { super(name, description); } private static List<Attribute> findOnAttributes(Attributes attributes) { List<Attribute> result = new LinkedList<Attribute>(); for (Iterator attrIter = attributes.iterator(); attrIter.hasNext();) { Attribute attr = (Attribute) attrIter.next(); if (attr.getKey().startsWith("on")) result.add(attr); } return result; } protected void processGeneralTag(CrawlURI curi, Element element, Attributes attributes) { Attribute attr; String attrValue; List attrList; String elementName = element.getName(); // Just in case it's an OBJECT or APPLET tag String codebase = null; ArrayList<String> resources = null; final boolean framesAsEmbeds = ((Boolean) getUncheckedAttribute(curi, ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue(); final boolean ignoreFormActions = ((Boolean) getUncheckedAttribute( curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue(); final boolean overlyEagerLinkDetection = ((Boolean)getUncheckedAttribute( curi, ATTR_OVERLY_EAGER_LINK_DETECTION)).booleanValue(); // HREF if (((attr = attributes.get("href")) != null) && ((attrValue = attr.getValue()) != null)) { CharSequence context = Link.elementContext(elementName, attr .getKey()); if ("link".equals(elementName)) { // <LINK> elements treated as embeds (css, ico, etc) processEmbed(curi, attrValue, context); } else { // other HREFs treated as links processLink(curi, attrValue, context); } if ("base".equals(elementName)) { try { curi.setBaseURI(attrValue); } catch (URIException e) { if (getController() != null) { // Controller can be null: e.g. when running // ExtractorTool. 
getController().logUriError(e, curi.getUURI(), attrValue); } else { logger.info("Failed set base uri: " + curi + ", " + attrValue + ": " + e.getMessage()); } } } } // ACTION else if (((attr = attributes.get("action")) != null) && ((attrValue = attr.getValue()) != null)) { if (!ignoreFormActions) { CharSequence context = Link.elementContext(elementName, attr .getKey()); processLink(curi, attrValue, context); } } // ON_ else if ((attrList = findOnAttributes(attributes)).size() != 0) { for (Iterator attrIter = attrList.iterator(); attrIter.hasNext();) { attr = (Attribute) attrIter.next(); CharSequence valueSegment = attr.getValueSegment(); if (valueSegment != null) processScriptCode(curi, valueSegment); } } // SRC atc. else if ((((attr = attributes.get("src")) != null) || ((attr = attributes.get("lowsrc")) != null) || ((attr = attributes.get("background")) != null) || ((attr = attributes.get("cite")) != null) || ((attr = attributes.get("longdesc")) != null) || ((attr = attributes.get("usemap")) != null) || ((attr = attributes.get("profile")) != null) || ((attr = attributes.get("datasrc")) != null)) && ((attrValue = attr.getValue()) != null)) { final char hopType; CharSequence context = Link.elementContext(elementName, attr .getKey()); if (!framesAsEmbeds && ("frame".equals(elementName) || "iframe" .equals(elementName))) hopType = Link.NAVLINK_HOP; else hopType = Link.EMBED_HOP; processEmbed(curi, attrValue, context, hopType); } // CODEBASE else if (((attr = attributes.get("codebase")) != null) && ((attrValue = attr.getValue()) != null)) { codebase = StringEscapeUtils.unescapeHtml(attrValue); CharSequence context = Link.elementContext(elementName, attr .getKey()); processEmbed(curi, codebase, context); } // CLASSID DATA else if ((((attr = attributes.get("classid")) != null) || ((attr = attributes.get("data")) != null)) && ((attrValue = attr.getValue()) != null)) { if (resources == null) resources = new ArrayList<String>(); resources.add(attrValue); } // ARCHIVE else if 
(((attr = attributes.get("archive")) != null) && ((attrValue = attr.getValue()) != null)) { if (resources == null) resources = new ArrayList<String>(); String[] multi = TextUtils.split(WHITESPACE, attrValue); for (int i = 0; i < multi.length; i++) { resources.add(multi[i]); } } // CODE else if (((attr = attributes.get("code")) != null) && ((attrValue = attr.getValue()) != null)) { if (resources == null)
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -