亚洲欧美第一页_禁久久精品乱码_粉嫩av一区二区三区免费野_久草精品视频

? 歡迎來到蟲蟲下載站! | ?? 資源下載 ?? 資源專輯 ?? 關於我們
? 蟲蟲下載站

?? jerichoextractorhtml.java

?? 這是個爬蟲和Lucene相結合最好了
?? JAVA
?? 第 1 頁 / 共 2 頁
字號:
/* JerichoExtractorHTML *  * Copyright (C) 2006 Olaf Freyer * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA * * $Id: JerichoExtractorHTML.java 4726 2006-11-15 17:57:11Z stack-sf $ */package org.archive.crawler.extractor;import java.util.ArrayList;import java.util.Collection;import java.util.Iterator;import java.util.LinkedList;import java.util.List;import java.util.logging.Level;import java.util.logging.Logger;import org.apache.commons.httpclient.URIException;import org.apache.commons.lang.StringEscapeUtils;import org.archive.crawler.datamodel.CoreAttributeConstants;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.datamodel.RobotsHonoringPolicy;import org.archive.net.UURI;import org.archive.net.UURIFactory;import org.archive.util.DevUtils;import org.archive.util.TextUtils;import au.id.jericho.lib.html.Attribute;import au.id.jericho.lib.html.Attributes;import au.id.jericho.lib.html.Element;import au.id.jericho.lib.html.FormControl;import au.id.jericho.lib.html.FormControlType;import au.id.jericho.lib.html.FormField;import au.id.jericho.lib.html.FormFields;import au.id.jericho.lib.html.HTMLElementName;import au.id.jericho.lib.html.Source;import au.id.jericho.lib.html.StartTagType;/** * Improved link-extraction from an HTML content-body using jericho-html 
parser. * This extractor extends ExtractorHTML and mimics its workflow - but has some * substantial differences when it comes to internal implementation. Instead * of heavily relying upon java regular expressions it uses a real html parser * library - namely Jericho HTML Parser (http://jerichohtml.sourceforge.net). * Using this parser it can better handle broken html (i.e. missing quotes) * and also offer improved extraction of HTML form URLs (not only extract * the action of a form, but also its default values). * Unfortunately this parser also has one major drawback - it has to read the * whole document into memory for parsing, thus has an inherent OOME risk. * This OOME risk can be reduced/eleminated by limiting the size of documents * to be parsed (i.e. using NotExceedsDocumentLengthTresholdDecideRule). * Also note that this extractor seems to have a lower overall memory  * consumption compared to ExtractorHTML. (still to be confirmed on a larger  * scale crawl)  *  * @author Olaf Freyer * @version $Date: 2006-11-15 17:57:11 +0000 (Wed, 15 Nov 2006) $ $Revision: 4726 $ */public class JerichoExtractorHTML extends ExtractorHTML implements        CoreAttributeConstants {    private static final long serialVersionUID = 1684681316546343615L;    private Logger logger = Logger.getLogger(this.getClass().getName());    protected long numberOfFormsProcessed = 0;    public JerichoExtractorHTML(String name) {        this(name, "Jericho-HTML extractor. Extracts links from HTML " +                "documents using Jericho HTML Parser. Offers same " +                 "basic functionality as ExtractorHTML but better " +                "handles broken HTML and extraction of default " +                "values from HTML forms. A word of warning: the used " +                "parser, the Jericho HTML Parser, reads the whole " +                "document into memory for " +                "parsing - thus this extractor has an inherent OOME risk. 
" +                "This OOME risk can be reduced/eleminated by limiting the " +                "size of documents to be parsed (i.e. using " +                "NotExceedsDocumentLengthTresholdDecideRule). ");    }    public JerichoExtractorHTML(String name, String description) {        super(name, description);    }    private static List<Attribute> findOnAttributes(Attributes attributes) {        List<Attribute> result = new LinkedList<Attribute>();        for (Iterator attrIter = attributes.iterator(); attrIter.hasNext();) {            Attribute attr = (Attribute) attrIter.next();            if (attr.getKey().startsWith("on"))                result.add(attr);        }        return result;    }    protected void processGeneralTag(CrawlURI curi, Element element,            Attributes attributes) {        Attribute attr;        String attrValue;        List attrList;        String elementName = element.getName();        // Just in case it's an OBJECT or APPLET tag        String codebase = null;        ArrayList<String> resources = null;        final boolean framesAsEmbeds = ((Boolean) getUncheckedAttribute(curi,                ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();        final boolean ignoreFormActions = ((Boolean) getUncheckedAttribute(                curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();        final boolean overlyEagerLinkDetection =            ((Boolean)getUncheckedAttribute(                curi, ATTR_OVERLY_EAGER_LINK_DETECTION)).booleanValue();        // HREF        if (((attr = attributes.get("href")) != null) &&            ((attrValue = attr.getValue()) != null)) {            CharSequence context = Link.elementContext(elementName, attr                    .getKey());            if ("link".equals(elementName)) {                // <LINK> elements treated as embeds (css, ico, etc)                processEmbed(curi, attrValue, context);            } else {                // other HREFs treated as links                processLink(curi, 
attrValue, context);            }            if ("base".equals(elementName)) {                try {                    curi.setBaseURI(attrValue);                } catch (URIException e) {                    if (getController() != null) {                        // Controller can be null: e.g. when running                        // ExtractorTool.                        getController().logUriError(e, curi.getUURI(),                                attrValue);                    } else {                        logger.info("Failed set base uri: " + curi + ", "                                + attrValue + ": " + e.getMessage());                    }                }            }        }        // ACTION        else if (((attr = attributes.get("action")) != null) &&                 ((attrValue = attr.getValue()) != null)) {            if (!ignoreFormActions) {                CharSequence context = Link.elementContext(elementName, attr                        .getKey());                processLink(curi, attrValue, context);            }        }        // ON_        else if ((attrList = findOnAttributes(attributes)).size() != 0) {            for (Iterator attrIter = attrList.iterator(); attrIter.hasNext();) {                attr = (Attribute) attrIter.next();                CharSequence valueSegment = attr.getValueSegment();                if (valueSegment != null)                    processScriptCode(curi, valueSegment);            }        }        // SRC atc.        
else if ((((attr = attributes.get("src")) != null)                || ((attr = attributes.get("lowsrc")) != null)                || ((attr = attributes.get("background")) != null)                || ((attr = attributes.get("cite")) != null)                || ((attr = attributes.get("longdesc")) != null)                || ((attr = attributes.get("usemap")) != null)                || ((attr = attributes.get("profile")) != null)                || ((attr = attributes.get("datasrc")) != null)) &&                   ((attrValue = attr.getValue()) != null)) {            final char hopType;            CharSequence context = Link.elementContext(elementName, attr                    .getKey());            if (!framesAsEmbeds                    && ("frame".equals(elementName) || "iframe"                            .equals(elementName)))                hopType = Link.NAVLINK_HOP;            else                hopType = Link.EMBED_HOP;            processEmbed(curi, attrValue, context, hopType);        }        // CODEBASE        else if (((attr = attributes.get("codebase")) != null) &&                 ((attrValue = attr.getValue()) != null)) {            codebase = StringEscapeUtils.unescapeHtml(attrValue);            CharSequence context = Link.elementContext(elementName, attr                    .getKey());            processEmbed(curi, codebase, context);        }        // CLASSID DATA        else if ((((attr = attributes.get("classid")) != null)                || ((attr = attributes.get("data")) != null)) &&                   ((attrValue = attr.getValue()) != null)) {            if (resources == null)                resources = new ArrayList<String>();            resources.add(attrValue);        }        // ARCHIVE        else if (((attr = attributes.get("archive")) != null) &&                 ((attrValue = attr.getValue()) != null)) {            if (resources == null)                resources = new ArrayList<String>();            String[] multi = TextUtils.split(WHITESPACE, 
attrValue);            for (int i = 0; i < multi.length; i++) {                resources.add(multi[i]);            }        }        // CODE        else if (((attr = attributes.get("code")) != null) &&                 ((attrValue = attr.getValue()) != null)) {            if (resources == null)

?? 快捷鍵說明

複製代碼 Ctrl + C
搜索代碼 Ctrl + F
全屏模式 F11
切換主題 Ctrl + Shift + D
顯示快捷鍵 ?
增大字號 Ctrl + =
減小字號 Ctrl + -
亚洲欧美第一页_禁久久精品乱码_粉嫩av一区二区三区免费野_久草精品视频
日日夜夜一区二区| 欧美一区二区大片| 国产女人18毛片水真多成人如厕| 日韩激情中文字幕| 欧美日韩一区二区三区在线| 亚洲欧美日韩电影| 在线亚洲一区观看| 亚洲精品中文在线观看| 91污片在线观看| 亚洲色图.com| 在线日韩一区二区| 亚洲精品国产a久久久久久| 一本一本久久a久久精品综合麻豆| 欧美激情一区二区三区全黄| 懂色av一区二区三区免费观看| 久久久亚洲精华液精华液精华液| 久久成人精品无人区| 精品国内二区三区| 国产美女娇喘av呻吟久久| 久久无码av三级| 风间由美一区二区三区在线观看| 国产亚洲视频系列| 国产一区二区三区高清播放| 国产丝袜美腿一区二区三区| 成人小视频在线观看| **欧美大码日韩| 欧美日韩国产三级| 亚洲成人7777| 日韩三级免费观看| 精品少妇一区二区三区日产乱码 | 美女一区二区在线观看| 91麻豆精品国产91久久久| 久久精品国产99| 日本一区二区三区免费乱视频| 成人激情图片网| 艳妇臀荡乳欲伦亚洲一区| 在线电影欧美成精品| 久久福利视频一区二区| www精品美女久久久tv| 99精品久久99久久久久| 日韩福利视频网| 国产欧美中文在线| 欧美在线视频日韩| 国内精品久久久久影院色| 国产精品久久久久久久久久免费看| 色综合久久久久综合| 亚洲第一主播视频| wwwwxxxxx欧美| 在线一区二区视频| 美女一区二区三区| 亚洲天堂免费看| 欧美一区三区四区| 99热在这里有精品免费| 日韩制服丝袜av| 中文字幕不卡在线播放| 欧美日韩国产首页| 成人动漫中文字幕| 日本美女一区二区| 久久综合九色综合久久久精品综合| 大白屁股一区二区视频| 日韩在线一二三区| 国产精品久久久久aaaa樱花 | 久久久久久久久久久久久女国产乱 | 美腿丝袜在线亚洲一区| 中文字幕国产精品一区二区| 日韩一区二区三免费高清| 一本一道综合狠狠老| 国产在线不卡一区| 亚洲第一福利视频在线| 国产精品电影一区二区| 精品国产乱码久久久久久久久| 国产一区二区在线视频| 天天影视色香欲综合网老头| 国产精品无人区| 久久久久久免费| 日韩一区二区三区视频| 欧美唯美清纯偷拍| 99视频热这里只有精品免费| 国产精品18久久久久久久网站| 最新中文字幕一区二区三区| xfplay精品久久| 欧美一区二区三区性视频| 欧美亚洲综合另类| 91免费版pro下载短视频| 粗大黑人巨茎大战欧美成人| 黄色成人免费在线| 免费在线一区观看| 日韩国产在线观看一区| 一区二区国产视频| 一区二区成人在线观看| 亚洲少妇中出一区| 亚洲视频精选在线| 亚洲毛片av在线| 一区二区三区不卡视频 | 色婷婷亚洲一区二区三区| 成人激情免费视频| 国产另类ts人妖一区二区| 狠狠色丁香九九婷婷综合五月| 蜜臀av性久久久久蜜臀aⅴ流畅 | 午夜欧美电影在线观看| 亚洲欧洲色图综合| 亚洲精品美腿丝袜| 亚洲精品国产成人久久av盗摄 | 国产午夜一区二区三区| 日本一区二区三区在线观看| 欧美成人aa大片| 国产午夜亚洲精品午夜鲁丝片| 久久综合色鬼综合色| 国产亚洲短视频| 国产精品久久免费看| 亚洲欧美激情在线| 亚洲国产欧美日韩另类综合 | 盗摄精品av一区二区三区| 极品少妇xxxx精品少妇偷拍| 国产最新精品精品你懂的| 国产精品自拍av| 国产成人久久精品77777最新版本| 国产呦萝稀缺另类资源| 成人av电影免费在线播放| 91国模大尺度私拍在线视频| 欧美性xxxxx极品少妇| 欧美一卡二卡三卡四卡| 精品国产乱码久久久久久闺蜜| 337p日本欧洲亚洲大胆精品| 国产精品美女久久久久久久久| 亚洲欧洲国产日本综合| 午夜激情久久久| 国产一区 二区 三区一级| 色噜噜狠狠成人网p站| 久久久亚洲国产美女国产盗摄 | 国产欧美日本一区二区三区| 一区二区免费视频| 国产凹凸在线观看一区二区| 欧美日韩国产一区二区三区地区| 久久久九九九九| 免费高清视频精品| 欧美三区在线观看| 国产精品美女久久久久久久久| 免费高清在线一区| 欧美色手机在线观看| 中文字幕一区二区三区精华液| 久久99精品国产91久久来源| 欧美久久一二区| 亚洲欧洲综合另类在线| 丁香另类激情小说| 久久亚洲一区二区三区明星换脸| 天堂精品中文字幕在线| 色综合 综合色| 亚洲色图欧洲色图| 成人h精品动漫一区二区三区| 精品成人一区二区三区| 五月天激情综合网| 色综合久久久久网| 亚洲欧美另类久久久精品2019| 国产不卡在线一区| 国产欧美日韩综合精品一区二区| 激情小说欧美图片| 欧美一区三区四区| 日本女人一区二区三区| 欧美日韩电影一区| 亚洲v精品v日韩v欧美v专区| 欧美色综合网站| 
亚洲综合网站在线观看| 91福利国产成人精品照片| 亚洲欧美日韩电影| 色乱码一区二区三区88| 亚洲欧洲日韩av| 91丨九色丨国产丨porny| 国产精品久久午夜| 一本色道亚洲精品aⅴ| 亚洲最新视频在线播放| 欧美性感一区二区三区| 亚洲成年人网站在线观看| 69堂亚洲精品首页| 奇米影视7777精品一区二区| 日韩午夜电影av| 国内精品视频666| 国产日韩欧美不卡| 95精品视频在线| 亚洲精品videosex极品| 欧美性猛交一区二区三区精品| 亚洲成av人片一区二区三区| 欧美一区日韩一区| 国产一区91精品张津瑜| 亚洲欧洲国产专区| 欧美三级韩国三级日本三斤 | 久久综合av免费| 成人高清视频免费观看| 亚洲三级免费电影| 欧美性三三影院| 麻豆极品一区二区三区| 欧美国产日韩亚洲一区| 在线观看国产91| 麻豆国产精品一区二区三区| 国产欧美日韩三区| 欧美日韩一区高清| 国模冰冰炮一区二区| 亚洲男人的天堂网| 日韩欧美精品三级| 91欧美激情一区二区三区成人|