PreconditionEnforcer.java
/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * SimplePolitenessEnforcer.java
 * Created on May 22, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/prefetch/PreconditionEnforcer.java,v 1.25 2006/08/30 21:24:45 stack-sf Exp $
 */
package org.archive.crawler.prefetch;

import java.util.Iterator;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlServer;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.CredentialStore;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.credential.Credential;
import org.archive.crawler.datamodel.credential.CredentialAvatar;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.net.UURI;

/**
 * Ensures the preconditions for a fetch -- such as DNS lookup
 * or acquiring and respecting a robots.txt policy -- are
 * satisfied before a URI is passed to subsequent stages.
 *
 * @author gojomo
 */
public class PreconditionEnforcer extends Processor
        implements CoreAttributeConstants, FetchStatusCodes {

    private static final Logger logger =
        Logger.getLogger(PreconditionEnforcer.class.getName());

    private final static Integer DEFAULT_IP_VALIDITY_DURATION =
        new Integer(60 * 60 * 6);   // six hours
    private final static Integer DEFAULT_ROBOTS_VALIDITY_DURATION =
        new Integer(60 * 60 * 24);  // one day

    /** seconds to keep IP information for */
    public final static String ATTR_IP_VALIDITY_DURATION =
        "ip-validity-duration-seconds";
    /** seconds to cache robots info */
    public final static String ATTR_ROBOTS_VALIDITY_DURATION =
        "robot-validity-duration-seconds";

    /** whether to calculate robots exclusion without applying it */
    public final static Boolean DEFAULT_CALCULATE_ROBOTS_ONLY = Boolean.FALSE;
    public final static String ATTR_CALCULATE_ROBOTS_ONLY =
        "calculate-robots-only";

    public PreconditionEnforcer(String name) {
        super(name, "Precondition enforcer");

        Type e;
        e = addElementToDefinition(new SimpleType(ATTR_IP_VALIDITY_DURATION,
            "The minimum interval for which a dns-record will be considered " +
            "valid (in seconds). " +
            "If the record's DNS TTL is larger, that will be used instead.",
            DEFAULT_IP_VALIDITY_DURATION));
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_ROBOTS_VALIDITY_DURATION,
            "The time in seconds that fetched robots.txt information is " +
            "considered to be valid. " +
            "If the value is set to '0', then the robots.txt information" +
            " will never expire.",
            DEFAULT_ROBOTS_VALIDITY_DURATION));
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_CALCULATE_ROBOTS_ONLY,
            "Whether to only calculate the robots status of a URI, " +
            "without actually applying any exclusions found. If true, " +
            "excluded URIs will only be annotated in the crawl.log, but " +
            "still fetched. Default is false.",
            DEFAULT_CALCULATE_ROBOTS_ONLY));
        e.setExpertSetting(true);
    }

    protected void innerProcess(CrawlURI curi) {
        if (considerDnsPreconditions(curi)) {
            return;
        }

        // Make sure we only process schemes we understand (i.e. not dns).
        String scheme = curi.getUURI().getScheme().toLowerCase();
        if (!(scheme.equals("http") || scheme.equals("https"))) {
            logger.fine("PolitenessEnforcer doesn't understand uri's of type "
                + scheme + " (ignoring)");
            return;
        }

        if (considerRobotsPreconditions(curi)) {
            return;
        }

        if (!curi.isPrerequisite() && credentialPrecondition(curi)) {
            return;
        }

        // OK, it's allowed.
        // For all curis that will in fact be fetched, set appropriate delays.
        // TODO: SOMEDAY: allow per-host, per-protocol, etc. factors
        // curi.setDelayFactor(getDelayFactorFor(curi));
        // curi.setMinimumDelay(getMinimumDelayFor(curi));
        return;
    }

    /**
     * Consider the robots precondition.
     *
     * @param curi CrawlURI we're checking for any required preconditions.
     * @return True, if this <code>curi</code> has a precondition or
     * processing should be terminated for some other reason. False if
     * we can proceed to process this URI.
     */
    private boolean considerRobotsPreconditions(CrawlURI curi) {
//        // treat /robots.txt fetches specially
//        UURI uuri = curi.getUURI();
//        try {
//            if (uuri != null && uuri.getPath() != null &&
//                    curi.getUURI().getPath().equals("/robots.txt")) {
//                // allow processing to continue
//                curi.setPrerequisite(true);
//                return false;
//            }
//        }
//        catch (URIException e) {
//            logger.severe("Failed get of path for " + curi);
//        }
//        // require /robots.txt if not present
//        if (isRobotsExpired(curi)) {
//            // Need to get robots
//            if (logger.isLoggable(Level.FINE)) {
//                logger.fine("No valid robots for " +
//                    getController().getServerCache().getServerFor(curi) +
//                    "; deferring " + curi);
//            }
//
//            // Robots expired - should be refetched even though it's
//            // already crawled.
//            try {
//                String prereq =
//                    curi.getUURI().resolve("/robots.txt").toString();
//                curi.markPrerequisite(prereq,
//                    getController().getPostprocessorChain());
//            }
//            catch (URIException e1) {
//                logger.severe("Failed resolve using " + curi);
//                throw new RuntimeException(e1); // shouldn't ever happen
//            }
//            return true;
//        }
//        // test against robots.txt if available
//        CrawlServer cs = getController().getServerCache().getServerFor(curi);
//        if (cs.isValidRobots()) {
//            String ua = getController().getOrder().getUserAgent(curi);
//            if (cs.getRobots().disallows(curi, ua)) {
//                if (((Boolean)getUncheckedAttribute(curi,
//                        ATTR_CALCULATE_ROBOTS_ONLY)).booleanValue() == true) {
//                    // annotate URI as excluded, but continue to process
//                    // normally
//                    curi.addAnnotation("robotExcluded");
//                    return false;
//                }
//                // mark as precluded; in FetchHTTP, this will
//                // prevent fetching and cause a skip to the end
//                // of processing (unless an intervening processor
//                // overrules)
//                curi.setFetchStatus(S_ROBOTS_PRECLUDED);
//                curi.putString("error", "robots.txt exclusion");
//                logger.fine("robots.txt precluded " + curi);
//                return true;
//            }
//            return false;
//        }
//        // No valid robots found => Attempt to get robots.txt failed
//        curi.skipToProcessorChain(getController().getPostprocessorChain());
//        curi.setFetchStatus(S_ROBOTS_PREREQUISITE_FAILURE);
//        curi.putString("error", "robots.txt prerequisite failed");
//        if (logger.isLoggable(Level.FINE)) {
//            logger.fine("robots.txt prerequisite failed " + curi);
//        }
//        return true;
        return false;
    }

    /**
     * @param curi CrawlURI whose dns prerequisite we're to check.
     * @return true if no further processing in this module should occur
     */
    private boolean considerDnsPreconditions(CrawlURI curi) {
        if (curi.getUURI().getScheme().equals("dns")) {
            // DNS URIs never have a DNS precondition.
            curi.setPrerequisite(true);
            return false;
        }

        CrawlServer cs = getController().getServerCache().getServerFor(curi);
        if (cs == null) {
            curi.setFetchStatus(S_UNFETCHABLE_URI);
            curi.skipToProcessorChain(getController().getPostprocessorChain());
            return true;
        }

        // If we've done a dns lookup and it didn't resolve a host,
        // cancel further fetch-processing of this URI, because
        // the domain is unresolvable.
        CrawlHost ch = getController().getServerCache().getHostFor(curi);
        if (ch == null || ch.hasBeenLookedUp() && ch.getIP() == null) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("no dns for " + ch +
                    " cancelling processing for CrawlURI " + curi.toString());
            }
            curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
            curi.skipToProcessorChain(getController().getPostprocessorChain());
            return true;
        }

        // If we haven't done a dns lookup and this isn't a dns uri,
        // shoot that off and defer further processing.
        if (isIpExpired(curi) && !curi.getUURI().getScheme().equals("dns")) {
            // NOTE: the original paste breaks off at this point; the rest of
            // this method is reconstructed from the stock Heritrix source.
            logger.fine("Deferring processing of CrawlURI " + curi.toString()
                + " for dns lookup.");
            String preq = "dns:" + ch.getHostName();
            try {
                curi.markPrerequisite(preq,
                    getController().getPostprocessorChain());
            } catch (URIException e) {
                throw new RuntimeException(e); // shouldn't ever happen
            }
            return true;
        }

        // DNS precondition is satisfied; fine to proceed.
        return false;
    }

    // The helper methods referenced above -- isIpExpired(), isRobotsExpired()
    // and credentialPrecondition() -- follow here in the full source file.
}
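Note that in this version the entire body of considerRobotsPreconditions() is commented out and the method simply returns false, so robots.txt is neither fetched as a prerequisite nor applied as an exclusion: every URI passes the robots check. The checks that remain active run in the order innerProcess() calls them: DNS resolution first, then the http/https scheme test, then robots, then credentials. Below is a minimal, self-contained sketch of that decision order. It does not use the Heritrix API; the class, enum, and method names (PreconditionOrderSketch, Verdict, evaluate) are illustrative stand-ins only.

// A standalone sketch of the precondition decision order applied by
// innerProcess() above. All names are illustrative, not Heritrix API.
public class PreconditionOrderSketch {

    enum Verdict {
        DEFER_FOR_DNS,         // a dns: prerequisite must be fetched first
        UNSUPPORTED_SCHEME,    // only http/https are checked by this module
        DEFER_FOR_ROBOTS,      // a /robots.txt prerequisite must be fetched
        DEFER_FOR_CREDENTIALS, // a login/credential prerequisite is pending
        READY_TO_FETCH
    }

    static Verdict evaluate(String scheme, boolean dnsResolved,
            boolean robotsSatisfied, boolean credentialsSatisfied) {
        if (!dnsResolved) {
            return Verdict.DEFER_FOR_DNS;
        }
        if (!scheme.equalsIgnoreCase("http")
                && !scheme.equalsIgnoreCase("https")) {
            return Verdict.UNSUPPORTED_SCHEME;
        }
        if (!robotsSatisfied) {
            return Verdict.DEFER_FOR_ROBOTS;
        }
        if (!credentialsSatisfied) {
            return Verdict.DEFER_FOR_CREDENTIALS;
        }
        return Verdict.READY_TO_FETCH;
    }

    public static void main(String[] args) {
        System.out.println(evaluate("https", true, true, true));  // READY_TO_FETCH
        System.out.println(evaluate("https", false, true, true)); // DEFER_FOR_DNS
        System.out.println(evaluate("ftp", true, true, true));    // UNSUPPORTED_SCHEME
    }
}

With the robots body disabled as in the file above, the robotsSatisfied input is effectively always true, so the only preconditions that can still defer a URI are DNS and credentials.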