PreconditionEnforcer.java
/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * SimplePolitenessEnforcer.java
 * Created on May 22, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/prefetch/PreconditionEnforcer.java,v 1.25 2006/08/30 21:24:45 stack-sf Exp $
 */
package org.archive.crawler.prefetch;

import java.util.Iterator;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlServer;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.CredentialStore;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.credential.Credential;
import org.archive.crawler.datamodel.credential.CredentialAvatar;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.net.UURI;

/**
 * Ensures the preconditions for a fetch -- such as DNS lookup
 * or acquiring and respecting a robots.txt policy -- are
 * satisfied before a URI is passed to subsequent stages.
 *
 * @author gojomo
 */
public class PreconditionEnforcer extends Processor
        implements CoreAttributeConstants, FetchStatusCodes {

    private static final Logger logger =
        Logger.getLogger(PreconditionEnforcer.class.getName());

    private final static Integer DEFAULT_IP_VALIDITY_DURATION =
        new Integer(60 * 60 * 6);   // six hours
    private final static Integer DEFAULT_ROBOTS_VALIDITY_DURATION =
        new Integer(60 * 60 * 24);  // one day

    /** seconds to keep IP information for */
    public final static String ATTR_IP_VALIDITY_DURATION =
        "ip-validity-duration-seconds";
    /** seconds to cache robots info */
    public final static String ATTR_ROBOTS_VALIDITY_DURATION =
        "robot-validity-duration-seconds";

    /** whether to calculate robots exclusion without applying it */
    public final static Boolean DEFAULT_CALCULATE_ROBOTS_ONLY = Boolean.FALSE;
    public final static String ATTR_CALCULATE_ROBOTS_ONLY =
        "calculate-robots-only";

    public PreconditionEnforcer(String name) {
        super(name, "Precondition enforcer");

        Type e;
        e = addElementToDefinition(new SimpleType(ATTR_IP_VALIDITY_DURATION,
            "The minimum interval for which a dns-record will be considered " +
            "valid (in seconds). " +
            "If the record's DNS TTL is larger, that will be used instead.",
            DEFAULT_IP_VALIDITY_DURATION));
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_ROBOTS_VALIDITY_DURATION,
            "The time in seconds that fetched robots.txt information is " +
            "considered to be valid. " +
            "If the value is set to '0', then the robots.txt information" +
            " will never expire.",
            DEFAULT_ROBOTS_VALIDITY_DURATION));
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_CALCULATE_ROBOTS_ONLY,
            "Whether to only calculate the robots status of a URI, " +
            "without actually applying any exclusions found. If true, " +
            "excluded URIs will only be annotated in the crawl.log, but " +
            "still fetched. Default is false.",
            DEFAULT_CALCULATE_ROBOTS_ONLY));
        e.setExpertSetting(true);
    }

    protected void innerProcess(CrawlURI curi) {
        if (considerDnsPreconditions(curi)) {
            return;
        }

        // Make sure we only process schemes we understand (i.e. not dns).
        String scheme = curi.getUURI().getScheme().toLowerCase();
        if (!(scheme.equals("http") || scheme.equals("https"))) {
            logger.fine("PolitenessEnforcer doesn't understand uri's of type "
                + scheme + " (ignoring)");
            return;
        }

        if (considerRobotsPreconditions(curi)) {
            return;
        }

        if (!curi.isPrerequisite() && credentialPrecondition(curi)) {
            return;
        }

        // OK, it's allowed.
        // For all curis that will in fact be fetched, set appropriate delays.
        // TODO: SOMEDAY: allow per-host, per-protocol, etc. factors
        // curi.setDelayFactor(getDelayFactorFor(curi));
        // curi.setMinimumDelay(getMinimumDelayFor(curi));
        return;
    }

    /**
     * Consider the robots precondition.
     *
     * @param curi CrawlURI we're checking for any required preconditions.
     * @return True, if this <code>curi</code> has a precondition or
     * processing should be terminated for some other reason. False if
     * we can proceed to process this URI.
     */
    private boolean considerRobotsPreconditions(CrawlURI curi) {
//        // treat /robots.txt fetches specially
//        UURI uuri = curi.getUURI();
//        try {
//            if (uuri != null && uuri.getPath() != null &&
//                    curi.getUURI().getPath().equals("/robots.txt")) {
//                // allow processing to continue
//                curi.setPrerequisite(true);
//                return false;
//            }
//        }
//        catch (URIException e) {
//            logger.severe("Failed get of path for " + curi);
//        }
//        // require /robots.txt if not present
//        if (isRobotsExpired(curi)) {
//            // Need to get robots
//            if (logger.isLoggable(Level.FINE)) {
//                logger.fine("No valid robots for " +
//                    getController().getServerCache().getServerFor(curi) +
//                    "; deferring " + curi);
//            }
//
//            // Robots expired - should be refetched even though it's
//            // already crawled.
//            try {
//                String prereq =
//                    curi.getUURI().resolve("/robots.txt").toString();
//                curi.markPrerequisite(prereq,
//                    getController().getPostprocessorChain());
//            }
//            catch (URIException e1) {
//                logger.severe("Failed resolve using " + curi);
//                throw new RuntimeException(e1); // shouldn't ever happen
//            }
//            return true;
//        }
//        // test against robots.txt if available
//        CrawlServer cs = getController().getServerCache().getServerFor(curi);
//        if (cs.isValidRobots()) {
//            String ua = getController().getOrder().getUserAgent(curi);
//            if (cs.getRobots().disallows(curi, ua)) {
//                if (((Boolean)getUncheckedAttribute(curi,
//                        ATTR_CALCULATE_ROBOTS_ONLY)).booleanValue() == true) {
//                    // annotate URI as excluded, but continue to process
//                    // normally
//                    curi.addAnnotation("robotExcluded");
//                    return false;
//                }
//                // mark as precluded; in FetchHTTP, this will
//                // prevent fetching and cause a skip to the end
//                // of processing (unless an intervening processor
//                // overrules)
//                curi.setFetchStatus(S_ROBOTS_PRECLUDED);
//                curi.putString("error", "robots.txt exclusion");
//                logger.fine("robots.txt precluded " + curi);
//                return true;
//            }
//            return false;
//        }
//        // No valid robots found => Attempt to get robots.txt failed
//        curi.skipToProcessorChain(getController().getPostprocessorChain());
//        curi.setFetchStatus(S_ROBOTS_PREREQUISITE_FAILURE);
//        curi.putString("error", "robots.txt prerequisite failed");
//        if (logger.isLoggable(Level.FINE)) {
//            logger.fine("robots.txt prerequisite failed " + curi);
//        }
//        return true;
        return false;
    }

    /**
     * @param curi CrawlURI whose dns prerequisite we're to check.
     * @return true if no further processing in this module should occur
     */
    private boolean considerDnsPreconditions(CrawlURI curi) {
        if (curi.getUURI().getScheme().equals("dns")) {
            // DNS URIs never have a DNS precondition.
            curi.setPrerequisite(true);
            return false;
        }

        CrawlServer cs = getController().getServerCache().getServerFor(curi);
        if (cs == null) {
            curi.setFetchStatus(S_UNFETCHABLE_URI);
            curi.skipToProcessorChain(getController().getPostprocessorChain());
            return true;
        }

        // If we've done a dns lookup and it didn't resolve a host,
        // cancel further fetch-processing of this URI, because
        // the domain is unresolvable.
        CrawlHost ch = getController().getServerCache().getHostFor(curi);
        if (ch == null || ch.hasBeenLookedUp() && ch.getIP() == null) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("no dns for " + ch +
                    " cancelling processing for CrawlURI " + curi.toString());
            }
            curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
            curi.skipToProcessorChain(getController().getPostprocessorChain());
            return true;
        }

        // If we haven't done a dns lookup and this isn't a dns uri,
        // shoot that off and defer further processing.
        if (isIpExpired(curi) && !curi.getUURI().getScheme().equals("dns")) {
            // NOTE: the original paste breaks off at this point; the rest of
            // this method is reconstructed from the stock Heritrix source.
            logger.fine("Deferring processing of CrawlURI " + curi.toString()
                + " for dns lookup.");
            String preq = "dns:" + ch.getHostName();
            try {
                curi.markPrerequisite(preq,
                    getController().getPostprocessorChain());
            } catch (URIException e) {
                throw new RuntimeException(e); // shouldn't ever happen
            }
            return true;
        }

        // DNS precondition is satisfied; fine to proceed.
        return false;
    }

    // The helper methods referenced above -- isIpExpired(), isRobotsExpired()
    // and credentialPrecondition() -- follow here in the full source file.
}
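Note that in this version the entire body of considerRobotsPreconditions() is commented out and the method simply returns false, so robots.txt is neither fetched as a prerequisite nor applied as an exclusion: every URI passes the robots check. The checks that remain active run in the order innerProcess() calls them: DNS resolution first, then the http/https scheme test, then robots, then credentials. Below is a minimal, self-contained sketch of that decision order. It does not use the Heritrix API; the class, enum, and method names (PreconditionOrderSketch, Verdict, evaluate) are illustrative stand-ins only.

// A standalone sketch of the precondition decision order applied by
// innerProcess() above. All names are illustrative, not Heritrix API.
public class PreconditionOrderSketch {

    enum Verdict {
        DEFER_FOR_DNS,         // a dns: prerequisite must be fetched first
        UNSUPPORTED_SCHEME,    // only http/https are checked by this module
        DEFER_FOR_ROBOTS,      // a /robots.txt prerequisite must be fetched
        DEFER_FOR_CREDENTIALS, // a login/credential prerequisite is pending
        READY_TO_FETCH
    }

    static Verdict evaluate(String scheme, boolean dnsResolved,
            boolean robotsSatisfied, boolean credentialsSatisfied) {
        if (!dnsResolved) {
            return Verdict.DEFER_FOR_DNS;
        }
        if (!scheme.equalsIgnoreCase("http")
                && !scheme.equalsIgnoreCase("https")) {
            return Verdict.UNSUPPORTED_SCHEME;
        }
        if (!robotsSatisfied) {
            return Verdict.DEFER_FOR_ROBOTS;
        }
        if (!credentialsSatisfied) {
            return Verdict.DEFER_FOR_CREDENTIALS;
        }
        return Verdict.READY_TO_FETCH;
    }

    public static void main(String[] args) {
        System.out.println(evaluate("https", true, true, true));  // READY_TO_FETCH
        System.out.println(evaluate("https", false, true, true)); // DEFER_FOR_DNS
        System.out.println(evaluate("ftp", true, true, true));    // UNSUPPORTED_SCHEME
    }
}

With the robots body disabled as in the file above, the robotsSatisfied input is effectively always true, so the only preconditions that can still defer a URI are DNS and credentials.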