?? heritrix.java
字號:
/* Heritrix * * $Id: Heritrix.java 6081 2008-12-09 00:58:14Z gojomo $ * * Created on May 15, 2003 * * Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */package org.archive.crawler;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.PrintStream;import java.io.PrintWriter;import java.net.HttpURLConnection;import java.net.InetAddress;import java.net.URL;import java.net.URLConnection;import java.net.UnknownHostException;import java.util.ArrayList;import java.util.Arrays;import java.util.Collection;import java.util.Collections;import java.util.Enumeration;import java.util.Hashtable;import java.util.Iterator;import java.util.List;import java.util.Map;import java.util.Properties;import java.util.StringTokenizer;import java.util.TimeZone;import java.util.Vector;import java.util.logging.Level;import java.util.logging.LogManager;import java.util.logging.Logger;import javax.management.Attribute;import javax.management.AttributeList;import javax.management.AttributeNotFoundException;import javax.management.DynamicMBean;import javax.management.InstanceAlreadyExistsException;import 
javax.management.InstanceNotFoundException;import javax.management.InvalidAttributeValueException;import javax.management.MBeanInfo;import javax.management.MBeanNotificationInfo;import javax.management.MBeanOperationInfo;import javax.management.MBeanRegistration;import javax.management.MBeanRegistrationException;import javax.management.MBeanServer;import javax.management.MBeanServerFactory;import javax.management.MalformedObjectNameException;import javax.management.NotCompliantMBeanException;import javax.management.ObjectName;import javax.management.ReflectionException;import javax.management.RuntimeOperationsException;import javax.management.openmbean.CompositeData;import javax.management.openmbean.CompositeDataSupport;import javax.management.openmbean.CompositeType;import javax.management.openmbean.OpenDataException;import javax.management.openmbean.OpenMBeanAttributeInfoSupport;import javax.management.openmbean.OpenMBeanConstructorInfoSupport;import javax.management.openmbean.OpenMBeanInfoSupport;import javax.management.openmbean.OpenMBeanOperationInfoSupport;import javax.management.openmbean.OpenMBeanParameterInfo;import javax.management.openmbean.OpenMBeanParameterInfoSupport;import javax.management.openmbean.OpenType;import javax.management.openmbean.SimpleType;import javax.management.openmbean.TabularData;import javax.management.openmbean.TabularDataSupport;import javax.management.openmbean.TabularType;import javax.naming.CompoundName;import javax.naming.Context;import javax.naming.NameNotFoundException;import javax.naming.NamingException;import javax.naming.NoInitialContextException;import org.apache.commons.cli.Option;import org.archive.crawler.admin.CrawlJob;import org.archive.crawler.admin.CrawlJobErrorHandler;import org.archive.crawler.admin.CrawlJobHandler;import org.archive.crawler.datamodel.CredentialStore;import org.archive.crawler.datamodel.credential.Credential;import org.archive.crawler.event.CrawlStatusListener;import 
org.archive.crawler.framework.AlertManager;import org.archive.crawler.framework.CrawlController;import org.archive.crawler.framework.exceptions.FatalConfigurationException;import org.archive.crawler.framework.exceptions.InitializationException;import org.archive.crawler.selftest.SelfTestCrawlJobHandler;import org.archive.crawler.settings.XMLSettingsHandler;import org.archive.io.SinkHandler;import org.archive.io.SinkHandlerLogRecord;import org.archive.net.UURI;import org.archive.util.FileUtils;import org.archive.util.IoUtils;import org.archive.util.JmxUtils;import org.archive.util.JndiUtils;import org.archive.util.PropertyUtils;import org.archive.util.TextUtils;import sun.net.www.protocol.file.FileURLConnection;/** * Main class for Heritrix crawler. * * Heritrix is usually launched by a shell script that backgrounds heritrix * that redirects all stdout and stderr emitted by heritrix to a log file. So * that startup messages emitted subsequent to the redirection of stdout and * stderr show on the console, this class prints usage or startup output * such as where the web UI can be found, etc., to a STARTLOG that the shell * script is waiting on. As soon as the shell script sees output in this file, * it prints its content and breaks out of its wait. * See ${HERITRIX_HOME}/bin/heritrix. * * <p>Heritrix can also be embedded or launched by webapp initialization or * by JMX bootstrapping. So far I count 4 methods of instantiation: * <ol> * <li>From this classes main -- the method usually used;</li> * <li>From the Heritrix UI (The local-instances.jsp) page;</li> * <li>A creation by a JMX agent at the behest of a remote JMX client; and</li> * <li>A container such as tomcat or jboss.</li> * </ol> * * @author gojomo * @author Kristinn Sigurdsson * @author Stack */public class Heritrix implements DynamicMBean, MBeanRegistration { /** * Heritrix logging instance. 
*/ private static final Logger logger = Logger.getLogger(Heritrix.class.getName()); private static final File TMPDIR = new File(System.getProperty("java.io.tmpdir", "/tmp")); /** * Name of the heritrix properties file. */ private static final String PROPERTIES = "heritrix.properties"; /** * Name of the key to use specifying alternate heritrix properties on * command line. */ private static final String PROPERTIES_KEY = PROPERTIES; /** * Prefix used on our properties we'll add to the System.properties list. */ private static final String HERITRIX_PROPERTIES_PREFIX = "heritrix."; /** * Prefix used on other properties we'll add to the System.properties * list (after stripping this prefix). */ private static final String SYSTEM_PREFIX = "system."; /** * Instance of web server if one was started. */ private static SimpleHttpServer httpServer = null; /** * CrawlJob handler. Manages multiple crawl jobs at runtime. */ private CrawlJobHandler jobHandler = null; /** * Heritrix start log file. * * This file contains standard out produced by this main class for startup * only. Used by heritrix shell script. Name here MUST match that in the * <code>bin/heritrix</code> shell script. This is a DEPENDENCY the shell * wrapper has on this here java heritrix. */ private static final String STARTLOG = "heritrix_dmesg.log"; /** * Default encoding. * * Used for content when fetching if none specified. */ public static final String DEFAULT_ENCODING = "ISO-8859-1"; /** * Heritrix stderr/stdout log file. * * This file should have nothing in it except messages over which we have * no control (JVM stacktrace, 3rd-party lib emissions). The wrapper * startup script directs stderr/stdout here. This is an INTERDEPENDENCY * this program has with the wrapper shell script. Shell can actually * pass us an alternate to use for this file. */ private static String DEFAULT_HERITRIX_OUT = "heritrix_out.log"; /** * Where to write this classes startup output. 
* * This out should only be used if Heritrix is being run from the * command-line. */ private static PrintWriter out = null; /** * The org.archive package */ private static final String ARCHIVE_PACKAGE = "org.archive."; /** * The crawler package. */ private static final String CRAWLER_PACKAGE = Heritrix.class.getName(). substring(0, Heritrix.class.getName().lastIndexOf('.')); /** * The root context for a webapp. */ private static final String ROOT_CONTEXT = "/"; /** * Set to true if application is started from command line. */ private static boolean commandLine = false; /** * True if container initialization has been run. */ private static boolean containerInitialized = false; /** * True if properties have been loaded. */ private static boolean propertiesLoaded = false; private static final String JAR_SUFFIX = ".jar"; private AlertManager alertManager; /** * The context of the GUI webapp. Default is root. */ private static String adminContext = ROOT_CONTEXT; /** * True if we're to put up a GUI. * Cmdline processing can override. */ private static boolean gui = !PropertyUtils.getBooleanProperty("heritrix.cmdline.nowui"); /** * Port to put the GUI up on. * Cmdline processing can override. */ private static int guiPort = SimpleHttpServer.DEFAULT_PORT; /** * A collection containing only localhost. Used as default value * for guiHosts, and passed to SimpleHttpServer when doing selftest. */ final private static Collection<String> LOCALHOST_ONLY = Collections.unmodifiableList(Arrays.asList(new String[] { "127.0.0.1" })); /** * Hosts to bind the GUI webserver to. * By default, only contains localhost. * Set to an empty collection to indicate that all available network * interfaces should be used for the webserver. */ private static Collection<String> guiHosts = LOCALHOST_ONLY; /** * Web UI server, realm, context name. */ private static String ADMIN = "admin"; // OpenMBean support. /** * The MBean server we're registered with (May be null). 
*/ private MBeanServer mbeanServer = null; /** * MBean name we were registered as. */ private ObjectName mbeanName = null; /** * Keep reference to all instances of Heritrix. * Used by the UI to figure which of the local Heritrice it should * be going against and to figure what to shutdown on the way out (If * there was always a JMX Agent, we wouldn't need to keep this list. We * could always ask the JMX Agent for all instances. UPDATE: True we could * always ask the JMX Agent but we might keep around this local reference * because it will allow faster, less awkward -- think of marshalling the args * for JMX invoke operation -- access to local Heritrix instances. A new * usage for this instances Map is in CrawlJob#preRegister to find the hosting * Heritrix instance). */ private static Map<String,Heritrix> instances = new Hashtable<String,Heritrix>(); private OpenMBeanInfoSupport openMBeanInfo; private final static String STATUS_ATTR = "Status"; private final static String VERSION_ATTR = "Version"; private final static String ISRUNNING_ATTR = "IsRunning"; private final static String ISCRAWLING_ATTR = "IsCrawling"; private final static String ALERTCOUNT_ATTR = "AlertCount"; private final static String NEWALERTCOUNT_ATTR = "NewAlertCount"; private final static String CURRENTJOB_ATTR = "CurrentJob"; private final static List ATTRIBUTE_LIST; static { ATTRIBUTE_LIST = Arrays.asList(new String [] {STATUS_ATTR, VERSION_ATTR, ISRUNNING_ATTR, ISCRAWLING_ATTR, ALERTCOUNT_ATTR, NEWALERTCOUNT_ATTR, CURRENTJOB_ATTR});
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -