?? heritrix.java
字號:
addJob(createCrawlJob(this.jobHandler, order, name)); } catch (InvalidAttributeValueException e) { FatalConfigurationException fce = new FatalConfigurationException( "Converted InvalidAttributeValueException on " + order.getAbsolutePath() + ": " + e.getMessage()); fce.setStackTrace(e.getStackTrace()); } return addedJob != null? addedJob.getUID(): null; } /** * Undo jar file and use as basis for a new job. * @param jarFile Pointer to file that holds jar. * @param name Name to use for new job. * @param description * @param seeds * @return Message. * @throws IOException * @throws FatalConfigurationException */ protected String addCrawlJobBasedonJar(final File jarFile, final String name, final String description, final String seeds) throws IOException, FatalConfigurationException { if (jarFile == null || !jarFile.exists()) { throw new FileNotFoundException(jarFile.getAbsolutePath()); } // Create a directory with a tmp name. Do it by first creating file, // removing it, then creating the directory. There is a hole during // which the OS may put a file of same exact name in our way but // unlikely. File dir = File.createTempFile(Heritrix.class.getName(), ".expandedjar", TMPDIR); dir.delete(); dir.mkdir(); try { org.archive.crawler.util.IoUtils.unzip(jarFile, dir); // Expect to find an order file at least. File orderFile = new File(dir, "order.xml"); if (!orderFile.exists()) { throw new IOException("Missing order: " + orderFile.getAbsolutePath()); } CrawlJob job = createCrawlJobBasedOn(orderFile, name, description, seeds); // Copy into place any seeds and settings directories before we // add job to Heritrix to crawl. File seedsFile = new File(dir, "seeds.txt"); if (seedsFile.exists()) { FileUtils.copyFiles(seedsFile, new File(job.getDirectory(), seedsFile.getName())); } addCrawlJob(job); return job.getUID(); } finally { // After job has been added, no more need of expanded content. // (Let the caller be responsible for cleanup of jar. 
Sometimes // its should be deleted -- when its a local copy of a jar pulled // across the net -- wherease other times, if its a jar passed // in w/ a 'file' scheme, it shouldn't be deleted. org.archive.util.FileUtils.deleteDir(dir); } } public String addCrawlJobBasedOn(String jobUidOrProfile, String name, String description, String seeds) { try { CrawlJob cj = getJobHandler().getJob(jobUidOrProfile); if (cj == null) { throw new InvalidAttributeValueException(jobUidOrProfile + " is not a job UID or profile name (Job UIDs are " + " usually the 14 digit date portion of job name)."); } CrawlJob job = addCrawlJobBasedOn( cj.getSettingsHandler().getOrderFile(), name, description, seeds); return job.getUID(); } catch (Exception e) { e.printStackTrace(); return "Exception on " + jobUidOrProfile + ": " + e.getMessage(); } } protected CrawlJob addCrawlJobBasedOn(final File orderFile, final String name, final String description, final String seeds) throws FatalConfigurationException { return addCrawlJob(createCrawlJobBasedOn(orderFile, name, description, seeds)); } protected CrawlJob createCrawlJobBasedOn(final File orderFile, final String name, final String description, final String seeds) throws FatalConfigurationException { CrawlJob job = getJobHandler().newJob(orderFile, name, description, seeds); return CrawlJobHandler.ensureNewJobWritten(job, name, description); } protected CrawlJob addCrawlJob(final CrawlJob job) { return getJobHandler().addJob(job); } public void startCrawling() { if (getJobHandler() == null) { throw new NullPointerException("Heritrix jobhandler is null."); } getJobHandler().startCrawler(); } public void stopCrawling() { if (getJobHandler() == null) { throw new NullPointerException("Heritrix jobhandler is null."); } getJobHandler().stopCrawler(); } /** * Get the heritrix version. * * @return The heritrix version. May be null. 
*/ public static String getVersion() { return System.getProperty("heritrix.version"); } /** * Get the job handler * * @return The CrawlJobHandler being used. */ public CrawlJobHandler getJobHandler() { return this.jobHandler; } /** * Get the configuration directory. * @return The conf directory under HERITRIX_HOME or null if none can * be found. * @throws IOException */ public static File getConfdir() throws IOException { return getConfdir(true); } /** * Get the configuration directory. * @param fail Throw IOE if can't find directory if true, else just * return null. * @return The conf directory under HERITRIX_HOME or null (or an IOE) if * can't be found. * @throws IOException */ public static File getConfdir(final boolean fail) throws IOException { final String key = "heritrix.conf"; // Look to see if heritrix.conf property passed on the cmd-line. String tmp = System.getProperty(key); // if not fall back to default $HERITIX_HOME/conf if (tmp == null || tmp.length() == 0) { return getSubDir("conf", fail); } File dir = new File(tmp); if (!dir.exists()) { if (fail) { throw new IOException("Cannot find conf dir: " + tmp); } else { logger.log(Level.WARNING, "Specified " + key + " dir does not exist. Falling back on default"); } dir = getSubDir("conf", fail); } return dir; } /** * @return Returns the httpServer. May be null if one was not started. */ public static SimpleHttpServer getHttpServer() { return Heritrix.httpServer; } /** * @throws IOException * @return Returns the directory under which reside the WAR files * we're to load into the servlet container. */ public static File getWarsdir() throws IOException { return getSubDir("webapps"); } /** * Prepars for program shutdown. This method does it's best to prepare the * program so that it can exit normally. 
It will kill the httpServer and * terminate any running job.<br> * It is advisible to wait a few (~1000) millisec after calling this method * and before calling performHeritrixShutDown() to allow as many threads as * possible to finish what they are doing. */ public static void prepareHeritrixShutDown() { // Stop and destroy all running Heritrix instances. // Get array of the key set to avoid CCEs for case where call to // destroy does a remove of an instance from Heritrix.instances. final Object [] keys = Heritrix.instances.keySet().toArray(); for (int i = 0; i < keys.length; i++) { ((Heritrix)Heritrix.instances.get(keys[i])).destroy(); } try { deregisterJndi(getJndiContainerName()); } catch (NameNotFoundException e) { // We were probably unbound already. Ignore. logger.log(Level.WARNING, "deregistration of jndi", e); } catch (Exception e) { e.printStackTrace(); } if(Heritrix.httpServer != null) { // Shut down the web access. try { Heritrix.httpServer.stopServer(); } catch (InterruptedException e) { // Generally this can be ignored, but we'll print a stack trace // just in case. e.printStackTrace(); } finally { Heritrix.httpServer = null; } } } /** * Exit program. Recommended that prepareHeritrixShutDown() be invoked * prior to this method. */ public static void performHeritrixShutDown() { performHeritrixShutDown(0); } /** * Exit program. Recommended that prepareHeritrixShutDown() be invoked * prior to this method. * * @param exitCode Code to pass System.exit. * */ public static void performHeritrixShutDown(int exitCode) { System.exit(exitCode); } /** * Shutdown all running heritrix instances and the JVM. * Assumes stop has already been called. * @param exitCode Exit code to pass system exit. 
*/
    public static void shutdown(final int exitCode) {
        final Thread shutdownThread =
            getShutdownThread(true, exitCode, "Heritrix shutdown");
        shutdownThread.start();
    }

    /**
     * Build, but do not start, a daemon thread that runs
     * {@link #prepareHeritrixShutDown()} and then, optionally,
     * {@link #performHeritrixShutDown(int)}.
     *
     * @param sysexit True if the thread is to exit the JVM once the
     * preparation is done.
     * @param exitCode Exit code to use if <code>sysexit</code> is true.
     * @param name Name to give the thread.
     * @return The unstarted thread.
     */
    protected static Thread getShutdownThread(final boolean sysexit,
            final int exitCode, final String name) {
        final Thread shutdownThread = new Thread(name) {
            public void run() {
                Heritrix.prepareHeritrixShutDown();
                if (!sysexit) {
                    return;
                }
                Heritrix.performHeritrixShutDown(exitCode);
            }
        };
        shutdownThread.setDaemon(true);
        return shutdownThread;
    }

    /**
     * Shutdown all running heritrix instances and the JVM with an exit
     * code of zero.
     */
    public static void shutdown() {
        shutdown(0);
    }

    /**
     * Register Heritrix with JNDI, JMX, and with the static hashtable of all
     * Heritrix instances known to this JVM.
     *
     * If launched from cmdline, register Heritrix MBean if an agent to register
     * ourselves with. Usually this method will only have effect if we're
     * running in a 1.5.0 JDK and command line options such as
     * '-Dcom.sun.management.jmxremote.port=8082
     * -Dcom.sun.management.jmxremote.authenticate=false
     * -Dcom.sun.management.jmxremote.ssl=false' are supplied.
     * See <a href="http://java.sun.com/j2se/1.5.0/docs/guide/management/agent.html">Monitoring
     * and Management Using JMX</a>
     * for more on the command line options and how to connect to the
     * Heritrix bean using the JDK 1.5.0 jconsole tool.  We register currently
     * with first server we find (TODO: Make configurable).
     *
     * <p>If we register successfully with a JMX agent, then part of the
     * registration will include our registering ourselves with JNDI.
     *
     * <p>Finally, add the heritrix instance to the hashtable of all the
     * Heritrix instances floating in the current VM.  This latter registration
     * happens whether or not there is a JMX agent to register with.  This is
     * a list we keep out of convenience so it's easy iterating over
     * all instances calling stop when main application is going down.
     *
     * @param h Instance of heritrix to register.
     * @param name Name to use for this Heritrix instance.
     * @param jmxregister True if we are to register this instance with JMX.
* @throws NullPointerException
     * @throws MalformedObjectNameException
     * @throws NotCompliantMBeanException
     * @throws MBeanRegistrationException
     * @throws InstanceAlreadyExistsException
     */
    protected static void registerHeritrix(final Heritrix h,
            final String name, final boolean jmxregister)
    throws MalformedObjectNameException, InstanceAlreadyExistsException,
    MBeanRegistrationException, NotCompliantMBeanException {
        MBeanServer server = getMBeanServer();
        if (server != null) {
            // Are we to manage the jmx registration?  Or is it being done for
            // us by an external process: e.g. This instance was created by
            // MBeanAgent.
            if (jmxregister) {
                // A null or empty name gets the default object name.
                ObjectName objName = (name == null || name.length() <= 0)?
                    getJmxObjectName(): getJmxObjectName(name);
                registerMBean(server, h, objName);
            }
        } else {
            // JMX ain't available. Put this instance into the list of Heritrix
            // instances so findable by the UI (Normally this is done in the
            // JMX postRegister routine below).  When no JMX, can only have
            // one instance of Heritrix so no need to do the deregistration.
            Heritrix.instances.put(h.getNoJmxName(), h);
        }
    }

    protected static void unregisterHeritrix(final Heritrix h) throws InstanceNotFoundException, MBeanRegistration
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -