?? heritrix.java
字號(hào):
// NOTE(review): this chunk begins mid-method (the opening of the selftest
// launch routine, including the statement this expression belonged to, is
// outside this view) -- code tokens below are unchanged, comments only.
new File(File.separator + SELFTEST);
File crawlOrderFile = new File(selftestDir, "order.xml");
// Create a job based off the selftest order file. Then use this as
// a template to pass jobHandler.newJob(). Doing this gets our
// selftest output to show under the jobs directory.
// Pass as a seed a pointer to the webserver we just put up.
final String ROOTURI = "127.0.0.1:" + Integer.toString(port);
String selfTestUrl = "http://" + ROOTURI + '/';
if (oneSelfTestName != null && oneSelfTestName.length() > 0) {
    // Narrow the selftest to a single named test directory.
    selfTestUrl += (oneSelfTestName + '/');
}
// Job handler specialized for the selftest; it drives the analysis phase
// once the crawl completes.
CrawlJobHandler cjh = new SelfTestCrawlJobHandler(getJobsdir(),
    oneSelfTestName, selfTestUrl);
Heritrix h = new Heritrix("Selftest", true, cjh);
// Template job built from the selftest order file; newJob() clones it so
// output lands under the jobs directory.
CrawlJob job = createCrawlJob(cjh, crawlOrderFile, "Template");
job = h.getJobHandler().newJob(job, null, SELFTEST,
    "Integration self test", selfTestUrl, CrawlJob.PRIORITY_AVERAGE);
h.getJobHandler().addJob(job);
// Before we start, need to change some items in the settings file:
// point every configured credential's domain at the local test webserver.
CredentialStore cs = (CredentialStore)job.getSettingsHandler().
    getOrder().getAttribute(CredentialStore.ATTR_NAME);
for (Iterator i = cs.iterator(null); i.hasNext();) {
    ((Credential)i.next()).setCredentialDomain(null, ROOTURI);
}
h.getJobHandler().startCrawler();
// Build a human-readable status summary for the console.
StringBuffer buffer = new StringBuffer();
buffer.append("Heritrix " + Heritrix.getVersion() +
    " selftest started.");
buffer.append("\nSelftest first crawls " + selfTestUrl +
    " and then runs an analysis.");
buffer.append("\nResult of analysis printed to " + getHeritrixOut() +
    " when done.");
buffer.append("\nSelftest job directory for logs and arcs:\n" +
    job.getDirectory().getAbsolutePath());
return buffer.toString();
}

/**
 * Launch the crawler without a web UI and run the passed crawl only.
 *
 * Specialized version of {@link #launch()}.
 *
 * @param crawlOrderFile The crawl order to crawl.
 * @throws InitializationException
 * @throws InvalidAttributeValueException
 * @return Status string.
*/ protected String doOneCrawl(String crawlOrderFile) throws InitializationException, InvalidAttributeValueException { return doOneCrawl(crawlOrderFile, null); } /** * Launch the crawler without a web UI and run passed crawl only. * * Specialized version of {@link #launch()}. * * @param crawlOrderFile The crawl order to crawl. * @param listener Register this crawl status listener before starting * crawl (You can use this listener to notice end-of-crawl). * @throws InitializationException * @throws InvalidAttributeValueException * @return Status string. */ protected String doOneCrawl(String crawlOrderFile, CrawlStatusListener listener) throws InitializationException, InvalidAttributeValueException { XMLSettingsHandler handler = new XMLSettingsHandler(new File(crawlOrderFile)); handler.initialize(); CrawlController controller = new CrawlController(); controller.initialize(handler); if (listener != null) { controller.addCrawlStatusListener(listener); } controller.requestCrawlStart(); return "Crawl started using " + crawlOrderFile + "."; } /** * Launch the crawler for a web UI. * * Crawler hangs around waiting on jobs. * * @exception Exception * @return A status string describing how the launch went. * @throws Exception */ public String launch() throws Exception { return launch(null, false); } /** * Launch the crawler for a web UI. * * Crawler hangs around waiting on jobs. * * @param crawlOrderFile File to crawl. May be null. * @param runMode Whether crawler should be set to run mode. * * @exception Exception * @return A status string describing how the launch went. 
*/ public String launch(String crawlOrderFile, boolean runMode) throws Exception { String status = null; if (crawlOrderFile != null) { addCrawlJob(crawlOrderFile, "Autolaunched", "", ""); if(runMode) { this.jobHandler.startCrawler(); status = "Job being crawled: " + crawlOrderFile; } else { status = "Crawl job ready and pending: " + crawlOrderFile; } } else if(runMode) { // The use case is that jobs are to be run on a schedule and that // if the crawler is in run mode, then the scheduled job will be // run at appropriate time. Otherwise, not. this.jobHandler.startCrawler(); status = "Crawler set to run mode."; } return status; } /** * Start up the embedded Jetty webserver instance. * This is done when we're run from the command-line. * @param port Port number to use for web UI. * @param adminLoginPassword Compound of login and password. * @throws Exception * @return Status on webserver startup. * @deprecated Use startEmbeddedWebserver(hosts, port, adminLoginPassword) */ protected static String startEmbeddedWebserver(final int port, final boolean lho, final String adminLoginPassword) throws Exception { ArrayList<String> hosts = new ArrayList<String>(); if (lho) { hosts.add("127.0.0.1"); } return startEmbeddedWebserver(hosts, port, adminLoginPassword); } /** * Parses a list of host names. * * <p>If the given string is <code>/</code>, then an empty * collection is returned. This indicates that all available network * interfaces should be used. * * <p>Otherwise, the string must contain a comma-separated list of * IP addresses or host names. The parsed list is then returned. 
* * @param hosts the string to parse * @return the parsed collection of hosts */ private static Collection<String> parseHosts(String hosts) { hosts = hosts.trim(); if (hosts.equals("/")) { return new ArrayList<String>(1); } String[] hostArray = hosts.split(","); for (int i = 0; i < hostArray.length; i++) { hostArray[i] = hostArray[i].trim(); } return Arrays.asList(hostArray); } /** * Start up the embedded Jetty webserver instance. * This is done when we're run from the command-line. * * @param hosts a list of IP addresses or hostnames to bind to, or an * empty collection to bind to all available network * interfaces * @param port Port number to use for web UI. * @param adminLoginPassword Compound of login and password. * @throws Exception * @return Status on webserver startup. */ protected static String startEmbeddedWebserver(Collection<String> hosts, int port, String adminLoginPassword) throws Exception { adminUsername = adminLoginPassword. substring(0, adminLoginPassword.indexOf(":")); adminPassword = adminLoginPassword. substring(adminLoginPassword.indexOf(":") + 1); Heritrix.httpServer = new SimpleHttpServer("admin", Heritrix.adminContext, hosts, port, false); final String DOTWAR = ".war"; final String SELFTEST = "selftest"; // Look for additional WAR files beyond 'selftest' and 'admin'. File[] wars = getWarsdir().listFiles(); for(int i = 0; i < wars.length; i++) { if(wars[i].isFile()) { final String warName = wars[i].getName(); final String warNameNC = warName.toLowerCase(); if(warNameNC.endsWith(DOTWAR) && !warNameNC.equals(ADMIN + DOTWAR) && !warNameNC.equals(SELFTEST + DOTWAR)) { int dot = warName.indexOf('.'); Heritrix.httpServer.addWebapp(warName.substring(0, dot), null, true); } } } // Name of passed 'realm' must match what is in configured in web.xml. // We'll use ROLE for 'realm' and 'role'. 
final String ROLE = ADMIN; Heritrix.httpServer.setAuthentication(ROLE, Heritrix.adminContext, adminUsername, adminPassword, ROLE); Heritrix.httpServer.startServer(); StringBuffer buffer = new StringBuffer(); buffer.append("Heritrix " + Heritrix.getVersion() + " is running."); for (String host: httpServer.getHosts()) { buffer.append("\nWeb console is at: http://"); buffer.append(host).append(':').append(port); } buffer.append("\nWeb console login and password: " + adminUsername + "/" + adminPassword); return buffer.toString(); } /** * Replace existing administrator login info with new info. * * @param newUsername new administrator login username * @param newPassword new administrator login password */ public static void resetAuthentication(String newUsername, String newPassword) { Heritrix.httpServer.resetAuthentication(ADMIN, adminUsername, newUsername, newPassword); adminUsername = newUsername; adminPassword = newPassword; logger.info("administrative login changed to " +newUsername+":"+newPassword); } protected static CrawlJob createCrawlJob(CrawlJobHandler handler, File crawlOrderFile, String name) throws InvalidAttributeValueException { XMLSettingsHandler settings = new XMLSettingsHandler(crawlOrderFile); settings.initialize(); return new CrawlJob(handler.getNextJobUID(), name, settings, new CrawlJobErrorHandler(Level.SEVERE), CrawlJob.PRIORITY_HIGH, crawlOrderFile.getAbsoluteFile().getParentFile()); } /** * This method is called when we have an order file to hand that we want * to base a job on. It leaves the order file in place and just starts up * a job that uses all the order points to for locations for logs, etc. * @param orderPathOrUrl Path to an order file or to a seeds file. * @param name Name to use for this job. * @param description * @param seeds * @return A status string. 
* @throws IOException * @throws FatalConfigurationException */ public String addCrawlJob(String orderPathOrUrl, String name, String description, String seeds) throws IOException, FatalConfigurationException { if (!UURI.hasScheme(orderPathOrUrl)) { // Assume its a file path. return addCrawlJob(new File(orderPathOrUrl), name, description, seeds); } // Otherwise, must be an URL. URL url = new URL(orderPathOrUrl); // Handle http and file only for now (Tried to handle JarUrlConnection // but too awkward undoing jar stream. Rather just look for URLs that // end in '.jar'). String result = null; URLConnection connection = url.openConnection(); if (connection instanceof HttpURLConnection) { result = addCrawlJob(url, (HttpURLConnection)connection, name, description, seeds); } else if (connection instanceof FileURLConnection) { result = addCrawlJob(new File(url.getPath()), name, description, seeds); } else { throw new UnsupportedOperationException("No support for " + connection); } return result; } protected String addCrawlJob(final URL url, final HttpURLConnection connection, final String name, final String description, final String seeds) throws IOException, FatalConfigurationException { // Look see if its a jar file. If it is undo it. boolean isJar = url.getPath() != null && url.getPath().toLowerCase().endsWith(JAR_SUFFIX); // If http url connection, bring down the resource local. File localFile = File.createTempFile(Heritrix.class.getName(), isJar? JAR_SUFFIX: null, TMPDIR); connection.connect(); String result = null; try { IoUtils.readFullyToFile(connection.getInputStream(), localFile); result = addCrawlJob(localFile, name, description, seeds); } catch (IOException ioe) { // Cleanup if an Exception. localFile.delete(); localFile = null; } finally { connection.disconnect(); // If its a jar file, then we made a job based on the jar contents. // Its no longer needed. Remove it. If not a jar file, then leave // the file around because the job depends on it. 
if (isJar && localFile != null && localFile.exists()) { localFile.delete(); } } return result; } protected String addCrawlJob(final File order, final String name, final String description, final String seeds) throws FatalConfigurationException, IOException { CrawlJob addedJob = null; if (this.jobHandler == null) { throw new NullPointerException("Heritrix jobhandler is null."); } try { if (order.getName().toLowerCase().endsWith(JAR_SUFFIX)) { return addCrawlJobBasedonJar(order, name, description, seeds); } addedJob = this.jobHandler.
?? 快捷鍵說明
復(fù)制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號(hào)
Ctrl + =
減小字號(hào)
Ctrl + -