heritrix.properties
###############################################################################
# H E R I T R I X   P R O P E R T I E S
###############################################################################
# Properties with 'heritrix.' or 'org.archive.crawler.' prefix get loaded
# into System.properties on startup so they are available via
# System.getProperties.

# Version is filled in by the maven.xml pregoal. It copies the project's
# currentVersion property here.
heritrix.version = 1.12.1

# Location of the heritrix jobs directory.
heritrix.jobsdir = jobs

# Default commandline startup values.
# Below values are used if unspecified on the command line.
heritrix.cmdline.admin =
heritrix.cmdline.port = 8080
heritrix.cmdline.run = false
heritrix.cmdline.nowui = false
heritrix.cmdline.order =
heritrix.cmdline.jmxserver = false
heritrix.cmdline.jmxserver.port = 8081

###############################################################################
# L O G G I N G
###############################################################################
# Basic logging setup; to console, all levels.
# Note, after startup Heritrix adds a special logging handler called
# SinkHandler that keeps around all instances of severe and warning messages
# for display by the Heritrix alerting system. We used to initialize it here
# with the ConsoleHandler, but it wasn't being found when Heritrix was
# packaged as a webapp deployed in Tomcat.
handlers = java.util.logging.ConsoleHandler
java.util.logging.ConsoleHandler.level = ALL
java.util.logging.ConsoleHandler.formatter = org.archive.util.OneLineSimpleLogger

# Default global logging level: only warnings or higher.
.level = WARNING

# Enable frontier INFO logging.
# org.archive.crawler.frontier.BdbFrontier.level = FINER
# org.archive.crawler.frontier.BdbMultipleWorkQueues.level = INFO
# org.archive.crawler.frontier.AbstractFrontier.level = INFO
# org.archive.crawler.frontier.WorkQueueFrontier.level = INFO

# Currently necessary (?) for standard logs to work.
crawl.level = INFO
runtime-errors.level = INFO
local-errors.level = INFO
uri-errors.level = INFO
progress-statistics.level = INFO
recover.level = INFO

# Set the selftest console logger level.
org.archive.crawler.admin.SelftestCrawlJobHandler.level = INFO

# Enable info level on CrawlJob so you can see it register jobs with the
# mbeanserver.
org.archive.crawler.admin.CrawlJob.level = INFO

# Enable the following lines to watch authentications running.
# org.archive.crawler.fetcher.FetchHTTP.level = FINE
# org.archive.crawler.prefetch.PreconditionEnforcer.level = FINE

# HttpClient is too chatty... we only want to hear about severe problems.
# For more on httpclient logging, see
# http://jakarta.apache.org/commons/httpclient/logging.html
org.apache.commons.httpclient.level = SEVERE

# If you need verbose console logging of HttpClient traffic, uncomment the
# following line:
# httpclient.wire.level = FINE

# Enable the ARCWriter.level line below to see logging of the opening and
# closing of arc files in console output (console output goes into
# heritrix_out.log). If you'd rather have console output, including the
# opening and closing of ARCs, go to a file, enable the FileHandler logger
# -- see the 'handlers' line above -- and enable the FileHandler lines below
# (configure to your preference -- the '%h' below means the value of the
# java.home system property, usually your home directory). There does not
# seem to be a way other than in code to configure only ARCWriter to write
# to the FileHandler. Even with code changes, it's awkward to make the log
# show in each individual job's log directory. Note, the closing of arc
# files on shutdown is not logged.
# org.archive.io.arc.ARCWriter.level = INFO

# Enable to watch authentication progress, amongst other details, on http
# fetch.
# org.archive.crawler.fetcher.FetchHTTP.level = FINE

# Enable logging of already-seen memory growth; this growth is probably the
# main culprit for OOMEs. Log goes into heritrix_out.log.
org.archive.util.MemLongFPSet.level = INFO

# org.archive.crawler.url.Canonicalizer.level = INFO

# Enable logging of uris rejected by scope by setting level to INFO.
# org.archive.crawler.postprocessor.LinksScoper.level = INFO

# Below values are used by classes that set their own FileHandler as default
# values. The pattern value is used as a suffix. Note, the below
# configuration limits log file size (to 500000 bytes), and because count is
# 1, old log data will be lost.
java.util.logging.FileHandler.level = ALL
java.util.logging.FileHandler.pattern = %u.log
java.util.logging.FileHandler.formatter = org.archive.util.OneLineSimpleLogger
java.util.logging.FileHandler.limit = 500000
java.util.logging.FileHandler.count = 1

# org.archive.crawler.admin.StatisticsTracker.level = INFO

# Enable info logging so the note on checkpoint completion shows in
# heritrix_out.log.
org.archive.crawler.framework.CrawlController.level = INFO

# AdaptiveRevisit modules.
# org.archive.crawler.frontier.AdaptiveRevisitFrontier.level = FINE
# org.archive.crawler.frontier.AdaptiveRevisitHostQueue.level = FINE
# org.archive.crawler.frontier.AdaptiveRevisitQueueList.level = FINE
# org.archive.crawler.processor.ChangeEvaluator.level = FINE
# org.archive.crawler.processor.WaitEvaluator.level = FINE
# org.archive.crawler.processor.HTTPContentDigest.level = FINE

# ServerCache creation of crawl hosts (make it FINER if you want to see
# crawl server creation also).
# org.archive.crawler.datamodel.ServerCache.level = FINE

# Enable to see average already-seen lookup times.
# org.archive.crawler.util.BdbUriUniqFilter.level = INFO

# Enable to see duplicate-detection stats on each merge,
# if using an FPMergeUriUniqFilter.
# org.archive.crawler.util.FPMergeUriUniqFilter.level = INFO

# Enable to see a report of duplicates encountered every 50K unique adds.
# org.archive.crawler.util.SetBasedUriUniqFilter.level = FINE

# Used to set a log of all URIs presented to the filter, for later
# comparison profiling.
# org.archive.crawler.util.SetBasedUriUniqFilter.profileLogFile = uriUniq.log

# Leave the below enabled so launch info messages show on the console.
org.archive.crawler.Heritrix.level = INFO

# Leave RecoveryJournal level at INFO to see recovery progress.
org.archive.crawler.frontier.RecoveryJournal.level = INFO

###############################################################################
# F R O N T I E R
###############################################################################
# List here all queue assignment policies you'd have show as a
# queue-assignment-policy choice in AbstractFrontier-derived Frontiers
# (e.g. BdbFrontier).
org.archive.crawler.frontier.AbstractFrontier.queue-assignment-policy = \
    org.archive.crawler.frontier.HostnameQueueAssignmentPolicy \
    org.archive.crawler.frontier.IPQueueAssignmentPolicy \
    org.archive.crawler.frontier.BucketQueueAssignmentPolicy \
    org.archive.crawler.frontier.SurtAuthorityQueueAssignmentPolicy

org.archive.crawler.frontier.BdbFrontier.level = INFO

###############################################################################
# U U R I
###############################################################################
# Any scheme not listed below will generate an UnsupportedUriScheme
# exception. Make the list empty to support all schemes.
org.archive.net.UURIFactory.schemes = http, https, dns, invalid
# 'invalid' is a fake scheme used as a last-ditch fallback when a UURI
# instance is expected and raising a URIException would be problematic (as
# in deserializing what was a valid UURI at serialization time). It may be
# removed, but doing so could reduce crawl robustness in the face of
# potential future bugs.

# The following schemes will be marked as intentionally ignored in the
# exception -- and thus not logged.
org.archive.net.UURIFactory.ignored-schemes = mailto, clsid, res, ftp, \
    file, rtsp, about

###############################################################################
# C r a w l U R I
###############################################################################
# Maximum links per page.
# Default is 6000. Links beyond the maximum are not scheduled. An annotation
# in crawl.log -- dol:N -- is added, where N is the number of links discarded
# (dol == Discarded OutLinks).
# org.archive.crawler.datamodel.CrawlURI.maxOutLinks = 6000

###############################################################################
# E x t r a c t o r H T M L
###############################################################################
# Configuration for the html extractor regexes.
# Without maximums, regexes can return matches that span megabytes in
# strange html, with corresponding megabyte String allocations (see
# https://sourceforge.net/tracker/?func=detail&atid=539099&aid=1220714&group_id=73833).

# Set the maximum element name length.
# org.archive.crawler.extractor.ExtractorHTML.maxElementNameLength = 1024

# Set the maximum attribute name length.
# org.archive.crawler.extractor.ExtractorHTML.maxAttributeNameLength = 1024

# Set the maximum attribute value length.
# org.archive.crawler.extractor.ExtractorHTML.maxAttributeValueLength = 16384

###############################################################################
# C h e c k p o i n t i n g
###############################################################################
# Have checkpointing activity show in logs.
org.archive.crawler.framework.Checkpointer.level = INFO
org.archive.crawler.selftest.CheckpointSelfTest.level = INFO

# Uncomment to run a background thread that will checkpoint on the appointed
# period (in hours between checkpoints).
# org.archive.crawler.framework.Checkpointer.period = 4
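The note at the top of the file says that properties with the 'heritrix.' or 'org.archive.crawler.' prefix are copied into System.properties at startup. A minimal sketch of what that makes possible, using only standard JDK calls (the class name here is hypothetical, not part of Heritrix):

import java.util.Properties;

public class ShowHeritrixProps {
    public static void main(String[] args) {
        // Single lookup; returns null if Heritrix has not loaded the file yet.
        System.out.println("heritrix.version = "
                + System.getProperty("heritrix.version"));

        // Scan for everything carrying the two documented prefixes.
        Properties props = System.getProperties();
        props.stringPropertyNames().stream()
                .filter(k -> k.startsWith("heritrix.")
                        || k.startsWith("org.archive.crawler."))
                .sorted()
                .forEach(k -> System.out.println(k + " = " + props.getProperty(k)));
    }
}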
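The LOGGING block is standard java.util.logging configuration, so its effect can be reproduced outside Heritrix. A sketch, assuming the JVM was started with -Djava.util.logging.config.file pointing at this file (that property name is standard JDK; the class below is hypothetical):

import java.util.logging.Level;
import java.util.logging.Logger;

public class LoggingLevelsDemo {
    public static void main(String[] args) {
        // .level = WARNING makes WARNING the default, but crawl.level = INFO
        // opts the 'crawl' logger back into INFO output.
        Logger crawl = Logger.getLogger("crawl");
        crawl.info("published: crawl.level = INFO overrides the root level");

        // org.apache.commons.httpclient.level = SEVERE silences the chatty
        // HttpClient loggers below SEVERE, as the comment in the file intends.
        Logger httpclient = Logger.getLogger("org.apache.commons.httpclient");
        System.out.println(httpclient.isLoggable(Level.WARNING)); // false
        System.out.println(httpclient.isLoggable(Level.SEVERE));  // true
    }
}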
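For the FileHandler block, the programmatic equivalent below shows what limit and count mean in practice: one rotating file of at most 500000 bytes, so older data is overwritten, exactly as the comment warns. A sketch using only the JDK constructor (the Heritrix OneLineSimpleLogger formatter is omitted, so the JDK default formatter applies):

import java.io.IOException;
import java.util.logging.FileHandler;
import java.util.logging.Level;
import java.util.logging.Logger;

public class FileHandlerDemo {
    public static void main(String[] args) throws IOException {
        // Mirrors: pattern = %u.log, limit = 500000, count = 1.
        // '%u' is replaced by a unique number (usually 0) to avoid clashes.
        FileHandler handler = new FileHandler("%u.log", 500000, 1);
        handler.setLevel(Level.ALL); // java.util.logging.FileHandler.level = ALL

        Logger log = Logger.getLogger("demo");
        log.addHandler(handler);
        log.info("written to 0.log; past 500000 bytes, old data is lost");
    }
}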
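The UURIFactory.schemes comment describes a whitelist: any scheme missing from the list triggers an UnsupportedUriScheme exception, and an empty list admits everything. The sketch below is not the UURIFactory implementation, only an illustration of that rule using the values from this file:

import java.util.Arrays;

public class SchemeCheck {
    // Value of org.archive.net.UURIFactory.schemes in this file.
    static final String SCHEMES = "http, https, dns, invalid";

    static boolean isSupported(String scheme) {
        if (SCHEMES.trim().isEmpty()) {
            return true; // empty list means all schemes are supported
        }
        return Arrays.asList(SCHEMES.split("\\s*,\\s*"))
                .contains(scheme.toLowerCase());
    }

    public static void main(String[] args) {
        System.out.println(isSupported("https"));  // true
        System.out.println(isSupported("gopher")); // false -> Heritrix would
                                                   // raise UnsupportedUriScheme
    }
}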