workqueuefrontier.html
字號:
<a name="135" href="#135">135</a> <em>/**<em>* total expenditure to allow a queue before 'retiring' it */</em></em><a name="136" href="#136">136</a> <strong>public</strong> <strong>final</strong> <strong>static</strong> String ATTR_QUEUE_TOTAL_BUDGET = <span class="string">"queue-total-budget"</span>;<a name="137" href="#137">137</a> <strong>protected</strong> <strong>final</strong> <strong>static</strong> Long DEFAULT_QUEUE_TOTAL_BUDGET = <strong>new</strong> Long(-1);<a name="138" href="#138">138</a> <a name="139" href="#139">139</a> <em>/**<em>* cost assignment policy to use (by class name) */</em></em><a name="140" href="#140">140</a> <strong>public</strong> <strong>final</strong> <strong>static</strong> String ATTR_COST_POLICY = <span class="string">"cost-policy"</span>;<a name="141" href="#141">141</a> <strong>protected</strong> <strong>final</strong> <strong>static</strong> String DEFAULT_COST_POLICY =<a name="142" href="#142">142</a> UnitCostAssignmentPolicy.<strong>class</strong>.getName();<a name="143" href="#143">143</a> <a name="144" href="#144">144</a> <em>/**<em>* target size of ready queues backlog */</em></em><a name="145" href="#145">145</a> <strong>public</strong> <strong>final</strong> <strong>static</strong> String ATTR_TARGET_READY_QUEUES_BACKLOG =<a name="146" href="#146">146</a> <span class="string">"target-ready-backlog"</span>;<a name="147" href="#147">147</a> <strong>protected</strong> <strong>final</strong> <strong>static</strong> Integer DEFAULT_TARGET_READY_QUEUES_BACKLOG =<a name="148" href="#148">148</a> <strong>new</strong> Integer(50);<a name="149" href="#149">149</a> <a name="150" href="#150">150</a> <em>/**<em>* those UURIs which are already in-process (or processed), and</em></em><a name="151" href="#151">151</a> <em> thus should not be rescheduled */</em><a name="152" href="#152">152</a> <strong>protected</strong> <strong>transient</strong> <a href="../../../../org/archive/crawler/datamodel/UriUniqFilter.html">UriUniqFilter</a> 
alreadyIncluded;<a name="153" href="#153">153</a> <a name="154" href="#154">154</a> <em>/**<em>* All known queues.</em></em><a name="155" href="#155">155</a> <em> */</em><a name="156" href="#156">156</a> <strong>protected</strong> <strong>transient</strong> Map<String,WorkQueue> allQueues = <strong>null</strong>; <a name="157" href="#157">157</a> <em class="comment">// of classKey -> ClassKeyQueue</em><a name="158" href="#158">158</a> <a name="159" href="#159">159</a> <em>/**<em>*</em></em><a name="160" href="#160">160</a> <em> * All per-class queues whose first item may be handed out.</em><a name="161" href="#161">161</a> <em> * Linked-list of keys for the queues.</em><a name="162" href="#162">162</a> <em> */</em><a name="163" href="#163">163</a> <strong>protected</strong> BlockingQueue<String> readyClassQueues =<a name="164" href="#164">164</a> <strong>new</strong> LinkedBlockingQueue<String>();<a name="165" href="#165">165</a> <a name="166" href="#166">166</a> <em>/**<em>* Target (minimum) size to keep readyClassQueues */</em></em><a name="167" href="#167">167</a> <strong>protected</strong> <strong>int</strong> targetSizeForReadyQueues;<a name="168" href="#168">168</a> <a name="169" href="#169">169</a> <em>/**<em>* </em></em><a name="170" href="#170">170</a> <em> * All 'inactive' queues, not yet in active rotation.</em><a name="171" href="#171">171</a> <em> * Linked-list of keys for the queues.</em><a name="172" href="#172">172</a> <em> */</em><a name="173" href="#173">173</a> <strong>protected</strong> BlockingQueue<String> inactiveQueues =<a name="174" href="#174">174</a> <strong>new</strong> LinkedBlockingQueue<String>();<a name="175" href="#175">175</a> <a name="176" href="#176">176</a> <em>/**<em>*</em></em><a name="177" href="#177">177</a> <em> * 'retired' queues, no longer considered for activation.</em><a name="178" href="#178">178</a> <em> * Linked-list of keys for queues.</em><a name="179" href="#179">179</a> <em> */</em><a name="180" 
href="#180">180</a> <strong>protected</strong> BlockingQueue<String> retiredQueues =<a name="181" href="#181">181</a> <strong>new</strong> LinkedBlockingQueue<String>();<a name="182" href="#182">182</a> <a name="183" href="#183">183</a> <em>/**<em>* all per-class queues from whom a URI is outstanding */</em></em><a name="184" href="#184">184</a> <strong>protected</strong> Bag inProcessQueues = <a name="185" href="#185">185</a> BagUtils.synchronizedBag(<strong>new</strong> HashBag()); <em class="comment">// of ClassKeyQueue</em><a name="186" href="#186">186</a> <a name="187" href="#187">187</a> <em>/**<em>*</em></em><a name="188" href="#188">188</a> <em> * All per-class queues held in snoozed state, sorted by wake time.</em><a name="189" href="#189">189</a> <em> */</em><a name="190" href="#190">190</a> <strong>protected</strong> SortedSet<WorkQueue> snoozedClassQueues =<a name="191" href="#191">191</a> Collections.synchronizedSortedSet(<strong>new</strong> TreeSet<WorkQueue>());<a name="192" href="#192">192</a> <a name="193" href="#193">193</a> <em>/**<em>* Timer for tasks which wake head item of snoozedClassQueues */</em></em><a name="194" href="#194">194</a> <strong>protected</strong> <strong>transient</strong> Timer wakeTimer;<a name="195" href="#195">195</a> <a name="196" href="#196">196</a> <em>/**<em>* Task for next wake */</em> </em><a name="197" href="#197">197</a> <strong>protected</strong> <strong>transient</strong> WakeTask nextWake; <a name="198" href="#198">198</a> <a name="199" href="#199">199</a> <strong>protected</strong> <a href="../../../../org/archive/crawler/frontier/WorkQueue.html">WorkQueue</a> longestActiveQueue = <strong>null</strong>;<a name="200" href="#200">200</a> <a name="201" href="#201">201</a> <em>/**<em>* how long to wait for a ready queue when there's nothing snoozed */</em></em><a name="202" href="#202">202</a> <strong>private</strong> <strong>static</strong> <strong>final</strong> <strong>long</strong> DEFAULT_WAIT = 1000; <em 
class="comment">// 1 second</em><a name="203" href="#203">203</a> <a name="204" href="#204">204</a> <em>/**<em>* a policy for assigning 'cost' values to CrawlURIs */</em></em><a name="205" href="#205">205</a> <strong>private</strong> <strong>transient</strong> <a href="../../../../org/archive/crawler/frontier/CostAssignmentPolicy.html">CostAssignmentPolicy</a> costAssignmentPolicy;<a name="206" href="#206">206</a> <a name="207" href="#207">207</a> <em>/**<em>* all policies available to be chosen */</em></em><a name="208" href="#208">208</a> String[] AVAILABLE_COST_POLICIES = <strong>new</strong> String[] {<a name="209" href="#209">209</a> ZeroCostAssignmentPolicy.<strong>class</strong>.getName(),<a name="210" href="#210">210</a> UnitCostAssignmentPolicy.<strong>class</strong>.getName(),<a name="211" href="#211">211</a> WagCostAssignmentPolicy.<strong>class</strong>.getName(),<a name="212" href="#212">212</a> AntiCalendarCostAssignmentPolicy.<strong>class</strong>.getName()};<a name="213" href="#213">213</a> <a name="214" href="#214">214</a> <em>/**<em>*</em></em><a name="215" href="#215">215</a> <em> * Create the CommonFrontier</em><a name="216" href="#216">216</a> <em> * </em><a name="217" href="#217">217</a> <em> * @param name</em><a name="218" href="#218">218</a> <em> * @param description</em><a name="219" href="#219">219</a> <em> */</em><a name="220" href="#220">220</a> <strong>public</strong> <a href="../../../../org/archive/crawler/frontier/WorkQueueFrontier.html">WorkQueueFrontier</a>(String name, String description) {<a name="221" href="#221">221</a> <em class="comment">// The 'name' of all frontiers should be the same (URIFrontier.ATTR_NAME)</em><a name="222" href="#222">222</a> <em class="comment">// therefore we'll ignore the supplied parameter.</em><a name="223" href="#223">223</a> <strong>super</strong>(Frontier.ATTR_NAME, description);<a name="224" href="#224">224</a> <a href="../../../../org/archive/crawler/settings/Type.html">Type</a> t = 
addElementToDefinition(<strong>new</strong> <a href="../../../../org/archive/crawler/settings/SimpleType.html">SimpleType</a>(ATTR_HOLD_QUEUES,<a name="225" href="#225">225</a> <span class="string">"Whether to hold newly-created per-host URI work"</span> +<a name="226" href="#226">226</a> <span class="string">" queues until needed to stay busy. If false (default),"</span> +<a name="227" href="#227">227</a> <span class="string">" all queues may contribute URIs for crawling at all"</span> +<a name="228" href="#228">228</a> <span class="string">" times. If true, queues begin (and collect URIs) in"</span> +<a name="229" href="#229">229</a> <span class="string">" an 'inactive' state, and only when the Frontier needs"</span> +<a name="230" href="#230">230</a> <span class="string">" another queue to keep all ToeThreads busy will new"</span> +<a name="231" href="#231">231</a> <span class="string">" queues be activated."</span>, DEFAULT_HOLD_QUEUES));<a name="232" href="#232">232</a> t.setExpertSetting(<strong>true</strong>);<a name="233" href="#233">233</a> t.setOverrideable(false);<a name="234" href="#234">234</a> t = addElementToDefinition(<strong>new</strong> <a href="../../../../org/archive/crawler/settings/SimpleType.html">SimpleType</a>(ATTR_BALANCE_REPLENISH_AMOUNT,<a name="235" href="#235">235</a> <span class="string">"Amount to replenish a queue's activity balance when it becomes "</span> +<a name="236" href="#236">236</a> <span class="string">"active. Larger amounts mean more URIs will be tried from the "</span> +<a name="237" href="#237">237</a> <span class="string">"queue before it is deactivated in favor of waiting queues. 
"</span> +<a name="238" href="#238">238</a> <span class="string">"Default is 3000"</span>, DEFAULT_BALANCE_REPLENISH_AMOUNT));<a name="239" href="#239">239</a> t.setExpertSetting(<strong>true</strong>);<a name="240" href="#240">240</a> t.setOverrideable(<strong>true</strong>);<a name="241" href="#241">241</a> t = addElementToDefinition(<strong>new</strong> <a href="../../../../org/archive/crawler/settings/SimpleType.html">SimpleType</a>(ATTR_ERROR_PENALTY_AMOUNT,<a name="242" href="#242">242</a> <span class="string">"Amount to additionally penalize a queue when one of "</span> +<a name="243" href="#243">243</a> <span class="string">"its URIs fails completely. Accelerates deactivation or "</span> +<a name="244" href="#244">244</a> <span class="string">"full retirement of problem queues and unresponsive sites. "</span> +<a name="245" href="#245">245</a> <span class="string">"Default is 100"</span>, DEFAULT_ERROR_PENALTY_AMOUNT));<a name="246" href="#246">246</a> t.setExpertSetting(<strong>true</strong>);<a name="247" href="#247">247</a> t.setOverrideable(<strong>true</strong>);<a name="248" href="#248">248</a> t = addElementToDefinition(<strong>new</strong> <a href="../../../../org/archive/crawler/settings/SimpleType.html">SimpleType</a>(ATTR_QUEUE_TOTAL_BUDGET,<a name="249" href="#249">249</a> <span class="string">"Total activity expenditure allowable to a single queue; queues "</span> +<a name="250" href="#250">250</a> <span class="string">"over this expenditure will be 'retired' and crawled no more. 
"</span> +<a name="251" href="#251">251</a> <span class="string">"Default of -1 means no ceiling on activity expenditures is "</span> +<a name="252" href="#252">252</a> <span class="string">"enforced."</span>, DEFAULT_QUEUE_TOTAL_BUDGET));<a name="253" href="#253">253</a> t.setExpertSetting(<strong>true</strong>);<a name="254" href="#254">254</a> t.setOverrideable(<strong>true</strong>);<a name="255" href="#255">255</a> <a name="256" href="#256">256</a> t = addElementToDefinition(<strong>new</strong> <a href="../../../../org/archive/crawler/settings/SimpleType.html">SimpleType</a>(ATTR_COST_POLICY,<a name="257" href="#257">257</a> <span class="string">"Policy for calculating the cost of each URI attempted. "</span> +<a name="258" href="#258">258</a> <span class="string">"The default UnitCostAssignmentPolicy considers the cost of "</span> +<a name="259" href="#259">259</a> <span class="string">"each URI to be '1'."</span>, DEFAULT_COST_POLICY, AVAILABLE_COST_POLICIES));<a name="260" href="#260">260</a> t.setExpertSetting(<strong>true</strong>);<a name="261" href="#261">261</a> <a name="262" href="#262">262</a> t = addElementToDefinition(<strong>new</strong> <a href="../../../../org/archive/crawler/settings/SimpleType.html">SimpleType</a>(ATTR_SNOOZE_DEACTIVATE_MS,<a name="263" href="#263">263</a> <span class="string">"Threshold above which any 'snooze' delay will cause the "</span> +<a name="264" href="#264">264</a> <span class="string">"affected queue to go inactive, allowing other queues a "</span> +<a name="265" href="#265">265</a> <span class="string">"chance to rotate into active state. Typically set to be "</span> +<a name="266" href="#266">266</a> <span class="string">"longer than the politeness pauses between successful "</span> +<a name="267" href="#267">267</a> <span class="string">"fetches, but shorter than the connection-failed "</span> +<a name="268" href="#268">268</a> <span class="string">"'retry-delay-seconds'. 
(Default is 5 minutes.)"</span>, <a name="269" href="#269">269</a> DEFAULT_SNOOZE_DEACTIVATE_MS));<a name="270" href="#270">270</a> t.setExpertSetting(<strong>true</strong>);<a name="271" href="#271">271</a> t.setOverrideable(false);<a name="272" href="#272">272</a> t = addElementToDefinition(<strong>new</strong> <a href="../../../../org/archive/crawler/settings/SimpleType.html">SimpleType</a>(ATTR_TARGET_READY_QUEUES_BACKLOG,<a name="273" href="#273">273</a> <span class="string">"Target size for backlog of ready queues. This many queues "</span> +<a name="274" href="#274">274</a> <span class="string">"will be brought into 'ready' state even if a thread is "</span> +<a name="275" href="#275">275</a> <span class="string">"not waiting. Only has effect if 'hold-queues' is true. "</span> +<a name="276" href="#276">276</a> <span class="string">"Default is 50."</span>, DEFAULT_TARGET_READY_QUEUES_BACKLOG));<a name="277" href="#277">277</a> t.setExpertSetting(<strong>true</strong>);
快捷鍵說明
複製代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -