?? jerichoextractorhtml.java
字號:
resources = new ArrayList<String>(); // If element is applet and code value does not end with // '.class' then append '.class' to the code value. if (APPLET.equals(elementName) && !attrValue.endsWith(CLASSEXT)) { resources.add(attrValue + CLASSEXT); } else { resources.add(attrValue); } } // VALUE else if (((attr = attributes.get("value")) != null) && ((attrValue = attr.getValue()) != null)) { if (TextUtils.matches(LIKELY_URI_PATH, attrValue) && overlyEagerLinkDetection) { CharSequence context = Link.elementContext(elementName, attr .getKey()); processLink(curi, attrValue, context); } } // STYLE else if (((attr = attributes.get("style")) != null) && ((attrValue = attr.getValue()) != null)) { // STYLE inline attribute // then, parse for URIs this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi, attrValue, getController()); } // handle codebase/resources if (resources == null) return; Iterator<String> iter = resources.iterator(); UURI codebaseURI = null; String res = null; try { if (codebase != null) { // TODO: Pass in the charset. codebaseURI = UURIFactory.getInstance(curi.getUURI(), codebase); } while (iter.hasNext()) { res = iter.next(); res = StringEscapeUtils.unescapeHtml(res); if (codebaseURI != null) { res = codebaseURI.resolve(res).toString(); } processEmbed(curi, res, element); // TODO: include attribute // too } } catch (URIException e) { curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase); } catch (IllegalArgumentException e) { DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" + "codebase=" + codebase + " res=" + res + "\n" + DevUtils.extraInfo(), e); } } protected boolean processMeta(CrawlURI curi, Element element) { String name = element.getAttributeValue("name"); String httpEquiv = element.getAttributeValue("http-equiv"); String content = element.getAttributeValue("content"); if ("robots".equals(name) && content != null) { curi.putString(A_META_ROBOTS, content); RobotsHonoringPolicy policy = getSettingsHandler().getOrder() .getRobotsHonoringPolicy(); String contentLower = content.toLowerCase(); if ((policy == null || (!policy.isType(curi, RobotsHonoringPolicy.IGNORE) && !policy.isType(curi, RobotsHonoringPolicy.CUSTOM))) && (contentLower.indexOf("nofollow") >= 0 || contentLower .indexOf("none") >= 0)) { // if 'nofollow' or 'none' is specified and the // honoring policy is not IGNORE or CUSTOM, end html extraction logger.fine("HTML extraction skipped due to robots meta-tag " + "for: " + curi.toString()); return true; } } if ("refresh".equals(httpEquiv) && content != null) { String refreshUri = content.substring(content.indexOf("=") + 1); try { curi.createAndAddLinkRelativeToBase(refreshUri, "meta", Link.REFER_HOP); } catch (URIException e) { if (getController() != null) { getController().logUriError(e, curi.getUURI(), refreshUri); } else { logger.info("Failed createAndAddLinkRelativeToBase " + curi + ", " + element.toString() + ", " + refreshUri + ": " + e); } } } return false; } protected void processScript(CrawlURI curi, Element element) { // first, get attributes of script-open tag // as per any other tag processGeneralTag(curi, element, element.getAttributes()); // then, apply best-effort string-analysis heuristics // against any code present (false positives are OK) processScriptCode(curi, element.getContent()); } protected void processStyle(CrawlURI curi, Element element) { // First, get attributes of script-open tag as per any other tag. processGeneralTag(curi, element, element.getAttributes()); // then, parse for URIs this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi, element.getContent(), getController()); } protected void processForm(CrawlURI curi, Element element) { String action = element.getAttributeValue("action"); String name = element.getAttributeValue("name"); String queryURL = ""; final boolean ignoreFormActions = ((Boolean) getUncheckedAttribute( curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue(); if (ignoreFormActions) return; numberOfFormsProcessed++; // get all form fields FormFields formFields = element.findFormFields(); for (Iterator fieldsIter = formFields.iterator(); fieldsIter.hasNext();) { // for each form field FormField formField = (FormField) fieldsIter.next(); // for each form control for (Iterator controlIter = formField.getFormControls().iterator(); controlIter.hasNext();) { FormControl formControl = (FormControl) controlIter.next(); // get name of control element (and URLEncode it) String controlName = formControl.getName(); // retrieve list of values - submit needs special handling Collection controlValues; if (!(formControl.getFormControlType() == FormControlType.SUBMIT)) { controlValues = formControl.getValues(); } else { controlValues = formControl.getPredefinedValues(); } if (controlValues.size() > 0) { // for each value set for (Iterator valueIter = controlValues.iterator(); valueIter.hasNext();) { String value = (String) valueIter.next(); queryURL += "&" + controlName + "=" + value; } } else { queryURL += "&" + controlName + "="; } } } // clean up url if (action == null) { queryURL = queryURL.replaceFirst("&", "?"); } else { if (!action.contains("?")) queryURL = queryURL.replaceFirst("&", "?"); queryURL = action + queryURL; } CharSequence context = Link.elementContext(element.getName(), "name=" + name); processLink(curi, queryURL, context); } /** * Run extractor. This method is package visible to ease testing. * * @param curi * CrawlURI we're processing. * @param cs * Sequence from underlying ReplayCharSequence. */ void extract(CrawlURI curi, CharSequence cs) { Source source = new Source(cs); List elements = source.findAllElements(StartTagType.NORMAL); for (Iterator elementIter = elements.iterator(); elementIter.hasNext();) { Element element = (Element) elementIter.next(); String elementName = element.getName(); Attributes attributes; if (elementName.equals(HTMLElementName.META)) { if (processMeta(curi, element)) { // meta tag included NOFOLLOW; abort processing break; } } else if (elementName.equals(HTMLElementName.SCRIPT)) { processScript(curi, element); } else if (elementName.equals(HTMLElementName.STYLE)) { processStyle(curi, element); } else if (elementName.equals(HTMLElementName.FORM)) { processForm(curi, element); } else if (!(attributes = element.getAttributes()).isEmpty()) { processGeneralTag(curi, element, attributes); } } } /* * (non-Javadoc) * * @see org.archive.crawler.framework.Processor#report() */ public String report() { StringBuffer ret = new StringBuffer(); ret.append("Processor: org.archive.crawler.extractor.JerichoExtractorHTML\n"); ret.append(" Function: Link extraction on HTML documents\n"); ret.append(" CrawlURIs handled: " + this.numberOfCURIsHandled + "\n"); ret.append(" Forms processed: " + this.numberOfFormsProcessed + "\n"); ret.append(" Links extracted: " + this.numberOfLinksExtracted + "\n\n"); return ret.toString(); }}
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -