?? extracttext.java

?? ajax lucene 部分源代碼 HTMLParser.java MuiltiSearchTest.java

?? JAVA

字號:

package test;

import net.htmlparser.jericho.*;

import java.util.*;
import java.io.*;
import java.net.*;

public class ExtractText {
	public static void main(String[] args) throws Exception {

	}

	public String getConent() throws Exception {
		// String sourceUrlString="http://blog.s135.com/nginx_php_v5/";

		String sourceUrlString = "http://www.11467.com/";
		sourceUrlString = "http://www.baidu.com/s?lm=0&si=&rn=10&ie=gb2312&ct=0&wd=%BF%AA%B7%A2%D7%D4%BC%BA%B5%C4%CB%D1%CB%F7%D2%FD%C7%E6++%D4%B4%B4%FA%C2%EB&pn=0&ver=0&cl=3&uim=0&usm=0";
		sourceUrlString = "http://mall.sina.com.cn/product_1436088.htm";
		InputStream inputstream = new FileInputStream(
				"F:\\lucene\\test\\swtfaq.html");
		// InputStream
		// if (args.length==0)
		// System.err.println("Using default argument of
		// \""+sourceUrlString+'"');
		// else
		// sourceUrlString=args[0];
		// if (sourceUrlString.indexOf(':')==-1)
		// sourceUrlString="file:"+sourceUrlString;
		MicrosoftTagTypes.register();
		PHPTagTypes.register();
		PHPTagTypes.PHP_SHORT.deregister(); // remove PHP short tags for this
		// example otherwise they override
		// processing instructions
		MasonTagTypes.register();
		Source source = new Source(inputstream);// new URL(sourceUrlString));

		// Call fullSequentialParse manually as most of the source will be
		// parsed.
		source.fullSequentialParse();

		System.out.println("Document title:");
		String title = getTitle(source);
		System.out.println(title == null ? "(none)" : title);

		System.out.println("\nDocument description:");
		String description = getMetaValue(source, "description");
		System.out.println(description == null ? "(none)" : description);

		System.out.println("\nDocument keywords:");
		String keywords = getMetaValue(source, "keywords");
		System.out.println(keywords == null ? "(none)" : keywords);

		System.out.println("\nLinks to other documents:");
		List<Element> linkElements = source.getAllElements(HTMLElementName.A);
		for (Element linkElement : linkElements) {
			String href = linkElement.getAttributeValue("href");
			if (href == null)
				continue;
			// A element can contain other tags so need to extract the text from
			// it:
			String label = linkElement.getContent().getTextExtractor()
					.toString();
			System.out.println(label + " <" + href + '>');
		}

		System.out
				.println("\nAll text from file (exluding content inside SCRIPT and STYLE elements):\n");
		System.out.println(source.getTextExtractor().setIncludeAttributes(true)
				.toString());

		System.out
				.println("\nSame again but this time extend the TextExtractor class to also exclude text from P elements and any elements with class=\"control\":\n");
		TextExtractor textExtractor = new TextExtractor(source) {
			public boolean excludeElement(StartTag startTag) {
				return startTag.getName() == HTMLElementName.P
						|| "control".equalsIgnoreCase(startTag
								.getAttributeValue("class"));
			}
		};

		System.out.println(textExtractor.setIncludeAttributes(true).toString());
		return textExtractor.setIncludeAttributes(true).toString();
	}

	private static String getTitle(Source source) {
		Element titleElement = source.getFirstElement(HTMLElementName.TITLE);
		if (titleElement == null)
			return null;
		// TITLE element never contains other tags so just decode it collapsing
		// whitespace:
		return CharacterReference.decodeCollapseWhiteSpace(titleElement
				.getContent());
	}

	private static String getMetaValue(Source source, String key) {
		for (int pos = 0; pos < source.length();) {
			StartTag startTag = source.getNextStartTag(pos, "name", key, false);
			if (startTag == null)
				return null;
			if (startTag.getName() == HTMLElementName.META)
				return startTag.getAttributeValue("content"); // Attribute
			// values are
			// automatically
			// decoded
			pos = startTag.getEnd();
		}
		return null;
	}
}

?? 文件大小 701 K

?? 上傳用戶 guigong

?? 所屬分類 Java編程

??? 相關標簽

#MuiltiSearchTest #java #HTMLParser #lucene

?? 快捷鍵說明

復制代碼 Ctrl + C

搜索代碼 Ctrl + F

全屏模式 F11

切換主題 Ctrl + Shift + D

顯示快捷鍵 ?

增大字號 Ctrl + =

減小字號 Ctrl + -

亚洲欧美第一页_禁久久精品乱码_粉嫩av一区二区三区免费野_久草精品视频

?? extracttext.java

?? 快捷鍵說明