// commonengine.java
// 字號:
package com.laozizhu.search.impl;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.laozizhu.search.Engine;
import com.laozizhu.search.Item;
import com.laozizhu.search.ItemBase;
import com.laozizhu.search.util.HTMLDecoder;
import com.laozizhu.search.util.PageService;
/**
 * Base implementation of a search {@link Engine}.<br>
 * It drives the whole parse of one page: fetch the page, then extract title,
 * author, creation date and body with regular expressions.<br>
 * Subclasses can customise details by overriding the hook methods
 * ({@link #afterTitle(String)}, {@link #afterBody(String)}) or the pattern getters.
 * <p>
 * The compiled patterns are created lazily and cached in {@code volatile} fields;
 * creation and invalidation happen while holding the instance lock.
 *
 * @author 老紫竹(laozizhu.com)
 */
public class CommonEngine implements Engine {

  /** Default title regex: the text between {@code <title>} and {@code </title>}. */
  private static final Pattern DEFAULT_TITLE_PATTERN = Pattern.compile("(?i)<title>(.*?)</title>", Pattern.DOTALL);

  /** Default author regex: the {@code content} attribute of {@code <meta name="Author" ...>}. */
  private static final Pattern DEFAULT_AUTHOR_PATTERN = Pattern.compile("(?i)<meta name=\"Author\".*?content=\"(.+?)\".*?>",
      Pattern.DOTALL);

  // Precompiled patterns used by stripHtml(String).
  /** Opening tags starting with {@code <p} (attributes may span lines). */
  private static final Pattern PARAGRAPH_TAG = Pattern.compile("(?i)<p[^>]*>");
  /** Opening heading tags {@code <h0>}..{@code <h9>}, with or without attributes. */
  private static final Pattern HEADING_TAG = Pattern.compile("(?i)<h\\d[^>]*>");
  /** {@code <br>}, {@code <br/>}, {@code <br />}. */
  private static final Pattern LINE_BREAK_TAG = Pattern.compile("(?i)<br\\s*/?>");
  /** Any remaining tag, matched one tag at a time (never across a {@code >}). */
  private static final Pattern ANY_TAG = Pattern.compile("<[^>]*>");
  /** A line break followed by one or more blank (whitespace-only) lines. */
  private static final Pattern BLANK_LINES = Pattern.compile("(\r?\n(\\s*\r?\n)+)");

  private String titlePatternString;
  private String authorPatternString;
  private String datePatternString;
  private List<String> bodyPatternStringList;

  // Lazily compiled caches. They are volatile because they are read outside the
  // lock in the getters below; without volatile the unsynchronized first check
  // is not safe under the Java memory model.
  private volatile Pattern titlePattern;
  private volatile Pattern authorPattern;
  private volatile Pattern datetPattern;
  private volatile List<Pattern> bodyPatternList;

  /** Whether HTML markup is removed from the extracted body. */
  private boolean stripHtml = true;

  /** Character set passed to the page fetcher. */
  private String charset = "UTF-8";

  /**
   * Hook invoked after the body text has been extracted, before any HTML stripping.
   *
   * @param body the extracted, unprocessed body text
   * @return the processed text; this implementation returns {@code body} unchanged
   */
  public String afterBody(String body) {
    return body;
  }

  /**
   * Hook invoked after the title has been extracted and trimmed.
   *
   * @param title the extracted title
   * @return the processed title; this implementation returns {@code title} unchanged
   */
  public String afterTitle(String title) {
    return title;
  }

  /**
   * Fetches the page at {@code url} and parses it into an {@link Item}.
   * <p>
   * Title, author and date are set only when their pattern matches. Every match
   * of every body pattern is appended to the body, each followed by an
   * {@code <hr class='laozizhu'/>} separator; the result is passed through
   * {@link #afterBody(String)} and, if {@link #isStripHtml()} is true, through
   * {@link #stripHtml(String)}.
   *
   * @param url the address of the page to parse
   * @return the parsed item; its URL is always set
   */
  public Item parseItem(String url) {
    String page = PageService.getPage(url, getCharset());
    if (page == null) {
      // Defensive: PageService's failure contract is not visible here. Treat a
      // missing page as empty so nothing matches instead of throwing NPE.
      page = "";
    }
    Item item = new ItemBase();
    item.setUrl(url);

    Matcher m = getTitlePattern().matcher(page);
    if (m.find()) {
      item.setTitle(afterTitle(m.group(1).trim()));
    }

    m = getAuthorPattern().matcher(page);
    if (m.find()) {
      item.setAuthor(m.group(1).trim());
    }

    // The date pattern is optional: it is null when no date regex is configured.
    Pattern datePattern = getDatetPattern();
    if (datePattern != null) {
      m = datePattern.matcher(page);
      if (m.find()) {
        item.setDatetimeCreate(joinDateGroups(m));
      }
    }

    StringBuilder body = new StringBuilder();
    for (Pattern bodyPattern : getBodyPatternList()) {
      m = bodyPattern.matcher(page);
      while (m.find()) {
        body.append(m.group(1).trim());
        body.append("<hr class='laozizhu'/>");
      }
    }
    item.setBody(afterBody(body.toString()));
    if (isStripHtml()) {
      item.setBody(stripHtml(item.getBody()));
    }
    return item;
  }

  /**
   * Builds the date string from a successful date match: with a single capturing
   * group the trimmed group is used; with several groups they are concatenated
   * in order. Groups that did not participate in the match are skipped so the
   * literal text "null" never ends up in the date.
   */
  private static String joinDateGroups(Matcher m) {
    int groups = m.groupCount();
    if (groups <= 1) {
      return m.group(1).trim();
    }
    StringBuilder joined = new StringBuilder();
    for (int i = 1; i <= groups; i++) {
      String part = m.group(i);
      if (part != null) {
        joined.append(part);
      }
    }
    return joined.toString();
  }

  /**
   * Returns the regex used to extract the title.
   *
   * @return the pattern compiled (with {@link Pattern#DOTALL}) from
   *         {@link #getTitlePatternString()}, or the default
   *         {@code <title>} pattern when that string is {@code null}
   */
  protected Pattern getTitlePattern() {
    Pattern cached = titlePattern;
    if (cached == null) {
      synchronized (this) {
        cached = titlePattern;
        if (cached == null) {
          String regex = getTitlePatternString();
          cached = (regex == null) ? DEFAULT_TITLE_PATTERN : Pattern.compile(regex, Pattern.DOTALL);
          titlePattern = cached;
        }
      }
    }
    return cached;
  }

  /**
   * Returns the regex used to extract the author.
   *
   * @return the pattern compiled (with {@link Pattern#DOTALL}) from
   *         {@link #getAuthorPatternString()}, or the default
   *         {@code <meta name="Author">} pattern when that string is {@code null}
   */
  protected Pattern getAuthorPattern() {
    Pattern cached = authorPattern;
    if (cached == null) {
      synchronized (this) {
        cached = authorPattern;
        if (cached == null) {
          String regex = getAuthorPatternString();
          cached = (regex == null) ? DEFAULT_AUTHOR_PATTERN : Pattern.compile(regex, Pattern.DOTALL);
          authorPattern = cached;
        }
      }
    }
    return cached;
  }

  /**
   * Returns the list of regexes used to extract body fragments.
   *
   * @return the patterns compiled (with {@link Pattern#DOTALL}) from
   *         {@link #getBodyPatternStringList()}; empty when that list is
   *         {@code null}. {@code null} entries are skipped.
   */
  protected List<Pattern> getBodyPatternList() {
    List<Pattern> cached = bodyPatternList;
    if (cached == null) {
      synchronized (this) {
        cached = bodyPatternList;
        if (cached == null) {
          // Fill a local list and publish it only when complete, so a reader on
          // the unsynchronized fast path never sees a partly built list.
          List<Pattern> compiled = new LinkedList<Pattern>();
          List<String> regexes = getBodyPatternStringList();
          if (regexes != null) {
            for (String regex : regexes) {
              if (regex != null) {
                compiled.add(Pattern.compile(regex, Pattern.DOTALL));
              }
            }
          }
          cached = compiled;
          bodyPatternList = cached;
        }
      }
    }
    return cached;
  }

  /**
   * Returns the regex used to extract the creation date.
   *
   * @return the pattern compiled (with {@link Pattern#DOTALL}) from
   *         {@link #getDatePatternString()}, or {@code null} when no date regex
   *         is configured (there is no default date pattern)
   */
  protected Pattern getDatetPattern() {
    Pattern cached = datetPattern;
    if (cached == null) {
      synchronized (this) {
        cached = datetPattern;
        if (cached == null) {
          String regex = getDatePatternString();
          if (regex == null) {
            // Nothing to compile; Pattern.compile(null) would throw NPE.
            return null;
          }
          cached = Pattern.compile(regex, Pattern.DOTALL);
          datetPattern = cached;
        }
      }
    }
    return cached;
  }

  public String getTitlePatternString() {
    return titlePatternString;
  }

  /**
   * Sets the title regex and discards the cached compiled pattern so the new
   * value takes effect on the next parse.
   */
  public void setTitlePatternString(String titlePatternString) {
    synchronized (this) {
      this.titlePatternString = titlePatternString;
      this.titlePattern = null;
    }
  }

  public String getAuthorPatternString() {
    return authorPatternString;
  }

  /**
   * Sets the author regex and discards the cached compiled pattern so the new
   * value takes effect on the next parse.
   */
  public void setAuthorPatternString(String authorPatternString) {
    synchronized (this) {
      this.authorPatternString = authorPatternString;
      this.authorPattern = null;
    }
  }

  public String getDatePatternString() {
    return datePatternString;
  }

  /**
   * Sets the date regex and discards the cached compiled pattern so the new
   * value takes effect on the next parse.
   */
  public void setDatePatternString(String datePatternString) {
    synchronized (this) {
      this.datePatternString = datePatternString;
      this.datetPattern = null;
    }
  }

  public List<String> getBodyPatternStringList() {
    return bodyPatternStringList;
  }

  /**
   * Sets the body regexes and discards the cached compiled patterns so the new
   * values take effect on the next parse.
   */
  public void setBodyPatternStringList(List<String> bodyPatternStringList) {
    synchronized (this) {
      this.bodyPatternStringList = bodyPatternStringList;
      this.bodyPatternList = null;
    }
  }

  /** @return whether HTML markup is stripped from the parsed body */
  public boolean isStripHtml() {
    return stripHtml;
  }

  public void setStripHtml(boolean stripHtml) {
    this.stripHtml = stripHtml;
  }

  public void setCharset(String charset) {
    this.charset = charset;
  }

  /**
   * Returns the character set used when fetching pages.
   *
   * @return the charset name; {@code "UTF-8"} unless changed
   */
  public String getCharset() {
    return charset;
  }

  /**
   * Removes HTML markup from a string.<br>
   * Paragraph, heading and {@code <br>} tags become line breaks; every other tag
   * is removed; runs of blank lines are collapsed into one line break.
   * <p>
   * The input must be reasonably well-formed: a stray {@code <} followed later
   * by a {@code >} is treated as a tag and everything in between is removed.
   *
   * @param content the text to clean; {@code null} is treated as empty
   * @return the cleaned text
   */
  public static String stripHtml(String content) {
    if (content == null) {
      return "";
    }
    // <p ...> becomes a line break
    content = PARAGRAPH_TAG.matcher(content).replaceAll("\r\n");
    // <h1>..<h9> (with or without attributes) become a line break
    content = HEADING_TAG.matcher(content).replaceAll("\r\n");
    // <br>, <br/>, <br /> become a line break
    content = LINE_BREAK_TAG.matcher(content).replaceAll("\r\n");
    // Remove every other tag, one tag at a time. The previous "<.*?/>" pass
    // started at the first '<' on a line and ran to the first "/>", which
    // deleted the text between an ordinary tag and a later self-closing tag.
    content = ANY_TAG.matcher(content).replaceAll("");
    // Encode special characters.
    // NOTE(review): presumably decode() resolves HTML entities and htmlencode()
    // re-escapes special characters; both helpers live outside this file - confirm.
    content = HTMLDecoder.decode(content);
    content = net.java2000.tools.StrTools.htmlencode(content);
    // Collapse multiple blank lines
    content = BLANK_LINES.matcher(content).replaceAll("\r\n");
    return content;
  }
}
/*
 * 快捷鍵說明
 * 復制代碼: Ctrl + C
 * 搜索代碼: Ctrl + F
 * 全屏模式: F11
 * 切換主題: Ctrl + Shift + D
 * 顯示快捷鍵: ?
 * 增大字號: Ctrl + =
 * 減小字號: Ctrl + -
 */