// china_pubparser.java
package com.booksearch.service.htmlparser;
/************************************************************
FileName: China_pubparser.java
Author: lichao
Date: 11/14/08
Description: Extracts content matching the search keyword from www.china-pub.com
Class List: China_pubparser
***********************************************************/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Locale;
import java.util.StringTokenizer;
import org.apache.log4j.Logger;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import com.booksearch.orm.Book;
import com.booksearch.orm.Price;
/**
* Class: China_pubparser
* Description: Extracts content matching the search keyword from www.china-pub.com
* extends: none
* implements: HtmlParser<String>
* @author li chao
* @since 11/14/08
*/
public class China_pubparser implements HtmlParser<String> {
//private String url = "http://www.china-pub.com/s/?key1=java" ;
/* Holds the records extracted from one page of this site */
private ArrayList<Book> list;
// private static final Logger logger;
//
// static
// {
// logger = Logger.getLogger(com.booksearch.service.htmlparser.China_pubparser.class);
// }
/**
 * Function: nekohtmlParser
 * Description: Downloads the page at the given address, decodes it as gb2312 and
 * parses it with the NekoHTML DOM parser.
 * Calls: no
 * Called By: unknown from this file
 * @param url address of the page to fetch and parse
 * @return the DOM document produced by the parser
 * @throws Exception if the URL is malformed, the page cannot be read (the read
 *         timeout is 30 seconds) or the parser rejects the input
 */
public Document nekohtmlParser(String url)throws Exception{
    DOMParser parser = new DOMParser();
    // Default encoding handed to the parser. The page itself is supplied below as an
    // already-decoded character stream.
    // NOTE(review): presumably this setting is not consulted for Reader input - confirm.
    parser.setProperty(
            "http://cyberneko.org/html/properties/default-encoding",
            "UTF-8");

    /* Open the connection to the source site; only the read timeout is bounded. */
    URLConnection urlConnection = new URL(url).openConnection();
    urlConnection.setReadTimeout(30000);

    /* Wrap the site's byte stream in a character stream decoded as gb2312. */
    BufferedReader reader = new BufferedReader(
            new InputStreamReader(urlConnection.getInputStream(), "gb2312"));
    try {
        parser.parse(new InputSource(reader));
    } finally {
        // Always release the stream so repeated calls do not leak it.
        try {
            reader.close();
        } catch (IOException ignored) {
            // A failure while closing must not hide a parse error or discard a
            // document that was parsed successfully.
        }
    }
    return parser.getDocument();
}
/**
 * Function: mainService
 * Description: Looks through every {@code <script>} element of the parsed page for
 * text containing "dt", takes the first bracketed array that follows it, splits
 * that array into "}"-terminated records and converts each record into a Book
 * with an attached Price.
 * Calls: parseBookRecord()
 * Called By: unknown from this file
 * @param doc parsed page to extract the records from
 * @param flag not read by this method
 * @return the books found on the page; the same list is also stored in the
 *         {@code list} field, replacing its previous content
 * @throws NumberFormatException if a non-blank list price or discount is not numeric
 */
public ArrayList<Book> mainService(Document doc,boolean flag) {
    /* At most this many records are read from one script block. */
    final int maxRecords = 20;
    // Fixed symbols keep the decimal separator a '.', so the formatted price can be
    // parsed back by Double.valueOf regardless of the JVM's default locale.
    DecimalFormat priceFormat = new DecimalFormat("####.00", new DecimalFormatSymbols(Locale.US));

    list = new ArrayList<Book>();
    /* Select the <script></script> nodes. */
    NodeList scripts = doc.getElementsByTagName("script");
    for (int i = 0; i < scripts.getLength(); i++) {
        NodeList children = scripts.item(i).getChildNodes();
        if (children.getLength() == 0) {
            continue;
        }
        // getNodeValue() is null for node types that carry no value.
        String text = children.item(0).getNodeValue();
        if (text == null) {
            continue;
        }
        int dtStart = text.indexOf("dt");
        if (dtStart == -1) {
            continue;
        }
        /* The node may hold several [] pairs, so drop everything before "dt" first. */
        text = text.substring(dtStart);

        int open = text.indexOf("[");
        // Search for the closing bracket only after the opening one, so a stray "]"
        // earlier in the text cannot produce an inverted range.
        int close = (open == -1) ? -1 : text.indexOf("]", open);
        if (close == -1) {
            continue;
        }
        // text starts with "dt", so open >= 2 and open - 1 is always a valid index.
        String bookStr = text.substring(open - 1, close);

        /* Split on "}" into one token per record. */
        StringTokenizer records = new StringTokenizer(bookStr, "}");
        // The final token is never treated as a record, and no more than maxRecords
        // are read. Bounding the loop here avoids touching records that were never read.
        // NOTE(review): skipping the final token assumes non-record text sits between
        // the last "}" and "]" - confirm against live page data.
        int usable = Math.min(maxRecords, records.countTokens() - 1);
        for (int k = 0; k < usable; k++) {
            list.add(parseBookRecord(records.nextToken(), priceFormat));
        }
    }
    return list;
}

/**
 * Converts one record (a comma separated run of fields) into a Book whose Price
 * carries the discount, discounted price and detail URL found in the record.
 * @param record text of a single record, without its terminating "}"
 * @param priceFormat formatter used to round the discounted price to two decimals
 * @return the populated book; fields absent from the record are left unset
 */
private Book parseBookRecord(String record, DecimalFormat priceFormat) {
    Book book = new Book();
    Price price = new Price();
    /* Split on "," into individual fields. */
    StringTokenizer fields = new StringTokenizer(record, ",");
    while (fields.hasMoreTokens()) {
        String field = fields.nextToken();
        if (field.indexOf("sm:") != -1) {
            /* Book name. */
            book.setBookName(getBookName(field));
        } else if (field.indexOf("dj:") != -1) {
            /* List price; skipped when missing or blank. */
            String fixPriceText = getBookFixPrice(field);
            if (fixPriceText != null && fixPriceText.trim().length() > 0) {
                book.setBookFixPrice(Double.valueOf(fixPriceText.trim()));
            }
        } else if (field.indexOf("zk:") != -1) {
            /* Discount and the resulting discounted price. */
            String discountText = this.getBookDiscount(field);
            // The whole computation is guarded: a missing or blank discount leaves
            // both the discount and the discounted price unset.
            if (discountText != null && discountText.trim().length() > 0) {
                Float discount = Float.valueOf(discountText.trim());
                price.setChina_pubDiscount(discount);
                // Uses the list price stored on the book so far.
                // NOTE(review): assumes the "dj:" field precedes "zk:" in a record - confirm.
                double bookPrice = book.getBookFixPrice() * discount;
                /* Keep two decimal places. */
                price.setChina_pubPrice(Double.valueOf(priceFormat.format(bookPrice)));
            }
        } else if (field.indexOf("sh:") != -1) {
            /* ISBN. */
            book.setBookISBN(getBookISBN(field));
        } else if (field.indexOf("zz:") != -1) {
            /* Author. */
            book.setBookAuthor(getBookAuthor(field));
        } else if (field.indexOf("cq:") != -1) {
            /* Publication date; skipped when missing or empty. */
            String publishTime = getBookPublishTime(field);
            if (publishTime != null && !"".equals(publishTime)) {
                book.setBookPublishTime(publishTime);
            }
        } else if (field.indexOf("cs:") != -1) {
            /* Publisher. */
            book.setBookPublisher(getBookPublisher(field));
        } else if (field.indexOf("pd:") != -1) {
            /* Cover image address. */
            book.setBookImage(getBookImage(field));
        } else if (field.indexOf("th:") != -1) {
            /* Address of the book's detail page; stored on the price, not the book. */
            price.setChina_pubUrl(getBookUrl(field));
        }
    }
    book.setPrice(price);
    return book;
}
public Price getDetailInfo(Document doc) {
Price price = new Price();
String bookFixPrice = "";
/*過濾出<script></script>結點*/
NodeList servers = doc.getElementsByTagName("script");
for (int i = 0; i < servers.getLength(); i++) {
Node node = servers.item(i);
NodeList childNode = node.getChildNodes();
if (childNode.getLength() > 0
&& (childNode.item(0).getNodeValue().indexOf("dt") != -1)) {
String tem = childNode.item(0).getNodeValue();
/*因為結點中不止有一個[],所以要先把第一二個[]過濾掉*/
tem = tem.substring(tem.indexOf("dt"));
/*過濾出數據組中的元素*/
if(tem.indexOf("[") != -1&& tem.indexOf("]")!= -1){
String bookStr = tem.substring(tem.indexOf("[")-1, tem
.indexOf("]"));
if(bookStr.indexOf("{")!= -1&&bookStr.indexOf("}")!= -1){
String priceStr = bookStr.substring(bookStr.indexOf("{") + 1, bookStr.indexOf("}"));
/*按","進行分詞*/
StringTokenizer temp =new StringTokenizer(priceStr,",");
while(temp.hasMoreElements()){
String temStr = temp.nextToken();
if(temStr.indexOf("dj:")!=-1){
bookFixPrice = getBookFixPrice(temStr);
/*取出圖書折扣和打折后的價格*/
}else if(temStr.indexOf("zk:")!=-1){
String bookDiscount = this.getBookDiscount(temStr);
//System.out.println(bookDiscount);
price.setChina_pubDiscount(Float.valueOf(bookDiscount.trim()));
/*求出本網站普通會員的買書價格*/
double bookPrice = Double.valueOf(bookFixPrice)*Double.valueOf(bookDiscount.trim());
// DecimalFormat df=(DecimalFormat)DecimalFormat.getInstance();
// df.setMaximumFractionDigits(2);
DecimalFormat df = new DecimalFormat("####.00");
price.setChina_pubPrice(Double.valueOf(df.format(bookPrice)));
//price.setChina_pubPrice(bookPrice);
//System.out.println(bookPrice);
}else if(temStr.indexOf("th:")!=-1){
String bookUrl = getBookUrl(temStr);
price.setChina_pubUrl(bookUrl);
//System.out.println(bookUrl);
}
}
}
}
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -