?? bookschinaparser.java
字號:
package com.booksearch.service.htmlparser;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.log4j.Logger;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import com.booksearch.orm.Book;
import com.booksearch.orm.Price;
/**
* Class:BookschinaParser
* Description: 根據檢索關鍵字到www.bookschina.cn抽取匹配內容
* extens:no
* implements:HtmlParser<Element>
* @author wang jiaqiang
* @since 11/09/08
*/
public class BookschinaParser implements HtmlParser<Element> {
/* Holds the records parsed from one result page of this site; mainService assigns a fresh list to it on every call. */
private ArrayList<Book> list;
// private static final Logger logger;
// static
// {
// logger = Logger.getLogger(com.booksearch.service.htmlparser.BookschinaParser.class);
// }
/**
 * Function: nekohtmlParser
 * Description: Downloads the page at the given address and parses it with the
 *              NekoHTML DOM parser, returning the resulting DOM document.
 *              Spaces in the address are replaced with '+' before the request is made.
 *              The response is decoded as gb2312, which is also set as the parser's
 *              default encoding.
 * Calls: no
 * Called By: no
 * @param url address of the page to fetch
 * @return Document the DOM tree produced by the parser
 * @throws Exception on a malformed address, an I/O failure (including a connect or
 *                   read timeout) or a parser error
 */
public Document nekohtmlParser(String url)throws Exception{
    /* Create the HTML parser and set the encoding it assumes by default. */
    DOMParser parser = new DOMParser();
    parser.setProperty(
            "http://cyberneko.org/html/properties/default-encoding",
            "gb2312");
    url = url.replace(" ", "+");
    URL u = new URL(url);
    /* Open the connection to the source site; bound both connect and read time
       so an unresponsive host cannot block the caller indefinitely. */
    URLConnection urlconn = u.openConnection();
    urlconn.setConnectTimeout(30000);
    urlconn.setReadTimeout(30000);
    /* Wrap the response byte stream in a character reader decoded as gb2312. */
    BufferedReader in = new BufferedReader(new InputStreamReader(urlconn.getInputStream(), "gb2312"));
    try {
        /* Parse the markup; the parser builds the DOM tree internally. */
        parser.parse(new InputSource(in));
    } finally {
        /* Always release the network stream, even when parsing fails. */
        in.close();
    }
    return parser.getDocument();
}
/**
* Function: mainService
* Description: Dom對轉化過來的xml進行解析,取得圖書的各個信息
* Calls: getBookImage(),getBookName(),getBookName(),getBookAuthor(),
* getBookPublisher(),getBookPublishTime(),getBookPrice()
* Called By: no
* @param doc as Document
* @return ArrayList<Book>
* @throws no
*/
public ArrayList<Book> mainService(Document doc,boolean flag) {
list = new ArrayList<Book>();
/* 取得所有<div>結點 */
NodeList servers = doc.getElementsByTagName("div");
for (int i = 0; i < servers.getLength(); i++) {
if(Node.ELEMENT_NODE == servers.item(i).getNodeType()){
Element serveritem = (Element) servers.item(i);
Book book = new Book();
Price price = new Price();
/* 過濾出<div class="pic">結點,也就是存放圖片記錄的結點 */
if ("pic".equals(serveritem.getAttribute("class"))) {
if(serveritem.hasChildNodes()){
NodeList childList1 = serveritem.getChildNodes();
Element childElement1 = (Element) childList1.item(0);
NodeList cdList1 = childElement1.getChildNodes();
for(int k = 0;k<cdList1.getLength();k++){
Node cdTem = cdList1.item(k);
if("IMG".equals(cdTem.getNodeName())){
Element cdElement1 = (Element) cdTem;
/* 取出圖書的圖片 */
String bookImage = this.getBookImage(cdElement1);
book.setBookImage(bookImage);
}
}
}
Element serveritem2 = (Element) servers.item(i+=1);
/*過濾出<div class="xunhuan">結點,也就是存放記錄的結點 */
if ("xunhuan".equals(serveritem2.getAttribute("class"))) {
NodeList childList = serveritem2.getChildNodes();
/* 循環遍歷<div class="xunhuan">的子結點 */
for (int j = 0; j < childList.getLength(); j++) {
Node childNode = childList.item(j);
/* 如果是元素結點(<span>,<li>),則進行分類處理,取出其中的文本值 */
if (childNode.getNodeType() == Node.ELEMENT_NODE) {
Element childElement = (Element) childNode;
NodeList cdList = childElement.getChildNodes();
//System.out.println("length>>"+cdList.getLength());
if(cdList.getLength()>7){
if(cdList.item(1).getNodeType() == Node.ELEMENT_NODE){
Element secElement = (Element) cdList.item(1);
/* 取出圖書的名字 */
String bookName = this.getBookName(secElement);
book.setBookName(bookName);
String bookUrl = this.getBookUrl(secElement);
price.setBookschinaUrl(bookUrl);
}
if(cdList.item(3).getNodeType() == Node.ELEMENT_NODE){
Element secElement1 = (Element) cdList.item(3);
/* 取出圖書的作者 */
String bookAuthor = this.getBookAuthor(secElement1);
book.setBookAuthor(bookAuthor);
}
if(cdList.item(5).getNodeType() == Node.ELEMENT_NODE){
Element secElement2 = (Element) cdList.item(5);
/* 取出圖書的出版社 */
String bookPublisher = this
.getBookPublisher(secElement2);
book.setBookPublisher(bookPublisher);
}
Element secElement3 = null;
if(cdList.item(7).getNodeType() == Node.ELEMENT_NODE){
secElement3 = (Element) cdList.item(7);
/* 取出圖書的出版時間 */
String bookPublishTime = this
.getBookPublishTime(secElement3);
if(!"".equals(bookPublishTime)&&null != bookPublishTime)
book.setBookPublishTime(bookPublishTime);
}
if(cdList.getLength()>12){
if(cdList.item(9).getNodeType() == Node.ELEMENT_NODE){
/* 圖書的ISBN */
Element temSecElement3 = (Element) cdList.item(9);
//System.out.println(temSecElement3.getNodeName());
String bookISBN = this.getBookISBN(temSecElement3);
book.setBookISBN(bookISBN);
}
if(cdList.item(11).getNodeType() == Node.ELEMENT_NODE
&&cdList.item(13).getNodeType() == Node.ELEMENT_NODE){
Element secElement4 = (Element) cdList.item(11);
/* 圖書的固定價格 */
String bookFixPrice = this.getBookFixPrice(secElement4);
book.setBookFixPrice(Double.valueOf(bookFixPrice));
Element secElement5 = (Element) cdList.item(13);
/* 圖書的市場價格 */
String bookPrice = this.getBookPrice(secElement5);
price.setBookschinaPrice(Double.valueOf(bookPrice));
/* 圖書的折扣 */
double discount=0;
discount=(Double.valueOf(bookPrice))/(Double.valueOf(bookFixPrice));
DecimalFormat df = new DecimalFormat("####.00");
price.setBookschinaDiscount((Float.valueOf(df.format(discount))));
}
}else{
if(null !=secElement3&&cdList.item(11).getNodeType() == Node.ELEMENT_NODE){
/* 圖書的ISBN */
String bookISBN = this.getBookISBN(secElement3);
book.setBookISBN(bookISBN);
//System.out.println(secElement3.getTextContent());
Element secElement4 = (Element) cdList.item(9);
/* 圖書的固定價格 */
String bookFixPrice = this.getBookFixPrice(secElement4);
book.setBookFixPrice(Double.valueOf(bookFixPrice));
Element secElement5 = (Element) cdList.item(11);
/* 圖書的市場價格 */
String bookPrice = this.getBookPrice(secElement5);
price.setBookschinaPrice(Double.valueOf(bookPrice));
/* 圖書的折扣 */
double discount=0;
discount=(Double.valueOf(bookPrice))/(Double.valueOf(bookFixPrice));
DecimalFormat df = new DecimalFormat("####.00");
price.setBookschinaDiscount((Float.valueOf(df.format(discount))));
}
}
}
}
}
if(!"".equals(book.getBookName())){
book.setPrice(price);
list.add(book);
}
}
}
}
return list;
}
/**
 * Function: getDetailInfo
 * Description: Reads url, market price and discount of one book from the first
 *              &lt;div class="xunhuan"&gt; of the page; later such divs are ignored.
 *              Fields whose source node is missing, is not an element, or has a
 *              blank price text are left unset on the returned Price.
 * Calls: getBookUrl(),getBookFixPrice(),getBookPrice(),
 * Called By: no
 * @param doc as Document
 * @return Price a new Price object, never null
 * @throws NumberFormatException if a non-blank price text is not a valid number
 */
public Price getDetailInfo(Document doc) {
    Price price = new Price();
    /* All <div> nodes of the page, in document order. */
    NodeList servers = doc.getElementsByTagName("div");
    for (int i = 0; i < servers.getLength(); i++) {
        Element serveritem = (Element) servers.item(i);
        if (!"xunhuan".equals(serveritem.getAttribute("class"))) {
            continue;
        }
        NodeList rows = serveritem.getChildNodes();
        for (int j = 0; j < rows.getLength(); j++) {
            Node row = rows.item(j);
            /* Only element children carry fields. */
            if (row.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            NodeList cells = row.getChildNodes();
            /* Position 1 is absent when the row has fewer than two children. */
            Element urlCell = detailElementAt(cells, 1);
            if (urlCell != null) {
                price.setBookschinaUrl(this.getBookUrl(urlCell));
            }
            if (cells.getLength() > 12) {
                /* Long layout: fixed price at 11, market price at 13. */
                fillDetailPrices(price, detailElementAt(cells, 11), detailElementAt(cells, 13));
            } else {
                /* Short layout: fixed price at 9, market price at 11, which needs a
                   row of 12 children; shorter rows yield null cells and are skipped. */
                fillDetailPrices(price, detailElementAt(cells, 9), detailElementAt(cells, 11));
            }
        }
        /* Only the first record div is used. */
        break;
    }
    return price;
}

/* Returns the node at the given index as an Element, or null when the index is out of range or the node is not an element. */
private static Element detailElementAt(NodeList nodes, int index) {
    Node node = nodes.item(index);
    if (node == null || node.getNodeType() != Node.ELEMENT_NODE) {
        return null;
    }
    return (Element) node;
}

/* Stores the market price and, when the fixed price is a positive number, their ratio (two decimals) as the discount; does nothing if either cell is null. */
private void fillDetailPrices(Price price, Element fixPriceCell, Element marketPriceCell) {
    if (fixPriceCell == null || marketPriceCell == null) {
        return;
    }
    String bookFixPrice = this.getBookFixPrice(fixPriceCell);
    String bookPrice = this.getBookPrice(marketPriceCell);
    if (bookPrice == null || bookPrice.trim().length() == 0) {
        return;
    }
    price.setBookschinaPrice(Double.valueOf(bookPrice));
    if (bookFixPrice == null || bookFixPrice.trim().length() == 0) {
        return;
    }
    double fixed = Double.parseDouble(bookFixPrice);
    if (fixed > 0) {
        double discount = Double.parseDouble(bookPrice) / fixed;
        /* Fixed symbols keep '.' as the decimal separator so that Float.valueOf
           can parse the formatted text under any default locale. */
        DecimalFormat df = new DecimalFormat("####.00",
                new java.text.DecimalFormatSymbols(java.util.Locale.US));
        price.setBookschinaDiscount(Float.valueOf(df.format(discount)));
    }
}
/**
 * Function: getBookImage
 * Description: Returns the trimmed value of the element's "src" attribute,
 *              or the empty string when the element has no such attribute.
 * Calls: no
 * Called By:mainService
 * @param bookElement as Element
 * @return String
 * @throws no
 */
public String getBookImage(Element bookElement) {
    if (!bookElement.hasAttribute("src")) {
        return "";
    }
    String source = bookElement.getAttribute("src");
    return source.length() > 0 ? source.trim() : source;
}
/**
* Function: getBookName
* Description: 獲得圖書名稱
* Calls: no Called By:mainService
* @param bookElement as Element
* @return String
* @throws no
*/
public String getBookName(Element bookElement) {
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -