?? tsinghuaparser.java
字號:
package com.booksearch.service.htmlparser;
/************************************************************
FileName: Tsinghuaparser.java
Author: wang jiaqiang
Date:11/09/08
Description: Extracts the content matching a search keyword from www.tub.tsinghua.edu.cn
Class List: Tsinghuaparser
***********************************************************/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Iterator;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import com.booksearch.orm.Book;
import com.booksearch.orm.Price;
/**
* Class: Tsinghuaparser Description: Extracts the content matching a search keyword from www.tub.tsinghua.edu.cn
* extends: none implements: HtmlParser<Element>
*
* @author wang jiaqiang
* @since 11/10/08
*/
public class Tsinghuaparser implements HtmlParser<Element> {
/* Holds the records of one page of this site */
private ArrayList<Book> list;
/**
 * Function: nekohtmlParser
 * Description: Downloads the page at the given URL and parses it with the
 * NekoHTML DOM parser, returning the resulting DOM document.
 * Calls: no
 *
 * @param url address of the page to fetch and parse
 * @return Document the DOM tree built by the parser
 * @throws Exception if the URL is malformed, the connection or read fails,
 *         or the parser rejects the input (IOException, SAXException, ...)
 */
public Document nekohtmlParser(String url) throws Exception {
    /* Create the HTML parser and set its default page encoding. */
    DOMParser parser = new DOMParser();
    parser.setProperty(
            "http://cyberneko.org/html/properties/default-encoding",
            "gb2312");

    /* Open the connection to the source site; only a read timeout is set. */
    URL u = new URL(url);
    URLConnection urlconn = u.openConnection();
    urlconn.setReadTimeout(30000);

    /* Wrap the site's byte stream in a character reader, decoding it as gb2312. */
    BufferedReader in = new BufferedReader(new InputStreamReader(
            urlconn.getInputStream(), "gb2312"));
    try {
        /* Parse the whole stream into the parser's internal document. */
        parser.parse(new InputSource(in));
    } finally {
        /* Always release the network stream, even when parsing fails. */
        in.close();
    }

    /* The document is fully built at this point, so the closed stream is no longer needed. */
    return parser.getDocument();
}
/**
 * Function: mainService
 * Description: Walks the parsed result page and builds one Book (with its
 * Price) per record row found in the table marked border="1" width="90%".
 * The result list is also stored in the {@code list} field.
 * Calls: getBookAuthor(), getBookName(), getBookUrl(), getBookISBN(),
 * getBookPublishTime(), getBookPrice()
 *
 * @param doc  DOM document of the result page
 * @param flag not read anywhere in this method
 * @return ArrayList<Book> the records whose book name is not null
 * @throws NumberFormatException if a non-empty price cell does not hold a
 *         parsable number
 */
public ArrayList<Book> mainService(Document doc, boolean flag) {
    /* Collect every <table> node of the page. */
    NodeList servers = doc.getElementsByTagName("table");
    list = new ArrayList<Book>();
    for (int i = 0; i < servers.getLength(); i++) {
        Element serveritem = (Element) servers.item(i);
        /* Only <table border="1" ... width=90%> holds the records. */
        if (!"90%".equals(serveritem.getAttribute("width"))
                || !"1".equals(serveritem.getAttribute("border"))) {
            continue;
        }
        NodeList childList = serveritem.getChildNodes();
        /* Iteration starts at child index 2; presumably the earlier nodes are a header - confirm. */
        for (int j = 2; j < childList.getLength(); j++) {
            Node childNode = childList.item(j);
            /* Skip text/comment nodes; only element children are treated as rows. */
            if (childNode.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Book book = new Book();
            Price price = new Price();
            book.setBookPublisher("清華大學出版社");
            price.setTsinghuaDiscount((float) 1.00);
            NodeList trNode = childNode.getChildNodes();
            for (int k = 0; k < trNode.getLength(); k++) {
                Node tdNode = trNode.item(k);
                if (tdNode.getNodeType() != Node.ELEMENT_NODE) {
                    continue;
                }
                Element tdElement = (Element) tdNode;
                String width = tdElement.getAttribute("width");
                String align = tdElement.getAttribute("align");
                if (k == 2) {
                    /* The element at child index 2 of the row is read as the author. */
                    book.setBookAuthor(this.getBookAuthor(tdElement));
                } else if ("450pt".equals(width)) {
                    /* Book name and the link to the book page. */
                    book.setBookName(this.getBookName(tdElement));
                    price.setTsinghuaUrl(this.getBookUrl(tdElement));
                } else if ("60pt".equals(width) && "left".equals(align)) {
                    /* ISBN. */
                    book.setBookISBN(this.getBookISBN(tdElement));
                } else if ("60pt".equals(width) && "middle".equals(align)) {
                    /* Publish time, kept only when non-empty. */
                    String bookPublishTime = this.getBookPublishTime(tdElement);
                    if (bookPublishTime != null && !"".equals(bookPublishTime)) {
                        book.setBookPublishTime(bookPublishTime);
                    }
                } else if ("40pt".equals(width)) {
                    /*
                     * Price. Both setters are guarded together: previously only the
                     * first one was inside the if, so a missing price cell crashed
                     * on the second.
                     */
                    String bookPrice = this.getBookPrice(tdElement);
                    if (bookPrice != null) {
                        String trimmedPrice = bookPrice.trim();
                        if (!"".equals(trimmedPrice)) {
                            Double parsedPrice = Double.valueOf(trimmedPrice);
                            price.setTsinghuaPrice(parsedPrice);
                            book.setBookFixPrice(parsedPrice);
                        }
                    }
                }
            }
            /* Keep the record only when a book name was extracted. */
            if (book.getBookName() != null) {
                book.setPrice(price);
                list.add(book);
            }
        }
    }
    return list;
}
/**
 * Function: getDetailInfo
 * Description: Reads the price and the book-page URL from the first element
 * row (child index 2 or later) of the first table marked border="1"
 * width="90%", and returns them in a new Price.
 * Calls: getBookPrice(), getBookUrl()
 *
 * @param doc DOM document of the result page
 * @return Price a new Price; left as constructed when no matching row exists
 * @throws NumberFormatException if a non-empty price cell does not hold a
 *         parsable number
 */
public Price getDetailInfo(Document doc) {
    Price price = new Price();
    /* Collect every <table> node of the page. */
    NodeList servers = doc.getElementsByTagName("table");
    /* NOTE(review): this resets the shared list field although nothing is added here - confirm it is intended. */
    list = new ArrayList<Book>();
    for (int i = 0; i < servers.getLength(); i++) {
        Element serveritem = (Element) servers.item(i);
        /* Only <table border="1" ... width=90%> holds the records. */
        if (!"90%".equals(serveritem.getAttribute("width"))
                || !"1".equals(serveritem.getAttribute("border"))) {
            continue;
        }
        NodeList childList = serveritem.getChildNodes();
        for (int j = 2; j < childList.getLength(); j++) {
            Node childNode = childList.item(j);
            if (childNode.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            price.setTsinghuaDiscount((float) 1.00);
            NodeList trNode = childNode.getChildNodes();
            for (int k = 0; k < trNode.getLength(); k++) {
                Node tdNode = trNode.item(k);
                if (tdNode.getNodeType() != Node.ELEMENT_NODE) {
                    continue;
                }
                Element tdElement = (Element) tdNode;
                String width = tdElement.getAttribute("width");
                if ("40pt".equals(width)) {
                    String bookPrice = this.getBookPrice(tdElement);
                    if (bookPrice != null) {
                        String trimmedPrice = bookPrice.trim();
                        if (!"".equals(trimmedPrice)) {
                            /*
                             * Parsed as Double, matching mainService; parsing as
                             * Float and widening would add rounding noise to the price.
                             */
                            price.setTsinghuaPrice(Double.valueOf(trimmedPrice));
                        }
                    }
                } else if ("450pt".equals(width)) {
                    /* The book-name cell also carries the link to the book page. */
                    price.setTsinghuaUrl(this.getBookUrl(tdElement));
                }
            }
            /* Only the first element row is examined. */
            break;
        }
        /* Only the first matching table is examined. */
        break;
    }
    return price;
}
/**
 * Function: getBookName
 * Description: Reads the book name from a cell: the node value of the first
 * child of the cell's first child, when that first child is an element
 * (the <a> node).
 * Calls: no
 * Called By: mainService
 *
 * @param bookElement the cell element holding the book name
 * @return String the node value found (may be null when that node is not a
 *         text node); "" when the cell has no children, its first child is
 *         not an element, or that element has no children
 */
public String getBookName(Element bookElement) {
    /* A missing first child means the cell is empty. */
    Node anchor = bookElement.getFirstChild();
    if (anchor == null || anchor.getNodeType() != Node.ELEMENT_NODE) {
        return "";
    }
    /* The name is the value of the first node inside the <a> element. */
    Node titleNode = anchor.getFirstChild();
    if (titleNode == null) {
        return "";
    }
    return titleNode.getNodeValue();
}
/**
* Function: getBookAuthor
* Description: 獲得圖書作者
* Calls: no
* CalledBy:mainService
* @param bookElement as Element
* @return String
* @throws no
*/
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -