?? dangdangparser.java
字號(hào):
package com.booksearch.service.htmlparser;
/************************************************************
FileName: Dangdangparser.java
Author: fengguang
Date: 11/09/08
Description: Extracts the records matching a search keyword from www.dangdang.com
Class List: Dangdangparser
***********************************************************/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.log4j.Logger;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import com.booksearch.orm.Book;
import com.booksearch.orm.Price;
/**
* Class: Dangdangparser
* Description: Extracts the records matching a search keyword from www.dangdang.com
* extends: none
* implements: HtmlParser<Element>
* @author feng guang
* @since 11/09/08
*/
public class Dangdangparser implements HtmlParser<Element>{
/*存放本網(wǎng)站某一頁(yè)的記錄*/
private ArrayList<Book> list;
/*記錄日志*/
private static final Logger logger;
static
{
logger = Logger.getLogger(com.booksearch.service.htmlparser.Dangdangparser.class);
}
/**
* Function: nekohtmlParser
* Description: Fetches the page at the given URL and parses it with the
* NekoHTML DOM parser, decoding the response as gb2312.
* Calls: no
* Called By: no
* @param url address of the page to fetch and parse
* @return the DOM document built by the parser
* @throws Exception if the URL cannot be opened or read, or if parsing fails
*/
public Document nekohtmlParser(String url)throws Exception{
    /* Create the HTML parser. */
    DOMParser parser = new DOMParser();
    /* Encoding the parser falls back to for the page. */
    parser.setProperty("http://cyberneko.org/html/properties/default-encoding","gb2312");
    URL u = new URL(url);
    /* Open the connection to the source site. Only reads are bounded
       (30 seconds); no connect timeout is configured. */
    URLConnection urlconn = u.openConnection();
    urlconn.setReadTimeout(30000);
    /* Read the response bytes as gb2312 characters. */
    BufferedReader in = new BufferedReader(new InputStreamReader(urlconn.getInputStream(),"gb2312"));
    try {
        /* Parse the character stream into a DOM tree. */
        parser.parse(new InputSource(in));
    } finally {
        /* Release the response stream on success and on failure alike.
           A failure while closing is only logged so that it cannot hide
           an exception thrown by the parser. */
        try {
            in.close();
        } catch (IOException closeError) {
            logger.warn("Failed to close the response stream of " + url, closeError);
        }
    }
    return parser.getDocument();
}
/**
* Function: mainService
* Description: Walks the parsed result page and builds one Book (with its
* Price) for every <div class="list_r_list"> element.
* Calls: getBookImage(),getBookName(),getBookUrl(),getBookAuthor(),
* getBookPublisher(),getBookContent(),getBookPublishTime(),
* getBookFixPrice(),getBookPrice(),getBookDiscount()
* Called By: no
* @param doc parsed result page
* @param flag when true, each book's own page is requested as well (after a
* 2 second pause) to read its ISBN
* @return the books found on the page; records without a name are left out
*/
public ArrayList<Book> mainService(Document doc,boolean flag){
    list = new ArrayList<Book>();
    /* Fetch every <div> node. */
    NodeList servers = doc.getElementsByTagName("div");
    for (int i = 0; i < servers.getLength(); i++) {
        Element serveritem = (Element) servers.item(i);
        /* Only <div class="list_r_list"> nodes hold a record. */
        if (!"list_r_list".equals(serveritem.getAttribute("class"))) {
            continue;
        }
        NodeList childList = serveritem.getChildNodes();
        /* Receivers for the extracted values. */
        Book book = new Book();
        Price price = new Price();
        for (int j = 0; j < childList.getLength(); j++) {
            Node childNode = childList.item(j);
            /* Only element children (<span>,<h2>,<h4>,<h5>,<h6>) carry data. */
            if (childNode.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element childElement = (Element) childNode;
            String tagName = childElement.getTagName();
            String cssClass = childElement.getAttribute("class");
            if ("list_r_list_book".equals(cssClass)) {
                /* Cover image address. */
                book.setBookImage(this.getBookImage(childElement));
            } else if ("H2".equals(tagName)) {
                /* Book name and link. */
                String bookName = this.getBookName(childElement);
                String bookUrl = this.getBookUrl(childElement);
                if (flag) {
                    this.loadIsbn(book, bookUrl);
                }
                book.setBookName(bookName);
                price.setDangdangUrl(bookUrl);
            } else if ("list_r_list_h4".equals(cssClass)) {
                /* Author. */
                book.setBookAuthor(this.getBookAuthor(childElement));
            } else if ("H4".equals(tagName)
                    && -1 != this.leadingTextOf(childElement).indexOf("出版社")) {
                /* Publisher. */
                book.setBookPublisher(this.getBookPublisher(childElement));
            } else if ("H5".equals(tagName)) {
                /* Detailed description. */
                book.setBookProspectus(this.getBookContent(childElement));
            } else if ("H4".equals(tagName)
                    && -1 != this.leadingTextOf(childElement).indexOf("出版時(shí)間")) {
                /* Publication time.
                   NOTE(review): the label literal above looks damaged by a
                   character-set conversion; confirm it against the text the
                   site actually emits, otherwise this branch cannot match. */
                String bookPublishTime = this.getBookPublishTime(childElement);
                if (null != bookPublishTime && !"".equals(bookPublishTime)) {
                    book.setBookPublishTime(bookPublishTime);
                }
            } else if ("H6".equals(tagName)) {
                /* List price, Dangdang price and discount. */
                Double fixPrice = this.toDoubleOrNull(this.getBookFixPrice(childElement), "list price");
                if (fixPrice != null) {
                    book.setBookFixPrice(fixPrice);
                }
                Double dangdangPrice = this.toDoubleOrNull(this.getBookPrice(childElement), "price");
                if (dangdangPrice != null) {
                    price.setDangdangPrice(dangdangPrice);
                }
                Float discount = this.toFloatOrNull(this.getBookDiscount(childElement), "discount");
                if (discount != null) {
                    price.setDangdangDiscount(discount);
                }
            }
        }
        /* Keep the record only when a name was found. */
        if (book.getBookName() != null) {
            book.setPrice(price);
            list.add(book);
        }
    }
    return list;
}
/* Requests the book's own page after a 2 second pause and stores the ISBN
   found there; a failure is logged and leaves the ISBN unset. */
private void loadIsbn(Book book, String bookUrl){
    try {
        Thread.sleep(2000);
        DangdangparserSec dangdangSec = new DangdangparserSec();
        book.setBookISBN(dangdangSec.getBookISBNSec(bookUrl));
    } catch (InterruptedException e) {
        /* Keep the interrupt visible to the caller instead of swallowing it. */
        Thread.currentThread().interrupt();
        logger.error("==========當(dāng)當(dāng)網(wǎng)二次請(qǐng)求解析" + bookUrl + "時(shí)出錯(cuò)" + "==========", e);
    } catch (Exception e) {
        /* Passing the exception as the throwable keeps its stack trace in the log. */
        logger.error("==========當(dāng)當(dāng)網(wǎng)二次請(qǐng)求解析" + bookUrl + "時(shí)出錯(cuò)" + "==========", e);
    }
}
/* Returns the value of the element's first child node, or "" when the
   element is empty or its first child carries no value (e.g. an element). */
private String leadingTextOf(Element element){
    Node first = element.getFirstChild();
    String value = (first == null) ? null : first.getNodeValue();
    return (value == null) ? "" : value;
}
/* Converts scraped text to a Double; returns null (and logs) when the text
   is missing, empty or not numeric. */
private Double toDoubleOrNull(String raw, String field){
    if (raw == null || "".equals(raw)) {
        return null;
    }
    try {
        return Double.valueOf(raw.trim());
    } catch (NumberFormatException e) {
        logger.warn("Dangdang " + field + " is not numeric: " + raw, e);
        return null;
    }
}
/* Converts scraped text to a Float; returns null (and logs) when the text
   is missing, empty or not numeric. */
private Float toFloatOrNull(String raw, String field){
    if (raw == null || "".equals(raw)) {
        return null;
    }
    try {
        return Float.valueOf(raw.trim());
    } catch (NumberFormatException e) {
        logger.warn("Dangdang " + field + " is not numeric: " + raw, e);
        return null;
    }
}
/**
* Function: getBookImage
* Description: Returns the cover image address: the "src" attribute of the
* first element found two levels below the given element (first element
* child of its first element child).
* Calls: no
* Called By: mainService
* @param bookElement element wrapping the cover link and image
* @return the "src" attribute value, or "" when the expected nesting or the
* attribute is missing
*/
public String getBookImage(Element bookElement){
    Node current = bookElement;
    /* Descend two levels, skipping text/comment nodes so that whitespace in
       front of the link or the image does not break the lookup. */
    for (int depth = 0; depth < 2; depth++) {
        Node child = current.getFirstChild();
        while (child != null && child.getNodeType() != Node.ELEMENT_NODE) {
            child = child.getNextSibling();
        }
        if (child == null) {
            return "";
        }
        current = child;
    }
    return ((Element) current).getAttribute("src");
}
/**
* Function: getBookName
* Description: Builds the book name from the children of the first element
* child (the <a> node) of the given element. Text children contribute their
* own value, element children the value of their first child. All spaces
* are removed and the result is cut to at most 64 characters.
* Calls: no
* Called By: mainService
* @param bookElement element wrapping the title link
* @return the book name, or "" when no link element or no text is found
*/
public String getBookName(Element bookElement){
    /* Locate the <a> node, skipping any text in front of it. */
    Node anchor = bookElement.getFirstChild();
    while (anchor != null && anchor.getNodeType() != Node.ELEMENT_NODE) {
        anchor = anchor.getNextSibling();
    }
    if (anchor == null) {
        return "";
    }
    StringBuilder collected = new StringBuilder();
    NodeList nameList = anchor.getChildNodes();
    for (int i = 0; i < nameList.getLength(); i++) {
        Node nameNode = nameList.item(i);
        /* Element nodes are read through their first child, other nodes directly. */
        Node textSource = (nameNode.getNodeType() == Node.ELEMENT_NODE)
                ? nameNode.getFirstChild()
                : nameNode;
        String text = (textSource == null) ? null : textSource.getNodeValue();
        /* A node without a value (e.g. a nested element) is skipped so the
           literal "null" never ends up in the name. */
        if (text != null) {
            collected.append(text);
        }
    }
    /* NOTE(review): this removes every space, including spaces inside a
       title; confirm whether a different character (such as a non-breaking
       space) was the intended target. */
    String bookName = collected.toString().replace(" ", "").trim();
    if (bookName.length() > 64) {
        bookName = bookName.substring(0, 64);
    }
    return bookName;
}
/**
* Function: getBookAuthor
* Description: Collects the author names from every <a> child of the given
* element. Inside each <a>, text children contribute their own value and
* element children the value of their first child; the pieces are joined
* with single spaces. The result is cut to at most 64 characters, commas
* become spaces and the marker characters listed below are removed.
* Calls: no
* Called By: mainService
* @param bookElement element wrapping the author links
* @return the author text, or "" when no author text is found
*/
public String getBookAuthor(Element bookElement){
    StringBuilder collected = new StringBuilder();
    NodeList authorList = bookElement.getChildNodes();
    for (int i = 0; i < authorList.getLength(); i++) {
        Node nameNode = authorList.item(i);
        if (nameNode.getNodeType() != Node.ELEMENT_NODE || !"A".equals(nameNode.getNodeName())) {
            continue;
        }
        NodeList nameList = nameNode.getChildNodes();
        for (int j = 0; j < nameList.getLength(); j++) {
            Node temNode = nameList.item(j);
            /* Element nodes are read through their first child, other nodes directly. */
            Node textSource = (temNode.getNodeType() == Node.ELEMENT_NODE)
                    ? temNode.getFirstChild()
                    : temNode;
            String text = (textSource == null) ? null : textSource.getNodeValue();
            /* A node without a value is skipped so the literal "null" is
               never reported as an author. */
            if (text != null) {
                collected.append(text).append(' ');
            }
        }
    }
    String bookAuthor = collected.toString();
    /* Drop the separator appended after the last piece. */
    if (bookAuthor.endsWith(" ")) {
        bookAuthor = bookAuthor.substring(0, bookAuthor.length() - 1);
    }
    if (bookAuthor.length() > 64) {
        bookAuthor = bookAuthor.substring(0, 64);
    }
    /* NOTE(review): the original performed this comma replacement twice with
       the same literal; one of the two may have targeted a different comma
       character that was lost in a character-set conversion - confirm. */
    bookAuthor = bookAuthor.replace(",", " ");
    bookAuthor = bookAuthor.replace("等", "");
    bookAuthor = bookAuthor.replace("著", "");
    bookAuthor = bookAuthor.replace("編", "");
    return bookAuthor;
}
/**
* Function: getBookPublisher
* Description: Reads the publisher name from the <a> children of the given
* element. An <a> with several children has their text appended to the
* result (text children directly, element children through their first
* child). An <a> with a single child replaces the result with that child's
* text, looking one level deeper when the child is a <font> element.
* Calls: no
* Called By: mainService
* @param bookElement element wrapping the publisher link
* @return the trimmed publisher name, or "" when no text is found
*/
public String getBookPublisher(Element bookElement){
    String bookPublisher = "";
    NodeList children = bookElement.getChildNodes();
    for (int i = 0; i < children.getLength(); i++) {
        Node candidate = children.item(i);
        if (candidate.getNodeType() != Node.ELEMENT_NODE || !"A".equals(candidate.getNodeName())) {
            continue;
        }
        NodeList parts = candidate.getChildNodes();
        if (parts.getLength() == 0) {
            continue;
        }
        if (parts.getLength() > 1) {
            /* Several pieces: append each one that carries a value. */
            StringBuilder joined = new StringBuilder(bookPublisher);
            for (int j = 0; j < parts.getLength(); j++) {
                Node part = parts.item(j);
                Node textSource = (part.getNodeType() == Node.ELEMENT_NODE)
                        ? part.getFirstChild()
                        : part;
                String text = (textSource == null) ? null : textSource.getNodeValue();
                if (text != null) {
                    joined.append(text);
                }
            }
            bookPublisher = joined.toString();
        } else {
            /* Single piece: it replaces whatever was collected so far. */
            Node only = candidate.getFirstChild();
            Node textSource = "FONT".equals(only.getNodeName()) ? only.getFirstChild() : only;
            String text = (textSource == null) ? null : textSource.getNodeValue();
            /* A missing value leaves the result untouched; assigning null
               here would make the final trim() fail. */
            if (text != null) {
                bookPublisher = text;
            }
        }
    }
    return bookPublisher.trim();
}
/**
* Function: getBookPublishTime
?? 快捷鍵說(shuō)明
復(fù)制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號(hào)
Ctrl + =
減小字號(hào)
Ctrl + -