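// Parser.java
// (The label "字號:" ("Font size:") that followed this title came from the web code viewer this file was copied from; it is not part of the program.)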
import java.lang.*;
import java.net.*;
import java.sql.*;
import java.io.*;
import java.sql.*;
import java.util.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import com.sun.org.apache.xalan.internal.xsltc.compiler.sym;
/**
 * HTML parser callback that collects hyperlinks, plus a small two-level crawler in
 * {@link #main(String[])} that stores the collected links in the database table {@code url1}.
 *
 * <p>The callback records the {@code href} of every {@code <a>} tag whose value starts with
 * {@code "http"} and is shorter than 30 characters, skipping values it has already recorded.
 *
 * <p>Collected links live in the shared static list {@link #element}, so parsing is not safe to
 * run from several threads at once.
 */
public class Parser extends ParserCallback // HTML parser callback that collects hyperlinks
{
    /**
     * Links collected by the callback since the list was last cleared. The list is shared by all
     * {@code Parser} instances; take a copy before clearing it, because a plain reference to it
     * is emptied by {@code clear()} as well.
     */
    protected static ArrayList<String> element = new ArrayList<String>();

    /**
     * Charset used to decode downloaded pages.
     * NOTE(review): every page is assumed to be GBK-encoded; confirm for sites other than the start page.
     */
    private static final String PAGE_CHARSET = "GBK";

    /** Only links strictly shorter than this many characters are collected. */
    private static final int MAX_LINK_LENGTH = 30;

    /** Page whose links form the first crawl level. */
    private static final String START_URL = "http://www.sohu.com";

    public Parser()
    {
    }

    /**
     * Treats tags without a closing tag exactly like start tags.
     *
     * @param t   the tag reported by the parser
     * @param a   the tag's attributes
     * @param pos position of the tag in the document (unused)
     */
    @Override
    public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos)
    {
        handleStartTag(t, a, pos);
    }

    /**
     * Records the {@code href} of an {@code <a>} tag in {@link #element} when it starts with
     * {@code "http"}, is shorter than 30 characters and has not been recorded yet. All other
     * tags are ignored.
     *
     * @param t   the tag reported by the parser
     * @param a   the tag's attributes
     * @param pos position of the tag in the document (unused)
     */
    @Override
    public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos)
    {
        if (t != HTML.Tag.A)
        {
            return;
        }
        Object value = a.getAttribute(HTML.Attribute.HREF);
        if (!(value instanceof String))
        {
            return; // no href attribute on this anchor
        }
        String href = (String) value;
        if (href.startsWith("http") && href.length() < MAX_LINK_LENGTH && !element.contains(href))
        {
            element.add(href);
        }
    }

    /**
     * Parses {@code sHtml} with a fresh {@code Parser} callback, appending every accepted link to
     * {@link #element}. A parse failure is reported on standard error and otherwise ignored, so
     * the links gathered before the failure are kept.
     *
     * @param sHtml the HTML text to parse
     */
    private static void startParse(String sHtml)
    {
        try
        {
            new ParserDelegator().parse(new StringReader(sHtml), new Parser(), true);
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Returns the links found in {@code html} as an independent list.
     *
     * <p>The shared {@link #element} list is cleared before and after the parse. The result is a
     * copy, so it is unaffected by later parses or by clearing {@link #element}.
     * Package-private so it can be exercised without a database or network.
     *
     * @param html the HTML text to scan
     * @return the accepted links in document order, without duplicates
     */
    static List<String> extractLinks(String html)
    {
        element.clear();
        startParse(html);
        List<String> links = new ArrayList<String>(element);
        element.clear();
        return links;
    }

    /**
     * Reads every remaining line from {@code reader} and joins the lines with {@code '\n'}.
     * Each line is read exactly once, so no line is skipped.
     * Package-private so it can be exercised without a database or network.
     *
     * @param reader the source to drain; it is not closed by this method
     * @return the text read, with a {@code '\n'} after every line
     * @throws IOException if reading fails
     */
    static String readAll(BufferedReader reader) throws IOException
    {
        StringBuilder text = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null)
        {
            text.append(line).append('\n');
        }
        return text.toString();
    }

    /**
     * Creates (but does not yet use) an HTTP connection for {@code address}.
     *
     * @param address the URL to open
     * @return the connection object for that URL
     * @throws IOException if the URL is malformed or the connection object cannot be created
     */
    private static HttpURLConnection openPage(String address) throws IOException
    {
        return (HttpURLConnection) new URL(address).openConnection();
    }

    /**
     * Downloads the response body of {@code connection} as text and closes the stream.
     *
     * @param connection the connection to read from
     * @return the page text decoded with {@link #PAGE_CHARSET}
     * @throws IOException if the page cannot be read
     */
    private static String download(HttpURLConnection connection) throws IOException
    {
        BufferedReader reader =
            new BufferedReader(new InputStreamReader(connection.getInputStream(), PAGE_CHARSET));
        try
        {
            return readAll(reader);
        }
        finally
        {
            reader.close();
        }
    }

    /**
     * Prints each URL and inserts it into table {@code url1} through {@code insert}.
     *
     * @param insert a prepared single-parameter insert statement
     * @param urls   the URLs to store
     * @throws SQLException if an insert fails
     */
    private static void insertAll(PreparedStatement insert, Collection<String> urls) throws SQLException
    {
        for (String url : urls)
        {
            System.out.println(url);
            insert.setString(1, url);
            insert.executeUpdate();
        }
    }

    /**
     * Reads the first column of every row of table {@code url1}.
     *
     * @param con an open database connection
     * @return the stored values in the order the database returned them
     * @throws SQLException if the query fails
     */
    private static List<String> loadStoredUrls(Connection con) throws SQLException
    {
        List<String> urls = new ArrayList<String>();
        Statement stmt = con.createStatement();
        try
        {
            ResultSet rows = stmt.executeQuery("select * from url1");
            try
            {
                while (rows.next())
                {
                    urls.add(rows.getString(1));
                }
            }
            finally
            {
                rows.close();
            }
        }
        finally
        {
            stmt.close();
        }
        return urls;
    }

    /**
     * Runs the two-level crawl: stores the links of the start page, then visits every URL stored
     * in {@code url1} and stores the links found on those pages.
     *
     * @param con an open database connection
     * @throws SQLException if a database operation fails
     * @throws IOException  if the start page cannot be downloaded
     */
    private static void crawl(Connection con) throws SQLException, IOException
    {
        HttpURLConnection uc = openPage(START_URL);
        System.out.println("openConnection connect sucessful");
        List<String> firstLevel = extractLinks(download(uc));

        PreparedStatement psInsert = con.prepareStatement("Insert INTO url1 Values (?)");
        try
        {
            System.out.println("第一次清空成功");
            insertAll(psInsert, firstLevel);

            List<String> stored = loadStoredUrls(con);
            System.out.println("第一次插入成功");
            System.out.println("最原始的是已經解析出來");
            System.out.println("已經導入放到AS容器中");
            System.out.println("已把element中的數據清空");

            // Insertion-ordered set: de-duplicates second-level links without repeated list scans.
            Set<String> secondLevel = new LinkedHashSet<String>();
            for (int i = 0; i < stored.size(); i++)
            {
                String address = stored.get(i);
                System.out.println("i等于" + i);
                System.out.println(address);
                try
                {
                    HttpURLConnection uc2 = openPage(address);
                    System.out.println("openConnection1 connect sucessful");
                    secondLevel.addAll(extractLinks(download(uc2)));
                    System.out.println("第二次解析成功");
                }
                catch (IOException e)
                {
                    // One malformed or unreachable URL must not abort the rest of the crawl.
                    System.err.println("Skipping " + address + ": " + e);
                }
                System.out.println("link已經清空" + i);
                System.out.println("element已經清空");
                System.out.println("link已經清空");
            }
            insertAll(psInsert, secondLevel);
        }
        finally
        {
            psInsert.close();
        }
    }

    static class FileReader2 extends InputStreamReader
    {
        /**
         * A file reader with an explicit charset, written to solve the encoding problem with
         * Chinese characters.
         *
         * @param fileName    path of the file to read
         * @param charSetName name of the charset used to decode the file
         * @throws FileNotFoundException        if the file cannot be opened
         * @throws UnsupportedEncodingException if the named charset is not supported
         */
        public FileReader2(String fileName, String charSetName) throws FileNotFoundException, UnsupportedEncodingException
        {
            super(new FileInputStream(fileName), charSetName);
        }
    }

    /**
     * Connects to the SQL Server database {@code TESTDB} on localhost and runs the crawl.
     * Any failure is printed to standard error; the database connection is always closed.
     *
     * @param args ignored
     * @throws Exception declared for compatibility; failures are caught and printed instead
     */
    public static void main(String args[]) throws Exception
    {
        String jdbcUrl = "jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=TESTDB";
        String user = "sa"; // replace with your own database user name
        String password = ""; // replace with your own database password
        try
        {
            Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");
            System.out.println("類實例化成功!");
            Connection con = DriverManager.getConnection(jdbcUrl, user, password);
            try
            {
                System.out.println("創建連接對像成功!");
                crawl(con);
            }
            finally
            {
                con.close();
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }
}
//http://java.chinaitlab.com/base/732677.html
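/*
 * Leftover help text from the web code viewer this file was copied from
 * (not part of the program):
 *
 * Keyboard shortcuts
 *   Copy code            Ctrl + C
 *   Search code          Ctrl + F
 *   Full-screen mode     F11
 *   Switch theme         Ctrl + Shift + D
 *   Show shortcuts       ?
 *   Increase font size   Ctrl + =
 *   Decrease font size   Ctrl + -
 */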