?? pageoperate.java

?? 本程序可從網上利用百度搜索引擎下載和輸入關鍵詞有關的網頁
?? JAVA
?? 第 1 頁 / 共 2 頁
字號:
12 下一頁



package spider;

import spider.SaveToDataBase;
import spider.photoOperate;

import java.net.*;
import java.io.*;
import java.util.*;

public class PageOperate
{
	private String myUrl;// URL of the page to fetch
	private String myHtml;// HTML text obtained from the site at myUrl
	private String myAddress;// directory in which the HTML text is saved as a file
	private int myFileNumber;// number of saved files; appended to the title-derived file name
	private String myFileName;// name of the file the HTML text is saved to
	private List myHttpList = new ArrayList();// URLs extracted from the HTML text

	/**
	 * Sets the address of the page this object will connect to.
	 *
	 * @param myUrl URL to store; kept as-is, no validation is performed
	 */
	public void setMyUrl(String myUrl) {
		this.myUrl = myUrl;
	}

	/**
	 * Builds the URL from a search prefix followed by the URL-encoded keyword.
	 *
	 * @param baidu      prefix of the search request, placed in front unchanged
	 * @param searchWord keyword to search for; URL-encoded before being appended
	 */
	public void setMyUrl(String baidu, String searchWord) {
		// NOTE(review): the one-argument encode() is deprecated and encodes with
		// the platform default charset — confirm which charset the search
		// engine expects before switching to the two-argument overload.
		String encodedKeyword = URLEncoder.encode(searchWord);
		this.myUrl = baidu + encodedKeyword;
	}
	
	/**
	 * Returns the URL currently stored in this object.
	 *
	 * @return the stored URL, or {@code null} if none has been set
	 */
	public String getMyUrl() {
		return this.myUrl;
	}

	/**
	 * Downloads the page at {@code myUrl} and stores its text in {@code myHtml}.
	 *
	 * <p>The response is read line by line and the lines are re-joined with
	 * {@code "\r\n"}. After a successful read {@code charSet()} is invoked to
	 * convert the encoding. On any failure {@code myHtml} is reset to
	 * {@code null} and the error is printed. When {@code myUrl} is
	 * {@code null} nothing is fetched and {@code myHtml} is left untouched.
	 */
	public void setHtml() {
		if (myUrl == null) {
			System.out.println("myUrl為空");
			return;
		}

		try {
			URL ul = new URL(myUrl);
			// NOTE(review): no charset is given, so the bytes are decoded with the
			// platform default; presumably charSet() compensates — confirm.
			BufferedReader br = new BufferedReader(new InputStreamReader(ul.openStream()));
			StringBuffer sb = new StringBuffer();
			try {
				System.out.println("連接完成");
				String line;
				while ((line = br.readLine()) != null) {
					sb.append(line).append("\r\n");
				}
			} finally {
				// Close even when readLine() fails so the connection is not leaked.
				br.close();
			}
			myHtml = sb.toString();
			this.charSet();// convert the encoding
			System.out.println("讀取完成");
		} catch (Exception e) {
			myHtml = null;
			System.out.println("error open url   " + myUrl + "  and HTML is null");
			e.printStackTrace();
		}
	}

	/**
	 * Returns the HTML text currently held by this object.
	 *
	 * @return the stored HTML, or {@code null} if none is available
	 */
	public String getHtml() {
		return this.myHtml;
	}

	/**
	 * Sets the location under which files are saved.
	 *
	 * @param oneAddress save location to store
	 */
	public void setMyAddress(String oneAddress) {
		this.myAddress = oneAddress;
	}

	/**
	 * Returns the location under which files are saved.
	 *
	 * @return the stored save location, or {@code null} if none has been set
	 */
	public String getMyAddress() {
		return this.myAddress;
	}

	/**
	 * Sets the file counter.
	 *
	 * @param oneFileNumber new value of the file counter
	 */
	public void setMyFileNumber(int oneFileNumber) {
		this.myFileNumber = oneFileNumber;
	}

	/**
	 * Returns the file counter.
	 *
	 * @return current value of the file counter
	 */
	public int getMyFileNumber() {
		return this.myFileNumber;
	}

	/**
	 * Sets the file name explicitly to the given value.
	 *
	 * @param myFileName file name to store, used exactly as given
	 */
	public void setFileName(String myFileName) {
		this.myFileName = myFileName;
	}
	/**
	 * Derives {@code myFileName} from the page title, falling back to the URL.
	 *
	 * <p>Steps:
	 * <ol>
	 *   <li>If {@code myHtml} contains a {@code <title ...>...</title>} element,
	 *       its text is sanitised (punctuation and spaces become {@code '_'},
	 *       ASCII commas become spaces) and {@code myFileNumber} is appended.</li>
	 *   <li>If {@code myFileName} is still {@code null}, a sanitised copy of
	 *       {@code myUrl} is used instead.</li>
	 *   <li>An extension is appended: {@code .asp}, {@code .jsp} or {@code .htm}
	 *       when the URL contains that text (checked in this order), otherwise
	 *       {@code .html}.</li>
	 * </ol>
	 *
	 * <p>When neither a title nor a URL is available the method returns with
	 * {@code myFileName} left {@code null} instead of throwing a
	 * {@code NullPointerException}.
	 */
	public void setFileName() {
		// Step 1: try to use the page title as the file name.
		if (myHtml != null) {
			int open = myHtml.indexOf("<title");
			// Locate the end of the opening tag instead of assuming it is exactly
			// "<title>", so a tag carrying attributes is handled too.
			int start = (open == -1) ? -1 : myHtml.indexOf('>', open);
			int end = (start == -1) ? -1 : myHtml.indexOf("</title>", start + 1);
			if (end != -1) {
				String title = myHtml.substring(start + 1, end);
				StringBuffer safe = new StringBuffer(title.length());
				for (int i = 0; i < title.length(); i++) {
					char c = title.charAt(i);
					if (".:/?=|& ，《》\">-".indexOf(c) != -1) {
						safe.append('_');
					} else if (c == ',') {
						safe.append(' ');
					} else {
						safe.append(c);
					}
				}
				myFileName = safe.toString() + myFileNumber;// final name, extension added below
				System.out.println("文件名myFileName:" + myFileName);
			} else {
				// No usable title element; keep whatever name was there before.
				System.out.println("出現異常myFileName:" + myFileName);
			}
		} else {
			System.out.println("myHtml是空的");
		}

		// Step 2: fall back to the URL when there is still no name.
		// NOTE(review): a name left over from an earlier call also suppresses
		// this fallback and gets a second extension appended — confirm whether
		// callers reset the name between pages.
		if (myFileName == null) {
			if (myUrl == null) {
				System.out.println("myUrl是空的");
				return;// nothing to build a name from
			}
			System.out.println("error title  filename but use the www");
			myFileName = myUrl.replace('.', '_').replace(':', '_')
					.replace('/', '_').replace('?', '_')
					.replace('=', '_').replace('|', '_').replace('&', '_');
		}

		// Step 3: choose the extension from the URL; default to .html.
		if (myUrl != null && myUrl.indexOf(".asp") != -1) {
			myFileName = myFileName + ".asp";
		} else if (myUrl != null && myUrl.indexOf(".jsp") != -1) {
			myFileName = myFileName + ".jsp";
		} else if (myUrl != null && myUrl.indexOf(".htm") != -1) {
			myFileName = myFileName + ".htm";
		} else {
			myFileName = myFileName.trim() + ".html";
			System.out.println(myFileName);
		}
	}

	/**
	 * Returns the file name currently stored in this object.
	 *
	 * @return the stored file name, or {@code null} if none has been set
	 */
	public String getFileName() {
		return this.myFileName;
	}

	/**
	 * Appends {@code myHtml} to the file {@code myFileName} inside the
	 * directory {@code myAddress}.
	 *
	 * <p>The directory (including missing parents) and the file are created
	 * when absent. The HTML is split on {@code "\n"} and each piece is written
	 * followed by the platform line separator. Nothing is written when
	 * {@code myHtml}, {@code myAddress} or {@code myFileName} is {@code null}.
	 * I/O failures are printed and not propagated.
	 */
	public void saveHtmlToFile() {
		if (myHtml == null || myAddress == null || myFileName == null) {
			// The message only mentions myHtml although any of the three may be null.
			System.out.println("myHtml is null");
			return;
		}

		try {
			File dir = new File(myAddress);
			if (!dir.exists()) {
				dir.mkdirs();// also creates missing parent directories
			}
			File write = new File(dir, myFileName);
			if (!write.exists()) {
				write.createNewFile();
			}
			// Opened in append mode: existing content of the file is kept.
			BufferedWriter bw = new BufferedWriter(new FileWriter(write, true));
			try {
				// NOTE(review): pieces split on "\n" keep any trailing '\r', so the
				// output may contain "\r" followed by the platform separator.
				String[] someHm = myHtml.split("\n");
				for (int i = 0; i < someHm.length; i++) {
					bw.write(someHm[i]);
					bw.newLine();
				}
			} finally {
				// Close even when a write fails so the file handle is not leaked.
				bw.close();
			}
			System.out.println("保存完畢");
		} catch (Exception e) {
			e.printStackTrace();
			System.out.println("出現異常 error save html to file");
		}
	}

	/**
	 * Returns the part of {@code spiderHtml} between two indices.
	 *
	 * <p>The range is validated up front rather than by catching the exception
	 * {@code substring} would throw. For an invalid range or a {@code null}
	 * input a message is printed and a single space is returned.
	 *
	 * @param beginIndex start index, inclusive
	 * @param endIndex   end index, exclusive
	 * @param spiderHtml text to cut the piece from
	 * @return the requested substring, or {@code " "} when it cannot be taken
	 */
	public String http(int beginIndex, int endIndex, String spiderHtml) {
		boolean validRange = spiderHtml != null
				&& beginIndex >= 0
				&& beginIndex <= endIndex
				&& endIndex <= spiderHtml.length();
		if (!validRange) {
			System.out.println("此處有異常" + spiderHtml);
			return " ";
		}
		return spiderHtml.substring(beginIndex, endIndex);
	}


	
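	// First handle the pieces that, after splitting, do not start with http:// (check whether the first character is a single or double quote, then whether what follows is http://); afterwards handle the pieces that do contain http:// in one uniform way.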
	public void setHttpList(String splitWord)// 從httl頁面中提取url存放到LIst中，用特定的詞splitWord分割網頁
	{
		if (myHtml != null) 
		{							
			String[] splitHtmlUsehref = myHtml.split(splitWord); // href=切割// 用到了上面定義的myHtml	

			System.out.println(splitHtmlUsehref.length);
12 下一頁
?? 文件大小 178 K
?? 上傳用戶 rubyist
?? 所屬分類 Jsp/Servlet
??? 相關標簽

#程序 #百度搜索引擎 #輸入
?? 快捷鍵說明

復制代碼 Ctrl + C
搜索代碼 Ctrl + F
全屏模式 F11
切換主題 Ctrl + Shift + D
顯示快捷鍵 ?
增大字號 Ctrl + =
減小字號 Ctrl + -
亚洲欧美第一页_禁久久精品乱码_粉嫩av一区二区三区免费野_久草精品视频

?? pageoperate.java

?? 快捷鍵說明