?? spider.java
字號:
}
}
if (url != null && isloopget)
{
if (url.startsWith("http://")
|| (url.startsWith("https://") && groksHTTPS)) {
// verify we're on the same host and port
URL u = new URL(url);
if (u.getHost().equals(summary.url.getHost())
&& u.getPort() == summary.url.getPort()) {
url = chopOffNamedAnchor(url);
if (indexedURLs.get(url) == null)
urls.add(url);
}
} else if (url.indexOf("://") == -1
&& !url.startsWith("mailto:")
&& !url.startsWith("#")
&& !url.startsWith("javascript:")) {
// parse relative url
url = formURL(summary.url, url);
url = chopOffNamedAnchor(url);
if (indexedURLs.get(url) == null)
urls.add(url);
}
}
} else if (obj instanceof TextToken) {
if(isIgnoreText) continue;
TextToken t = (TextToken) obj;
String text = t.getText();
if (text != null && text.trim().length() > 0)
desc.append(text.trim()).append(" ");
}
}
if (desc.length() > descSize)
desc.setLength(descSize);
summary.desc = desc.toString();
String list[] = new String[urls.size()];
urls.toArray(list);
return list;
}
/**
 * Removes the named-anchor (fragment) part from a URL string.
 *
 * @param url URL text to shorten; must not be null
 * @return everything before the first '#', or url itself when it
 *         contains no '#'
 */
private String chopOffNamedAnchor(String url) {
    final int hashIndex = url.indexOf("#");
    return (hashIndex < 0) ? url : url.substring(0, hashIndex);
}
/**
 * Converts a relative URL to an absolute URL.
 *
 * The result always uses the protocol, host and (when explicitly given)
 * port of origURL. A newURL starting with "/" replaces the whole path;
 * anything else is resolved against the directory of origURL's path, with
 * leading "./" and "../" segments applied first. "../" segments that would
 * climb above the root are ignored rather than raising an exception.
 *
 * @param origURL URL of the document in which the link was found
 * @param newURL  relative reference (no scheme); must not be null
 * @return the absolute URL as text
 */
private String formURL(URL origURL, String newURL) {
    StringBuffer base = new StringBuffer(origURL.getProtocol());
    base.append("://").append(origURL.getHost());
    if (origURL.getPort() != -1) {
        base.append(":").append(origURL.getPort());
    }
    if (newURL.startsWith("/")) {
        // host-relative reference: the path of origURL is irrelevant
        return base.append(newURL).toString();
    }

    // Directory of the referring document. getPath() is used instead of
    // getFile() so that a "/" inside the query string cannot be mistaken
    // for a path separator.
    String dir = origURL.getPath();
    int pos = dir.lastIndexOf("/");
    dir = (pos != -1) ? dir.substring(0, pos) : "";

    // Apply leading dot segments one at a time.
    for (;;) {
        if (newURL.equals("..") || newURL.startsWith("../")) {
            pos = dir.lastIndexOf("/");
            if (pos != -1) {
                dir = dir.substring(0, pos);
            }
            // pos == -1 means we are already at the root: stay there
            newURL = newURL.equals("..") ? "" : newURL.substring(3);
        } else if (newURL.equals(".") || newURL.startsWith("./")) {
            newURL = newURL.equals(".") ? "" : newURL.substring(2);
        } else {
            break;
        }
    }
    return base.append(dir).append("/").append(newURL).toString();
}
/**
 * Splits a Content-Type header value such as
 * "text/html;charset=utf-8" into its media type and charset.
 *
 * Only the parameter named "charset" (compared case-insensitively) is
 * used for the charset; other parameters are ignored. Surrounding
 * whitespace and a pair of enclosing double quotes are removed.
 *
 * @param strcontenttype header value, e.g. "text/html;charset=utf-8";
 *                       may be null
 * @return a two-element array: ret[0] is the media type (default
 *         "text/html"), ret[1] is the charset (default "gb2312")
 */
private String[] parseContentType(String strcontenttype) {
    // defaults used when a part is missing from the header
    String[] straret = new String[] { "text/html", "gb2312" };
    if (strcontenttype == null) {
        return straret;
    }

    String[] parts = strcontenttype.split(";");
    if (parts.length > 0 && parts[0].trim().length() > 0) {
        straret[0] = parts[0].trim();
    }

    // parts[1..] are "name=value" parameters; pick the charset one
    for (int i = 1; i < parts.length; i++) {
        String param = parts[i].trim();
        int eq = param.indexOf('=');
        if (eq == -1) {
            continue;
        }
        if (!"charset".equalsIgnoreCase(param.substring(0, eq).trim())) {
            continue;
        }
        String value = param.substring(eq + 1).trim();
        if (value.length() >= 2 && value.startsWith("\"") && value.endsWith("\"")) {
            value = value.substring(1, value.length() - 1).trim();
        }
        if (value.length() > 0) {
            straret[1] = value;
        }
        break;
    }
    return straret;
}
/**
 * Fetches url with an HTTP GET (following redirects) and wraps the
 * response in a URLSummary.
 *
 * When the response charset is reported as ISO-8859-1 the encoding is
 * treated as uncertain: the raw bytes are run through
 * getFileEncoding, and if that reports big5 or gb2312 the body is
 * re-decoded as GBK.
 *
 * @param url absolute URL to download
 * @return a summary with url and body set, or null when the status code
 *         is not 200 or any exception occurs while loading
 */
private URLSummary loadURL(String url) {
    String ct = "";
    URLSummary summary = null;
    GetMethod get = null;
    try {
        get = new GetMethod(url);
        get.setFollowRedirects(true);
        int iGetResultCode = httpclient.executeMethod(get);
        if (iGetResultCode == 200) {
            ct = get.getResponseCharSet();
            String strGetResponseBody = get.getResponseBodyAsString();
            // Charset names are case-insensitive, so a header that says
            // "iso-8859-1" must take the detection path as well.
            // NOTE(review): presumably the HTTP client also reports
            // ISO-8859-1 when the server declares no charset - confirm.
            if ("ISO-8859-1".equalsIgnoreCase(ct)) {
                // Encoding is uncertain: ISO-8859-1 maps bytes 1:1, so
                // this recovers the original response bytes.
                byte[] atemp = strGetResponseBody.getBytes("ISO-8859-1");
                String strcharset = getFileEncoding(new ByteArrayInputStream(atemp));
                System.out.println("charset=="+ct+"strcharset="+strcharset);
                // NOTE(review): content detected as big5 is decoded as
                // GBK too; confirm this is intended.
                if ("big5".equalsIgnoreCase(strcharset) || "gb2312".equalsIgnoreCase(strcharset)) {
                    strGetResponseBody = new String(atemp, "GBK");
                }
            }
            summary = new URLSummary();
            summary.url = new URL(url);
            summary.body = strGetResponseBody;
        }
    } catch (Exception e) {
        // Best effort: any failure is reported to the caller as
        // "no summary".
        summary = null;
    } finally {
        if (get != null) {
            get.releaseConnection();
        }
    }
    return summary;
}
/**
 * Guesses the character encoding of the bytes in a stream using the
 * nsDetector charset detector.
 *
 * The result is, in order of preference: "ASCII" when every byte read
 * was ASCII; the charset the detector reported through its observer
 * callback; otherwise the first entry of the detector's list of probable
 * charsets. The stream is read to the end and always closed.
 *
 * @param imp stream to examine; closed by this method
 * @return name of the detected charset, "ASCII" by default
 * @throws Exception if reading or closing the stream fails
 */
public static String getFileEncoding(InputStream imp) throws Exception
{
    // One-element holder so the anonymous observer can hand the reported
    // charset back to this method.
    final String[] reported = new String[1];

    // NOTE(review): 2 is presumably the detector's "Chinese" language
    // hint - confirm against the nsDetector constants.
    nsDetector det = new nsDetector(2);
    // Notify() is called when a matching charset is found.
    det.Init(new nsICharsetDetectionObserver() {
        public void Notify(String charset) {
            reported[0] = charset;
        }
    });

    byte[] buf = new byte[1024];
    int len;
    boolean done = false;
    boolean isAscii = true;
    try {
        while ((len = imp.read(buf, 0, buf.length)) != -1) {
            // Check if the stream is only ascii.
            if (isAscii) {
                isAscii = det.isAscii(buf, len);
            }
            // DoIt if non-ascii and not done yet.
            if (!isAscii && !done) {
                done = det.DoIt(buf, len, false);
            }
        }
    } finally {
        imp.close();
    }
    det.DataEnd();

    if (isAscii) {
        return "ASCII";
    }
    if (reported[0] != null) {
        return reported[0];
    }
    String prob[] = det.getProbableCharsets();
    return (prob.length > 0) ? prob[0] : "ASCII";
}
/**
 * Reads command-line options into the spider's configuration fields.
 *
 * Recognized options: -u (start url, repeatable), -d (index dir),
 * -i (include), -e (exclude), -v (verbose), -a (incremental),
 * -m (mime type, repeatable), -t (thread count), -s (description size).
 * Unrecognized arguments are ignored. When no -m is given, text/html and
 * text/plain are accepted by default.
 *
 * @param argv command-line arguments
 * @throws IllegalArgumentException if an option is missing its value, if
 *         -u or -d is absent, or if the thread count is below 1
 *         (a non-numeric -t/-s value surfaces as NumberFormatException)
 */
private void parseArgs(String argv[]) {
    for (int i = 0; i < argv.length; i++) {
        String flag = argv[i];
        if (flag.equals("-u"))
            urls.add(nextArgValue(argv, ++i, flag));
        else if (flag.equals("-d"))
            indexDir = nextArgValue(argv, ++i, flag);
        else if (flag.equals("-i"))
            include.add(nextArgValue(argv, ++i, flag));
        else if (flag.equals("-e"))
            exclude.add(nextArgValue(argv, ++i, flag));
        else if (flag.equals("-v"))
            verbose = true;
        else if (flag.equals("-a"))
            incremental = true;
        else if (flag.equals("-m"))
            mimeTypes.put(nextArgValue(argv, ++i, flag), Boolean.TRUE);
        else if (flag.equals("-t"))
            threads = Integer.parseInt(nextArgValue(argv, ++i, flag));
        else if (flag.equals("-s"))
            descSize = Integer.parseInt(nextArgValue(argv, ++i, flag));
    }
    if (urls.size() == 0)
        throw new IllegalArgumentException(
                "Missing required argument: -u [start url]");
    if (indexDir == null)
        throw new IllegalArgumentException(
                "Missing required argument: -d [index dir]");
    if (threads < 1)
        throw new IllegalArgumentException("Invalid number of threads: "
                + threads);
    if (mimeTypes.size() == 0) {
        // add default MIME types
        mimeTypes.put("text/html", Boolean.TRUE);
        mimeTypes.put("text/plain", Boolean.TRUE);
    }
}

// Returns argv[index], or fails with a usage error when flag is the last argument.
private static String nextArgValue(String[] argv, int index, String flag) {
    if (index >= argv.length)
        throw new IllegalArgumentException(
                "Missing value for argument: " + flag);
    return argv[index];
}
/**
 * Writes one line of text to standard output.
 *
 * @param str text to print; a line terminator is appended
 */
private void print(String str) {
    final java.io.PrintStream stdout = System.out;
    stdout.println(str);
}
/**
 * Program entry point: creates a Spider and crawls from a fixed start
 * page. The command-line arguments are not consulted.
 *
 * @param argv command-line arguments (unused)
 * @throws Exception whatever spiderURL lets escape
 */
public static void main(String argv[]) throws Exception
{
    final String startPage = "http://www.sina.com.cn/";
    new Spider().spiderURL(startPage);
}
}
/**
 * Plain data holder for what the spider keeps about one fetched page.
 */
class URLSummary {
    /** Address of the page; null until the creator assigns it. */
    URL url;
    /** Text of the response body. */
    String body;
    /** Short description of the page; empty by default. */
    String desc = "";
    /** Page title; "Untitled" by default. */
    String title = "Untitled";

    /**
     * Renders url, desc and title on separate CRLF-terminated lines.
     * A url that has not been assigned yet is shown as "null" instead of
     * causing a NullPointerException.
     */
    public String toString() {
        return "URL=" + url + "\r\ndesc=" + desc + "\r\ntitle="
                + title + "\r\n";
    }
}
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -