亚洲欧美第一页_禁久久精品乱码_粉嫩av一区二区三区免费野_久草精品视频

? 歡迎來到蟲蟲下載站! | ?? 資源下載 ?? 資源專輯 ?? 關于我們
? 蟲蟲下載站

?? crawl.cpp

?? 搜索引擎部分代碼
?? CPP
?? 第 1 頁 / 共 3 頁
字號:
#include "Crawl.h"
#include "Url.h"
#include "Md5.h"
#include <list.h>
#include <hlink.h>
#include <uri.h>

extern pthread_mutex_t mymutex;
extern map<string,string> mapCacheHostLookup;
extern vector<string> vsUnreachHost;
extern char **ParseRobot( char *data, char len);

set<string> setVisitedUrlMD5;
set<string> setVisitedPageMD5;
set<string> setUnvisitedUrlMD5;
set<string> setUnreachHostMD5;

multimap<string, string, less<string> > replicas;

pthread_mutex_t mutexCollection = PTHREAD_MUTEX_INITIALIZER;	// unvisited urls
pthread_mutex_t mutexUnreachHost = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexUnvisitedUrlMD5 = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexVisitedUrlMD5 = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexVisitedPageMD5 = PTHREAD_MUTEX_INITIALIZER;

pthread_mutex_t mutexDetect = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexLink4SEFile = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexLink4HistoryFile = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexIsamFile = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexVisitedUrlFile = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexUnreachHostFile = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexReplicas = PTHREAD_MUTEX_INITIALIZER;
//pthread_mutex_t mutexMemory = PTHREAD_MUTEX_INITIALIZER;

map<unsigned long,unsigned long> mapIpBlock;
bool b_fOver;

// Queue of URLs still to be crawled; fetch() pops entries from it while
// holding mutexCollection.
//multimap<string,string, less<string> > mmapUrls;
multimap<string,string > mmapUrls;

typedef map<unsigned long,unsigned long>::value_type valTypeIpBlock;
typedef map<string,string>::value_type mvalType;

void SaveReplicas(const char* filename);

// Context handed to onfind() through its opaque `arg` pointer.
struct package
{
	CCrawl *crawl;
	CPage *page;
};

vector<string> vsParsedLinks;

/***********************************************************************
 * Function name: onfind
 * Input argv:
 * 	-- elem: name of the HTML element the link was found in
 * 	-- attr: attribute name (not used here)
 * 	-- uri:  parsed link; destroyed and freed by this function
 * 	-- arg:  a `struct package*` carrying the crawler and current page
 * Output argv:
 * 	--
 * Return: always 1
 * Function Description: recombines the URI into a string, records it in
 * 	vsParsedLinks, and -- unless the page filters it out -- either
 * 	logs it to the Link4History file (for "img" elements) or queues it
 * 	via CCrawl::AddUrl (for everything else).
 * Be careful: vsParsedLinks is a global that is appended to without a
 * 	lock here. NOTE(review): confirm that callers serialize access.
***********************************************************************/
int onfind(const char *elem, const char *attr, struct uri *uri, void *arg)
{
	struct package *p = (struct package*)arg;
	char buff[URL_LEN+1];

	//	if (uri_recombine(uri, buff, URL_LEN+1, C_URI) >= 0)
	if (uri_recombine(uri, buff, URL_LEN+1, C_SCHEME| C_AUTHORITY| C_PATH| C_QUERY ) >= 0)
	{
		vsParsedLinks.push_back(buff);

		if( !p->page->IsFilterLink(buff) )
		{
			// accept "a,link,frame,iframe,img,area"
			if (strcasecmp(elem, "img") == 0)
			{
				// Image links are only logged, never crawled.
				// (An earlier, disabled variant logged p->page->m_sUrl,
				// i.e. the containing page, instead of the image URL.)
				pthread_mutex_lock(&mutexLink4HistoryFile);
				if( p->crawl->m_ofsLink4HistoryFile ){
					p->crawl->m_ofsLink4HistoryFile << buff << endl;
				}
				pthread_mutex_unlock(&mutexLink4HistoryFile);
			} else {
				p->crawl->AddUrl( buff );
			}
		}
	}

	// This callback owns the uri object handed to it.
	uri_destroy(uri);
	free(uri);
	return 1;
}

/***********************************************************************
 * Function name: start
 * Input argv:
 * 	-- arg: the CCrawl handle
 * Output argv:
 * 	--
 * Return: always NULL
 * Function Description: thread-style entry point (void* (*)(void*)) that
 * 	runs CCrawl::fetch on the given crawler.
***********************************************************************/
void* start(void *arg)
{
	( (CCrawl*)arg )->fetch(arg);

	// A function returning void* must return a value; flowing off the
	// end is undefined behaviour in C++.
	return NULL;
}

/*****************************************************************
 * Function name: SaveUnvisitedUrl
 * Input argv:
 *      --
 * Output argv:
 *      --
 * Return:
 * Function Description: Save the unvisited URLs -- writes the mapped
 *      value of every mmapUrls entry to UNVISITED_FILE, one per line,
 *      truncating any previous content. Exits the process with -1 if
 *      the file cannot be opened.
 * Version: 1.0
 * Be careful: mmapUrls is iterated without taking mutexCollection.
 *      NOTE(review): confirm no fetch() thread is still running when
 *      this is called.
 ****************************************************************/
void SaveUnvisitedUrl()
{
	ofstream ofsUnvisitedUrl;
	ofsUnvisitedUrl.open(UNVISITED_FILE.c_str(), ios::in|ios::out|ios::trunc|ios::binary);
	if (!ofsUnvisitedUrl) {
		cerr << "cannot open " << UNVISITED_FILE << " for output" << endl;
		exit (-1);
	}

	for (multimap<string,string>::iterator it = mmapUrls.begin();
	     it != mmapUrls.end(); ++it) {
		ofsUnvisitedUrl << it->second.c_str() << "\n";
	}

	ofsUnvisitedUrl << endl;
	ofsUnvisitedUrl.close();
}

/***********************************************************************
 * Function name: fetch
 * Input argv:
 * 	-- arg: the CCrawl handle
 * Output argv:
 * 	--
 * Return:
 * Function Description: worker loop. Repeatedly pops one URL from
 * 	mmapUrls (under mutexCollection), parses it and hands it to
 * 	DownloadFile together with per-thread Tianwang / Link4SE output
 * 	files. The socket nGSock is carried across iterations and dropped
 * 	whenever the host changes. When the queue is empty the thread
 * 	sleeps 1000 microseconds and counts the sleep.
 * 	The loop ends once b_fOver is set and at least 200 idle sleeps have
 * 	accumulated.
***********************************************************************/
void CCrawl::fetch(void *arg)
{
	string strUrl;
	int	nGSock = -1;
	string	strGHost = "";

	// create a Tianwang file for output the raw page data
	string ofsName = DATA_TIANWANG_FILE + "." + CStrFun::itos(pthread_self());
	CTianwangFile tianwangFile(ofsName);

	// create a Link4SE file for output the raw link data
	ofsName = DATA_LINK4SE_FILE + "." + CStrFun::itos(pthread_self());
	CLink4SEFile link4SEFile(ofsName);

	int iSleepCnt = 0;

	for(;;){
		pthread_mutex_lock(&mutexCollection);

		int cnt = mmapUrls.size();
		if(cnt > 0){
			cout << "collection has: " << cnt << " unvisited urls" << endl;

			multimap<string,string>::iterator it = mmapUrls.begin();
			if( it != mmapUrls.end() ){
				// get an URL and remove it from the collection
				strUrl = (*it).second;
				mmapUrls.erase( it );
				pthread_mutex_unlock(&mutexCollection);

				// parse URL
				CUrl iUrl;
				if( iUrl.ParseUrlEx(strUrl) == false ){
					cout << "ParseUrlEx error in fetch(): " << strUrl << endl;
					continue;
				}

				// The cached socket belongs to the previous host; drop it
				// when the host changes. Skip close() when there is no
				// socket yet (nGSock == -1).
				if( strGHost != iUrl.m_sHost ){
					if( nGSock >= 0 ){
						close( nGSock );
					}
					nGSock = -1;
					strGHost = iUrl.m_sHost;
				}

				(( CCrawl* )arg)->DownloadFile(&tianwangFile,&link4SEFile,iUrl,nGSock);
			} else {
				pthread_mutex_unlock(&mutexCollection);
			}
		} else {
			pthread_mutex_unlock(&mutexCollection);
			usleep(1000);
			iSleepCnt++;
		}

		// ">=" rather than "==": the idle counter keeps growing, so with
		// an exact match a thread whose counter had already passed 200
		// before b_fOver was set could never leave the loop.
		if( b_fOver == true && iSleepCnt >= 200 )
			break;
	}

	// Release the socket still cached from the last download, if any.
	if( nGSock >= 0 ){
		close( nGSock );
		nGSock = -1;
	}

	tianwangFile.Close();
	link4SEFile.Close();
}

/***********************************************************************
 * Function name: DownloadFile
 * Input argv:
 * 	-- pTianwangFile: the Tianwang output file (fetch() passes its
 * 	   per-thread CTianwangFile)
 * 	-- pLink4SEFile: the Link4SE output file (fetch() passes its
 * 	   per-thread CLink4SEFile)
 * 	-- iUrl: the URL for crawling
 * 	-- nGSock: the previous global socket (in/out, passed by reference)
 * Output argv:
 * 	--
 * Return:
***********************************************************************/
void CCrawl::DownloadFile(CTianwangFile *pTianwangFile,
	CLink4SEFile *pLink4SEFile, CUrl iUrl, int& nGSock)
{
	char	*downloaded_file = NULL,
		*fileHead = NULL,
		*location = NULL;
	int file_length = 0;
	string 
strUrlLocation = "";	int nSock = nGSock;	cout << "1. pid=" << pthread_self() << " sock = " << nGSock << endl;	CHttp http;	file_length = http.Fetch(iUrl.m_sUrl, &downloaded_file, &fileHead, &location, &nSock);	#ifdef DEBUG	// just download		cout << "######file length: ######" << file_length << endl;		cout << "######head: ######" << fileHead << endl;	#endif	int nCount = 0;	while( file_length == -300 ){ // moved to an another place		if( strlen(location) > URL_LEN-1 || nCount == 3 || strlen(location)==0 ){			if( location )			{				//pthread_mutex_lock(&mutexMemory); 				free( location ); location = NULL;				//pthread_mutex_unlock(&mutexMemory);			}			file_length = -1;			break;		}		strUrlLocation = location;		if(location)		{			//pthread_mutex_lock(&mutexMemory);			free(location); location = NULL;			//pthread_mutex_unlock(&mutexMemory);		}		string::size_type idx1 = CStrFun::FindCase(strUrlLocation, "http");		if( idx1 != 0 ){			char c1 = iUrl.m_sUrl.at(iUrl.m_sUrl.length()-1);			char c2 = strUrlLocation.at(0);			if( c2 == '/' ){				strUrlLocation = "http://" + iUrl.m_sHost + strUrlLocation;			}else if(  c1!='/' && c2!='/'){				string::size_type idx;                                                                                                                                        idx = iUrl.m_sUrl.rfind('/');                                if( idx != string::npos ){                                        if( idx > 6 ){ // > strlen("http://..")                                                strUrlLocation = iUrl.m_sUrl.substr(0, idx+1) + strUrlLocation;                                        } else {                                                strUrlLocation = iUrl.m_sUrl + "/" + strUrlLocation;                                        }                                                                                                                                        } else {					file_length = -1;					break;                                }			} else {				if( c1=='/' 
){                                        strUrlLocation = iUrl.m_sUrl + strUrlLocation;                                } else {                                        strUrlLocation = iUrl.m_sUrl + "/" + strUrlLocation;                                }                        }		}		CPage iPage;		if( iPage.IsFilterLink(strUrlLocation) ){			file_length = -1;			break;		}		cout << "2. pid=" << pthread_self() << " sock = " << nGSock << endl;		file_length = http.Fetch( strUrlLocation, &downloaded_file, &fileHead, &location, &nSock);		nCount++;	}	nGSock = nSock;	if(file_length == -1){ // unreachable, skipped.		cout << "!-: " << iUrl.m_sUrl << endl;		//pthread_mutex_lock(&mutexMemory);		if (fileHead)		{			free(fileHead); fileHead=NULL;		}		if (downloaded_file)		{			free(downloaded_file); downloaded_file=NULL;		}		//pthread_mutex_unlock(&mutexMemory);		cout << "-unreach host: " << iUrl.m_sHost << endl;;		return;	}	if(file_length == -2){ // out of ip block .		//pthread_mutex_lock(&mutexMemory);		if (fileHead)		{			free(fileHead); fileHead=NULL;		}		if (downloaded_file)		{			free(downloaded_file); downloaded_file=NULL;		}		//pthread_mutex_unlock(&mutexMemory);		// save unreach host                SaveUnreachHost(iUrl.m_sHost);		cout << "-out of block host: " << iUrl.m_sHost << endl;;		return;	}	if(file_length == -3) { // invalid host or ip		//pthread_mutex_lock(&mutexMemory);		if (fileHead)		{			free(fileHead); fileHead=NULL;		}		if (downloaded_file)		{			free(downloaded_file); downloaded_file=NULL;		}		//pthread_mutex_unlock(&mutexMemory);		cout << "-invalid host: " << iUrl.m_sHost << endl;		return;	}	if(file_length == -4) {	// MIME is image/xxx		//pthread_mutex_lock(&mutexMemory);		if (fileHead)		{			free(fileHead); fileHead=NULL;		}		if (downloaded_file)		{			free(downloaded_file); downloaded_file=NULL;		}		//pthread_mutex_unlock(&mutexMemory);		if( m_ofsLink4HistoryFile ){			pthread_mutex_lock(&mutexLink4HistoryFile);			m_ofsLink4HistoryFile << iUrl.m_sUrl << endl;;			
pthread_mutex_unlock(&mutexLink4HistoryFile);		}		cout << "-imgage host: " << iUrl.m_sHost << endl;		return;	}	/* still experiment	char **dir;	dir =  ParseRobot( downloaded_file, file_length);	for( int i = 0; dir[i] != NULL ; i++){		cout << dir[i] << endl;		free( dir[i] );	}	exit(1);	*/	// so small, maybe some unuseful info, skipped	//if(file_length < 40){	// for ImgSE, /*	if(file_length < 256){	// for SE		//pthread_mutex_lock(&mutexMemory);		if (fileHead)		{			free(fileHead); fileHead=NULL;		}		if (downloaded_file)		{			free(downloaded_file); downloaded_file=NULL;		}		//pthread_mutex_unlock(&mutexMemory);		cout << "#";		return;	}*/	// deal with normal page	if (!fileHead || !downloaded_file)	{		//pthread_mutex_lock(&mutexMemory);		if (fileHead)		{			free(fileHead); fileHead=NULL;		}		if (downloaded_file)		{			free(downloaded_file); downloaded_file=NULL;		}		//pthread_mutex_unlock(&mutexMemory);		close(nGSock);		nGSock = -1;		cout << "-size0 host: " << iUrl.m_sHost << endl;		return;	}	CPage iPage(iUrl.m_sUrl, strUrlLocation, fileHead, downloaded_file, file_length);	//pthread_mutex_lock(&mutexMemory);	if (fileHead)	{		free(fileHead); fileHead=NULL;	}	if (downloaded_file)	{		free(downloaded_file); downloaded_file=NULL;	}	//pthread_mutex_unlock(&mutexMemory);	iPage.ParseHeaderInfo(iPage.m_sHeader);	if( iPage.m_bConnectionState == false ){		close(nGSock);		nGSock = -1;	}	// when crawling images for ImgSE, remember to comment the paragraph	// when crawling plain text for SE, remember to open the paragraph	// paragraph begin	// iPage.m_sContentType != "text/css" &&	if( iPage.m_sContentType != "text/html" && 		iPage.m_sContentType != "text/plain" &&		iPage.m_sContentType != "text/xml" &&		iPage.m_sContentType != "application/msword" &&		iPage.m_sContentType != "application/pdf" &&		iPage.m_sContentType != "text/rtf" &&		iPage.m_sContentType != "application/postscript" &&		iPage.m_sContentType != "application/vnd.ms-execl" &&		iPage.m_sContentType != 
"application/vnd.ms-powerpoint" ){

?? 快捷鍵說明

復制代碼 Ctrl + C
搜索代碼 Ctrl + F
全屏模式 F11
切換主題 Ctrl + Shift + D
顯示快捷鍵 ?
增大字號 Ctrl + =
減小字號 Ctrl + -
亚洲欧美第一页_禁久久精品乱码_粉嫩av一区二区三区免费野_久草精品视频
1024成人网| 欧美一级艳片视频免费观看| 国产精品欧美久久久久无广告| 国产一区二区三区在线观看精品| 精品久久久久一区| 国产精品一区二区黑丝| 国产嫩草影院久久久久| 91亚洲精品久久久蜜桃| 亚洲免费在线播放| 欧美日韩黄色影视| 看片网站欧美日韩| 精品成人免费观看| 成人一级片在线观看| 自拍偷拍欧美激情| 欧美美女喷水视频| 精品一区二区久久久| 欧美韩国日本综合| 欧美在线一区二区| 美国av一区二区| 国产精品久久福利| 在线播放中文一区| 高清在线观看日韩| 亚洲国产视频直播| 精品成人一区二区三区| 91小视频免费观看| 激情亚洲综合在线| 一区二区三区欧美日| 在线观看日韩电影| 日韩影院在线观看| 国产视频911| 欧美色国产精品| 国产一区 二区 三区一级| 亚洲欧美色综合| 欧美大黄免费观看| 欧美亚洲高清一区| 国产黄色91视频| 亚洲成人中文在线| 欧美激情一区三区| 日韩欧美国产综合| 日本久久一区二区| 岛国一区二区在线观看| 日本免费在线视频不卡一不卡二| 国产精品你懂的在线欣赏| 欧美日韩精品一区二区天天拍小说 | 95精品视频在线| 激情久久久久久久久久久久久久久久| 亚洲日本丝袜连裤袜办公室| 欧美精品一区二区不卡| 精品视频一区二区不卡| 成人av电影观看| 久久国内精品视频| 午夜精品免费在线观看| 最新成人av在线| 国产欧美精品在线观看| 欧美成人一区二区三区在线观看| 欧美日韩在线播| 91精品91久久久中77777| 国产91精品露脸国语对白| 麻豆91小视频| 天天操天天干天天综合网| 亚洲精品中文字幕乱码三区| 国产精品沙发午睡系列990531| 2021国产精品久久精品| 欧美一区二区免费视频| 欧美日韩一区二区在线视频| 91免费国产在线观看| 懂色av一区二区在线播放| 精彩视频一区二区三区| 久久av中文字幕片| 日本aⅴ精品一区二区三区| 亚洲成人资源在线| 亚洲国产另类精品专区| 亚洲大片精品永久免费| 性感美女极品91精品| 亚洲成a人v欧美综合天堂| 亚洲愉拍自拍另类高清精品| 亚洲综合色自拍一区| 亚洲图片一区二区| 亚洲国产另类精品专区| 五月激情丁香一区二区三区| 日韩精品电影一区亚洲| 麻豆国产精品视频| 日本不卡123| 激情文学综合网| 国产成人精品三级| 成人在线视频首页| av激情成人网| 在线国产电影不卡| 欧美日韩一区在线观看| 欧美一区二区精美| 久久久精品中文字幕麻豆发布| 久久人人97超碰com| 国产日韩欧美a| 国产精品萝li| 亚洲国产综合色| 久久精品99国产精品| 国产精品系列在线观看| 99久久综合精品| 欧美三片在线视频观看 | 99国产精品视频免费观看| 99精品欧美一区二区三区小说| 色综合天天综合狠狠| 欧美日韩亚洲国产综合| 日韩久久免费av| 中文字幕不卡的av| 亚洲成av人影院| 麻豆传媒一区二区三区| 岛国一区二区在线观看| 欧美午夜精品电影| 欧美不卡视频一区| 中文字幕亚洲区| 日韩电影在线观看网站| 成人综合婷婷国产精品久久免费| 91色乱码一区二区三区| 欧美一级日韩一级| 国产精品久久久久久福利一牛影视| 亚洲va在线va天堂| 国产成人亚洲综合色影视| 欧美中文字幕不卡| 26uuu欧美日本| 亚洲一区在线观看视频| 国产精品亚洲第一| 精品视频在线免费看| 亚洲国产精品精华液ab| 午夜亚洲国产au精品一区二区| 狠狠色狠狠色综合日日91app| 91香蕉视频mp4| 久久蜜桃av一区二区天堂| 亚洲线精品一区二区三区八戒| 国内精品写真在线观看| 欧美日韩三级一区二区| 国产精品妹子av| 日韩av不卡一区二区| 色噜噜久久综合| 欧美国产精品一区| 久久精品国产77777蜜臀| 欧美日韩国产在线观看| 国产精品久99| 国产夫妻精品视频| 欧美www视频| 日本亚洲视频在线| 91福利社在线观看| 国产精品福利一区二区| 国产成人精品免费网站| 精品欧美一区二区久久| 日本在线观看不卡视频| 欧美色老头old∨ideo| 亚洲乱码中文字幕| 成人久久视频在线观看| 久久中文娱乐网| 免费久久99精品国产| 欧美中文字幕一区二区三区亚洲 | eeuss鲁一区二区三区| 国产亚洲精品福利| 激情偷乱视频一区二区三区| 欧美成人高清电影在线| 日韩精品一级中文字幕精品视频免费观看 | 91国产免费看| 亚洲另类一区二区| 99热精品一区二区| 中文字幕一区二区三区四区| 国产成人日日夜夜| 国产午夜精品一区二区三区嫩草| 韩国v欧美v日本v亚洲v| 精品免费国产一区二区三区四区| 日韩电影在线一区二区三区| 制服丝袜亚洲色图| 午夜av电影一区| 制服丝袜在线91| 
日韩高清一级片| 欧美一区二区在线视频| 日韩不卡在线观看日韩不卡视频| 69成人精品免费视频| 麻豆久久一区二区| 久久久噜噜噜久久人人看| 国产不卡一区视频| 日韩毛片一二三区| 在线观看亚洲专区| 日韩激情视频在线观看| 制服丝袜亚洲网站| 国产主播一区二区| 中文字幕一区二区三区在线观看| 91一区二区三区在线观看| 一区二区三区av电影| 欧美高清你懂得| 狠狠色丁香婷婷综合久久片| 中文字幕av一区二区三区| 91视频在线观看免费| 亚洲成人免费影院| 久久网这里都是精品| 91在线观看成人| 日韩精品福利网| 久久九九99视频| 色香蕉久久蜜桃| 美女网站色91| 中文字幕亚洲欧美在线不卡| 欧美日韩高清一区二区| 久久99蜜桃精品| 一区在线观看视频| 欧美精品1区2区|