http.cpp
字號:
#include <stdlib.h>#include <stdio.h>#include <string.h>#include <strings.h>#include <errno.h>#include <netdb.h>#include <unistd.h>#include <netinet/in.h>#include <sys/types.h>#include <sys/socket.h>#include <sys/time.h>#include <fcntl.h>#include <iostream>#include "Http.h"#include "Tse.h"#include "CommonDef.h"#include "Url.h"#include "Page.h"#include "StrFun.h"char *userAgent = NULL;int timeout = DEFAULT_TIMEOUT;int hideUserAgent = 0;CHttp::CHttp(){}CHttp::~CHttp(){} /* * Actually downloads the page, registering a hit (donation) * If the fileBuf passed in is NULL, the url is downloaded and then * freed; otherwise the necessary space is allocated for fileBuf. * Returns size of download on success, -1 on error is set, -2 out of ip block, -3 invalid host, -4 MIME is imag/xxx -300 on 301. */int CHttp::Fetch(string strUrl, char **fileBuf, char **fileHeadBuf, char **location, int* nPSock ){ char *tmp, *url, *requestBuf, *pageBuf; const char *host, *path; int sock, bytesRead = 0, bufsize = REQUEST_BUF_SIZE; int ret = -1, tempSize, selectRet; int port = 80; if( strUrl.empty() ){ cout << "strUrl is NULL" << endl; return -1; } /* Copy the url passed in into a buffer we can work with, change, etc. 
*//* url = (char*)malloc(strUrl.length()+1); if( url == NULL ){ cout << "can not allocate enought memory for url" << endl; return -1; } else { memset(url, 0,strUrl.length()+1); memcpy(url, strUrl.c_str(), strUrl.length() ); }*/ //pthread_mutex_lock(&mutexMemory); url = strdup(strUrl.c_str()); //pthread_mutex_unlock(&mutexMemory); if( url == NULL ){ cout << "!error: stdup() in Fetch()" << endl; return -1; } // parse the url CUrl u; if( u.ParseUrlEx(url) == false ){ cout << "ParseUrlEx error in Fetch(): " << strUrl << endl; return -1; } host = u.m_sHost.c_str(); path = u.m_sPath.c_str(); if( u.m_nPort > 0 ) port = u.m_nPort; /* Compose a request string */ //pthread_mutex_lock(&mutexMemory); requestBuf = (char*)malloc(bufsize); //pthread_mutex_unlock(&mutexMemory); if(requestBuf == NULL){ if (url) { //pthread_mutex_lock(&mutexMemory); free(url); url=NULL; //pthread_mutex_unlock(&mutexMemory); } cout << "can not allocate enought memory for requestBuf" << endl; return -1; } requestBuf[0] = 0; if( strlen(path) < 1 ){ /* The url has no '/' in it, assume the user is making a root-level * request */ tempSize = strlen("GET /") + strlen(HTTP_VERSION) +2;/* if( tempSize > bufsize ){ free(url); free(requestBuf); cout << "tempSize larger than bufsize" << endl; return -1; }*/ if(checkBufSize(&requestBuf, &bufsize, tempSize) || snprintf(requestBuf, bufsize, "GET / %s\r\n", HTTP_VERSION) < 0 ) { //pthread_mutex_lock(&mutexMemory); if (url) { free(url); url=NULL; } if (requestBuf) { free(requestBuf); requestBuf=NULL; } //pthread_mutex_unlock(&mutexMemory); cout << "1.checkBuffSize(&requestBuf..) 
error" << endl; return -1; } }else{ tempSize = strlen("GET ") + strlen(path) + strlen(HTTP_VERSION) + 4; if(checkBufSize(&requestBuf, &bufsize, tempSize) || snprintf(requestBuf, bufsize, "GET %s %s\r\n", path, HTTP_VERSION) < 0) { //pthread_mutex_lock(&mutexMemory); if (url) { free(url); url=NULL; } if (requestBuf) { free(requestBuf); requestBuf=NULL; } //pthread_mutex_unlock(&mutexMemory); cout << "2._checkBuffSize(&requestBuf..) error" << endl; return -1; } } /* Use Host: even though 1.0 doesn't specify it. Some servers * won't play nice if we don't send Host, and it shouldn't hurt anything */ tempSize = (int)strlen("Host: ") + (int)strlen(host) + 3;/* +3 for "\r\n\0" */ if(checkBufSize(&requestBuf, &bufsize, tempSize + 128)){ //pthread_mutex_lock(&mutexMemory); if (url) { free(url); url=NULL; } if (requestBuf) { free(requestBuf); requestBuf=NULL; } //pthread_mutex_unlock(&mutexMemory); cout << "3._checkBuffSize(&requestBuf..) error" << endl; return -1; } strcat(requestBuf, "Host: "); strcat(requestBuf, host); strcat(requestBuf, "\r\n"); if(!hideUserAgent && userAgent == NULL) { tempSize = (int)strlen("User-Agent: ") + (int)strlen(DEFAULT_USER_AGENT) + (int)strlen(VERSION) + 4; if(checkBufSize(&requestBuf, &bufsize, tempSize)) { //pthread_mutex_lock(&mutexMemory); if (url) { free(url); url=NULL; } if (requestBuf) { free(requestBuf); requestBuf=NULL; } //pthread_mutex_unlock(&mutexMemory); cout << "4._checkBuffSize(&requestBuf..) error" << endl; return -1; } strcat(requestBuf, "User-Agent: "); strcat(requestBuf, DEFAULT_USER_AGENT); strcat(requestBuf, "/"); strcat(requestBuf, VERSION); strcat(requestBuf, "\r\n"); } else if(!hideUserAgent) { tempSize = (int)strlen("User-Agent: ") + (int)strlen(userAgent) + 3; if(checkBufSize(&requestBuf, &bufsize, tempSize)) { //pthread_mutex_lock(&mutexMemory); if (url) { free(url); url=NULL; } if (requestBuf) { free(requestBuf); requestBuf=NULL; } //pthread_mutex_unlock(&mutexMemory); cout << "5._checkBuffSize(&requestBuf..) 
error" << endl; return -1; } strcat(requestBuf, "User-Agent: "); strcat(requestBuf, userAgent); strcat(requestBuf, "\r\n"); } //tempSize = (int)strlen("Connection: Close\n\n"); tempSize = (int)strlen("Connection: Keep-Alive\r\n\r\n"); if(checkBufSize(&requestBuf, &bufsize, tempSize)) { //pthread_mutex_lock(&mutexMemory); if (url) { free(url); url=NULL; } if (requestBuf) { free(requestBuf); requestBuf=NULL; } //pthread_mutex_unlock(&mutexMemory); cout << "6._checkBuffSize(&requestBuf..) error" << endl; return -1; } //strcat(requestBuf, "Connection: Close\n\n"); strcat(requestBuf, "Connection: Keep-Alive\r\n\r\n"); /* Now free any excess memory allocated to the buffer */ //pthread_mutex_lock(&mutexMemory); tmp = (char *)realloc(requestBuf, strlen(requestBuf) + 1); //pthread_mutex_unlock(&mutexMemory); if(tmp == NULL){ //pthread_mutex_lock(&mutexMemory); if (url) { free(url); url=NULL; } if (requestBuf) { free(requestBuf); requestBuf=NULL; } //pthread_mutex_unlock(&mutexMemory); cout << "realloc for tmp error" << endl; return -1; } requestBuf = tmp; if( *nPSock != -1 ){ sock = *nPSock; cout << "using privous socket " << *nPSock << endl; }else{ // cout << "1.get a new one" << endl; sock = CreateSocket( host, port ); if(sock == -1) { // invalid host //pthread_mutex_lock(&mutexMemory); if (url) { free(url); url=NULL; } if (requestBuf) { free(requestBuf); requestBuf=NULL; } //pthread_mutex_unlock(&mutexMemory); return -3; } if(sock == -2) { // out of ip block //pthread_mutex_lock(&mutexMemory); if (url) { free(url); url=NULL; } if (requestBuf) { free(requestBuf); requestBuf=NULL; } //pthread_mutex_unlock(&mutexMemory); //cout << "2.not able to MakeSocket" << endl; return -2; } } ret = write(sock, requestBuf, strlen(requestBuf)); if( ret == 0 ){ cout << "requestBuf is " << requestBuf << endl; cout << "write nothing" << endl; //pthread_mutex_lock(&mutexMemory); if (url) { free(url); url=NULL; } if (requestBuf) { free(requestBuf); requestBuf=NULL; } 
//pthread_mutex_unlock(&mutexMemory); close(sock); *nPSock = -1; return -1; } if( ret == -1){ //cout << "write error" << endl; // sock is invalid,we should make a new one close(sock); *nPSock = -1; cout << "2.close previous socket " << *nPSock << " and get a new one" << endl; //maybe sock is dead,try again sock = CreateSocket( host, port ); if(sock == -1) { //pthread_mutex_lock(&mutexMemory); if (url) { free(url); url=NULL; } if (requestBuf) { free(requestBuf); requestBuf=NULL; } //pthread_mutex_unlock(&mutexMemory); cout << "3.not able to MakeSocket" << endl; return -1; } if(sock == -2) { //pthread_mutex_lock(&mutexMemory); if (url) { free(url); url=NULL; } if (requestBuf) { free(requestBuf); requestBuf=NULL; } //pthread_mutex_unlock(&mutexMemory); cout << "4.not able to MakeSocket" << endl; return -1; } if(write(sock, requestBuf, strlen(requestBuf)) == -1){ //pthread_mutex_lock(&mutexMemory); if (url) { free(url); url=NULL; } if (requestBuf) { free(requestBuf); requestBuf=NULL; } //pthread_mutex_unlock(&mutexMemory); close(sock); *nPSock = -1; cout << "write error" << endl; return -1; } } //pthread_mutex_lock(&mutexMemory); if (url) { free(url); url=NULL; } if (requestBuf) { free(requestBuf); requestBuf=NULL; } //pthread_mutex_unlock(&mutexMemory); char headerBuf[HEADER_BUF_SIZE]; /* Grab enough of the response to get the metadata */ memset( headerBuf,0,HEADER_BUF_SIZE ); //cout << "old sock is " << sock << endl; ret = read_header(sock, headerBuf); //cout << "ret = " << ret << endl; if(ret < 0) { close(sock); *nPSock = -1; return -1; } //cout << headerBuf << endl; if( strlen(headerBuf) == 0 ){ cout << "strlen(headerBuf) = 0" << headerBuf << endl; cout << "strUrl: " << strUrl << endl << endl;; close(sock); *nPSock = -1; return -1; } CPage iPage; iPage.ParseHeaderInfo(headerBuf); if (iPage.m_nStatusCode == -1) { close(sock); *nPSock = -1; cout << "headerBuf: " << headerBuf << endl; cout << "!header error: not find HTTP" << endl; return -1; } #ifdef DEBUG // http 
return code cout <<"######Http return code: ######" << endl << i << endl; #endif // deal with http://net.cs.pku.edu.cn/~cnds if (iPage.m_nStatusCode == 301 || iPage.m_nStatusCode == 302) { if (iPage.m_sLocation.empty() || iPage.m_sLocation.size()>URL_LEN) { close(sock); *nPSock = -1; cout << headerBuf << endl; cout << "!error: Location" << endl; return -1; } else{ //pthread_mutex_lock(&mutexMemory); char *loc=strdup(iPage.m_sLocation.c_str()); //pthread_mutex_unlock(&mutexMemory); *location = loc; close(sock); *nPSock = -1; return -300; } } if(iPage.m_nStatusCode<200 || iPage.m_nStatusCode>299 ){ close(sock); *nPSock = -1; cout << "!header code = " << iPage.m_nStatusCode << endl; return -1; } // when crawling images for ImgSE, remember to comment the paragraph // when crawling plain text for SE, remember to open the paragraph // paragraph begin
快捷鍵說明
複製代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -