httptse.cpp
字號:
#include <stdlib.h>#include <stdio.h>#include <string.h>#include <strings.h>#include <errno.h>#include <netdb.h>#include <unistd.h>#include <netinet/in.h>#include <sys/types.h>#include <sys/socket.h>#include <sys/time.h>#include <fcntl.h>#include <iostream>#include "HttpTse.h"#include "Tse.h"#include "Url.h"#include "Page.h"#include "StrFun.h"int _checkBufSize(char **buf, int *bufsize, int more);using namespace std;char *userAgent = NULL;int timeout = DEFAULT_TIMEOUT;int hideUserAgent = 0;map<string,string> mapCacheHostLookup;typedef map<string,string>::value_type valTypeCHL;extern map<unsigned long,unsigned long> mapIpBlock;extern vector<string> vsUnreachHost; /* * Actually downloads the page, registering a hit (donation) * If the fileBuf passed in is NULL, the url is downloaded and then * freed; otherwise the necessary space is allocated for fileBuf. * Returns size of download on success, -1 on error is set, * -2 on 301. */int HttpFetch(string strUrl, char **fileBuf, char **fileHeadBuf, char **location, int* nPSock ){ //const char *url_tmp = strUrl.c_str(); char *tmp, *url, *host, *charIndex, *requestBuf, *pageBuf; int sock, bytesRead = 0, contentLength = -1, bufsize = REQUEST_BUF_SIZE; int ret = -1, i=-1, tempSize, selectRet; //if(url_tmp == NULL){ if( strUrl.empty() ){ cout << "strUrl is NULL" << endl; return -1; } /* Copy the url passed in into a buffer we can work with, change, etc. 
*/ //url = (char*)malloc(strlen(url_tmp)+1); url = (char*)malloc(strUrl.length()+1); if(url == NULL){ cout << "can not allocate enought memory for url" << endl; return -1; } memset(url, 0,strUrl.length()+1); //memset(url, strlen(url_tmp)+1, 0); //strncpy(url, url_tmp, strlen(url_tmp) + 1); memcpy(url, strUrl.c_str(), strUrl.length() ); charIndex = strstr(url, "://"); if(charIndex != NULL){ /* url contains a protocol field */ charIndex += strlen("://"); host = charIndex; charIndex = strchr(charIndex, '/'); }else{ host = (char *)url; charIndex = strchr(url, '/'); } /* Compose a request string */ requestBuf = (char*)malloc(bufsize); if(requestBuf == NULL){ free(url); cout << "can not allocate enought memory for requestBuf" << endl; return -1; } requestBuf[0] = 0; if(charIndex == NULL){ /* The url has no '/' in it, assume the user is making a root-level * request */ tempSize = strlen("GET /") + strlen(HTTP_VERSION) +1; if( tempSize > bufsize ){ free(url); free(requestBuf); cout << "tempSize larger than bufsize" << endl; return -1; } if(_checkBufSize(&requestBuf, &bufsize, tempSize) || snprintf(requestBuf, bufsize, "GET / %s\n", HTTP_VERSION) < 0 ) { free(url); free(requestBuf); cout << "1._checkBuffSize(&requestBuf..) error" << endl; return -1; } //requestBuf = "GET / " + (string)HTTP_VERSION + "\n"; }else{ tempSize = strlen("GET ") + strlen(charIndex) + strlen(HTTP_VERSION) + 3; if(_checkBufSize(&requestBuf, &bufsize, tempSize) || snprintf(requestBuf, bufsize, "GET %s %s\n", charIndex, HTTP_VERSION) < 0) { free(url); free(requestBuf); cout << "2._checkBuffSize(&requestBuf..) error" << endl; return -1; } //requestBuf = "GET / " + (string)charIndex + HTTP_VERSION + "\n"; } /* Null out the end of the hostname if need be */ if(charIndex != NULL){ *charIndex = 0; } /* Use Host: even though 1.0 doesn't specify it. 
Some servers * won't play nice if we don't send Host, and it shouldn't hurt anything */ tempSize = (int)strlen("Host: ") + (int)strlen(host) + 2;/* +2 for "\n\0" */ if(_checkBufSize(&requestBuf, &bufsize, tempSize + 128)){ free(url); free(requestBuf); cout << "3._checkBuffSize(&requestBuf..) error" << endl; return -1; } strcat(requestBuf, "Host: "); strcat(requestBuf, host); strcat(requestBuf, "\n"); if(!hideUserAgent && userAgent == NULL) { tempSize = (int)strlen("User-Agent: ") + (int)strlen(DEFAULT_USER_AGENT) + (int)strlen(VERSION) + 3; if(_checkBufSize(&requestBuf, &bufsize, tempSize)) { free(url); free(requestBuf); cout << "4._checkBuffSize(&requestBuf..) error" << endl; return -1; } strcat(requestBuf, "User-Agent: "); strcat(requestBuf, DEFAULT_USER_AGENT); strcat(requestBuf, "/"); strcat(requestBuf, VERSION); strcat(requestBuf, "\n"); } else if(!hideUserAgent) { tempSize = (int)strlen("User-Agent: ") + (int)strlen(userAgent) + 2; if(_checkBufSize(&requestBuf, &bufsize, tempSize)) { free(url); free(requestBuf); cout << "5._checkBuffSize(&requestBuf..) error" << endl; return -1; } strcat(requestBuf, "User-Agent: "); strcat(requestBuf, userAgent); strcat(requestBuf, "\n"); } //tempSize = (int)strlen("Connection: Close\n\n"); tempSize = (int)strlen("Connection: Keep-Alive\n\n"); if(_checkBufSize(&requestBuf, &bufsize, tempSize)) { free(url); free(requestBuf); cout << "6._checkBuffSize(&requestBuf..) 
error" << endl; return -1; } //strcat(requestBuf, "Connection: Close\n\n"); strcat(requestBuf, "Connection: Keep-Alive\n\n"); /* Now free any excess memory allocated to the buffer */ tmp = (char *)realloc(requestBuf, strlen(requestBuf) + 1); if(tmp == NULL){ free(url); free(requestBuf); cout << "realloc for tmp error" << endl; return -1; } requestBuf = tmp; if( *nPSock != -1 ){ sock = *nPSock; cout << "using privous socket" << *nPSock << endl; }else{ cout << "1.get a new one" << endl; sock = MakeSocket(host); if(sock == -1) { free(url); free(requestBuf); cout << "1.not able to MakeSocket" << endl; return -1; } if(sock == -2) { free(url); free(requestBuf); cout << "2.not able to MakeSocket" << endl; return -1; } } //cout << "requestBuf is " << requestBuf << endl; if(write(sock, requestBuf, strlen(requestBuf)) == -1){ cout << "write error" << endl; close(sock); *nPSock = -1; cout << "2.close previous socket " << *nPSock << " and get a new one" << endl; //maybe sock is dead,try again sock = MakeSocket(host); if(sock == -1) { free(url); free(requestBuf); cout << "3.not able to MakeSocket" << endl; return -1; } if(sock == -2) { free(url); free(requestBuf); cout << "4.not able to MakeSocket" << endl; return -1; } if(write(sock, requestBuf, strlen(requestBuf)) == -1){ close(sock); *nPSock = -1; free(url); free(requestBuf); cout << "write error" << endl; return -1; } } free(url); free(requestBuf); char headerBuf[HEADER_BUF_SIZE]; /* Grab enough of the response to get the metadata */ memset( headerBuf,0,HEADER_BUF_SIZE ); //cout << "old sock is " << sock << endl; ret = _http_read_header(sock, headerBuf); //cout << "ret = " << ret << endl; if(ret < 0) { close(sock); *nPSock = -1; cout << "_http_read() error " << endl; return -1; } //cout << headerBuf << endl; charIndex = strstr(headerBuf, "HTTP/"); if(charIndex == NULL){ close(sock); *nPSock = -1; cout << headerBuf << endl; cout << "strstr() error " << endl; return -1; } while(*charIndex != ' '){ charIndex++; } charIndex++; 
ret = sscanf(charIndex, "%i", &i); if(ret != 1){ close(sock); *nPSock = -1; cout << "sscanf() error" << endl; return -1; } #ifdef DEBUG // http return code cout <<"######Http return code: ######" << endl << i << endl; #endif // deal with http://net.cs.pku.edu.cn/~cnds if(i == 301 || i == 302){ char *loc; loc = (char*)malloc(URL_LEN); if(loc == NULL){ close(sock); *nPSock = -1; cout << "malloc error" << endl; return -1; } charIndex = strstr(headerBuf, "Location:"); if(charIndex != NULL){ //ret = sscanf(charIndex + strlen("Location: "), "%(URL_LEN-1)s",loc); ret = sscanf(charIndex + strlen("Location: "), "%255s",loc); if(ret != 1){ close(sock); *nPSock = -1; cout << headerBuf << endl; cout << "sscanf() error" << endl; return -1; } else{ *location = loc; close(sock); *nPSock = -1; //cout << "sscanf() else error" << endl; return -2; } } } if(i<200 || i>299 ){ close(sock); *nPSock = -1; cout << "ret code = " << i << " < 200 or > 299" << endl; return -1; } charIndex = strstr(headerBuf, "Content-Length:"); if(charIndex == NULL){ charIndex = strstr(headerBuf, "Content-length:"); } if(charIndex == NULL){ /* Allocate enough memory to hold the page */ //if(contentLength == -1){ contentLength = DEFAULT_PAGE_BUF_SIZE; //} }else{ ret = sscanf(charIndex + strlen("content-length: "), "%i", &contentLength); if(ret < 1){ close(sock); *nPSock = -1; cout << "sscanf() error" << endl; return -1; } } if(contentLength < 20){ contentLength = DEFAULT_PAGE_BUF_SIZE; } if(contentLength > MAX_PAGE_BUF_SIZE){ cout << "the page discarde due to its size " << contentLength << " is larger than " << MAX_PAGE_BUF_SIZE << endl; //close(sock); return -1; } #ifdef DEBUG // http content length cout <<"######Content length: ######" << endl << contentLength << endl; #endif pageBuf = (char *)malloc(contentLength); if(pageBuf == NULL){ close(sock); *nPSock = -1; cout << "malloc for pageBuf" << endl; return -1; } /* Begin reading the body of the file */ fd_set rfds; struct timeval tv; int flags; 
flags=fcntl(sock,F_GETFL,0); if(flags<0){ close(sock); *nPSock = -1; free(pageBuf); cout << "1.fcntl() error " << endl; return -1; } flags|=O_NONBLOCK; if(fcntl(sock,F_SETFL,flags)<0){ close(sock); *nPSock = -1; free(pageBuf); cout << "2.fcntl() error " << endl; return -1; } int pre_ret=0; while(ret > 0){ FD_ZERO(&rfds); FD_SET(sock, &rfds); if( bytesRead == contentLength ){ tv.tv_sec = 1; }else{ tv.tv_sec = timeout; } tv.tv_usec = 0;
快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -