// extract.cpp
#include "StdAfx.h"
#include "Extract.h"
#include "windows.h"
#include ".\extract.h"
// Constructor: copies STR_USEFUL (defined outside this file) into strUseful,
// the list that funKeepListLabel searches to decide which tags leave a marker.
Extract::Extract()
{
strUseful = STR_USEFUL ; // presumably "<TITLE>,<TR>,<TD>,<P>,<BR>,<DIV>" - confirm against the macro definition
}
//--------------------------------------------------------------------------------
// FUNCTION: Strip the markup from an HTML document and keep its text content.
// IN      : filestring - the whole document (contents of a *.htm/html file).
// OUT     : filestring - replaced by the extracted text. It is left untouched
//           if the final clean-up pass returns no buffer.
// NOTE    : The steps are order dependent; each one works on the output of the
//           previous one.
//--------------------------------------------------------------------------------
void Extract::funProcess(string &filestring)
{
    string strResult = filestring;

    funConvCapital(strResult);                  // upper-case the tag names matched below
    funDelete2(strResult,"SCRIPT",0);           // drop <SCRIPT>...</SCRIPT> including content
    funDelete2(strResult,"STYLE",0);            // drop <STYLE>...</STYLE> including content
    funDelete2(strResult,"XML",0);              // drop <XML>...</XML> including content
    if ( STR_A ) funDelete2(strResult,"A",0);   // optionally drop links including their text
    funDelete(strResult,"<A");                  // <A ...>  becomes <A>
    funDelete(strResult,"<P");                  // <P ...>  becomes <P>
    funDelete(strResult,"<TD");                 // <TD ...> becomes <TD>
    if ( STR_REPLACE_VERTICAL ) funReplace(strResult,"|"," ");
    // Disabled in the original: if ( STR_SPACE ) funReplace(strResult," ","");  (delete every space)
    funDelUselessLink(strResult);               // drop table rows that contain nothing but links
    funKeepListLabel(strResult,0);              // remove the remaining tags, leaving markers

    // The two literals are written with an escape / an explicit pair of spaces
    // so that whitespace normalisation of this file cannot turn them into a
    // no-op " " -> " " replacement.
    funReplace(strResult,"\t"," ");             // TAB -> space
    funReplace(strResult,"  "," ");             // collapse runs of spaces to a single space
    funDelBegAnd(strResult,0);                  // decode &name; entities

    // funReplaceReturnAndEnter hands back a buffer allocated with new[];
    // copy it into the string and release it here so it is not leaked.
    char* szCleaned = funReplaceReturnAndEnter(strResult.c_str());
    if (szCleaned != NULL)
    {
        filestring = szCleaned;
        delete[] szCleaned;
    }
}
//--------------------------------------------------------------------------------
// FUNCTION: Upper-case the tag prefixes the later passes look for.
// IN      : strLine - text to normalise.
// OUT     : strLine - rewritten in place.
// NOTE    : Each group is applied only when its STR_*_CAPITAL / STR_SPACE switch
//           is set; "script" and "style" are always upper-cased. The rules run
//           in the listed order, and the substitutions are plain substring
//           replacements (no tag-boundary check).
//--------------------------------------------------------------------------------
void Extract::funConvCapital(string& strLine)
{
    // One substitution: when `enabled`, every `from` is replaced by `to`.
    struct CaseRule
    {
        bool        enabled;
        const char* from;
        const char* to;
    };

    const CaseRule rules[] =
    {
        { (STR_TI_CAPITAL)  ? true : false, "<ti",    "<TI"    },
        { (STR_TI_CAPITAL)  ? true : false, "</ti",   "</TI"   },
        { (STR_TR_CAPITAL)  ? true : false, "<tr",    "<TR"    },
        { (STR_TR_CAPITAL)  ? true : false, "</tr",   "</TR"   },
        { (STR_TD_CAPITAL)  ? true : false, "<td",    "<TD"    },
        { (STR_TD_CAPITAL)  ? true : false, "</td",   "</TD"   },
        { (STR_P_CAPITAL)   ? true : false, "<p",     "<P"     },
        { (STR_BR_CAPITAL)  ? true : false, "<br",    "<BR"    },
        { (STR_DI_CAPITAL)  ? true : false, "<di",    "<DI"    },
        { (STR_DI_CAPITAL)  ? true : false, "</di",   "</DI"   },
        { (STR_XML_CAPITAL) ? true : false, "xml",    "XML"    },
        { (STR_A_CAPITAL)   ? true : false, "<a",     "<A"     },
        { (STR_A_CAPITAL)   ? true : false, "</a>",   "</A>"   },
        { (STR_SPACE)       ? true : false, " ",      ""       },
        { true,                             "script", "SCRIPT" },
        { true,                             "style",  "STYLE"  }
    };
    const size_t nRules = sizeof(rules) / sizeof(rules[0]);

    for (size_t k = 0; k < nRules; ++k)
    {
        if (rules[k].enabled)
        {
            funReplace(strLine, rules[k].from, rules[k].to);
        }
    }
}
//--------------------------------------------------------------------------------
// FUNCTION: Pre-process one line: replace full-width (ideographic, U+3000)
//           spaces with ordinary spaces.
// IN      : strLine - text in the system ANSI code page (CP_ACP).
// OUT     : strLine - rewritten in place; left unchanged if either code-page
//           conversion fails.
// NOTE    : The text is converted to UTF-16 so the full-width space can be
//           matched as one character, then converted back. Buffers are sized
//           from the API's own length query, so input of any length is handled.
//--------------------------------------------------------------------------------
void Extract::funPreProcess(string& strLine)
{
    if (strLine.empty()) return;

    const int nSrcLen = static_cast<int>(strLine.size());

    // A zero-sized output buffer makes the API return the required length.
    const int nWideLen = MultiByteToWideChar(CP_ACP, 0, strLine.c_str(), nSrcLen, NULL, 0);
    if (nWideLen <= 0) return;

    wstring wss(nWideLen, L'\0');
    if (MultiByteToWideChar(CP_ACP, 0, strLine.c_str(), nSrcLen, &wss[0], nWideLen) != nWideLen)
    {
        return;
    }

    funReplace(wss, L"\x3000", L" ");   // full-width space -> ASCII space

    const int nWssLen = static_cast<int>(wss.size());
    const int nNarrowLen = WideCharToMultiByte(CP_ACP, 0, wss.c_str(), nWssLen, NULL, 0, NULL, NULL);
    if (nNarrowLen <= 0) return;

    string strNarrow(nNarrowLen, '\0');
    if (WideCharToMultiByte(CP_ACP, 0, wss.c_str(), nWssLen, &strNarrow[0], nNarrowLen, NULL, NULL) != nNarrowLen)
    {
        return;
    }

    strLine.swap(strNarrow);
}
//--------------------------------------------------------------------------------
// FUNCTION: Strip the attributes from every tag that starts with `separator`.
// IN      : strResult - text to process; separator - tag prefix, e.g. "<TD".
// OUT     : strResult - rewritten in place.
// CALL    : funDelete(strResult,"<TD");
// NOTE    : Eg: <TD style="FONT-SIZE: 24pt" width="100%">  becomes  <TD>
//           The tag ends at the first '>' where the number of '<' and '>' seen
//           since the start of the tag are equal, so nested brackets such as
//           <TD <BR>> are consumed as one tag. Processing stops at the first
//           tag that is never balanced; the text from there on is kept as is.
//           Matching is a plain prefix match (no tag-boundary check).
//           Single forward pass: no recursion and no per-tag copy of the tail.
//--------------------------------------------------------------------------------
void Extract::funDelete(string& strResult,string separator)
{
    string strOut;                          // rewritten text built so far
    string::size_type nCopyFrom = 0;        // first input character not yet emitted

    for (;;)
    {
        const string::size_type nOpen = strResult.find(separator, nCopyFrom);
        if (nOpen == string::npos) break;

        // Look for the '>' that balances the brackets opened since nOpen.
        int nLt = 0;
        int nGt = 0;
        string::size_type nClose = string::npos;
        for (string::size_type k = nOpen; k < strResult.size(); ++k)
        {
            const char ch = strResult[k];
            if (ch == '<')
            {
                ++nLt;
            }
            else if (ch == '>')
            {
                ++nGt;
                if (nLt == nGt)
                {
                    nClose = k;
                    break;
                }
            }
        }
        if (nClose == string::npos) break;  // unterminated tag: leave the rest alone

        strOut.append(strResult, nCopyFrom, nOpen - nCopyFrom);
        strOut += separator;
        strOut += ">";
        nCopyFrom = nClose + 1;
    }

    if (nCopyFrom == 0) return;             // nothing was rewritten

    strOut.append(strResult, nCopyFrom, string::npos);
    strResult.swap(strOut);
}
//--------------------------------------------------------------------------------
// FUNCTION: Count the occurrences of `symbol` that start at an index in [i, j].
// IN      : strResult - text to scan; i, j - inclusive index range;
//           symbol - substring to count (overlapping matches are counted).
// OUT     : number of matches.
// CALL    : int n = funCountSymbol(strResult,4,19,"<");   gives n=4 for
//           <TD><A<B<BR>>BC</B>>DD</A></TD>
//           0123456789012345678901234567890
//                     1         2         3
//--------------------------------------------------------------------------------
int Extract::funCountSymbol(string strResult,int i,int j,string symbol)
{
    int nHits = 0;
    for (string::size_type nAt = strResult.find(symbol, i);
         nAt != string::npos && static_cast<int>(nAt) <= j;
         nAt = strResult.find(symbol, nAt + 1))
    {
        ++nHits;
    }
    return nHits;
}
//--------------------------------------------------------------------------------
// FUNCTION: Remove every <separator ...> ... </separator> element, content
//           included.
// IN      : strResult - text to process; separator - tag name, e.g. "SCRIPT";
//           nPos - index at which the search for the first element starts
//           (a negative value means nothing is removed).
// OUT     : strResult - rewritten in place.
// NOTE    : Eg: abc<SCRIPT language=JavaScript>...</SCRIPT>abc  becomes  abcabc
//           Nested elements are removed with their outermost element:
//               abc<SCRIPT><SCRIPT>...</SCRIPT>...</SCRIPT>abc  becomes  abcabc
//           An element ends at the first closing tag where the number of
//           opening and closing tags seen since its start are equal. Closing
//           tags are counted only up to the candidate itself, so adjacent
//           closers ("</S></S>") are paired correctly and a surplus closer can
//           no longer make the scan cycle forever.
//           Processing stops at the first element that is never closed.
//           The opening tag is matched as the prefix "<" + separator, without
//           a tag-boundary check (so "<A" also matches e.g. "<ADDRESS").
//--------------------------------------------------------------------------------
void Extract::funDelete2(string& strResult,string separator,int nPos)
{
    if (nPos < 0) return;

    const string sepa1 = "<" + separator;           // opening-tag prefix
    const string sepa2 = "</" + separator + ">";    // closing tag

    string strOut;                                  // kept text built so far
    string::size_type nCopyFrom   = 0;              // first input character not yet emitted
    string::size_type nSearchFrom = static_cast<string::size_type>(nPos);

    for (;;)
    {
        const string::size_type nOpen = strResult.find(sepa1, nSearchFrom);
        if (nOpen == string::npos) break;

        // Walk the closing tags after nOpen; for each candidate count the
        // opening tags that start at or before it.
        string::size_type nNextOpen = nOpen;
        string::size_type nClose    = strResult.find(sepa2, nOpen);
        int nOpeners = 0;
        int nClosers = 0;
        while (nClose != string::npos)
        {
            while (nNextOpen != string::npos && nNextOpen <= nClose)
            {
                ++nOpeners;
                nNextOpen = strResult.find(sepa1, nNextOpen + 1);
            }
            ++nClosers;
            if (nOpeners == nClosers) break;        // balanced: this closer ends the element
            nClose = strResult.find(sepa2, nClose + 1);
        }
        if (nClose == string::npos) break;          // never closed: leave the rest alone

        strOut.append(strResult, nCopyFrom, nOpen - nCopyFrom);
        nCopyFrom   = nClose + sepa2.length();
        nSearchFrom = nCopyFrom;
    }

    if (nCopyFrom == 0) return;                     // nothing was removed

    strOut.append(strResult, nCopyFrom, string::npos);
    strResult.swap(strOut);
}
//--------------------------------------------------------------------------------
// FUNCTION: Remove every tag, leaving a marker where a "useful" tag stood.
// IN      : strResult - text to process; nPosOld - unused, kept so existing
//           callers compile (the scan always starts at the first '<').
// OUT     : strResult - rewritten in place.
// NOTE    : A tag is "useful" when its first three characters occur in
//           strUseful. A useful "<TD" is replaced by one space, any other
//           useful tag by the marker "<BR" (no closing '>'); all other tags
//           disappear. Eg, for a tag that is not listed: a<B>b</B> becomes ab.
//           A tag ends at the first '>' that balances the '<' seen since its
//           start; if the brackets never balance, the first '>' after the
//           tag start is used instead (e.g. ...</</TITLE>).
//           The text before and after each removed tag is trimmed of leading
//           and trailing spaces before it is joined again.
//--------------------------------------------------------------------------------
void Extract::funKeepListLabel(string& strResult,int nPosOld)
{
    (void)nPosOld;

    string::size_type nOpen  = strResult.find("<");
    string::size_type nClose = (nOpen == string::npos) ? string::npos
                                                       : strResult.find(">", nOpen);

    while (nOpen != string::npos && nClose != string::npos)
    {
        const string sepa    = strResult.substr(nOpen, 3);
        const bool   bUseful = (strUseful.find(sepa) != string::npos);

        // Find the '>' that balances the brackets opened since nOpen.
        const string::size_type nFirstClose = nClose;
        string::size_type nMatch = string::npos;
        int nLt = 0;
        int nGt = 0;
        for (string::size_type k = nOpen; k < strResult.size(); ++k)
        {
            const char ch = strResult[k];
            if (ch == '<')
            {
                ++nLt;
            }
            else if (ch == '>')
            {
                ++nGt;
                if (nLt == nGt)
                {
                    nMatch = k;
                    break;
                }
            }
        }
        nClose = (nMatch != string::npos) ? nMatch : nFirstClose;

        string strTemp1 = strResult.substr(0, nOpen);
        string strTemp2 = strResult.substr(nClose + 1);
        funDelSideSpace(strTemp1);
        funDelSideSpace(strTemp2);

        if (bUseful)
        {
            if (sepa == "<TD") strTemp1 += " ";
            else               strTemp1 += "<BR";
        }

        // Resume after the text already handled, so inserted "<BR" markers
        // are never parsed as tags themselves.
        const string::size_type nResume = strTemp1.length();
        strResult = strTemp1 + strTemp2;

        // The closing bracket must be searched from the opening bracket:
        // a stray '>' in plain text before the next tag would otherwise be
        // paired with it and duplicate the text in between.
        nOpen  = strResult.find("<", nResume);
        nClose = (nOpen == string::npos) ? string::npos
                                         : strResult.find(">", nOpen);
    }
}
//--------------------------------------------------------------------------------
// FUNCTION: Build the output file name by replacing the extension with ".txt".
// IN      : filename    - source file name or path (e.g. *.htm / *.html).
// OUT     : sResultFile - receives the name with its extension replaced by
//           ".txt"; ".txt" is appended when the last path component has no
//           extension. The buffer must hold strlen(filename) + 5 bytes.
// NOTE    : Does nothing if either pointer is NULL. Only the last path
//           component ('/', '\\' and ':' are treated as separators) is
//           searched for the extension dot. Never writes past the terminating
//           NUL of the result.
//--------------------------------------------------------------------------------
void Extract::funDestFilename(const char* filename,char* sResultFile)
{
    if (filename == NULL || sResultFile == NULL) return;

    const size_t nLen = strlen(filename);
    size_t nStem = nLen;                    // number of leading characters to keep

    // Scan backwards for the extension dot within the last path component.
    for (size_t k = nLen; k > 0; --k)
    {
        const char ch = filename[k - 1];
        if (ch == '/' || ch == '\\' || ch == ':') break;
        if (ch == '.')
        {
            nStem = k - 1;
            break;
        }
    }

    memmove(sResultFile, filename, nStem);  // memmove: tolerate in-place use
    memcpy(sResultFile + nStem, ".txt", 5); // 4 characters plus the terminating NUL
}
//--------------------------------------------------------------------------------
// FUNCTION: Replace every occurrence of strOld in strLine with strNew.
// IN      : strLine - text to process; strOld - text to find; strNew - replacement.
// OUT     : strLine - rewritten in place.
// NOTE    : Matches that only come into existence through an earlier
//           replacement are replaced as well, so e.g. replacing two spaces by
//           one collapses a run of any length to a single space.
//           An empty strOld is a no-op. When strNew itself contains strOld the
//           inserted text is skipped instead of being rescanned, because
//           rescanning it could never finish.
//--------------------------------------------------------------------------------
void Extract::funReplace(string& strLine,string strOld,string strNew)
{
    if (strOld.empty()) return;

    const string::size_type nOldLen = strOld.length();
    const bool bNewContainsOld = (strNew.find(strOld) != string::npos);

    string::size_type nPos = strLine.find(strOld);
    while (nPos != string::npos)
    {
        strLine.replace(nPos, nOldLen, strNew);

        if (bNewContainsOld)
        {
            nPos += strNew.length();
        }
        else
        {
            // A new match can only overlap the text just inserted, so it
            // starts at most nOldLen-1 characters before it; nothing earlier
            // needs to be searched again.
            nPos = (nPos + 1 > nOldLen) ? nPos + 1 - nOldLen : 0;
        }
        nPos = strLine.find(strOld, nPos);
    }
}
//--------------------------------------------------------------------------------
// FUNCTION: Replace every occurrence of strOld in strLine with strNew.
//           Overload for wide-character strings.
// IN      : strLine - text to process; strOld - text to find; strNew - replacement.
// OUT     : strLine - rewritten in place.
// NOTE    : Matches that only come into existence through an earlier
//           replacement are replaced as well.
//           An empty strOld is a no-op. When strNew itself contains strOld the
//           inserted text is skipped instead of being rescanned, because
//           rescanning it could never finish.
//--------------------------------------------------------------------------------
void Extract::funReplace(wstring& strLine,wstring strOld,wstring strNew)
{
    if (strOld.empty()) return;

    const wstring::size_type nOldLen = strOld.length();
    const bool bNewContainsOld = (strNew.find(strOld) != wstring::npos);

    wstring::size_type nPos = strLine.find(strOld);
    while (nPos != wstring::npos)
    {
        strLine.replace(nPos, nOldLen, strNew);

        if (bNewContainsOld)
        {
            nPos += strNew.length();
        }
        else
        {
            // A new match can only overlap the text just inserted, so it
            // starts at most nOldLen-1 characters before it; nothing earlier
            // needs to be searched again.
            nPos = (nPos + 1 > nOldLen) ? nPos + 1 - nOldLen : 0;
        }
        nPos = strLine.find(strOld, nPos);
    }
}
//--------------------------------------------------------------------------------
// FUNCTION: Trim the space characters from both ends of strLine.
// IN      : strLine - text to trim.
// OUT     : strLine - trimmed in place; empty if it held nothing but spaces.
// NOTE    : Only ' ' is trimmed, not tabs or line breaks.
//--------------------------------------------------------------------------------
void Extract::funDelSideSpace(string& strLine)
{
    const string::size_type nFirst = strLine.find_first_not_of(' ');
    if (nFirst == string::npos)
    {
        // Empty, or spaces only.
        strLine.erase();
        return;
    }

    const string::size_type nLast = strLine.find_last_not_of(' ');
    strLine.erase(nLast + 1);   // cut the tail first so nFirst stays valid
    strLine.erase(0, nFirst);
}
//--------------------------------------------------------------------------------
// FUNCTION: Decode the known "&name;" entities.
// IN      : strResult - text to process; nPos - index at which the scan starts
//           (a negative value means nothing is decoded).
// OUT     : strResult - rewritten in place.
// NOTE    : Mappings: nbsp -> space, lt -> "<", gt -> ">", amp -> "&",
//           quot -> double quote, copy -> "(C)", reg -> "?", trade -> "TM",
//           #8226 and #149 -> middle dot.
//           Any other "&...;" sequence is left untouched.
//           Decoded text is not scanned again, so "&amp;lt;" yields "&lt;".
//           An '&' with no ';' after it ends the scan; nothing is decoded from
//           an unterminated entity.
//           NOTE(review): "reg" maps to "?" - possibly a registered-trademark
//           sign lost in an encoding conversion; confirm before changing.
//--------------------------------------------------------------------------------
void Extract::funDelBegAnd(string& strResult,int nPos)    // start at index nPos
{
    static const struct
    {
        const char* szName;     // entity name without '&' and ';'
        const char* szText;     // replacement text
    }
    kEntities[] =
    {
        { "nbsp",  " "   },
        { "lt",    "<"   },
        { "gt",    ">"   },
        { "amp",   "&"   },
        { "quot",  "\""  },
        { "copy",  "(C)" },
        { "reg",   "?"   },
        { "trade", "TM"  },
        { "#8226", "·"   },
        { "#149",  "·"   }
    };
    const size_t nEntities = sizeof(kEntities) / sizeof(kEntities[0]);

    if (nPos < 0) return;

    string::size_type nAmp = strResult.find("&", static_cast<string::size_type>(nPos));
    while (nAmp != string::npos)
    {
        const string::size_type nSemi = strResult.find(";", nAmp);
        if (nSemi == string::npos) break;   // no terminator left anywhere after this '&'

        // Compare the name in place; it may span a lot of text when the '&'
        // is an ordinary ampersand.
        const string::size_type nNameLen = nSemi - nAmp - 1;
        const char* szText = NULL;
        for (size_t k = 0; k < nEntities; ++k)
        {
            if (strResult.compare(nAmp + 1, nNameLen, kEntities[k].szName) == 0)
            {
                szText = kEntities[k].szText;
                break;
            }
        }

        if (szText == NULL)
        {
            nAmp = strResult.find("&", nAmp + 1);   // not an entity we know: try the next '&'
            continue;
        }

        const string strText(szText);
        strResult.replace(nAmp, nNameLen + 2, strText);
        nAmp = strResult.find("&", nAmp + strText.length());
    }
}
//--------------------------------------------------------------------------------
// FUNCTION: Remove table rows that contain nothing but links.
// IN      : strResult - text to process.
// OUT     : strResult - rewritten in place.
// NOTE    : Eg: a<TR><TD><A>bc</A></TD></TR><TR><TD>d</TD></TR>e  becomes
//               a<TR><TD>d</TD></TR>e
//           A row runs from "<TR" to the first "</TR>" after it. A copy of the
//           row is reduced (link elements removed, tags stripped, "<BR"
//           markers and spaces removed); if nothing is left the row is erased
//           from strResult, otherwise the row is kept unchanged.
//           The closing tag is always searched from the opening tag, so a
//           stray "</TR>" before the first row cannot corrupt the text.
//--------------------------------------------------------------------------------
void Extract::funDelUselessLink(string& strResult)
{
    const string::size_type nCloseLen = 5;          // length of "</TR>"

    string::size_type nOpen = strResult.find("<TR");
    while (nOpen != string::npos)
    {
        const string::size_type nClose = strResult.find("</TR>", nOpen);
        if (nClose == string::npos) break;

        const string::size_type nRowLen = nClose - nOpen + nCloseLen;

        // Reduce a copy of the row to its text outside of links.
        string strRow = strResult.substr(nOpen, nRowLen);
        funDelete2(strRow,"A",0);                   // drop <A>...</A> including the link text
        funKeepListLabel(strRow,0);
        funReplace(strRow,"<BR","");
        funReplace(strRow," ","");

        if (strRow.empty())
        {
            strResult.erase(nOpen, nRowLen);        // links only: remove the whole row
            nOpen = strResult.find("<TR", nOpen);
        }
        else
        {
            nOpen = strResult.find("<TR", nOpen + nRowLen);
        }
    }
}
// Destructor: empty body, no clean-up is performed here.
Extract::~Extract()
{
}
//--------------------------------------------------------------------------------
// FUNCTION: Final clean-up pass: copy the text while filtering characters.
// IN      : szReslut - NUL-terminated text to filter (must not be NULL).
// OUT     : newly allocated NUL-terminated buffer (new char[]); the caller
//           owns it and must release it with delete[]. NULL if the allocation
//           yields no buffer.
// NOTE    : Dropped      : LF (10), CR (13), space (0x20), every byte >= 0xA1,
//                          and the letters 'B' and 'R'.
//           Converted    : '<' becomes ',' once more than two characters have
//                          been written; before that it is dropped.
//           Copied as is : everything else.
//           NOTE(review): the 'B'/'R'/'<' rules presumably target the "<BR"
//           markers left by funKeepListLabel, but they also hit every other
//           'B' and 'R'; the >= 0xA1 rule discards bytes of double-byte
//           characters. Behaviour kept as found - confirm intent.
//--------------------------------------------------------------------------------
char * Extract::funReplaceReturnAndEnter(const char * szReslut)
{
    const size_t len = strlen(szReslut);

    char* out = new char[len + 1];
    if (out == NULL)
    {
        // Only reachable with a non-throwing operator new.
        return NULL;
    }

    size_t y = 0;                           // number of characters written
    for (size_t x = 0; x < len; ++x)
    {
        const unsigned char ch = (unsigned char) szReslut[x];

        const bool bDropped = (ch == 10 || ch == 13)
                           || (ch >= 0xa1)
                           || (ch == 0x20)
                           || (ch == 'B')
                           || (ch == 'R');
        if (bDropped) continue;

        if (ch == '<')
        {
            // Suppress the separator at the very start of the output.
            if (y > 2) out[y++] = ',';
            continue;
        }

        out[y++] = (char) ch;
    }
    out[y] = '\0';
    return out;
}
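// ---- Residue of the web code viewer this file was copied from; not program code. ----
// Keyboard shortcuts:
//   Copy code           Ctrl + C
//   Search code         Ctrl + F
//   Full-screen mode    F11
//   Switch theme        Ctrl + Shift + D
//   Show shortcuts      ?
//   Increase font size  Ctrl + =
//   Decrease font size  Ctrl + -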