// extract.cpp
// (Code-viewer page residue removed here: page title icon and "Font size:" label.)
//================================================================================
// CLASS : Extract
// FUNCTION: Extract the content of a web page, removing the HTML tags.
// AUTHOR : 2006-05-21 Created by navy
// REFER : Input file name (*.htm/html)
// NOTE : Extract.cpp , implementation file
//================================================================================
#include "StdAfx.h"
#include "Extract.h"
#include "windows.h"
// Default constructor.
// Seeds strUseful from the STR_USEFUL configuration value; per the original
// comment that value is "<TITLE>,<TR>,<TD>,<P>,<BR>,<DIV>".
Extract::Extract()
{
    this->strUseful = STR_USEFUL;
}
//--------------------------------------------------------------------------------
// FUNCTION: Process a single file.
// IN : strFileName - name of the file to process.
// OUT : .txt file (produced by funProcess).
// AUTHOR : 2006-05-24 Created by navy .
// NOTE : Forwards straight to funProcess(); this function does not itself set
//        strDirSource / strDirDestination, which funProcess() and funOutput()
//        prepend to the file names they open.
//--------------------------------------------------------------------------------
void Extract::funProcessFile(string strFileName)
{
    this->funProcess(strFileName);
}
//--------------------------------------------------------------------------------
// FUNCTION: Process every file in a directory whose extension is strExteName.
// IN : strDirSour  - source directory (a "\\" separator is appended here)
//      strDirDest  - destination directory
//      strExteName - extension to match, without the leading dot
// OUT : txt files (one funProcess() call per matching file).
// AUTHOR : 2006-05-24 Created by navy .
// NOTE : When the destination directory is empty it defaults to the source
//        directory. Files of 3 MB or more are reported on stdout and skipped.
//        Stores the two directories (with trailing separator) in the members
//        strDirSource / strDirDestination.
//--------------------------------------------------------------------------------
void Extract::funProcessDirectory(string strDirSour,string strDirDest,string strExteName)
{
    // Files whose size is not below this limit are skipped.
    const int kMaxFileSize = 3*1024*1024;

    if (strDirDest.empty())
    {
        strDirDest = strDirSour;
    }
    strDirSource      = strDirSour + "\\";
    strDirDestination = strDirDest + "\\";

    // strDirSource already ends with a separator, so only the wildcard is
    // appended (the original produced a doubled "\\\\" before "*.").
    const string strPattern = strDirSource + "*." + strExteName;

    // _findfirst returns intptr_t; keeping the handle in a long truncates it
    // on 64-bit Windows, where long is 32 bits wide.
    _finddata_t fdata;
    const intptr_t h = _findfirst(strPattern.c_str(), &fdata);
    if (h == -1)
    {
        // No matching files.
        return;
    }

    int nCount = 0;   // running number of files actually processed
    do
    {
        if (fdata.size < kMaxFileSize)
        {
            ++nCount;
            cout << nCount << " : " << "processing " << fdata.name << " ";
            funProcess(fdata.name);
        }
        else
        {
            cout << "********************************************************" << endl;
            cout << "* size over 3M !" << endl;
            cout << "* file name : " << fdata.name << " did not process . " << endl;
            cout << "********************************************************" << endl;
        }
    } while (_findnext(h, &fdata) == 0);

    _findclose(h);
}
//--------------------------------------------------------------------------------
// FUNCTION: Open a web page file, remove the tags and extract the content.
// IN : filename - web page file (*.htm/html); opened as strDirSource + filename.
// OUT : txt file, written by funOutput() under the name funDestFilename() yields.
// AUTHOR : 2006-05-22 Created by navy .
// NOTE : Prints "." progress marks to stdout while working. Returns early,
//        after printing a message, when the input file cannot be opened.
//--------------------------------------------------------------------------------
void Extract::funProcess(string filename)
{
    ifstream fin((strDirSource + filename).c_str());

    // Buffer that funDestFilename() fills with the output file name. It is owned
    // by a string so it is released on every exit path (the original leaked a
    // new char[100]), and it grows with the input name instead of being fixed at
    // 100 bytes. NOTE(review): assumes funDestFilename writes a NUL-terminated
    // name derived from the input name - confirm against its definition.
    string sResultFile(filename.size() + 100, '\0');
    funDestFilename(filename.c_str(), &sResultFile[0]);

    cout << "." ;                                   // progress mark
    if (!fin)
    {
        cout<<"Can't open the file !"<<endl ;
        return ;
    }

    string strLine, strResult;
    int i = 1 , j = 1 ;                             // line counter / count of 10000-line batches
    while ( getline(fin,strLine) )
    {
        if (i%3000 == 0) cout << "." ;
        if (i++ == 10000)
        {
            cout <<" " << j++ << "W";
            i = 1 ;
        }
        funConvCapital(strLine);
        //funPreProcess(strLine);

        // Drop trailing CR ('\r') / LF ('\n') characters. The length is checked
        // first: an empty line must not be indexed at size()-1.
        string::size_type nKeep = strLine.size();
        while (nKeep > 0 && (strLine[nKeep-1] == '\r' || strLine[nKeep-1] == '\n'))
        {
            --nKeep;
        }
        strLine.erase(nKeep);

        // All lines are joined into one string with no separator.
        strResult += strLine;
    }
    fin.close();

    cout << "." ;                                   // progress mark
    // The descriptions of the helper calls below follow the original comments;
    // the helpers themselves are defined elsewhere.
    funDelete2(strResult,"SCRIPT",0);               // remove what is between <SCRIPT></SCRIPT>
    funDelete2(strResult,"STYLE",0);                // remove what is between <STYLE></STYLE>
    funDelete2(strResult,"XML",0);                  // remove what is between <XML></XML>
    if ( STR_A ) funDelete2(strResult,"A",0);
    funDelete(strResult,"<A");                      // turn <A ...> into <A>
    funDelete(strResult,"<P");                      // turn <P ...> into <P>
    funDelete(strResult,"<TD");
    if ( STR_REPLACE_VERTICAL ) funReplace(strResult,"|"," ");
    if ( STR_SPACE ) funReplace(strResult," ","");  // optionally delete every space
    funDelUselessLink(strResult);                   // delete useless links
    cout << "." ;                                   // progress mark
    funKeepListLabel(strResult,0);                  // 0 = start searching at the first character
    cout << "." ;                                   // progress mark
    // The two literals below are written with an explicit "\t" escape and an
    // explicit two-space string so they match their documented purpose.
    funReplace(strResult,"\t"," ");                 // replace TAB with a space
    funReplace(strResult,"  "," ");                 // keep at most one consecutive space
    cout << "." ;                                   // progress mark
    funResult(strResult);
    cout << "." ;                                   // progress mark
    funOutput(&sResultFile[0]);
}
//--------------------------------------------------------------------------------
// FUNCTION: Store the processing result in the vector vecResult.
// IN : strResult - processed page text; it is split at every "<BR".
// OUT : vecResult - cleared, then filled with the non-empty trimmed segments.
// AUTHOR : 2006-05-22 Created by navy .
// NOTE : Exactly three characters ("<BR") are skipped at each split point, so
//        whatever follows them in the text starts the next segment.
//--------------------------------------------------------------------------------
void Extract::funResult(string strResult)
{
    // Per the original comment: replaces the strings that start with '&' and
    // end with ';'.
    funDelBegAnd(strResult,0);
    vecResult.clear();

    // Walk the text with a moving start offset. The search result keeps its
    // native size_type and is compared with npos (the original narrowed it to
    // int and tested for -1), and the remaining text is no longer re-copied
    // after every separator.
    const string::size_type nSepLen = 3;            // length of "<BR"
    string::size_type nStart = 0;
    string::size_type nPos = strResult.find("<BR", nStart);
    while (nPos != string::npos)
    {
        string strTmp = strResult.substr(nStart, nPos - nStart);
        funDelSideSpace(strTmp);                    // strip leading/trailing spaces
        if ( !strTmp.empty() )
        {
            vecResult.push_back(strTmp);
        }
        nStart = nPos + nSepLen;
        nPos = strResult.find("<BR", nStart);
    }

    // Text after the last separator gets the same trim-and-skip-if-empty
    // treatment as every other segment.
    string strTail = strResult.substr(nStart);
    funDelSideSpace(strTail);
    if ( !strTail.empty() )
    {
        vecResult.push_back(strTail);
    }
}
//--------------------------------------------------------------------------------
// FUNCTION: 將vecResult輸出到filename.txt中。
// IN : filename
// OUT : 將結(jié)果保存在文件中。
// AUTHOR : 2006-05-22 Created by navy .
// NOTE :
//--------------------------------------------------------------------------------
void Extract::funOutput(char* filename)
{
ofstream fout;
int i,n;
string strTmp,strTmpOld;
fout.open((strDirDestination + filename).c_str());
n=vecResult.size();
for ( i=0;i<n;i++ )
{
strTmpOld = strTmp ;
strTmp = vecResult[i] ; //cout << strTmp <<endl;
funDelSideSpace(strTmp); //cout << strTmp <<endl;
if((i > 0) && (strTmp == strTmpOld)) continue;
if ( strTmp == "·" ) continue;
//fout << "" << strTmp << endl;
//fout << " " << strTmp << endl; //加空格,好看一些。
fout << strTmp << endl; //不加空格
}
cout << " Finish !" << endl;
fout.close();
}
//--------------------------------------------------------------------------------
// FUNCTION: Convert selected lowercase tag names to uppercase.
// IN : strLine - one line of the page; modified in place.
// OUT :
// AUTHOR : 2006-05-27 Created by navy .
// NOTE : Each group is switched on or off by its STR_*_CAPITAL setting.
//        "script" and "style" are always replaced. The calls run in a fixed
//        order, and the optional space removal happens before the final two.
//--------------------------------------------------------------------------------
void Extract::funConvCapital(string& strLine)
{
    // Prefixes "<ti" / "</ti".
    if (STR_TI_CAPITAL)
    {
        funReplace(strLine, "<ti",  "<TI");
        funReplace(strLine, "</ti", "</TI");
    }

    // Prefixes "<tr" / "</tr".
    if (STR_TR_CAPITAL)
    {
        funReplace(strLine, "<tr",  "<TR");
        funReplace(strLine, "</tr", "</TR");
    }

    // Prefixes "<td" / "</td".
    if (STR_TD_CAPITAL)
    {
        funReplace(strLine, "<td",  "<TD");
        funReplace(strLine, "</td", "</TD");
    }

    // Opening prefix "<p" only.
    if (STR_P_CAPITAL)
        funReplace(strLine, "<p", "<P");

    // Opening prefix "<br" only.
    if (STR_BR_CAPITAL)
        funReplace(strLine, "<br", "<BR");

    // Prefixes "<di" / "</di".
    if (STR_DI_CAPITAL)
    {
        funReplace(strLine, "<di",  "<DI");
        funReplace(strLine, "</di", "</DI");
    }

    // The bare word "xml", wherever it occurs in the line.
    if (STR_XML_CAPITAL)
        funReplace(strLine, "xml", "XML");

    // Opening prefix "<a" and the complete closing tag "</a>".
    if (STR_A_CAPITAL)
    {
        funReplace(strLine, "<a",   "<A");
        funReplace(strLine, "</a>", "</A>");
    }

    // Optional removal of every space in the line.
    if (STR_SPACE)
        funReplace(strLine, " ", "");

    // Unconditional: the bare words "script" and "style".
    funReplace(strLine, "script", "SCRIPT");
    funReplace(strLine, "style",  "STYLE");
}
//--------------------------------------------------------------------------------
// FUNCTION: Pre-process strLine: replace full-width spaces with ordinary spaces.
// IN : strLine - one line, in the system ANSI code page (CP_ACP).
// OUT : The result is stored back into strLine.
// AUTHOR : 2006-05-21 Created by navy .
// NOTE : The line is converted to wide characters so the replacement works on
//        whole characters, then converted back. If either conversion fails,
//        strLine is left unchanged.
//--------------------------------------------------------------------------------
void Extract::funPreProcess(string& strLine)
{
    if (strLine.empty())
    {
        return;                                     // nothing to convert
    }
    const int nSrcLen = static_cast<int>(strLine.size());

    // Ask for the required length first, then convert into a buffer of exactly
    // that size. The original used a fixed wchar_t[1024]: longer lines made the
    // conversion fail (wiping the line), and a result of exactly 1024 characters
    // wrote the terminator one element past the end of the array.
    const int nWide = MultiByteToWideChar(CP_ACP, 0, strLine.c_str(), nSrcLen, NULL, 0);
    if (nWide <= 0)
    {
        return;
    }
    wstring wss(nWide, L'\0');
    if (MultiByteToWideChar(CP_ACP, 0, strLine.c_str(), nSrcLen, &wss[0], nWide) <= 0)
    {
        return;
    }

    // Replace the full-width (ideographic) space U+3000 with an ordinary space.
    // It is written as an escape so the literal survives any file re-encoding.
    funReplace(wss, L"\u3000", L" ");
    if (wss.empty())
    {
        strLine.clear();
        return;
    }

    // Same two-step pattern for the conversion back to the ANSI code page.
    const int nWideLen = static_cast<int>(wss.size());
    const int nBytes = WideCharToMultiByte(CP_ACP, 0, wss.c_str(), nWideLen, NULL, 0, NULL, NULL);
    if (nBytes <= 0)
    {
        return;
    }
    string strOut(nBytes, '\0');
    if (WideCharToMultiByte(CP_ACP, 0, wss.c_str(), nWideLen, &strOut[0], nBytes, NULL, NULL) <= 0)
    {
        return;
    }
    strLine.swap(strOut);
}
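// ---------------------------------------------------------------------------
// The text below is residue from the web code viewer this listing was copied
// from (its keyboard-shortcut help); it is not part of Extract.cpp.
//   Copy code            : Ctrl + C
//   Search code          : Ctrl + F
//   Full-screen mode     : F11
//   Switch theme         : Ctrl + Shift + D
//   Show shortcuts       : ?
//   Increase font size   : Ctrl + =
//   Decrease font size   : Ctrl + -
// ---------------------------------------------------------------------------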