// eigenvectorselect.cpp
/***************************************************************
* 工程: 自然語言處理綜合系統
* 作者: 李赟(liyun@nlu.caai.cn)
* 修改者 李衛
* 描述: 特征抽取的部分實現
* 主要函數:IDFEigentVector 、FindClassNames 、CHIEigentVector
G_UpdateKeyWords、G_UpdateIDVector、G_UpdateDocs
G_LoadKeyWords 等
* 版本: 1.0
* 修改: 遍歷文件夾下所有目錄內文件
* 參考文獻:
**************************************************************/
#include "EigenVectorSelect.h"

#include <io.h>
//-----------------------------------------------------------//
// Purpose: Document-frequency (DF) feature selection for a single class.
//          Reads the class's candidate feature list, keeps the kwdnum words
//          with the highest document counts and saves them as words.
//          (Despite the "IDF" in the name, ranking is by raw document count.)
// Parameters:
//   (in)  classname   class name; used to build both file names
//   (in)  kwdnum      maximum number of features to keep for this class
//   (in)  vecfiledir  directory holding the class files
//         input file : vecfiledir+"\\"+classname+".vec"
//                      first token is the total document count, followed by
//                      "word doccount" pairs
//   (out) allwords    global word table later used for numbering; every
//                     selected word is inserted with value 0 (entries that
//                     already exist are left untouched)
//         output file: vecfiledir+"\\df\\"+classname+".vei"
//                      first line is the total document count, then one
//                      "word doccount" line per selected word
// Returns: true on success; false if either file cannot be opened, or if
//          the total document count or the number of words read is zero.
// Notes:   Words with the same document count keep their input order.
//          The output file is opened (and therefore truncated) before the
//          input is validated, so a false return can leave an empty file.
// Usage:   global function
//-----------------------------------------------------------//
bool IDFEigentVector(const std::string &classname, std::map<std::string,unsigned int> &allwords,
                     unsigned int kwdnum, const std::string& vecfiledir){
    // Sorting table: key = document count (descending), value = the words
    // that occur in exactly that many documents.
    typedef std::map<unsigned int, std::vector<std::string>, std::greater<unsigned int> > DocCountIndex;

    // Candidate feature list before selection, format "word doccount".
    const std::string ifilename = vecfiledir+"\\"+classname+".vec";
    // Selected feature list, same format.
    const std::string ofilename = vecfiledir+"\\df\\"+classname+".vei";

    std::ifstream fin(ifilename.c_str());
    std::ofstream fout(ofilename.c_str());
    if(!fin || !fout){
        std::cout<<"vec/eivfile can not open "<<std::endl;
        return false;
    }

    // Initialised so that a failed header read is reported as "no documents"
    // instead of depending on an indeterminate value.
    unsigned int totaldocnum = 0;
    fin>>totaldocnum;

    // Bucket every word under its document count.
    DocCountIndex wordmap;
    unsigned int count = 0;
    std::string word;
    unsigned int docnum = 0;
    while(fin>>word>>docnum){
        ++count;
        wordmap[docnum].push_back(word);
    }
    fin.close();

    if(totaldocnum==0 || count==0){
        return false;
    }

    // First line of the output is the total document count.
    fout<<totaldocnum<<std::endl;

    // Emit the top kwdnum words. The budget stays unsigned so that very large
    // limits (e.g. UINT_MAX meaning "everything") are honoured, and both loops
    // stop as soon as the budget is used up.
    unsigned int remaining = kwdnum;
    for(DocCountIndex::const_iterator itc = wordmap.begin();
        itc != wordmap.end() && remaining > 0; ++itc){
        const std::vector<std::string>& bucket = itc->second;
        for(std::vector<std::string>::const_iterator itw = bucket.begin();
            itw != bucket.end() && remaining > 0; ++itw, --remaining){
            // Register the word in the table used to generate feature ids.
            allwords.insert(std::map<std::string,unsigned int>::value_type(*itw, 0));
            fout<<*itw<<" "<<itc->first<<std::endl;
        }
    }
    fout.close();
    return true;
}
//-----------------------------------------------------------//
// Purpose: Collect the training class names found in a directory.
//          Every class is expected to be stored as "classname.extension" in
//          one directory; the class name is the file name without extension.
// Parameters:
//   (in)  vecfiledir  directory to scan
//   (in)  extstr      file extension, without the leading dot
//         files searched: vecfiledir+"\\*."+extstr
//   (out) namevec     cleared first, then filled with the name (extension
//                     stripped) of every matching file
// Returns: number of matching files; 0 if the search could not be started
//          (e.g. no match or inaccessible path).
// Usage:   global function
//-----------------------------------------------------------//
unsigned short FindClassNames(std::set<std::string>& namevec,const std::string& vecfiledir,const std::string& extstr){
    namevec.clear();

    const std::string pattern = vecfiledir+"\\*."+extstr;
    const std::string suffix  = "."+extstr;

    struct _finddata_t TrainFile;
    // The CRT returns the search handle as intptr_t; keeping it in a long
    // truncates it on 64-bit Windows, where long is 32 bits wide.
    const intptr_t hFile = _findfirst(pattern.c_str(), &TrainFile);
    if(hFile == -1){
        return 0;
    }

    unsigned short Filecount = 0;
    do{
        const std::string filename = TrainFile.name;
        if(filename=="." || filename=="..") continue;

        // Skip names without a dot and names that start with their only dot
        // (those would yield an empty class name).
        const std::string::size_type dot = filename.rfind('.');
        if(dot==std::string::npos || dot==0) continue;

        // Re-check the extension explicitly instead of relying on the
        // wildcard match alone.
        if(filename.compare(dot, std::string::npos, suffix) != 0) continue;

        namevec.insert(filename.substr(0, dot));
        Filecount++;
    }while(_findnext(hFile, &TrainFile) == 0);

    _findclose(hFile);
    return Filecount;
}
//-----------------------------------------------------------//
// 功能: CHI特征抽取并以詞的形式保存特征文件(針對所有類別)
// 參數:
// (入口)const set<string>& namevec,當前類別名稱列表
// unsigned int kwdnum 該類別需要抽取的的特征數
// const string& vecfiledir 類別文件存放路徑
// 文件輸入vecfiledir+"\\"+(*itc)+".vec"
// (出口) map<string,unsigned int> &allwords,
// 用于編號的總詞表,執行該函數時更新
// 文件輸出vecfiledir+"\\chi\\"+evevector[k].classname+".vei";
// vecfiledir + "\\chi\\_all_words.lst";
// vecfiledir + "\\_all_version.lst";
// 返回: 正常true 錯誤false
// 主要思路:CHI特征抽取
// 調用方法:全局函數
// 日期: 2006年3月
//----------------------------------------------------------//
bool CHIEigentVector(const set<string>& namevec,map<string,unsigned int> &allwords,unsigned int kwdnum,const string& vecfiledir){
unsigned int allclassdocnum =0;
vector<EVenty> evevector;
//遍歷所有類別未抽取的特征列表文件,并記錄到vector<EVenty>中
//每個EVenty對應一個類別。內部以特征的音序升序排列
for(set<string>::const_iterator itc=namevec.begin();itc!=namevec.end();itc++){
string ifilename = vecfiledir+"\\"+(*itc)+".vec";
ifstream fin(ifilename.c_str());
if(!fin){
cout<<"vecfile can not open "<<endl;
continue;
}
evevector.push_back(EVenty());
vector<EVenty>::reverse_iterator iteve = evevector.rbegin();
iteve->classname = *itc;
fin>>iteve->totalword;
allclassdocnum += iteve->totalword;
pair<unsigned int,string> wordpairtmp;
unsigned int count =0;
while(fin>>wordpairtmp.second>>wordpairtmp.first){
count++;
iteve->wordspair.push_back(wordpairtmp);
}
cout<<"class:"<<iteve->classname<<"wordnum"<<count;
fin.close();
}
if(evevector.size() <= 1){
cout<<"not enough classes"<<endl;
return false;
}else{
cout<<"classes num "<<evevector.size()<<"totaldocnum"<<allclassdocnum<<endl;
//getchar();
}
unsigned int count =0;
string minword ="",minword_old="";
//根據CHI的要求生成ABCD4個值(chivalue[0-3])并計算chivalue的相關值
//由于計算牽涉到vector<EVenty>中多個EVenty(多個類別的有序的特征列表),
//這里采用了最小詞對齊方法,每次從多個EVenty的當前詞中選取一最小的計算
//下次去掉已經計算過的詞,再重復上面的步驟,直到所有EVenty都處理完
while(1){
for(unsigned int i=0;i<evevector.size();i++){
//該evevector已到結尾
if(evevector[i].isstop == true) continue;
//更新chivalue后已計算過的最小特征詞,開始下一輪選最小詞
if(minword_old ==evevector[i].wordspair[evevector[i].curindex].second) { //count the result
double chivalue =
evevector[i].chivalue[0] * evevector[i].chivalue[4]
- evevector[i].chivalue[2] * evevector[i].chivalue[3] ;
// do sth
if(chivalue >0 && evevector[i].chivalue[0] >3){
//分母
double chivalue2 =
(evevector[i].chivalue[0] + evevector[i].chivalue[2])
*(evevector[i].chivalue[1] + evevector[i].chivalue[3])
*(evevector[i].chivalue[0] + evevector[i].chivalue[1])
*(evevector[i].chivalue[2] + evevector[i].chivalue[3]) ;
if(chivalue2 == 0) chivalue2 = 1;
//參見CHI相關公式
double chivalue3 = allclassdocnum * chivalue * chivalue / chivalue2 ;
chivalue3 *= log((float)evevector[i].chivalue[0]);
//chivalue3 *= evevector[i].chivalue[0];
//if(chivalue3 >1e+13)
//cout<<minword_old<<" "
// <<evevector[i].classname<<evevector[i].chivalue[0]<<" "<<
// evevector[i].chivalue[1]<<" "<<evevector[i].chivalue[2]<<" "<<evevector[i].chivalue[3]<<" "
//<<chivalue<<" "<<chivalue2<<" "<<chivalue3<<endl;
//MAP排序表,形式為key權重 value對應的詞列表,按Key從大到小排序
map<double,vector<string>,greater<double> > ::iterator itmvd =
evevector[i].wordmap.find(chivalue3);
if(itmvd==evevector[i].wordmap.end()){
pair<double,vector<string> > tmpwordpair;
tmpwordpair.first = chivalue3;
tmpwordpair.second.push_back(minword_old);
evevector[i].wordmap.insert(tmpwordpair);
}else{
itmvd->second.push_back(minword_old);
}
}//end of if(chivalue >0 && evevector[i].chivalue[0] >3)...
//切換到下一個特征,開始下一輪選最小詞
if(evevector[i].curindex >=evevector[i].wordspair.size()-1){
evevector[i].isstop = true;
continue;
}else{
evevector[i].curindex ++;
}
}//end of minword_old ==evevector[i].w ....
//look for min word
//如果找到更小的最小詞,則更新
evevector[i].chivalue[0] =evevector[i].chivalue[1] =evevector[i].chivalue[2] =evevector[i].chivalue[3] =0;
if(minword == "" || minword > evevector[i].wordspair[evevector[i].curindex].second) {
minword = evevector[i].wordspair[evevector[i].curindex].second;
}
} //end of for
// no more words of all entrys
//while1的結束條件,沒有特征需要處理
if(minword_old == minword){
cout<<"search end ,total wordnum ="<<count<<endl;
break; //break from while 1
}else{
//清空最小詞,開始下一輪尋找
minword_old = minword;
minword = "";
count ++;
//cout<<minword_old<<endl;
//cout<<">";
}
for(unsigned int i2=0;i2<evevector.size();i2++){
unsigned int docinclass =0;
if(minword_old == evevector[i2].wordspair[evevector[i2].curindex].second)
docinclass = evevector[i2].wordspair[evevector[i2].curindex].first;
for(unsigned int j=0;j<evevector.size();j++){
evevector[j].chivalue[(i2 ==j)?0:1] += docinclass;
evevector[j].chivalue[(i2 ==j)?2:3] += (evevector[i2].totalword - docinclass);
}
}
}//end of while 1
//根據排序輸出文件結果并更新排序編號特征列表
for(unsigned int k=0;k<evevector.size();k++){
string ofilename = vecfiledir+"\\chi\\"+evevector[k].classname+".vei";
ofstream fout(ofilename.c_str());
if(!fout){
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -