// cnaivebayes.cpp
#include "CNaiveBayes.h"
#include "ssps.h"
#include <iostream>
#include <math.h>
#include <fstream>
#include <windows.h>
using namespace std;
// Declaration of Segment(), which is defined outside this file; nothing in the visible part of this file calls it.
// NOTE(review): no return type is written, which standard C++ rejects (implicit int) — confirm the real signature against the definition.
extern Segment(int, string *,string ,string *);
//-----------------------------------------------------------------------------------------//
// Purpose: CNaiveBayes constructor.
//          Leaves the object in an "untrained" state: counters are zero, every
//          array pointer is NULL and the class-name lookup table is empty.
//-----------------------------------------------------------------------------------------//
CNaiveBayes::CNaiveBayes()
{
    // Scalar counters.
    m_nClassNum = m_nCorrectResNum = 0;   // number of classes / correct classifications so far

    // Per-class test counters (allocated later by InitTestRes).
    m_pnResNum = m_pnCorrectResNum = NULL;

    // Training data: per-class feature table, per-class document counts, class priors.
    m_ppfTrainRes = NULL;
    m_pnTrainNum  = NULL;
    m_pfPrC       = NULL;

    // Class naming: name array and name -> ID lookup.
    m_psClassName = NULL;
    m_mapClassName2ID.clear();
}
//-----------------------------------------------------------------------------------------//
// Purpose: CNaiveBayes destructor.
//          Frees every array held through the member pointers, resets those
//          pointers to NULL and empties the member containers.
//-----------------------------------------------------------------------------------------//
CNaiveBayes::~CNaiveBayes()
{
    // The feature table is two-level: free each per-class row, then the row-pointer array.
    // The guard is needed here because the loop dereferences the outer pointer.
    if (m_ppfTrainRes != NULL)
    {
        for (int nRow = 0; nRow < m_nClassNum; ++nRow)
        {
            delete[] m_ppfTrainRes[nRow];
            m_ppfTrainRes[nRow] = NULL;
        }
        delete[] m_ppfTrainRes;
        m_ppfTrainRes = NULL;
    }

    // delete[] on a null pointer is a no-op, so the flat arrays need no guard.
    delete[] m_psClassName;
    m_psClassName = NULL;

    delete[] m_pnTrainNum;
    m_pnTrainNum = NULL;

    delete[] m_pfPrC;
    m_pfPrC = NULL;

    delete[] m_pnResNum;
    m_pnResNum = NULL;

    delete[] m_pnCorrectResNum;
    m_pnCorrectResNum = NULL;

    // Containers.
    m_mapClassName2ID.clear();
    m_vTestRes.clear();
}
//-----------------------------------------------------------------------------------------//
// 功能: 讀入訓練文本,訓練分類器
//----------------------------------------------------------------------------------------//
int CNaiveBayes::Train()
{
//從文件_all_ids.lst中讀取訓練語料的特征項詞頻,訓練特征項的類內先驗概率
ifstream tfile("..\\Dic\\DF\\_all_ids.lst");
string strLine; //讀入一行為一個字符串
string::size_type pos=0, prev_pos=0;
string word;
int tatal_num = 0;
string sTemp="";
int nClassIndex=0;
while(getline(tfile,strLine,'\n')!=NULL)//讀取行
{
//取類名
string::size_type classname_pos=0;
pos=0;
prev_pos=0;
classname_pos = strLine.find_first_of( ' ',classname_pos );//從字符串開始尋找空格,返回空格所在位置。
sTemp = strLine.substr( prev_pos, classname_pos - prev_pos );
//取對應的類標號
nClassIndex = m_mapClassName2ID[sTemp];
string::size_type num_pos;
//取類別中文章的數量
num_pos = ++classname_pos;
num_pos = strLine.find_first_of( ' ',num_pos );
sTemp = strLine.substr( classname_pos,num_pos-classname_pos );
m_pnTrainNum[nClassIndex] = atoi(sTemp.c_str()); //每個類別中文章的數量
pos=++num_pos;
prev_pos=pos;
int nWordNum = 0;//類中詞的數量
while((pos = strLine.find_first_of( ' ', pos ))!=string::npos)
{
//取得每個word 的key:weight
string::size_type key_pos=0, weight_pos=0;
word = strLine.substr( prev_pos, pos - prev_pos );
prev_pos = ++pos;
//取得每個word的key,weight存入鏈表elem
key_pos = word.find_first_of( ':', key_pos );
string str_key = word.substr( 0 , key_pos );
string str_weight = word.substr( key_pos+1, string::npos - key_pos );
int key = atoi(str_key.c_str());
int weight = atoi(str_weight.c_str());
m_ppfTrainRes[nClassIndex][key]=weight;
nWordNum += weight;
}
//處理最后一個空格后的word
word = strLine.substr( prev_pos, pos - prev_pos );
//cout << word << endl;
string::size_type key_pos=0;
key_pos = word.find_first_of( ':', key_pos );
string str_key = word.substr( 0 , key_pos );
string str_weight = word.substr( key_pos+1, string::npos - key_pos );
int key = atoi(str_key.c_str());
int weight = atoi(str_weight.c_str());
m_ppfTrainRes[nClassIndex][key]=weight;
nWordNum += weight;
char szPrWFile[20];
sprintf(szPrWFile,"Pr\\PrW in C%d.txt",nClassIndex);
ofstream Fou;
Fou.open(szPrWFile,ios::out);
for(int k=0;k<m_nFeatureNum;k++)
{
m_ppfTrainRes[nClassIndex][k]=(m_ppfTrainRes[nClassIndex][k]+1)/(nWordNum+m_nFeatureNum);
char szTemp[100];
memset(szTemp,0,100);
sprintf(szTemp,"%d\t%f\n",k,m_ppfTrainRes[nClassIndex][k]);
Fou << szTemp;
}
Fou.close();
tatal_num += m_pnTrainNum[nClassIndex];
}
//-------------------------------------------------------------
//計算每個類的先驗概率PrC
//-------------------------------------------------------------
ofstream FouPrc;
FouPrc.open("Pr\\Prc.txt",ios::out);
for(int k=0 ; k<m_nClassNum ; k++)
{
m_pfPrC[k] = (float)m_pnTrainNum[k]/(float)tatal_num;
char szPrcTemp[100];//存儲類的先驗概率
memset(szPrcTemp,0,100);
sprintf(szPrcTemp,"%d %s %f\n",k,m_psClassName[k].c_str(),m_pfPrC[k]);
FouPrc << szPrcTemp;
}
FouPrc.close();
return m_nClassNum;
}
//-----------------------------------------------------------------------------------------//
// Purpose: Initialise the per-class test counters.
//          Allocates m_pnResNum and m_pnCorrectResNum with m_nClassNum entries each
//          and sets every entry to zero. Arrays left over from an earlier call are
//          released first, so the function can be called repeatedly without leaking.
// Returns: always true.
//-----------------------------------------------------------------------------------------//
bool CNaiveBayes::InitTestRes()
{
    // Free counters from a previous initialisation (delete[] on NULL is a no-op).
    delete[] m_pnResNum;
    m_pnResNum = NULL;
    delete[] m_pnCorrectResNum;
    m_pnCorrectResNum = NULL;

    m_pnResNum = new int[m_nClassNum];
    memset(m_pnResNum, 0, m_nClassNum * sizeof(int));

    m_pnCorrectResNum = new int[m_nClassNum];
    memset(m_pnCorrectResNum, 0, m_nClassNum * sizeof(int));

    return true;
}
//----------------------------------------------------------------------------------------//
// Purpose: Classify the documents stored in a test file.
//          Each non-blank line is one document:
//              <word count> <key>:<weight> <key>:<weight> ...
//          For every class the score is log(prior) plus the sum of
//          log(m_ppfTrainRes[class][key]) over the document's keys. The weights and
//          the leading word count are not used in the score.
// Params :
//   sTestFile  name of the file holding the test documents
//   bFlag      true when the real class of the documents is known, false otherwise
//   nClassID   ID of that class, numbered from 0 (consistent with the IDs in
//              m_mapClassName2ID); only consulted when bFlag is true
// Effect : m_vTestRes receives the chosen class of every document. When bFlag is
//          true, m_pnResNum / m_pnCorrectResNum / m_nCorrectResNum are updated, so
//          InitTestRes() must have been called before.
// Returns: number of documents classified.
//----------------------------------------------------------------------------------------//

// Extracts the feature index in front of the ':' of one test token.
// Returns false (output untouched) when the token does not start with a digit.
static bool ParseTestKey(const std::string &sToken, int &nKey)
{
    if (sToken.empty() || sToken[0] < '0' || sToken[0] > '9')
    {
        return false;
    }
    nKey = atoi(sToken.substr(0, sToken.find(':')).c_str());
    return true;
}

int CNaiveBayes::Test(string sTestFile,bool bFlag,int nClassID)
{
    m_vTestRes.clear();
    m_nCorrectResNum = 0;

    // Without classes there is nothing to score (and pro[0] would not exist).
    if (m_nClassNum <= 0)
    {
        return 0;
    }

    double *pro = new double[m_nClassNum];   // per-class sum of log-probabilities
    string strLine;                          // one line == one test document
    ifstream tfile1(sTestFile.c_str());

    while (getline(tfile1, strLine))
    {
        // Drop trailing blanks / CR; a blank line is not a document.
        strLine.erase(strLine.find_last_not_of(" \t\r\n") + 1);
        if (strLine.empty())
        {
            continue;
        }

        // Scores must start from zero for every document; otherwise each
        // document would inherit the scores of all documents before it.
        for (int i = 0; i < m_nClassNum; i++)
        {
            pro[i] = 0.0;
        }

        // Skip the leading word-count field, then walk the key:weight tokens.
        string::size_type nTokBegin = strLine.find(' ');
        nTokBegin = (nTokBegin == string::npos) ? strLine.size() : nTokBegin + 1;
        while (nTokBegin < strLine.size())
        {
            string::size_type nTokEnd = strLine.find(' ', nTokBegin);
            if (nTokEnd == string::npos)
            {
                nTokEnd = strLine.size();
            }

            int nKey = 0;
            // Keys outside the trained feature range have no probability entry.
            if (ParseTestKey(strLine.substr(nTokBegin, nTokEnd - nTokBegin), nKey)
                && nKey >= 0 && nKey < m_nFeatureNum)
            {
                for (int i = 0; i < m_nClassNum; i++)
                {
                    pro[i] = pro[i] + log(m_ppfTrainRes[i][nKey]);
                }
            }
            nTokBegin = nTokEnd + 1;
        }

        // Add the class prior exactly once per document.
        for (int i = 0; i < m_nClassNum; i++)
        {
            pro[i] = pro[i] + log(m_pfPrC[i]);
        }

        // Pick the class with the highest score.
        // NOTE(review): the "!= 0" tests are kept from the original; they treat a
        // score of exactly 0 as "not comparable" — confirm whether that is intended.
        double t = pro[0];
        int max_pro_num = 0;
        for (int s = 1; s < m_nClassNum; s++)
        {
            const double k = pro[s];
            if (t < k && k != 0 && t != 0)
            {
                max_pro_num = s;
                t = k;
            }
        }

        m_vTestRes.push_back(max_pro_num);
        if (bFlag)
        {
            m_pnResNum[max_pro_num]++;
            if (max_pro_num == nClassID)
            {
                m_nCorrectResNum++;
                m_pnCorrectResNum[nClassID]++;
            }
        }
    }

    delete[] pro;
    return static_cast<int>(m_vTestRes.size());   // number of documents
}
//-----------------------------------------------------------------------------------------//
// Purpose: Print the classification result of one class to standard output.
// Params :
//   nClassID  index of the class to report (used to index m_psClassName,
//             m_pnResNum and m_pnCorrectResNum)
//   nDocNum   total number of documents, used as the denominator of the recall
// Output : class name, total documents, documents assigned to the class, correctly
//          assigned documents, then precision (correct / assigned) and recall
//          (correct / nDocNum) as percentages. A ratio whose denominator is zero
//          is reported as 0 instead of NaN/inf.
//-----------------------------------------------------------------------------------------//
void CNaiveBayes::OutputRes(int nClassID,int nDocNum)
{
    const int nAssigned = m_pnResNum[nClassID];
    const int nCorrect  = m_pnCorrectResNum[nClassID];

    cout << "類別 : " << m_psClassName[nClassID] << endl;
    cout << "總文檔數: " << nDocNum << endl;
    cout << "劃分為該類的文檔數 = " << nAssigned << endl;
    cout << "正確歸檔數 = " << nCorrect << endl;

    // Precision: share of the documents assigned to this class that really belong to it.
    double percent = 0.0;
    if (nAssigned != 0)
    {
        percent = static_cast<float>(nCorrect) / static_cast<float>(nAssigned);
    }
    cout << "準確率: " << percent*100 << "%" << endl;

    // Recall: share of the nDocNum documents that were correctly assigned.
    double recall = 0.0;
    if (nDocNum != 0)
    {
        recall = static_cast<float>(nCorrect) / static_cast<float>(nDocNum);
    }
    cout << "召回率: " << recall*100 << "%" << endl;
}
//-----------------------------------------------------------------------------------------//
// 功能: 對一個文件夾內的文檔進行測試。
// 參數:
// (入口)
// string sTestFilesPath 待測試文件夾
//----------------------------------------------------------------------------------------//
void CNaiveBayes::TestFiles(string sTestFilesPath)
{
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -