// wjc.cpp
// [code-viewer page residue] Font size:
#pragma warning(disable: 4786)
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <vector>
#include <string>
#include <map>
using namespace std;
// Global lookup tables and buffers shared by every stage of the tagger.
map<string,int> word_index;       // dictionary word -> index assigned in read_dict()
map<string,int> pos_index;        // part-of-speech tag -> index assigned in read_dict()
vector<string> dict_word;         // dictionary words in the order they were read
vector<string> test_word;         // words of the test sentence currently being tagged
vector<string> test_pos;          // reference tags of the test sentence currently being tagged
vector<string> dict_pos;          // every part-of-speech tag found in the dictionary
int dict_word_size,dict_pos_size; // sizes of word_index / pos_index, set by read_dict()
// Smoothing weight read by pinghua(). The macro body deliberately has no
// trailing semicolon so that it can appear inside a larger expression.
#define ping_hua 0.1
// Pairs an integer number with a floating-point value.
// NOTE(review): nothing in the visible source uses this struct — confirm it is
// still needed before relying on it.
struct pos_node
{
int pos_num; // presumably a part-of-speech index — TODO confirm
float pos_pro; // presumably the probability attached to that tag — TODO confirm
};
// Reads "dict.txt" and fills the global dictionary tables.
//
// Each non-empty line is expected to look like "<word>\t<tag> <tag> ...":
// everything before the first tab is the word, the remainder is a list of
// part-of-speech tags separated by spaces or tabs.
//
// Effects:
//   - dict_word / word_index receive every distinct word. A word's index is its
//     position in dict_word, so it is always smaller than dict_word_size (a
//     repeated word keeps the index of its first occurrence).
//   - dict_pos / pos_index receive every distinct tag in first-seen order.
//     Tags are cut to at most two characters, because the corpus and test
//     readers in this file also look at no more than two characters after '/'.
//   - dict_word_size and dict_pos_size are set from the two maps.
//   - the tag table is echoed to stdout as "<index> <tag>" lines.
// Terminates the process with exit code 100 when the file cannot be opened.
void read_dict()
{
    ifstream fin("dict.txt");
    if(!fin)
    {
        cerr<<"error 100 opening dict.txt";
        exit(100);
    }

    const string blanks=" \t\r";
    string line;
    while(getline(fin,line))
    {
        // Tolerate CRLF files read on a platform that does not translate them.
        if(!line.empty()&&line[line.length()-1]=='\r')
            line.erase(line.length()-1);
        if(line.empty())
            continue;

        // Split the line at the first tab; both parts start out empty so that
        // nothing from the previous line can leak into this one.
        string s_word,s_pos;
        const string::size_type tab=line.find('\t');
        if(tab==string::npos)
            s_word=line;
        else
        {
            s_word=line.substr(0,tab);
            s_pos=line.substr(tab+1);
        }

        // Register the word once; indices therefore stay dense and in range.
        if(!s_word.empty()&&!word_index.count(s_word))
        {
            word_index[s_word]=(int)dict_word.size();
            dict_word.push_back(s_word);
        }

        // Walk the tag list token by token, so the second character of a
        // two-letter tag is never mistaken for a tag of its own.
        string::size_type begin=s_pos.find_first_not_of(blanks);
        while(begin!=string::npos)
        {
            string::size_type end=s_pos.find_first_of(blanks,begin);
            if(end==string::npos)
                end=s_pos.length();
            const string::size_type len=(end-begin>2)?2:(end-begin);
            const string tag=s_pos.substr(begin,len);
            if(!pos_index.count(tag))
            {
                pos_index[tag]=(int)dict_pos.size();
                dict_pos.push_back(tag);
            }
            begin=s_pos.find_first_not_of(blanks,end);
        }
    }

    dict_word_size=(int)word_index.size();
    dict_pos_size=(int)pos_index.size();
    for(string::size_type i=0;i<dict_pos.size();i++)
        cout<<pos_index[dict_pos[i]]<<" "<<dict_pos[i]<<endl;
    fin.close();
}
void count_pi_A_B(float **pos_to_pos,float **pos_to_word)
{
string s1,s2,s_fir_pos,s_sce_pos,s_end_pos,s_word,fir_str,sce_str,end_str,end_pre_pos;
int k=0,fir_null,sce_null,end_null,fir=1,from,to;
ifstream fin1("corpus.txt");
if(!fin1)
{
cerr<<"error 100 opening dict.txt";
exit(100);
}
while(getline(fin1,s2))
{
if(s2.length()!=0)
{
end_null=s2.rfind(" ");
if(fir!=1)//shangyihang de zuihou yige zhuandao xia yi hang de diyige
{
fir_null=s2.find(" ");
fir_str=s2.substr(0,fir_null);
k=fir_str.find_first_of('/');
s_fir_pos=fir_str.substr(k+1,2);
from=pos_index[s_end_pos];
to=pos_index[s_fir_pos]+1;
pos_to_pos[from][to]++; //di yi lie fang pi
}
fir_null=s2.find(" ");
fir_str=s2.substr(0,fir_null);
k=fir_str.find_first_of('/');
s_word=fir_str.substr(0,k); //tiqu yige ci, qiu b.........
s_fir_pos=fir_str.substr(k+1,2); //tiqu chu yige cixing
from=word_index[s_word]; //xuoying ,dedao b
to=pos_index[s_fir_pos];
pos_to_word[from][to]++;
pos_to_pos[pos_index[s_fir_pos]][0]++;//qiu pi
while(fir_null!=end_null)
{
sce_null=s2.find(" ",fir_null+2);
sce_str=s2.substr(fir_null+2,sce_null-fir_null-2); //tiqu dierge chuan
k=sce_str.find_first_of('/');
s_word=sce_str.substr(0,k); //tiqu chu yige ci qiu b.................
s_sce_pos=sce_str.substr(k+1,2); //tiqu chu yige cixing
from=word_index[s_word]; //xuoying ,dedao b
to=pos_index[s_sce_pos];
pos_to_word[from][to]++;
from=pos_index[s_fir_pos];
to=pos_index[s_sce_pos];
pos_to_pos[from][to+1]++; //diyilie fang pi, a
fir_null=sce_null;
s_fir_pos=s_sce_pos;
}
s_end_pos=s_sce_pos;//jilu xia shangyihang zuihou yige cixing
}
fir=0;
}
}
// Turns the raw counts gathered by count_pi_A_B() into smoothed scores, in place.
//
// Parameters:
//   pos_to_pos  - dict_pos_size x (dict_pos_size+1) table.
//                 Column 0: ping_hua is added to every count and the column is
//                 then normalised to sum to 1.
//                 Columns 1..dict_pos_size: each entry becomes
//                 (1-ping_hua)*count/row_total + ping_hua. A row whose total is
//                 zero gets ping_hua everywhere instead of 0/0.
//   pos_to_word - dict_word_size x dict_pos_size table. For each word, every
//                 entry becomes (count+1)/(total+distinct), where "distinct" is
//                 the number of tags the word was seen with. A word that never
//                 occurred gets 1 for every tag, which is the same neutral
//                 factor viterbi() applies to words missing from the dictionary.
void pinghua(float **pos_to_pos,float **pos_to_word)
{
    // Plain cast followed by the statement terminator: this form stays valid
    // whether or not the macro text itself ends with a semicolon.
    const float ph=(float)ping_hua;
    int i,j;

    // Initial-state column.
    float all_pi=0;
    for(i=0;i<dict_pos_size;i++)
    {
        pos_to_pos[i][0]=pos_to_pos[i][0]+ph;
        all_pi=all_pi+pos_to_pos[i][0];
    }
    for(i=0;i<dict_pos_size;i++)
        pos_to_pos[i][0]=pos_to_pos[i][0]/all_pi;

    // Transition rows.
    for(i=0;i<dict_pos_size;i++)
    {
        float all_a=0;
        for(j=1;j<=dict_pos_size;j++)
            all_a=pos_to_pos[i][j]+all_a;
        for(j=1;j<=dict_pos_size;j++)
        {
            if(all_a>0)
                pos_to_pos[i][j]=((1-ph)*pos_to_pos[i][j]/all_a)+ph;
            else
                pos_to_pos[i][j]=ph; // tag never seen as a predecessor
        }
    }

    // Emission rows, one per dictionary word.
    for(j=0;j<dict_word_size;j++)
    {
        float size=0;  // number of distinct tags observed for this word
        float all_b=0; // total occurrences of this word
        for(i=0;i<dict_pos_size;i++)
        {
            all_b=pos_to_word[j][i]+all_b;
            if(pos_to_word[j][i]!=0)
                size++;
        }
        const float denominator=all_b+size;
        for(i=0;i<dict_pos_size;i++)
        {
            if(denominator>0)
                pos_to_word[j][i]=(float)(pos_to_word[j][i]+1)/denominator;
            else
                pos_to_word[j][i]=1; // word absent from the corpus
        }
    }
}
// Finds the best predecessor tag for tag j, given the scores of row i.
//
// Parameters:
//   i_j_pro    - score table; row i holds one score per tag.
//   pos_to_pos - transition table; pos_to_pos[k][j+1] is the score of moving
//                from tag k to tag j (column 0 is used for something else).
//   i, j       - row to read and target tag.
//   rec_k      - output: index k with the largest i_j_pro[i][k]*pos_to_pos[k][j+1];
//                the lowest such k on ties.
// Returns that largest product, or 0 when no product is greater than 0.
//
// rec_k is reset to 0 on entry, so a call in which no product exceeds 0 reports
// tag 0 rather than whatever value the caller's variable held before.
float max(float **i_j_pro,float **pos_to_pos,int i,int j,int &rec_k)
{
    float best=0;
    rec_k=0;
    const int tag_count=(int)dict_pos.size();
    for(int k=0;k<tag_count;k++)
    {
        const float candidate=i_j_pro[i][k]*pos_to_pos[k][j+1];
        if(candidate>best)
        {
            best=candidate;
            rec_k=k;
        }
    }
    return best;
}
// Tags the sentence stored in the global test_word with the Viterbi algorithm.
//
// Parameters:
//   pos_to_pos  - smoothed table: column 0 holds initial-state scores, column
//                 u+1 of row t the transition score from tag t to tag u.
//   pos_to_word - smoothed emission table indexed [word][tag].
//   i_j_pro     - caller-allocated test_word.size() x dict_pos_size scratch
//                 table; i_j_pro[i][j] receives the best score of any tag
//                 sequence that ends with tag j at word i.
//   most_state  - caller-allocated table of the same shape; for i >= 1,
//                 most_state[i][j] receives the tag chosen for word i-1 on the
//                 best path that ends in tag j at word i.
//   new_state   - caller-allocated array of test_word.size() strings that
//                 receives the chosen tag of every word.
//
// A word missing from word_index contributes a neutral emission factor, i.e.
// every tag is considered equally likely to produce it.
// Does nothing when the sentence or the tag set is empty.
// NOTE(review): scores are multiplied as plain floats; very long sentences may
// underflow to 0 — consider log-space scores if that turns out to matter.
void viterbi(float **pos_to_pos,float **pos_to_word,float **i_j_pro,string **most_state,string *new_state)
{
    const int word_count=(int)test_word.size();
    if(word_count==0||dict_pos_size<=0||dict_pos.empty())
        return;

    int i,j,k,rec_k=0;
    float mmax;

    // Initialisation: initial-state score times the emission of the first word.
    map<string,int>::const_iterator hit=word_index.find(test_word[0]);
    if(hit!=word_index.end())
    {
        const int index=hit->second;
        for(j=0;j<dict_pos_size;j++)
            i_j_pro[0][j]=pos_to_pos[j][0]*pos_to_word[index][j];
    }
    else
    {
        for(j=0;j<dict_pos_size;j++)
            i_j_pro[0][j]=pos_to_pos[j][0];
    }

    // Recursion: extend the best path into every tag of every later word.
    for(i=1;i<word_count;i++)
    {
        hit=word_index.find(test_word[i]);
        for(j=0;j<dict_pos_size;j++)
        {
            mmax=max(i_j_pro,pos_to_pos,i-1,j,rec_k);
            if(hit!=word_index.end())
                i_j_pro[i][j]=mmax*pos_to_word[hit->second][j];
            else
                i_j_pro[i][j]=mmax;
            most_state[i][j]=dict_pos[rec_k];
        }
    }

    // Termination: best final tag. rec_k starts at 0 so that it really names
    // tag 0 when no later tag beats it.
    const int last=word_count-1;
    rec_k=0;
    mmax=i_j_pro[last][0];
    for(k=1;k<(int)dict_pos.size();k++)
    {
        if(i_j_pro[last][k]>mmax)
        {
            mmax=i_j_pro[last][k];
            rec_k=k;
        }
    }
    new_state[last]=dict_pos[rec_k];

    // Backtracking through the recorded predecessors.
    for(k=last;k>=1;k--)
    {
        new_state[k-1]=most_state[k][rec_k];
        map<string,int>::const_iterator back=pos_index.find(new_state[k-1]);
        rec_k=(back!=pos_index.end())?back->second:0;
    }
}
void tag_test(float **pos_to_pos,float **pos_to_word)
{
string s ,s2,one_sentence,fir_str,s_fir_pos,s_sce_pos,s_word,sce_str;
int end_null,fir_null,sce_null,k,i;
ifstream fin1("test.txt");
if(!fin1)
{
cerr<<"error 100 opening dict.txt";
exit(100);
}
ofstream fout("result.txt");
if(!fout)
{
cerr<<"error 100 opening result.txt";
exit(100);
}
int line=1;
float all_cor=0,all=0;
while(getline(fin1,s))
{
if(s.length()!=0) //tiqu yige juzi de ci he cixing ,bing cunru dongtai shuzu
{
end_null=s.rfind(" ");
fir_null=s.find(" ");
fir_str=s.substr(0,fir_null);
k=fir_str.find_first_of('/');
s_word=fir_str.substr(0,k); //tiqu yige ci, qiu b.........
s_fir_pos=fir_str.substr(k+1,2); //tiqu chu yige cixing
test_pos.push_back(s_fir_pos); //cunchu cixing
test_word.push_back(s_word); //cunchu ci
while(fir_null!=end_null)
{
sce_null=s.find(" ",fir_null+2);
sce_str=s.substr(fir_null+2,sce_null-fir_null-2); //tiqu dierge chuan
k=sce_str.find_first_of('/');
s_word=sce_str.substr(0,k); //tiqu chu yige ci qiu b.................
s_sce_pos=sce_str.substr(k+1,2); //tiqu chu yige cixing
test_pos.push_back(s_sce_pos); //cunchu cixing
test_word.push_back(s_word); //cunchu ci
fir_null=sce_null;
}
string **most_state=new string* [test_word.size()];
for(i=0;i<test_word.size();i++)
most_state[i]=new string [dict_pos_size];
string *new_state=new string [test_word.size()] ;
float **i_j_pro=new float* [test_word.size()] ;
for(i=0;i<test_word.size();i++)
i_j_pro[i]=new float [dict_pos_size];
viterbi(pos_to_pos,pos_to_word,i_j_pro,most_state,new_state); ////diaoyong vertebi
float num=0.0;
float pro;
for(int i=0;i<test_word.size();i++)
{
all++;
if(test_pos[i]==new_state[i])
{
num++;
all_cor++;
}
}
pro=num/test_word.size();
fout<<"line"<<line<<" "<<num<<" "<<test_word.size()<<" "<<pro<<endl;
line++;
test_word.clear(); ///qingkong
test_pos.clear();
}
}
fout<<"the totla pro of correct is "<<all_cor<<"/"<<all<<"="<<all_cor/all<<endl;
fin1.close();
fout.close();
}
// Program entry point: load the dictionary, build the HMM from the corpus,
// smooth it, tag the test file, then release the model tables.
//
// pos_to_pos  is dict_pos_size x (dict_pos_size+1); column 0 holds the
//             initial-state figures, the remaining columns the transitions.
// pos_to_word is dict_word_size x dict_pos_size.
// Both tables start out zero-filled.
// Returns 0 on success and 1 when the dictionary yielded no words or no tags.
int main()
{
    read_dict();
    if(dict_pos_size<=0||dict_word_size<=0)
    {
        cerr<<"dict.txt contains no usable entries"<<endl;
        return 1;
    }

    // Each loop declares its own counter, as standard for-scope rules require.
    float **pos_to_pos=new float*[dict_pos_size];
    for(int row=0;row<dict_pos_size;row++)
        pos_to_pos[row]=new float[dict_pos_size+1]();

    float **pos_to_word=new float*[dict_word_size];
    for(int row=0;row<dict_word_size;row++)
        pos_to_word[row]=new float[dict_pos_size]();

    cout<<"GENERATING HMM,WAIT FOR A MOMENT,PLEASE...."<<endl;
    count_pi_A_B(pos_to_pos,pos_to_word);
    pinghua(pos_to_pos,pos_to_word);
    tag_test(pos_to_pos,pos_to_word);

    for(int row=0;row<dict_pos_size;row++)
        delete[] pos_to_pos[row];
    delete[] pos_to_pos;
    for(int row=0;row<dict_word_size;row++)
        delete[] pos_to_word[row];
    delete[] pos_to_word;
    return 0;
}
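// [code-viewer page residue] Keyboard shortcut help
// Copy code: Ctrl + C
// Search code: Ctrl + F
// Full-screen mode: F11
// Switch theme: Ctrl + Shift + D
// Show shortcuts: ?
// Increase font size: Ctrl + =
// Decrease font size: Ctrl + -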