// languagemodel.cpp
// (A "font size" label from the web code viewer followed here; it was page residue, not source.)
#include "LanguageModel.h"
using namespace std;
extern bool printmore;
// Constructs a language model and records `prob` as UNKPROB, the value the
// uniProb/biProb/triProb lookups return in place of any score `p` for which
// `p - UNKPROB < avs`.
LanguageModel::LanguageModel(double prob)
{
    this->UNKPROB = prob;
}
// Loads a language model (SRILM output, ARPA standard format) from `fileName`.
//
// Every line that contains at least one tab is parsed as an n-gram entry:
//     <prob> TAB <w1 ... wn> [TAB <backoff weight>]
// Lines without a tab (section headers, counts, blank lines) are skipped.
// The n-gram text is converted to word ids with vocab->getIndices(); its
// return value is used as the n-gram order, and entries of order 1, 2 and 3
// are inserted into uniGram, biGram and triGram respectively. Entries of any
// other order are ignored.
//
// Parameters:
//   fileName - path of the model file to read.
//   vocab    - vocabulary used to map words to ids; stored in `vcb`.
// Returns:
//   false if `vocab` is null or the file cannot be opened, true otherwise.
bool LanguageModel::load(string fileName, Vocab *vocab)
{
    vcb = vocab;
    if (!vcb) {
        cout << "english vocab is NULL!" << endl;
        return false;
    }

    input.open(fileName.c_str(), std::ios::in);
    if (!input) {
        cout << "Open Language Model" << fileName << "Error" << endl;
        return false;
    }

    string line;
    while (getline(input, line)) {
        const string::size_type firstTab = line.find('\t');
        if (firstTab == string::npos) {
            continue;  // not an n-gram entry
        }
        const string::size_type lastTab = line.rfind('\t');

        ProbAndBO entry;
        entry.prob = atof(line.substr(0, firstTab).c_str());

        string gram;
        if (lastTab != firstTab) {
            // Two tab-separated fields follow the probability: words, then
            // the backoff weight.
            gram = line.substr(firstTab + 1, lastTab - firstTab - 1);
            entry.backoffWeight = atof(line.substr(lastTab + 1).c_str());
        } else {
            // Only one tab on the line: there is no backoff weight field.
            // (Comparing lastTab against npos can never detect this case,
            // because a tab is already known to exist.)
            gram = line.substr(firstTab + 1);
            entry.backoffWeight = 0;
        }

        vector<int> gramIDs;
        const int order = vcb->getIndices(gram, gramIDs);
        if (order == 1) {
            uniGram.insert(make_pair(gramIDs, entry));
        } else if (order == 2) {
            biGram.insert(make_pair(gramIDs, entry));
        } else if (order == 3) {
            triGram.insert(make_pair(gramIDs, entry));
        }
    }

    // Reset the EOF/fail state left by the read loop before closing.
    input.clear();
    input.close();

    if (printmore) {
        cout << "Loading language model finished!" << endl;
    }
    return true;
}
double LanguageModel::uniProb(int word)
{
vector<int> vecTmp;
vecTmp.push_back(word);
mapNgram::iterator pos = uniGram.find(vecTmp);
double p = INFINITE;
if (pos != uniGram.end()) {
p = (pos->second).prob * LN;
}
if (p - UNKPROB < avs) {
return UNKPROB;
}
return p;
}
double LanguageModel::uniBO(int word)
{
vector<int> vecTmp;
vecTmp.push_back(word);
mapNgram::iterator pos = uniGram.find(vecTmp);
if (pos != uniGram.end()) {
return (pos->second).backoffWeight * LN;
}
else
{
return UNKBO;
}
}
//p(wd2|wd1)= if(bigram exists) p_2(wd1,wd2)
// else bo_wt_1(wd1)*p_1(wd2)
double LanguageModel::biProb(int word1, int word2)
{
vector<int> vecTmp;
vecTmp.push_back(word1);
vecTmp.push_back(word2);
mapNgram::iterator pos = biGram.find(vecTmp);
double p = INFINITE;
if (pos != biGram.end()) {
p = (pos->second).prob * LN;
}
else {
p = uniBO(word1) + uniProb(word2);
}
if (p - UNKPROB < avs) {
return UNKPROB;
}
return p;
}
double LanguageModel::biBO(int word1, int word2)
{
vector<int> vecTmp, vecTmp1, vecTmp2;
vecTmp.push_back(word1);
vecTmp.push_back(word2);
mapNgram::iterator pos = biGram.find(vecTmp);
if (pos != biGram.end())
{
return (pos->second).backoffWeight * LN;
}
}
//p(wd3|wd1,wd2)= if(trigram exists) p_3(wd1,wd2,wd3)
// else if(bigram w1,w2 exists) bo_wt_2(w1,w2)*p(wd3|wd2)
// else p(wd3|w2)
double LanguageModel::triProb(int word1, int word2, int word3)
{
vector<int> vecTmp, vecTmp12;
vecTmp.push_back(word1);
vecTmp.push_back(word2);
vecTmp.push_back(word3);
vecTmp12.push_back(word1);
vecTmp12.push_back(word2);
mapNgram::iterator pos = triGram.find(vecTmp);
double p = INFINITE;
if (pos != triGram.end()) {
p = (pos->second).prob * LN;
}
else if (biGram.find(vecTmp12) != biGram.end())
{
p = (biBO(word1, word2) + biProb(word2, word3)) ;
}
else
{
p = biProb(word2, word3);
}
if (p - UNKPROB < avs) {
return UNKPROB;
}
return p;
}
// Score of the last word of `Ephrase` given up to two preceding words.
//   - 1 word : uniProb of that word;
//   - 2 words: biProb of the pair;
//   - 3 or more words: triProb of the last three words.
// An empty phrase has no word to score and returns 0.0; previously this case
// indexed Ephrase[-3..-1], which is out of bounds.
// NOTE(review): 0.0 is chosen as a neutral value — confirm callers do not
// expect UNKPROB for an empty phrase.
double LanguageModel::wordProb(vector<int> Ephrase)
{
    const vector<int>::size_type len = Ephrase.size();
    switch (len) {
    case 0:
        return 0.0;
    case 1:
        return uniProb(Ephrase[0]);
    case 2:
        return biProb(Ephrase[0], Ephrase[1]);
    default:
        // For len == 3 these indices are 0, 1, 2.
        return triProb(Ephrase[len - 3], Ephrase[len - 2], Ephrase[len - 1]);
    }
}
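// --- Not part of the program: keyboard-shortcut help captured from the web code viewer ---
// Copy code: Ctrl + C
// Search code: Ctrl + F
// Full-screen mode: F11
// Toggle theme: Ctrl + Shift + D
// Show shortcuts: ?
// Increase font size: Ctrl + =
// Decrease font size: Ctrl + -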