// hypothesis.cpp
#include "Hypothesis.h"
#include <cmath>
#include <cctype>
#include <iterator>
using namespace std;
extern bool printmore;
extern VECPOOL vecHypo;
extern VECUNSED vecNotUsed;
// Stores the decoder's tuning parameters.
//   thresholdI     - beam threshold as a plain value; its natural log is kept
//   stackThreshold - stored as eachStackSize (compared against stack sizes in decoder())
//   nBest          - stored as NBEST (decoder() returns findBest() when it is 1)
//   dislimit       - stored as DISTORTIONLIMIT (score used for over-long jumps)
//   lmlimit        - stored as LMLimit (handed to the LanguageModel in load())
//   len            - stored as DISLENGHT (largest jump scored linearly; 0 = no limit)
Hypothesis::Hypothesis(double thresholdI, int stackThreshold, int nBest, double dislimit, double lmlimit, int len)
{
    // Beam / stack settings.
    eachStackSize = stackThreshold;
    NBEST = nBest;
    threshold = std::log(thresholdI);

    // Distortion settings.
    DISLENGHT = len;
    DISTORTIONLIMIT = dislimit;

    // Language-model setting.
    LMLimit = lmlimit;
}
// Loads the configuration, both vocabularies, the language model and the
// phrase table.
//   inifileName - path handed to config.load(), which fills `para`
// Returns true when the language model and the phrase table both loaded,
// false as soon as one step fails. Progress and timings are written to cout.
bool Hypothesis::load(string inifileName)
{
    config.load(inifileName, para);
    cout << "set the parameters ... " << endl;

    enVcb = new Vocab();
    cnVcb = new Vocab();
    if ((!enVcb) || (!cnVcb)) {
        // A plain new-expression throws instead of returning null, so this
        // only triggers with a non-throwing allocator; in that case stop
        // here rather than dereferencing a null vocabulary below.
        cout << "new Vocab error!" << endl;
        return false;
    }

    // Language model: time the load and report it.
    lm = new LanguageModel(LMLimit);
    cout << "load language model from " << para.lmodel_file << endl;
    time_t lmold, lmnew;
    time(&lmold);
    if (!lm->load(para.lmodel_file, enVcb))
    {
        cout << "load language model " << para.lmodel_file << " error !" << endl;
        return false;
    }
    time(&lmnew);
    cout << "language model load finished, it takes " << difftime(lmnew, lmold) << " seconds!" << endl;

    // Phrase table: needs the language model and both vocabularies.
    cout << "load phrase table from " << para.ttable_file << endl;
    to = new TransOptions(para);
    time_t toold, tonew;
    time(&toold);
    if (!to->load(para.ttable_file, lm, enVcb, cnVcb)) {
        cout << "load phrase table " << para.ttable_file << " error !" << endl;
        return false;
    }
    time(&tonew);
    cout << "phrase table load finished, it takes "<< difftime(tonew, toold) << " seconds!" << endl;
    return true;
}
// Prepares the decoder for one source sentence: converts it to word ids,
// collects the candidate phrases, sizes the stacks and seeds stack 0 with
// the root hypothesis.
//   sentence - the source sentence, passed to cnVcb->senToIDs()
void Hypothesis::initialize(string sentence)
{
    cnVcb->senToIDs(sentence, sentenceIDS);
    enVcb->unkTMP = cnVcb->unkTMP;
    stackSize = to->getEPhrase(sentenceIDS, phraseSnippet, phraseSnippetPosition);
    hypothesisStack.resize(stackSize + 1);
    THRESHOLD.resize(stackSize + 1);

    int first = findProper(vecNotUsed);
    if (first == -1) {
        // No free slot: grow the pool the same way decoder() does. The slot
        // taken here is deliberately not added to the free list.
        const int oldsize = vecHypo.size();
        const int newsize = (oldsize > 0) ? 2 * oldsize : 1;
        vecHypo.resize(newsize);
        for (int n = oldsize + 1; n < newsize; n++)
        {
            vecNotUsed.push_back(n);
        }
        first = oldsize;
    }

    // Root hypothesis: the only language-model history is "<s>"
    // (lastEWI == -1 tells lmCal() there is a single history word).
    // NOTE(review): oldphrase, totalScore, futureScore and lastPos are read
    // by decoder() but not assigned here; presumably findProper() or the
    // pool's default state provides them - confirm.
    vecHypo[first].lastEWI = -1;
    vecHypo[first].lastEWII = enVcb->getIndex("<s>");
    vecHypo[first].lmScore = 0.0;
    vecHypo[first].prev = -1;
    vecHypo[first].thisID = 0;
    vecHypo[first].lastID = 0;
    hypothesisStack[0].push_back(first);
}
void Hypothesis::clear()
{
//release the resource
phraseSnippet.clear();
phraseSnippetPosition.clear();
THRESHOLD.clear();
HypothesisStack::iterator pos;
for(pos = hypothesisStack.begin(); pos != hypothesisStack.end(); ++pos)
{
pos->clear();
}
hypothesisStack.clear();
vecNotUsed.clear();
int poolsize = vecHypo.size();
for (int n = 0; n < poolsize; n++)
{
vecNotUsed.push_back(n);
}
arc.clear();
sentenceIDS.clear();
}
inline void Hypothesis::split(const string& line, vector<string>& strs)
{
istrstream ist(line.c_str());
string w;
while(ist>>w) strs.push_back(w);
}
double Hypothesis::lmCal(int last1, int last2, vector<int> newPhrase, int& newlast1, int& newlast2)
{
double lmScore = 0.0;
if (newPhrase.size() == 0)
{
return 0;
}
else
{
vector<int> vecPhrase;
if (last1 == -1)
{
vecPhrase.push_back(last2);
int l = newPhrase.size();
for(int i = 0; i < l; i++)
{
vecPhrase.push_back(newPhrase[i]);
}
int len = vecPhrase.size();
newlast1 = vecPhrase[len - 2];
newlast2 = vecPhrase[len - 1];
int endS = enVcb->getIndex("</s>");
if (newPhrase[0] != endS) {
vector<int>::iterator pos = vecPhrase.end();
for(int i = 0; i < len - 1; i++)
{
double lmTmp = lm->wordProb(vecPhrase) * para.weight_l;
if(printmore) {
cout << "\tlanguage model cost for '" << enVcb->getWord(vecPhrase[vecPhrase.size() - 1]) << "' " << lmTmp << endl;
}
lmScore += lmTmp;
vecPhrase.erase(--pos);
}
}
else {
lmScore = lm->wordProb(vecPhrase) * para.weight_l;
if(printmore) {
cout << "\tlanguage model cost for '" << enVcb->getWord(vecPhrase[vecPhrase.size() - 1]) << "' " << lmScore << endl;
}
}
}
else
{
vecPhrase.push_back(last1);
vecPhrase.push_back(last2);
// split(newPhrase, vecPhrase);
int l = newPhrase.size();
for(int i = 0; i < l; i++)
{
vecPhrase.push_back(newPhrase[i]);
}
int len = vecPhrase.size();
newlast1 = vecPhrase[len - 2];
newlast2 = vecPhrase[len - 1];
int endS = enVcb->getIndex("</s>");
if (newPhrase[0] != endS) {
vector<int>::iterator pos = vecPhrase.end();
for(int i = 0; i < len - 2; i++)
{
double lmTmp = lm->wordProb(vecPhrase) * para.weight_l;
if(printmore) {
cout << "\tlanguage model cost for '" << enVcb->getWord(vecPhrase[vecPhrase.size() - 1]) << "' " << lmTmp << endl;
}
lmScore += lmTmp;
vecPhrase.erase(--pos);
}
// lmScore = lmScore + 1;
}
else {
lmScore = lm->wordProb(vecPhrase) * para.weight_l;
if (printmore){
cout << "\tlanguage model cost for '" << enVcb->getWord(vecPhrase[vecPhrase.size() - 1]) << "' " << lmScore << endl;
}
}
}
}
return lmScore;
}
// Future-cost estimate for the source positions a hypothesis has not yet
// covered.
//   phraseID  - covered positions; the first (smallest) entry is skipped
//               (decoder() notes that a -1 is inserted into the set up front)
//   stackSize - positions run from 0 to stackSize - 1
// Returns the sum of to->getFutureCost(first, last) over every maximal run
// of uncovered positions: before the first covered position, between
// covered positions, and after the last one.
double Hypothesis::fcCal(set<int> phraseID, int stackSize)
{
    double fcScore = 0.0;

    // Fewer than two entries means nothing beyond the skipped first entry is
    // covered; stepping the iterators below would run past end(). Treat the
    // whole range as one uncovered run.
    if (phraseID.size() < 2)
    {
        if (stackSize > 0) {
            fcScore += to->getFutureCost(0, stackSize - 1);
        }
        return fcScore;
    }

    set<int>::const_iterator pos = phraseID.begin();
    ++pos;                                   // skip the first entry
    set<int>::const_iterator posOld = pos;   // last covered position seen so far

    // Uncovered run in front of the first covered position.
    if (*pos != 0) {
        fcScore += to->getFutureCost(0, *pos - 1);
        ++pos;
    }

    // Uncovered runs between neighbouring covered positions.
    for (; pos != phraseID.end(); ++pos)
    {
        if (*pos - 1 > *posOld)
        {
            fcScore += to->getFutureCost(*posOld + 1, *pos - 1);
        }
        posOld = pos;
    }

    // Uncovered run after the last covered position.
    if (stackSize - 1 >= *posOld + 1) {
        fcScore += to->getFutureCost(*posOld + 1, stackSize - 1);
    }
    return fcScore;
}
// Runs the stack search for the sentence prepared by initialize().
// Every hypothesis in stacks 0 .. stackSize-1 is extended with every
// candidate phrase whose source span does not overlap the words it already
// covers; each extension is scored and handed to recombineAndbeam().
//   fileName - base name for the n-best output; only used when NBEST != 1
// Returns findBest() when NBEST == 1. Otherwise calls findNBest() with
// fileName + "." + stackSize zeros and returns that name.
string Hypothesis::decoder(string fileName)
{
    for (int i = 0; i < stackSize; i++)
    {
        // Stacks 0 and 1 are never cut; later ones are cut once they exceed
        // eachStackSize.
        if ((i - 1 > 0) && (hypothesisStack[i].size() > eachStackSize)) {
            cutStack(hypothesisStack[i], eachStackSize, i);
        }
        multisetHYE::iterator posI = hypothesisStack[i].begin();
        for (; posI != hypothesisStack[i].end(); ++posI)
        {
            // Work on a copy: the pool may be resized while extending.
            HypothesisElement hyp = vecHypo[*posI];
            const set<int>& covered = hyp.oldphrase;

            int len = phraseSnippet.size();
            for (int j = 0; j < len; j++)
            {
                fPosition posTmp = phraseSnippetPosition[j];

                // Skip the span if ANY of its positions is already covered.
                // Testing only the two end points would accept a span that
                // straddles a covered word in its interior.
                set<int>::const_iterator firstCovered = covered.lower_bound(posTmp.start);
                if (firstCovered != covered.end() && *firstCovered <= posTmp.end)
                {
                    continue;
                }

                candiPhrase::iterator pos;
                for (pos = phraseSnippet[j]->begin(); pos != phraseSnippet[j]->end(); ++pos)
                {
                    const aboutEPhrase& onePhrase = **pos;

                    int newhyp = findProper(vecNotUsed);
                    if (newhyp == -1) {
                        // Pool exhausted: double it. The first new slot is
                        // taken right away, so only the remaining new slots
                        // go onto the free list; otherwise the slot could be
                        // handed out a second time while still in use.
                        const int oldsize = vecHypo.size();
                        const int newsize = (oldsize > 0) ? 2 * oldsize : 1;
                        vecHypo.resize(newsize);
                        for (int n = oldsize + 1; n < newsize; n++)
                        {
                            vecNotUsed.push_back(n);
                        }
                        newhyp = oldsize;
                    }

                    // The pool is not resized again before recombineAndbeam(),
                    // and `cand` is not used after that call.
                    HypothesisElement& cand = vecHypo[newhyp];
                    cand.thisID = ++HypothesisElement::baseID;
                    cand.lastID = hyp.thisID;
                    cand.baseScore = hyp.totalScore - hyp.futureScore;
                    cand.transScore = onePhrase.pC;
                    cand.newPhrase = onePhrase.ephrase;

                    // Distortion: distance between the end of the previous
                    // span and the start of this one. Jumps within DISLENGHT
                    // (or any jump when DISLENGHT == 0) are scored linearly;
                    // longer ones get the fixed DISTORTIONLIMIT score.
                    int translationCost = abs(hyp.lastPos.end + 1 - posTmp.start);
                    if ((DISLENGHT == 0) || (translationCost <= DISLENGHT)) {
                        cand.distortionScore = translationCost * para.weight_d * (-1);
                    }
                    else
                    {
                        cand.distortionScore = DISTORTIONLIMIT;
                    }
                    cand.wordsPenalty = onePhrase.ephrase.size() * para.word_penalty * (-1);
                    cand.lastPos = posTmp;
                    cand.oldphrase = hyp.oldphrase;
                    if (printmore) {
                        cout << "creating hypothesis " << cand.thisID << " from "<< cand.lastID << endl;
                        cout << "\tbase score " << cand.baseScore << endl;
                        cout << "\ttranslation cost " << cand.transScore << endl;
                        cout << "\tdistortion cost " << cand.distortionScore << endl;
                    }
                    for (int t = posTmp.start; t <= posTmp.end; t++)
                    {
                        cand.oldphrase.insert(t);
                    }
                    cand.lmScore = lmCal(hyp.lastEWI, hyp.lastEWII, onePhrase.ephrase, cand.lastEWI, cand.lastEWII);

                    // Once every source word is covered, append </s> to the
                    // target sentence. The "+ 1" accounts for the -1 that was
                    // inserted into the coverage set beforehand.
                    if (cand.oldphrase.size() == stackSize + 1) {
                        int str1, str2;
                        int endofsent = enVcb->getIndex("</s>");
                        vector<int> vecSentEnd;
                        vecSentEnd.push_back(endofsent);
                        double tail = lmCal(cand.lastEWI, cand.lastEWII, vecSentEnd, str1, str2);
                        cand.lmScore += tail;
                    }
                    cand.futureScore = fcCal(cand.oldphrase, stackSize);
                    cand.totalScore = cand.baseScore + cand.transScore + cand.distortionScore + cand.lmScore
                        + cand.wordsPenalty + cand.futureScore;
                    if (printmore) {
                        cout << "\tword penalty " << cand.wordsPenalty << endl;
                        cout << "\tscore " << cand.totalScore - cand.futureScore << " + futureCost " << cand.futureScore << " = " << cand.totalScore << endl;
                    }

                    // Link to the parent and offer the hypothesis to its stack.
                    cand.prev = *posI;
                    recombineAndbeam(newhyp);
                }
            }
        }
    }
    if (printmore) {
        cout << "decode finished !" << endl;
    }
    if (NBEST == 1) {
        return findBest();
    }
    else {
        // Output name: fileName + "." + one '0' per source word.
        string suffix = ".";
        for (int k = 0; k < stackSize; k++) {
            suffix += "0";
        }
        suffix = fileName + suffix;
        findNBest(suffix);
        return suffix;
    }
}
void Hypothesis::recombineAndbeam(int newHyp)
{
int len = vecHypo[newHyp].oldphrase.size() - 1;//which stack to input
int stackIsize = hypothesisStack[len].size();
if (stackIsize != 0)
{
if (vecHypo[newHyp].totalScore - THRESHOLD[len] + threshold > avs) {
THRESHOLD[len] = vecHypo[newHyp].totalScore + threshold;
if(printmore) {
cout << "new best estimate for this stack" << endl;
}
}
if (vecHypo[newHyp].totalScore - THRESHOLD[len] > avs)
{
//THRESHOLD = newHyp.totalScore + threshold;
multisetHYE::iterator POS = hypothesisStack[len].begin();
?? 快捷鍵說(shuō)明
復(fù)制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號(hào)
Ctrl + =
減小字號(hào)
Ctrl + -