/* lmodel.c: back-off language model save and access routines */
                  x = (UShort) feStack[j]->ndx;
                  WriteShort(f,(short *)&x,1,TRUE);
               }
               x = (UShort) se->ndx;
               WriteShort(f,(short *)&x,1,TRUE);
            }
            if (flags&HAS_BOWT)
               WriteFloat(f,&bowt,1,TRUE);            /* back-off weight */
         } else {
            fprintf(f, "%+.4f",prob);
            fprintf(f, "\t%s",context);
            word = lm->binMap[se->ndx]->name;
            if (htkEsc)
               word = ReWriteString(word,NULL,ESCAPE_CHAR);
            fprintf(f, "%s",word);
            if (has_bowt)
               fprintf(f, "\t%+.4f",bowt);
            fprintf(f, "\n");
         }
         nItem++;
      }
   }
   return nItem;
}

/* SaveNGram: write the G-grams of lm to file f */
static int SaveNGram(FILE *f, int G, BackOffLM *lm)
{
   int total;
   Byte fsize;
   FLEntry *feStack[LM_NSIZE];
   Boolean useIntID;

   if (lm->vocSize > USHRT_MAX) {
      if (sizeof(LM_Id) <= sizeof(UShort))
         HError(15445,"SaveNGram: vocSize = %d but using %d-byte IDs",
                lm->vocSize, (int) sizeof(LM_Id));
      useIntID = TRUE;
   } else {
      useIntID = defIntID;
   }
   fprintf(f, "\n\\%d-grams:\n", G);
   feStack[0] = &(lm->root);
   total = WriteNGram(f,lm,feStack,1,G,useIntID);
   if (lm->gInfo[G].fmt==LMF_BINARY) {      /* write out 2 zero bytes */
      fsize = 0;
      fwrite(&fsize, sizeof(unsigned char), 1, f);
      fwrite(&fsize, sizeof(unsigned char), 1, f);
   }
   if (trace&T_SAVE)
      printf("Wrote %d %d-grams\n", total, G);
   return total;
}

/* SaveLangModel: save language model lm to lmFn */
void SaveLangModel(char *lmFn, BackOffLM *lm)
{
   char c=' ';
   int i,n;
   FILE *f;
   NGramInfo *gi;
   Boolean isPipe,isUltra;

#ifdef HTK_CRYPT
   if (lm->encrypt) {
      TMP_OPEN(f,lmFn,HError(15411,"SaveLangModel: Cannot create lm file %s",lmFn));
   } else
#endif
   if ((f = FOpen(lmFn, LangModOFilter, &isPipe))==NULL)
      HError(15411,"SaveLangModel: Unable to open output file %s",lmFn);
   WriteHeaderInfo(f,lm);
   fprintf(f, "\\data\\\n");
   isUltra = FALSE;
   for (gi=lm->gInfo+1,i=1; i<=lm->nSize; i++,gi++) {
      switch (gi->fmt) {
         case LMF_TEXT:   c = '='; break;
         case LMF_BINARY: c = '~'; break;
         case LMF_ULTRA:  c = '#'; isUltra = TRUE; break;
         default:
            HError(15490,"SaveLangModel: Unknown LM file format (%d) for %d-gram",gi->fmt,i);
      }
      fprintf(f, "ngram %d%c%d\n",i,c,gi->nEntry);
   }
   if (isUltra) {
#ifdef ULTRA_LM
      ultraKey[KEY_LENGTH-1] = (vaxOrder && natWriteOrder) ? 1 : 0;
      fprintf(f,"KEY: ");
      for (i=0; i<KEY_LENGTH; i++) fprintf(f,"%02x ",ultraKey[i]);
      fprintf(f,"\n");
      SaveNGram(f,1,lm);
      SaveUltraNGrams(f,lm);
#else
      HError(15490,"SaveLangModel: Ultra format LMs not supported");
#endif
   } else {
      for (i=1; i<=lm->nSize; i++) {
         if ((n=SaveNGram(f,i,lm))!=lm->gInfo[i].nEntry) {
            HError(-15490,"SaveLangModel: %d-gram nEntry = %d, actual saved %d",
                   i,lm->gInfo[i].nEntry,n);
            lm->gInfo[i].nEntry = n;
         }
      }
   }
   fprintf(f, "\n\\end\\\n");
#ifdef HTK_CRYPT
   if (lm->encrypt) {
      FILE *crf;
      TMP_REWIND(f);
      if ((crf = FOpen(lmFn,LangModOFilter,&isPipe)) == NULL) {
         TMP_CLOSE(f,lmFn);
         HError(15411,"SaveLangModel: Cannot create LM file %s",lmFn);
      }
      EncryptFile(lmFn,crf,f);
      FClose(crf,isPipe);
      TMP_CLOSE(f,lmFn);
   } else
#endif
   FClose(f,isPipe);
}
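/* Illustrative usage sketch (not part of the original source): saving a
   loaded model in text format at every n-gram level. Assumes "lm" was
   obtained from the toolkit's model-loading routine; the output file name
   is arbitrary. The "ngram N<c>COUNT" header lines written above use '='
   for text, '~' for binary and '#' for ultra entries at each level. */
#if 0
static void SaveAsText(BackOffLM *lm)
{
   int i;

   for (i = 1; i <= lm->nSize; i++)
      lm->gInfo[i].fmt = LMF_TEXT;     /* force '=' (text) at every level */
   SaveLangModel("final.lm", lm);
}
#endif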
/*---------------------- N-gram access ---------------------- */

/* EXPORT-> GetNGramProb: generic LM access V2 */
float GetNGramProb(BackOffLM *lm, NameId *words, int nSize)
{
   int i;
   float prob;
   SMEntry *se;
   FLEntry *fe;
   AccessInfo *acs;
   LMProbType ptype;
   char *s, sbuf[256];
   static int rLev = -1;
   float prob_mult = 0.0;

   /* N-gram probability lookup works like this:
        1) If we are looking for a unigram, search for an appropriate leaf
           SMEntry at the root level. If we don't find one we must abort
           with an error at this point.
        2) For other lengths, search for the path down the tree to the
           FLEntry for the given history. If we don't find a full history
           path, reduce the context and call ourselves recursively.
        3) If we found the context, look at the SMEntry elements at the
           FLEntry node for our word given that history. If found, return
           the stored probability; otherwise recursively call ourselves with
           a reduced history, applying the back-off weight stored at the
           FLEntry node on return. */

   /* With a class-based language model we are still passed a word history,
      which must first be converted into a class history */
   if (lm->classLM) {
      /* retrieve word-given-class probability for the word to predict */
      prob_mult = ((WordProb*)(words[nSize-1]->ptr))->prob;
      if (trace&T_PROB) {
         if (lm->probType & LMP_FLOAT) {   /* this never happens in practice */
            printf("<w|c mult=%5.2f> ", UNLOG_NATURAL(prob_mult));
         } else {
            printf("<w|c mult=%5.2f> ", prob_mult);
         }
      }
      /* convert the word n-gram into a class n-gram */
      for (i=0; i<nSize; i++)
         words[i] = ((WordProb*)(words[i]->ptr))->class;
   }
   rLev++;
   ptype = lm->probType;
   if (nSize > lm->nSize) {
      words += nSize-lm->nSize;
      nSize = lm->nSize;
   }
   acs = lm->gInfo[nSize].aInfo;
   acs->count++;
   if (trace&T_PROB) {
      printf("[ ");
      printf("(%s",words[nSize-1]->name);
      if (nSize > 1) {
         printf(" |");
         for (i=0; i<nSize-1; i++) printf(" %s",words[i]->name);
      }
      printf(") ");
   }
   if (nSize==1) {               /* look up unigram separately */
      if ((se = FindSE(lm->root.sea,0,lm->root.nse,LM_INDEX(words[0])))==NULL)
         HError(15490,"GetNGramProb: Unable to find %s in unigrams",words[0]->name);
#ifdef LM_COMPACT
      prob = Shrt2Prob(se->prob) * lm->gScale;
#else
      prob = se->prob;
#endif
      if (trace&T_PROB) printf("exact, ");
   } else {                      /* generic n-gram lookup, n>1 */
      for (fe=&(lm->root), i=0; i<nSize-1; i++) {
         if ((fe=FindFE(fe->fea, 0, fe->nfe, LM_INDEX(words[i])))==NULL)
            break;
      }
      if ((fe == NULL) || (fe->nse == 0)) {
         if (lm->classLM) {
            lm->classLM = FALSE;
            prob = GetNGramProb(lm,words+1,nSize-1);
            lm->classLM = TRUE;
         } else {
            prob = GetNGramProb(lm,words+1,nSize-1);
         }
         if (trace&T_PROB) printf("replaced, ");
         acs->nmiss++;
         if ((trace&T_TOP) && (fe != NULL) && (fe->nse == 0)) {
            for (s = sbuf, i=0; i<nSize-1; i++) {
               sprintf(s,"%s ",words[i]->name);
               s += strlen(s);
            }
            HError(-15492, "GetNGramProb: FLEntry.nse==0; original ARPA LM?\n%s",sbuf);
         }
      } else {
         if ((se = FindSE(fe->sea, 0, fe->nse, LM_INDEX(words[nSize-1])))!=NULL) {
#ifdef LM_COMPACT
            prob = Shrt2Prob(se->prob) * lm->gScale;
#else
            prob = se->prob;
#endif
            if (trace&T_PROB) printf("exact, ");
            acs->nhits++;
         } else {
            if (lm->classLM) {
               lm->classLM = FALSE;
               prob = GetNGramProb(lm,words+1,nSize-1);
               lm->classLM = TRUE;
            } else {
               prob = GetNGramProb(lm,words+1,nSize-1);
            }
            if (ptype==LMP_FLOAT)
               prob *= fe->bowt;
            else
               prob += fe->bowt;
            if (trace&T_PROB) printf("backed-off %.4f, ",fe->bowt);
            acs->nboff++;
         }
      }
   }
   if (lm->classLM) {
      if (lm->probType & LMP_FLOAT) {
         /* looks nasty, but in practice this branch never executes */
         prob *= UNLOG_NATURAL(prob_mult);
      } else {
         prob += prob_mult;
      }
   }
   acs->prob  += prob;
   acs->prob2 += prob*prob;
   if (trace&T_PROB)
      printf("prob %.4f ]%s",prob,(rLev==0) ? "\n" : " ");
   rLev--;
   return prob;
}
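/* Illustrative usage sketch (not part of the original source): looking up
   the trigram probability P(c | a b). The GetNameId() lookups mirror the
   ones used by LMTrans2 below; for the usual log-prob models the value
   returned by GetNGramProb is a log probability. The error code here is
   hypothetical. */
#if 0
static float TrigramProb(BackOffLM *lm, char *a, char *b, char *c)
{
   NameId words[3];

   words[0] = GetNameId(lm->htab, a, FALSE);
   words[1] = GetNameId(lm->htab, b, FALSE);
   words[2] = GetNameId(lm->htab, c, FALSE);
   if (!words[0] || !words[1] || !words[2])
      HError(15499, "TrigramProb: word not in vocabulary");
   return GetNGramProb(lm, words, 3);   /* backs off internally as needed */
}
#endif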
"\n" : " "); rLev--; return prob;}/* EXPORT-> LMTrans: calls GetNGramProb, but instead of taking a full n-gram of context we take a pointer to a context and a single word; we also return a langage model context state */LogFloat LMTrans2(LModel *LM, LMState src, LabId word, LMState *dest){ NameId ngram[LM_NSIZE], ngramRev[LM_NSIZE]; int nSize; float prob; NameId nid; LogFloat prob_mult = 0.0; FLEntry *context, *fe; SMEntry *se; BackOffLM *lm; float bo_weight; LMProbType ptype; int i, index; int nShorten; /* Amount to shorten n-gram by when searching for prob */ lm = LM->data.hlmModel; ptype = lm->probType; if (src) { context = (FLEntry *) src; } else { context = &(lm->root); /* No context yet */ } /* Convert word text to NameId */ if (lm->classLM) { /* class model */ nid = GetNameId(lm->classH, word->name, FALSE); if (!nid) HError(15499, "LMTrans: Attempt to predict token '%s' which is not in vocabulary", word); /* Find word-given-class probability and convert to a class */ prob_mult = ((WordProb*)(nid->ptr))->prob; if (trace&T_PROB) { if (ptype & LMP_FLOAT) { /* this first never happens in practice */ printf("<w|c mult=%5.2f> ", UNLOG_NATURAL(prob_mult)); } else { printf("<w|c mult=%5.2f> ", prob_mult); } } } else { /* not a class model */ nid = GetNameId(lm->htab, word->name, FALSE); if (!nid) HError(15499, "LMTrans: Attempt to predict token '%s' which is not in vocabulary", word); } /* We need to reconstruct the context later so do it now incase we need to back off */ fe = context; nSize = 0; while (fe && fe!=&(lm->root) && nSize<LM_NSIZE) { ngramRev[nSize] = lm->binMap[fe->ndx]; fe = fe->parent; nSize++; } if (nSize>=LM_NSIZE) HError(15499, "LMTrans: Context rebuilt to longer than compiled ngram size limit of %d", LM_NSIZE); /* And now we know the length we can reverse it */ for (i=0; i<nSize; i++) ngram[i] = ngramRev[nSize-(i+1)]; ngram[nSize] = nid; nSize++; /* For debugging purposes, print out the full ngram */ /*printf("nsize=%d ", nSize); for (i=0; i<nSize; i++) printf("%s ", ngram[i]->name); printf("\n");*/ /* Search for probability */ if (ptype & LMP_FLOAT) bo_weight = 1; else bo_weight = 0; se = FindSE(context->sea, 0, context->nse, LM_INDEX(nid)); nShorten = 0; fe = context; while (!se) { /* Multiply BO weight and shorten context */ if (ptype & LMP_FLOAT) bo_weight *= fe->bowt; else bo_weight += fe->bowt; nShorten++; if (nShorten==nSize) { /* Unigram probability */ se = FindSE(lm->root.sea, 0, lm->root.nse, LM_INDEX(nid)); if (!se) HError(15490, "LMTrans: Unable to find %s in unigrams", nid->name); } else { /* n>1 */ fe = &(lm->root); for (i=nShorten; i<nSize-1; i++) { fe = FindFE(fe->fea, 0, fe->nfe, LM_INDEX(ngram[i])); if (!fe) HError(15491, "LMTrans: Unable to find shortened context in LM"); } se = FindSE(fe->sea, 0, fe->nse, LM_INDEX(ngram[i])); } }#ifdef LM_COMPACT prob = Shrt2Prob(se->prob) * lm->gScale;#else prob = se->prob;#endif if (ptype & LMP_FLOAT) { prob = prob * bo_weight; } else { prob = prob + bo_weight; } /* Now look for FLEntry for new context for any further following word */ /* Decide from which point in the context we start searching */ if (nSize == lm->nSize) index = 1; else index = 0; do { fe = &(lm->root); for (i=index; i<nSize; i++) { fe = FindFE(fe->fea, 0, fe->nfe, LM_INDEX(ngram[i])); if (!fe) { /* Context not found, so shorten and retry */ index++; break; } } } while (!fe); /* Works because if no context then we don't execute inner loop and fe=&(lm->root) */ *dest = fe; if (lm->classLM) { if (lm->probType & LMP_FLOAT) { /* This looks nasty but in 
/* EXPORT-> GetNGramAddress: same as GetNGramProb but returns the address of
   the structure, which provides a unique id for a particular context; used
   with the Lattice Toolkit. The final word in words[] (i.e. words[nSize-1])
   is a dummy entry which is never used; its value is undefined and should
   not be interpreted. It works this way in order to parallel
   GetNGramProb() */
void *GetNGramAddress(BackOffLM *lm, NameId *words, int nSize)
{
   int i;
   FLEntry *fe;
   char *s, sbuf[256];
   static int rLev