?? lmodel.c
字號:
ptype = lm->probType; if (FRead(&cneCnt,sizeof(int32),1,src)!=1) HError(15450,"LoadUltraNGrams: Unable to read CNEntry count"); if (mustSwap) SwapInt32(&cneCnt); if (FRead(&seCnt,sizeof(int32),1,src)!=1) HError(15450,"LoadUltraNGrams: Unable to read SEntry count"); if (mustSwap) SwapInt32(&seCnt); if (trace&T_LOAD) printf("Loading %d NEntry(s)\n",cneCnt); cneBuf = (CNEntry *) New(&gstack,cneCnt*sizeof(NEntry)); if (FRead(cneBuf,sizeof(CNEntry),cneCnt,src)!=cneCnt) HError(15450,"LoadUltraNGrams: Unable to read CNEntry array"); if (mustSwap) { for (cne=cneBuf,i=0; i<cneCnt; i++,cne++) SWAP_CE(cne); } if (trace&T_LOAD) printf("Loading %d SEntry(s)\n",seCnt); /* create and read SMEntry block */ smeBuf = (SMEntry *) New(lm->heap,seCnt*sizeof(SMEntry)); for (sme=smeBuf,i=0; i<seCnt; i++,sme++) { if (FRead(&se,sizeof(SEntry),1,src)!=1) HError(15450,"LoadUltraNGrams: Unable to read SEntry (%d)",i); if (mustSwap) SWAP_SE((&se));#ifdef LM_COMPACT sme->prob = se.prob;#else prob = PROB_SHORT_TO_LOG(se.prob); sme->prob = (ptype==LMP_FLOAT) ? LOG10_TO_FLT(prob) : prob * scale;#endif sme->ndx = se.word; } /* create table of pointers to SMEntry arrays */ smeTab = (SMEntry **) New(&gstack,cneCnt*sizeof(SMEntry *)); for (sme=smeBuf,j=0; j<cneCnt; j++) { smeTab[j] = (cneBuf[j].nse==0) ? NULL : (SMEntry *) sme; sme += cneBuf[j].nse; } /* create and sort lookup index array */ cneTab = (int *) New(&gstack,sizeof(int)*cneCnt); for (i=0; i<cneCnt; i++) cneTab[i] = i; qs_cneBuf = cneBuf; qsort(cneTab,cneCnt,sizeof(int),nep_cmp); feBuf = (FLEntry *) New(lm->heap,cneCnt*sizeof(FLEntry)); parent = &lm->root; CNE2FE(cneTab[0],parent); parent->nfe = 0; parent->fea = cfe = feBuf; parent->parent = 0; for (i=0; i<NSIZE-1; i++) context[i] = cneBuf[cneTab[1]].word[i]; for (i=1; i<cneCnt; i++) { cne = cneBuf+cneTab[i]; for (newCTX=FALSE,j=1; j<NSIZE-1; j++) { if (context[j]!=cne->word[j]) { newCTX=TRUE; break; } } if (newCTX) { for (parent=&lm->root,j=NSIZE-2; j>0; j--) { if ((idx=cne->word[j])==0) continue; if ((parent = FindFE(parent->fea,0,parent->nfe,idx))==NULL) { HError(15450,"LoadUltraNGrams: Items not in order %d",i); } } parent->fea = cfe; parent->nfe = 0; for (j=0; j<NSIZE-1; j++) context[j] = cne->word[j]; } parent->nfe++; CNE2FE(cneTab[i],cfe); cfe++; } Dispose(&gstack,cneBuf);}static int WriteNEntry(FILE *f, BackOffLM *lm, int lev, FLEntry **feBuf, FLEntry **feTab, int *fetCount){ int i,total; CNEntry ne; float scale,bowt; LMProbType ptype; FLEntry *fe,*tgtFE; Boolean mustSwap = (vaxOrder && !natWriteOrder); if (lev==lm->nSize) return 0; ptype = lm->probType; scale = 1.0/(lm->gScale*LN10); tgtFE = feBuf[lev-1]; total = 0; INIT_CNE(ne); for (i=1; i<lev; i++) ne.word[lev-i] = feBuf[i]->ndx; for (fe = tgtFE->fea, i=0; i<tgtFE->nfe; i++, fe++) { if (fe->nse==0) continue; feTab[(*fetCount)++] = fe; ne.word[0] = fe->ndx; ne.nse = fe->nse; bowt = (ptype==LMP_FLOAT) ? FLT_TO_LOG10(fe->bowt) : fe->bowt*scale; /* convert to LOG10 */ ne.bowt = BOWT_LOG_TO_SHORT(bowt); /* compress LOG10 to short */ if (mustSwap) { SWAP_CE((&ne)); fwrite(&ne,sizeof(CNEntry),1,f); SWAP_CE((&ne)); } else { fwrite(&ne,sizeof(CNEntry),1,f); } total++; } if (++lev < lm->nSize) { for (fe = tgtFE->fea, i=0; i<tgtFE->nfe; i++, fe++) { feBuf[lev-1] = fe; total += WriteNEntry(f,lm,lev,feBuf,feTab,fetCount); } } return total;}static int WriteSEntry(FILE *f,BackOffLM *lm,FLEntry **feTab, int fetCount){ SEntry se; SMEntry *sme; FLEntry *fe; int i,j,total = 0; float scale,prob; LMProbType ptype; Boolean mustSwap = (vaxOrder && !natWriteOrder); ptype = lm->probType; scale = 1.0/(lm->gScale*LN10); total = 0; for (i=0; i<fetCount; i++) { fe = feTab[i]; for (sme=fe->sea,j=0; j<fe->nse; j++,sme++) { prob = (ptype==LMP_FLOAT) ? FLT_TO_LOG10(sme->prob) : sme->prob*scale; se.prob = PROB_LOG_TO_SHORT(prob); /* LOG10 -> short */ se.word = sme->ndx; if (mustSwap) { SWAP_SE((&se)); fwrite(&se,sizeof(SEntry),1,f); SWAP_SE((&se)); } else { fwrite(&se,sizeof(SEntry),1,f); } total++; } } return total;}static void CountEntries(int lev, int nSize, FLEntry *tgtFE, int *nfe, int *nse){ int i; FLEntry *fe; *nse += tgtFE->nse; if (lev < nSize) *nfe += tgtFE->nfe; for (fe = tgtFE->fea, i=0; i<tgtFE->nfe; i++, fe++) CountEntries(lev+1,nSize,fe,nfe,nse);}static void SaveUltraNGrams(FILE *f, BackOffLM *lm){ int n,neCnt,seCnt,fetCount; CNEntry ne; FLEntry *feBuf[LM_NSIZE], **feTab; Boolean mustSwap = (vaxOrder && !natWriteOrder); fprintf(f,"\n\\N-grams:\n"); neCnt = seCnt = 0; CountEntries(1,lm->nSize,&lm->root,&neCnt,&seCnt); neCnt++; if (mustSwap) { SwapInt32(&neCnt); fwrite(&neCnt,sizeof(int32),1,f); SwapInt32(&neCnt); } else { fwrite(&neCnt,sizeof(int32),1,f); } if (mustSwap) { SwapInt32(&seCnt); fwrite(&seCnt,sizeof(int32),1,f); SwapInt32(&seCnt); } else { fwrite(&seCnt,sizeof(int32),1,f); } INIT_CNE(ne); /* write the root entry */ ne.nse = lm->root.nse; if (mustSwap) { SWAP_CE((&ne)); fwrite(&ne,sizeof(CNEntry),1,f); SWAP_CE((&ne)); } else { fwrite(&ne,sizeof(CNEntry),1,f); } feTab = (FLEntry **) New(&gstack,neCnt*sizeof(FLEntry *)); fetCount = 0; feTab[fetCount++] = &lm->root; if (lm->nSize > 1) { feBuf[0] = &lm->root; WriteNEntry(f,lm,1,feBuf,feTab,&fetCount); if (trace&T_SAVE) { printf("saved %d CNEntry(s), (%d)\n",fetCount,neCnt); fflush(stdout); } } n = WriteSEntry(f,lm,feTab,fetCount); if (trace&T_SAVE) { printf("saved %d SEntry(s), (%d)\n",n,seCnt); fflush(stdout); } Dispose(&gstack,feTab);}#endif /* ULTRA_LM *//*------------------------- LM loading -------------------------*/#define READ_FLOAT(src,x,bin) { \ char buf[100]; \ if (!ReadFloat(src,x,1,bin)) \ HError(15490,"ReadFloat: Float expected at %s",SrcPosition(*src,buf)); \}/* EXPORT-> StoreFEA: move fea array into permanent location */void StoreFEA(FLEntry *fe, MemHeap *heap){ FLEntry *febuf; if (fe==NULL) return; if (fe->nfe==0) { fe->fea = NULL; } else { qsort(fe->fea, fe->nfe, sizeof(FLEntry), CmpFE); febuf = (FLEntry *) New(heap,fe->nfe*sizeof(FLEntry)); fe->fea = memcpy(febuf, fe->fea, fe->nfe*sizeof(FLEntry)); }}/* EXPORT-> StoreSEA: move fea array into permanent location */void StoreSEA(FLEntry *fe, MemHeap *heap){ SMEntry *sebuf; if (fe==NULL) return; if (fe->nse==0) { fe->sea = NULL; } else { qsort(fe->sea, fe->nse, sizeof(SMEntry), CmpSE); sebuf = (SMEntry *) New(heap,fe->nse*sizeof(SMEntry)); fe->sea = memcpy(sebuf, fe->sea, fe->nse*sizeof(SMEntry)); }}/* LoadUnigram: read the unigram part of a file */static int LoadUnigram(Source *src, BackOffLM *lm, int *itran){ char word[256]; int i,tndx,nItem; float bowt,prob,scale; LM_Id ndx; SMEntry *se; FLEntry *fe; NameId wdid; LMProbType ptype; Boolean has_bowt; nItem = 0; scale = lm->gScale*LN10; ptype = lm->probType; lm->root.sea = se = lm->se_buff; lm->root.nse = 0; lm->root.fea = fe = lm->fe_buff; lm->root.nfe = 0; lm->root.bowt = 0.0; lm->root.parent = 0; SyncStr(src,"\\1-grams:"); for (i=1; i<=lm->gInfo[1].nEntry; i++){ READ_FLOAT(src,&prob,FALSE); if (!GetSrcString(src,word,htkEsc)) HError(15413,"LoadUnigram: Unable to read unigram %d",i); SkipWhiteSpace(src); if (!src->wasNewline) { /* process backoff weight */ READ_FLOAT(src,&bowt,FALSE); has_bowt = TRUE; } else { has_bowt = FALSE; } if (itran!=NULL) { if ((wdid = GetNameId(lm->htab,word,FALSE))==NULL) { itran[i] = -1; continue; } if ((tndx = LM_INDEX(wdid)) > 0) HError(15450,"LoadUnigram: Duplicate unigram %s",word); ndx = itran[i] = -tndx; /* indices pre-assigned as negative */ } else { wdid = GetNameId(lm->htab,word,TRUE); ndx = i; } nItem++; lm->binMap[ndx] = wdid; /* This is where the wordlist is built */ se->ndx = LM_INDEX(wdid) = ndx; switch(ptype) { case LMP_FLOAT : se->prob = LOG10_TO_FLT(prob); break; case LMP_LOG :#ifdef LM_COMPACT se->prob = Prob2Shrt(prob); break;#else se->prob = prob * scale; break;#endif default: if (prob < 0.0) HError(15450,"LoadUnigram: Negative probability (%.4f) for unigram %d", prob,i); se->prob = prob; break; } se++; lm->root.nse++; if (has_bowt) { /* process backoff weight */ fe->ndx = ndx; fe->nse = 0; fe->sea = NULL; fe->nfe = 0; fe->fea = NULL; switch(ptype) { case LMP_FLOAT : fe->bowt = LOG10_TO_FLT(bowt); break; case LMP_LOG : fe->bowt = bowt*scale; break; default : fe->bowt = bowt; break; } fe++; lm->root.nfe++; } } if (itran!=NULL && nItem!=lm->vocSize) { /* create dummy entries for unseen unigrams */ for (i=1; i<=lm->vocSize; i++) { if ((tndx = LM_INDEX(lm->binMap[i])) > 0) continue; LM_INDEX(lm->binMap[i]) = -tndx; se->ndx = -tndx; se->prob = 0.0; se++; lm->root.nse++; } } StoreFEA(&(lm->root),lm->heap); StoreSEA(&(lm->root),lm->heap); /* check unigram consistency */ for (se=lm->root.sea, i=0; i<lm->root.nse; i++, se++) { if (se->ndx!=i+1) HError(15450, "LoadUnigram: Mismatched unigram index %d should be %d", se->ndx, i+1); } return nItem;}#define TRINDEX(itran,i) (itran==NULL) ? i : itran[i]/* LoadNGram: read n-gram (N>1) from file f */static int LoadNGram(Source *src, int nSize, BackOffLM *lm, int *itran){ LM_Id ndx=0; NGramInfo *gi; LMProbType ptype; Byte fsize, flags; SMEntry *se=NULL; FLEntry *feptr=NULL, *fe=NULL; float prob,bowt,scale; int i,j,k,num_fe,num_se; /*,n*/ char *s,lnBuf[256],word[256]; Boolean has_bowt, hasOOV, newCTX, isBin=FALSE; NameId wdid[LM_NSIZE], keyid[LM_NSIZE]; if (nSize==1) { return LoadUnigram(src,lm,itran); } scale = lm->gScale*LN10; ptype = lm->probType; gi = lm->gInfo+nSize; if (gi->fmt==LMF_BINARY || gi->fmt==LMF_TEXT) isBin = (gi->fmt==LMF_BINARY); else HError(15450,"LoadNGram: Unknown LM file format (%d)\n",gi->fmt); if (trace&T_LOAD) { printf("Loading %d %d-grams (%s)\n", lm->gInfo[nSize].nEntry,nSize,isBin ? "bin":"text"); fflush(stdout); } num_fe = num_se = 0; keyid[0] = NULL; /* Previous context */ sprintf(lnBuf, "\\%d-grams:",nSize); SyncStr(src,lnBuf); for (i=0; i<lm->gInfo[nSize].nEntry; i++) { has_bowt = FALSE; hasOOV = FALSE; if (isBin) { /* binary model */ fsize = (Byte) GetCh(src); flags = (Byte) GetCh(src); READ_FLOAT(src,&prob,TRUE); for (j=0; j<nSize; j++) { if (flags&INT_LMID) { UInt a; ReadInt(src,(int *)&a,1,TRUE); ndx = (LM_Id) a; } else { UShort a; ReadShort(src,(short *)&a,1,TRUE); ndx = (LM_Id) a; } if (itran!=NULL && itran[ndx]<0) { hasOOV = TRUE; } else { if (itran!=NULL) ndx = itran[ndx]; if ((ndx > 0) && (ndx <=lm->vocSize)) wdid[j] = lm->binMap[ndx]; else HError(15450,"LoadNGram: LM index out of bounds (%d)", ndx); } } if (flags&HAS_BOWT) { READ_FLOAT(src,&bowt,TRUE); has_bowt = TRUE; } } else { /* text model */
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -