?? lmodel.c
字號:
*ptr2 = '\0'; /* Get name ID */ nid = GetNameId(lm->classH, ptr, TRUE); /* Find class name */ ptr = ptr2 + 1; /* Pass over NULL */ ptr += strspn(ptr, " \t"); /* Skip whitespace */ ptr2 = ptr + strcspn(ptr, " \t"); /* Find end of class name */ *ptr2 = '\0'; nid2 = GetNameId(lm->htab, ptr, TRUE); /* Get name id of class */ ptr = ptr2 + 1; /* Pass over NULL */ ptr += strspn(ptr, " \t"); /* Skip over whitespace */ /* Create structure storing word|class probability and class of word */ wordProb = New(&gcheap, sizeof(WordProb)); wordProb->class = nid2; wordProb->prob = atof(ptr); wordProb->id = -1; nid->ptr = wordProb; /* Point word name id here */ /* Set up binMap equivalent */ lm->classBM[loop] = nid; LM_INDEX(nid) = -i; /* assign negative indices (copied code) */ loop++; } /* Check for left over lines */ i=0; while (GetInLine(src, line)) { if (strlen(line)>0) { if (i>10) { HError(-15451, "ReadClassProbs: Further extraneous lines not shown"); break; } HError(-15451, "ReadClassProbs: Extraneous line on end of Word|Class probabilities file\n('%s')", line); i++; } }}/* EXPORT-> LoadLangModel: read N-gram language model from fn */BackOffLM *LoadLangModel(char *fn, WordMap *wl, float gramScale, LMProbType tgtPType, MemHeap *heap){ Source src; NGramInfo *gi; BackOffLM *lm; int *itran,nSize,i,n; char c,sfmt[256]; char lnBuf[MAXSYMLEN]; Boolean isUltra; char *first_line; /* First line of input file */ char wc_fname[MAXSYMLEN]; /* Filename of word|class probs */ Source wcSrc; /* word|class probs/counts file */ int nWords; /* Number of words in total over all classes */ char *ptr; if ((tgtPType&LMP_FLOAT) && (tgtPType&LMP_LOG)) HError(15430,"LoadLangModel: Incompatible probability kind requested: %d",tgtPType); if (InitSource(fn,&src,LangModFilter)!=SUCCESS) /* Open LM file */ HError(15410,"Unable to open language model file"); if (trace&T_LOAD) { printf("Loading language model from %s\n", fn); fflush(stdout); } lm = (BackOffLM *) New(heap,sizeof(BackOffLM) * 2); lm->heap = heap; lm->htab = CreateHashTable(11731,"Back-off LM hash table"); lm->gScale = gramScale; lm->fe_buff = NULL; lm->se_buff = NULL; lm->binMap = NULL; lm->classH = NULL; lm->classLM = FALSE; /* default to not a class-based LM */ lm->classBM = NULL; lm->classW = 0;#ifdef HTK_CRYPT lm->encrypt = (src.crypt!=NULL);#endif for (gi=lm->gInfo, i=1; i<LM_NSIZE; i++,gi++) { gi->nEntry = 0; gi->fmt = LMF_OTHER; gi->aInfo = NULL; gi->boInfo = NULL; } /* Have a look at the input file to see if it's a word|class count/probability file. If so it will link to the 'real' class language model, so load in these probabilities, and then continue to load the class gram counts from a standard language model as if it was the only original input. */ /* Read first line from input LM file */ GetInLine(&src, lnBuf); /* See if it's a multi-file class-based LM */ if (strncmp(lnBuf, "Class-based LM", 14)==0) { /* Class-based LM */ if (trace & T_LOAD) { printf("Loading a multi-file class-based language model\n"); } /* Read filename of word|class probs/counts */ GetInLine(&src, lnBuf); ptr = strchr(lnBuf, ':'); if (!ptr) HError(15450, "LoadLangModel: Class language model file is in unknown format"); ptr++; ptr += strspn(ptr, " \t"); strcpy(wc_fname, ptr); /* Read filename of class|class bigrams */ GetInLine(&src, lnBuf); ptr = strchr(lnBuf, ':'); if (!ptr) HError(15450, "LoadLangModel: Class language model file is in unknown format"); ptr++; ptr += strspn(ptr, " \t"); /* NOTE: ptr content is used later on in this function to load in the class n-grams */ /* Close input file (ignore anything left in the file) */ CloseSource(&src); /* Load in word|class counts/probabilities file header */ wcSrc.f = NULL; /* No existing file */ ReadClassProbsHeader(wc_fname, &nWords, &wcSrc, lm); /* This sets lm->classCounts if it reads the appropriate header; otherwise probabilities */ /* Allocate hash table for words */ lm->classH = CreateHashTable((nWords/3)+1, "LM word/classes map"); /* Allocate space for vocabulary map for words */ lm->classBM = (NameId *) New(lm->heap, nWords*sizeof(NameId)); lm->classBM--; /* indexed from 1 (this is to make it work the same way as binMap) */ /* This is really nasty so be careful if modifying code using classBM (or binMap) */ /* This is a class-based LM (flag is toggled when backing off in GetNGramProb) */ lm->classLM = TRUE; /* Store number of vocab words */ lm->classW = nWords; /* We can either load probabilities or counts; counts require extra storage */ if (lm->classCounts) { int j; /* Allocate word count storage space (totals allocated once we know #classes) */ lm->word = New(&gcheap, nWords * sizeof(int)); for (j=0; j<nWords; j++) { lm->word[j] = 0; } } /* Open class|class n-grams */ if (InitSource(ptr, &src, LangModFilter)!=SUCCESS) /* ptr is n-gram file name */ HError(15410, "LoadLangModel: Unable to open class|class n-gram language model file"); if (trace&T_LOAD) { printf("Loading class n-grams from %s\n", ptr); fflush(stdout); } first_line = NULL; /* Read first line from class n-gram LM */ } /* See if it's a single-file class LM */ else if (strncmp(lnBuf, "CLASS MODEL", 11)==0) { if (trace & T_LOAD) { printf("Loading a class-based language model\n"); } /* Load in word|class counts/probabilities header */ wcSrc = src; /* Copy structure */ ReadClassProbsHeader("", &nWords, &wcSrc, lm); /* This sets lm->classCounts if it reads the appropriate header; otherwise probabilities */ /* Allocate hash table for words */ lm->classH = CreateHashTable((nWords/3)+1, "LM word/classes map"); /* Allocate space for vocabulary map for words */ lm->classBM = (NameId *) New(lm->heap, nWords*sizeof(NameId)); lm->classBM--; /* indexed from 1 (this is to make it work the same way as binMap) */ /* This is really nasty so be careful if modifying code using classBM (or binMap) */ /* This is a class-based LM (flag is toggled when backing off in GetNGramProb) */ lm->classLM = TRUE; /* Store number of vocab words */ lm->classW = nWords; /* We can either load probabilities or counts; counts require extra storage */ if (lm->classCounts) { int j; /* Allocate word count storage space (totals allocated once we know #classes) */ lm->word = New(&gcheap, nWords * sizeof(int)); for (j=0; j<nWords; j++) { lm->word[j] = 0; } } /* Open class|class n-grams */ if (trace&T_LOAD) { printf("Reading class n-gram counts\n"); fflush(stdout); } first_line = NULL; /* Read first line from current open file */ } else { first_line = lnBuf; /* We've already read the first line */ } ReadHeaderInfo(&src, lm, first_line); /* First line of input is passed (or NULL) */ if ((lm->probType&tgtPType)==0) HError(15430,"LoadLangModel: Unable to convert %d to %d pkind", lm->probType,tgtPType); lm->probType &= tgtPType; isUltra = FALSE; for (gi=lm->gInfo+1, nSize=1; nSize<LM_NSIZE; nSize++,gi++) { sprintf(sfmt, "ngram %d%%c%%d", nSize); if (GetInLine(&src,lnBuf)==NULL) HError(15450,"LoadLangModel: EOF whilst parsing n-gram info"); if (sscanf(lnBuf, sfmt, &c, &n)==2) { if (trace&T_LOAD) printf("%s\n", lnBuf); gi->nEntry = n; switch (c) { case '=': gi->fmt = LMF_TEXT; break; case '~': gi->fmt = LMF_BINARY; break; case '#': gi->fmt = LMF_ULTRA; isUltra = TRUE; break; default : HError(15450,"LoadLangModel: Unknown LM file format (%s)",lnBuf); } } else break; } if (--nSize < 1) HError(15450, "LoadLangModel: Unable to identify file %s", fn); lm->nSize = nSize; /* initialise vocabulary size and lookup table */ lm->vocSize = (wl==NULL) ? lm->gInfo[1].nEntry : wl->used; lm->binMap = (NameId *) New(lm->heap,(lm->vocSize)*sizeof(NameId)); lm->binMap--; /* indexed from 1 - beware if altering the code! This is really nasty! */ if (wl!=NULL) { NameId wdid; if (isUltra) HError(15440,"LoadLangModel: Cannot prune models in ultra format"); itran = (int *) New(&gstack,(lm->gInfo[1].nEntry+1)*sizeof(int)); for (i=1; i<=lm->vocSize; i++) { wdid = GetNameId(lm->htab,wl->id[i-1]->name,TRUE); lm->binMap[i] = wdid; LM_INDEX(wdid) = -i; /* assign negative indices */ } } else { itran = NULL; for (i=1; i<=(lm->vocSize); i++) lm->binMap[i]=NULL; } if ((lm->vocSize > USHRT_MAX) && (sizeof(LM_Id)==sizeof(UShort))) HError(15445,"LoadLangModel: Unable to load %d unigrams using %d-byte IDs", lm->vocSize,sizeof(LM_Id)); /* initialise auxilliary structures */ lm->lmvec = (float *) New(lm->heap,(lm->vocSize )*sizeof(float)); lm->lmvec--; /* indexed from 1 (hmmmmm) */ lm->fe_buff = (FLEntry *) New(lm->heap,(lm->vocSize )*sizeof(FLEntry)); lm->se_buff = (SMEntry *) New(lm->heap,(lm->vocSize )*sizeof(SMEntry)); if (isUltra) { /* ultra file format */#ifdef ULTRA_LM unsigned short key[KEY_LENGTH]; if (strstr(lnBuf,"KEY: ")==NULL) HError(15450,"LoadLangModel: Unable to find KEY (%s)",lnBuf); ultraKey[KEY_LENGTH-1] = (vaxOrder && natReadOrder) ? 1 : 0; for (strtok(lnBuf," "),i=0; i<KEY_LENGTH; i++) { if ((s=strtok(NULL," "))==NULL) HError(15450,"LoadLangModel: Unable to read key[%d] (%s)",i,lnBuf); key[i] = strtol(s,(char **)NULL,16); if (key[i]!=ultraKey[i]) HError(15450,"LoadLangModel: key[%d] mismatch %02x - should be %02x\n", i, key[i], ultraKey[i]); } LoadNGram(&src,1,lm,NULL); LoadUltraNGrams(&src,lm);#else HError(15490,"LoadLangModel: Ultra format LMs not supported");#endif } else { /* text or binary file format */ for (i=1; i<=nSize; i++) lm->gInfo[i].nEntry = LoadNGram(&src,i,lm,itran); } if (itran!=NULL) Dispose(&gstack,itran); SyncStr(&src,"\\end\\"); if (wcSrc.f != src.f) CloseSource(&src); for (i=1; i<lm->nSize; i++) { if (lm->gInfo[i].nEntry==0) { HError(-15460,"LoadLangModel: Model order changed from %d-gram to %d-gram", lm->nSize,i-1); lm->nSize=i-1; break; } } /* Build reverse look-up for use when recreating context from an FLEntry pointer */ CreateReverseLookup(&(lm->root)); if (lm->classLM) { if (lm->classCounts) { /* Load in given word|class count file(s) */ if (trace & T_LOAD) printf("Loading word-in-class counts\n"); ReadClassCounts(&wcSrc, nWords, lm); /* Allocate space for and count class totals for each LM */ CountClassTotals(lm); /* Calculate static/initial word|class probabilities */ CalcWordClassProbs(lm); } else { if (trace & T_LOAD) printf("Loading word-in-class probabilities\n"); /* Load in word|class probabilities file */ ReadClassProbs(&wcSrc, nWords, lm); } CloseSource(&wcSrc); } if (trace & T_LOAD) printf("Language model import complete (%d words; %s model)\n", lm->classW, lm->classLM?"class":"word"); return lm;}/*------------------------- LM saving -------------------------*//* WriteNGram: recursive write routine */static int WriteNGram(FILE *f, BackOffLM *lm, FLEntry **feStack, int g, int nSize, Boolean intId){ NGramInfo *gi; int i,j,ndx,nItem; SMEntry *se; FLEntry *fe,*topFE; Byte fsize,flags; float prob,bowt,iScale; Boolean has_bowt, isBin=FALSE; char *s, *word, context[MAXSYMLEN]; LMProbType ptype; nItem = 0; iScale = 1.0/(lm->gScale*LN10); ptype = lm->probType; if (g < nSize) { topFE = feStack[g-1]; for (fe=topFE->fea, i=0; i<topFE->nfe; i++, fe++) { feStack[g] = fe; nItem += WriteNGram(f,lm,feStack,g+1,nSize,intId); } } else { gi = lm->gInfo+nSize; if (gi->fmt==LMF_BINARY || gi->fmt==LMF_TEXT) isBin = (gi->fmt==LMF_BINARY); else HError(15490,"LoadNGram: Unknown LM file format (%d)\n",gi->fmt); for (*context = '\0',s = context,j=1; j<nSize; j++) { ndx = feStack[j]->ndx; if ((ndx < 1) || (ndx > lm->vocSize)) HError(15490,"WriteNGram: Component %d of %d-gram, FE index (%d)", j,nSize,ndx); word = lm->binMap[ndx]->name; if (htkEsc) word = ReWriteString(word,NULL,ESCAPE_CHAR); sprintf(s,"%s ",word); s+=strlen(s); } topFE = feStack[nSize-1]; for (se = topFE->sea,i=0; i<topFE->nse; i++, se++) { if ((se->ndx < 1) || (se->ndx > lm->vocSize)) { HError(15490,"WriteNGram: Invalid SE index (%d)",se->ndx); } switch (ptype) { case LMP_FLOAT : prob = FLT_TO_LOG10(se->prob); break; case LMP_LOG :#ifdef LM_COMPACT prob = Shrt2Prob(se->prob); break;#else prob = se->prob * iScale; break;#endif default: prob = se->prob; break; } if ((nSize < lm->nSize) && (topFE->nfe>0) && (fe = FindFE(topFE->fea,0,topFE->nfe,se->ndx))!=NULL) { /* if (fe->nse>0) {*/ has_bowt = TRUE; switch (ptype) { case LMP_FLOAT : bowt = FLT_TO_LOG10(fe->bowt); break; case LMP_LOG : bowt = fe->bowt * iScale; break; default: bowt = fe->bowt; break; /* }*/ } } else { has_bowt = FALSE; } if (isBin) { flags = 0; fsize = sizeof(float); if (has_bowt) { flags |= HAS_BOWT; fsize += sizeof(float); } if (intId) { fsize += nSize*sizeof(UInt); flags |= INT_LMID; } else { fsize += nSize*sizeof(UShort); } fwrite(&fsize, sizeof(Byte),1,f); /* size field */ fwrite(&flags, sizeof(Byte),1,f); /* flags field */ WriteFloat(f,&prob,1,TRUE); /* probability */ if (flags&INT_LMID) { UInt x; for (j=1; j<nSize; j++) { x = (UInt) feStack[j]->ndx; WriteInt(f,(int *)&x,1,TRUE); } x = (UInt) se->ndx; WriteInt(f,(int *)&x,1,TRUE); } else { UShort x; for (j=1; j<nSize; j++) {
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -