?? lmodel.c
字號:
READ_FLOAT(src,&prob,FALSE); for (j=0; j<nSize; j++) { /* read n-gram words */ if (!GetSrcString(src,word,htkEsc)) HError(15450,"LoadNGram: Unable to read word %d of %d-gram",j,nSize); if ((wdid[j] = GetNameId(lm->htab,word,FALSE))==NULL) { if (itran==NULL) HError(-15450, "LoadNGram: Word %s not in unigrams, skipping n-gram", word); hasOOV = TRUE; } else { ndx = LM_INDEX(wdid[j]); } } SkipWhiteSpace(src); if (!src->wasNewline) { READ_FLOAT(src,&bowt,FALSE); has_bowt = TRUE; } } if (hasOOV) continue; /* See if the context has changed */ for (newCTX=FALSE, j=0; j<nSize-1; j++) { if (keyid[j]!=wdid[j]) { newCTX=TRUE; break; } } /* Guaranteed to execute the first time through the loop because the context has not been seen before (thus defining fe, se etc) */ if (newCTX) { /* new n-gram context */ if (keyid[0]!=NULL) { /* copy to permanent storage */ StoreFEA(feptr,lm->heap); num_fe += feptr->nfe; StoreSEA(feptr,lm->heap); num_se += feptr->nse; } for (feptr = &(lm->root), j=0; j<nSize-1; j++) { if ((feptr = FindFE(feptr->fea, 0, feptr->nfe, LM_INDEX(wdid[j])))==NULL) { for (s=lnBuf,k=0; k<nSize; k++) { sprintf(s,"%s[%d] ",wdid[k]->name,LM_INDEX(wdid[k])); s+=strlen(s); } HError(15420, "LoadNGram: Cannot find component %d of (%d) %d-gram %s", j,i,nSize,lnBuf); } keyid[j] = wdid[j]; } feptr->fea = fe = lm->fe_buff; feptr->nfe = 0; feptr->sea = se = lm->se_buff; feptr->nse = 0; } se->ndx = ndx; switch(ptype) { case LMP_FLOAT : se->prob = LOG10_TO_FLT(prob); break; case LMP_LOG :#ifdef LM_COMPACT se->prob = Prob2Shrt(prob); break;#else se->prob = prob * scale; break;#endif default: se->prob = prob; break; } se++; (feptr->nse)++; if (has_bowt) { /* also store as full entry */ fe->ndx = ndx; fe->nse = 0; fe->sea = NULL; fe->nfe = 0; fe->fea = NULL; switch(ptype) { case LMP_FLOAT : fe->bowt = LOG10_TO_FLT(bowt); break; case LMP_LOG : fe->bowt = bowt*scale; break; default : fe->bowt = bowt; break; } fe++; (feptr->nfe)++; } } if (keyid[0]!=NULL) { /* store the last accumulated 
*/ StoreFEA(feptr,lm->heap); num_fe += feptr->nfe; StoreSEA(feptr,lm->heap); num_se += feptr->nse; } /* if (isBin) { // read the last 2 zero bytes // ReadShort(src,&ndx,1,TRUE); } */ if (trace&T_LOAD) { printf(" SMEntry: %8d x %2d bytes = %d bytes\n", num_se, sizeof(SMEntry), num_se*sizeof(SMEntry)); printf(" FLEntry: %8d x %2d bytes = %d bytes\n", num_fe, sizeof(FLEntry), num_fe*sizeof(FLEntry)); } lm->gInfo[0].nEntry+=num_fe; return num_se;}/* Create reverse lookup pointers in FLEntry context tree *//* Call with (lm->root, 0) and let it recurse its way down */void CreateReverseLookup(FLEntry *fes){ int i; /* loop counter */ for (i=0; i<fes->nfe; i++) { fes->fea[i].parent = fes; CreateReverseLookup(&(fes->fea[i])); }}/* EXPORT-> ReadHeaderInfo: read header information *//* First parameter is source file, second is LM structure, and third parameter is first input line or NULL to read from file */void ReadHeaderInfo(Source *src, BackOffLM *lm, char *line1){ float ff; int i,j,n; char *s,*s1,*s2=NULL; DiscountType dt; BackOffInfo *bo; DiscountInfo *di; char lnBuf[MAXSYMLEN],*sbuf; lm->probType = LMP_FLOAT|LMP_LOG; for (i=1; i<LM_NSIZE; i++) lm->gInfo[i].boInfo = NULL; while(line1 || GetInLine(src, lnBuf)) { if (line1) { strcpy(lnBuf, line1); line1 = NULL; /* Read the rest from the file */ } if ((s=strstr(lnBuf,"\\data\\"))!=NULL && s==lnBuf) { break; /* gone past header, so exit */ } if (strcmp(lnBuf,"COUNTS")==0) { lm->probType = LMP_COUNT; continue; } for (i=1; i<LM_NSIZE; i++) { /* try each n-gram name in turn */ if ((s=strstr(lnBuf,nGramName[i]))==NULL || s!=lnBuf) continue; bo = (BackOffInfo *) New(lm->heap,sizeof(BackOffInfo)); di = &(bo->dcInfo); if (trace&T_LOAD) printf("Parsing %s header info\n",nGramName[i]); for (dt=DC_LAST,j=0; j<DC_LAST; j++) { if (strstr(lnBuf,dcTypeName[j])!=NULL) { dt = j; break; } } if (dt==DC_LAST) HError(15450,"LoadHeaderInfo: Unable to parse d-type in %s",lnBuf); bo->dcType = dt; if ((s1=strstr(lnBuf,"cutoff"))==NULL) 
HError(15450,"LoadHeaderInfo: Unable to find 'cutoff' in %s",lnBuf); if (sscanf(s1,"cutoff %d",&n)!=1) HError(15450,"LoadHeaderInfo: Unable to parse cutoff value in %s",lnBuf); bo->cutOff = n; bo->wdThresh = 0.0; if (!GetInLine(src,lnBuf)) HError(15450,"LoadHeaderInfo: EOF reading d-coefs for %s",nGramName[i]); switch (dt) { case DC_KATZ: if ((s1 = strchr(lnBuf,'['))==NULL || (s2 = strchr(lnBuf,']'))==NULL) HError(15450,"LoadHeaderInfo: Unable to find array bounds in %s",lnBuf); *s2='\0'; sbuf = s2+1; di->tgInfo.kRange = n = atoi(s1+1); di->tgInfo.coef = (float *) New(lm->heap,(n+1)*sizeof(float)); for (j=1; j<=n; j++) { s1 = strtok((j==1)?sbuf:NULL," \t\r\n:"); if (s1==NULL) HError(15450,"LoadHeaderInfo: Unable to parse coef %d in %s",j,lnBuf); di->tgInfo.coef[j]=atof(s1); } break; case DC_ABSOLUTE: if ((s1=strstr(lnBuf,"coef:"))==NULL) HError(15450,"LoadHeaderInfo: Unable to find 'coef:' in %s",lnBuf); if (sscanf(s1,"coef: %f",&ff)!=1) HError(15450,"LoadHeaderInfo: Unable to parse float value in %s",s1); di->bCoef=ff; break; default : HError(15450,"LoadHeaderInfo: Unsupported LM type (%d)",dt); break; } lm->gInfo[i].boInfo = bo; } }}/* EXPORT-> WriteHeaderInfo: write header information */void WriteHeaderInfo(FILE *f, BackOffLM *lm){ int i,j; BackOffInfo *bo; DiscountInfo *di; if (lm->probType==LMP_COUNT) fprintf(f,"COUNTS\n\n"); for (i=2; i<=lm->nSize; i++) { if ((bo = lm->gInfo[i].boInfo)==NULL) continue; di = &(bo->dcInfo); if (bo->wdThresh>0) fprintf(f, "%s: method %s, cutoff %d, wdThresh %.3f\n", nGramName[i], dcTypeName[bo->dcType], bo->cutOff, bo->wdThresh); else fprintf(f, "%s: method %s, cutoff %d\n", nGramName[i], dcTypeName[bo->dcType], bo->cutOff); switch (bo->dcType) { case DC_KATZ : fprintf(f, " coef[%d]:", di->tgInfo.kRange); for (j=1; j<=di->tgInfo.kRange; j++) fprintf(f," %.6f", di->tgInfo.coef[j]); fprintf(f,"\n"); break; case DC_ABSOLUTE : fprintf(f, " coef: %.6f\n", di->bCoef); break; case DC_LINEAR : default: break; } fprintf(f, "\n"); }}/* 
ReadClassProbsHeader: read in word|class probabilities header */static void ReadClassProbsHeader(char *fname, int *nWords, Source *src, BackOffLM *lm){ char line[MAXSYMLEN]; /* Current input line */ char *ptr; /* Temporary pointers */ *nWords = -1; if (!src->f) { /* Open file if necessary */ if (InitSource(fname, src, LangModFilter)!=SUCCESS) { HError(15410, "ReadClassProbsHeader: Unable to open language model word|class file '%s'", fname); } } strcpy(line, ""); GetInLine(src, line); if (strncmp(line, "Word|Class probabilities", 25)==0) { lm->classCounts = FALSE; } else if (strncmp(line, "Word|Class counts", 17)==0) { lm->classCounts = TRUE; } else { HError(15450, "ReadClassProbsHeader: Language model word|class file is in unknown format"); } if (trace & T_LOAD) { printf("Word|class file uses word %s\n", lm->classCounts?"counts":"probabilities"); } while (GetInLine(src, line)) { if (strncmp(line, "Number of classes", 17)==0) { ptr = strchr(line, ':'); if (!ptr) { HError(15450, "ReadClassProbsHeader: Corrupt 'Number of classes' line in word|class file"); } ptr++; while (*ptr==' ' || *ptr=='\t') ptr++; if (trace & T_LOAD) { printf("Number of classes = %d\n", atoi(ptr)); } } else if (strncmp(line, "Number of words", 15)==0) { ptr = strchr(line, ':'); if (!ptr) { HError(15450, "ReadClassProbsHeader: Corrupt 'Number of words' line in word|class file"); } ptr++; while (*ptr==' ' || *ptr=='\t') ptr++; *nWords = atoi(ptr); if (trace & T_LOAD) { printf("Number of words = %d\n", *nWords); } } else if ((strncmp(line, "Word", 4)==0) || (strncmp(line, "Class", 5)==0)) { break; } } if (feof(src->f)) { HError(15450, "ReadClassProbsHeader: Word|Class language model file contains no %s", lm->classCounts?"counts":"probabilities"); } if (*nWords == -1) { HError(15450, "ReadClassProbsHeader: Failed to find number of words header in word|class file"); }}/* ReadClassCounts: read in word|class counts file */static void ReadClassCounts(Source *src, int nWords, BackOffLM *lm){ char 
line[MAXSYMLEN]; /* Current input line */ char *ptr, *ptr2; /* Temporary pointers */ int i; /* Loop counter */ WordProb *wordProb; /* Temporary pointer */ int loop=1; /* Array index counter */ NameId nid, nid2; /* Word ids */ int class_id=0; /* Number classes from 0 */ int floor_count = 0; /* Number of counts floored */ /* Add labels and wordlist entries for words */ for (i=0; i<nWords; i++) { ptr = GetInLine(src, line); if (!ptr || strlen(ptr)==0) { HError(15450, "ReadClassCounts: Blank line/end of file in word|class language model file"); } /* Segment line into word, class and count */ /* Don't use strtok() in case a client program is using it */ ptr2 = ptr + strcspn(ptr, " \t"); /* Find end of word */ *ptr2 = '\0'; /* Get name ID */ nid = GetNameId(lm->classH, ptr, TRUE); /* Find class name */ ptr = ptr2 + 1; /* Pass over NULL */ ptr += strspn(ptr, " \t"); /* Skip whitespace */ ptr2 = ptr + strcspn(ptr, " \t"); /* Find end of class name */ *ptr2 = '\0'; nid2 = GetNameId(lm->htab, ptr, TRUE); /* Get name id of class */ class_id = atoi(ptr+5) - 1; /* assume called CLASSn */ /* GLM */ nid2->ptr = (void*) class_id; ptr = ptr2 + 1; /* Pass over NULL */ ptr += strspn(ptr, " \t"); /* Skip over whitespace */ lm->word[i] = atoi(ptr); /* Store word count */ if (lm->word[i]<=0) { floor_count++; lm->word[i] = 1; /* Force zero counts to 1 in order to avoid 0 probabilities */ if (floor_count==5) HError(-15450, "ReadClassCounts: too many floored counts to list"); else if (floor_count<5) HError(-15450, "ReadClassCounts: flooring zero count to one for '%s'", nid->name); } /* Create structure storing word|class probability and class of word */ wordProb = New(&gcheap, sizeof(WordProb)); wordProb->class = nid2; wordProb->prob = 0; /* we haven't calculated this yet */ wordProb->id = i; nid->ptr = wordProb; /* Point word name id here */ /* Set up binMap equivalent */ lm->classBM[loop] = nid; LM_INDEX(nid) = -i; /* assign negative indices (copied code) */ loop++; } /* Check for left 
over lines */ while (GetInLine(src, line)) { if (strlen(line)>0) { HError(15450, "ReadClassCounts: Extraneous line on end of Word|Class probabilities file\n('%s')", line); } } if (floor_count) HError(-15450, "ReadClassCounts: a total of %d counts were floored", floor_count);}/* CountClassTotals: calculate class count totals for LM */static void CountClassTotals(BackOffLM *lm){ register int i; /* Loop counter */ int word_id, class_id; lm->totals = New(&gcheap, lm->vocSize * sizeof(int)); for (i=0; i<lm->vocSize; i++) { lm->totals[i] = 0; } for (i=0; i<(lm->classW); i++) { word_id = ((WordProb*)(lm->classBM[i+1]->ptr))->id; if (word_id!=i) HError(15490, "CountClassTotals: Inconsistent word ids found"); class_id = (int)(((WordProb*)(lm->classBM[i+1]->ptr))->class->ptr); lm->totals[class_id] += lm->word[i]; }}/* CalcWordClassProbs: calculate initial/static word|class probabilities */static void CalcWordClassProbs(BackOffLM *lm){ int i; /* loop counter */ int class_id; double prob=0; /* For each word */ for (i=0; i<lm->classW; i++) { class_id = (int)(((WordProb*)(lm->classBM[i+1]->ptr))->class->ptr); prob = (((double)(lm->word[i]))) / ((double)(lm->totals[class_id])); ((WordProb*)(lm->classBM[i+1]->ptr))->prob = LOG_NATURAL(prob); }}/* ReadClassProbs: read in word|class probabilities file */static void ReadClassProbs(Source *src, int nWords, BackOffLM *lm){ char line[MAXSYMLEN]; /* Current input line */ char *ptr, *ptr2; /* Temporary pointers */ int i; /* Loop counter */ WordProb *wordProb; /* Temporary pointer */ int loop=1; NameId nid, nid2; /* Add labels and wordlist entries for words */ for (i=0; i<nWords; i++) { ptr = GetInLine(src, line); if (!ptr || strlen(ptr)==0) { HError(15450, "ReadClassProbs: Blank line/end of file in word|class language model file"); } /* Segment line into word, class and log probability */ /* We could use strtok(), but I can't be sure that this isn't being used elsewhere wrapped around this call, so I won't! 
*/ ptr2 = ptr + strcspn(ptr, " \t"); /* Find end of word */
?? 快捷鍵說明
複製代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -