?? cluster.c
字號:
}}#endif/* Complain about a broken header */static void invalid_header(void){ HError(17013, "Classmap has broken header - missing '='");}/* Import a HLM classmap file. Currently ignores contents of 'EscMode' field. GLM Also non-IN?*/void import_classmap(char *fname, int numb_words){#define max_line_len 500 FILE *file; /* Input file handle */ char line[max_line_len]; /* Line read buffer */ int C; /* Current class index */ int size; /* Size of current class */ int i; /* Loop counter */ char *ptr; /* Text pointer */ UInt id; /* Word id */ int reassigned = 0; /* Number of reassigned classes */ int unexpected = 0; /* Number of unexpected lines trailing class descriptions */ Boolean pipe_status; if (trace & T_FILE) { printf("Importing classmap '%s'\n", fname); } W = numb_words; clMemb = CNew(&global_stack, W * sizeof(int)); /* Set impossible classmap in order to do integrity check after import */ for (i=0; i<W; i++) { clMemb[i] = -1; } N = 0; file = FOpen(fname, LCMapFilter, &pipe_status); check_file(file, fname, "import_classmap"); while (fgets(line, max_line_len, file)) { if (strncmp(line, "Entries", 7)==0) { ptr = strchr(line, '='); if (!ptr) invalid_header(); ptr++; ptr = strtok(ptr, " \t\n"); N = atoi(ptr); if (trace & T_EXTRA) { printf("Number of classes = %d\n", N); } } else if (strncmp(line, "Iterations", 10)==0) { ptr = strchr(line, '='); if (!ptr) invalid_header(); ptr++; ptr = strtok(ptr, " \t\n"); export_index = atoi(ptr); } else if (strncmp(line, "EscMode", 7)==0) { ptr = strchr(line, '='); if (!ptr) invalid_header(); ptr++; ptr = strtok(ptr, " \t\n"); if (strcmp(ptr, "HTK")==0) { if (inCMapRawTrap && inCMapRaw) { HError(-17013, "Class map specifies HTK escaping on input but configuration file specifies Raw escaping -- using HTK escaping for input"); } inCMapRaw = FALSE; } else if (strcmp(ptr, "Raw")==0) { if (inCMapRawTrap && !inCMapRaw) { HError(-17013, "Class map specifies Raw escaping on input but configuration file specifies HTK escaping -- using Raw 
escaping for input"); } inCMapRaw = TRUE; } else { HError(17013, "Classmap has unknown escaping of type '%s'", ptr); } if (!outCMapRawTrap) { if (outCMapRaw != inCMapRaw) { HError(-17013, "Setting output class map escaping to same format as input class map (%s)", inCMapRaw?"Raw":"HTK"); } outCMapRaw = inCMapRaw; /* This is common sense */ } if (inCMapRaw != outCMapRaw) { HError(-17013, "Input class map escaping and output class map escaping differ (this is not a problem -- this warning is to alert you in case you meant them to be the same)"); } } else if (strncmp(line, "\\Classes\\", 9)==0) { break; } } if (feof(file)) { HError(17013, "Classmap file is corrupt/contains no classes!"); } if (!N) { HError(17013, "Corrupt classmap header - must specify number of classes!"); } if (trace & T_EXTRA) { printf("Iterations = %d\n", export_index); } C = 0; while (fgets(line, max_line_len, file)) { if (C>=N) { if (strstr(line, " IN")) { HError(17013, "More classes are described than are specified in the header!"); } else { ptr = strtok(line, " \t\n"); if (!ptr) continue; else { HError(-17013, "Warning: ignoring '%s' at end of classmap file", ptr); } } } if (strstr(line, " IN")) { /* Start of a new class */ /* Make this class 'C' */ strtok(line, " \t"); ptr = strtok(NULL, " \t"); if (!ptr) { HError(17013, "Failure reading class header %d in classmap (no id)", C); } if (atoi(ptr) != C+1) { /* We'll renumber this class */ reassigned++; } ptr = strtok(NULL, " \t"); if (!ptr) { HError(17013, "Failure reading class header %d in classmap (no size)", C); } size = atoi(ptr); /* Read number of words in class */ for (i=0; i<size; i++) { fgets(line, max_line_len, file); ptr = strtok(line, " \t\n"); if (!ptr) { /* Warn about the blank line */ HError(-17013, "Found empty line inside class %d definition", C); i--; continue; } /* Unescape word if necessary */ if (!inCMapRaw) { if (strlen(ptr)>255) { HError(17013, "Cannot handle words longer than 255 characters when using HTK escaping 
(recompile with higher tmp[] buffer size in Cluster.c)"); } ParseString(ptr, tmp); /* Put word in class */ id = get_id_from_word(tmp); } else { /* Put word in class */ id = get_id_from_word(ptr); } if (inCMapRaw && strcmp(what_is_word(id), ptr)!=0) { HError(17095, "import_classmap: word '%s' is id '%d'; id is '%s'!", ptr, id, what_is_word(id)); } else if (!inCMapRaw && strcmp(what_is_word(id), tmp)!=0) { HError(17095, "import_classmap: word '%s' is id '%d'; id is '%s'!", tmp, id, what_is_word(id)); } if (clMemb[id] != -1) { HError(17094, "Word '%s' occurs more than once in classmap!", ptr); } clMemb[id] = C; } C++; } else { /* Where is class header? It's gone missing! */ if (strlen(line)>0) { if (strchr(line, '\n')) *strchr(line, '\n')='\0'; /* Strip linefeed */ if (strlen(line)==0) continue; HError(-17013, "Unexpected line '%s' in classmap", line); unexpected++; } if (unexpected>9) { HError(17013, "Too many unexpected lines in classmap - aborting now"); } /* Loop round to see if it's coming up next */ } } if (C<N) { HError(17013, "Less classes are described than are specified in the header!"); } if (trace & T_TOP) { if (reassigned) { if (reassigned>1) { printf("%d class ids were reassigned\n", reassigned); } else { printf("1 class id was reassigned\n"); } } else { printf("No class ids were reassigned\n"); } } /* Check all words were assigned */ for (i=0; i<W; i++) { if (clMemb[i]==-1) { HError(17052,"import_classmap: Not all words were assigned to classes"); } } FClose(file, pipe_status); if (trace & T_FILE) { printf("Class map import successful\n"); }}/* Write out p(word | class) probabilities */void write_word_probs(char *filename){ FILE *out; int i; /* Loop counter */ double probability; Boolean pipe_status; /* These files never use HTK escaping */ out = FOpen(filename, NoOFilter, &pipe_status); check_file(out, filename, "write_word_probs"); /* Write header */ fprintf(out, "Word|Class probabilities\n"); fprintf(out, "\n"); fprintf(out, "Derived from: %s\n", 
export_prefix); fprintf(out, "Number of classes: %d\n", N); fprintf(out, "Number of words: %d\n", W); fprintf(out, "Iterations: %d\n", export_index); fprintf(out, "\n"); fprintf(out, "%-15s\tClass name\tProbability (log)\n", "Word"); for (i=0; i<W; i++) { if (uni[i]==0) uni[i]=1; } /* Use tmp_sum1[] to save having to allocate a new array (so can't call this from within a class change calculation, but this isn't a problem!) */ for (i=0; i<N; i++) { tmp_sum1[i] = 0; } for (i=0; i<W; i++) { tmp_sum1[clMemb[i]] += uni[i]; } for (i=0; i<W; i++) { probability = (double)uni[i]/((double)tmp_sum1[clMemb[i]]); fprintf(out, "%-15s\tCLASS%-4d\t%f\n", what_is_word(i), clMemb[i]+1, LOG_NATURAL(probability)); if (LOG_NATURAL(probability)<-90) { printf("prob is %f, discount is %f, uni is %d\n", LOG_NATURAL((double)uni[i]/((double)tmp_sum1[clMemb[i]])), mlv[clMemb[i]], uni[i]); } } FClose(out, pipe_status); if (trace & T_FILE) { printf("Wrote word|class probabilities to '%s'\n", filename); }}/* Write out p(word | class) counts */void write_word_counts(char *filename){ FILE *out; int i; /* Loop counter */ Boolean pipe_status; /* These files never use HTK escaping */ /* Open output file */ out = FOpen(filename, NoOFilter, &pipe_status); check_file(out, filename, "write_word_counts"); /* Write header */ fprintf(out, "Word|Class counts\n"); fprintf(out, "\n"); fprintf(out, "Derived from: %s\n", export_prefix); fprintf(out, "Number of classes: %d\n", N); fprintf(out, "Number of words: %d\n", W); fprintf(out, "Iterations: %d\n", export_index); fprintf(out, "\n"); fprintf(out, "%-15s\tClass name\tCount\n", "Word"); for (i=0; i<W; i++) { fprintf(out, "%-15s\tCLASS%-4d\t%d\n", what_is_word(i), clMemb[i]+1, uni[i]); } FClose(out, pipe_status); if (trace & T_FILE) { printf("Wrote word|class counts to '%s'\n", filename); }}/* Specify whether to keep the unknown word in its own solo-member class or not */void classes_keep_unk_separate(int keep_separate){ unk_sep = (Boolean) keep_separate; 
start_class = unk_sep?3:2;}/* Pass in start, end and unknown word ids */void set_ids(int start, int end, int unk){ start_id = start; end_id = end; unk_id = unk;}/* This set of functions takes unigram counts, stores them, and then allows them to be retrieved. It simply allocates a count for each possible word id, since they are allocated in a continuous block.*//* Initialise this unigram storage module */void unigram_init(int words){ max_words = words; uni = CNew(&global_stack, words * sizeof(unigram)); sum_of_all_uni_counts = 0;}/* Add a unigram */void unigram_add(NGram ng, int count){ if (ng[0]>=max_words) { /* Something's gone wrong */ HError(17093, "unigram_add: Found a word id higher than the base+number of words - word ids are expected to be allocated in an unbroken chunk\n[Current unigram is (%d); number of words is %d]", ng[0], max_words); } uni[ng[0]] += count; sum_of_all_uni_counts += count; return;}/* Read a unigram */UInt unigram_read(UInt id){#ifdef INTEGRITY_CHECK if ((id<0) || (id>=max_words)) { HError(17092, "unigram_read: attempt to read unigram outside bounds (%d; %d words)", id, max_words); }#endif return uni[id];}/* This section contains functions to store a sequence of bigrams - they must be sequenced before passing to this code, since it relies on the input being sorted. Both forward and backward word to all bigrams look-up tables are built, so given either u or v from a bigram (u,v) then the set of all (u,*) or (*,v) can be found. 
*//* Grab some space from our current local storage block */static void *get_space(int size){ static void* ptr; /* Test against our not-worth-using cut-off point */ if (size>block_cut_off) return New(&global_stack, size); /* Use New() again if necessary to get a new block */ if (((int)block+(int)size) >= (int)block_end) { block = New(&global_heap, block_grab_size); block_end = (void *) ((int)block+(int)block_grab_size); } /* Hand back the next free space */ ptr = block; block = (void*) ((int) block + (int) size); /* Next free byte */ block = (void*) ((((int)block)+3) & (~(int)3)); /* Word-align */ return ptr;}/* Add a bigram */void bigram_add(NGram ng, int count)
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -