/* cluster.c */
字號:
/* NOTE(review): this span is the tail of do_one_iteration(), whose start lies
   before this chunk; the word 'w' has just been (or not been) moved between
   classes, and the per-class maximum-likelihood values mlv[] are updated here. */
                /* Re-add the log-count contributions of class j's bigram
                   counts with the word's (possibly new) class */
                if (clCnt[curr_class][j])
                    mlv[j] += ((double)clCnt[curr_class][j]) * log(clCnt[curr_class][j]);
                if (clCnt[j][curr_class])
                    mlv[j] += ((double)clCnt[j][curr_class]) * log(clCnt[j][curr_class]);
            }
        }
    }
    else {
        /* The candidate move was rejected - record that in the log */
        if (logfile) {
            if (verbose) {
                fprintf(logfile, "...decided not to move word %d from class %d\n", w, curr_class);
            }
            else {
                fprintf(logfile, "--\n");
            }
        }
        fflush(stdout);
    }
    if (show_MLV && logfile) {
        fprintf(logfile, " MLV = %f\n", curr_MLV);
    }
#ifdef INTEGRITY_CHECK
    /* Debug: Check our counts still sum correctly */
    check_counts_sum();
    /* Debug: Check our updated MLV counts */
    max_likelihood_check();
#endif
    }
    if (w_period) {
        /* Make sure recovery file reflects end of iteration */
        export_classes(1);
        sprintf(tmp, "%.150s.recovery", export_prefix);
        file = FOpen(tmp, NoOFilter, &pipe_status);
        check_file(file, tmp, "do_one_iteration");
        /* "all" marks a recovery point taken at the end of an iteration,
           as opposed to a numeric mid-iteration word index */
        fprintf(file, "Clustering automatic recovery status file\n");
        fprintf(file, "Clustered up to (excluding) word: all\n");
        fprintf(file, "Clusters are stored in: %.150s.recovery.cm\n", export_prefix);
        fprintf(file, "Keep unknown word token separate: %d\n", unk_sep?1:0);
        fprintf(file, "Sort order: %s\n", (sort_order==SORT_WMAP)?"WMAP":"FREQ");
        FClose(file, pipe_status);
    }
    if (total_warnings>=10) {
        HError(-17053, "A total of %d words were found in the wordmap but not in the gram files", total_warnings);
    }
}

/* Recover from a given recovery file.
 * fname - name of a recovery status file (format written above / mid-iteration)
 * words - word count, passed through to import_classmap() and classes_init()
 * Parses the status file, restores the class map and counts, then completes
 * the interrupted iteration (unless the file was written at iteration end).
 * NOTE(review): fgets() return values are unchecked; a truncated status file
 * would be parsed from stale buffer contents - confirm this is acceptable. */
void do_recovery(char *fname, int words)
{
    FILE *file;
    char *ptr;
    int from;
    Boolean pipe_status;

    file = FOpen(fname, NoFilter, &pipe_status);
    check_file(file, fname, "do_recovery");
    /* Line 1: magic header identifying a recovery status file */
    fgets(tmp, 256, file);
    if (strncmp(tmp, "Clustering automatic", 20)!=0) {
        HError(17013, "This is not a recovery status file");
    }
    /* Line 2: word index to resume from ("all" => iteration was complete) */
    fgets(tmp, 256, file);
    ptr = strchr(tmp, ':');
    if (!ptr) {
        HError(17013, "Failure to read current word point from status file");
    }
    ptr++;
    ptr += strspn(ptr, " \t");
    if (strncmp(ptr, "all", 3)==0) {
        from = -1;
    }
    else {
        from = atoi(ptr);
    }
    /* Line 3: file name of the saved class map */
    fgets(tmp, 256, file);
    ptr = strchr(tmp, ':');
    if (!ptr) {
        HError(17013, "Failure to read recovery class map file name from status file");
    }
    ptr++;
    ptr = strtok(ptr, " \t\n");
    import_classmap(ptr, words);
    /* Line 4: whether the unknown word token has its own class */
    fgets(tmp, 256, file);
    ptr = strchr(tmp, ':');
    if (!ptr) {
        HError(17013, "Failure to read recovery unknown word status from status file");
    }
    ptr++;
    ptr += strspn(ptr, " \t");
    unk_sep = (*ptr=='1');
    start_class = unk_sep?3:2;
    /* Line 5: word sort order (WMAP or FREQ) */
    fgets(tmp, 256, file);
    ptr = strchr(tmp, ':');
    if (!ptr) {
        HError(17013, "Failure to read recovery word sort order status from status file");
    }
    ptr++;
    ptr += strspn(ptr, " \t");
    sort_order = (*ptr=='W')?SORT_WMAP:SORT_FREQ;
    FClose(file, pipe_status);
    if (trace & T_TOP) {
        printf("Continuing from recovered state\n");
    }
    /* Rebuild all class/count state from the imported class map */
    classes_init(words);
    setup_all_counts();
    if (trace & T_TOP) {
        printf("Iterations that had been completed: %d\n", export_index);
    }
    export_index++;
    /* Open output log file */
    if (write_logfile) {
        sprintf(tmp, "%.150s.%d.log", export_prefix, export_index);
        logfile = FOpen(tmp, NoOFilter, &pipe_logfile);
        check_file(logfile, tmp, "do_recovery");
    }
    else
        logfile = NULL;
    /* from == -1 means the recovery point was at iteration end, so there is
       no partial iteration to finish */
    if (from>=0) {
        do_one_iteration(rec_freq, from);
    }
    if (logfile)
        FClose(logfile, pipe_logfile);
    export_classes(0);
    if (trace & T_EXTRA) {
        printf("Completed iteration which started from recovered state\n");
        if (from == -1) {
            printf(" (no change since recovery state was stored at end of iteration)\n");
        }
    }
}

/* Initialise the values used when calculating the current value of the
   maximum likelihood equation used when clustering */
static void max_likelihood_init(void)
{
    int i, j;

    if (show_MLV)
        curr_MLV=0;
    /* We store all those values from the summation which involve a
       particular class in a value specifically for that class */
    for (i=0; i<N; i++) {
        mlv[i] = 0;
        for (j=0; j<N; j++) {
            /* Sum of c*log(c) over bigram counts in row i and column i
               (off-diagonal column entries counted once via the i!=j test) */
            if (clCnt[i][j]) {
                mlv[i] += ((double)clCnt[i][j]) * log(clCnt[i][j]);
                if (show_MLV) {
                    curr_MLV += ((double)clCnt[i][j]) * log(clCnt[i][j]);
                }
            }
            if (i!=j) {
                if (clCnt[j][i])
                    mlv[i] += ((double)clCnt[j][i]) * log(clCnt[j][i]);
            }
        }
        /* Subtract the 2*s*log(s) unigram-sum term for the class */
        if (clSum[i]) {
            mlv[i] -=
/* NOTE(review): this span begins inside max_likelihood_init(), completing the
   statement started on the previous source line. */
            2*(((double)clSum[i]) * log(clSum[i]));
            if (show_MLV) {
                curr_MLV -= 2*(((double)clSum[i]) * log(clSum[i]));
            }
        }
    }
}

#ifdef INTEGRITY_CHECK
/* Check the contents of the maximum likelihood running totals store:
   recomputes each class's mlv[] value from scratch and compares it with
   the incrementally-maintained value, aborting via HError on mismatch. */
static void max_likelihood_check(void)
{
    int i, j;
    double a;
    char s1[50], s2[50];

    /* We store all those values from the summation which involve a
       particular class in a value specifically for that class */
    for (i=0; i<N; i++) {
        a = 0;
        for (j=0; j<N; j++) {
            if (clCnt[i][j])
                a += ((double)clCnt[i][j]) * log(clCnt[i][j]);
            if (i!=j) {
                if (clCnt[j][i])
                    a += ((double)clCnt[j][i]) * log(clCnt[j][i]);
            }
        }
        if (clSum[i])
            a -= 2*(((double)clSum[i]) * log(clSum[i]));
        /* Compare strings, to ignore minor precision differences */
        sprintf(s1, "%f", a);
        sprintf(s2, "%f", mlv[i]);
        if (strcmp(s1, s2)) {
            HError(17097, "max_likelihood_check: MLV for class %d is wrong - %f instead of %f", i, mlv[i], a);
        }
    }
}
#endif

/* Perform a given number of iterations of the clustering algorithm.
 * iterations - number of full passes to run via do_one_iteration()
 * NOTE(review): when write_logfile is false, logfile is not reset here;
 * presumably it is NULL from initialisation elsewhere - confirm. */
void cluster_words(int iterations)
{
    int i;

    for (i=0; i<iterations; i++) {
        /* Also keep a separate iteration count - we do this because it's
           possible to call cluster_words() multiple times from a host
           program, or to continue from an existing classmap */
        export_index++;
        if (trace & T_TOP) {
            printf("Beginning iteration %d\n", export_index);
        }
        /* Open output log file */
        if (write_logfile) {
            sprintf(tmp, "%.150s.%d.log", export_prefix, export_index);
            logfile = FOpen(tmp, NoOFilter, &pipe_logfile);
            check_file(logfile, tmp, "cluster_words");
        }
        do_one_iteration(rec_freq, 0);
        if (logfile)
            FClose(logfile, pipe_logfile);
        if (trace & T_TOP) {
            printf("Iteration complete\n");
        }
    }
}

/* Setup all class counts, given existing class word map:
   zeroes clSum[]/clCnt[][] then accumulates per-class unigram and bigram
   totals from the per-word counts, finally rebuilding the mlv[] store. */
void setup_all_counts(void)
{
    register int i, j;

    for (i=0; i<N; i++) {
        clSum[i] = 0;
        for (j=0; j<N; j++) {
            clCnt[i][j] = 0;
        }
    }
    for (i=0; i<W; i++) {
        /* Class unigram counts */
        clSum[clMemb[i]] += uni[i];
        /* Class bigram counts */
        for (j=0; j<forward[i].size; j++) {
/* NOTE(review): this span begins inside setup_all_counts(), completing the
   bigram-accumulation loop started on the previous source line. */
            clCnt[clMemb[i]][clMemb[forward[i].bi[j].id]] += forward[i].bi[j].count;
        }
    }
    /* Now initialise the maximisation function class values */
    max_likelihood_init();
}

/* Perform some initial clustering - currently just puts all in one class.
   Sentence start/end (and optionally the unknown token) each get their own
   reserved class; every other word starts in a single shared class. */
void initial_cluster(void)
{
    register int i;

    for (i=0; i<W; i++) {
        if (unk_sep) {
            clMemb[i] = 3; /* Put everything in class 3 */
        }
        else {
            clMemb[i] = 2; /* Put everything in class 2 */
        }
    }
    clMemb[start_id] = 0;
    clMemb[end_id] = 1;
    if (unk_sep) {
        clMemb[unk_id] = 2;
    }
    /* Note that external class numbers are all +1 relative to internal
       (HLM can't cope with class 0 in a class map) */
    if (trace & T_EXTRA) {
        printf("Initial clustering performed: all words in class %d (total count=%d)\n", unk_sep?4:3, sum_of_all_bigram_counts);
        printf (" (sentence start in class 1; sentence end in class 2%s)\n", unk_sep?"; unknown in class 3":"");
    }
}

/* Define sorting order of words alphabetically, given id
   (qsort comparison callback over word ids) */
int id_sort(UInt *in1, UInt *in2)
{
    return strcmp(what_is_word(*in1), what_is_word(*in2));
}

/* Write out a HLM class map file (pass non-zero to write recovery file).
 * recovery - non-zero writes "<prefix>.recovery.cm" labelled with the
 *            previous iteration; zero writes "<prefix>.<iteration>.cm" */
void export_classes(int recovery)
{
    FILE *out;
    int i, j, index;
    Boolean pipe_status;

    /* %.150s limits the length of the filename prefix to 150 characters */
    if (recovery) {
        sprintf(tmp, "%.150s.recovery.cm", export_prefix);
    }
    else {
        sprintf(tmp, "%.150s.%d.cm", export_prefix, export_index);
    }
    out = FOpen(tmp, LCMapOFilter, &pipe_status);
    check_file(out, tmp, "export_classes");
    /* Write header */
    if (recovery) {
        /* Recovery maps record the last COMPLETED iteration, hence the -1 */
        fprintf(out, "Name=Classmap_%s_iteration%d\n", export_prefix, export_index-1);
        fprintf(out, "Entries=%d\n", N);
        fprintf(out, "Iterations=%d\n", export_index-1);
    }
    else {
        fprintf(out, "Name=Classmap_%s_iteration%d\n", export_prefix, export_index);
        fprintf(out, "Entries=%d\n", N);
        fprintf(out, "Iterations=%d\n", export_index);
    }
    if (outCMapRaw) {
        fprintf(out, "EscMode=Raw\n");
    }
    else {
        fprintf(out, "EscMode=HTK\n");
    }
    fprintf(out, "\\Classes\\\n");
    /* For each class, gather its member word ids then write them out */
    for (i=0; i<N; i++) {
        index = 0;
        for
(j=0; j<W; j++) { if (clMemb[j] == i) { class_sort[index] = j; index++; } } qsort(class_sort, index, sizeof(UInt), (int (*) (const void *, const void *)) &id_sort); fprintf(out, "CLASS%d %d %d IN\n", i+1, i+1, index); if (outCMapRaw) { for (j=0; j<index; j++) { fprintf(out, " %s\n", what_is_word(class_sort[j])); } } else { for (j=0; j<index; j++) { fprintf(out, " %s\n", ReWriteString(what_is_word(class_sort[j]), NULL, ESCAPE_CHAR)); } } } FClose(out, pipe_status);}#ifdef INTEGRITY_CHECK/* Debugging: Do integrity check on counts - ensure they sum to the same value after each loop! */void check_counts_sum(void){ register int i, j; register int a, b; a = 0; b = 0; for (i=0; i<N; i++) { a += clSum[i]; for (j=0; j<N; j++) { b += clCnt[i][j]; } } if (a != sum_of_all_uni_counts) { HError(17096, "check_counts_sum: unigrams now sum to %d, not %d", a, sum_of_all_uni_counts); } if (b != sum_of_all_bigram_counts) { HError(17096, "check_counts_sum: bigrams now sum to %d, not %d", a, sum_of_all_bigram_counts); } if (a != b) { HError(17096, "check_counts_sum: uni and bi totals differ - %d v %d", a, b);
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -