?? cluster.c
字號:
/* Return the number of classes used by default (or currently set) */int classes_get_default(void){ return N;}/* Set the number of classes used */void classes_set_number(int numb){ if (clCnt) { HError(17099, "classes_set_number: must be called prior to initialisation"); } if (numb<1) { HError(17099, "classes_set_number: number of classes must be 1 or more"); } N = numb;}/* Initialise this module */void classes_init(int numb_words){ int i, j; if (W>0 && W!=numb_words) { /* Could only get here if code to do with loading classmap is broken */ HError(17098, "classes_init: internal inconsistency - number of words has changed"); } W = numb_words; /* Create empty storage table */ if (trace & T_MEM) { printf("Allocating memory for %d classes\n", N); } class_sort = CNew(&global_stack, W * sizeof(UInt)); clSum = CNew(&global_stack, N * sizeof(int)); tmp_c1 = CNew(&global_stack, N * sizeof(int)); tmp_c2 = CNew(&global_stack, N * sizeof(int)); tmp_c3 = CNew(&global_stack, N * sizeof(int)); tmp_c4 = CNew(&global_stack, N * sizeof(int)); tmp_sum1 = CNew(&global_stack, N * sizeof(int)); tmp_sum2 = CNew(&global_stack, N * sizeof(int)); mlv = CNew(&global_stack, N * sizeof(double)); sort_uni = CNew(&global_stack, W * sizeof(int)); if (!clMemb) clMemb = CNew(&global_stack, W * sizeof(int)); /* May have been setup by existing map */ clCnt = CNew(&global_stack, N * sizeof(int *)); for (i=0; i<N; i++) { clCnt[i] = CNew(&global_stack, N * sizeof(int)); } if (trace & T_MEM) { printf("Class memory allocated\n"); } /* Create array of bigram (w,w) pair counts (ie. word followed by itself) */ bipair = CNew(&global_stack, W * sizeof(int)); for (i=0; i<W; i++) { bipair[i] = 0; for (j=0; j<forward[i].size; j++) { if (forward[i].bi[j].id == i) { bipair[i] = forward[i].bi[j].count; break; } } }}/* See what change results when word 'w' moved to class 'g' */static void classes_change(UInt w, int g){ register int i; /* tmp_c1[] stores the set of class counts C(G(w),*) tmp_c2[] stores the set of class counts C(*,G(w)) tmp_c3[] stores the set of class counts C(g,*) tmp_c4[] stores the set of class counts C(*,g) tmp_sum1[] stores the bigram class counts C(w,*) tmp_sum2[] stores the bigram class counts C(*,w) */ /* Loop over all classes */ for (i=0; i<N; i++) { if (i!=curr_class && i!=g) { if (tmp_sum1[i]) { /* (G(w),gi) => -C(w,gi) */ tmp_c1[i] = clCnt[curr_class][i] - tmp_sum1[i]; /* (g,gi) => +C(w,gi) */ tmp_c3[i] = clCnt[g][i] + tmp_sum1[i]; } else { tmp_c1[i] = clCnt[curr_class][i]; tmp_c3[i] = clCnt[g][i]; } if (tmp_sum2[i]) { /* (gi,G(w)) => -C(gi,w) */ tmp_c2[i] = clCnt[i][curr_class] - tmp_sum2[i]; /* (gi,g) => +C(gi,w) */ tmp_c4[i] = clCnt[i][g] + tmp_sum2[i]; } else { tmp_c2[i] = clCnt[i][curr_class]; tmp_c4[i] = clCnt[i][g]; } } } /* Calculate correct values for class-to-or-from-only pairs */ /* (G(w),G(w)) => -C(w,G(x)) - C(G(x),w) + C(w,w) */ GwGw = clCnt[curr_class][curr_class] - tmp_sum1[curr_class] - tmp_sum2[curr_class] + bipair[w]; /* (G(w),g) => -C(w,g) + C(G(x),w) - C(w,w) */ Gwg = clCnt[curr_class][g] - tmp_sum1[g] + tmp_sum2[curr_class] - bipair[w]; /* (g,G(w)) => -C(g,w) + C(w,G(x)) - C(w,w) */ gGw = clCnt[g][curr_class] - tmp_sum2[g] + tmp_sum1[curr_class] - bipair[w]; /* (g,g) => +C(w,g) + C(g,w) + C(w,w) */ gg = clCnt[g][g] + tmp_sum1[g] + tmp_sum2[g] + bipair[w];}/* Decide on a class to move word 'w' to. Returns class index. */static int choose_class(UInt w){ register int i; register int j; double d; /* Change in optimisation value */ int uniGx, unig; int best_class; double best_change; double start_value; best_class = curr_class; best_change = 0; /* Create set of forward bigram class counts, C(w,*) and C(*,w) (for * = any class) */ for (i=0; i<N; i++) { tmp_sum1[i] = 0; tmp_sum2[i] = 0; } for (i=0; i<forward[w].size; i++) { tmp_sum1[clMemb[forward[w].bi[i].id]] += forward[w].bi[i].count; } for (i=0; i<backward[w].size; i++) { tmp_sum2[clMemb[backward[w].bi[i].id]] += backward[w].bi[i].count; } /* Try all classes */ for (i=start_class; i<N; i++) { if (i==curr_class || uni[w]==0) { /* If we have no information about this word, or its a self-move, don't bother (self-move gives zero change) */ continue; } d = 0; classes_change(w, i); /* Word has moved to class i, so see how this would change our optimisation equation */ uniGx = clSum[curr_class] - uni[w]; unig = clSum[i] + uni[w]; /* Counts involving original class and a new class */ for (j=0; j<N; j++) { if ((j!=curr_class) && (j!=i)) { if (tmp_c1[j]) { d += ((double)tmp_c1[j]) * log(tmp_c1[j]); } if (tmp_c2[j]) { d += ((double)tmp_c2[j]) * log(tmp_c2[j]); } if (tmp_c3[j]) { d += ((double)tmp_c3[j]) * log(tmp_c3[j]); } if (tmp_c4[j]) { d += ((double)tmp_c4[j]) * log(tmp_c4[j]); } } } /* Unigram part of summation */ if (uniGx) { d -= 2*(((double)uniGx)*log(uniGx)); } if (unig) { d -= 2*(((double)unig)*log(unig)); } /* Exceptions */ if (GwGw) { d += ((double)GwGw) * log(GwGw); } if (Gwg) { d += ((double)Gwg) * log(Gwg); } if (gGw) { d += ((double)gGw) * log(gGw); } if (gg) { d += ((double)gg) * log(gg); } /* Now make 'd' into a difference: */ start_value = mlv[curr_class] + mlv[i]; /* Subtract off the two values we added twice by using mlv[] */ if (clCnt[curr_class][i]) start_value -= clCnt[curr_class][i]*log(clCnt[curr_class][i]); if (clCnt[i][curr_class]) start_value -= clCnt[i][curr_class]*log(clCnt[i][curr_class]); /* And calculate 'd': */ d = d - start_value; if (verbose && logfile) { fprintf(logfile, "...moving word %d to class %d from class %d gives %f change\n", w, i, curr_class, d); } if (d>best_change) { /* Accuracy check - this is a bit of a dodgy hack for gcc 3 */ sprintf(tmp, "%f", d); /* This will flush d from the higher-precision maths register */ /* No doubt a much faster way of doing this, but never mind */ if (d>best_change) { if (verbose && logfile) { fprintf(logfile, " \\- which is better than the current best\n"); } best_change = d; best_class = i; } else { HError(-17059, "Noticed a comparison accuracy difference [you may safely ignore this warning]"); } } } if (show_MLV) { curr_MLV += best_change; } return best_class;}/* Define sort order for word unigrams */static int freq_sort_order(int *in1, int *in2){ return ( (uni[*in1] < uni[*in2]) | (-(uni[*in1] > uni[*in2])));}/* Perform one iteration of the clustering algorithm */static void do_one_iteration(int w_period, int start_word){ UInt w, j, w_index; int to; FILE *file; Boolean pipe_status; int total_warnings=0; for (w=0; w<W; w++) { sort_uni[w] = w; } if (sort_order == SORT_FREQ) { qsort(sort_uni, W, sizeof(int), (int (*) (const void *, const void *)) &freq_sort_order); } for (w_index=start_word; w_index<W; w_index++) { w = sort_uni[w_index]; if (w_period && w%w_period==0) { /* Write recovery file */ export_classes(1); sprintf(tmp, "%.150s.recovery", export_prefix); file = FOpen(tmp, NoOFilter, &pipe_status); check_file(file, tmp, "do_one_iteration"); fprintf(file, "Clustering automatic recovery status file\n"); fprintf(file, "Clustered up to (excluding) word: %d\n", w_index); fprintf(file, "Clusters are stored in: %.150s.recovery.cm\n", export_prefix); fprintf(file, "Keep unknown word token separate: %d\n", unk_sep?1:0); fprintf(file, "Sort order: %s\n", (sort_order==SORT_WMAP)?"WMAP":"FREQ"); FClose(file, pipe_status); } if ((w==start_id) || (w==end_id) || (unk_sep && (w==unk_id))) { /* We don't want to move this special token, so skip it */ continue; } if (uni[w]==0) { /* Word is in wordlist but not used, so warn */ if (total_warnings<10) { HError(-17053, "Word '%s' is in word map but not in any gram files", what_is_word(w)); } else if (total_warnings==10) { HError(-17053, "Suppressing further word 'x' not in gram file warnings"); } total_warnings++; continue; } if (logfile) { if (verbose) { fprintf(logfile, "...deciding whether/where to move word %d (of %d - %2.2f%% done) [id=%d]\n", w_index, W, ((float)w_index/(float)W)*100.0, w); } else { fprintf(logfile, "%d [%d] (%2.2f%%):\t", w_index, w, ((float)w_index/(float)W)*100.0); } } curr_class = clMemb[w]; /* Find out what class word is currently in */ to = choose_class(w); /* Work out where to move it to */ if (curr_class != to) { if (logfile) { if (verbose) { fprintf(logfile, "...moving word id %d from class %d to class %d\n", w, curr_class, to); } else { fprintf(logfile, "-> %d\n", to); } fflush(logfile); } classes_change(w, to); /* Calculate new unigram and bigram values */ /* Remove influence of these two classes from MLV values */ for (j=0; j<N; j++) { if (j!=to && j!=curr_class) { if (clCnt[to][j]) mlv[j] -= ((double)clCnt[to][j]) * log(clCnt[to][j]); if (clCnt[j][to]) mlv[j] -= ((double)clCnt[j][to]) * log(clCnt[j][to]); if (clCnt[curr_class][j]) mlv[j] -= ((double)clCnt[curr_class][j]) * log(clCnt[curr_class][j]); if (clCnt[j][curr_class]) mlv[j] -= ((double)clCnt[j][curr_class]) * log(clCnt[j][curr_class]); } } /* Make change permanent */ /* Class map */ clMemb[w] = to; /* Class unigram counts */ clSum[curr_class] -= uni[w]; clSum[to] += uni[w]; /* Class bigram counts */ for (j=0; j<N; j++) { if ((j!=curr_class) && (j!=to)) { /* (Gw, *) */ clCnt[curr_class][j] = tmp_c1[j]; /* (*, Gw) */ clCnt[j][curr_class] = tmp_c2[j]; /* (g, *) */ clCnt[to][j] = tmp_c3[j]; /* (*, g) */ clCnt[j][to] = tmp_c4[j]; } } /* Exceptions */ clCnt[curr_class][curr_class] = GwGw; clCnt[curr_class][to] = Gwg; clCnt[to][curr_class] = gGw; clCnt[to][to] = gg; /* Recalculate maximum-likelihood values involving this class */ mlv[to] = 0; for (j=0; j<N; j++) { if (clCnt[to][j]) mlv[to] += ((double)clCnt[to][j]) * log(clCnt[to][j]); if (to!=j) { if (clCnt[j][to]) mlv[to] += ((double)clCnt[j][to]) * log(clCnt[j][to]); } } if (clSum[to]) mlv[to] -= 2*(((double)clSum[to]) * log(clSum[to])); mlv[curr_class] = 0; for (j=0; j<N; j++) { if (clCnt[curr_class][j]) mlv[curr_class] += ((double)clCnt[curr_class][j]) * log(clCnt[curr_class][j]); if (curr_class!=j) { if (clCnt[j][curr_class]) mlv[curr_class] += ((double)clCnt[j][curr_class]) * log(clCnt[j][curr_class]); } } if (clSum[curr_class]) mlv[curr_class] -= 2*(((double)clSum[curr_class]) * log(clSum[curr_class])); /* Update MLV values for other classes */ for (j=0; j<N; j++) { if (j!=to && j!=curr_class) { if (clCnt[to][j]) mlv[j] += ((double)clCnt[to][j]) * log(clCnt[to][j]); if (clCnt[j][to]) mlv[j] += ((double)clCnt[j][to]) * log(clCnt[j][to]);
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -