cluster.c
{
   bi_count *ptr;
   int space_used;

   if ((ng[0] >= max_words) || (ng[1] >= max_words)) {
      /* Something's gone wrong */
      HError(17093, "bigram_add: Found a word id higher than the base+number of words - all word ids are expected to be allocated in an unbroken chunk.\n[Current bigram is (%d,%d). Number of words is %d]", ng[0], ng[1], max_words);
   }

   /* Keep backward count */
   backward[ng[1]].size++;

   if (ng[0] == last_word) {
      /* Make sure there's room in the buffer */
      if (store_idx >= curr_bistore_size) {
         /* Expand bigram buffer store to cope */
         curr_bistore_size += bigram_buffer_grow;
         if (trace & T_MEM) {
            printf("Expanding bigram read buffer to %d entries\n", curr_bistore_size);
         }
         store = realloc(store, curr_bistore_size*sizeof(bi_count));
      }
      /* Store in buffer */
      store[store_idx].id = ng[1];
      store[store_idx].count = count;
      store_idx++;
      return;
   }

   /* Otherwise we must have just gone on to a new word, so keep the old details */
   forward[last_word].size = store_idx;
   space_used = store_idx*sizeof(bi_count);
   ptr = get_space(space_used);
   memcpy(ptr, store, space_used);
   forward[last_word].bi = ptr;

   /* And go on to the next entry */
   last_word = ng[0];
   store[0].id = ng[1];
   store[0].count = count;
   store_idx = 1;
}


/* Call when all bigrams have been passed in */
void bigram_added_all(void)
{
   bi_count *ptr;
   int space_used;
   int i, j, backward_id;

   /* Store last set of details */
   forward[last_word].size = store_idx;
   space_used = store_idx*sizeof(bi_count);
   ptr = get_space(space_used);
   memcpy(ptr, store, space_used);
   forward[last_word].bi = ptr;
   free(store);

   sum_of_all_bigram_counts = 0;

   /* Generate backward lookup table */
   if (trace & T_EXTRA) {
      printf("Building bigram backward lookup table...");
      fflush(stdout);
   }

   /* Allocate required storage space */
   for (i=0; i<max_words; i++) {
      backward[i].bi = get_space(backward[i].size * sizeof(bi_count));
      backward[i].size = 0;   /* Reset to use as counter when building data */
   }

   /* Run through all forward data, copying into backward array */
   for (i=0; i<max_words; i++) {
      for (j=0; j<forward[i].size; j++) {
         backward_id = forward[i].bi[j].id;
         backward[backward_id].bi[backward[backward_id].size].id = i;
         backward[backward_id].bi[backward[backward_id].size].count = forward[i].bi[j].count;
         backward[backward_id].size++;
         sum_of_all_bigram_counts += forward[i].bi[j].count;
      }
   }

   if (trace & T_EXTRA) {
      printf(" done\n");
   }
}


/* Must be called before almost any other function in this file will work */
void bigram_init(int words)   /* Pass ->used field from word-map */
{
   max_words = words;
   forward = CNew(&global_stack, words * sizeof(bigrams));
   backward = CNew(&global_stack, words * sizeof(bigrams));
   if (trace & T_MEM) {
      printf("Bigram store for %d words created\n", words);
   }
   last_word = 0;
   store_idx = 0;
   curr_bistore_size = initial_bigram_buffer;
   store = calloc(initial_bigram_buffer, sizeof(bi_count));
   if (trace & T_MEM) {
      printf("Bigram read buffer of %d entries created\n", initial_bigram_buffer);
   }
}


/* Main program control function */
int main(int argc, char *argv[])
{
   char *s;
   float weight;                    /* used when loading gram files */
   char *filename;                  /* used when loading gram files */
   int iterations=1, loop;
   char *init_cmap = NULL;
   char *recover_from = NULL;
   char *write_classprobs = NULL;
   char *write_classcounts = NULL;
   Boolean read_gram_files=FALSE;   /* Has the user passed any gram files? */
   Boolean set_classes = FALSE, loaded_map = FALSE;   /* Check for -c and -l */
   Boolean keep_unk_sep = FALSE;    /* Was -k passed? */
   Boolean passed_unk = FALSE;      /* Unknown word was passed in */
   int start_word_id, end_word_id, unknown_word_id;
   int numb_classes, min_classes;
   char *ptr, *ptr2;                /* temp results */

   /* Initialise HTK/HLM modules */
   InitShell(argc, argv, Cluster_version, Cluster_vc_id);
   InitMem();
   InitMath();
   InitWave();
   InitLabel();
   InitLUtil();
   InitWMap();
   InitGBase();
   SetConfParms();

   /* Default start, end and unknown words */
   strcpy(sent_start, DEF_STARTWORD);
   strcpy(sent_end, DEF_ENDWORD);
   strcpy(unknown_w, DEF_UNKNOWNNAME);

   /* Default number of classes */
   numb_classes = classes_get_default();

   /* Parse command line */
   if (!InfoPrinted() && NumArgs() == 0)
      ReportUsage();
   if (NumArgs() == 0)
      Exit(EXIT_FAILURE);

   /* Create a global stack and heap */
   CreateHeap(&global_stack, "Clusterer stack", MSTAK, 1, 0.0, 8192, 8192);
   CreateHeap(&global_heap, "Clusterer heap", MHEAP, block_grab_size, 0.0, 1, 1);

   while (NextArg() == SWITCHARG) {
      s = GetSwtArg();
      if (strlen(s) != 1)
         HError(17019, "Cluster: Bad switch %s; must be single letter", s);
      switch(s[0]) {
         case 'c':
            if (NextArg()!=INTARG)
               HError(17019, "Cluster: number of categories expected for -c");
            numb_classes = GetIntArg();
            classes_set_number(numb_classes);
            set_classes = TRUE;
            break;
         case 'i':
            if (NextArg()!=INTARG)
               HError(17019, "Cluster: number of iterations expected for -i");
            iterations = GetIntArg();
            break;
         case 'r':
            if (NextArg()!=INTARG)
               HError(17019, "Cluster: recovery export frequency expected for -r");
            rec_freq = GetIntArg();
            break;
         case 'm':
            classes_showMLV(1);
            break;
         case 'o':
            if (NextArg()!=STRINGARG)
               HError(17019, "Cluster: output filename prefix expected for -o");
            set_output_prefix(GetStrArg());
            break;
         case 'p':
            if (NextArg()!=STRINGARG)
               HError(17019, "Cluster: output filename expected for -p");
            write_classprobs = GetStrArg();
            break;
         case 'q':
            if (NextArg()!=STRINGARG)
               HError(17019, "Cluster: output filename expected for -q");
            write_classcounts = GetStrArg();
            break;
         case 'l':
            if (NextArg()!=STRINGARG)
               HError(17019, "Cluster: output filename prefix expected for -l");
            init_cmap = GetStrArg();
            loaded_map = TRUE;
            break;
         case 's':
            if (NextArg()!=STRINGARG)
               HError(17019, "Cluster: sentence start word expected for -s");
            strcpy(sent_start, GetStrArg());
            break;
         case 't':
            if (NextArg()!=STRINGARG)
               HError(17019, "Cluster: sentence end word expected for -t");
            strcpy(sent_end, GetStrArg());
            break;
         case 'u':
            if (NextArg()!=STRINGARG)
               HError(17019, "Cluster: unknown word token expected for -u");
            strcpy(unknown_w, GetStrArg());
            passed_unk = TRUE;
            break;
         case 'x':
            if (NextArg()!=STRINGARG)
               HError(17019, "Cluster: recovery filename expected for -x");
            recover_from = GetStrArg();
            break;
         case 'w':
            if (NextArg()!=STRINGARG)
               HError(17019, "Cluster: wordmap sort order expected for -w");
            strcpy(tmp, GetStrArg());
            for (ptr=tmp; *ptr!=0; *ptr=toupper(*ptr), ptr++);
            if (strcmp(tmp, "WMAP")==0) {
               sort_order = SORT_WMAP;
            }
            else if (strcmp(tmp, "FREQ")==0) {
               sort_order = SORT_FREQ;
            }
            else {
               HError(17019, "Cluster: -w expects either WMAP or FREQ");
            }
            break;
         case 'k':
            classes_keep_unk_separate(TRUE);
            keep_unk_sep = TRUE;
            break;
         case 'v':
            verbose = TRUE;
            break;
         case 'n':
            write_logfile = !write_logfile;
            break;
         case 'T':
            trace = GetChkedInt(0, 017, s);
            break;
         default:
            HError(17019, "Cluster: Unknown switch %s", s);
      }
   }

   if (NextArg()!=STRINGARG)
      HError(17019, "Cluster: word map file name expected");
   CreateWordMap(GetStrArg(), &wmap, 0);

   min_classes = 4 + (keep_unk_sep?1:0);   /* Minimum number of classes */
   if (loaded_map && set_classes) {
      HError(-17019, "Ignoring -c option: when combined with -l the number of classes in the existing map must be used");
   }
   else if (numb_classes < min_classes) {
      HError(17019, "It doesn't make sense to specify less than %d classes -\n %d classes are reserved, and you need at least 2 more", min_classes, min_classes-2);
   }

   /* See if start and end word occur in the data */
   if (!GetLabId(sent_start, FALSE)) {
      HError(17051, "Sentence start token '%s' not in word list", sent_start);
   }
   if (!GetLabId(sent_end, FALSE)) {
      HError(17051, "Sentence end token '%s' not in word list", sent_end);
   }

   /* We can't keep the unknown word in its own class if one wasn't passed */
   if (!GetLabId(unknown_w, FALSE) && keep_unk_sep) {
      HError(17051, "Unknown word token '%s' not in word list and -k passed", unknown_w);
   }
   /* And generate a sensible warning if necessary: */
   if (!GetLabId(unknown_w, FALSE) && passed_unk) {
      HError(-17051, "Unknown word token '%s' was explicitly given with -u, but does not occur in the word map", unknown_w);
   }

   start_word_id = GetMEIndex(&wmap, (((MapEntry *)(GetLabId(sent_start, FALSE)->aux))->ndx));
   end_word_id = GetMEIndex(&wmap, (((MapEntry *)(GetLabId(sent_end, FALSE)->aux))->ndx));
   if (keep_unk_sep) {
      unknown_word_id = GetMEIndex(&wmap, (((MapEntry *)(GetLabId(unknown_w, FALSE)->aux))->ndx));
   }
   else {
      unknown_word_id = 0;
   }
   set_ids(start_word_id, end_word_id, unknown_word_id);

   /* If we're doing no iterations we want to ignore the given filename prefix
      and use the one from the classmap - this way we'll write the correct
      information into the saved probabilities file header */
   if (iterations==0 && init_cmap) {
      ptr = strrchr(init_cmap, '.');
      if (ptr) {
         *ptr = '\0';
         ptr2 = strrchr(init_cmap, '.');
         if (ptr2) {
            *ptr2 = '\0';
            set_output_prefix(init_cmap);
            *ptr2 = '.';
         }
         else
            set_output_prefix(init_cmap);
         *ptr = '.';
      }
      else {
         set_output_prefix(init_cmap);
      }
   }

   if (trace & T_FILE) {
      printf("Wordmap loaded - %d words\n", wmap.used);
   }

   unigram_init(wmap.used);
   bigram_init(wmap.used);

   /* Add input gram files to input set */
   if (trace & T_TOP)
      printf("Preparing input gram set\n");
   CreateHeap(&imem, "inputset", MSTAK, 1, 0.0, 1000, 1000);
   CreateHeap(&imem2, "inputset2", MSTAK, 1, 0.0, 1000, 1000);
   CreateInputSet(&imem, &wmap, &inset);
   CreateInputSet(&imem2, &wmap, &inset2);
   weight = 1.0;
   while (NextArg() == STRINGARG || NextArg() == FLOATARG) {
      if (NextArg() == FLOATARG) {
         weight = GetFltArg();
      }
      if (weight==0.0 || weight<-10000.0 || weight>10000.0) {
         HError(17019, "Improbable gram file weight (%.4f)", weight);
      }
      if (NextArg()!=STRINGARG) {
         HError(17019, "Gram file name expected");
      }
      filename = GetStrArg();
      AddInputGFile(&inset, filename, weight);
      AddInputGFile(&inset2, filename, weight);
      read_gram_files = TRUE;
      if (trace & T_TOP)
         printf("Input gram file %s added (weight=%f)\n", filename, weight);
   }
   if (!read_gram_files) {
      HError(17019, "No gram files passed");
   }

   LoadBiGrams();
   LoadUniGrams();
   bigram_added_all();
   DeleteHeap(&imem);
   DeleteHeap(&imem2);

   if (init_cmap) {
      import_classmap(init_cmap, wmap.used);
   }
   else if (recover_from) {
      do_recovery(recover_from, wmap.used);
   }

   /* Allocate memory and compute bigram pair arrays */
   if (!recover_from) {
      classes_init(wmap.used);

      /* Perform default initial clustering */
      if (!init_cmap) {
         initial_cluster();
      }

      /* Calculate initial counts required */
      setup_all_counts();
   }

   /* Run clustering algorithm */
   for (loop=0; loop<iterations; loop++) {
      cluster_words(1);
      export_classes(0);
   }

   if (write_classprobs) {
      write_word_probs(write_classprobs);
   }
   if (write_classcounts) {
      write_word_counts(write_classcounts);
   }

   if (trace &
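The backward lookup table built in bigram_added_all is a plain inversion of the forward table: predecessor counts are accumulated first (done incrementally in bigram_add via backward[ng[1]].size++), space is then allocated to exactly those counts, the sizes are reset, and a second pass fills the entries in. Below is a minimal standalone sketch of that two-pass pattern; the struct layouts and the tiny forward table are invented here for illustration only (the real bi_count and bigrams definitions live elsewhere in cluster.c).

#include <stdio.h>
#include <stdlib.h>

typedef struct { int id; int count; } bi_count;      /* one successor/predecessor entry */
typedef struct { int size; bi_count *bi; } bigrams;  /* per-word list of entries */

int main(void)
{
   enum { MAX_WORDS = 4 };
   /* Forward table: word 0 is followed by 1 (count 3) and 2 (count 1); word 1 by 2 (count 2) */
   bi_count f0[] = { {1, 3}, {2, 1} };
   bi_count f1[] = { {2, 2} };
   bigrams forward[MAX_WORDS]  = { {2, f0}, {1, f1}, {0, NULL}, {0, NULL} };
   bigrams backward[MAX_WORDS] = { {0, NULL}, {0, NULL}, {0, NULL}, {0, NULL} };
   int i, j;

   /* Pass 1: count predecessors (cluster.c accumulates this while reading bigrams) */
   for (i = 0; i < MAX_WORDS; i++)
      for (j = 0; j < forward[i].size; j++)
         backward[forward[i].bi[j].id].size++;

   /* Allocate exactly the space needed, then reset size to reuse it as a fill cursor */
   for (i = 0; i < MAX_WORDS; i++) {
      backward[i].bi = calloc(backward[i].size ? backward[i].size : 1, sizeof(bi_count));
      backward[i].size = 0;
   }

   /* Pass 2: invert the table */
   for (i = 0; i < MAX_WORDS; i++)
      for (j = 0; j < forward[i].size; j++) {
         int succ = forward[i].bi[j].id;
         backward[succ].bi[backward[succ].size].id = i;
         backward[succ].bi[backward[succ].size].count = forward[i].bi[j].count;
         backward[succ].size++;
      }

   /* Word 2 should now list predecessors 0 (count 1) and 1 (count 2) */
   for (j = 0; j < backward[2].size; j++)
      printf("predecessor of 2: %d (count %d)\n", backward[2].bi[j].id, backward[2].bi[j].count);

   for (i = 0; i < MAX_WORDS; i++)
      free(backward[i].bi);
   return 0;
}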