?? cluster.c
字號:
/* ----------------------------------------------------------- *//* *//* ___ *//* |_| | |_/ SPEECH *//* | | | | \ RECOGNITION *//* ========= SOFTWARE */ /* *//* *//* ----------------------------------------------------------- *//* developed at: *//* *//* Speech Vision and Robotics group *//* Cambridge University Engineering Department *//* http://svr-www.eng.cam.ac.uk/ *//* *//* author: Gareth Moore <glm20@eng.cam.ac.uk> *//* *//* ----------------------------------------------------------- *//* Copyright: *//* *//* 1999-2002 Cambridge University *//* Engineering Department *//* *//* Use of this software is governed by a License Agreement *//* ** See the file License for the Conditions of Use ** *//* ** This banner notice must not be removed ** *//* *//* ----------------------------------------------------------- *//* Cluster.c: Cluster words into classes */char *Cluster_version = "!HVER!Cluster: 3.2 [CUED 09/12/02]";char *Cluster_vc_id = "$Id: Cluster.c,v 1.1 2002/12/19 16:36:27 ge204 Exp $";/* HTK/HLM libraries: */#include "HShell.h"#include "HMem.h"#include "HMath.h"#include "HWave.h"#include "HLabel.h"#include "LUtil.h"#include "LWMap.h"#include "LGBase.h"#include "LModel.h"#include "LCMap.h"/* Uncomment the following line to run integrity checks on each iteration to ensure that: * The class counts all add up correctly * The maximum-likelihood values have all been updated correctly *//*#define INTEGRITY_CHECK *//* -------------------------- Trace Flags ------------------------ */#define T_TOP 00001 /* Basic tracing */#define T_FILE 00002 /* Report major file operations */#define T_EXTRA 00004 /* Extra tracing */#define T_BOND 007 /* Undercover tracing */#define T_MEM 00010 /* Trace memory usage *//* Constants *//* Size of blocks we grab from New() and then allocate internally (This is done to avoid grabbing around 100,000 small blocks!) #bytes */#define block_grab_size 1048576/* Cut-off point at which we decide not to use the internal block and just use New() - number of bigrams using a given word */#define block_cut_off (200*sizeof(bi_count))/* Initial size of bigram read buffer (large enough to hold maximum number of bigrams featuring a single word in a single position) - can grow */#define initial_bigram_buffer 10000/* Granularity of growth of above buffer, if required */#define bigram_buffer_grow 1000/* Identifiers for word clustering sort orders */#define SORT_WMAP 1#define SORT_FREQ 2/* Type definitions *//* Bigram count */typedef struct { UInt id; /* Word id */ int count; /* Bigram count */}bi_count;/* All bigrams which start or end with a certain word */typedef struct { bi_count *bi; /* Array of counts */ int size; /* Number of bigrams with this word in */}bigrams;typedef UInt unigram; /* Occurrence count *//* ---------------------- Global Variables ----------------------- *//* DEFAULTS *//* Global variables - defaults */static int N = 1000; /* Default number of classes */static Boolean show_MLV=FALSE; /* Show MLV after each change */static char *export_prefix="cluster"; /* Prefix of export filenames */static Boolean unk_sep = FALSE; /* Keep unknown word in its own class? */static Boolean outCMapRaw = FALSE; /* Output classes in raw mode */static Boolean inCMapRaw = FALSE; /* Input classes in raw mode *//* Global variables - others *//* Used by core clusterer */static int **clCnt=NULL; /* Array of arrays; index with count[c1][c2] (clCnt = 'class count') */static int *tmp_c1=NULL; /* Temporary set of bigrams (1) */static int *tmp_c2=NULL; /* Temporary set of bigrams (2) */static int *tmp_c3=NULL; /* Temporary set of bigrams (3) */static int *tmp_c4=NULL; /* Temporary set of bigrams (4) */static int *tmp_sum1=NULL; /* Temporary word-class counts (1) */static int *tmp_sum2=NULL; /* Temporary word-class counts (2) */static int *clSum=NULL; /* Class unigram [classes] returns word unigram sum */static int *clMemb=NULL; /* Class membership [words] returns class given a word */static int GwGw, gGw, Gwg, gg; /* Special-case class counts */static double *mlv; /* ML values involving class [N] */static int *bipair; /* Array of word bigrams (w,w) */static int sum_of_all_bigram_counts;/* Sum of all bigram counts */static int sum_of_all_uni_counts; /* Sum of all unigram counts */static int curr_class; /* Temporary value, saves passing */static int start_class = 2; /* Which is the first 'real' class? */static double curr_MLV=0; /* ...and its current value */static int W = 0; /* Number of words */static bigrams *forward, *backward; /* Forward and backward bigram tables */static int export_index=0; /* What iteration is this? */static FILE *logfile=NULL; /* Log progress to this file */static char tmp[256]; /* Scrap array */static int start_id=-1, end_id=-1; /* Start and end word ids */static int unk_id=-1; /* Unknown word token id */static MemHeap global_heap; /* Claim fixed block memory from here */static MemHeap global_stack; /* Claim other memory from here *//* Used by uni/bigram storage */static unigram *uni; /* Unigram store */static int max_words; /* Maximum number of words */static bigrams *forward=0, *backward; /* Forward and backward bigram tables */static void *block=0; /* First word of free memory we have */static void *block_end=0; /* First byte after current block */static UInt last_word; /* ID of last word (w,?) read in */static int store_idx; /* Next index of second word in bigram */static bi_count *store; /* Store of current word w (w,*) pairs */static int curr_bistore_size; /* Current size of bigram buffer store *//* Front-end code */static WordMap wmap; /* HTK word map */static MemHeap imem; /* memory for input gram file set */static MemHeap imem2; /* memory for input gram file set (copy) */static NGInputSet inset; /* input gram file set */static NGInputSet inset2; /* input gram file set (copy) */static char sent_start[256]; /* sentence start word */static char sent_end[256]; /* sentence end word */static char unknown_w[256]; /* unknown word token */static ConfParam *cParm[MAXGLOBS]; /* configuration script parameters */static int nParm = 0; /* total num params */static int trace = 0; /* trace setting */static UInt *class_sort; /* Used to sort output alphabetically */static Boolean pipe_logfile; /* HShell file handling - using pipe? */static int rec_freq = 1000; /* Frequency we write recovery files (0 = off) */static Boolean verbose = FALSE; /* Verbose file logging */static Boolean write_logfile = TRUE; /* Write a log file during execution */static int sort_order = SORT_WMAP; /* Order words are considered in */static int *sort_uni; /* Sort unigrams by count */static Boolean outCMapRawTrap = FALSE; /* Has this been changed by config file? */static Boolean inCMapRawTrap = FALSE; /* Has this been changed by config file? *//* ---------------- Function Prototypes -------------------------- */#ifdef INTEGRITY_CHECKstatic void check_counts_sum(void);static void max_likelihood_check(void);#endifstatic void max_likelihood_init(void);/* Add a bigram */void bigram_add(NGram ng, int count);/* Call when all bigrams have been passed in */void bigram_added_all(void);/* Must be called before almost any other function in this file will work */void bigram_init(int words);/* Initialise this unigram storage module */void unigram_init(int numb_words);/* Add a unigram */void unigram_add(NGram ng, int count);/* Read a unigram */UInt unigram_read(UInt id);/* Set whether to show MLV or not (non-zero = on) */void classes_showMLV(int on);/* Set prefix for all output files */void set_output_prefix(char *name);/* Return the number of classes used by default */int classes_get_default(void);/* Set the number of classes used */void classes_set_number(int numb);/* Initialise this module - MUST have initialised bigrams first */void classes_init(int numb_words);/* Perform a given number of iterations of the clustering algorithm */void cluster_words(int iterations);/* Setup all class counts, given existing class word map */void setup_all_counts(void);/* Perform some initial clustering - currently just puts all in one class, except for given start, end and unknown (if -k passed) ids */void initial_cluster(void);/* Write out class sets (pass non-zero to write recovery file) */void export_classes(int recovery);/* Import existing HLM classmap */void import_classmap(char *fname, int numb_words);/* Recover from a given recovery file */void do_recovery(char *fname, int words);/* Write out p(word | class) probabilities */void write_word_probs(char *filename);/* Write out p(word | class) counts */void write_word_counts(char *filename);/* Specify whether to keep the unknown word in its own solo-member class or not (non-zero = keep separate) */void classes_keep_unk_separate(int keep_separate);/* Pass in start, end and unknown word ids */void set_ids(int start_id, int end_id, int unk_id);/* Report an error message to stderr */void report_error(char *text);/* TEMP? */char *what_is_word(UInt id); /* In Cluster.c *//* ---------------- Process Command Line ------------------------- *//* See if any configuration parameters have been set for this tool */void SetConfParms(void){ char b[256]; int i; nParm = GetConfig("CLUSTER", TRUE, cParm, MAXGLOBS); if (nParm>0) { if (GetConfInt(cParm,nParm,"TRACE", &i)) trace = i; if (GetConfStr(cParm,nParm,"STARTWORD", b)) strcpy(sent_start, b); if (GetConfStr(cParm,nParm,"ENDWORD", b)) strcpy(sent_end, b); if (GetConfStr(cParm,nParm,"UNKNOWNNAME", b)) strcpy(unknown_w, b); if (GetConfBool(cParm,nParm,"INCMAPRAW", &inCMapRaw)) { inCMapRawTrap = TRUE; } if (GetConfBool(cParm,nParm,"OUTCMAPRAW", &outCMapRaw)) { outCMapRawTrap = TRUE; } }}/* Provide skeleton help */void ReportUsage(void){ printf("\nUSAGE: Cluster [options] mapfile gramfile ...\n\n"); printf(" Option Default\n"); printf(" -c n use n classes %d\n", classes_get_default()); printf(" -i n perform n iterations 1\n"); printf(" -k put unknown word in a separate class off\n"); printf(" -l f start from existing classmap 'f' off\n"); printf(" -m add running ML values to logfile %s\n", show_MLV?"on":"off"); printf(" -n do not produce any logfile output %s\n", write_logfile?"off":"on"); printf(" -o f set prefix of output files %s\n", export_prefix); printf(" -p f write word|class probs to file 'f' off\n"); printf(" -q f write word|class counts to file 'f' off\n"); printf(" -r n write recovery file freq (0=off) %d\n", rec_freq); printf(" -s t specify sentence start word as 't' %s\n", DEF_STARTWORD); printf(" -t t specify sentence end word as 't' %s\n", DEF_ENDWORD); printf(" -u t specify unknown word token as 't' %s\n", DEF_UNKNOWNNAME); printf(" -v use verbose log file format %s\n", verbose?"on":"off"); printf(" -w t specify word sort order - WMAP/FREQ %s\n", sort_order==SORT_WMAP?"WMAP":"FREQ"); printf(" -x f continue from recovery file 'f' off\n"); printf(" Standard options:\n"); PrintStdOpts(""); printf("\n");}void check_file(FILE *file, char *fname, char *function){ if (!file) HError(17011, "%s: Can't open file '%s'", function, fname);}/* --------------------- Import N-grams ----------------- *//* LoadBiGrams: load in N-gram files, keeping only bigrams */static void LoadBiGrams(){ UInt ng[2]; float cnt; /* Occurrence count */ int added=0; if (trace & T_FILE) { printf("Loading bigrams from N-gram files\n"); } OpenInputSet(&inset); if (trace & T_FILE) { printf("Opened input set of %d entries\n", inset.nFiles); } while (GetNextNGram(&inset, ng, &cnt, 2)) { /* ng stores ngram in format [0],[1]...[N]; count is separate */ ng[0] = GetMEIndex(&wmap, ng[0]); ng[1] = GetMEIndex(&wmap, ng[1]); bigram_add(ng, (int) cnt); added++; } CloseInputSet(&inset); if (trace & T_FILE) { printf("Bigram load complete - %d bigrams imported\n", added); }}/* LoadUniGrams: load in N-gram files - we want unigrams */static void LoadUniGrams(){ UInt ng[1]; float cnt; /* Occurrence count */ int added=0; if (trace & T_FILE) { printf("Loading unigrams from N-gram files\n"); } OpenInputSet(&inset2); while (GetNextNGram(&inset2, ng, &cnt, 1)) { /* ng stores ngram in format [0],[1]...[N]; count is separate */ ng[0] = GetMEIndex(&wmap, ng[0]); /* convert into value indexed from 0 */ unigram_add(ng, (int) cnt); added++; } CloseInputSet(&inset2); if (trace & T_FILE) { printf("Unigram load complete - %d unigrams imported\n", added); }}/* Return word text given an internal id */char *what_is_word(UInt id){ return wmap.id[id]->name;}/* Return a word id given a word */UInt get_id_from_word(char *word){ if (!(GetLabId(word, FALSE))) { HError(17050, "Word '%s' found in class map but not in word map", word); } return GetMEIndex(&wmap, (((MapEntry *)(GetLabId(word, FALSE)->aux))->ndx));}/* Class functions *//* Set whether to show MLV or not */void classes_showMLV(int on){ show_MLV = on ? TRUE : FALSE;}/* Set prefix for all output files */void set_output_prefix(char *name){ if (clCnt) { HError(-17099, "set_output_prefix(): this function must be called before initialisation"); /* No need to abort - it will just affect future files opened */ } export_prefix = New(&global_stack, strlen(name)+1); strcpy(export_prefix, name);}
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -