?? wi2dvf.c
字號:
/* Add the number of bytes it will take to write the WI'th "document vector" */ seek_current += bow_dv_write_size (wi2dvf->entry[wi].dv); } } /* We have now finished writing the DVF seek information; we should be at the position we calculated earlier for SEEK_BASE. */ assert (ftell (fp) == seek_base); /* Now write the actual "document vector" information. */ for (wi = 0; wi < wi2dvf->size; wi++) { if (wi2dvf->entry[wi].dv != NULL) { /* Make sure we are at the same place in the file that we said we'd be. */ assert (ftell (fp) == wi2dvf->entry[wi].seek_start); bow_dv_write (wi2dvf->entry[wi].dv, fp); } }}/* Write WI2DVF to a file, in a machine-independent format. This is the format expected by bow_wi2dvf_new_from_file(). */voidbow_wi2dvf_write_data_file (bow_wi2dvf *wi2dvf, const char *filename){ FILE *fp; if (!(fp = fopen (filename, "w"))) bow_error ("Couldn't open file `%s' for writing.", filename); bow_wi2dvf_write (wi2dvf, fp); fclose (fp);}/* Create a `wi2dvf' by reading data from file-pointer FP. This doesn't actually read in all the "document vectors"; it only reads in the DVF information, and lazily loads the actually "document vectors". */bow_wi2dvf *bow_wi2dvf_new_from_data_fp (FILE *fp){ int size; bow_wi2dvf *ret; int wi; /* Read the number of "word indices" used as keys in the new WI2DVF. */ bow_fread_int (&size, fp); /* Create a new WI2DVF of that size.*/ ret = bow_wi2dvf_new (size); ret->fp = fp; /* Read all the DVF information, but not the actual "document vectors"; We'll do that later in bow_wi2dvf_dv(). */ for (wi = 0; wi < size; wi++) { bow_fread_int (&(ret->entry[wi].seek_start), fp); if (ret->entry[wi].seek_start != -1) (ret->num_words)++; ret->entry[wi].dv = NULL; } return ret;}/* Create a `wi2dvf' by reading data from a file. This doesn't actually read in all the "document vectors"; it only reads in the DVF information, and lazily loads the actually "document vectors". 
*/bow_wi2dvf *bow_wi2dvf_new_from_data_file (const char *filename){ FILE *fp; bow_wi2dvf *ret; if (!(fp = fopen (filename, "r"))) bow_error ("Couldn't open file `%s' for reading.", filename); ret = bow_wi2dvf_new_from_data_fp (fp); /* Don't close the FP because it will still be needed for reading the "document vectors", DV's. */ return ret;}/* Free the memory held by the map WI2DVF. */voidbow_wi2dvf_free (bow_wi2dvf *wi2dvf){ int i; if (wi2dvf->fp) fclose (wi2dvf->fp); for (i = 0; i < wi2dvf->size; i++) { if (wi2dvf->entry[i].dv) bow_dv_free (wi2dvf->entry[i].dv); } bow_free (wi2dvf);}/* Return the "document vector" corresponding to "word index" WI. This function will read the "document vector" out of the file passed to bow_wi2dvf_new_from_file() if is hasn't been read already. */bow_dv *bow_wi2dvf_dv (bow_wi2dvf *wi2dvf, int wi){ /* If the word-index is higher than anything we know about, return NULL. This could legitimately happen if the query document has vocabulary that wasn't in the training data. */ if (wi >= wi2dvf->size) return NULL; /* If the "document vector" is available (it has already been read in, it is non-NULL), and it is not hidden (it's SEEK_START is greater than or equal to -1) then simply return it. Note that newly created WI2DVF's that haven't been saved (like those for VPC_BARREL's) with have non-NULL dv's and SEEK_START's of -1. */ if (wi2dvf->entry[wi].dv && wi2dvf->entry[wi].seek_start >= -1) { assert (wi2dvf->entry[wi].dv->idf == wi2dvf->entry[wi].dv->idf); return wi2dvf->entry[wi].dv; } /* If the SEEK_START position of WI'th DVF is -1, then this was an empty "document vector", so return NULL. If the SEEK_START position of the WI'th DVF is less than -1, then this document vector was hidden by BOW_WI2DVF_HIDE_WI(), so return NULL. 
*/ if (wi2dvf->entry[wi].seek_start <= -1) return NULL; /* If we want to read it in, but if this WI2DVF isn't backed by a data file (for example, it's being built from a directory of text files), then just return NULL. */ if (wi2dvf->fp == NULL) return NULL; /* Read in the document vector. */ assert (wi2dvf->entry[wi].seek_start > 2); fseek (wi2dvf->fp, wi2dvf->entry[wi].seek_start, SEEK_SET); wi2dvf->entry[wi].dv = bow_dv_new_from_data_fp (wi2dvf->fp); /* Check for NaN. */ assert (wi2dvf->entry[wi].dv->idf == wi2dvf->entry[wi].dv->idf); assert (wi == wi2dvf->size - 1 || wi2dvf->entry[wi+1].seek_start == -1 || ftell (wi2dvf->fp) == wi2dvf->entry[wi+1].seek_start); /* Return what we just read. */ return wi2dvf->entry[wi].dv;}/* Compare two maps, and return 0 if they are equal. This function was written for debugging. */intbow_wi2dvf_compare (bow_wi2dvf *map1, bow_wi2dvf *map2){ int max_wi, wi; bow_dv *dv1, *dv2; max_wi = bow_num_words (); /* (map1->size > map2->size) ? map1->size : map2->size; */ /* Step through all the "word indices" in each of the maps. */ for (wi = 0; wi < max_wi; wi++) { dv1 = bow_wi2dvf_dv (map1, wi); dv2 = bow_wi2dvf_dv (map2, wi); if (dv1 == NULL || dv2 == NULL) { if (!(dv1 == NULL && dv2 == NULL)) { bow_verbosify (bow_progress, "%s: Differ by NULL at wi %d\n", __PRETTY_FUNCTION__, wi); return 1; } } else { /* We have two non-NULL "document vectors" */ int max_dv_i, dv_i; max_dv_i = (dv1->length > dv2->length) ? dv1->length : dv2->length; for (dv_i = 0; dv_i < max_dv_i; dv_i++) { if (dv1->entry[dv_i].di != dv2->entry[dv_i].di || dv1->entry[dv_i].count != dv2->entry[dv_i].count) { bow_verbosify (bow_progress, "%s: Differ by entry at wi %d\n", __PRETTY_FUNCTION__, wi); return 2; } } } } return 0;}/* Print statistics about the WI2DVF map to STDOUT. 
*/
/* Every word index below bow_num_words() is looked up with
   bow_wi2dvf_dv(); word indices with no "document vector" are
   skipped.  Reported figures: the shortest, average and longest
   "document vector" length (each extreme with an example word and the
   number of other words sharing that length), and the total number of
   vector entries used versus allocated but unused.  If the map yields
   no "document vectors" at all, only the word counts are printed,
   followed by a note that there is nothing to summarize. */
void
bow_wi2dvf_print_stats (bow_wi2dvf *map)
{
  int wi, wi_max;
  bow_dv *dv;
  /* stats on "document vector" length */
  int dvl_count = 0;            /* number of non-NULL vectors seen */
  double dvl_total = 0.0;       /* sum of their lengths */
  int dvl_max = 0, dvl_min = 0;
  int dvl_max_count = 0, dvl_min_count = 0;
  int dvl_max_wi = 0, dvl_min_wi = 0;
  /* stats on used/unused memory */
  int de_used_count = 0, de_unused_count = 0;

  wi_max = bow_num_words ();
  printf ("%8d libbow's num words\n", wi_max);
  printf ("%8d num words in wi2dvf\n", map->num_words);

  /* Get stats on "document vector" length. */
  for (wi = 0; wi < wi_max; wi++)
    {
      dv = bow_wi2dvf_dv (map, wi);
      if (dv == NULL)
        continue;

      if (dvl_count == 0)
        {
          /* The first vector found seeds both extremes. */
          dvl_max = dvl_min = dv->length;
          dvl_max_wi = dvl_min_wi = wi;
          dvl_max_count = dvl_min_count = 1;
        }
      else
        {
          if (dv->length > dvl_max)
            {
              dvl_max = dv->length;
              dvl_max_wi = wi;
              dvl_max_count = 1;
            }
          else if (dv->length == dvl_max)
            dvl_max_count++;

          if (dv->length < dvl_min)
            {
              dvl_min = dv->length;
              dvl_min_wi = wi;
              dvl_min_count = 1;
            }
          else if (dv->length == dvl_min)
            dvl_min_count++;
        }

      dvl_count++;
      dvl_total += dv->length;
      de_used_count += dv->length;
      assert (dv->size - dv->length >= 0);
      de_unused_count += dv->size - dv->length;
    }

  /* Without any vectors there is no minimum, maximum or average to
     report (and the averages would divide by zero). */
  if (dvl_count == 0)
    {
      printf ("%8d document vectors found; no length statistics\n", 0);
      return;
    }

  /* The counts include the example word itself, so "others" is one
     less than the count. */
  printf ("%8d minimum document vector length (eg word=`%s', %d others)\n",
          dvl_min, bow_int2word (dvl_min_wi), dvl_min_count - 1);
  printf ("%8.1f average document vector length\n",
          dvl_total / dvl_count);
  printf ("%8d maximum document vector length (eg word=`%s', %d others)\n",
          dvl_max, bow_int2word (dvl_max_wi), dvl_max_count - 1);
  printf ("%8d document vector entries used\n", de_used_count);
  printf ("%8d document vector entries allocated but unused\n",
          de_unused_count);
  printf ("%8.1f average unused document vector entries\n",
          ((double) de_unused_count) / dvl_count);
}
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -