?? wi2dvf.c
字號:
/* Word-index to document-vector-file */

/* Copyright (C) 1997 Andrew McCallum

   Written by:  Andrew Kachites McCallum <mccallum@cs.cmu.edu>

   This file is part of the Bag-Of-Words Library, `libbow'.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License
   as published by the Free Software Foundation, version 2.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */

#include <bow/libbow.h>
#include <netinet/in.h>         /* for machine-independent byte-order */
#include <assert.h>

/* Reset the bow_dvf lvalue DVF to the "empty" state used throughout
   this file: SEEK_START of -1 and no in-core "document vector".
   The expansion is a braced block, so always invoke it as a complete
   statement followed by a semicolon. */
#define INIT_BOW_DVF(DVF) { DVF.seek_start = -1; DVF.dv = NULL; }

/* Number of entries bow_wi2dvf_new() allocates when its CAPACITY
   argument does not name a usable (positive) size. */
unsigned int bow_wi2dvf_default_capacity = 1024;

/* Create and return a new, empty WI2DVF with room for CAPACITY
   "word indices".

   CAPACITY is the number of bow_dvf entries to allocate.  A value of
   zero (or, defensively, any negative value) selects
   BOW_WI2DVF_DEFAULT_CAPACITY; previously a negative value was fed
   straight into the allocation-size arithmetic, where it wrapped.

   The returned structure has SIZE set to the capacity, NUM_WORDS set
   to 0, FP set to NULL, and every entry reset with INIT_BOW_DVF().
   The memory is obtained with bow_malloc(). */
bow_wi2dvf *
bow_wi2dvf_new (int capacity)
{
  bow_wi2dvf *ret;
  int i;

  if (capacity <= 0)
    capacity = bow_wi2dvf_default_capacity;

  /* One header plus CAPACITY trailing bow_dvf entries. */
  ret = bow_malloc (sizeof (bow_wi2dvf) + (sizeof (bow_dvf) * capacity));
  ret->size = capacity;
  ret->num_words = 0;
  ret->fp = NULL;
  for (i = 0; i < capacity; i++)
    INIT_BOW_DVF (ret->entry[i]);
  return ret;
}

/* xxx We should think about a scheme that doesn't require keeping all
   the "document vectors" in core at the same time.  We could write
   them to disk, read them back in when we needed to add to them, then
   write them back out again.  We would need a nice caching scheme, as
   well as nice way to deal with "document vectors" that grow. */

/* Add a "word vector" WV, associated with "document index" DI, to
   the map WI2DVF.
*/
/* Parameters:
     WI2DVF  address of the map pointer; *WI2DVF may be replaced when
             the map has to be reallocated to hold more word indices.
     DI      the "document index" the counts belong to.
     WV      the "word vector" whose entries are deposited.
   Each entry's COUNT is recorded; the weight is always recorded as 0.
   Every word index in WV must be non-negative and smaller than the
   (possibly just grown) size of the map; this is asserted. */
void
bow_wi2dvf_add_di_wv (bow_wi2dvf **wi2dvf, int di, bow_wv *wv)
{
  int i;
  int max_wi = bow_num_words ();

  if (max_wi > (*wi2dvf)->size)
    {
      /* There are so many unique words, we need to grow the array
         that maps WI's to DVF's. */
      int new_wi = (*wi2dvf)->size;   /* first not-yet-initialized entry */

      (*wi2dvf)->size = MAX (max_wi, (*wi2dvf)->size * 2);
      (*wi2dvf) = bow_realloc (*wi2dvf,
                               (sizeof (bow_wi2dvf)
                                + (sizeof (bow_dvf) * (*wi2dvf)->size)));
      /* Initialize the new part of the realloc'ed space. */
      for ( ; new_wi < (*wi2dvf)->size; new_wi++)
        INIT_BOW_DVF ((*wi2dvf)->entry[new_wi]);
    }

  /* Run down the "word vector", depositing each entry in the WI2DVF. */
  for (i = 0; i < wv->num_entries; i++)
    {
      int wi = wv->entry[i].wi;

      /* Guard both ends: a negative index would otherwise write in
         front of the entry array. */
      assert (wi >= 0 && (*wi2dvf)->size > wi);
      if ((*wi2dvf)->entry[wi].dv == NULL)
        {
          /* There is not yet a "document vector" for "word index"
             WI, so create one. */
          (*wi2dvf)->entry[wi].dv = bow_dv_new (0);
          /* This 2 is a flag to the hide/unhide code that this DV
             exists. */
          (*wi2dvf)->entry[wi].seek_start = 2;
          ((*wi2dvf)->num_words)++;
        }
      /* Add the "document index" DI and the count associated with
         word index WI to the WI'th "document vector". */
      /* xxx Should we be adding WV->ENTRY[i].WEIGHT also? */
      bow_dv_add_di_count_weight (&((*wi2dvf)->entry[wi].dv), di,
                                  wv->entry[i].count, 0);
    }
}

/* Read all the words from file pointer FP, and add them to WI2DVF,
   associated with document index DI.

   Words are obtained through BOW_DEFAULT_LEXER; each word is passed
   to bow_word2int_add_occurrence(), and words for which that returns
   a negative index are skipped.  Every remaining occurrence adds a
   count of 1 and a weight of 0 for the WI/DI pair.  *WI2DVF may be
   replaced if the map has to grow. */
void
bow_wi2dvf_add_di_text_fp (bow_wi2dvf **wi2dvf, int di, FILE *fp)
{
  char word[BOW_MAX_WORD_LENGTH]; /* buffer for reading and stemming words */
  int wi;                         /* a word index */
  bow_lex *lex;

  /* Loop once for each document in this file. */
  while ((lex = bow_default_lexer->open_text_fp (bow_default_lexer, fp)))
    {
      /* Loop once for each lexical token in this document. */
      while (bow_default_lexer->get_word (bow_default_lexer, lex,
                                          word, BOW_MAX_WORD_LENGTH))
        {
          /* Find out the word's "index". */
          wi = bow_word2int_add_occurrence (word);
          if (wi < 0)
            continue;
          /* Increment our stats about this word/document pair. */
          bow_wi2dvf_add_wi_di_count_weight (wi2dvf, wi, di, 1, 0);
        }
      bow_default_lexer->close (bow_default_lexer, lex);
    }
}

/* In the map WI2DVF, increase by COUNT and WEIGHT our record of the
   number times and weight that the document with "document index" DI
   contains the word with "word index" WI.

   The map is grown (and *WI2DVF possibly replaced) when WI does not
   fit in it yet.  A "document vector" is created for WI on first
   use, which also increments NUM_WORDS.  WI must be non-negative;
   this is asserted, because a negative index would pass the growth
   test below and then index in front of the entry array. */
void
bow_wi2dvf_add_wi_di_count_weight (bow_wi2dvf **wi2dvf, int wi, int di,
                                   int count, float weight)
{
  assert (wi >= 0);

  if (wi >= (*wi2dvf)->size)
    {
      /* There are so many unique words, we need to grow the array
         that maps WI's to DVF's. */
      int old_size = (*wi2dvf)->size; /* first not-yet-initialized entry */

      (*wi2dvf)->size = MAX (wi+1, (*wi2dvf)->size * 2);
      (*wi2dvf) = bow_realloc (*wi2dvf,
                               (sizeof (bow_wi2dvf)
                                + (sizeof (bow_dvf) * (*wi2dvf)->size)));
      /* Initialize the new part of the realloc'ed space. */
      for ( ; old_size < (*wi2dvf)->size; old_size++)
        INIT_BOW_DVF ((*wi2dvf)->entry[old_size]);
    }

  /* Increment the stats for the WI/DI pair. */
  if ((*wi2dvf)->entry[wi].dv == NULL)
    {
      /* There is not yet a "document vector" for "word index" WI,
         so create one. */
      (*wi2dvf)->entry[wi].dv = bow_dv_new (0);
      /* This 2 is a flag to the hide/unhide code that this DV
         exists. */
      (*wi2dvf)->entry[wi].seek_start = 2;
      ((*wi2dvf)->num_words)++;
    }
  /* Add the "document index" DI and the count associated with word
     index WI to the WI'th "document vector". */
  bow_dv_add_di_count_weight (&((*wi2dvf)->entry[wi].dv), di, count, weight);
}

/* Return a pointer to the BOW_DE for a particular word/document pair,
   or return NULL if there is no entry for that pair.
*/bow_de *bow_wi2dvf_entry_at_wi_di (bow_wi2dvf *wi2dvf, int wi, int di){ bow_dv *dv = bow_wi2dvf_dv (wi2dvf, wi); if (!dv) return NULL; return bow_dv_entry_at_di (dv, di);}/* Remove the word with index WI from the vocabulary of the map WI2DVF */voidbow_wi2dvf_remove_wi (bow_wi2dvf *wi2dvf, int wi){ assert (wi < wi2dvf->size); if (wi2dvf->entry[wi].dv) { bow_dv_free (wi2dvf->entry[wi].dv); (wi2dvf->num_words)--; } INIT_BOW_DVF (wi2dvf->entry[wi]);}#define FREE_WHEN_HIDING_WI 0/* Temporarily hide the word with index WI from the vocabulary of the map WI2DVF. The function BOW_WI2DVF_DV() will no longer see the entry for this WI, but */voidbow_wi2dvf_hide_wi (bow_wi2dvf *wi2dvf, int wi){ assert (wi < wi2dvf->size);#if FREE_WHEN_HIDING_WI if (wi2dvf->entry[wi].dv) { bow_dv_free (wi2dvf->entry[wi].dv); /* (wi2dvf->num_words)--; */ } wi2dvf->entry[wi].dv = NULL;#endif /* The token -1 is reserved to mean that the DV is uninitialized. */ assert (!(wi2dvf->entry[wi].dv && wi2dvf->entry[wi].seek_start == -1)); /* Make the SEEK_START negative so we won't use it in normal situations, but will be able to remember it and get it back when we need it. */ if (wi2dvf->entry[wi].seek_start > 0) { wi2dvf->entry[wi].seek_start = - (wi2dvf->entry[wi].seek_start); (wi2dvf->num_words)--; }}/* Make visible all DVF's that were hidden with BOW_WI2DVF_HIDE_WI(). */voidbow_wi2dvf_unhide_all_wi (bow_wi2dvf *wi2dvf){ int wi; for (wi = 0; wi < wi2dvf->size; wi++) { if (wi2dvf->entry[wi].seek_start < -1) { wi2dvf->entry[wi].seek_start = - (wi2dvf->entry[wi].seek_start); (wi2dvf->num_words)++; } }}/* Write WI2DVF to file-pointer FP, in a machine-independent format. This is the format expected by bow_wi2dvf_new_from_fp(). */voidbow_wi2dvf_write (bow_wi2dvf *wi2dvf, FILE *fp){ long seek_base; long seek_current; int wi; bow_wi2dvf_unhide_all_wi (wi2dvf); /* Figure out how many bytes the WI2DVF (without the DV's) will take at the beginning the file. 
*/ seek_base = (ftell (fp) /* Where we are starting */ + (sizeof (int) /* for the number of "word indices" */ + (sizeof (int) /* for each SEEK_START value */ * wi2dvf->size))); /* multiplied by the number of WI's */ /* Write the maximum "word index". */ bow_fwrite_int (wi2dvf->size, fp); /* Figure out the correct SEEK_START values for all the DVF's, set them in the DVF's data structure, and write out the DVF's SEEK_START information. */ for (wi = 0, seek_current = seek_base; wi < wi2dvf->size; wi++) { if (wi2dvf->entry[wi].dv == NULL) { /* Write an indication of a NULL document vector. */ bow_fwrite_int (-1, fp); /* Set the SEEK_START in the data structure. */ wi2dvf->entry[wi].seek_start = -1; } else { /* Write the DVF's SEEK_START info. */ bow_fwrite_int (seek_current, fp); /* Set the SEEK_START in the data structure. */ wi2dvf->entry[wi].seek_start = seek_current;
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -