?? wi2dvf.c

?? 貝葉斯學習算法分類文本。基于樸素貝葉斯分類器的文本分類的通用算法
?? C
?? 第 1 頁 / 共 2 頁
字號:
上一頁 12
	  /* Add the number of bytes it will take to write the	     WI'th "document vector" */	  seek_current += bow_dv_write_size (wi2dvf->entry[wi].dv);	}    }  /* We have now finished writing the DVF seek information; we should      be at the position we calculated earlier for SEEK_BASE. */  assert (ftell (fp) == seek_base);  /* Now write the actual "document vector" information. */  for (wi = 0; wi < wi2dvf->size; wi++)    {      if (wi2dvf->entry[wi].dv != NULL)	{	  /* Make sure we are at the same place in the file that	     we said we'd be. */	  assert (ftell (fp) == wi2dvf->entry[wi].seek_start);	  bow_dv_write (wi2dvf->entry[wi].dv, fp);	}    }}/* Write WI2DVF to a file, in a machine-independent format.  This   is the format expected by bow_wi2dvf_new_from_file(). */voidbow_wi2dvf_write_data_file (bow_wi2dvf *wi2dvf, const char *filename){  FILE *fp;  if (!(fp = fopen (filename, "w")))    bow_error ("Couldn't open file `%s' for writing.", filename);  bow_wi2dvf_write (wi2dvf, fp);  fclose (fp);}/* Create a `wi2dvf' by reading data from file-pointer FP.  This   doesn't actually read in all the "document vectors"; it only reads   in the DVF information, and lazily loads the actually "document   vectors". */bow_wi2dvf *bow_wi2dvf_new_from_data_fp (FILE *fp){  int size;  bow_wi2dvf *ret;  int wi;  /* Read the number of "word indices" used as keys in the new WI2DVF. */  bow_fread_int (&size, fp);  /* Create a new WI2DVF of that size.*/  ret = bow_wi2dvf_new (size);  ret->fp = fp;  /* Read all the DVF information, but not the actual "document vectors";     We'll do that later in bow_wi2dvf_dv(). */  for (wi = 0; wi < size; wi++)    {      bow_fread_int (&(ret->entry[wi].seek_start), fp);      if (ret->entry[wi].seek_start != -1)	(ret->num_words)++;      ret->entry[wi].dv = NULL;    }  return ret;}/* Create a `wi2dvf' by reading data from a file.  This doesn't actually    read in all the "document vectors"; it only reads in the DVF    information, and lazily loads the actually "document vectors". */bow_wi2dvf *bow_wi2dvf_new_from_data_file (const char *filename){  FILE *fp;  bow_wi2dvf *ret;  if (!(fp = fopen (filename, "r")))    bow_error ("Couldn't open file `%s' for reading.", filename);  ret = bow_wi2dvf_new_from_data_fp (fp);  /* Don't close the FP because it will still be needed for      reading the "document vectors", DV's. */  return ret;}/* Free the memory held by the map WI2DVF. */voidbow_wi2dvf_free (bow_wi2dvf *wi2dvf){  int i;  if (wi2dvf->fp)    fclose (wi2dvf->fp);  for (i = 0; i < wi2dvf->size; i++)    {      if (wi2dvf->entry[i].dv)	bow_dv_free (wi2dvf->entry[i].dv);    }  bow_free (wi2dvf);}/* Return the "document vector" corresponding to "word index" WI.  This   function will read the "document vector" out of the file passed to   bow_wi2dvf_new_from_file() if is hasn't been read already. */bow_dv *bow_wi2dvf_dv (bow_wi2dvf *wi2dvf, int wi){  /* If the word-index is higher than anything we know about,     return NULL.  This could legitimately happen if the query     document has vocabulary that wasn't in the training data. */  if (wi >= wi2dvf->size)    return NULL;  /* If the "document vector" is available (it has already been read     in, it is non-NULL), and it is not hidden (it's SEEK_START is     greater than or equal to -1) then simply return it.  Note that     newly created WI2DVF's that haven't been saved (like those for     VPC_BARREL's) with have non-NULL dv's and SEEK_START's of -1. */  if (wi2dvf->entry[wi].dv && wi2dvf->entry[wi].seek_start >= -1)    {      assert (wi2dvf->entry[wi].dv->idf == wi2dvf->entry[wi].dv->idf);      return wi2dvf->entry[wi].dv;    }  /* If the SEEK_START position of WI'th DVF is -1, then this was an     empty "document vector", so return NULL.  If the SEEK_START     position of the WI'th DVF is less than -1, then this document     vector was hidden by BOW_WI2DVF_HIDE_WI(), so return NULL. */  if (wi2dvf->entry[wi].seek_start <= -1)    return NULL;  /* If we want to read it in, but if this WI2DVF isn't backed by a     data file (for example, it's being built from a directory of     text files), then just return NULL. */  if (wi2dvf->fp == NULL)    return NULL;  /* Read in the document vector. */  assert (wi2dvf->entry[wi].seek_start > 2);  fseek (wi2dvf->fp, wi2dvf->entry[wi].seek_start, SEEK_SET);  wi2dvf->entry[wi].dv = bow_dv_new_from_data_fp (wi2dvf->fp);  /* Check for NaN. */  assert (wi2dvf->entry[wi].dv->idf == wi2dvf->entry[wi].dv->idf);  assert (wi == wi2dvf->size - 1	  || wi2dvf->entry[wi+1].seek_start == -1	  || ftell (wi2dvf->fp) == wi2dvf->entry[wi+1].seek_start);  /* Return what we just read. */  return wi2dvf->entry[wi].dv;}/* Compare two maps, and return 0 if they are equal.  This function was   written for debugging. */intbow_wi2dvf_compare (bow_wi2dvf *map1, bow_wi2dvf *map2){  int max_wi, wi;  bow_dv *dv1, *dv2;  max_wi = bow_num_words ();  /* (map1->size > map2->size) ? map1->size : map2->size; */  /* Step through all the "word indices" in each of the maps. */  for (wi = 0; wi < max_wi; wi++)    {      dv1 = bow_wi2dvf_dv (map1, wi);      dv2 = bow_wi2dvf_dv (map2, wi);      if (dv1 == NULL || dv2 == NULL)	{	  if (!(dv1 == NULL && dv2 == NULL))	    {	      bow_verbosify (bow_progress, "%s: Differ by NULL at wi %d\n",			     __PRETTY_FUNCTION__, wi);	      return 1;	    }	}      else	{	  /* We have two non-NULL "document vectors" */	  int max_dv_i, dv_i;	  max_dv_i = (dv1->length > dv2->length) ? dv1->length : dv2->length;	  for (dv_i = 0; dv_i < max_dv_i; dv_i++)	    {	      if (dv1->entry[dv_i].di != dv2->entry[dv_i].di		  || dv1->entry[dv_i].count != dv2->entry[dv_i].count)		{		  bow_verbosify (bow_progress, 				 "%s: Differ by entry at wi %d\n",				 __PRETTY_FUNCTION__, wi);		  return 2;		}	    }	}    }  return 0;}/* Print statistics about the WI2DVF map to STDOUT. */voidbow_wi2dvf_print_stats (bow_wi2dvf *map){  int wi, wi_max;  bow_dv *dv;  /* stats on "document vector" length */  int dvl_count, dvl_max, dvl_ave, dvl_min;  int dvl_max_count, dvl_min_count;  int dvl_max_wi, dvl_min_wi;  /* stats on "document vector" count */  /* int dvc_max, dvc_ave, dvc_min; */  /* stats on used/unused memory */  int de_used_count, de_unused_count;  wi_max = bow_num_words ();  printf ("%8d libbow's num words\n", wi_max);  printf ("%8d num words in wi2dvf\n", map->num_words);  /* printf ("%8d unique documents\n", bow_num_docnames ()); */  /* Get stats on "document vector" length. */  dv = bow_wi2dvf_dv (map, 0);  dvl_max = dvl_ave = dvl_min = dv->length;  dvl_max_count = dvl_min_count = 1;  dvl_max_wi = dvl_min_wi = 0;  dvl_count = 0;  de_used_count = de_unused_count = 0;  for (wi = 1; wi < wi_max; wi++)    {      dv = bow_wi2dvf_dv (map, wi);      if (dv)	{	  dvl_count++;	  dvl_ave += dv->length;	  if (dv->length > dvl_max)	    {	      dvl_max = dv->length;	      dvl_max_wi = wi;	      dvl_max_count = 1;	    }	  else if (dv->length > dvl_max)	    dvl_max_count++;	  if (dv->length < dvl_min)	    {	      dvl_min = dv->length;	      dvl_min_wi = wi;	      dvl_min_count = 1;	    }	  else if (dv->length > dvl_max)	    dvl_min_count++;	  de_used_count += dv->length;	  de_unused_count += dv->size - dv->length;	  assert (dv->size - dv->length >= 0);	}    }  printf ("%8d minimum document vector length (eg word=`%s', %d others)\n",	  dvl_min, bow_int2word (dvl_min_wi), dvl_min_count);  printf ("%8.1f average document vector length\n",	  ((double)dvl_ave)/dvl_count);  printf ("%8d maximum document vector length (eg word=`%s', %d others)\n",	  dvl_max, bow_int2word (dvl_max_wi), dvl_max_count);  printf ("%8d document vector entries used\n", 	  de_used_count);  printf ("%8d document vector entries allocated but unused\n", 	  de_unused_count);  printf ("%8.1f average unused document vector entries\n", 	  ((double)de_unused_count)/dvl_count);}
上一頁 12
?? 文件大小 238 K
?? 上傳用戶 xulei147
?? 所屬分類數學計算
??? 相關標簽

#貝葉斯 #學習算法 #分類 #分類器
?? 快捷鍵說明

復制代碼 Ctrl + C
搜索代碼 Ctrl + F
全屏模式 F11
切換主題 Ctrl + Shift + D
顯示快捷鍵 ?
增大字號 Ctrl + =
減小字號 Ctrl + -
亚洲欧美第一页_禁久久精品乱码_粉嫩av一区二区三区免费野_久草精品视频

?? wi2dvf.c

?? 快捷鍵說明