?? pv.c
字號:
/* "Position vector", a (compressed) list of word positions in documents *//* Copyright (C) 1998 Andrew McCallum Written by: Andrew Kachites McCallum <mccallum@cs.cmu.edu> This file is part of the Bag-Of-Words Library, `libbow'. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation, version 2. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */#define _FILE_OFFSET_BITS 64#include <bow/libbow.h>#include <bow/archer.h>#define PV_DEBUG 1/* The total amount of memory consumed by PVM's */int bow_pvm_total_bytes = 0;/* The maximum memory we will allow PVM's to take before we flush them to disk. Currently set to 128M */int bow_pvm_max_total_bytes = 128 * 1024 * 1024;/* Allocate and return a new PVM that can hold SIZE bytes */bow_pvm *bow_pvm_new (int size){ bow_pvm *ret = bow_malloc (sizeof (bow_pvm) + size); ret->size = size; ret->read_end = 0; ret->write_end = 0; bow_pvm_total_bytes += sizeof (bow_pvm) + size; return ret;}/* Increase the capacity of PVM, growing by doubling size until we get to 128k, then just grow by 128k increments. 
*/voidbow_pvm_grow (bow_pvm **pvm){ if ((*pvm)->size < 64 * 1024) { (*pvm)->size *= 2; bow_pvm_total_bytes += (*pvm)->size; } else { (*pvm)->size += 64 * 1024; bow_pvm_total_bytes += 64 * 1024; } *pvm = bow_realloc (*pvm, sizeof (bow_pvm) + (*pvm)->size);}/* Free the memory associated with the PVM */voidbow_pvm_free (bow_pvm *pvm){ bow_free (pvm);}/* Put the PVM's reader-pointer back to the beginning */static inline voidbow_pvm_rewind (bow_pvm *pvm){ pvm->read_end = 0;}/* PV functions *//* The first four bytes of a segment are an int that indicate how many bytes are allocated in this segment. The last four bytes of a segment are an int that indicates the seek location of the next segment. The read_segment_bytes_remaining does not include the size of the two int's. *//* Always enough for one "document index"/"word index" pair: 5 bytes == 6+4*7 == 34 bits for di, likewise for pi. */#define bow_pv_max_sizeof_di_pi (2 * 5)static int bow_pv_sizeof_first_segment = 2 * bow_pv_max_sizeof_di_pi;/* Fill in PV with the correct initial values. */voidbow_pv_init (bow_pv *pv, FILE *fp){ //pv->byte_count = 0; pv->word_count = 0; //pv->document_count = 0; pv->pvm = NULL; pv->seek_start = 0; //-1 pv->read_seek_end = 0; pv->read_segment_bytes_remaining = -1; pv->read_last_di = -1; pv->read_last_pi = -1; pv->write_last_di = -1; pv->write_last_pi = -1; pv->write_seek_last_tailer = 0; /* This value must match READ_SEEK_END */}/* Write this PV's PVM to disk, and free the PVM. */voidbow_pv_flush (bow_pv *pv, FILE *fp){ off_t seek_new_segment; off_t seek_new_tailer; if (pv->pvm == NULL || pv->pvm->write_end == 0) return; /* Seek to the end of the file, which is the position at which this segment of the PV will begin. */ fseeko (fp, 0, SEEK_END); seek_new_segment = ftello (fp); /* If none of this PV has ever been written to disk, remember this position as the start position so that we can rewind there later. 
*/ if (pv->seek_start == 0) //-1 { pv->seek_start = seek_new_segment; pv->read_seek_end = seek_new_segment; pv->read_segment_bytes_remaining = pv->pvm->write_end; } /* Write the "header", which is the number of contents data bytes in this segment. */ bow_fwrite_int (pv->pvm->write_end, fp); /* Write the contents data */ fwrite (pv->pvm->contents, sizeof (unsigned char), pv->pvm->write_end, fp); /* Write (a temporary value for) the "tailer". Later we will put here the seek position of the next pv segment on disk. */ /* xxx Don't actually need a ftello() here. Do the math instead. */ seek_new_tailer = ftello (fp); bow_fwrite_off_t (0, fp); //-1 /* If this is not the first time this PV has been flushed, then the "tailer" of the previous flushed segment, and write the seek position of this segment there. */ if (pv->write_seek_last_tailer != 0) { fseeko (fp, pv->write_seek_last_tailer, SEEK_SET); bow_fwrite_off_t (seek_new_segment, fp); } pv->write_seek_last_tailer = seek_new_tailer; bow_pvm_total_bytes -= sizeof (bow_pvm) + pv->pvm->size; bow_pvm_free (pv->pvm); pv->pvm = NULL;}/* Write to PVM the unsigned integer I, marked with the special flag saying if it is a DI or a PI, (as indicated by IS_DI). Assumes there is enough space there in this PVM to write the info. Returns the number of bytes written. 
*/
/* Encoding: the first byte carries the IS_DI flag, an "is_more" flag
   and the low 6 bits of I.  Each following byte carries an "is_more"
   flag and the next 7 bits of I, least-significant group first.  A
   32-bit value therefore needs at most 5 bytes (6 + 4*7 = 34 bits). */
static inline int
bow_pvm_write_unsigned_int (bow_pvm *pvm, unsigned int i, int is_di)
{
  bow_pe pe;
  int byte_count = 1;		/* Count already the last byte */

  pe.bits.is_di = is_di ? 1 : 0;

  if (i <= 0x3f)		/* binary = 00111111 */
    {
      /* The value fits in the first byte; it is the only one. */
      pe.bits.is_more = 0;
      pe.bits.index = i;
      pvm->contents[pvm->write_end++] = pe.byte;
      return byte_count;
    }

  /* Write the first byte: the low 6 bits, flagged as "more follows". */
  pe.bits.is_more = 1;
  pe.bits.index = i & 0x3f;
  pvm->contents[pvm->write_end++] = pe.byte;
  byte_count++;
  i >>= 6;

  /* Write every 7-bit group except the last one. */
  while (i > 0x7f)		/* binary = 01111111 */
    {
      pe.bits_more.is_more = 1;
      pe.bits_more.index = i & 0x7f;
      pvm->contents[pvm->write_end++] = pe.byte;
      byte_count++;
      i >>= 7;
    }

  /* Write the last byte, already counted in BYTE_COUNT's initial 1. */
  pe.bits_more.is_more = 0;
  pe.bits_more.index = i;
  pvm->contents[pvm->write_end++] = pe.byte;
  return byte_count;
}

/* Read an unsigned integer into I, and indicate whether it is a
   "document index" or a "position index" by the value of IS_DI.
   Reads from PVM at its read position and advances that position.
   Returns the number of bytes read.

   The value is accumulated in an unsigned int: shifting the 7-bit
   groups inside a signed int is undefined for values that use the top
   bit.  Groups that would land beyond the width of unsigned int
   (which cannot happen with data produced by
   bow_pvm_write_unsigned_int) are consumed but ignored. */
static inline int
bow_pvm_read_unsigned_int (bow_pvm *pvm, unsigned int *i, int *is_di)
{
  bow_pe pe;
  unsigned int index;
  unsigned int shift = 6;
  int byte_count = 1;

  pe.byte = pvm->contents[pvm->read_end++];
  *is_di = pe.bits.is_di ? 1 : 0;
  index = pe.bits.index;

  /* This test relies on pe.bits.is_more == pe.bits_more.is_more */
  while (pe.bits.is_more)
    {
      pe.byte = pvm->contents[pvm->read_end++];
      byte_count++;
      if (shift < sizeof (index) * 8)
	{
	  index |= (unsigned int) pe.bits_more.index << shift;
	  shift += 7;
	}
    }

  *i = index;
  return byte_count;
}

/* Read an unsigned integer into I, and indicate whether it is a
   "document index" or a "position index" by the value of IS_DI.
   Assumes that FP is already seek'ed to the correct position.
   Returns the number of bytes read.
*/static inline intbow_pv_read_unsigned_int (unsigned int *i, int *is_di, FILE *fp){ bow_pe pe; int index; int shift = 6; int byte_count = 1; pe.byte = fgetc (fp); if (pe.bits.is_di) *is_di = 1; else *is_di = 0; index = pe.bits.index; while (pe.bits.is_more) /* The above test relies on pe.bits.is_more == pe.bits_more.is_more */ { pe.byte = fgetc (fp); byte_count++; index |= pe.bits_more.index << shift; shift += 7; } *i = index; return byte_count;}#define PV_WRITE_SIZE_INT(N) \(((N) < (1 << (6+1))) \ ? 1 \ : (((N) < (1 << (6+7+1))) \ ? 2 \ : (((N) < (1 << (6+7+7+1))) \ ? 3 \ : (((N) < (1 << (6+7+7+7+1))) \ ? 4 \ : 5))))static inline intbow_pv_write_size_di_pi (bow_pv *pv, int di, int pi){
?? 快捷鍵說明
複製代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號(hào)
Ctrl + =
減小字號(hào)
Ctrl + -