?? build_in.c
字號:
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. *//* ./glimpse/index/build_in.c *//* -------------------------------------------------------------- build_index(): build an index list from a set of files. INPUT: a set of file names char **name_list[]; a partition table int p_table[]; OUTPUT: an index list; char *index_list; the index list is a char string as follows: each entry of the index list contains two parts: name and indices, where name is an ascii character string, and indices is a list of short integer. (unsigned char) We use newline as a 'record delimiter' (a 'record is logically a word associated with its indices), and WORD_END_MARK to separate a word from its list of indices (s.t. fscanf %s works). Since we restrict the max number of partitions to be 255. a byte is enough to represent the index value. Note that there cannot be a partition #ed '\n'. An example index list: (in logical view) this 12 19 \n is 9 17 12 18 19 \n an 7 12 \n example 16 \n-----------------------------------------------------------------------*/#include "glimpse.h"#define debugt#define BINARY 1/* #define SW_DEBUG the original sw output of index set *//* This flag must always be defined: it is used only in build_in.c *//* #define UDI_DEBUG the original outputs of each indexed file *//* Some variables used throughout */#if BG_DEBUGextern FILE *LOGFILE; /* file descriptor for LOG output */#endif /*BG_DEBUG*/extern FILE *STATFILE; /* file descriptor for statistical data about indexed files */extern FILE *MESSAGEFILE; /* file descriptor for important messages meant for the user */extern char INDEX_DIR[MAX_LINE_LEN];extern char sync_path[MAX_LINE_LEN];extern struct stat istbuf;extern struct stat excstbuf;extern struct stat incstbuf;void insert_h();void insert_index();extern int ICurrentFileOffset;extern int NextICurrentFileOffset;/* Some options used throughout */extern int OneFilePerBlock;extern int IndexNumber;extern int CountWords;extern int 
StructuredIndex;extern int InterpretSpecial;extern int total_size;extern int MAXWORDSPERFILE;extern int NUMERICWORDPERCENT;extern int AddToIndex;extern int DeleteFromIndex;extern int FastIndex;extern int BuildDictionary;extern int BuildDictionaryExisting;extern int CompressAfterBuild;extern int IncludeHigherPriority;extern int FilenamesOnStdin;extern int UseFilters;extern int ByteLevelIndex;extern int RecordLevelIndex;extern int StoreByteOffset;extern int rdelim_len;extern char rdelim[MAX_LINE_LEN];extern char old_rdelim[MAX_LINE_LEN];/* int IndexUnderscore; */extern int IndexableFile;extern int MAX_INDEX_PERCENT;extern int MAX_PER_MB;extern int I_THRESHOLD;extern int usemalloc;extern int BigHashTable;extern int AddedMaxWordsMessage;extern int AddedMixedWordsMessage;extern int icount; /* count the number of my_malloc for indices structure */extern int hash_icount; /* to see how much was added to the current hash table */extern int save_icount; /* to see how much was added to the index by the current file */extern int numeric_icount; /* to see how many numeric words were there in the current file */extern int num_filter;extern int filter_len[MAX_FILTER];extern CHAR *filter[MAX_FILTER];extern CHAR *filter_command[MAX_FILTER];extern int REAL_PARTITION, REAL_INDEX_BUF, MAX_ALL_INDEX, FILEMASK_SIZE;extern int mask_int[32];struct indices *deletedlist = NULL;char **name_list[MAXNUM_INDIRECT];unsigned int *disable_list = NULL;int *size_list[MAXNUM_INDIRECT]; /* temporary area to store size of each file */extern int p_table[MAX_PARTITION];int p_size_list[MAX_PARTITION]; /* sum of the sizes of the files in each partition */int part_num; /* number of partitions */extern int memory_usage;/* borrowd from getword.c */extern int PrintedLongWordWarning;extern int indexable_char[256];extern char *getword();extern int file_num;extern int old_file_num;extern int attr_num;extern int bp; /* buffer pointer */extern unsigned char word[MAX_WORD_BUF];extern int FirstTraverse1;extern struct 
indices *ip;extern int HashTableSize;struct token **hash_table; /*[MAX_64K_HASH];*/build_index(){ int i; if (AddToIndex || FastIndex) { FirstTraverse1 = OFF; } if ((total_size < LIMIT_64K_HASH*1024*1024) || !BigHashTable) { hash_table = (struct token **)my_malloc(sizeof(struct token *) * MAX_64K_HASH); HashTableSize = MAX_64K_HASH; } else { hash_table = (struct token **)my_malloc(sizeof(struct token *) * MAX_256K_HASH); HashTableSize = MAX_256K_HASH; } build_hash(); /* traverse1(); ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ removed on oct/8/96, bgopal, to see if crazysegvs disappear on lec */ return;}/* ----------------------------------------------------------------------traverse()function: traverse the hash list of indices = a hash list is a array oflinked list, where every node in a linked list contains a word whosehash_value is the same.While traversing the hash list, traverse() output a stream of index list.It also frees the memory used in hash_table.------------------------------------------------------------------------*/#define CRAZYSEGV 0traverse(){ int numseencount = 0; int numelements; int numonline; int i, j, attribute; struct token *tp, *tp_old; struct indices *ip, *ip_old;#if !CRAZYSEGV FILE *f_out;#else unsigned char onechar[4]; unsigned char onestring[MAX_LINE_LEN]; int f_out;#endif char s[MAX_LINE_LEN]; char *word; int x = -1, y=0, diff, temp, even_words=1; /* 0 is an even number */ int fputcerr; /* added by dgh 5-8-96 */#ifdef SW_DEBUG printf("in traverse()\n");#endif sprintf(s, "%s/%s", INDEX_DIR, I2);#if !CRAZYSEGV if ((f_out = fopen(s, "w")) == NULL) {#else if ((f_out = open(s, O_WRONLY|O_CREAT|O_TRUNC, 0600)) == -1) {#endif fprintf(stderr, "Cannot open %s for writing\n", s); exit(2); } for(i=0; i<HashTableSize; i++) { if(hash_table[i] == NULL) continue; tp = hash_table[i]; tp_old = tp; while(tp != NULL) { /* traverse the token list */ word = tp->word; while(*word != '\0') { /* copy the word to output */#if !CRAZYSEGV fputcerr=fputc(*word++, f_out);/* 
change from putc to fputc */ /* by dgh, 8-5-96 */#else write(f_out, word, 1); word++;#endif } /* Look for stop lists */ if (OneFilePerBlock && !ByteLevelIndex && (file_num > MaxNum8bPartition) && (tp->totalcount > (file_num * MAX_INDEX_PERCENT / 100))) {#if !CRAZYSEGV putc(ALL_INDEX_MARK, f_out);#else onechar[0] = ALL_INDEX_MARK; write(f_out, onechar, 1);#endif if (StructuredIndex) { /* force big-endian as usual */ attribute = encode16b(tp->attribute);#if !CRAZYSEGV putc((attribute&0x0000ff00)>>8, f_out); putc((attribute&0x000000ff), f_out);#else onechar[0] = (attribute&0x0000ff00)>>8; onechar[1] = (attribute&0x000000ff); write(f_out, onechar, 2);#endif }#if !CRAZYSEGV putc(DONT_CONFUSE_SORT, f_out);#else onechar[0] = DONT_CONFUSE_SORT; write(f_out, onechar, 1);#endif goto next_token; } else if (ByteLevelIndex && (tp->totalcount > ( (((total_size>>20) > 0) && ((total_size>>20)*MAX_PER_MB < MAX_ALL_INDEX)) ? ((total_size>>20) * MAX_PER_MB) : MAX_ALL_INDEX) )) {#if !CRAZYSEGV putc(ALL_INDEX_MARK, f_out);#else onechar[0] = ALL_INDEX_MARK; write(f_out, onechar, 1);#endif if (StructuredIndex) { /* force big-endian as usual */ attribute = encode16b(tp->attribute);#if !CRAZYSEGV putc((attribute&0x0000ff00)>>8, f_out); putc((attribute&0x000000ff), f_out);#else onechar[0] = (attribute&0x0000ff00)>>8; onechar[1] = (attribute&0x000000ff); write(f_out, onechar, 2);#endif }#if !CRAZYSEGV putc(DONT_CONFUSE_SORT, f_out);#else onechar[0] = DONT_CONFUSE_SORT; write(f_out, onechar, 2);#endif goto next_token; }#if !CRAZYSEGV putc(WORD_END_MARK, f_out);#else onechar[0] = WORD_END_MARK; write(f_out, onechar, 1);#endif if (StructuredIndex) { /* force big-endian as usual */ attribute = encode16b(tp->attribute);#if !CRAZYSEGV putc((attribute&0x0000ff00)>>8, f_out); putc((attribute&0x000000ff), f_out);#else onechar[0] = (attribute&0x0000ff00)>>8; onechar[1] = (attribute&0x000000ff); write(f_out, onechar, 2);#endif } numonline = 0; x = -1; y = 0; even_words = 1; ip = tp->ip; /* traverse the 
indices list */ ip_old = ip; numelements = 0; while(ip != NULL) { numelements ++; if (CountWords) {#if !CRAZYSEGV fprintf(f_out, "%d", ip->offset[0]);#else sprintf(onestring, "%d", ip->offset[0]); write(f_out, onestring, strlen(onestring));#endif } else { if (ByteLevelIndex) { for (j=0; j < INDEX_SET_SIZE; j++) { if (ip->index[j] == INDEX_ELEM_FREE) continue; if ((ip->offset[j] <= y) && (y > 0) && (x == ip->index[j])) { /* consecutive offsets not increasing in same file! */ fprintf(stderr, "ignoring (%d, %d) > (%d, %d)\n", x, y, ip->index[j], ip->offset[j]); continue; /* error! */ } if (numonline >= MAX_PER_LINE) { /* terminate current line since it is too late to put ALL_INDEX_MARK now ... Unfortunate since sort is screwedup */#if !CRAZYSEGV putc('\n', f_out);#else onechar[0] = '\n'; write(f_out, onechar, 1);#endif#if 0 putc('\n', stdout);#endif /*0*/ word = tp->word; while(*word != '\0') { /* copy the word to output */#if !CRAZYSEGV putc(*word++, f_out);#else write(f_out, word, 1); word ++;#endif }#if !CRAZYSEGV putc(WORD_END_MARK, f_out);#else onechar[0] = WORD_END_MARK; write(f_out, onechar, 1);#endif if (StructuredIndex) { /* force big-endian as usual */ attribute = encode16b(tp->attribute);#if !CRAZYSEGV putc((attribute&0x0000ff00)>>8, f_out); putc((attribute&0x000000ff), f_out);#else onechar[0] = (attribute&0x0000ff00)>>8; onechar[1] = (attribute&0x000000ff); write(f_out, onechar, 2);#endif } numonline = 0; x = -1; /* to force code below to output it as if it is a fresh file */ y = 0; /* must output first offset as is, rather than difference */ } if (x != ip->index[j]) { if (x != -1) { temp = encode8b(0);#if !CRAZYSEGV putc(temp, f_out); /* can never ordinarily happen since ICurrentFileOffset is always ++d => delimiter (unless RecordLevelIndex) */#else onechar[0] = temp; write(f_out, onechar, 1);#endif } if (file_num <= MaxNum8bPartition) { x = encode8b(ip->index[j]);#if !CRAZYSEGV putc(x&0x000000ff, f_out);#else onechar[0] = x&0x000000ff; write(f_out, 
onechar, 1);#endif } else if (file_num <= MaxNum16bPartition) { x = encode16b(ip->index[j]);#if !CRAZYSEGV putc((x&0x0000ff00)>>8, f_out); putc(x&0x000000ff, f_out);#else onechar[0] = (x&0x0000ff00)>>8; onechar[1] = x&0x000000ff; write(f_out, onechar, 2);#endif } else { x = encode24b(ip->index[j]);#if !CRAZYSEGV putc((x&0x00ff0000)>>16, f_out); putc((x&0x0000ff00)>>8, f_out); putc(x&0x000000ff, f_out);#else onechar[0] = (x&0x00ff0000)>>16;
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -