?? invert10_31.c
字號:
if (Index == NULL)
{
*error = EINDEXBUILD;
printf("error is: %d\n", EINDEXBUILD);
printf("內(nèi)存不足!\n");
return NULL;
}
for(i = 0; i < (GBHTTOPNUM - GBHTBTMNUM + 1); i++)
{
Index[i] = (InvertNode *) malloc(( GBLWTOPNUM - GBLWBTMNUM + 1) * sizeof(InvertNode ));
if (Index[i] == NULL)
{
*error = EINDEXBUILD;
printf("error is: %d\n", EINDEXBUILD);
printf("內(nèi)存不足!\n");
return NULL;
}
}
return Index;
}
/*
 * EIdxBuild - allocate the table used as the English inverted index.
 *
 * The table holds 26 InvertNode entries. InvertIdxBld in this file selects an
 * entry with (lowercased first letter - LOWERA) and treats an entry whose
 * lWordNum is 0 as "nothing stored yet", so the table is handed out
 * zero-filled.
 *
 * Parameters:
 *   Index  ignored on entry; the value passed in is overwritten by the new
 *          allocation.
 *   error  receives EINDEXBUILD when the allocation fails; it is left
 *          untouched on success. May be NULL.
 *
 * Returns the newly allocated table (the caller owns it and must free it),
 * or NULL when memory is exhausted.
 */
InvertNode * EIdxBuild(InvertNode *Index, int * error)
{
    Index = (InvertNode *) calloc(26, sizeof(InvertNode));
    if (Index == NULL)
    {
        if (error != NULL)
        {
            *error = EINDEXBUILD;
        }
        printf("error is: %d\n", EINDEXBUILD);
        return NULL;
    }
    return Index;
}
/*該函數(shù)功能:建立倒排索引表,(索引號是GB碼減去176和161)
入口參數(shù):InvertNode **InvertIndex 為中文國標(biāo)碼索引待分配的空間
InvertNode *EIntIdx 為英文單詞索引
const int DocCount 為一次處理的文檔個(gè)數(shù)
const ForwardNode *fNode 為正向表所指向的指針
返回值: 為0,操作正常,否則錯(cuò)誤
*/
int InvertIdxBld(InvertNode **InvertIndex ,InvertNode *EIntIdx ,const ForwardNode *fNode, const int DocCount)
{
/*int *IndexError;*/
WordNode *pCurWordNode;
DocNode *pCurDoc;
int flag;
int i,k,m;
unsigned char uTemp[MAXWORD + 1];
InvertNode *pInvertNode;
/*IndexError = (int *) malloc (sizeof(int));
*IndexError = 0;*/
if (fNode == NULL){
return EFNODE;
}
if (DocCount < 0) {
return EFNODE;
}
/*建立倒排索引*/
for(i = 0 ; i < DocCount; i++)
{
pCurWordNode = (fNode + i)->wFWordNode;
printf("\n正在建立文檔: %s 的倒排索引\n", (fNode + i)->sFileURL );/**/
while(pCurWordNode != NULL)
{
sprintf(uTemp, pCurWordNode->sWords);
/*如果是漢字,將漢字寫入漢字倒排索引*/
if((uTemp[0] <= GBHTTOPNUM) && (uTemp[0] >= GBHTBTMNUM) && (uTemp[1] <= GBLWTOPNUM) && (uTemp[1] >= GBLWBTMNUM))
{
/*如果是未出現(xiàn)的字*/
if(InvertIndex[uTemp[0] - GBHTBTMNUM][uTemp[1] - GBLWBTMNUM].lWordNum == 0)
{
pInvertNode = &InvertIndex[uTemp[0] - GBHTBTMNUM][uTemp[1] - GBLWBTMNUM];
pInvertNode->lWordNum = 1;
/*如果是單字,在單字的位置上,新添加文檔節(jié)點(diǎn)*/
if (uTemp[2] == '\0')
{
pInvertNode->pDocNode = (DocNode *) malloc(sizeof(DocNode));
pCurDoc = pInvertNode->pDocNode;
strcpy(pInvertNode->sWords, uTemp);
pCurDoc->pNext = NULL;
pCurDoc->iPos = NULL;
}
/*如果不是單字就在后面連上一個(gè)InvertNode,并記錄*/
else
{
pInvertNode->pNextNode = (InvertNode *) malloc (sizeof(InvertNode));
pInvertNode = pInvertNode->pNextNode;
strcpy(pInvertNode->sWords, uTemp);
pInvertNode->pNextNode = NULL;
pInvertNode->lDocNum = 0;
pInvertNode->pDocNode = (DocNode *) malloc (sizeof(DocNode));
pCurDoc = pInvertNode->pDocNode;
pCurDoc->pNext = NULL;
pCurDoc->iPos = NULL;
}
}
/*如果該字已出現(xiàn)過*/
else
{
pInvertNode = &InvertIndex[uTemp[0] - GBHTBTMNUM][uTemp[1] - GBLWBTMNUM];
/*尋找該詞*/
flag = 0;
if (strcmp(uTemp,pInvertNode->sWords) == 0)
{
flag = 1;
}
else
{
while (pInvertNode->pNextNode != NULL)
{
/*如果該詞出現(xiàn)過*/
if(strcmp(uTemp,pInvertNode->sWords) == 0)
{
flag = 1;
break;
}
pInvertNode = pInvertNode->pNextNode;
}
if (strcmp(uTemp,pInvertNode->sWords) == 0)
{
flag = 1;
}
}
/*如果該詞是未出現(xiàn)的詞*/
if(flag == 0)
{
(InvertIndex[uTemp[0] - GBHTBTMNUM][uTemp[1] - GBLWBTMNUM].lWordNum)++;
pInvertNode->pNextNode = (InvertNode *)malloc(sizeof(InvertNode));
pInvertNode = pInvertNode->pNextNode;
/*這句可要可不要,因?yàn)橹辉趩巫止?jié)點(diǎn)中記錄以該字為頭的詞數(shù),不在詞節(jié)點(diǎn)中記錄個(gè)數(shù)*/
pInvertNode->lWordNum = 1;
pInvertNode->lDocNum = 0;
pInvertNode->pNextNode = NULL;
strcpy(pInvertNode->sWords, uTemp);
/*分配文檔節(jié)點(diǎn)的內(nèi)存*/
pInvertNode->pDocNode = (DocNode *) malloc(sizeof(DocNode));
pCurDoc = pInvertNode->pDocNode;
pCurDoc->pNext = NULL;
}
/*如果以前出現(xiàn)過,則找到該詞所在的pInvertNode,然后新添加文檔節(jié)點(diǎn)*/
else
{
pCurDoc = pInvertNode->pDocNode;
/*找到鏈表的尾端*/
while (pCurDoc->pNext != NULL)
{
pCurDoc = pCurDoc->pNext;
}
pCurDoc ->pNext = (DocNode *) malloc (sizeof(DocNode));
pCurDoc = pCurDoc->pNext;
pCurDoc->pNext = NULL;
}
}
/*填入文檔信息*/
pInvertNode->lDocNum ++;
strcpy(pCurDoc->sDocID,(fNode + i)->sDocID);
pCurDoc->fWeight = pCurWordNode->fWeight;
pCurDoc->iFreq = pCurWordNode->iFreq;
pCurDoc->iPos = (int *) malloc(pCurDoc->iFreq * sizeof(int));
for( k = 0; k < pCurWordNode->iFreq; k++)
{
pCurDoc->iPos[k] = pCurWordNode->iPos[k];
}
strcpy(pCurDoc->sFileURL, (fNode + i)->sFileURL);
}
/*如果正向表中是英文,則將其寫入英文倒排索引*/
if((uTemp[0] <= 'z') && (uTemp[0] >= 'a') || (uTemp[0] <= 'Z') && (uTemp[0] >= 'A'))
{
/*將所有大寫字母轉(zhuǎn)換成小寫字母*/
m = 0;
while(uTemp[m] != '\0')
{
if((uTemp[m] <= 'Z') && (uTemp[m] >= 'A'))
{
uTemp[m] = uTemp[m] + DIFLOWHIGA;
}
m++;
}
/*如果是未出現(xiàn)的字母*/
if(EIntIdx[uTemp[0] - LOWERA].lWordNum == 0)
{
pInvertNode = &EIntIdx[uTemp[0] - LOWERA];
pInvertNode->lWordNum = 1;
/*如果是單個(gè)字母,在單個(gè)字母的位置上,新添加文檔節(jié)點(diǎn)*/
if (uTemp[1] == '\0')
{
pInvertNode->pDocNode = (DocNode *) malloc(sizeof(DocNode));
pCurDoc = pInvertNode->pDocNode;
strcpy(pInvertNode->sWords, uTemp);
pCurDoc->pNext = NULL;
pCurDoc->iPos = NULL;
}
/*如果不是單個(gè)字母就在后面連上一個(gè)InvertNode,并記錄*/
else
{
pInvertNode->pNextNode = (InvertNode *) malloc (sizeof(InvertNode));
pInvertNode = pInvertNode->pNextNode;
strcpy(pInvertNode->sWords, uTemp);
pInvertNode->pNextNode = NULL;
pInvertNode->lDocNum = 0;
pInvertNode->pDocNode = (DocNode *) malloc (sizeof(DocNode));
pCurDoc = pInvertNode->pDocNode;
pCurDoc->pNext = NULL;
}
}
/*如果該字母已出現(xiàn)過*/
else
{
pInvertNode = &EIntIdx[uTemp[0] - LOWERA];
/*尋找該詞*/
flag = 0;
if (strcmp(uTemp,pInvertNode->sWords) == 0)
{
flag = 1;
}
else
{
while (pInvertNode->pNextNode != NULL)
{
/*如果該詞出現(xiàn)過*/
if(strcmp(uTemp,pInvertNode->sWords) == 0)
{
flag = 1;
break;
}
pInvertNode = pInvertNode->pNextNode;
}
if (strcmp(uTemp,pInvertNode->sWords) == 0)
{
flag = 1;
}
}
/*如果該詞是未出現(xiàn)的詞*/
if(flag == 0)
{
(EIntIdx[uTemp[0] - LOWERA].lWordNum)++;
pInvertNode->pNextNode = (InvertNode *)malloc(sizeof(InvertNode));
pInvertNode = pInvertNode->pNextNode;
/*這句可要可不要,因?yàn)橹辉趩巫止?jié)點(diǎn)中記錄以該字為頭的詞數(shù),不在詞節(jié)點(diǎn)中記錄個(gè)數(shù)*/
pInvertNode->lWordNum = 1;
pInvertNode->lDocNum = 0;
pInvertNode->pNextNode = NULL;
strcpy(pInvertNode->sWords, uTemp);
/*分配文檔節(jié)點(diǎn)的內(nèi)存*/
pInvertNode->pDocNode = (DocNode *) malloc(sizeof(DocNode));
pCurDoc = pInvertNode->pDocNode;
pCurDoc->iPos = NULL;
pCurDoc->pNext = NULL;
}
/*如果以前出現(xiàn)過,則找到該詞所在的pInvertNode,然后新添加文檔節(jié)點(diǎn)*/
else
{
pCurDoc = pInvertNode->pDocNode;
/*找到鏈表的尾端*/
while (pCurDoc->pNext != NULL)
{
pCurDoc = pCurDoc->pNext;
}
pCurDoc ->pNext = (DocNode *) malloc (sizeof(DocNode));
pCurDoc = pCurDoc->pNext;
pCurDoc->iPos = NULL;
pCurDoc->pNext = NULL;
}
}
/*填入文檔信息*/
pInvertNode->lDocNum ++;
strcpy(pCurDoc->sDocID,(fNode + i)->sDocID);
pCurDoc->fWeight = pCurWordNode->fWeight;
pCurDoc->iFreq = pCurWordNode->iFreq;
pCurDoc->iPos = (int *) malloc(pCurDoc->iFreq * sizeof(int));
for( k = 0; k < pCurWordNode->iFreq; k++)
{
pCurDoc->iPos[k] = pCurWordNode->iPos[k];
}
strcpy(pCurDoc->sFileURL, (fNode + i)->sFileURL);
}
/*處理完一個(gè)節(jié)點(diǎn),釋放正向表詞節(jié)點(diǎn)所占的內(nèi)存*/
pCurWordNode = pCurWordNode->pnext;
}
}
/*if(IndexError)
free(IndexError);*/
return 0;
}
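/* Write the contents of the given singly linked list to the specified file;
 * if the file exceeds the specified size, a new file is created.
 * The new file's name is: original file name + number
 */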
/*
 * filesize - size of an open stream in bytes.
 *
 * Seeks to the end of the stream to read the size, then restores the
 * position the stream had on entry.
 *
 * Returns the size in bytes, or -1L when stream is NULL or any ftell/fseek
 * call fails (for example on a non-seekable stream).
 */
long filesize(FILE *stream)
{
    long curpos;
    long length;

    if (stream == NULL)
    {
        return -1L;
    }

    curpos = ftell(stream);
    if (curpos < 0L)
    {
        return -1L;
    }
    if (fseek(stream, 0L, SEEK_END) != 0)
    {
        return -1L;
    }
    length = ftell(stream);

    /* Put the caller's position back even though the size is already known. */
    if (fseek(stream, curpos, SEEK_SET) != 0)
    {
        return -1L;
    }
    return length;
}
/*
 * get_file_seq_num - extract the sequence number from an index file name.
 *
 * The sequence number is the run of decimal digits that ends immediately
 * before the last '.' of the name ("12.txt" -> 12, "idx007.txt" -> 7).
 * A '.' in the very first position is not treated as an extension separator;
 * when there is no separator the digits at the end of the name are used
 * ("42" -> 42).
 *
 * Returns the sequence number, or 0 when sFileName is NULL, empty, or has no
 * digits in that position.
 */
int get_file_seq_num(char * sFileName)
{
    size_t length;
    size_t end;     /* index one past the last digit candidate */
    size_t start;   /* index of the first digit of the run */
    size_t i;

    if (sFileName == NULL)
    {
        return 0;
    }
    length = strlen(sFileName);

    /* Locate the last '.', looking only at indexes 1..length-1. */
    end = length;
    for (i = length; i > 1; i--)
    {
        if (sFileName[i - 1] == '.')
        {
            end = i - 1;
            break;
        }
    }

    /* Walk back over the digits; testing characters (rather than parsing
       suffixes) keeps zeros such as the one in "10.txt" inside the run. */
    start = end;
    while ((start > 0) && (sFileName[start - 1] >= '0') && (sFileName[start - 1] <= '9'))
    {
        start--;
    }
    if (start == end)
    {
        return 0;
    }

    /* atoi stops at the '.', so only the digit run is converted. */
    return atoi(sFileName + start);
}
/*
 * create_next_file_name - derive the name of the next index file.
 *
 * Reads the run of decimal digits that ends immediately before the last '.'
 * of sname_in (a '.' in the first position is not treated as a separator),
 * adds one, and writes "<number><extension>" to *sname_out, e.g.
 * "9.txt" -> "10.txt". Any text in front of the digits is not carried over.
 * A name with no digits in that position counts as sequence 0.
 *
 * Parameters:
 *   sname_in   current file name
 *   sname_out  address of a caller-provided buffer; it must have room for
 *              the new number, the extension and the terminator. It may
 *              alias sname_in.
 *
 * Returns 0 on success, -1 when an argument is NULL.
 */
int create_next_file_name(char *sname_in, char ** sname_out)
{
    size_t length;
    size_t dotpos;      /* index of the extension separator, or length */
    size_t numstart;    /* index of the first digit of the sequence number */
    size_t i;
    size_t numlen;
    int oldseq;
    char num[32];

    if ((sname_in == NULL) || (sname_out == NULL) || (*sname_out == NULL))
    {
        return -1;
    }
    length = strlen(sname_in);

    /* Locate the last '.', looking only at indexes 1..length-1. */
    dotpos = length;
    for (i = length; i > 1; i--)
    {
        if (sname_in[i - 1] == '.')
        {
            dotpos = i - 1;
            break;
        }
    }

    /* Walk back over the digits that precede the separator. */
    numstart = dotpos;
    while ((numstart > 0) && (sname_in[numstart - 1] >= '0') && (sname_in[numstart - 1] <= '9'))
    {
        numstart--;
    }
    /* atoi stops at the '.', so only the digit run is converted. */
    oldseq = (numstart < dotpos) ? atoi(sname_in + numstart) : 0;

    numlen = (size_t) sprintf(num, "%d", oldseq + 1);

    /* Move the extension into place first (memmove tolerates overlap when the
       output buffer is the input name), then lay the number in front of it. */
    memmove(*sname_out + numlen, sname_in + dotpos, length - dotpos + 1);
    memcpy(*sname_out, num, numlen);
    return 0;
}
/* Returns non-zero for the characters that end a whitespace-delimited token. */
static int GetWrtFlNameIsBlank(int ch)
{
    return (ch == ' ') || (ch == '\t') || (ch == '\n')
        || (ch == '\r') || (ch == '\v') || (ch == '\f');
}

/*
 * GetWrtFlName - find out which index file was being written last time.
 *
 * While the inverted index is built, output moves on to a new file once the
 * current one grows beyond a size limit. The name of the file in use (which
 * may not have reached the limit yet) is kept in "$curfile.txt" inside the
 * index directory; this function reads it back.
 *
 * Parameters:
 *   sCurDir  the current index directory
 *
 * Returns a heap buffer of MAXPATHLEN bytes holding the file name; the
 * caller must free it. The name is "1.txt" when "$curfile.txt" cannot be
 * opened (first build) or when its first token does not look like a file
 * name. Returns NULL when sCurDir is NULL or memory runs out.
 */
char* GetWrtFlName(const char *sCurDir)
{
    static const char sStateFile[] = "\\$curfile.txt";
    char *sDir;
    FILE *stream;
    char *sFileName;
    size_t len, i;
    int count;
    int ch;

    if (sCurDir == NULL)
    {
        return NULL;
    }
    sFileName = (char *) malloc(sizeof(char) * MAXPATHLEN);
    if (sFileName == NULL)
    {
        return NULL;
    }
    /* Size the path buffer from its parts so it cannot overflow. */
    sDir = (char *) malloc(strlen(sCurDir) + sizeof sStateFile);
    if (sDir == NULL)
    {
        free(sFileName);
        return NULL;
    }
    strcpy(sDir, sCurDir);
    strcat(sDir, sStateFile);

    stream = fopen(sDir, "r");
    free(sDir);
    if (stream == NULL)
    {
        /* First build: index files start at 1.txt. */
        strcpy(sFileName, "1.txt");
        return sFileName;
    }

    /* Read the first whitespace-delimited token, never writing past the
       buffer; an empty file yields an empty token. */
    do
    {
        ch = fgetc(stream);
    } while (GetWrtFlNameIsBlank(ch));
    len = 0;
    while ((ch != EOF) && !GetWrtFlNameIsBlank(ch) && (len < (size_t) MAXPATHLEN - 1))
    {
        sFileName[len++] = (char) ch;
        ch = fgetc(stream);
    }
    sFileName[len] = '\0';
    fclose(stream);

    /* Sanity check: exactly one '.' after the first character. A dot in the
       first position does not count, so a bare extension such as ".txt" is
       rejected along with empty or multi-dot names. */
    count = 0;
    for (i = 1; i < len; i++)
    {
        if (sFileName[i] == '.')
        {
            count++;
        }
    }
    if (count != 1)
    {
        strcpy(sFileName, "1.txt");
    }
    return sFileName;
}
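/* Purpose: while the inverted index is being built, output switches to another
   file once the index file grows beyond a certain size.
   This function records the name of the file currently being written;
   the name is stored in $curfile.txt.
   Parameters: sFileName is the current file name, e.g. 1.txt, 2.txt, ...
   sCurDir is the current index directory, e.g. e:\invt\半
   Returns: 0 on success
*/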
int WrtFlName(const char *sFileName, const char *sCurDir)
{
char *sDir;
FILE *stream;
sDir = (char *) malloc(sizeof(char) * MAXPATHLEN);
strcpy(sDir, sCurDir);
strcat(sDir, "\\");
strcat(sDir, "$curfile.txt");
if ((stream = fopen(sDir, "w+")) == NULL)
{
?? 快捷鍵說明
復(fù)制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -