?? invert10_31.c
字號(hào):
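/* This program experiments with building and updating an inverted index;
   handling of English-word indexing has been added.
   A $curfile.txt file is stored in the index directory and records the index
   file that is currently being written.
   2006-10-04: re-audited the program's memory usage, mainly to fix possible
   memory leaks in the inverted index; unrelated functions were also removed.
   2006-10-08: wrote the inverted-index update routine; idx.txt is written once
   after every several documents have been generated.
   In $curfile.txt the first line records the name of the inverted file that is
   currently being written and the second line records the name of the file
   that has already been updated; the next update starts from that file.
   Based on these requirements the program is modified as follows:
   1. Remove the writes to idx.txt while building the inverted index; idx.txt
      is generated only during an update.
   2. Write a standalone update function. It reorganizes the contents of every
      file in the current directory that is greater than the one recorded in
      $update.txt (excluding idx.txt and $curfile.txt); reorganizing mainly
      means grouping identical words together.
   idx.txt is not written while the inverted index is being built.
*/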
#include "stdio.h"
#include "seng.h"
#include "string.h"
#include "math.h"
#include "malloc.h"
#include "stdlib.h"
#include "assert.h"
#include "direct.h"
#define MALCSIZE 100 /*一次分配的內(nèi)存大小*/
#define RELCSIZE 100 /*當(dāng)一次分配的內(nèi)存不夠時(shí),二次分配時(shí)的加數(shù)*/
#define SHORTSIZE 20 /*一個(gè)詞的最大詞長(zhǎng)10*/
#define INDEXNUMBER 6768 /*簡(jiǎn)體中文字的個(gè)數(shù)*/
#define GBLWBTMNUM 161 /*簡(jiǎn)體中文國(guó)標(biāo)碼低位最小值*/
#define GBLWTOPNUM 254 /*簡(jiǎn)體中文國(guó)標(biāo)碼低位最大值*/
#define GBHTBTMNUM 176 /*簡(jiǎn)體中文國(guó)標(biāo)碼高位最小值*/
#define GBHTTOPNUM 247 /*簡(jiǎn)體中文國(guó)標(biāo)碼高位最大值*/
#define MAXPATHL 50 /*最大路徑*/
#define MAXPATH 50 /*最大路徑*/
#define MAXWORD 80 /*最長(zhǎng)的詞長(zhǎng)為40,一篇文檔中最多出現(xiàn)的相同的字頭的詞的個(gè)數(shù)*/
#define MAXWORDONE 50 /*以某一個(gè)字開(kāi)頭的可能有的詞數(shù)*/
#define MAXLINEFILE 3 /*倒排索引文件的最大行數(shù)*/
/* #define MAXWORDLEN 50 最大詞長(zhǎng)*/
#define MAXNUMBER 50 /*最大文檔數(shù)*/
#define MAXFILENAME 20 /*最大文檔數(shù)*/
#define MAXPOS 400 /*一個(gè)詞在一篇文章中最多出現(xiàn)的次數(shù)*/
#define LOWERA 97 /*字母a所對(duì)應(yīng)的的ASCII碼*/
#define DIFLOWHIGA 32 /*大寫字母和小寫字母ASCII碼的差值*/
#define MAXLINELEN 1000 /*倒排文檔中每行最多出現(xiàn)的字符個(gè)數(shù)*/
#define MAXBUFFER 2000 /*最大緩存區(qū), 要注意大小*/
int realloccount = 10;
/*將當(dāng)前的詞放入正向表中,sWords詞內(nèi)容, iPos詞在原文件中的位置,正常返回0*/
int InstWd2FwList(ForwardNode *pfNode, char *sWords, char *sFileURL, int iPos, char type)
{
WordNode *pWordNode;
/*在鏈表中尋找該詞,如果該詞出現(xiàn)了就在原節(jié)點(diǎn)上將頻率加1,如果該詞沒(méi)有出現(xiàn)則新創(chuàng)建并添加節(jié)點(diǎn)*/
if((pfNode == NULL) || (sWords == NULL) || (sFileURL == NULL) || (iPos < 0))
{
printf("error is: %d\n", EFWLIST);
return EFWLIST;
}
/*得到詞鏈表的首節(jié)點(diǎn)*/
if (pfNode->wFWordNode == NULL)
{
pfNode->wFWordNode = (WordNode *) malloc(sizeof(WordNode));
pWordNode = pfNode->wFWordNode;
pWordNode->pnext = NULL;
pWordNode->fWeight = 0;
pWordNode->iFreq = 1;
pWordNode->iPos = (int *) malloc(MAXPOS * sizeof(int));
pWordNode->iPos[0] = iPos;
pWordNode->sResever = '0';
strcpy(pWordNode->sFileURL, sFileURL);
pWordNode->sWords = (char *) malloc( MAXWORDLEN * sizeof(char));
strcpy(pWordNode->sWords, sWords);
return 0;
}
pWordNode = pfNode->wFWordNode;
while ((strcmp(pWordNode->sWords, sWords) != 0) && (pWordNode->pnext != NULL))
{
pWordNode = pWordNode->pnext;
}
/*找到該詞*/
if ((strcmp(pWordNode->sWords, sWords)) == 0)
{
/*詞頻加1*/
(pWordNode->iFreq)++;
/*記住該詞的位置*/
if (pWordNode->iFreq < MAXPOS)
{
pWordNode->iPos[pWordNode->iFreq - 1] = iPos;
strcpy(pWordNode->sFileURL, sFileURL);
}
else
{
pWordNode->iPos = (int *) realloc(pWordNode->iPos, pWordNode->iFreq * sizeof(int));
strcpy(pWordNode->sFileURL, sFileURL);
if(pWordNode->iPos == NULL)
{
printf("error is: %d\n", EMALLOC);
printf("內(nèi)存不足!\n");
return EMALLOC;
}
}
}
/*該鏈表中沒(méi)有該詞,創(chuàng)建該詞節(jié)點(diǎn)*/
else
{
pWordNode->pnext = (WordNode *) malloc(sizeof(WordNode));
if(pWordNode->pnext == NULL)
{
printf("error is: %d\n", EMALLOC);
printf("內(nèi)存不足!\n");
return EMALLOC;
}
pWordNode = pWordNode->pnext;
pWordNode->pnext = NULL;
pWordNode->fWeight = 0;
pWordNode->iFreq = 1;
pWordNode->sResever = type;
pWordNode->iPos = (int *) malloc(MAXPOS * sizeof(int));
if(pWordNode->iPos == NULL)
{
printf("error is: %d\n", EMALLOC);
printf("內(nèi)存不足!\n");
return EMALLOC;
}
pWordNode->iPos[0] = iPos;
strcpy(pWordNode->sFileURL, sFileURL);
pWordNode->sWords = (char *) malloc( MAXWORDLEN * sizeof(char));
if(pWordNode->sWords == NULL)
{
printf("error is: %d\n", EMALLOC);
printf("內(nèi)存不足!\n");
return EMALLOC;
}
strcpy(pWordNode->sWords, sWords);
}
return 0;
}
/* Assign a weight to every word node of one forward-list entry.

   pfNode  forward-list entry whose word list (wFWordNode) is walked
   lPos    total number of words counted in the document; used as the
           divisor for the term-frequency weight

   Nodes tagged by section receive a fixed weight:
       't' (title)    0.7
       'a' (author)   1.0
       'b' (abstract) 0.5
       'k' (keyword)  1.0
   Every other node receives iFreq / lPos.

   Returns 0 on success, EWEIGHT when pfNode is NULL. */
int Weight(ForwardNode *pfNode, long lPos)
{
    WordNode *pCur;
    double dTotal;

    if (pfNode == NULL) {
        printf("EWEIGHT\n");
        return EWEIGHT;
    }

    dTotal = lPos;
    for (pCur = pfNode->wFWordNode; pCur != NULL; pCur = pCur->pnext)
    {
        switch (pCur->sResever)
        {
        case 't':               /* document title */
            pCur->fWeight = 0.7;
            break;
        case 'a':               /* document author */
        case 'k':               /* document keyword */
            pCur->fWeight = 1.0;
            break;
        case 'b':               /* document abstract */
            pCur->fWeight = 0.5;
            break;
        default:                /* anything else: relative frequency */
        {
            double dFreq = pCur->iFreq;
            pCur->fWeight = dFreq / dTotal;
            break;
        }
        }
    }
    return 0;
}
/* Read a segmented (tokenised) text buffer and build the word list of one
   forward-list entry, counting occurrences and recording word positions.

   SegBuf   NUL-terminated buffer of whitespace-separated tokens
   sDocID   document identifier (currently a decimal number); must be shorter
            than MAXDOCID
   sFileURL URL/path of the source document; must be shorter than MAXPATHL
   pfNode   forward-list entry to fill; its word list is reset to empty

   The tokens "title@title", "author@author", "keyword@keyword",
   "abstract@abstract" and "text@text" switch the section type attached to
   the words that follow ('t', 'a', 'k', 'b', 'x'; the initial type is '0').
   The token "末##末" is ignored. Neither kind of token advances the word
   position. Every other token advances the position by one, but only tokens
   that start with a GB double-byte Chinese character or an ASCII letter are
   inserted into the list.

   Returns 0 on success, SEGFLRD for NULL arguments or an empty buffer,
   EDOCID / EPATHLEN when an identifier is too long, EFWWRT when a word
   cannot be inserted and EWEIGHT when weighting fails.

   Changes from the earlier version:
   - the URL length check now tests sFileURL (it used to test sDocID again);
   - tokens are cut out with strspn/strcspn instead of an unbounded
     sscanf("%s"), so an over-long token can no longer overflow sWords and
     tabs/newlines no longer desynchronise the cursor or cause the previous
     word to be processed twice;
   - a token that does not fit in MAXWORDLEN bytes still consumes a position
     but is not indexed. */
int SegBufPos(char *SegBuf, char *sDocID, char *sFileURL, ForwardNode *pfNode)
{
    /* Same separator set that sscanf("%s") treats as whitespace. */
    static const char sBlank[] = " \t\n\r\v\f";
    /* Section markers; cType 0 means "ignore the token, keep the section". */
    static const struct { const char *sTag; char cType; } aTags[] = {
        { "末##末",            0   },
        { "title@title",       't' },
        { "author@author",     'a' },
        { "keyword@keyword",   'k' },
        { "abstract@abstract", 'b' },
        { "text@text",         'x' }
    };
    char sWords[MAXWORDLEN];
    unsigned char temp[2];
    size_t nBufLen, nCur, nTokLen, iTag;
    const char *pTok;
    long lPos = 0;
    char type = '0';
    int bIsTag;

    if ((SegBuf == NULL) || (sDocID == NULL) || (sFileURL == NULL) || (pfNode == NULL)) {
        printf("error is: %d\n", SEGFLRD);
        return SEGFLRD;
    }
    /* Record the document identifier. */
    if (MAXDOCID > strlen(sDocID)) {
        strcpy(pfNode->sDocID, sDocID);
    }
    else {
        printf("error is: %d\n", EDOCID);
        printf("MAXDOCID is not enough!\n");
        return EDOCID;
    }
    /* Record the source URL; the length of the URL itself is what matters. */
    if (MAXPATHL > strlen(sFileURL)) {
        strcpy(pfNode->sFileURL, sFileURL);
    }
    else {
        printf("error is: %d\n", EPATHLEN);
        printf("MAXPATHL is not enough!\n");
        return EPATHLEN;
    }
    printf("begin to SegBufPos\n");
    pfNode->wFWordNode = NULL;

    nBufLen = strlen(SegBuf);
    if (nBufLen == 0) {
        printf("error is: %d\n", SEGFLRD);
        return SEGFLRD;
    }

    nCur = 0;
    while (nCur < nBufLen)
    {
        /* Skip separators, then measure the next token. */
        nCur += strspn(SegBuf + nCur, sBlank);
        if (nCur >= nBufLen) {
            break;
        }
        pTok = SegBuf + nCur;
        nTokLen = strcspn(pTok, sBlank);
        nCur += nTokLen;

        if (nTokLen >= sizeof sWords) {
            /* Too long to store: keep later positions consistent, skip it. */
            lPos++;
            continue;
        }
        memcpy(sWords, pTok, nTokLen);
        sWords[nTokLen] = '\0';

        bIsTag = 0;
        for (iTag = 0; iTag < sizeof aTags / sizeof aTags[0]; iTag++) {
            if (strcmp(sWords, aTags[iTag].sTag) == 0) {
                if (aTags[iTag].cType != 0) {
                    type = aTags[iTag].cType;
                }
                bIsTag = 1;
                break;
            }
        }
        if (bIsTag) {
            continue;
        }

        /* Debugging hook kept from the original code. */
        if (strcmp(sWords, "畢紅") == 0)
        {
            printf("break!\n");
        }

        /* For a one-byte word sWords[1] is the terminating NUL. */
        temp[0] = (unsigned char) sWords[0];
        temp[1] = (unsigned char) sWords[1];
        lPos++;

        if (((temp[0] >= GBHTBTMNUM) && (temp[0] <= GBHTTOPNUM) &&
             (temp[1] >= GBLWBTMNUM) && (temp[1] <= GBLWTOPNUM)) ||   /* Chinese word */
            ((temp[0] >= 'a') && (temp[0] <= 'z')) ||                 /* English word */
            ((temp[0] >= 'A') && (temp[0] <= 'Z')))
        {
            if (InstWd2FwList(pfNode, sWords, sFileURL, (int) lPos, type) != 0)
            {
                printf("error is: %d\n", EFWWRT);
                return EFWWRT;
            }
        }
    }
    printf("SegBufPos is over!\n");
    /* Compute the weight of every word. */
    if (Weight(pfNode, lPos) != 0) {
        return EWEIGHT;
    }
    printf("weight calculate is over!\n");
    return 0;
}
/* Build the forward list for a batch of documents and update the file that
   records how many documents have been processed so far.

   sFilePath directory that holds the counter file
   sRcdFile  name of the counter file (joined to sFilePath with a backslash)
   sScFile   location of the original (unsegmented) source file; passed on
             as the URL of every document in the batch
   fNode     out: receives a malloc'ed array of DocCount ForwardNode entries
   DocCount  number of documents in this batch; must be positive
   strBuf    segmented text buffer handed to SegBufPos for each document

   The counter file holds one decimal number. If it exists, the number is
   read, DocCount is added and the file is rewritten; documents are then
   numbered starting from the old value. If it does not exist it is created
   with DocCount and numbering starts at 0 (failure to create it is reported
   but, as before, does not abort the build).

   Returns 0 on success, EFWBLDPAR for bad arguments, EPATHLEN when the
   counter-file path does not fit in MAXPATHLEN, EMEM on allocation failure,
   EFILEOPEN when the existing counter file cannot be rewritten, EITOA when
   a document number cannot be formatted and ERSEGFILE when SegBufPos fails.
   On ERSEGFILE/EITOA *fNode stays allocated (entries may be partly filled).

   Changes from the earlier version: temporary buffers are released on every
   exit path; the path is length-checked before it is built; the counter is
   read with a bounded numeric conversion instead of an unbounded "%s" (an
   empty or unreadable file counts as 0); the non-standard itoa is replaced
   by sprintf; the redundant copy of sScFile is gone. */
int ForwardBld(char *sFilePath, char *sRcdFile, char *sScFile, ForwardNode **fNode, long DocCount, char *strBuf)
{
    FILE *pFlCount;
    char *RcdFile;
    char sDocCount[MAXNUMBER];   /* large enough for any long in decimal */
    long lPrevCount = 0;
    long lTotal;
    long i;
    int iCurDoc;
    int rc = 0;

    if ((sRcdFile == NULL) || (sScFile == NULL) || (sFilePath == NULL) || (fNode == NULL) || (strBuf == NULL))
    {
        printf("建立正向表時(shí)入口參處有誤!\n");
        return EFWBLDPAR;
    }
    if (DocCount <= 0)
    {
        printf("建立正向表時(shí)入口參處有誤!\n");
        return EFWBLDPAR;
    }
    /* directory + '\\' + file name + NUL must fit in the path buffer */
    if (strlen(sFilePath) + 1 + strlen(sRcdFile) >= MAXPATHLEN)
    {
        printf("error is: %d\n", EPATHLEN);
        return EPATHLEN;
    }

    RcdFile = malloc(MAXPATHLEN * sizeof *RcdFile);
    if (RcdFile == NULL)
    {
        printf("內(nèi)存不足!\n");
        return EMEM;
    }
    /* Name of the file that records the total number of documents. */
    sprintf(RcdFile, "%s\\%s", sFilePath, sRcdFile);

    pFlCount = fopen(RcdFile, "r");
    if (pFlCount == NULL)
    {
        /* No counter yet: start numbering at 0 and try to create the file. */
        lTotal = DocCount;
        pFlCount = fopen(RcdFile, "a+");
        if (pFlCount == NULL)
        {
            /* Deliberately non-fatal, as in the original code. */
            printf("record.txt cann't open!\n");
        }
        else
        {
            fprintf(pFlCount, "%ld ", lTotal);
            fclose(pFlCount);
        }
    }
    else
    {
        /* Read the number of documents processed so far. */
        if (fscanf(pFlCount, "%ld", &lPrevCount) != 1)
        {
            lPrevCount = 0;
        }
        fclose(pFlCount);
        lTotal = lPrevCount + DocCount;

        pFlCount = fopen(RcdFile, "w+");
        if (pFlCount == NULL)
        {
            printf("record.txt can't open! (else) \n");
            rc = EFILEOPEN;
            goto done;
        }
        fprintf(pFlCount, "%ld ", lTotal);
        fclose(pFlCount);
    }
    iCurDoc = (int) lPrevCount;

    sprintf(sDocCount, "%ld", lTotal);
    printf("當(dāng)前正在處理文檔:%s\n", sDocCount);

    if ((size_t) DocCount > (size_t) -1 / sizeof **fNode)
    {
        *fNode = NULL;
        rc = EMEM;
        goto done;
    }
    *fNode = malloc((size_t) DocCount * sizeof **fNode);
    if (*fNode == NULL)
    {
        rc = EMEM;
        goto done;
    }

    for (i = 0; i < DocCount; i++)
    {
        /* The document ID is its running number in decimal. */
        if (sprintf(sDocCount, "%d", iCurDoc + (int) i) < 0)
        {
            rc = EITOA;
            goto done;
        }
        if (SegBufPos(strBuf, sDocCount, sScFile, *fNode + i) != 0)
        {
            printf("%d\n", ERSEGFILE);
            printf("segbufPos return error!\n ");
            rc = ERSEGFILE;
            goto done;
        }
    }

done:
    free(RcdFile);
    return rc;
}
/*該函數(shù)功能:建立漢字倒排索引表,(索引號(hào)是GB碼減去176和161)
入口參數(shù):InvertNode **Index為待分配的空間
int * error 錯(cuò)誤編碼,函數(shù)執(zhí)行正確為0
返回值: InvertNode ** 分配內(nèi)存后的地址
*/
InvertNode ** IndexBuild(InvertNode **Index, int * error)
{
int i;
int count = 0;
Index = (InvertNode **) malloc ((GBHTTOPNUM - GBHTBTMNUM + 1) * sizeof(InvertNode *));
?? 快捷鍵說(shuō)明
復(fù)制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號(hào)
Ctrl + =
減小字號(hào)
Ctrl + -