/* invert10_31.c */
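/* This program experiments with building and updating an inverted index; it adds handling of English index terms.
A $curfile.txt file is stored in the index directory and records the index file currently being written.
2006_10_4: re-checked memory usage, mainly to fix possible memory leaks in the inverted index; also removed unrelated in-memory functions.
2006_10_8: wrote the inverted-index update routine; idx.txt is written once after every several generated documents.
In $curfile.txt, the first line records the name of the inverted file currently being written and the second line records the name of the file that has already been updated;
the next update starts from that file.
Based on the requirements above, the program is modified as follows:
1. Remove the writes to idx.txt while building the inverted index; idx.txt is generated only during an update.
2. Write a separate update function that reorganises the contents of every file in the current directory that is greater than the file recorded in $update.txt (except idx.txt and $curfile.txt);
reorganising mainly means grouping identical words together.
idx.txt is not written while building the inverted index.
*/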
#include "stdio.h"
#include "seng.h"
#include "string.h"
#include "math.h"
#include "malloc.h"
#include "stdlib.h"
#include "assert.h"
#include "direct.h"
#define MALCSIZE 100 /*一次分配的內存大小*/
#define RELCSIZE 100 /*當一次分配的內存不夠時,二次分配時的加數*/
#define SHORTSIZE 20 /*一個詞的最大詞長10*/
#define INDEXNUMBER 6768 /*簡體中文字的個數*/
#define GBLWBTMNUM 161 /*簡體中文國標碼低位最小值*/
#define GBLWTOPNUM 254 /*簡體中文國標碼低位最大值*/
#define GBHTBTMNUM 176 /*簡體中文國標碼高位最小值*/
#define GBHTTOPNUM 247 /*簡體中文國標碼高位最大值*/
#define MAXPATHL 50 /*最大路徑*/
#define MAXPATH 50 /*最大路徑*/
#define MAXWORD 80 /*最長的詞長為40,一篇文檔中最多出現的相同的字頭的詞的個數*/
#define MAXWORDONE 50 /*以某一個字開頭的可能有的詞數*/
#define MAXLINEFILE 3 /*倒排索引文件的最大行數*/
/* #define MAXWORDLEN 50 最大詞長*/
#define MAXNUMBER 50 /*最大文檔數*/
#define MAXFILENAME 20 /*最大文檔數*/
#define MAXPOS 400 /*一個詞在一篇文章中最多出現的次數*/
#define LOWERA 97 /*字母a所對應的的ASCII碼*/
#define DIFLOWHIGA 32 /*大寫字母和小寫字母ASCII碼的差值*/
#define MAXLINELEN 1000 /*倒排文檔中每行最多出現的字符個數*/
#define MAXBUFFER 2000 /*最大緩存區, 要注意大小*/
int realloccount = 10;
/*
 * Record one occurrence of a word in a document's forward list.
 *
 * If the word is already present in pfNode's word list, the position is
 * appended to its position array, its frequency is incremented and its file
 * URL is refreshed.  Otherwise a new node is created and appended to the tail
 * of the list (or becomes the head when the list is empty).
 *
 * Parameters:
 *   pfNode    forward-table entry whose word list (wFWordNode) is updated
 *   sWords    NUL-terminated word; must be shorter than MAXWORDLEN bytes
 *   sFileURL  NUL-terminated URL/path of the source file; must fit in the
 *             node's sFileURL field
 *   iPos      position of the word in the source, must be >= 0
 *   type      section tag stored in sResever when a new node is created
 *
 * Returns 0 on success, EFWLIST for invalid arguments and EMALLOC when memory
 * cannot be obtained.  On any failure the list is left exactly as it was.
 */
int InstWd2FwList(ForwardNode *pfNode, char *sWords, char *sFileURL, int iPos, char type)
{
    WordNode *pWordNode;
    WordNode *pTail = NULL;
    WordNode *pNew = NULL;
    int *pGrown;

    if ((pfNode == NULL) || (sWords == NULL) || (sFileURL == NULL) || (iPos < 0))
    {
        printf("error is: %d\n", EFWLIST);
        return EFWLIST;
    }
    /*
     * Refuse strings that would overflow the fixed-size destinations.  The
     * word buffer is allocated with MAXWORDLEN bytes; sFileURL is copied into
     * the node without a separate allocation, so it is an array embedded in
     * WordNode and sizeof yields its capacity (the operand is not evaluated).
     */
    if ((strlen(sWords) >= (size_t) MAXWORDLEN) || (strlen(sFileURL) >= sizeof(pNew->sFileURL)))
    {
        printf("error is: %d\n", EFWLIST);
        return EFWLIST;
    }

    /* Look the word up, remembering the last node for a possible append. */
    for (pWordNode = pfNode->wFWordNode; pWordNode != NULL; pWordNode = pWordNode->pnext)
    {
        if (strcmp(pWordNode->sWords, sWords) == 0)
        {
            break;
        }
        pTail = pWordNode;
    }

    if (pWordNode != NULL)
    {
        /*
         * Known word.  The position array starts with MAXPOS slots; once they
         * are all used it is grown by one slot per new occurrence.  realloc's
         * result goes through a temporary so the old array survives a failure.
         */
        if (pWordNode->iFreq >= MAXPOS)
        {
            pGrown = (int *) realloc(pWordNode->iPos, ((size_t) pWordNode->iFreq + 1) * sizeof(int));
            if (pGrown == NULL)
            {
                printf("error is: %d\n", EMALLOC);
                printf("內存不足!\n");
                return EMALLOC;
            }
            pWordNode->iPos = pGrown;
        }
        pWordNode->iPos[pWordNode->iFreq] = iPos;
        (pWordNode->iFreq)++;
        strcpy(pWordNode->sFileURL, sFileURL);
        return 0;
    }

    /* Unknown word: build the node completely before linking it in. */
    pNew = (WordNode *) malloc(sizeof(WordNode));
    if (pNew == NULL)
    {
        printf("error is: %d\n", EMALLOC);
        printf("內存不足!\n");
        return EMALLOC;
    }
    pNew->iPos = (int *) malloc(MAXPOS * sizeof(int));
    pNew->sWords = (char *) malloc(MAXWORDLEN * sizeof(char));
    if ((pNew->iPos == NULL) || (pNew->sWords == NULL))
    {
        free(pNew->sWords);
        free(pNew->iPos);
        free(pNew);
        printf("error is: %d\n", EMALLOC);
        printf("內存不足!\n");
        return EMALLOC;
    }
    pNew->pnext = NULL;
    pNew->fWeight = 0;
    pNew->iFreq = 1;
    pNew->sResever = type;   /* the head node used to get '0' here, losing the section tag */
    pNew->iPos[0] = iPos;
    strcpy(pNew->sFileURL, sFileURL);
    strcpy(pNew->sWords, sWords);

    if (pTail == NULL)
    {
        pfNode->wFWordNode = pNew;
    }
    else
    {
        pTail->pnext = pNew;
    }
    return 0;
}
/*
 * Assign a weight to every word node of a forward-table entry.
 *
 * Nodes tagged (sResever) as title 't' get 0.7, author 'a' and keyword 'k'
 * get 1.0, abstract 'b' gets 0.5; every other node gets its frequency
 * divided by lPos.
 *
 * Parameters:
 *   pfNode  forward-table entry whose word list is weighted
 *   lPos    total number of words counted in the text
 *
 * Returns 0 on success, EWEIGHT when pfNode is NULL.
 */
int Weight(ForwardNode *pfNode, long lPos)
{
    WordNode *pCur;
    double dTotal;

    if (pfNode == NULL)
    {
        printf("EWEIGHT\n");
        return EWEIGHT;
    }

    dTotal = lPos;
    for (pCur = pfNode->wFWordNode; pCur != NULL; pCur = pCur->pnext)
    {
        switch (pCur->sResever)
        {
        case 't':   /* document title */
            pCur->fWeight = 0.7;
            break;
        case 'a':   /* document author */
        case 'k':   /* document keyword */
            pCur->fWeight = 1.0;
            break;
        case 'b':   /* document abstract */
            pCur->fWeight = 0.5;
            break;
        default:    /* anything else: relative frequency */
            pCur->fWeight = (double) pCur->iFreq / dTotal;
            break;
        }
    }
    return 0;
}
/*
 * Build the forward list of one document from its segmented text.
 *
 * SegBuf holds whitespace-separated tokens.  The marker tokens title@title,
 * author@author, keyword@keyword, abstract@abstract and text@text switch the
 * section tag applied to the words that follow; the sentinel token is ignored.
 * Every other token advances the word position; tokens starting with a GB
 * Chinese character or an ASCII letter are inserted into pfNode's word list
 * through InstWd2FwList.  Finally Weight() is run over the list.
 *
 * Parameters:
 *   SegBuf    NUL-terminated segmented text
 *   sDocID    document number as text; must be shorter than MAXDOCID
 *   sFileURL  URL/path of the source file; must fit in pfNode->sFileURL
 *   pfNode    forward-table entry to fill; its word list is reset first
 *
 * Returns 0 on success, otherwise SEGFLRD (bad argument or empty buffer),
 * EDOCID, EPATHLEN, EFWWRT (insertion failed) or EWEIGHT.
 */
int SegBufPos(char *SegBuf, char *sDocID, char *sFileURL, ForwardNode *pfNode)
{
    /* the separators sscanf("%s") treats as whitespace */
    static const char sBlank[] = " \t\r\n\v\f";
    static const struct
    {
        const char *sMark;
        char cType;
    } aMarks[] = {
        { "title@title", 't' },
        { "author@author", 'a' },
        { "keyword@keyword", 'k' },
        { "abstract@abstract", 'b' },
        { "text@text", 'x' }
    };
    char sWords[MAXWORDLEN];
    const char *pCur;
    size_t nTokLen;
    size_t iMark;
    long lPos = 0;
    char type = '0';
    unsigned char temp[2];
    int bIsHanzi;
    int bIsLatin;

    if ((SegBuf == NULL) || (sDocID == NULL) || (sFileURL == NULL) || (pfNode == NULL))
    {
        printf("error is: %d\n", SEGFLRD);
        return SEGFLRD;
    }
    /* start from an empty list so the entry is well defined on every return path */
    pfNode->wFWordNode = NULL;

    /* record the document number */
    if (MAXDOCID > strlen(sDocID))
    {
        strcpy(pfNode->sDocID, sDocID);
    }
    else
    {
        printf("error is: %d\n", EDOCID);
        printf("MAXDOCID is not enough!\n");
        return EDOCID;
    }
    /*
     * The original guard tested strlen(sDocID) here, so the URL copy was never
     * bounded.  sFileURL is copied into the entry without allocation, i.e. it
     * is an embedded array, so sizeof gives the real capacity.
     */
    if (strlen(sFileURL) < sizeof(pfNode->sFileURL))
    {
        strcpy(pfNode->sFileURL, sFileURL);
    }
    else
    {
        printf("error is: %d\n", EPATHLEN);
        printf("MAXPATHL is not enough!\n");
        return EPATHLEN;
    }
    printf("begin to SegBufPos\n");
    if (SegBuf[0] == '\0')
    {
        printf("error is: %d\n", SEGFLRD);
        return SEGFLRD;
    }

    pCur = SegBuf;
    for (;;)
    {
        /* skip every kind of whitespace, not only ' ', so the cursor stays in step with the tokens */
        pCur += strspn(pCur, sBlank);
        if (*pCur == '\0')
        {
            break;
        }
        nTokLen = strcspn(pCur, sBlank);
        if (nTokLen >= sizeof(sWords))
        {
            /* token cannot fit in sWords: skip it but keep the position count in step */
            pCur += nTokLen;
            lPos++;
            continue;
        }
        memcpy(sWords, pCur, nTokLen);
        sWords[nTokLen] = '\0';
        pCur += nTokLen;

        if (strcmp(sWords, "末##末") == 0)
        {
            continue;
        }
        for (iMark = 0; iMark < sizeof(aMarks) / sizeof(aMarks[0]); iMark++)
        {
            if (strcmp(sWords, aMarks[iMark].sMark) == 0)
            {
                break;
            }
        }
        if (iMark < sizeof(aMarks) / sizeof(aMarks[0]))
        {
            /* section marker: remember the tag, do not count it as a word */
            type = aMarks[iMark].cType;
            continue;
        }

        temp[0] = (unsigned char) sWords[0];
        temp[1] = (unsigned char) sWords[1];
        /* position of this word */
        lPos++;

        bIsHanzi = (temp[0] <= GBHTTOPNUM) && (temp[0] >= GBHTBTMNUM)
                && (temp[1] <= GBLWTOPNUM) && (temp[1] >= GBLWBTMNUM);
        bIsLatin = ((temp[0] >= 'a') && (temp[0] <= 'z'))
                || ((temp[0] >= 'A') && (temp[0] <= 'Z'));
        if (bIsHanzi || bIsLatin)
        {
            /* Chinese and English words go into the same forward list */
            if (InstWd2FwList(pfNode, sWords, sFileURL, (int) lPos, type) != 0)
            {
                printf("error is: %d\n", EFWWRT);
                return EFWWRT;
            }
        }
    }
    printf("SegBufPos is over!\n");
    /* compute the weight of every word */
    if (Weight(pfNode, lPos) != 0)
    {
        return EWEIGHT;
    }
    printf("weight calculate is over!\n");
    return 0;
}
/*
 * Build the forward table for a batch of documents.
 *
 * The running total of processed documents is kept as text in the record file
 * sFilePath\sRcdFile.  When the file does not exist it is created holding
 * DocCount and numbering starts at 0; otherwise the stored total is read,
 * numbering continues from it and the file is rewritten with the new total.
 * *fNode then receives a freshly allocated array of DocCount entries, each
 * filled by SegBufPos from strBuf with consecutive document numbers.
 *
 * Parameters:
 *   sFilePath  directory of the record file
 *   sRcdFile   name of the record file
 *   sScFile    location of the original (unsegmented) file, stored as the URL
 *   fNode      out: receives the allocated forward table
 *   DocCount   number of documents to process, must be > 0
 *   strBuf     segmented text buffer
 *
 * Returns 0 on success, otherwise EFWBLDPAR, EPATHLEN, EMEM, EFILEOPEN, EITOA
 * or ERSEGFILE.  Once *fNode has been allocated it is left to the caller,
 * also when an error is returned afterwards; entries not yet processed have
 * an empty (NULL) word list.
 */
int ForwardBld(char *sFilePath, char *sRcdFile, char *sScFile, ForwardNode **fNode, long DocCount, char *strBuf)
{
    FILE *pFlCount;
    char *sDocCount = NULL;
    char *RcdFile = NULL;
    char *sFileURL = NULL;
    long lPrevCount = 0;
    long ltempCount;
    long i;
    int iRet = 0;

    if ((sRcdFile == NULL) || (sScFile == NULL) || (sFilePath == NULL) || (fNode == NULL) || (strBuf == NULL))
    {
        printf("建立正向表時入口參處有誤!\n");
        return EFWBLDPAR;
    }
    if (DocCount <= 0)
    {
        printf("建立正向表時入口參處有誤!\n");
        return EFWBLDPAR;
    }
    /* both paths are assembled in MAXPATHLEN-byte buffers: path + '\\' + name + NUL must fit */
    if ((strlen(sFilePath) + strlen(sRcdFile) + 1 >= (size_t) MAXPATHLEN)
        || (strlen(sScFile) >= (size_t) MAXPATHLEN))
    {
        printf("error is: %d\n", EPATHLEN);
        return EPATHLEN;
    }

    RcdFile = (char *) malloc(MAXPATHLEN * sizeof(char));
    sDocCount = (char *) malloc(MAXNUMBER * sizeof(char));
    sFileURL = (char *) malloc(MAXPATHLEN * sizeof(char));
    if ((RcdFile == NULL) || (sDocCount == NULL) || (sFileURL == NULL))
    {
        printf("內存不足!\n");
        iRet = EMEM;
        goto cleanup;
    }

    /* name of the file that records the total number of documents */
    strcpy(RcdFile, sFilePath);
    strcat(RcdFile, "\\");
    strcat(RcdFile, sRcdFile);

    pFlCount = fopen(RcdFile, "r");
    if (pFlCount == NULL)
    {
        /* no record yet: numbering starts at 0; failing to create the file is tolerated */
        ltempCount = DocCount;
        pFlCount = fopen(RcdFile, "a+");
        if (pFlCount == NULL)
        {
            printf("record.txt cann't open!\n");
        }
        else
        {
            fprintf(pFlCount, "%ld ", ltempCount);
            fclose(pFlCount);
        }
    }
    else
    {
        /* read the stored total as a number; an unreadable record counts as 0 */
        if (fscanf(pFlCount, "%ld", &lPrevCount) != 1)
        {
            lPrevCount = 0;
        }
        fclose(pFlCount);
        ltempCount = lPrevCount + DocCount;
        pFlCount = fopen(RcdFile, "w+");
        if (pFlCount == NULL)
        {
            printf("record.txt can't open! (else) \n");
            iRet = EFILEOPEN;
            goto cleanup;
        }
        fprintf(pFlCount, "%ld ", ltempCount);
        fclose(pFlCount);
    }
    /* a long printed in decimal always fits in the MAXNUMBER-byte buffer */
    sprintf(sDocCount, "%ld", ltempCount);
    printf("當前正在處理文檔:%s\n", sDocCount);

    *fNode = (ForwardNode *) malloc((size_t) DocCount * sizeof(ForwardNode));
    if (*fNode == NULL)
    {
        iRet = EMEM;
        goto cleanup;
    }
    /* give every entry an empty list so a partially built table can be walked safely */
    for (i = 0; i < DocCount; i++)
    {
        (*fNode)[i].wFWordNode = NULL;
    }

    /* name of the original file before segmentation */
    strcpy(sFileURL, sScFile);
    for (i = 0; i < DocCount; i++)
    {
        /* document numbers continue from the previously stored total */
        if (sprintf(sDocCount, "%ld", lPrevCount + i) < 0)
        {
            iRet = EITOA;
            goto cleanup;
        }
        if (SegBufPos(strBuf, sDocCount, sFileURL, *fNode + i) != 0)
        {
            printf("%d\n", ERSEGFILE);
            printf("segbufPos return error!\n ");
            iRet = ERSEGFILE;
            goto cleanup;
        }
    }

cleanup:
    free(RcdFile);
    free(sDocCount);
    free(sFileURL);
    return iRet;
}
/*該函數功能:建立漢字倒排索引表,(索引號是GB碼減去176和161)
入口參數:InvertNode **Index為待分配的空間
int * error 錯誤編碼,函數執行正確為0
返回值: InvertNode ** 分配內存后的地址
*/
InvertNode ** IndexBuild(InvertNode **Index, int * error)
{
int i;
int count = 0;
Index = (InvertNode **) malloc ((GBHTTOPNUM - GBHTBTMNUM + 1) * sizeof(InvertNode *));
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -