?? msearch.cpp
字號:
#include <stdio.h>
#include <string.h>
#include <malloc.h>
#include <stdlib.h>
#include <time.h>
#include <conio.h>
#include <windows.h>
#define SHIFT3 4
#define SHIFT2 10
#define SHIFT1_USED 10
#define SHIFT1 (32-SHIFT2-SHIFT3)
static char * ChangeFileExt(const char * fn, const char * fext)
{
int l=strlen(fn);
int le=strlen(fext);
for(int i=l-1; fn[i]!='.' && fn[i]!='\\' && fn[i]!='/' && fn[i]!=':' && i>=0; i--);
char * fnext;
// if extensible name does not exist
if(i<=0||fn[i]=='\\'||fn[i]=='/'||fn[i]==':')
{
// allocate memory for new name
fnext=new char[l+le+2];
strcpy(fnext,fn);
fnext[l]='.';
l++;
strcpy(fnext+l,fext);
}
else
{
l=i+1;
// allocate
fnext=new char[l+le+1];
strcpy(fnext,fn);
strcpy(fnext+l,fext);
}
return fnext;
}
#define LBUF_SIZE 1024 // line buffer size
#define MEMORY_MIRROR // process dictionary file in memory (unused)
#define DEBUG
#ifdef DEBUG
static void dumpBytes(const void * _bytes, size_t len)
{
const unsigned char * bytes = (const unsigned char *)_bytes;
size_t j;
for(j=0; j<len; j++)
printf("%02x ", (int)bytes[j] );
}
#endif
#pragma pack(1)
// Dictionary file header
typedef struct _DictHeader
{
char maodict[8]; // string "MAODICT"
long so; // index start offset
long eo; // index end offset
} DictHeader;
// Index item structure(5 bytes)
typedef struct _IndexItem
{
union
{
long offset; // string offset
char * str; // string pointer(unused)
};
char length; // string length
} IndexItem;
// A list node of searching result
typedef struct _ResultNode
{
int index; // index number in dictionary
int count; // match times of word
//ResultNode * next;
} ResultNode;
// Global variables
static FILE * fpDict;
static char * lbuf;
static size_t lbufLen;
static char * buf;
static int compareItem(const void * a1, const void * a2)
{
IndexItem * it1 = (IndexItem *)a1;
IndexItem * it2 = (IndexItem *)a2;
size_t len1 = it1->length;
size_t len2 = it2->length;
int ret;
ret = strncmp(buf + it1->offset, buf + it2->offset, (len1<len2)?len1:len2 );
if(ret==0)
{
if(len1>len2)
ret = 1;
else if(len1<len2)
ret = -1;
}
return ret;
}
// Create binary format dictionary (one word per line in text file, need not sorted or unique)
static int textToBinaryFile(const char * fnDictT, const char * fnDict)
{
FILE * fpDictT;
//FILE * fpDict;
FILE * fpTemp;
size_t lbufSize;
//char * lbuf;
//size_t lbufLen;
DictHeader hdr;
int rcCount; // record count
IndexItem it;
IndexItem * indexes;
size_t strAreaSize;
int i;
int lastWIndex; // last written index number
int rcCountF; // record count after filted
fpDictT = fopen(fnDictT, "r");
if(fpDictT==NULL)
{
printf("Cannot open source file [%s].\n", fnDictT);
return -1;
}
fpDict = fopen(fnDict, "wb");
if(fpDict==NULL)
{
printf("Cannot open target file [%s].\n", fnDict);
fclose(fpDictT);
return -1;
}
fpTemp = tmpfile();
if(fpTemp==NULL)
{
printf("Cannot create temporary file.\n");
fclose(fpDictT);
fclose(fpDict);
return -1;
}
lbufSize = LBUF_SIZE;
lbuf = (char *)malloc(lbufSize*sizeof(char));
// write target file header
memset(&hdr, 0, sizeof(hdr));
strcpy(hdr.maodict, "MAODICT");
fwrite(&hdr, sizeof(hdr), 1, fpDict);
printf("Reading and creating data area... ");
// read all lines
rcCount = 0;
while(!feof(fpDictT))
{
fgets(lbuf, lbufSize, fpDictT);
lbufLen = strlen(lbuf);
// trim the terminal '\n'
if(lbufLen>0)
if(lbuf[lbufLen-1]=='\n' || lbuf[lbufLen-1]=='\r')
lbuf[--lbufLen] = '\0';
if(lbufLen==0)
continue;
it.offset = ftell(fpDict);
it.length = (char)lbufLen;
fwrite(lbuf, sizeof(char), lbufLen+1, fpDict);
fwrite(&it, sizeof(it), 1, fpTemp);
rcCount++;
//printf("%d: %d %s\n", rcCount, lbufLen, lbuf);
}
printf("done!\n");
// get the whole string area size
strAreaSize = ftell(fpDict);
fclose(fpDictT);
indexes = (IndexItem *)malloc(rcCount*sizeof(IndexItem));
fseek(fpTemp, 0, SEEK_SET);
fread(indexes, sizeof(IndexItem), rcCount, fpTemp);
fclose(fpTemp);
// now, only fpDict is open
// reopen the file for read+write
fclose(fpDict);
fpDict = fopen(fnDict, "r+b");
printf("Sorting and building index table... ");
buf = (char *)malloc(strAreaSize);
fseek(fpDict, 0, SEEK_SET);
fread(buf, 1, strAreaSize, fpDict);
qsort(indexes, rcCount, sizeof(IndexItem), compareItem);
printf("done!\n");
/*
// output all records for check
for(i=0; i<rcCount; i++)
{
fseek(fpDict, indexes[i].offset, SEEK_SET);
fread(lbuf, sizeof(char), indexes[i].length, fpDict);
lbuf[indexes[i].length] = '\0';
printf("%6d: %s\n", i, lbuf);
}
*/
fseek(fpDict, 0, SEEK_END);
hdr.so = ftell(fpDict);
printf("Writing index table... ");
lastWIndex = -1;
rcCountF = 0;
for(i=0; i<rcCount; i++)
{
if(lastWIndex != -1)
if(indexes[i].length == indexes[lastWIndex].length)
if(strncmp(buf + indexes[i].offset, buf + indexes[lastWIndex].offset, indexes[i].length) == 0)
continue;
fwrite(&indexes[i], sizeof(IndexItem), 1, fpDict);
lastWIndex = i;
rcCountF++;
}
printf("done!\n");
hdr.eo = ftell(fpDict);
free(buf);
fseek(fpDict, 0, SEEK_SET);
fwrite(&hdr, sizeof(hdr), 1, fpDict);
/*
fseek(fpDict, hdr.so, SEEK_SET);
fread(indexes, sizeof(IndexItem), rcCountF, fpDict);
// output all records for check
for(i=0; i<rcCountF; i++)
{
fseek(fpDict, indexes[i].offset, SEEK_SET);
fread(lbuf, sizeof(char), indexes[i].length, fpDict);
lbuf[indexes[i].length] = '\0';
printf("%s\n", lbuf);
}
*/
fclose(fpDict);
printf("Dictionary file [%s] is created.\n", fnDict);
free(indexes);
free(lbuf);
return 0;
}
/*
* dbuf: data area pointer
* idx: index area pointer
* ch: current character
* j: current character's position in word
* _si, _li: previous range
* return: 1 - fonnd; 0 - not found
*/
static inline int getCharIndex( const char * dbuf,
const IndexItem * idx,
int ch,
int j, int * _si, int * _li)
{
int si = *_si;
int li = *_li;
int mi;
int ssi, lli, mmi;
#define GETCH(x) ( (unsigned char)*(dbuf + idx[x].offset + j) )
if(ch < GETCH(si))
{
// above the upper border [not found - case 1]
*_si = *_li = si;
return 0;
}
else if(ch == GETCH(si))
{
// start position
*_si = si;
if(ch == GETCH(li))
{
// li is just the end
*_li = li;
return 1;
}
else
{
/* ch < GETCH(li) */
// using binary search, find the end
ssi = si;
lli = li;
while(lli-ssi>1)
{
mmi = (ssi + lli) / 2;
if(ch < GETCH(mmi))
lli = mmi;
else
ssi = mmi;
}
*_li = ssi;
return 1;
}
}
else
{
/* ch > GETCH(si) */
if(ch > GETCH(li))
{
// below the lower border [not found - case 2]
*_si = *_li = li+1;
return 0;
}
else if(ch == GETCH(li))
{
*_li = li;
// using binary search, find the start
ssi = si;
lli = li;
while(lli-ssi>1)
{
mmi = (ssi + lli) / 2;
if(ch <= GETCH(mmi))
lli = mmi;
else
ssi = mmi;
}
*_si = lli;
return 1;
}
else
{
/* ch < GETCH(li) */
// the most common case
while(li-si>1)
{
mi = (si + li) / 2;
if(ch < GETCH(mi))
li = mi;
else if(ch > GETCH(mi))
si = mi;
else
{
/* == found */
// search the upper border
ssi = si;
lli = mi;
while(lli-ssi>1)
{
mmi = (ssi + lli) / 2;
if(ch <= GETCH(mmi))
lli = mmi;
else
ssi = mmi;
}
*_si = lli;
// search the lower border
ssi = mi;
lli = li;
while(lli-ssi>1)
{
mmi = (ssi + lli) / 2;
if(ch < GETCH(mmi))
lli = mmi;
else
ssi = mmi;
}
*_li = ssi;
return 1;
}
}
// not included [not found - case 3]
*_si = *_li = li;
return 0;
}
}
}
static int compareResultNode(const void * a1, const void * a2)
{
return -( ((ResultNode *)a1)->count - ((ResultNode *)a2)->count );
}
int SearchTextFile(const char * fnDict, FILE * fp)
{
DictHeader hdr;
int si, li;
int rcCount;
char * dbuf; // data buffer
IndexItem * idx; // index buffer
//unsigned char * lbuf;
int * cbuf;
//size_t dbufSize;
//size_t lbufSize = LBUF_SIZE;
size_t cbufSize = LBUF_SIZE;
//int ch;
int i, j, k;
int ret;
int t1, t2, t3;
size_t cs; // calculated size
size_t msizeCount = 0; // malloc size count
int * * * resultMap = NULL;
clock_t startClock, finishClock;
int resultCount = 0;
ResultNode * resultList = NULL;
int resultListSize = 1024;
//ResultNode * * pCurrentRN = &resultList;
//ResultNode * currentRN;
// variables for File Mapping
HFILE hDictFile;
HANDLE hMapFile;
HANDLE hMapView;
OFSTRUCT opBuf;
//
lbuf = (char *)malloc(MAX_PATH);
cbuf = (int *)malloc(cbufSize);
// 3 steps to map a file: OpenFile, CreateFileMapping, MapViewOfFile
hDictFile = OpenFile(fnDict, &opBuf, OF_READ);
if(hDictFile == HFILE_ERROR)
{
fprintf(stderr, "Cannot open dictionary file [%s].\n", fnDict);
return -1;
}
hMapFile = CreateFileMapping((HANDLE)hDictFile, NULL, PAGE_READONLY, 0, 0, "DictMap");
if(hMapFile == NULL)
{
fprintf(stderr, "Mapping file failed.\n");
CloseHandle((HANDLE)hDictFile);
return -1;
}
CloseHandle((HANDLE)hDictFile);
hDictFile = 0;
hMapView = MapViewOfFile(hMapFile, FILE_MAP_READ, 0, 0, 0);
if(hMapView == NULL)
{
fprintf(stderr, "Mapping view failed.\n");
CloseHandle(hMapFile);
return -1;
}
// memory - file mapped
dbuf = (char *)hMapView;
memcpy(&hdr, dbuf, sizeof(hdr));
if(strncmp(hdr.maodict, "MAODICT", sizeof(hdr.maodict)) != 0)
{
printf("Invalid dictionary format, expecting \"MAODICT\".\n");
UnmapViewOfFile(hMapView);
CloseHandle(hMapFile);
return -1;
}
idx = (IndexItem *)(dbuf + hdr.so);
rcCount = (hdr.eo - hdr.so) / sizeof(IndexItem);
// initialize the first class mapping table
cs = (1<<SHIFT1_USED)*sizeof(*resultMap);
resultMap = (int ***)malloc(cs);
memset(resultMap, NULL, cs);
msizeCount += cs;
//fprintf(stderr, "Searching... ");
// remember when we start
startClock = clock();
j = 0; // word char index
k = 0; // number of buffered chars
do
{
j = 0; // return zero
si = 0;
li = rcCount-1;
for(j=0; ; j++)
{
while(k<=j)
cbuf[k++] = fgetc(fp);
if(cbuf[j]==EOF)
break;
ret = getCharIndex(dbuf, idx, cbuf[j], j, &si, &li);
//======================================
// if this is a complete word, add it
if(ret && j==idx[si].length-1)
{
t1 = si >> (SHIFT2+SHIFT3);
t2 = ((unsigned int)si<<SHIFT1) >> (SHIFT1+SHIFT3);
t3 = ((unsigned int)si << (SHIFT1+SHIFT2) ) >> (SHIFT1+SHIFT2);
if(resultMap[t1]==NULL)
{
cs = (1<<SHIFT2) * sizeof(resultMap[0][0]);
resultMap[t1] = (int **)malloc(cs);
memset(resultMap[t1], NULL, cs);
msizeCount += cs;
}
if(resultMap[t1][t2]==NULL)
{
cs = (1<<SHIFT3) * sizeof(resultMap[0][0][0]);
resultMap[t1][t2] = (int *)malloc(cs);
memset(resultMap[t1][t2], 0, cs);
msizeCount += cs;
}
//printf("%d: %d %d %d\n", si, t1, t2, t3);
resultMap[t1][t2][t3] += 1;
}
//======================================
if(!ret)
break;
else
{
if(li-si==0)
if(j==idx[si].length-1)
break;
}
}
// move buffer one step foward (overlapped spaces!!!)
if(k>1)
memmove(cbuf, cbuf+1, (k-1)*sizeof(cbuf[0]));
//printf("%d\n", k);
k --;
} while(cbuf[0]!=EOF);
finishClock = clock();
//fprintf(stderr, "complete!\n");
fprintf(stderr, "Processed in %.3fs.\n", double(finishClock-startClock)/CLOCKS_PER_SEC );
fprintf(stderr, "Totally allocated memory: %.2fKB\n", (double)msizeCount/1024 );
//================================================
resultList = (ResultNode *)malloc(resultListSize*sizeof(ResultNode) );
for(i=0; i<(1<<SHIFT1_USED); i++)
if(resultMap[i])
for(j=0; j<(1<<SHIFT2); j++)
if(resultMap[i][j])
for(k=0; k<(1<<SHIFT3); k++)
if(resultMap[i][j][k]>0)
{
//printf("\"%s\" -- %d\n", dbuf + idx[(i<<(SHIFT2+SHIFT3))|(j<<SHIFT3)|k].offset, resultMap[i][j][k] );
// add to result list
if(resultCount >= resultListSize)
{
resultListSize *= 2;
resultList = (ResultNode *)realloc(resultList, resultListSize*sizeof(ResultNode) );
}
resultList[resultCount].index = (i<<(SHIFT2+SHIFT3))|(j<<SHIFT3)|k;
resultList[resultCount].count = resultMap[i][j][k];
/*
*pCurrentRN = (ResultNode *)malloc( 1*sizeof(ResultNode) );
(*pCurrentRN)->index = (i<<(SHIFT2+SHIFT3))|(j<<SHIFT3)|k;
(*pCurrentRN)->count = resultMap[i][j][k];
(*pCurrentRN)->next = NULL;
pCurrentRN = &(*pCurrentRN)->next;
*/
resultCount += 1;
}
// save them to array
/*
ResultNode * * results = (ResultNode * *)malloc( resultCount*sizeof(ResultNode *) );
j = 0;
for(currentRN=resultList; currentRN!=NULL; currentRN=currentRN->next)
{
results[j++] = currentRN;
}
*/
// sort
qsort(resultList, resultCount, sizeof(ResultNode), compareResultNode);
// output result
//fprintf(stderr, "Result:\n");
for(j=0; j<resultCount; j++)
{
printf("%s -- %d\n", dbuf + idx[resultList[j].index].offset, resultList[j].count );
}
free(resultList);
UnmapViewOfFile(hMapView);
return 0;
}
int main(int argc, char * argv[])
{
char * fileName = NULL;
char opcode = 's';
int i;
const char * fnDict = "English.dat";
//FILE * fp;
//FILE * fpDict;
for(i=1; i<argc && i<3; i++)
{
if(argv[i][0]=='-')
opcode = argv[i][1];
else
fileName = argv[i];
}
// select an operation
switch(opcode)
{
case 'c':
if(fileName==NULL)
{
fprintf(stderr, "File name is needed.\n");
return -1;
}
textToBinaryFile(fileName, ChangeFileExt(fileName, "dat") );
//textToTrieFile(fileName, ChangeFileExt(fileName, "dat") );
break;
case 's':
if(fileName==NULL)
{
fprintf(stderr, "File name is needed.\n");
return -1;
}
SearchTextFile(fileName, stdin);
break;
case 'h':
default:
printf("Usage:\n");
printf(" msearch -c <source file> .... Convert text file to dictionary.\n");
printf(" msearch <dict file> .... Input text to search, ended with [Ctrl+Z].\n");
printf(" msearch -h .... Print help information.\n");
printf("Examples:\n");
printf(" msearch -c English.txt .... Create English.dat.\n");
printf(" msearch English.dat <gpl3.txt >result.txt \n"
" .... Search keywords in gpl3.txt and write results to result.txt,\n"
" using dictionary English.dat\n");
return -1;
}
return 0;
}
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -