?? 分詞程序.cpp

?? 實現(xiàn)中文分詞
?? CPP
字號:
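/* Chinese word segmentation; characters are compared as integers. */
/* word.txt in the working directory is the dictionary of Chinese words. */
/* The program segments article.txt located in the same directory. */
/* Implementation: the first character of a word is looked up in an array; the remaining characters are found by searching a forest. */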
#include <stdio.h>
#include <malloc.h>
//#include <ctype.h>
#include <windows.h>
/*
 * One node of the dictionary forest, stored in first-child / next-sibling
 * form. Each node stands for one two-byte character of a dictionary word.
 */
typedef struct Forest
{
	long value;              /* the character's two bytes packed as first*256+second */
	struct Forest *child;    /* first node of the next level (following character) */
	struct Forest *sibling;  /* next alternative on the same level */
	int eof;                 /* 1 when a word may end at this node, otherwise 0 */
} Forest;
/* Root nodes, indexed by the packed two-byte value (0..65535) of a word's first character. */
Forest *HashTable[1 << 16] = { 0 };
/*
 * Look for `value` among the direct children of `root`.
 *
 * Returns the child whose value equals `value`, or NULL when `root` is NULL
 * or none of its children matches.
 */
Forest *search(Forest *root,int value)
{
	Forest *node;

	if(root==NULL)
		return NULL;

	/* Children form a singly linked list through `sibling`. */
	for(node=root->child;node!=NULL;node=node->sibling)
	{
		if(node->value==value)
			return node;
	}
	return NULL;
}
Forest *insert(Forest *root,int value)
{ 
	Forest *tmpcell,*tmp;
	if(root->child==NULL) 
	{ 
		tmpcell=(Forest*)malloc(sizeof(Forest));
		tmpcell->value=value; 
		tmpcell->child=NULL; 
		tmpcell->sibling=NULL; 
		tmpcell->eof=0; 
		root->child=tmpcell; 
		return tmpcell;
	}
	else 
	{ 
		tmp=root->child;
		while(tmp&&(tmp->value!=value)) 
		{ 
			root=tmp; 
			tmp=tmp->sibling; 
		} 
		if(tmp)return tmp;
		else 
		{ 
			tmpcell=(Forest*)malloc(sizeof(Forest));		
			tmpcell->value=value;
			tmpcell->child=NULL; 
			tmpcell->sibling=NULL;
			tmpcell->eof=0; 
			root->sibling=tmpcell;
			return tmpcell; 
		}
	}
}

/*
 * Build the dictionary index from word.txt.
 *
 * Every whitespace-separated word in the file is cut into two-byte units
 * (this assumes a double-byte encoding such as GBK -- TODO confirm). The
 * first unit selects a root in HashTable; each following unit is added
 * below it with insert().
 *
 * Prints a message and returns if word.txt cannot be opened. Loading stops
 * early if a node cannot be allocated.
 *
 * NOTE(review): as in the original code, every node on a word's path is
 * flagged eof=1, so every prefix of a dictionary word is treated as a word
 * end. Flagging only the last node would give stricter matching, but the
 * segmentation routine's back-tracking must be validated before doing so.
 */
void create()
{
	FILE *fp;
	int i,len;
	unsigned char buffer[20]={0};
	long int value;
	Forest *root,*tmpcell;

	fp=fopen("word.txt","r");
	if(fp==NULL)
	{
		printf("詞庫文件找不到\n");
		return ;
	}

	/*
	 * %19s keeps the read inside buffer (19 bytes + terminator); a longer
	 * token is continued as the next "word". Looping on the conversion
	 * count instead of feof() avoids processing a stale buffer when the
	 * read fails.
	 */
	while(fscanf(fp,"%19s",(char*)buffer)==1)
	{
		/* Length of the word just read; bytes past it belong to earlier words. */
		for(len=0;buffer[len]!=0;len++)
			;

		value=buffer[0];
		value*=256;
		value+=buffer[1];/* for a one-byte word this is the terminator, i.e. 0 */
		if(HashTable[value]==NULL)
		{
			tmpcell=(Forest*)malloc(sizeof(Forest));
			if(tmpcell==NULL)
				break;
			tmpcell->value=value;
			tmpcell->child=NULL;
			tmpcell->sibling=NULL;
			tmpcell->eof=1;/* a single character always counts as a word */
			HashTable[value]=tmpcell;
		}
		root=HashTable[value];

		/* Remaining two-byte units; i+1 <= len <= 19 stays inside buffer. */
		for(i=2;i<len;i+=2)
		{
			value=buffer[i];
			value*=256;
			value+=buffer[i+1];
			root=insert(root,value);
			if(root==NULL)/* defensive: no node available, skip the rest of this word */
				break;
			root->eof=1;
		}
	}
	fclose(fp);
}
/*
 * Free `root`, everything below it, and every node reachable through its
 * sibling chain. A NULL `root` is accepted and does nothing.
 */
void freeForest(Forest *root)
{
	Forest *next;

	while(root!=NULL)
	{
		/* Remember the sibling before the node itself is released. */
		next=root->sibling;
		freeForest(root->child);
		free(root);
		root=next;
	}
}
/*
 * Release every tree rooted in HashTable and reset each slot to NULL.
 *
 * The previous version only released slot 0, so every other tree leaked.
 */
void freehash()
{
	int i;
	const int slots=(int)(sizeof HashTable/sizeof HashTable[0]);

	for(i=0;i<slots;i++)
	{
		freeForest(HashTable[i]);/* accepts NULL for unused slots */
		HashTable[i]=NULL;/* avoid dangling roots if the table is used again */
	}
}
/*
 * Segment the text in `filename` and write the result to stdout.
 *
 * Bytes below 128 are copied through unchanged. Bytes of 128 and above are
 * read in pairs as one character (assumes a double-byte encoding such as
 * GBK -- TODO confirm) and matched against the dictionary: the first
 * character selects a root in HashTable, later characters are looked up
 * with search(). When the match cannot be extended, the longest prefix that
 * ended on a word marker is printed as "[word]" and reading resumes right
 * after that prefix.
 *
 * Prints "No file" and returns if the file cannot be opened.
 *
 * NOTE(review): a pending match that is interrupted by a byte below 128 (or
 * by the end of the file) is printed without brackets, as in the original.
 */
void fsplit(char *filename)
{
	int lasteof=0;/* length in bytes of the longest prefix of str that is a word */
	int str_pos=0;/* number of bytes currently held in str */
	int value;
	int hi,lo;/* int, not unsigned char, so EOF can be told apart from byte 0xFF */
	unsigned char str[20]={0};
	Forest *root=NULL;
	FILE *fp=fopen(filename,"r");
	printf("filename :%s\n",filename);
	if(fp==NULL)
	{
		printf("No file\n");
		return ;
	}
	for(;;)
	{
		hi=fgetc(fp);
		if(hi==EOF||hi<128)
		{
			/* Flush whatever has been matched so far, then the byte itself. */
			str[str_pos]=0;
			printf("%s",(char*)str);
			str_pos=0;
			if(hi==EOF)
				break;
			putchar(hi);
			continue;
		}

		lo=fgetc(fp);
		if(lo==EOF)
		{
			/* Input ends in the middle of a character: emit what we have and stop. */
			str[str_pos]=0;
			printf("%s",(char*)str);
			putchar(hi);
			break;
		}
		value=hi*256+lo;

		if(str_pos==0)
		{
			/* First character of a new candidate word. */
			root=HashTable[value];
			str[str_pos++]=(unsigned char)hi;
			str[str_pos++]=(unsigned char)lo;
			lasteof=2;
			continue;
		}

		root=search(root,value);
		if(root==NULL||str_pos+2>=(int)sizeof(str))
		{
			/* No longer match (or no room left in str): emit the last word. */
			str[lasteof]=0;
			printf("[%s]",(char*)str);
			/*
			 * Un-read the character that failed to match (2 bytes) plus any
			 * stored characters beyond the emitted word, so that the next
			 * read starts right after the emitted word.
			 */
			fseek(fp,-(long)(2+(str_pos-lasteof)),SEEK_CUR);
			str_pos=0;
		}
		else
		{
			str[str_pos++]=(unsigned char)hi;
			str[str_pos++]=(unsigned char)lo;
			if(root->eof==1)/* a word may end here: remember this position */
				lasteof=str_pos;
		}
	}
	fclose(fp);
	return ;
}


/*
 * Entry point: load the dictionary, segment article.txt, release the
 * dictionary, then wait for a key press before exiting.
 */
int main()
{
	create();               /* build the index from word.txt */
	fsplit("article.txt");  /* print the segmented text */
	freehash();             /* release the index */
	getchar();              /* keep the console window open */
	return 0;
}
?? 文件大小 206 K
?? 上傳用戶 kyo
?? 所屬分類多國語言處理
??? 相關(guān)標簽

#分
?? 快捷鍵說明

復制代碼 Ctrl + C
搜索代碼 Ctrl + F
全屏模式 F11
切換主題 Ctrl + Shift + D
顯示快捷鍵 ?
增大字號 Ctrl + =
減小字號 Ctrl + -
亚洲欧美第一页_禁久久精品乱码_粉嫩av一区二区三区免费野_久草精品视频

?? 分詞程序.cpp

?? 快捷鍵說明