?? untitled1.cpp

?? 中文分詞程序
?? CPP
字號:
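/* Chinese word segmentation; characters are compared as integers. */
/* The dictionary of Chinese words is the file word.txt in the program's directory. */
/* The program segments the file split.txt located in the same directory. */
/* Implementation:
   the first Chinese character is looked up through an array, the following ones through a forest search. */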
#include <stdio.h>
#include <malloc.h>
#include <ctype.h>
#include <string.h>
#include <windows.h>
/*
 * Node of the word forest, stored in child/sibling form.
 *   value   - integer code of the character held by this node
 *   child   - first node of the next level (NULL when there is none)
 *   sibling - next node on the same level (NULL at the end of the list)
 *   eof     - 1 when the path ending at this node is a complete word, else 0
 */
typedef struct Trie
{
    long int value;
    struct Trie *child;
    struct Trie *sibling;
    int eof;
}Trie;
/* Root nodes of the forest, stored in an array indexed by the integer code of a word's first character. */
Trie *HashTable[65536]={NULL};
/*
 * Look for `value` among the direct children of `root`.
 * Returns the child holding that value, or NULL when `root` is NULL or
 * none of its children matches.
 */
Trie *search(Trie *root,int value)
{
    Trie *node;

    if(root==NULL)
        return NULL;
    for(node=root->child;node!=NULL;node=node->sibling)
    {
        if(node->value==value)
            return node;
    }
    return NULL;
}
/*
 * Return the child of `root` that holds `value`, creating it when absent.
 * A newly created node is appended to the end of root's child list with
 * no children, no sibling and eof cleared.
 * Returns NULL when `root` is NULL or the new node cannot be allocated.
 */
Trie *insert(Trie *root,int value)
{
    Trie **link;
    Trie *cell;

    if(root==NULL)
        return NULL;
    /* Walk the link fields themselves so that an empty child list and the
       end of a non-empty list are handled by the same code. */
    for(link=&root->child;*link!=NULL;link=&(*link)->sibling)
    {
        if((*link)->value==value)
            return *link;
    }
    cell=(Trie*)malloc(sizeof(Trie));
    if(cell==NULL)
        return NULL;
    cell->value=value;
    cell->child=NULL;
    cell->sibling=NULL;
    cell->eof=0;
    *link=cell;
    return cell;
}
/*
 * Build the dictionary index from the file "word.txt".
 *
 * The file is read as whitespace-separated words. Each word is split into
 * two-byte units; the first unit selects a root in HashTable and the
 * remaining units are added below it with insert(). The node reached by the
 * last unit is flagged with eof=1.
 *
 * The file is read exactly once and each word is indexed by its own length.
 * (Earlier the same file was re-read for every length 2..20, which flagged
 * every even-length prefix, and stale buffer bytes, as words.)
 *
 * Words shorter than 2 bytes, with an odd byte count, or longer than
 * MAX_WORD_BYTES are skipped. A missing file leaves the index empty.
 * On allocation failure a message is written to stderr and loading stops.
 */
void create()
{
    enum { MAX_WORD_BYTES = 20 };
    const char *filename="word.txt";
    unsigned char buffer[40]={0};
    FILE *fp;
    size_t len,i;
    long int value;
    int c;
    Trie *root;

    fp=fopen(filename,"r");
    if(fp==NULL)
        return;
    /* the field width keeps fscanf inside buffer; the return value ends the loop at EOF */
    while(fscanf(fp,"%39s",(char*)buffer)==1)
    {
        len=strlen((const char*)buffer);
        if(len==sizeof(buffer)-1)
        {
            /* the word may have been cut at the buffer limit: drop its remainder */
            while((c=fgetc(fp))!=EOF&&!isspace(c))
                ;
            continue;
        }
        if(len<2||len%2!=0||len>MAX_WORD_BYTES)
            continue;
        value=((long int)buffer[0]<<8)+buffer[1];
        root=HashTable[value];
        if(root==NULL)
        {
            root=(Trie*)malloc(sizeof(Trie));
            if(root!=NULL)
            {
                root->value=value;
                root->child=NULL;
                root->sibling=NULL;
                root->eof=0;
                HashTable[value]=root;
            }
        }
        /* len is even here, so buffer[i+1] is always inside the word */
        for(i=2;root!=NULL&&i<len;i+=2)
        {
            value=((long int)buffer[i]<<8)+buffer[i+1];
            root=insert(root,(int)value);
        }
        if(root==NULL)
        {
            fprintf(stderr,"create: out of memory\n");
            break;
        }
        root->eof=1;
    }
    fclose(fp);
}
/*
 * Release `root`, every sibling that follows it and all nodes below them.
 * Children are released by recursion, the sibling chain by iteration.
 * A NULL `root` is accepted and does nothing.
 */
void freetrie(Trie *root)
{
    Trie *next;

    while(root!=NULL)
    {
        next=root->sibling;   /* read before the node is freed */
        freetrie(root->child);
        free(root);
        root=next;
    }
}
/*
 * Release every tree reachable from HashTable and clear the table.
 * Each slot is reset to NULL after its tree is freed so that no dangling
 * pointer is left behind for a later create() or fsplit() call.
 */
void freehash()
{
    int i;

    for(i=0;i<65536;i++)
    {
        if(HashTable[i]!=NULL)
        {
            freetrie(HashTable[i]);
            HashTable[i]=NULL;
        }
    }
}
/*
 * Segment the text file `filename` and print the words to stdout.
 *
 * Bytes below 128 act as word boundaries and are not echoed. Any other byte
 * is taken as the first half of a two-byte character. Characters are matched
 * against the index in HashTable for as long as a path exists; when matching
 * stops, the longest dictionary word seen so far (or the single first
 * character when there is none) is printed and scanning resumes with the
 * character right after it.
 *
 * A word ended by a boundary byte or by end of file is printed as is; a word
 * ended any other way is followed by one space.
 * Prints "No file " and returns when the file cannot be opened.
 */
void fsplit(char *filename)
{
    enum { MAX_WORD_BYTES = 20 };
    unsigned char str[MAX_WORD_BYTES+1]={0};   /* pending bytes plus terminator */
    int lasteof=0;    /* length of the longest complete word inside str */
    int str_pos=0;    /* number of pending bytes in str */
    int value=0;
    Trie *root=NULL;
    Trie *next;
    /* binary mode: relative fseek offsets are only reliable on binary streams */
    FILE *fp=fopen(filename,"rb");

    if(fp==NULL)
    {
        printf("No file ");
        return ;
    }
    for(;;)
    {
        int c0,c1=0;
        int consumed=0;   /* bytes read in this round */
        int boundary=1;   /* 1 unless a full two-byte character was read */
        int at_end=0;
        int tail;

        /* fgetc returns int so that EOF stays distinct from the byte 0xFF */
        c0=fgetc(fp);
        if(c0==EOF)
            at_end=1;
        else if(c0<128)
            consumed=1;
        else
        {
            c1=fgetc(fp);
            if(c1==EOF)
            {
                /* lead byte without its second half: treat as end of input */
                consumed=1;
                at_end=1;
            }
            else
            {
                consumed=2;
                boundary=0;
                value=(c0<<8)+c1;
            }
        }

        if(!boundary)
        {
            if(str_pos==0)
            {
                /* first character of a new word; alone it counts as a word */
                root=HashTable[value];
                str[str_pos++]=(unsigned char)c0;
                str[str_pos++]=(unsigned char)c1;
                lasteof=2;
                continue;
            }
            /* never extend past the buffer, whatever depth the index has */
            next=(str_pos+2<=MAX_WORD_BYTES)?search(root,value):NULL;
            if(next!=NULL)
            {
                root=next;
                str[str_pos++]=(unsigned char)c0;
                str[str_pos++]=(unsigned char)c1;
                if(root->eof==1)
                    lasteof=str_pos;
                continue;
            }
        }

        /* Reached on a boundary, or on a character that does not extend the
           pending word: emit the pending word, if any. */
        if(str_pos>0)
        {
            tail=str_pos-lasteof;   /* matched bytes after the last word end */
            str[lasteof]=0;
            if(boundary&&tail==0)
                printf("%s",(char*)str);
            else
                printf("%s ",(char*)str);
            str_pos=0;
            if(!boundary||tail>0)
            {
                /* Re-read everything after the emitted word: the unmatched
                   tail plus the bytes consumed in this round. */
                if(fseek(fp,-(long)(tail+consumed),SEEK_CUR)!=0)
                    break;
                continue;
            }
        }
        if(at_end)
            break;
    }
    fclose(fp);
    return ;
}
/*
 * Program entry: build the dictionary index, segment "split.txt", release
 * the index, then read one character from stdin before exiting
 * (presumably to keep a console window open).
 */
int main()
{
    /* array rather than pointer: fsplit takes a non-const char* */
    char filename[]="split.txt";

    create();
    fsplit(filename);
    freehash();
    getchar();
    return 0;
}
?? 文件大小 13 K
?? 上傳用戶 ys0796
?? 所屬分類多國語言處理
??? 相關標籤

#分 #程序
?? 快捷鍵說明

複製代碼 Ctrl + C
搜索代碼 Ctrl + F
全屏模式 F11
切換主題 Ctrl + Shift + D
顯示快捷鍵 ?
增大字號 Ctrl + =
減小字號 Ctrl + -
亚洲欧美第一页_禁久久精品乱码_粉嫩av一区二区三区免费野_久草精品视频

?? untitled1.cpp

?? 快捷鍵說明