?? cdict.py

?? 在網上下載的程序
?? PY
字號:
# -*- coding: cp936 -*-
# author:sunjoy
# email:ccnusjy@gmail.com

import bsddb,os
import re
class CDict:
    """Dictionary-based Chinese word segmenter (reverse maximum matching).

    The dictionary is a ``bsddb`` B-tree file whose keys are UTF-8 encoded
    words.  Both segmentation methods walk the sentence from its END towards
    its beginning, so the returned tokens are in right-to-left order.  Every
    returned token is a UTF-8 encoded byte string.

    Attributes:
        d: the opened dictionary; only ``has_key`` and ``close`` are used.
        verbose: when true, debug trace lines are printed to stdout.
    """

    # One character that belongs to an ASCII token (word, number, e-mail,
    # "c++", ...).  Runs of such characters are emitted as a single token.
    _ASCII_TOKEN = re.compile(r"[0-9A-Za-z\-\+#@_\.]")
    # Punctuation that segWords() replaces with a space before segmenting.
    _PUNCTUATION = re.compile(u"[。，,！……!《》<>\"':：？?、|“”‘’；]")
    # segWords() glues an unmatched character onto the previous token only
    # while that token is shorter than this many UTF-8 bytes.
    _MERGE_LIMIT = 12
    # Class-level default so instances built without __init__ stay quiet.
    verbose = False

    def __init__(self, path='data/dict.dat', verbose=False):
        """Open (creating if necessary) the dictionary file at *path*.

        Args:
            path: location of the bsddb B-tree dictionary file.
            verbose: print debug trace lines while segmenting.
        """
        self.verbose = verbose
        self.d = bsddb.btopen(path, 'c')

    def __del__(self):
        # ``d`` is missing when btopen() raised inside __init__.
        d = getattr(self, 'd', None)
        if d is not None:
            d.close()

    def _trace(self, label, value=''):
        """Print a debug line (only when ``self.verbose`` is set)."""
        if self.verbose:
            # repr() keeps the output ASCII-safe for byte and text values.
            print(label + repr(value))

    def _to_text(self, sentence):
        """Return *sentence* as text, or None when it cannot be decoded.

        Byte strings are decoded as UTF-8; text strings are used as they
        are; any other type is rejected.
        """
        if isinstance(sentence, bytes):
            try:
                return sentence.decode('utf-8')
            except UnicodeDecodeError:
                return None
        if isinstance(sentence, type(u'')):
            return sentence
        return None

    def _skip_ascii_run(self, text, end):
        """Return the start index of the ASCII token that ends at *end*.

        Returns *end* itself when the character before *end* is not an
        ASCII token character.
        """
        start = end
        while start > 0 and self._ASCII_TOKEN.match(text[start - 1]):
            start -= 1
        return start

    def _longest_match(self, text, end, maxlen):
        """Find the longest dictionary word (<= *maxlen* chars) ending at *end*.

        Returns ``(start, utf8_word)`` on success and ``(-1, None)`` when no
        candidate is a dictionary key.
        """
        for size in range(min(maxlen, end), 0, -1):
            word = text[end - size:end].encode('utf-8')
            if self.d.has_key(word):
                return end - size, word
        return -1, None

    def segWords(self, sentence, maxlen=4):
        """Segment *sentence*, merging unmatched characters into neighbours.

        Punctuation is first replaced by spaces.  ASCII runs are emitted
        lower-cased as one token; dictionary words are matched longest
        first.  A character with no dictionary match is handled as follows:

        * if the character after it is whitespace, it is emitted alone;
        * otherwise, if it is not whitespace itself, it is emitted
          prefixed onto the previous token when that token is shorter
          than 12 bytes (the previous token stays in the list as well),
          or alone when there is no such token.

        Args:
            sentence: UTF-8 byte string (a text string is also accepted).
            maxlen: maximum dictionary word length, in characters.

        Returns:
            List of UTF-8 byte-string tokens in right-to-left order;
            ``[]`` when *sentence* is not valid UTF-8 or not a string.
        """
        text = self._to_text(sentence)
        if text is None:
            return []
        text = self._PUNCTUATION.sub(" ", text)
        self._trace("sentence: ", text)

        length = len(text)
        tokens = []
        i = length
        while i > 0:
            # --- ASCII token ending at i, if any ---
            start = self._skip_ascii_run(text, i)
            if start < i:
                tokens.append(text[start:i].lower().encode('utf-8'))
                self._trace("ascii token: ", tokens[-1])
                i = start

            # --- dictionary word ending at i, if any ---
            found, word = self._longest_match(text, i, maxlen)
            if found != -1:
                tokens.append(word)
                self._trace("dictionary word: ", word)
                i = found
                continue

            # --- no match: consume exactly one character ---
            char = text[i - 1:i]
            if i < length and not text[i].strip():
                tokens.append(char.encode('utf-8'))
            elif char.strip():
                piece = char.encode('utf-8')
                if tokens and len(tokens[-1]) < self._MERGE_LIMIT:
                    piece += tokens[-1]
                tokens.append(piece)
            i -= 1

        self._trace("result: ", tokens)
        # Drop tokens that are empty or whitespace only.
        return [w for w in tokens if w.strip()]

    def segWords2(self, sentence, maxlen=4):
        """Segment *sentence* with plain reverse maximum matching.

        ASCII runs are emitted lower-cased as one token; dictionary words
        are matched longest first; any character with no dictionary match
        is emitted on its own.  Punctuation is NOT stripped here.

        Args:
            sentence: UTF-8 byte string (a text string is also accepted).
            maxlen: maximum dictionary word length, in characters.  Words
                longer than this can never be matched as a whole.

        Returns:
            List of UTF-8 byte-string tokens in right-to-left order;
            ``[]`` when *sentence* is not valid UTF-8 or not a string.
        """
        text = self._to_text(sentence)
        if text is None:
            return []

        tokens = []
        i = len(text)
        while i > 0:
            # The end of the token is i; scan left for where it starts.
            start = self._skip_ascii_run(text, i)
            if start < i:
                tokens.append(text[start:i].lower().encode('utf-8'))
                self._trace("ascii token: ", tokens[-1])
                i = start

            found, word = self._longest_match(text, i, maxlen)
            if found != -1:
                tokens.append(word)
                self._trace("dictionary word: ", word)
                i = found
            else:
                # Not in the dictionary: keep the single character as is.
                # (At i == 0 this is an empty slice, filtered out below.)
                tokens.append(text[i - 1:i].encode('utf-8'))
                self._trace("unmatched: ", tokens[-1])
                i -= 1

        self._trace("result: ", tokens)
        # Drop tokens that are empty or whitespace only.
        return [w for w in tokens if w.strip()]



if __name__=="__main__":
    # Demo: segment a mixed Chinese/English sentence with segWords2() and
    # print one token per line.  The literal is stored in the file's own
    # encoding, so it is re-encoded to UTF-8 before being segmented.
    segmenter=CDict()
    words=segmenter.segWords2("""我愛北京天安門，我叫孫君意，我愛python and CAAA++ 我是張永偉中華人民共和國(guó)iwy what？""".decode('gbk').encode('utf-8'))
    print("==========冗余模式=============")
    for w in words:
        print(w.decode('utf-8'))
?? 文件大小 2323 K
?? 上傳用戶 yeling023
?? 所屬分類 多國語言處理
??? 相關標簽

#程序
?? 快捷鍵說明

復制代碼 Ctrl + C
搜索代碼 Ctrl + F
全屏模式 F11
切換主題 Ctrl + Shift + D
顯示快捷鍵 ?
增大字號(hào) Ctrl + =
減小字號(hào) Ctrl + -
亚洲欧美第一页_禁久久精品乱码_粉嫩av一区二区三区免费野_久草精品视频

?? cdict.py

?? 快捷鍵說明