# cdict.py (source listing scraped from a code-viewer page; the "font size" UI label was removed)
# -*- coding: cp936 -*-
# author:sunjoy
# email:ccnusjy@gmail.com
import bsddb
import logging
import os
import re
class CDict:
    """Dictionary-backed word segmenter using reverse maximum matching.

    Words are looked up in a Berkeley DB B-tree file by their UTF-8 encoded
    form. Sentences are scanned from the END towards the beginning, so both
    segmentation methods return their tokens in right-to-left order.

    Diagnostic output goes through the ``logging`` module at DEBUG level
    instead of being printed to stdout.
    """

    # One character that may belong to an ASCII token: a letter, a digit or
    # one of - + # @ _ .  Compiled once rather than on every loop iteration.
    _ASCII_CHAR = re.compile(r"[0-9A-Za-z\-\+#@_\.]{1}")
    # Punctuation that segWords() replaces with a space before segmenting.
    _PUNCTUATION = re.compile(u"[。,,!……!《》<>\"'::?\\?、\\|“”‘’;]")
    # An unknown character is glued onto the previous token only while that
    # token is shorter than this many bytes (presumably four 3-byte UTF-8
    # characters -- TODO confirm the intent with the author).
    _MERGE_LIMIT_BYTES = 12
    _log = logging.getLogger(__name__)

    def __init__(self, dictPath='data/dict.dat'):
        """Open the dictionary database, creating it if it does not exist.

        dictPath -- path of the B-tree database file. The default is relative
                    to the current working directory.
        """
        self.d = bsddb.btopen(dictPath, 'c')

    def __del__(self):
        """Close the database, if it was ever opened."""
        # __init__ may have failed before self.d was assigned.
        db = getattr(self, 'd', None)
        if db is not None:
            db.close()

    def _decodeUtf8(self, sentence):
        """Return *sentence* decoded from UTF-8, or None if it cannot be decoded."""
        try:
            return sentence.decode('utf-8')
        except (UnicodeError, AttributeError):
            # Invalid UTF-8, or an object that is not a byte string at all.
            return None

    def _asciiRunStart(self, text, end):
        """Return the start index of the run of ASCII token characters ending at *end*.

        Returns *end* itself when the character before *end* is not an ASCII
        token character.
        """
        start = end
        while start > 0 and self._ASCII_CHAR.search(text[start - 1:start]):
            start -= 1
        return start

    def _longestMatch(self, text, end, maxWordLen):
        """Find the longest dictionary word that ends at index *end*.

        Candidates of maxWordLen, maxWordLen-1, ..., 1 characters are tried in
        that order. Returns (start, utf8Word), or (-1, None) if none matches.
        """
        for size in range(min(maxWordLen, end), 0, -1):
            utf8Word = text[end - size:end].encode('utf-8')
            if utf8Word in self.d:
                return end - size, utf8Word
        return -1, None

    def segWords(self, sentence, maxWordLen=4):
        """Segment *sentence*, gluing unknown characters onto the previous token.

        Punctuation is first replaced by spaces. The text is then consumed
        from right to left:

        * a run of ASCII token characters becomes one lower-cased token;
        * otherwise the longest dictionary word ending at the current
          position (at most *maxWordLen* characters) becomes a token;
        * otherwise the single current character is handled as follows: if
          the character to its right is whitespace it is emitted on its own;
          else, if it is not whitespace, it is emitted prefixed to a copy of
          the previous token when that token is shorter than 12 bytes, or on
          its own when there is no such token.

        sentence   -- UTF-8 encoded byte string.
        maxWordLen -- longest dictionary word, in characters, that is tried.

        Returns a list of UTF-8 encoded tokens in right-to-left order with
        blank tokens removed, or [] if *sentence* is not valid UTF-8.
        """
        text = self._decodeUtf8(sentence)
        if text is None:
            return []
        text = self._PUNCTUATION.sub(" ", text)
        self._log.debug("sentence: %r", text)

        length = len(text)
        pos = length
        result = []
        while pos > 0:
            start = self._asciiRunStart(text, pos)
            if start < pos:
                result.append(text[start:pos].lower().encode('utf-8'))
                self._log.debug("ascii token: %r", result[-1])
                pos = start

            found, utf8Word = self._longestMatch(text, pos, maxWordLen)
            if found != -1:
                result.append(utf8Word)
                self._log.debug("dictionary word: %r", utf8Word)
                pos = found
                continue

            # No dictionary word ends here. A slice (not an index) is used so
            # that pos == 0, reached right after an ASCII run, yields "".
            current = text[pos - 1:pos]
            if pos < length and text[pos].strip() == "":
                result.append(current.encode('utf-8'))
            elif current.strip() != "":
                if result and len(result[-1]) < self._MERGE_LIMIT_BYTES:
                    result.append(current.encode('utf-8') + result[-1])
                else:
                    result.append(current.encode('utf-8'))
            pos -= 1

        self._log.debug("final result: %r", result)
        return [w for w in result if w.strip()]

    def segWords2(self, sentence, maxWordLen=4):
        """Segment *sentence* by plain reverse maximum matching.

        The text is consumed from right to left. A run of ASCII token
        characters becomes one lower-cased token; otherwise the longest
        dictionary word ending at the current position (at most *maxWordLen*
        characters) becomes a token; otherwise the single current character
        is emitted as its own token.

        sentence   -- UTF-8 encoded byte string.
        maxWordLen -- longest dictionary word, in characters, that is tried.
                      With the default of 4 a longer dictionary entry can
                      never match as a whole; pass a larger value to allow it.

        Returns a list of UTF-8 encoded tokens in right-to-left order with
        blank tokens removed, or [] if *sentence* is not valid UTF-8.
        """
        text = self._decodeUtf8(sentence)
        if text is None:
            return []

        pos = len(text)
        result = []
        while pos > 0:
            start = self._asciiRunStart(text, pos)
            if start < pos:
                result.append(text[start:pos].lower().encode('utf-8'))
                self._log.debug("ascii token: %r", result[-1])
                pos = start

            found, utf8Word = self._longestMatch(text, pos, maxWordLen)
            if found != -1:
                result.append(utf8Word)
                self._log.debug("dictionary word: %r", utf8Word)
                pos = found
            else:
                # Unknown character: keep it as a one-character token. When
                # pos == 0 the slice is empty and is filtered out below.
                result.append(text[pos - 1:pos].encode('utf-8'))
                self._log.debug("unknown character: %r", result[-1])
                pos -= 1

        self._log.debug("final result: %r", result)
        return [w for w in result if w.strip()]
if __name__=="__main__":
    # Demo: segment a sample sentence with segWords2() and print one token
    # per line. segWords() takes the same kind of argument and can be
    # swapped in to compare the two modes.
    segmenter=CDict()
    # The literal is decoded as GBK and re-encoded as UTF-8, the encoding the
    # segmenter expects (assumes this file is saved in the cp936/GBK encoding
    # named by its coding header -- verify).
    words=segmenter.segWords2("""我愛北京天安門,我叫孫君意,我愛python and CAAA++ 我是張永偉中華人民共和國(guó)iwy what?""".decode('gbk').encode('utf-8'))
    print("==========冗余模式=============")
    for w in words:
        print(w.decode('utf-8'))
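# Code-viewer page residue (not part of the program) -- keyboard shortcuts:
#   Copy code:           Ctrl + C
#   Search code:         Ctrl + F
#   Full-screen mode:    F11
#   Switch theme:        Ctrl + Shift + D
#   Show shortcuts:      ?
#   Increase font size:  Ctrl + =
#   Decrease font size:  Ctrl + -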