mybagofwords.py
#!/local/usr/bin/python
import sys
import os
import os.path
import nltk
from nltk.tokenize import WordTokenizer

"""
Documents in each class are in a subdirectory of the specified directory;
the directory name is the class label.

This uses a fixed set of words and poor (whitespace-only) tokenization.
The features are just frequency-based (numeric). There are a million and
one ways to improve this representation.

Writes out an ARFF file that can be loaded into WEKA for
analysis/classification.
"""

"""
The words to count. Here they've been picked manually, and there's not
many of them. Where can you get your list of words from? How about from
your training set? How many should you keep, and on what basis?
"""

dirname = sys.argv[1]  # the directory with the classes in it
fnames = os.listdir(dirname)
# just keep the directories
classes = [fname for fname in fnames if os.path.isdir(os.path.join(dirname, fname))]

########################################################################

globalWordFreqDist = nltk.FreqDist()   # word frequencies over the whole corpus
docClusterDict = {}                    # class label -> list of per-document FreqDists
attrWords = []                         # the words that become ARFF attributes
tokenizer = WordTokenizer()

for classname in classes:
    classdir = os.path.join(dirname, classname)
    docs = [os.path.join(classdir, fname) for fname in os.listdir(classdir)]
    docWordDistArray = []
    for doc in docs:
        docWordFreqDist = nltk.FreqDist()
        words = tokenizer.tokenize(open(doc).read().lower())
        for word in words:
            docWordFreqDist.inc(word)     # count frequencies of each word in this doc
            globalWordFreqDist.inc(word)  # count frequencies of the words across all docs
        docWordDistArray.append(docWordFreqDist)  # one FreqDist per document, in order
    docClusterDict[classname] = docWordDistArray  # group the per-document FreqDists by class

sys.stderr.write(str(len(docClusterDict)) + "\n")
sys.stderr.write(str(len(docWordDistArray)) + "\n")
attrWords = globalWordFreqDist.keys()

########################################################################

# Write out the ARFF header. Specifications for ARFF are at:
# http://www.cs.waikato.ac.nz/~ml/weka/arff.html
header = []
header.append("@RELATION test1")
for word in attrWords:
    header.append("@ATTRIBUTE \"%s\" NUMERIC" % (word))
header.append("@ATTRIBUTE class {%s}" % (",".join(classes)))
header.append("@DATA")
print "\n".join(header)

########################################################################

# Write one data row per document: the count of every attribute word,
# then the class label.
for classname in classes:
    docWordDistArray = docClusterDict[classname]
    for docWordFreqDist in docWordDistArray:
        thisDocWords = docWordFreqDist.keys()
        data = []
        for word in attrWords:
            if word in thisDocWords:
                thisValue = docWordFreqDist[word]
            else:
                thisValue = 0
            data.append("%d" % thisValue)
        data.append(classname)
        print ",".join(data)

########################################################################

# Earlier version, kept commented out: whitespace tokenization and a
# manually chosen word list (`words`).
## for classname in classes:
##     classdir = os.path.join(dirname, classname)
##     docs = [os.path.join(classdir, fname) for fname in os.listdir(classdir)]
##     for doc in docs:
##         counts = {}
##         ## sys.stderr.write(`doc`)
##         s = file(doc).read()
##
##         # tokenize the document on whitespace
##         toks = s.split()
##
##         # count all the tokens
##         for tok in toks:
##             if not counts.has_key(tok):
##                 counts[tok] = 0
##             counts[tok] += 1
##
##         # extract the features we want
##         data = []
##         for word in words:
##             if counts.has_key(word):
##                 data.append("%d" % (counts[word]))
##             else:
##                 data.append("0")
##         data.append(classname)
##
##         print ",".join(data)
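For reference, the ARFF file the script emits has the following shape. The word attributes and class names below are invented for illustration; the real ones come from your corpus and from the subdirectory names.

@RELATION test1
@ATTRIBUTE "money" NUMERIC
@ATTRIBUTE "meeting" NUMERIC
@ATTRIBUTE class {spam,ham}
@DATA
3,0,spam
0,2,ham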
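WordTokenizer and FreqDist.inc come from an old NLTK release, and the print statements are Python 2, so the listing above will not run on a current install. Below is a minimal Python 3 sketch of the same counting step, assuming a recent NLTK with its punkt tokenizer data downloaded; the function name count_words and the file-reading options are my own choices, not part of the original script.

import os
import sys
import nltk
from nltk import word_tokenize  # requires NLTK's punkt tokenizer data


def count_words(dirname):
    """Return {classname: [per-document FreqDist, ...]} plus a corpus-wide FreqDist."""
    global_freqs = nltk.FreqDist()
    doc_cluster = {}
    classes = [d for d in os.listdir(dirname)
               if os.path.isdir(os.path.join(dirname, d))]
    for classname in classes:
        classdir = os.path.join(dirname, classname)
        doc_dists = []
        for fname in os.listdir(classdir):
            with open(os.path.join(classdir, fname),
                      encoding="utf-8", errors="ignore") as f:
                words = word_tokenize(f.read().lower())
            doc_dists.append(nltk.FreqDist(words))  # replaces the FreqDist.inc() loop
            global_freqs.update(words)              # corpus-wide counts
        doc_cluster[classname] = doc_dists
    return doc_cluster, global_freqs


if __name__ == "__main__":
    doc_cluster, global_freqs = count_words(sys.argv[1])
    sys.stderr.write("%d classes\n" % len(doc_cluster))

From there the ARFF header and data loops carry over essentially unchanged, apart from using print() as a function.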