mybagofwords.py
#!/local/usr/bin/python
import sys
import os
import os.path
import nltk
from nltk.tokenize import WordTokenizer

"""
Documents in each class are in a subdirectory of the specified directory;
the directory name is the class label.

This uses a fixed set of words and poor (whitespace-only) tokenization.
The features are just frequency-based (numeric). There are a million and
one ways to improve this representation.

Writes out an ARFF file that can be loaded into WEKA for
analysis/classification.
"""

"""
The words to count. Here they've been picked manually, and there's not
many of them. Where can you get your list of words from? How about from
your training set? How many should you keep, and on what basis?
"""

dirname = sys.argv[1]  # the directory with the classes in it
fnames = os.listdir(dirname)
# just keep the directories
classes = [fname for fname in fnames if os.path.isdir(os.path.join(dirname, fname))]

########################################################################

globalWordFreqDist = nltk.FreqDist()   # word frequencies over the whole corpus
docClusterDict = {}                    # class label -> list of per-document FreqDists
attrWords = []                         # the words that become ARFF attributes
tokenizer = WordTokenizer()

for classname in classes:
    classdir = os.path.join(dirname, classname)
    docs = [os.path.join(classdir, fname) for fname in os.listdir(classdir)]
    docWordDistArray = []
    for doc in docs:
        docWordFreqDist = nltk.FreqDist()
        words = tokenizer.tokenize(open(doc).read().lower())
        for word in words:
            docWordFreqDist.inc(word)     # count frequencies of each word in this doc
            globalWordFreqDist.inc(word)  # count frequencies of the words across all docs
        docWordDistArray.append(docWordFreqDist)  # one FreqDist per document, in order
    docClusterDict[classname] = docWordDistArray  # group the per-document FreqDists by class

sys.stderr.write(str(len(docClusterDict)) + "\n")
sys.stderr.write(str(len(docWordDistArray)) + "\n")
attrWords = globalWordFreqDist.keys()

########################################################################

# Write out the ARFF header. Specifications for ARFF are at:
# http://www.cs.waikato.ac.nz/~ml/weka/arff.html
header = []
header.append("@RELATION test1")
for word in attrWords:
    header.append("@ATTRIBUTE \"%s\" NUMERIC" % (word))
header.append("@ATTRIBUTE class {%s}" % (",".join(classes)))
header.append("@DATA")
print "\n".join(header)

########################################################################

# Write one data row per document: the count of every attribute word,
# then the class label.
for classname in classes:
    docWordDistArray = docClusterDict[classname]
    for docWordFreqDist in docWordDistArray:
        thisDocWords = docWordFreqDist.keys()
        data = []
        for word in attrWords:
            if word in thisDocWords:
                thisValue = docWordFreqDist[word]
            else:
                thisValue = 0
            data.append("%d" % thisValue)
        data.append(classname)
        print ",".join(data)

########################################################################

# Earlier version, kept commented out: whitespace tokenization and a
# manually chosen word list (`words`).
## for classname in classes:
##     classdir = os.path.join(dirname, classname)
##     docs = [os.path.join(classdir, fname) for fname in os.listdir(classdir)]
##     for doc in docs:
##         counts = {}
##         ## sys.stderr.write(`doc`)
##         s = file(doc).read()
##
##         # tokenize the document on whitespace
##         toks = s.split()
##
##         # count all the tokens
##         for tok in toks:
##             if not counts.has_key(tok):
##                 counts[tok] = 0
##             counts[tok] += 1
##
##         # extract the features we want
##         data = []
##         for word in words:
##             if counts.has_key(word):
##                 data.append("%d" % (counts[word]))
##             else:
##                 data.append("0")
##         data.append(classname)
##
##         print ",".join(data)
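For reference, the ARFF file the script emits has the following shape. The word attributes and class names below are invented for illustration; the real ones come from your corpus and from the subdirectory names.

@RELATION test1
@ATTRIBUTE "money" NUMERIC
@ATTRIBUTE "meeting" NUMERIC
@ATTRIBUTE class {spam,ham}
@DATA
3,0,spam
0,2,ham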
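WordTokenizer and FreqDist.inc come from an old NLTK release, and the print statements are Python 2, so the listing above will not run on a current install. Below is a minimal Python 3 sketch of the same counting step, assuming a recent NLTK with its punkt tokenizer data downloaded; the function name count_words and the file-reading options are my own choices, not part of the original script.

import os
import sys
import nltk
from nltk import word_tokenize  # requires NLTK's punkt tokenizer data


def count_words(dirname):
    """Return {classname: [per-document FreqDist, ...]} plus a corpus-wide FreqDist."""
    global_freqs = nltk.FreqDist()
    doc_cluster = {}
    classes = [d for d in os.listdir(dirname)
               if os.path.isdir(os.path.join(dirname, d))]
    for classname in classes:
        classdir = os.path.join(dirname, classname)
        doc_dists = []
        for fname in os.listdir(classdir):
            with open(os.path.join(classdir, fname),
                      encoding="utf-8", errors="ignore") as f:
                words = word_tokenize(f.read().lower())
            doc_dists.append(nltk.FreqDist(words))  # replaces the FreqDist.inc() loop
            global_freqs.update(words)              # corpus-wide counts
        doc_cluster[classname] = doc_dists
    return doc_cluster, global_freqs


if __name__ == "__main__":
    doc_cluster, global_freqs = count_words(sys.argv[1])
    sys.stderr.write("%d classes\n" % len(doc_cluster))

From there the ARFF header and data loops carry over essentially unchanged, apart from using print() as a function.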