doc2mat
#!/usr/bin/perl
#
# doc2mat
#
# This file contains a simple program for creating a CLUTO-compatible
# mat-file from a set of documents.
# For more information on how to use it do a 'doc2mat -help'
#
# V1.0.0 Wed Sep 11 23:13:55 CDT 2002
#

use Getopt::Long;
use Pod::Usage;
#use Pod::Html;

#pod2html("doc2mat", "--outfile=doc2mat.html");

#==============================================================================
# Built-in stop list
#==============================================================================
%stop_list = (
    'different','1', 'n','1', 'necessary','1', 'need','1', 'needed','1', 'needing','1', 'newest','1', 'next','1',
    'no','1', 'nobody','1', 'non','1', 'noone','1', 'not','1', 'nothing','1', 'now','1', 'nowhere','1',
    'of','1', 'off','1', 'often','1', 'new','1', 'old','1', 'older','1', 'oldest','1', 'on','1',
    'once','1', 'one','1', 'only','1', 'open','1', 'again','1', 'among','1', 'already','1', 'about','1',
    'above','1', 'against','1', 'alone','1', 'after','1', 'also','1', 'although','1', 'along','1', 'always','1',
    'an','1', 'across','1', 'b','1', 'and','1', 'another','1', 'ask','1', 'c','1', 'asking','1',
    'asks','1', 'backed','1', 'away','1', 'a','1', 'should','1', 'show','1', 'came','1', 'all','1',
    'almost','1', 'before','1', 'began','1', 'back','1', 'backing','1', 'be','1', 'became','1', 'because','1',
    'becomes','1', 'been','1', 'at','1', 'behind','1', 'being','1', 'best','1', 'better','1', 'between','1',
    'big','1', 'showed','1', 'ended','1', 'ending','1', 'both','1', 'but','1', 'by','1', 'asked','1',
    'backs','1', 'can','1', 'cannot','1', 'number','1', 'numbers','1', 'o','1', 'few','1', 'find','1',
    'finds','1', 'clearly','1', 'her','1', 'herself','1', 'come','1', 'could','1', 'd','1', 'did','1',
    'here','1', 'beings','1', 'fact','1', 'far','1', 'felt','1', 'become','1', 'first','1', 'for','1',
    'four','1', 'from','1', 'full','1', 'fully','1', 'furthers','1', 'gave','1', 'general','1', 'generally','1',
    'get','1', 'gets','1', 'gives','1', 'facts','1', 'go','1', 'going','1', 'good','1', 'goods','1',
    'certain','1', 'certainly','1', 'clear','1', 'great','1', 'greater','1', 'greatest','1', 'group','1', 'grouped','1',
    'grouping','1', 'groups','1', 'h','1', 'got','1', 'has','1', 'g','1', 'have','1', 'having','1',
    'he','1', 'further','1', 'furthered','1', 'had','1', 'furthering','1', 'itself','1', 'faces','1', 'highest','1',
    'him','1', 'himself','1', 'his','1', 'how','1', 'however','1', 'i','1', 'if','1', 'important','1',
    'interests','1', 'into','1', 'is','1', 'it','1', 'its','1', 'j','1', 'anyone','1', 'anything','1',
    'anywhere','1', 'are','1', 'area','1', 'areas','1', 'around','1', 'as','1', 'seconds','1', 'see','1',
    'seem','1', 'seemed','1', 'seeming','1', 'seems','1', 'sees','1', 'right','1', 'several','1', 'shall','1',
    'she','1', 'enough','1', 'even','1', 'evenly','1', 'over','1', 'p','1', 'part','1', 'parted','1',
    'parting','1', 'parts','1', 'per','1', 'down','1', 'place','1', 'places','1', 'point','1', 'pointed','1',
    'pointing','1', 'points','1', 'possible','1', 'present','1', 'presented','1', 'presenting','1', 'ends','1', 'high','1',
    'mrs','1', 'much','1', 'must','1', 'my','1', 'myself','1', 'presents','1', 'down','1', 'problem','1',
    'problems','1', 'put','1', 'puts','1', 'q','1', 'quite','1', 'will','1', 'with','1', 'within','1',
    'r','1', 'rather','1', 'really','1', 'room','1', 'rooms','1', 's','1', 'said','1', 'same','1',
    'right','1', 'showing','1', 'shows','1', 'side','1', 'sides','1', 'since','1', 'small','1', 'smaller','1',
    'smallest','1', 'so','1', 'some','1', 'somebody','1', 'someone','1', 'something','1', 'somewhere','1', 'state','1',
    'states','1', 'such','1', 'sure','1', 't','1', 'take','1', 'taken','1', 'than','1', 'that','1',
    'the','1', 'their','1', 'then','1', 'there','1', 'therefore','1', 'these','1', 'x','1', 'thought','1',
    'thoughts','1', 'three','1', 'through','1', 'thus','1', 'to','1', 'today','1', 'together','1', 'too','1',
    'took','1', 'toward','1', 'turn','1', 'turned','1', 'turning','1', 'turns','1', 'two','1', 'still','1',
    'u','1', 'under','1', 'until','1', 'up','1', 'others','1', 'upon','1', 'us','1', 'use','1',
    'used','1', 'uses','1', 'v','1', 'very','1', 'w','1', 'want','1', 'wanted','1', 'wanting','1',
    'wants','1', 'was','1', 'way','1', 'we','1', 'well','1', 'wells','1', 'went','1', 'were','1',
    'what','1', 'when','1', 'where','1', 'whether','1', 'which','1', 'while','1', 'who','1', 'whole','1',
    'y','1', 'year','1', 'years','1', 'yet','1', 'you','1', 'everyone','1', 'everything','1', 'everywhere','1',
    'young','1', 'younger','1', 'youngest','1', 'your','1', 'yours','1', 'z','1', 'ever','1', 'works','1',
    'every','1', 'everybody','1', 'f','1', 'face','1', 'other','1', 'our','1', 'out','1', 'just','1',
    'interesting','1', 'high','1', 'might','1', 'k','1', 'keep','1', 'keeps','1', 'give','1', 'given','1',
    'higher','1', 'kind','1', 'knew','1', 'know','1', 'known','1', 'knows','1', 'l','1', 'large','1',
    'largely','1', 'last','1', 'later','1', 'latest','1', 'least','1', 'less','1', 'needs','1', 'never','1',
    'newer','1', 'let','1', 'lets','1', 'like','1', 'likely','1', 'long','1', 'high','1', 'longer','1',
    'longest','1', 'm','1', 'made','1', 'make','1', 'making','1', 'man','1', 'many','1', 'may','1',
    'me','1', 'member','1', 'members','1', 'men','1', 'more','1', 'in','1', 'interest','1', 'interested','1',
    'most','1', 'mostly','1', 'mr','1', 'opened','1', 'opening','1', 'new','1', 'opens','1', 'or','1',
    'perhaps','1', 'order','1', 'ordered','1', 'ordering','1', 'orders','1', 'differ','1', 'differently','1', 'do','1',
    'does','1', 'done','1', 'downed','1', 'downing','1', 'downs','1', 'they','1', 'thing','1', 'things','1',
    'think','1', 'thinks','1', 'this','1', 'those','1', 'ways','1', 'why','1', 'without','1', 'work','1',
    'worked','1', 'working','1', 'would','1', 'during','1', 'e','1', 'each','1', 'early','1', 'either','1',
    'end','1', 'though','1', 'still','1', 'whose','1', 'saw','1', 'say','1', 'says','1', 'them','1',
    'second','1', 'any','1', 'anybody','1');

#==============================================================================
# Parse Command Line Arguments
#==============================================================================
$nostem      = 0;
$nostop      = 0;
$mystoplist  = '';
$minwlen     = 3;
$nlskip      = 0;
$tokfile     = 0;
$skipnumeric = 0;
$help        = '';
$docfile     = '';
$matfile     = '';
$clabelfile  = '';

GetOptions('skipnumeric'  => \$skipnumeric,
           'tokfile'      => \$tokfile,
           'nostem'       => \$nostem,
           'nostop'       => \$nostop,
           'mystoplist=s' => \$mystoplist,
           'minwlen=i'    => \$minwlen,
           'nlskip=i'     => \$nlskip,
           'help|?'       => \$help);

pod2usage(-verbose => 2) if $help;
pod2usage(-verbose => 2) if $#ARGV != 1;

$docfile       = $ARGV[0];
$matfile       = $ARGV[1];
$clabelfile    = $matfile . ".clabel";
$rlabelfile    = $matfile . ".rlabel";
$tokenizedfile = $matfile . ".tokens";
$tmpmatfile    = $matfile . ".tmp";

-e $docfile or die "***Error: Input document file ", $docfile, " does not exist.\n";
if ($mystoplist) {
    -e $mystoplist or die "***Error: User supplied stop list file ", $mystoplist, " does not exist.\n";
}

#==============================================================================
# Read the user-supplied stop-list if any
#==============================================================================
%my_stop_list = ();
if ($mystoplist) {
    print "Reading user supplied stop list file...\n";
    open(FPIN, "<$mystoplist");
    while (<FPIN>) {
        tr/A-Z/a-z/;     # change to lower case
        s/^\s+//;        # remove leading spaces
        y/a-z0-9/ /cs;   # retain only alpha-numeric entries
        s/\s+/ /g;       # compact spaces
        chop;            # drop the trailing character left by the substitutions

        @tokens = split(/\s+/, $_);
        foreach $token (@tokens) {
            $my_stop_list{$token} = 1;
        }
    }
    close(FPIN);
    print "Done.\n";

    # A user-supplied stop list overrides -nostop for the built-in list
    if ($nostop) {
        %stop_list = ();
        $nostop = 0;
    }
}

#==============================================================================
# Setup the data-structures for the stemmer and initialize it
#==============================================================================
%step2list = ('ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
              'izer'=>'ize', 'bli'=>'ble', 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e',
              'ousli'=>'ous', 'ization'=>'ize', 'ation'=>'ate', 'ator'=>'ate',
              'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful', 'ousness'=>'ous',
              'aliti'=>'al', 'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log');

%step3list = ('icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic', 'ical'=>'ic',
              'ful'=>'', 'ness'=>'');

$c = "[^aeiou]";        # consonant
$v = "[aeiouy]";        # vowel
$C = "${c}[^aeiouy]*";  # consonant sequence
$V = "${v}[aeiou]*";    # vowel sequence

$mgr0 = "^(${C})?${V}${C}";               # [C]VC... is m>0
$meq1 = "^(${C})?${V}${C}(${V})?" . '$';  # [C]VC[V] is m=1
$mgr1 = "^(${C})?${V}${C}${V}${C}";       # [C]VCVC... is m>1
$_v   = "^(${C})?${v}";                   # vowel in stem

#==============================================================================
# Get into the main text-processing part of the code
#==============================================================================
open(DOCFP, "<$docfile");
open(MATFP, ">$tmpmatfile");
if ($tokfile) {
    open(TOKENFP, ">$tokenizedfile");
}
if ($nlskip > 0) {
    open(RLABELFP, ">$rlabelfile");
}

%WORDID    = ();
%WORDNAMES = ();
$nrows     = 0;
$ncols     = 0;
$nnz       = 0;

print "Reading document file...\n";
while (<DOCFP>) {          # each input line is treated as one document
    tr/A-Z/a-z/;           # change to lower case
    y/a-z0-9/ /cs;         # retain only alpha-numeric entries
    s/^\s+//;              # remove leading spaces
    s/\s+/ /g;             # compact spaces
    chop;

    @tokens = split(/\s+/, $_);

    # Write the skipped tokens as the row-label of the file
    if ($nlskip > 0) {
        for ($i=0; $i<$nlskip; $i++) {
            print RLABELFP $tokens[$i], " ";
        }
        print RLABELFP "\n";
    }

    # Construct the TF-representation for this document
    %TF = ();
    for ($i=$nlskip; $i<=$#tokens; $i++) {
        next if ($skipnumeric && ($tokens[$i] =~ /\d/));   # optionally drop tokens containing digits
        next if (length($tokens[$i]) < $minwlen);          # drop tokens shorter than -minwlen

        if ($nostop) {
            if ($nostem) {
                $newword = $tokens[$i];
            }
            else {
                $newword = stem($tokens[$i]);
            }
            if ($tokfile) {
                print TOKENFP $newword, " ";
            }
            $TF{$newword}++;
        }
        else {
            if (!$stop_list{$tokens[$i]} && !$my_stop_list{$tokens[$i]}) {
                if ($nostem) {
                    $newword = $tokens[$i];
                }
                else {
                    $newword = stem($tokens[$i]);
                }
                if ($tokfile) {
                    print TOKENFP $newword, " ";
                }
                $TF{$newword}++;
            }
        }
    }
    if ($tokfile) {
        print TOKENFP "\n";
    }

    # Write out the vector for this document