?? code4.txt
字號:
英文分詞+提取詞幹的修正版
很久以前寫過一個分詞程序,也一直在用,不過之前的若干版本都有一些bug,今天趁著有空修改了一下,實現的功能包括:
1、抽取字符串中的英文單詞,如:"I love you, too."可以分為i, love, you, too四個單詞
2、對於連寫的單詞也可以根據首字母大寫分詞,如:wordStemming可以分為word和stemming
3、提取詞幹,仍然採用的是Snowball
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Iterator;
import org.tartarus.snowball.SnowballProgram;
/** *//**
* Tokenizer
*
* @author Peter Cheng
*
*/
public class Tokenizer ...{
/** *//**
* Language
*/
public static String language = "english";
/**//* Stemmer */
private static SnowballProgram stemmer = null;
/**//* Stem method */
private static Method stemMethod = null;
/** *//**
* Tokenize and stem
*
* @param source
* The string to be processed
* @return All the word stems
*/
public static Iterator tokenize(String source) ...{
if (Tokenizer.stemmer == null) ...{
try ...{
Class stemClass = Class.forName("org.tartarus.snowball.ext."
+ Tokenizer.language + "Stemmer");
Tokenizer.stemmer = (SnowballProgram) stemClass.newInstance();
Tokenizer.stemMethod = stemClass
.getMethod("stem", new Class[0]);
} catch (Exception e) ...{
System.out.println("Error when initializing Stemmer!");
System.exit(1);
}
}
/**//* Tokenizer */
ArrayList tokens = new ArrayList();
StringBuffer buffer = new StringBuffer();
for (int i = 0; i < source.length(); i++) ...{
char character = source.charAt(i);
if (Character.isLetter(character)) ...{
buffer.append(character);
} else ...{
if (buffer.length() > 0) ...{
tokens.add(buffer.toString());
buffer = new StringBuffer();
}
}
}
if (buffer.length() > 0) ...{
tokens.add(buffer.toString());
}
/**//* All the words */
ArrayList words = new ArrayList();
/**//* All the words consisting of capitals */
ArrayList allTheCapitalWords = new ArrayList();
/**//* Tokenize according to the capitals */
nextToken: for (Iterator allTokens = tokens.iterator(); allTokens
.hasNext();) ...{
String token = (String) allTokens.next();
/**//* The words consisting of capitals */
boolean allUpperCase = true;
for (int i = 0; i < token.length(); i++) ...{
if (!Character.isUpperCase(token.charAt(i))) ...{
allUpperCase = false;
}
}
if (allUpperCase) ...{
allTheCapitalWords.add(token);
continue nextToken;
}
/**//* Other cases */
int index = 0;
nextWord: while (index < token.length()) ...{
nextCharacter: while (true) ...{
index++;
if ((index == token.length())
|| !Character.isLowerCase(token.charAt(index))) ...{
break nextCharacter;
}
}
words.add(token.substring(0, index).toLowerCase());
token = token.substring(index);
index = 0;
continue nextWord;
}
}
/**//* Stemming */
try ...{
for (int i = 0; i < words.size(); i++) ...{
Tokenizer.stemmer.setCurrent((String) words.get(i));
Tokenizer.stemMethod.invoke(Tokenizer.stemmer, new Object[0]);
words.set(i, Tokenizer.stemmer.getCurrent());
}
} catch (Exception e) ...{
e.printStackTrace();
}
words.addAll(allTheCapitalWords);
return words.iterator();
}
}
Trackback: http://tb.blog.csdn.net/TrackBack.aspx?PostId=1543297
?? 快捷鍵說明
複製代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -