// filedictionaries.java
// (code-viewer residue: font-size control)
/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.paoding.analysis.knife;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import net.paoding.analysis.dictionary.BinaryDictionary;
import net.paoding.analysis.dictionary.Dictionary;
import net.paoding.analysis.dictionary.HashBinaryDictionary;
import net.paoding.analysis.dictionary.Hit;
import net.paoding.analysis.dictionary.Word;
import net.paoding.analysis.dictionary.support.detection.Detector;
import net.paoding.analysis.dictionary.support.detection.DifferenceListener;
import net.paoding.analysis.dictionary.support.detection.ExtensionFileFilter;
import net.paoding.analysis.dictionary.support.filewords.FileWordsReader;
import net.paoding.analysis.exception.PaodingAnalysisException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * File-backed cache of the dictionaries used by {@link CJKKnife}.
 * <p>
 * This object hands out the dictionaries needed for Chinese analysis: the
 * vocabulary, family names, units of measurement, and the noise (ignored)
 * words and single characters. Each dictionary is built lazily on first
 * access and cached until {@link #refreshDicWords(String)} invalidates it.
 *
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 *
 * @see CJKKnife
 *
 * @since 1.0
 */
public class FileDictionaries implements Dictionaries {

    // -------------------------------------------------

    /** Logger obtained for the runtime class of this instance. */
    protected Log log = LogFactory.getLog(this.getClass());

    // -------------------------------------------------

    /** Vocabulary dictionary (lazily built, see {@link #getVocabularyDictionary()}). */
    protected Dictionary vocabularyDictionary;

    /** Dictionary of latin+CJK combined words. */
    protected Dictionary combinatoricsDictionary;

    /** Family-name dictionary. */
    protected Dictionary confucianFamilyNamesDictionary;

    /** Ignored (noise) single characters. */
    protected Dictionary noiseCharactorsDictionary;

    /** Ignored (noise) words. */
    protected Dictionary noiseWordsDictionary;

    /** Units of measurement. */
    protected Dictionary unitsDictionary;

    // -------------------------------------------------

    /**
     * All words read from {@link #dicHome}, keyed by dictionary name; the
     * values are sets whose elements are copied into {@code Word[]} arrays.
     * {@code null} until {@link #loadAllWordsIfNecessary()} succeeds.
     */
    protected Map/* <String, Set<Word>> */allWords;

    /** Directory the dictionary files are read from. */
    protected String dicHome;

    /** Name prefix marking dictionaries that are kept out of the vocabulary. */
    protected String skipPrefix;

    /** Name (relative to dicHome, without ".dic") of the noise-character dictionary. */
    protected String noiseCharactor;

    /** Name (relative to dicHome, without ".dic") of the noise-word dictionary. */
    protected String noiseWord;

    /** Name (relative to dicHome, without ".dic") of the units dictionary. */
    protected String unit;

    /** Name (relative to dicHome, without ".dic") of the family-name dictionary. */
    protected String confucianFamilyName;

    /** Name (relative to dicHome, without ".dic") of the latin+CJK dictionary. */
    protected String combinatorics;

    /** Charset name handed to {@link FileWordsReader} when reading files. */
    protected String charsetName;

    /** Detector started by {@link #startDetecting(int, DifferenceListener)}, or null. */
    private Detector detector;

    // ----------------------

    /**
     * Creates an unconfigured instance; every setting must be supplied
     * through the setters before the dictionaries are requested.
     */
    public FileDictionaries() {
    }

    /**
     * Creates a fully configured instance.
     *
     * @param dicHome             directory the dictionary files are read from
     * @param skipPrefix          name prefix of dictionaries excluded from the vocabulary
     * @param noiseCharactor      name of the noise-character dictionary
     * @param noiseWord           name of the noise-word dictionary
     * @param unit                name of the units dictionary
     * @param confucianFamilyName name of the family-name dictionary
     * @param combinatorics       name of the latin+CJK dictionary
     * @param charsetName         charset name used when reading the files
     */
    public FileDictionaries(String dicHome, String skipPrefix,
            String noiseCharactor, String noiseWord, String unit,
            String confucianFamilyName, String combinatorics, String charsetName) {
        this.dicHome = dicHome;
        this.skipPrefix = skipPrefix;
        this.noiseCharactor = noiseCharactor;
        this.noiseWord = noiseWord;
        this.unit = unit;
        this.confucianFamilyName = confucianFamilyName;
        this.combinatorics = combinatorics;
        this.charsetName = charsetName;
    }

    /** @return the directory the dictionary files are read from */
    public String getDicHome() {
        return dicHome;
    }

    /** @param dicHome the directory the dictionary files are read from */
    public void setDicHome(String dicHome) {
        this.dicHome = dicHome;
    }

    /** @return the name prefix of dictionaries excluded from the vocabulary */
    public String getSkipPrefix() {
        return skipPrefix;
    }

    /** @param skipPrefix the name prefix of dictionaries excluded from the vocabulary */
    public void setSkipPrefix(String skipPrefix) {
        this.skipPrefix = skipPrefix;
    }

    /** @return the name of the noise-character dictionary */
    public String getNoiseCharactor() {
        return noiseCharactor;
    }

    /** @param noiseCharactor the name of the noise-character dictionary */
    public void setNoiseCharactor(String noiseCharactor) {
        this.noiseCharactor = noiseCharactor;
    }

    /** @return the name of the noise-word dictionary */
    public String getNoiseWord() {
        return noiseWord;
    }

    /** @param noiseWord the name of the noise-word dictionary */
    public void setNoiseWord(String noiseWord) {
        this.noiseWord = noiseWord;
    }

    /** @return the name of the units dictionary */
    public String getUnit() {
        return unit;
    }

    /** @param unit the name of the units dictionary */
    public void setUnit(String unit) {
        this.unit = unit;
    }

    /** @return the name of the family-name dictionary */
    public String getConfucianFamilyName() {
        return confucianFamilyName;
    }

    /** @param confucianFamilyName the name of the family-name dictionary */
    public void setConfucianFamilyName(String confucianFamilyName) {
        this.confucianFamilyName = confucianFamilyName;
    }

    /** @return the charset name used when reading the files */
    public String getCharsetName() {
        return charsetName;
    }

    /** @param charsetName the charset name used when reading the files */
    public void setCharsetName(String charsetName) {
        this.charsetName = charsetName;
    }

    /**
     * Sets the name of the latin+CJK dictionary (stored in
     * {@link #combinatorics}).
     *
     * @param lantinFllowedByCjk the dictionary name
     */
    public void setLantinFllowedByCjk(String lantinFllowedByCjk) {
        this.combinatorics = lantinFllowedByCjk;
    }

    /** @return the name of the latin+CJK dictionary (the {@link #combinatorics} field) */
    public String getLantinFllowedByCjk() {
        return combinatorics;
    }

    // -------------------------------------------------

    /**
     * Returns the vocabulary dictionary, building it on first use.
     * <p>
     * After the dictionary is built, every entry of the noise-word and
     * noise-character dictionaries that is also found in the vocabulary is
     * flagged on the vocabulary's {@link Word}.
     *
     * @return the cached vocabulary dictionary
     */
    public synchronized Dictionary getVocabularyDictionary() {
        if (vocabularyDictionary == null) {
            // The original author estimated that about 5639 characters start
            // a word and picked 0x2fff (12287) with 0.75f so the table
            // comfortably exceeds that figure. NOTE(review): presumably these
            // are initial capacity and load factor - confirm against
            // HashBinaryDictionary.
            Dictionary vocabulary = new HashBinaryDictionary(
                    getVocabularyWords(), 0x2fff, 0.75f);

            Dictionary noiseWordsDic = getNoiseWordsDictionary();
            for (int i = 0; i < noiseWordsDic.size(); i++) {
                Word noise = noiseWordsDic.get(i);
                Hit hit = vocabulary.search(noise, 0, noise.length());
                if (hit.isHit()) {
                    hit.getWord().setNoiseWord();
                }
            }

            Dictionary noiseCharactorsDic = getNoiseCharactorsDictionary();
            for (int i = 0; i < noiseCharactorsDic.size(); i++) {
                Word noise = noiseCharactorsDic.get(i);
                Hit hit = vocabulary.search(noise, 0, noise.length());
                if (hit.isHit()) {
                    hit.getWord().setNoiseCharactor();
                }
            }

            // Publish only once fully flagged: if loading a noise dictionary
            // throws, the next call rebuilds instead of returning a
            // half-initialised vocabulary.
            vocabularyDictionary = vocabulary;
        }
        return vocabularyDictionary;
    }

    /**
     * Returns the family-name dictionary, building it on first use.
     *
     * @return the cached family-name dictionary
     */
    public synchronized Dictionary getConfucianFamilyNamesDictionary() {
        if (confucianFamilyNamesDictionary == null) {
            confucianFamilyNamesDictionary = new BinaryDictionary(
                    getConfucianFamilyNames());
        }
        return confucianFamilyNamesDictionary;
    }

    /**
     * Returns the dictionary of ignored single characters, building it on
     * first use.
     *
     * @return the cached noise-character dictionary
     */
    public synchronized Dictionary getNoiseCharactorsDictionary() {
        if (noiseCharactorsDictionary == null) {
            noiseCharactorsDictionary = new HashBinaryDictionary(
                    getNoiseCharactors(), 256, 0.75f);
        }
        return noiseCharactorsDictionary;
    }

    /**
     * Returns the dictionary of ignored words, building it on first use.
     *
     * @return the cached noise-word dictionary
     */
    public synchronized Dictionary getNoiseWordsDictionary() {
        if (noiseWordsDictionary == null) {
            noiseWordsDictionary = new BinaryDictionary(getNoiseWords());
        }
        return noiseWordsDictionary;
    }

    /**
     * Returns the units-of-measurement dictionary, building it on first use.
     *
     * @return the cached units dictionary
     */
    public synchronized Dictionary getUnitsDictionary() {
        if (unitsDictionary == null) {
            unitsDictionary = new HashBinaryDictionary(getUnits(), 1024, 0.75f);
        }
        return unitsDictionary;
    }

    /**
     * Returns the latin+CJK dictionary, building it on first use.
     *
     * @return the cached combinatorics dictionary
     */
    public synchronized Dictionary getCombinatoricsDictionary() {
        if (combinatoricsDictionary == null) {
            combinatoricsDictionary = new BinaryDictionary(
                    getCombinatoricsWords());
        }
        return combinatoricsDictionary;
    }

    /**
     * Starts a {@link Detector} over the ".dic" files under
     * {@link #dicHome}. Does nothing when a detector has already been
     * started or when {@code interval} is negative.
     *
     * @param interval value passed to {@code Detector.setInterval}
     *                 (NOTE(review): unit not visible here - confirm)
     * @param l        listener registered on the detector
     */
    public synchronized void startDetecting(int interval, DifferenceListener l) {
        if (detector != null || interval < 0) {
            return;
        }
        Detector created = new Detector();
        created.setHome(dicHome);
        created.setFilter(new ExtensionFileFilter(".dic"));
        // Seed the detector with the current state as its last snapshot.
        created.setLastSnapshot(created.flash());
        created.setListener(l);
        created.setInterval(interval);
        created.start(true);
        this.detector = created;
    }

    /**
     * Stops the detector started by
     * {@link #startDetecting(int, DifferenceListener)}, if any.
     */
    public synchronized void stopDetecting() {
        if (detector == null) {
            return;
        }
        detector.setStop();
        detector = null;
    }

    /**
     * Re-reads one dictionary file and drops every cached dictionary that
     * depends on it, so that it is rebuilt on next access.
     * <p>
     * The dictionary name is {@code dicPath} up to its last ".dic". The file
     * is read from {@code dicHome + dicPath}; no separator is inserted here
     * (NOTE(review): presumably {@code dicPath} already starts with one -
     * confirm against the caller). If the file no longer exists the
     * dictionary is removed from {@link #allWords}.
     *
     * @param dicPath path of the changed dictionary file, including the
     *                ".dic" extension
     * @throws StringIndexOutOfBoundsException if {@code dicPath} does not
     *                                         contain ".dic"
     */
    protected synchronized void refreshDicWords(String dicPath) {
        int index = dicPath.lastIndexOf(".dic");
        String dicName = dicPath.substring(0, index);

        // The word map only needs patching if it has been loaded at all.
        if (allWords != null) {
            try {
                Map/* <String, Set<Word>> */temp = FileWordsReader
                        .readWords(dicHome + dicPath, charsetName);
                allWords.put(dicName, temp.values().iterator().next());
            } catch (FileNotFoundException e) {
                // The source file has been deleted: the dictionary is no
                // longer wanted.
                allWords.remove(dicName);
            } catch (IOException e) {
                throw toRuntimeException(e);
            }
        }

        // Cache invalidation is done regardless of allWords: the special
        // dictionaries are loaded directly from their files and may be
        // cached even though the full word map was never loaded.
        if (!isSkipForVacabulary(dicName)) {
            this.vocabularyDictionary = null;
        }
        if (isNoiseWordDicFile(dicName)) {
            this.noiseWordsDictionary = null;
            // Noise words are flagged inside the vocabulary, so it must be
            // rebuilt too.
            this.vocabularyDictionary = null;
        } else if (isNoiseCharactorDicFile(dicName)) {
            this.noiseCharactorsDictionary = null;
            // Noise characters are flagged inside the vocabulary, so it must
            // be rebuilt too.
            this.vocabularyDictionary = null;
        } else if (isUnitDicFile(dicName)) {
            this.unitsDictionary = null;
        } else if (isConfucianFamilyNameDicFile(dicName)) {
            this.confucianFamilyNamesDictionary = null;
        } else if (isLantinFollowedByCjkDicFile(dicName)) {
            // Words that start with letters, digits and similar characters.
            this.combinatoricsDictionary = null;
        }
    }

    // ---------------------------------------------------------------
    // Helper methods below - class-private or package-level use.

    /**
     * Collects the words of every dictionary that is not excluded by
     * {@link #isSkipForVacabulary(String)}, merged and sorted.
     *
     * @return the sorted vocabulary words; empty when every dictionary is
     *         skipped
     */
    protected Word[] getVocabularyWords() {
        Map/* <String, Set<Word>> */dics = loadAllWordsIfNecessary();
        // Start from an empty set so that "everything skipped" yields an
        // empty array instead of a NullPointerException.
        Set/* <Word> */merged = new HashSet/* <Word> */();
        Iterator/* <Map.Entry<String, Set<Word>>> */iter = dics.entrySet().iterator();
        while (iter.hasNext()) {
            Map.Entry entry = (Map.Entry) iter.next();
            String name = (String) entry.getKey();
            if (isSkipForVacabulary(name)) {
                continue;
            }
            merged.addAll((Set/* <Word> */) entry.getValue());
        }
        Word[] words = (Word[]) merged.toArray(new Word[merged.size()]);
        Arrays.sort(words);
        return words;
    }

    /** @return the sorted words of the family-name dictionary file */
    protected Word[] getConfucianFamilyNames() {
        return getDictionaryWords(confucianFamilyName);
    }

    /** @return the sorted words of the noise-word dictionary file */
    protected Word[] getNoiseWords() {
        return getDictionaryWords(noiseWord);
    }

    /** @return the sorted words of the noise-character dictionary file */
    protected Word[] getNoiseCharactors() {
        return getDictionaryWords(noiseCharactor);
    }

    /** @return the sorted words of the units dictionary file */
    protected Word[] getUnits() {
        return getDictionaryWords(unit);
    }

    /** @return the sorted words of the latin+CJK dictionary file */
    protected Word[] getCombinatoricsWords() {
        return getDictionaryWords(combinatorics);
    }

    /**
     * Reads the single dictionary file
     * {@code dicHome + "/" + dicNameRelativeDicHome + ".dic"} and returns
     * its words sorted.
     *
     * @param dicNameRelativeDicHome dictionary name relative to
     *                               {@link #dicHome}, without extension
     * @return the sorted words of that dictionary
     * @throws PaodingAnalysisException if reading fails, or if the reader
     *                                  returned no entry under the given name
     */
    protected Word[] getDictionaryWords(String dicNameRelativeDicHome) {
        Map dics;
        try {
            dics = FileWordsReader.readWords(dicHome + "/"
                    + dicNameRelativeDicHome + ".dic", charsetName);
        } catch (IOException e) {
            throw toRuntimeException(e);
        }
        Set/* <Word> */set = (Set/* <Word> */) dics.get(dicNameRelativeDicHome);
        if (set == null) {
            // Fail with a meaningful message instead of a bare
            // NullPointerException on the next line.
            throw new PaodingAnalysisException("No words loaded for dictionary '"
                    + dicNameRelativeDicHome + "' (dicHome=" + dicHome + ")");
        }
        Word[] words = (Word[]) set.toArray(new Word[set.size()]);
        Arrays.sort(words);
        return words;
    }

    // -------------------------------------

    /**
     * Loads every dictionary under {@link #dicHome} on first use and caches
     * the result in {@link #allWords}.
     * <p>
     * According to the original author, files in the home directory and its
     * sub-directories are read and keyed by their path relative to the home
     * directory, including the file name but without the extension: for a
     * home directory "dic" containing "division/china.dic" the key is
     * "division/china".
     *
     * @return the cached word map
     * @throws PaodingAnalysisException if reading fails or no dictionary is
     *                                  found
     */
    protected synchronized Map/* <String, Set<Word>> */loadAllWordsIfNecessary() {
        if (allWords == null) {
            try {
                log.info("loading dictionaries from " + dicHome);
                Map/* <String, Set<Word>> */loaded = FileWordsReader
                        .readWords(dicHome, charsetName);
                if (loaded.size() == 0) {
                    String message = "Not found any dictionary files, have you set the 'paoding.dic.home' right? ("
                            + this.dicHome + ")";
                    log.error(message);
                    throw new PaodingAnalysisException(message);
                }
                // Cache only a validated result, so that a failed load is
                // reported again on the next call instead of silently
                // returning an empty map.
                allWords = loaded;
                log.info("loaded success!");
            } catch (IOException e) {
                throw toRuntimeException(e);
            }
        }
        return allWords;
    }

    // ---------------------------------------

    /**
     * Tells whether a dictionary is kept out of the vocabulary: its name
     * starts with {@link #skipPrefix}, or contains "/" followed by it.
     *
     * @param dicNameRelativeDicHome dictionary name relative to the home directory
     * @return {@code true} if the dictionary must be skipped
     */
    protected final boolean isSkipForVacabulary(String dicNameRelativeDicHome) {
        return dicNameRelativeDicHome.startsWith(skipPrefix)
                || dicNameRelativeDicHome.indexOf("/" + skipPrefix) != -1;
    }

    /** @return {@code true} if {@code dicName} equals the configured units dictionary name */
    protected boolean isUnitDicFile(String dicName) {
        return dicName.equals(this.unit);
    }

    /** @return {@code true} if {@code dicName} equals the configured noise-character dictionary name */
    protected boolean isNoiseCharactorDicFile(String dicName) {
        return dicName.equals(this.noiseCharactor);
    }

    /** @return {@code true} if {@code dicName} equals the configured noise-word dictionary name */
    protected boolean isNoiseWordDicFile(String dicName) {
        return dicName.equals(this.noiseWord);
    }

    /** @return {@code true} if {@code dicName} equals the configured family-name dictionary name */
    protected boolean isConfucianFamilyNameDicFile(String dicName) {
        return dicName.equals(this.confucianFamilyName);
    }

    /** @return {@code true} if {@code dicName} equals the configured latin+CJK dictionary name */
    protected boolean isLantinFollowedByCjkDicFile(String dicName) {
        return dicName.equals(this.combinatorics);
    }

    // --------------------------------------

    /**
     * Wraps an I/O failure into the unchecked exception thrown by this class.
     *
     * @param e the I/O failure, kept as the argument of the new exception
     * @return a {@link PaodingAnalysisException} constructed from {@code e}
     */
    protected RuntimeException toRuntimeException(IOException e) {
        return new PaodingAnalysisException(e);
    }
}
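// (Code-viewer residue, not part of the program.) Keyboard shortcuts:
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full-screen mode:   F11
//   Switch theme:       Ctrl + Shift + D
//   Show shortcuts:     ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -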