?? combinatoricsknife.java

?? 對Lucene的良好封裝，提供了功能強大的中文分詞字典
?? JAVA
字號:
/**
 * Copyright 2007 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.paoding.analysis.knife;

import java.util.HashSet;

import net.paoding.analysis.dictionary.Dictionary;
import net.paoding.analysis.dictionary.Hit;

/**
 * Combinatorics knife.
 * <p>
 * 
 * This knife cuts the characters it meets before a LIMIT character out as one
 * word.<br>
 * In addition, if a string starting with that word appears in
 * x-for-combinatorics.dic, that string is cut out as well.
 * 
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 * 
 * @since 1.0
 * 
 */
public abstract class CombinatoricsKnife implements Knife, DictionariesWare {

	/** Dictionary searched by {@link #tryDicWord}; {@link #dissect} skips the search while it is null. */
	protected Dictionary combinatoricsDictionary;

	/** Words that {@link #collectIfNotNoise} refuses to emit; null means nothing is filtered. */
	protected HashSet/* <String> */noiseTable;

	/** Creates a knife whose noise table is left unset. */
	public CombinatoricsKnife() {
	}

	/**
	 * Creates a knife and installs the given noise words through
	 * {@link #setNoiseWords(String[])}.
	 * 
	 * @param noiseWords
	 *            words that must never be collected
	 */
	public CombinatoricsKnife(String[] noiseWords) {
		setNoiseWords(noiseWords);
	}

	/**
	 * Replaces the noise table with a fresh set holding exactly the given
	 * words.
	 * 
	 * @param noiseWords
	 *            words that must never be collected
	 */
	public void setNoiseWords(String[] noiseWords) {
		final int total = noiseWords.length;
		noiseTable = new HashSet/* <String> */((int) (total * 1.5));
		int index = 0;
		while (index < total) {
			noiseTable.add(noiseWords[index]);
			index++;
		}
	}

	/**
	 * Takes the combinatorics dictionary out of the given dictionaries.
	 * 
	 * @param dictionaries
	 *            source of the combinatorics dictionary
	 */
	public void setDictionaries(Dictionaries dictionaries) {
		combinatoricsDictionary = dictionaries.getCombinatoricsDictionary();
	}

	/**
	 * Scans forward from {@code offset} up to the first LIMIT character,
	 * collects the resulting candidate words and decides where the next Knife
	 * should start.
	 * 
	 * @param collector
	 *            receiver of the words that are cut out
	 * @param beef
	 *            the text being dissected
	 * @param offset
	 *            start position of this dissection inside beef
	 * @return the result of {@link #nextOffset}, or {@code -offset} when the
	 *         scan reached the end of beef while {@code offset > 0}
	 */
	public int dissect(Collector collector, Beef beef, int offset) {
		// -1 means no POINT character was met during this dissection;
		// otherwise this is the position of the first POINT character, which
		// is what the next Knife will be asked to start from.
		int point = -1;

		// End boundary (exclusive) of the run being dissected, i.e. the
		// position at which assignable(...) reported LIMIT. When point stays
		// -1 the next Knife starts from here.
		int limit = offset + 1;

		// Walk forward until a LIMIT character shows up, remembering only the
		// first POINT character on the way.
		boolean scanning = true;
		while (scanning) {
			switch (assignable(beef, offset, limit)) {
			case LIMIT:
				scanning = false;
				break;
			case POINT:
				if (point == -1) {
					point = limit;
				}
				limit++;
				break;
			default:
				limit++;
				break;
			}
		}

		// The run extends to the very end of beef and part of beef has already
		// been dissected (so room can be freed for new characters): ask for
		// more characters to be read in before dissecting again.
		final boolean reachedEnd = limit == beef.length();
		if (reachedEnd && offset > 0) {
			return -offset;
		}

		// Look for dictionary words that have the run as their prefix and cut
		// them out. The lookup only happens when the character at limit is
		// above 0xFF.
		int dicWordVote = -1;
		if (combinatoricsDictionary != null) {
			if (beef.charAt(limit) > 0xFF) {
				dicWordVote = tryDicWord(collector, beef, offset, limit);
			}
		}

		// Collect the words offset..point and offset..limit. The span
		// point..limit is deliberately not collected here: it may well be a
		// word, but since point is handed to the next Knife as its start, a
		// suitably configured Knife gets the chance to cut it out.
		final int pointVote = collectPoint(collector, beef, offset, point,
				limit, dicWordVote);
		final int limitVote = collectLimit(collector, beef, offset, point,
				limit, dicWordVote);

		return nextOffset(beef, offset, point, limit, pointVote, limitVote,
				dicWordVote);
	}

	/**
	 * Asked to collect the word running from offset to the first POINT
	 * character and to vote on where the next Knife should start. When there
	 * is no POINT character, {@code point} is -1.
	 * <p>
	 * 
	 * Default implementation: the word is passed to
	 * {@link #collectIfNotNoise} only when a POINT character exists and
	 * {@code dicWordVote} is -1; otherwise nothing is collected.
	 * 
	 * @param collector
	 * @param beef
	 * @param offset
	 *            start position of the dissected content inside beef
	 * @param point
	 *            position of the first POINT character of the dissected
	 *            content, -1 when there is none
	 * @param limit
	 *            position of the LIMIT character of the dissected content
	 * @param dicWordVote
	 *            vote returned by the dictionary lookup, -1 when it abstained
	 * @return the voted start position for the next Knife; -1 abstains. The
	 *         default implementation always abstains.
	 */
	protected int collectPoint(Collector collector, Beef beef, int offset,
			int point, int limit, int dicWordVote) {
		final boolean hasPoint = point != -1;
		final boolean dictionaryAbstained = dicWordVote == -1;
		if (hasPoint && dictionaryAbstained) {
			collectIfNotNoise(collector, beef, offset, point);
		}
		return -1;
	}

	/**
	 * Asked to collect the word running from offset to the first LIMIT
	 * character and to vote on where the next Knife should start.
	 * <p>
	 * 
	 * Default implementation: when {@code dicWordVote} is -1, the string from
	 * offset up to (not including) limit is cut out as one word.
	 * 
	 * @param collector
	 * @param beef
	 * @param offset
	 *            start position of the dissected content inside beef
	 * @param point
	 *            position of the first POINT character of the dissected
	 *            content, -1 when there is none
	 * @param limit
	 *            position of the LIMIT character of the dissected content
	 * @param dicWordVote
	 *            vote returned by the dictionary lookup, -1 when it abstained
	 * 
	 * @return the voted start position for the next Knife; -1 abstains. The
	 *         default implementation always abstains.
	 */
	protected int collectLimit(Collector collector, Beef beef, int offset,
			int point, int limit, int dicWordVote) {
		if (dicWordVote != -1) {
			return -1;
		}
		collectIfNotNoise(collector, beef, offset, limit);
		return -1;
	}

	/**
	 * Searches the combinatorics dictionary for words that begin with the
	 * string from offset up to (not including) limit and extend past limit,
	 * and cuts every such word out.
	 * <p>
	 * When no such word is found this method returns -1, abstaining from the
	 * vote on the next Knife's start position.<br>
	 * When such words are found they are cut out and the end position of the
	 * last one is returned (the word itself does not include the character at
	 * that end position).
	 * <p>
	 * 
	 * (for version 2.0.4+):<br>
	 * Known limitation / TODO:<br>
	 * a dictionary word may happen to be split across two beefs, e.g. "U" is
	 * the last character of this beef and "盤" the first character of the next
	 * one;<br>
	 * {@link CombinatoricsKnife} currently has no mechanism to recognise that
	 * case and treat it as a single word.
	 * 
	 * @param collector
	 * @param beef
	 * @param offset
	 * @param limit
	 * @return the end position (exclusive) of the last dictionary word cut
	 *         out, or -1 when none was found
	 */
	protected int tryDicWord(Collector collector, Beef beef, int offset,
			int limit) {
		int lastWordEnd = limit;
		int end = limit + 1;
		while (end <= beef.length()) {
			// Number of characters in the candidate offset..end.
			final int count = end - offset;
			Hit hit = combinatoricsDictionary.search(beef, offset, count);
			if (hit.isUndefined()) {
				break;
			}
			if (hit.isHit()) {
				// A dictionary word: emit it and remember where it ends.
				collectIfNotNoise(collector, beef, offset, end);
				lastWordEnd = end;
			}
			// Only move on to the next character while the dictionary holds
			// words starting with the current candidate, ...
			if (!hit.isUnclosed()) {
				break;
			}
			// ... there is a next character in beef, ...
			if (end >= beef.length()) {
				break;
			}
			// ... and that character is not smaller than the corresponding
			// character of the word reported by hit.getNext().
			if (beef.charAt(end) < hit.getNext().charAt(count)) {
				break;
			}
			end++;
		}
		return lastWordEnd > limit ? lastWordEnd : -1;
	}

	/**
	 * Called when the Knife has decided to cut out the word from offset up to
	 * (not including) end; filters out words found in the noise table so that
	 * they are not cut out after all.
	 * 
	 * @param collector
	 * @param beef
	 * @param offset
	 * @param end
	 */
	protected void collectIfNotNoise(Collector collector, Beef beef,
			int offset, int end) {
		// Materialise the candidate word offset..end (end excluded).
		final String word = beef.subSequence(offset, end).toString();

		// Noise words are dropped here and never reach doCollect.
		if (noiseTable != null && noiseTable.contains(word)) {
			return;
		}

		// Hand the candidate over to doCollect, which has the final say on
		// whether the collector is notified. A null word is never forwarded.
		if (word != null) {
			doCollect(collector, word, beef, offset, end);
		}
	}

	/**
	 * 
	 * Called when the Knife has decided to cut out the word from offset up to
	 * (not including) end; forwards the word straight to
	 * {@link #doCollect(Collector, String, Beef, int, int)} without filtering
	 * noise words.
	 * 
	 * @param collector
	 * @param beef
	 * @param offset
	 * @param end
	 */
	protected void collect(Collector collector, Beef beef, int offset, int end) {
		doCollect(collector, beef.subSequence(offset, end).toString(), beef,
				offset, end);
	}

	/**
	 * Collects a candidate word produced by the dissection. The default
	 * implementation notifies the collector of the candidate.<br>
	 * Subclasses may override this method for finer control over what gets
	 * collected, e.g. collecting only when word meets extra conditions,<br>
	 * or collecting further related words depending on the context.
	 * 
	 * @param collector
	 * @param word
	 * @param beef
	 * @param offset
	 * @param end
	 */
	protected void doCollect(Collector collector, String word, Beef beef,
			int offset, int end) {
		collector.collect(word, offset, end);
	}

	/**
	 * Decides, from the character positions and the votes cast while
	 * collecting, the position from which the next Knife should start
	 * probing.
	 * 
	 * @param beef
	 * @param offset
	 *            start position of this dissection
	 * @param point
	 *            position of the first POINT character of this dissection, -1
	 *            when there is none
	 * @param limit
	 *            position of the first LIMIT character of this dissection
	 * @param pointVote
	 *            vote cast while collecting the word from offset to the first
	 *            POINT character, -1 abstains
	 * @param limitVote
	 *            vote cast while collecting the word from offset to the first
	 *            LIMIT character, -1 abstains
	 * @param dicWordVote
	 *            vote cast while collecting combinatorics dictionary words,
	 *            -1 abstains
	 * @return when the largest vote is -1: point if it is not -1, otherwise
	 *         limit; in every other case the larger of the largest vote and
	 *         limit
	 */
	protected int nextOffset(Beef beef, int offset, int point, int limit,
			int pointVote, int limitVote, int dicWordVote) {
		final int highestVote = Math.max(Math.max(pointVote, limitVote),
				dicWordVote);
		if (highestVote == -1) {
			// Everybody abstained: prefer the first POINT position.
			if (point != -1) {
				return point;
			}
			return limit;
		}
		// Never hand back a position before limit.
		return Math.max(highestVote, limit);
	}
}
?? 文件大小 3391 K
?? 上傳用戶 a83133937
?? 所屬分類 Java編程
??? 相關標簽

#Lcuene #封裝 #分
?? 快捷鍵說明

復制代碼 Ctrl + C
搜索代碼 Ctrl + F
全屏模式 F11
切換主題 Ctrl + Shift + D
顯示快捷鍵 ?
增大字號 Ctrl + =
減小字號 Ctrl + -
亚洲欧美第一页_禁久久精品乱码_粉嫩av一区二区三区免费野_久草精品视频

?? combinatoricsknife.java

?? 快捷鍵說明