// cjkknife.java
//
// A well-made wrapper around Lucene that provides a powerful Chinese word-segmentation dictionary.
// Language: Java
// Font size:
/**
 * Copyright 2007 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.paoding.analysis.knife;

import net.paoding.analysis.dictionary.Dictionary;
import net.paoding.analysis.dictionary.Hit;
import net.paoding.analysis.dictionary.Word;

/**
 * A {@link Knife} that dissects runs of text beginning with CJK unified
 * ideographs.
 * <p>
 * Words are looked up in the vocabulary dictionary. Stretches that the
 * vocabulary does not cover ("isolated strings") are post-processed by
 * {@link #dissectIsolated}: Chinese numerals are converted to Arabic digits,
 * noise words and noise characters are skipped, and whatever remains is
 * emitted as overlapping two-character tokens.
 * <p>
 * The dictionaries must be supplied, through the constructor or
 * {@link #setDictionaries(Dictionaries)}, before {@link #dissect} is called.
 * 
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 * 
 * @since 1.0
 * 
 */
public class CJKKnife implements Knife, DictionariesWare {

	// -------------------------------------------------

	/** Dictionary of regular words; drives the main loop in dissect(). */
	private Dictionary vocabulary;
	/** Multi-character noise words that are skipped inside isolated strings. */
	private Dictionary noiseWords;
	/** Single noise characters that are excluded from bigram dissection. */
	private Dictionary noiseCharactors;
	/** Measurement units that may follow a Chinese numeral; may be null. */
	private Dictionary units;

	// -------------------------------------------------

	/**
	 * Creates a knife without dictionaries; call
	 * {@link #setDictionaries(Dictionaries)} before using it.
	 */
	public CJKKnife() {
	}

	/**
	 * Creates a knife that uses the given dictionaries.
	 * 
	 * @param dictionaries
	 *            source of the vocabulary, noise and unit dictionaries
	 */
	public CJKKnife(Dictionaries dictionaries) {
		setDictionaries(dictionaries);
	}

	/**
	 * Fetches the four dictionaries this knife works with from the given
	 * holder.
	 * 
	 * @param dictionaries
	 *            source of the vocabulary, noise and unit dictionaries
	 */
	public void setDictionaries(Dictionaries dictionaries) {
		vocabulary = dictionaries.getVocabularyDictionary();
		noiseWords = dictionaries.getNoiseWordsDictionary();
		noiseCharactors = dictionaries.getNoiseCharactorsDictionary();
		units = dictionaries.getUnitsDictionary();
	}

	// -------------------------------------------------

	/**
	 * Classifies the character at <code>index</code> for a run that starts at
	 * <code>offset</code>. This knife handles text that starts with CJK
	 * characters and may be followed by Arabic digits, Latin letters, hyphens
	 * or underscores.
	 * 
	 * @param beef
	 *            the text being analysed
	 * @param offset
	 *            start of the current run
	 * @param index
	 *            position of the character to classify
	 * @return ASSIGNED for a CJK unified ideograph; POINT for a digit, Latin
	 *         letter, '-' or '_' that is not the first character of the run;
	 *         LIMIT otherwise
	 */
	public int assignable(Beef beef, int offset, int index) {
		char ch = beef.charAt(index);
		if (CharSet.isCjkUnifiedIdeographs(ch)) {
			return ASSIGNED;
		}
		// The trailing characters only count when they follow something:
		// a run can never start with them.
		if (index > offset) {
			if (CharSet.isArabianNumber(ch) || CharSet.isLantingLetter(ch)
					|| ch == '-' || ch == '_') {
				return POINT;
			}
		}
		return LIMIT;
	}

	/**
	 * Dissects the run starting at <code>offset</code> and reports every
	 * token found to the collector.
	 * 
	 * @param collector
	 *            receives the tokens
	 * @param beef
	 *            the text being analysed
	 * @param offset
	 *            start of the run
	 * @return <code>-offset</code> when <code>offset &gt; 0</code> and the run
	 *         reaches <code>beef.length()</code> (no tokens are collected in
	 *         that case); otherwise the position of the first POINT
	 *         character if there was one, or else the position of the LIMIT
	 *         character
	 */
	public int dissect(Collector collector, Beef beef, int offset) {
		// point == -1 means no POINT character was met during this dissection.
		// Otherwise it is the position of the first POINT character; that
		// position is returned and the next Knife starts dissecting there.
		int point = -1;

		// End (exclusive) of the run of assignable characters, i.e. the
		// position at which assignable() reported LIMIT.
		// If point == -1, limit is returned and the next Knife starts there.
		int limit = offset + 1;

		// Compute point and limit: advance until a LIMIT character is met;
		// the first POINT character met on the way is recorded as point.
		GO_UNTIL_LIMIT: while (true) {
			switch (assignable(beef, offset, limit)) {
			case LIMIT:
				break GO_UNTIL_LIMIT;
			case POINT:
				if (point == -1) {
					point = limit;
				}
				// deliberately no break: this is the last case, control just
				// leaves the switch and continues with limit++
			}
			limit++;
		}

		// If everything from offset to beef.length() belongs to this Knife,
		// more unread characters should be loaded so that a word split
		// across two fills of the beef can still be recognised.
		// Original author's note ("magic logic"): Beef promises that when the
		// loop above ends with limit == beef.length() there is still unread
		// text, because once the whole text has been read Beef appends a
		// '\0' end marker, which makes the loop break before limit reaches
		// beef.length(). (Not verifiable from this file - see Beef.)
		if (offset > 0 && limit == beef.length()) {
			return -offset;
		}

		// Start (inclusive) and end (exclusive) in beef of the string
		// currently being checked against the dictionary.
		int curSearchOffset = offset, curSearchEnd;

		// Length of the string being checked; always equal to
		// (curSearchEnd - curSearchOffset).
		int curSearchLength;

		// Lookup result for the string being checked.
		Hit curSearch = null;

		// Upper bound for the start position of a string to check.
		// It does not change while the loops run.
		final int offsetLimit;
		if (point != -1)
			offsetLimit = point;
		else
			offsetLimit = limit;

		// Largest end position of any dictionary word found so far.
		int maxDicWordEnd = offset;

		// Start in beef of the most recent string that is not in the
		// dictionary (an "isolated string"); -1 means there is none.
		int isolatedOffset = -1;

		// Length of the longest dictionary word found so far that starts at
		// offset. Used near the end of this method to decide whether
		// shouldBeWord() should be consulted, so that a string enclosed in
		// quotes, book-title marks and the like is emitted as one word.
		int maxDicWordLength = 0;

		// Outer loop: picks the start of the string to check.
		// The start is bounded by offsetLimit, not by limit.
		for (; curSearchOffset < offsetLimit; curSearchOffset++) {

			// Inner loop: picks the end (exclusive) of the string to check.
			// It starts with a string of length 1, i.e. end = start + 1.
			curSearchEnd = curSearchOffset + 1;
			curSearchLength = 1;
			for (; curSearchEnd <= limit; curSearchEnd++, curSearchLength++) {

				// Look the string up in the vocabulary.
				curSearch = vocabulary.search(beef, curSearchOffset,
						curSearchLength);

				// --------------- analyse the lookup result ----------------

				// 1)
				// The vocabulary contains this string...
				if (curSearch.isHit()) {

					// 1.1)
					// A pending isolated string ends at curSearchOffset:
					// dissect [isolatedOffset, curSearchOffset) and then
					// clear isolatedOffset.
					if (isolatedOffset >= 0) {
						dissectIsolated(collector, beef, isolatedOffset,
								curSearchOffset);
						isolatedOffset = -1;
					}

					// 1.2)
					// Update the largest end position.
					if (maxDicWordEnd < curSearchEnd) {
						maxDicWordEnd = curSearchEnd;
					}

					// 1.3)
					// Update the longest-word length (only for words that
					// start at offset).
					if (curSearchOffset == offset
							&& maxDicWordLength < curSearchLength) {
						maxDicWordLength = curSearchLength;
					}
					
					// 1.4)
					// Report the word to the collector unless it is noise.
					Word word = curSearch.getWord();
					if (!word.isNoise()) {
						collector.collect(word.getText(), curSearchOffset,
							curSearchEnd);
					}
				}

				// isolatedFound == true means the dictionary has no word
				// for this string.
				boolean isolatedFound = curSearch.isUndefined();

				// If isolatedFound is still false, use the Hit's "next" word
				// to tell whether the dictionary can contain the string that
				// is one character longer. This check exists only to save a
				// dictionary lookup; removing it would not change the result
				// (it would just cost one more lookup).
				if (!isolatedFound && !curSearch.isHit()) {
					isolatedFound = curSearchEnd >= limit
							|| beef.charAt(curSearchEnd) < curSearch.getNext()
									.charAt(curSearchLength);
				}
				// 2)
				// The vocabulary has neither this string nor any word that
				// starts with it...
				// --> remember it as an isolated string.
				if (isolatedFound) {
					if (isolatedOffset < 0 && curSearchOffset >= maxDicWordEnd) {
						isolatedOffset = curSearchOffset;
					}
					break;
				}

				// ^^^^^^^^^^^^^^^ analyse the lookup result ^^^^^^^^^^^^^^^^
			} // end of the second for loop
		} // end of the first for loop

		// After the loops there may be trailing characters that never became
		// dictionary words. isolatedOffset is not necessarily valid for
		// them: they may be prefixes of dictionary words, and only now is it
		// certain that they will not turn into words. So the test is instead
		// whether the last dictionary word ends before offsetLimit
		// (offsetLimit, not limit!).
		if (maxDicWordEnd < offsetLimit) {
			dissectIsolated(collector, beef, maxDicWordEnd, offsetLimit);
		}

		// This is where maxDicWordLength is used: if the whole run was not
		// emitted as a single dictionary word but shouldBeWord() says it
		// should be one word, emit it.
		int len = limit - offset;
		if (len > 2 && len != maxDicWordLength
				&& shouldBeWord(beef, offset, limit)) {
			collector.collect(beef.subSequence(offset, limit).toString(),
					offset, limit);
		}

		// Return the position the next Knife starts from, as defined by
		// point and limit above.
		return point == -1 ? limit : point;
	}

	// -------------------------------------------------

	/**
	 * Dissects an isolated string, i.e. a stretch of text that the
	 * vocabulary does not cover.
	 * <p>
	 * Chinese numerals are collected as numbers, noise words and noise
	 * characters are skipped, and the text between them is handed to
	 * {@link #binDissect}.
	 * 
	 * @param collector
	 *            receives the tokens
	 * @param beef
	 *            the text being analysed
	 * @param offset
	 *            start (inclusive) of the isolated string
	 * @param limit
	 *            end (exclusive) of the isolated string
	 */
	protected void dissectIsolated(Collector collector, Beef beef, int offset,
			int limit) {
		int curSearchOffset = offset;
		// Start of the text that is still waiting for bigram dissection.
		int binOffset = curSearchOffset;
		int tempEnd;

		while (curSearchOffset < limit) {
			// The text here may be a Chinese numeral, e.g. "fifty-two
			// ten-thousands" or "thirteen hundred-millions".
			tempEnd = collectNumber(collector, beef, curSearchOffset, limit,
					binOffset);
			if (tempEnd > curSearchOffset) {
				curSearchOffset = tempEnd;
				binOffset = tempEnd;
				continue;
			}

			// Original author's note ("magic logic"): noise words are words
			// linguistically, but CJKKnife does not treat them as regular
			// vocabulary. A noise word missing from the vocabulary ends up
			// here as part of an isolated string; it is neither emitted nor
			// bigram-dissected.
			tempEnd = skipNoiseWords(collector, beef, curSearchOffset, limit,
					binOffset);
			if (tempEnd > curSearchOffset) {
				curSearchOffset = tempEnd;
				binOffset = tempEnd;
				continue;
			}

			// A single noise character does not take part in bigram
			// dissection: flush what precedes it and restart after it.
			Hit curSearch = noiseCharactors.search(beef, curSearchOffset, 1);
			if (curSearch.isHit()) {
				binDissect(collector, beef, binOffset, curSearchOffset);
				binOffset = ++curSearchOffset;
				continue;
			}
			curSearchOffset++;
		}

		// Flush whatever is still pending at the end of the isolated string.
		if (limit > binOffset) {
			binDissect(collector, beef, binOffset, limit);
		}
	}

	/**
	 * Tries to read a Chinese numeral starting at <code>offset</code> and,
	 * if one is found, collects its value in Arabic digits, optionally
	 * followed by a measurement unit from the units dictionary.
	 * <p>
	 * Before the number is collected, the pending text
	 * <code>[binOffset, offset)</code> is bigram-dissected.
	 * 
	 * @param collector
	 *            receives the tokens
	 * @param beef
	 *            the text being analysed
	 * @param offset
	 *            position at which a numeral may start
	 * @param limit
	 *            end (exclusive) of the isolated string
	 * @param binOffset
	 *            start of the text still waiting for bigram dissection
	 * @return <code>offset</code> if no digit was found there; otherwise the
	 *         position right after the numeral (and after its unit, if one
	 *         was collected)
	 */
	protected int collectNumber(Collector collector, Beef beef, int offset,
			int limit, int binOffset) {

		// Position of the character currently being examined.
		int curTail = offset;
		// Accumulated value. long rather than int: a numeral such as
		// "thirty hundred-millions" (3,000,000,000) overflows int, turns
		// negative and would then be dropped without being collected.
		long number1 = -1;
		// Value of the current run of plain digits; -1 when there is none.
		long number2 = -1;
		int bitValue = 0;
		int maxUnit = 0;
		// Guards against text that consists of units only (e.g. a lone
		// "ten-thousand" or "thousand" character), which is not a number.
		boolean hasDigit = false;
		for (; curTail < limit
				&& (bitValue = CharSet.toNumber(beef.charAt(curTail))) >= 0; curTail++) {
			// The "liang"/"lia" forms of two are accepted only as the first
			// character of a numeral.
			// NOTE(review): presumably because elsewhere they are more
			// likely a measure word than a digit - confirm.
			if (bitValue == 2 && curTail != offset
					&& isLiangForm(beef.charAt(curTail))) {
				break;
			}
			if (bitValue < 10) {
				// Consecutive plain digits are concatenated:
				// "three four five six" -> 3456.
				hasDigit = true;
				if (number2 < 0) {
					number2 = bitValue;
				} else {
					number2 *= 10;
					number2 += bitValue;
				}
			} else {
				// A unit character (ten, hundred, thousand, ...).
				if (number2 < 0) {
					// Unit without preceding digits: scale what we have,
					// treating "nothing yet" as 1.
					if (number1 < 0) {
						number1 = 1;
					}
					number1 *= bitValue;
				} else {
					if (number1 < 0) {
						number1 = 0;
					}
					if (bitValue >= maxUnit) {
						// A unit at least as large as any seen so far scales
						// the whole accumulated value.
						number1 += number2;
						number1 *= bitValue;
						maxUnit = bitValue;
					} else {
						// A smaller unit scales only the pending digits.
						number1 += number2 * bitValue;
					}
				}
				number2 = -1;
			}
		}
		if (!hasDigit) {
			return offset;
		}
		// Fold in trailing digits. ">= 0" rather than "> 0": with "> 0" a
		// numeral consisting only of zeros left number1 at -1, so nothing
		// was collected while curTail was still returned - the caller then
		// discarded the numeral together with the pending bigram text.
		if (number2 >= 0) {
			if (number1 < 0) {
				number1 = number2;
			} else {
				number1 += number2;
			}
		}
		if (number1 >= 0) {
			// Bigram-dissect the pending text first so tokens stay in order.
			if (offset > binOffset) {
				binDissect(collector, beef, binOffset, offset);
			}
			collector.collect(String.valueOf(number1), offset, curTail);
			
			if (units != null) {
				// The numeral may be followed by a measurement unit.
				Hit wd = null;
				Hit wd2 = null;
				int i = curTail + 1;
				// "i <= limit" keeps the lookup inside the isolated string;
				// without it a unit token could extend past limit (and past
				// the end of the beef).
				while (i <= limit
						&& (wd = units.search(beef, curTail, i - curTail))
								.isHit()) {
					wd2 = wd;
					i ++;
					if (!wd.isUnclosed()) {
						break;
					}
				}
				// i is one past the end of the last hit.
				i --;
				if (wd2 != null) {
					collector.collect(wd2.getWord().getText(), curTail, i);
					return i;
				}
			}
		}

		// Position of the first character that is not part of the numeral:
		// everything before curTail has been recognised as a Chinese numeral.
		return curTail;
	}

	/**
	 * Tells whether <code>ch</code> is one of the "liang"/"lia" characters
	 * meaning two, in simplified or traditional form.
	 */
	private static boolean isLiangForm(char ch) {
		// The original compared against the same traditional character
		// twice; both scripts are listed here so neither variant is missed.
		return ch == '两' || ch == '兩' || ch == '俩' || ch == '倆';
	}

	/**
	 * Skips the longest noise word (two characters or more) that starts at
	 * <code>offset</code>.
	 * <p>
	 * If a noise word is found, the pending text
	 * <code>[binOffset, offset)</code> is bigram-dissected first.
	 * 
	 * @param collector
	 *            receives the tokens of the pending text
	 * @param beef
	 *            the text being analysed
	 * @param offset
	 *            position at which a noise word may start
	 * @param end
	 *            end (exclusive) of the isolated string
	 * @param binOffset
	 *            start of the text still waiting for bigram dissection
	 * @return the position right after the noise word, or
	 *         <code>offset</code> if there is none
	 */
	protected int skipNoiseWords(Collector collector, Beef beef, int offset,
			int end, int binOffset) {
		// The start must stay fixed while longer candidates are tried. The
		// original reassigned offset after the first hit, so later lookups
		// examined a different (one-character) string.
		final int start = offset;
		int noiseEnd = offset;
		boolean pendingFlushed = false;
		for (int k = start + 2; k <= end; k++) {
			Hit word = noiseWords.search(beef, start, k - start);
			if (word.isHit()) {
				// Flush the pending text once. "binOffset >= 0" rather than
				// "> 0": pending text that starts at position 0 must be
				// flushed as well.
				if (!pendingFlushed && binOffset >= 0 && start > binOffset) {
					binDissect(collector, beef, binOffset, start);
					pendingFlushed = true;
				}
				noiseEnd = k;
			}
			// Stop as soon as no longer noise word can start with this text.
			if (word.isUndefined() || !word.isUnclosed()) {
				break;
			}
		}
		return noiseEnd;
	}

	/**
	 * Bigram-dissects <code>[offset, limit)</code>. With W, X, Y, Z standing
	 * for single characters:
	 * 
	 * <pre>
	 * X    -&gt; X          a single character is one token
	 * XY   -&gt; XY         two characters are one token
	 * XYZ  -&gt; XY/YZ      three or more characters are paired up
	 * WXYZ -&gt; WX/XY/YZ
	 * </pre>
	 * 
	 * An empty range produces no token.
	 * 
	 * @param collector
	 *            receives the tokens
	 * @param beef
	 *            the text being analysed
	 * @param offset
	 *            start (inclusive) of the range
	 * @param limit
	 *            end (exclusive) of the range
	 */
	protected void binDissect(Collector collector, Beef beef, int offset,
			int limit) {
		if (limit - offset == 1) {
			collector.collect(beef.subSequence(offset, limit).toString(),
					offset, limit);
			return;
		}
		// Every pair of adjacent characters becomes a token.
		for (int curOffset = offset; curOffset < limit - 1; curOffset++) {
			collector.collect(beef.subSequence(curOffset, curOffset + 2)
					.toString(), curOffset, curOffset + 2);
		}
	}

	/**
	 * Tells whether <code>[offset, end)</code> is enclosed in a matching
	 * pair of quotation marks, book-title marks or angle brackets and
	 * should therefore be emitted as one word.
	 * 
	 * @param beef
	 *            the text being analysed
	 * @param offset
	 *            start (inclusive) of the candidate
	 * @param end
	 *            end (exclusive) of the candidate
	 * @return true if the character before the candidate and the character
	 *         after it form one of the recognised pairs
	 */
	protected boolean shouldBeWord(Beef beef, int offset, int end) {
		// Nothing precedes a candidate at the very start of the beef, so it
		// cannot be enclosed; this also avoids reading position -1.
		if (offset <= 0) {
			return false;
		}
		char prevChar = beef.charAt(offset - 1);
		char endChar = beef.charAt(end);
		switch (prevChar) {
		// Chinese double and single quotation marks
		case '“':
			return endChar == '”';
		case '‘':
			return endChar == '’';
		// ASCII single and double quotation marks
		case '\'':
			return endChar == '\'';
		case '\"':
			return endChar == '\"';
		// Chinese book-title marks
		case '《':
			return endChar == '》';
		case '〈':
			return endChar == '〉';
		// ASCII angle brackets
		case '<':
			return endChar == '>';
		default:
			return false;
		}
	}

}
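// File size: 3391 K
// Uploaded by: a83133937
// Category: Java programming
// Related tags:
//
// #Lucene #封裝 #分
// Keyboard shortcuts
//
// Copy code: Ctrl + C
// Search code: Ctrl + F
// Full-screen mode: F11
// Switch theme: Ctrl + Shift + D
// Show shortcuts: ?
// Increase font size: Ctrl + =
// Decrease font size: Ctrl + -
// 亚洲欧美第一页_禁久久精品乱码_粉嫩av一区二区三区免费野_久草精品视频
//
// cjkknife.java
//
// Keyboard shortcuts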