// NumberKnife.java
// (code-viewer page control label: "Font size:")
/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.paoding.analysis.knife;
import net.paoding.analysis.dictionary.Dictionary;
import net.paoding.analysis.dictionary.Hit;
/**
 * Knife that cuts numeric tokens: a run of Arabic digits, optionally continued by
 * further numeral characters recognised by {@code CharSet.toNumber} (for example
 * Chinese numerals), and optionally followed by a measurement unit looked up in the
 * units dictionary.
 *
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 *
 */
public class NumberKnife extends CombinatoricsKnife implements DictionariesWare {

    /** Dictionary of measurement units; stays {@code null} until dictionaries are set. */
    private Dictionary units;

    /** Creates a knife without dictionaries; call {@link #setDictionaries} before use. */
    public NumberKnife() {
    }

    /**
     * Creates a knife and immediately wires in the given dictionaries.
     *
     * @param dictionaries source of the dictionaries, including the units dictionary
     */
    public NumberKnife(Dictionaries dictionaries) {
        setDictionaries(dictionaries);
    }

    /**
     * Passes the dictionaries to the superclass and caches the units dictionary.
     *
     * @param dictionaries source of the dictionaries, including the units dictionary
     */
    public void setDictionaries(Dictionaries dictionaries) {
        super.setDictionaries(dictionaries);
        units = dictionaries.getUnitsDictionary();
    }

    /**
     * Classifies the character at {@code index} for a token that starts at {@code offset}.
     *
     * @param beef   text being cut
     * @param offset start position of the candidate token
     * @param index  position of the character to classify
     * @return {@code ASSIGNED} for an Arabic digit, or for {@code . - _} that is directly
     *         followed by an Arabic digit; {@code POINT} for a letter after the first
     *         position, or for {@code . - _} not followed by a digit; {@code LIMIT}
     *         otherwise
     */
    public int assignable(Beef beef, int offset, int index) {
        char ch = beef.charAt(index);
        if (CharSet.isArabianNumber(ch)) {
            return ASSIGNED;
        }
        if (index > offset) {
            boolean letter = CharSet.isLantingLetter(ch);
            if (letter || ch == '.' || ch == '-' || ch == '_') {
                if (letter || !CharSet.isArabianNumber(beef.charAt(index + 1))) {
                    // Segmentation result:
                    //   123.456    -> 123.456/
                    //   123.abc.34 -> 123/123.abc.34/abc/34/
                    //     ["abc" and "abc/34" are produced by LetterKnife, not NumberKnife]
                    // Without the "|| !CharSet.isArabianNumber(beef.charAt(index + 1))"
                    // test the knife would emit "123." instead of "123":
                    //   123.abc.34 -> 123./123.abc.34/abc/34/
                    return POINT;
                }
                return ASSIGNED;
            }
        }
        return LIMIT;
    }

    /**
     * Collects the token ending at {@code limit}, extending it over trailing numeral
     * characters and a following measurement unit where possible.
     *
     * <p>When a POINT was recorded ({@code point != -1}) the work is delegated to the
     * superclass unchanged. Otherwise the text is re-scanned from {@code offset} with
     * {@code CharSet.toNumber}; if that scan runs past {@code limit} the decimal value of
     * the scanned run is collected, else the superclass collects as usual. Finally, if a
     * units dictionary is present and the next character is a CJK ideograph, the longest
     * matching unit found by the dictionary scan is collected as its own token.
     *
     * @param collector   receiver of the tokens
     * @param beef        text being cut
     * @param offset      start position of the token
     * @param point       position of the POINT character, or {@code -1} if there was none
     * @param limit       position at which {@code assignable} reported LIMIT
     * @param dicWordVote passed through to the superclass untouched
     * @return the position after the last collected character when the token was extended
     *         beyond {@code limit}; {@code -1} when it was not; or whatever the superclass
     *         returns when {@code point != -1}
     */
    protected int collectLimit(Collector collector, Beef beef,
            int offset, int point, int limit, int dicWordVote) {
        // Input such as "123abc" (a POINT was seen) goes straight to super.
        if (point != -1) {
            return super.collectLimit(collector, beef, offset, point, limit, dicWordVote);
        }
        //
        // 2.2兩
        //    ^=_point
        //
        final int _point = limit;
        // Position of the character currently being examined.
        int curTail = offset;
        // -1 means "nothing accumulated yet" for both accumulators.
        // long rather than int: a run of ten or more digits, or a product of large unit
        // values, exceeds int range and would wrap silently. long only raises that
        // ceiling (values beyond 19 digits still wrap).
        long number1 = -1; // running total
        long number2 = -1; // digits collected since the last unit character
        int bitValue = 0;
        int maxUnit = 0;
        // TODO: this scans again from curTail (== offset) and re-tests every character
        // for being a number, which repeats work already done by assignable(). The cost
        // is negligible for Chinese segmentation, so it is left unoptimised for now.
        for (; (bitValue = CharSet.toNumber(beef.charAt(curTail))) >= 0; curTail++) {
            // A "liang" character that is not the first character ends the number,
            // presumably because it may be acting as the unit (as in "2.2兩") rather
            // than as the digit 2.
            if (bitValue == 2 && curTail != offset && isLiangVariant(beef.charAt(curTail))) {
                break;
            }
            // Consecutive single-digit numerals are concatenated: "三四五六" -> "3456".
            if (bitValue < 10) {
                if (number2 < 0) {
                    number2 = bitValue;
                } else {
                    number2 *= 10;
                    number2 += bitValue;
                }
            } else {
                // bitValue is a unit multiplier (10 or larger).
                if (number2 < 0) {
                    // No pending digits: the unit scales the running total (or 1).
                    if (number1 < 0) {
                        number1 = 1;
                    }
                    number1 *= bitValue;
                } else {
                    if (number1 < 0) {
                        number1 = 0;
                    }
                    if (bitValue >= maxUnit) {
                        // A unit at least as large as any seen so far scales everything
                        // accumulated up to now.
                        number1 += number2;
                        number1 *= bitValue;
                        maxUnit = bitValue;
                    } else {
                        // A smaller unit scales only the pending digits.
                        number1 += number2 * bitValue;
                    }
                }
                number2 = -1;
            }
        }
        // Fold in digits that were not followed by a unit.
        if (number2 > 0) {
            if (number1 < 0) {
                number1 = number2;
            } else {
                number1 += number2;
            }
        }
        if (number1 >= 0 && curTail > _point) {
            // The numeral run reaches beyond the Arabic-digit part: emit its value.
            doCollect(collector, String.valueOf(number1), beef, offset, curTail);
        } else {
            super.collectLimit(collector, beef, offset, point, limit, dicWordVote);
        }
        curTail = Math.max(curTail, limit);
        //
        // A measurement unit may follow the number.
        if (units != null && CharSet.isCjkUnifiedIdeographs(beef.charAt(curTail))) {
            Hit wd = null;
            Hit wd2 = null; // last successful hit
            int i = curTail + 1;
            // NOTE(review): the scan stops at the first length that is not a hit; whether
            // a non-hit prefix of a longer unit should keep the scan going depends on Hit
            // semantics that are not visible here — confirm.
            while ((wd = units.search(beef, curTail, i - curTail)).isHit()) {
                wd2 = wd;
                i++;
                if (!wd.isUnclosed()) {
                    break;
                }
            }
            i--; // step back to the end of the last hit
            if (wd2 != null) {
                collector.collect(wd2.getWord().getText(), curTail, i);
                return i;
            }
        }
        //
        return curTail > limit ? curTail : -1;
    }

    /**
     * Tells whether {@code ch} is a "liang" character in traditional or simplified form.
     *
     * <p>The earlier condition listed '倆' twice, which suggests that a character-set
     * conversion collapsed distinct literals; both script variants are accepted so the
     * check keeps working regardless of the script used by the input text.
     */
    private static boolean isLiangVariant(char ch) {
        return ch == '兩' || ch == '两' || ch == '倆' || ch == '俩';
    }
}
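/*
 * Code-viewer page residue, not part of NumberKnife.java — keyboard shortcut help:
 *   Copy code            Ctrl + C
 *   Search code          Ctrl + F
 *   Full-screen mode     F11
 *   Switch theme         Ctrl + Shift + D
 *   Show shortcuts       ?
 *   Increase font size   Ctrl + =
 *   Decrease font size   Ctrl + -
 */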