// AdjustSeg.java
package org.ictclas4j.segment;
import java.util.ArrayList;
import org.ictclas4j.bean.SegNode;
import org.ictclas4j.utility.NumUtil;
import org.ictclas4j.utility.POSTag;
import org.ictclas4j.utility.Utility;
/**
 * Post-processing of word segmentation results.
 *
 * <p>NOTE(review): several string literals in this file look like they went through a
 * simplified-to-traditional Chinese conversion. Confirm they match the character set used by
 * the dictionaries and the segmenter input.
 *
 * @author sinboy
 * @since 2007.6.1
 */
public class AdjustSeg {

    /**
     * Adjusts the result of the first segmentation pass, mainly by merging or splitting
     * time, date and number tokens.
     *
     * <p>Consecutive numeric nodes are concatenated into one word; a unit such as 月/日/年 that
     * follows a number is folded into the previous result node, which is then marked as a time;
     * consecutive delimiters are merged into the previous delimiter node.
     *
     * @param sgs the nodes produced by the first segmentation pass; may be {@code null}
     * @return a new list with the adjusted nodes, or {@code null} if {@code sgs} is {@code null}
     */
    public static ArrayList<SegNode> firstAdjust(ArrayList<SegNode> sgs) {
        if (sgs == null) {
            return null;
        }

        ArrayList<SegNode> wordResult = new ArrayList<SegNode>();
        for (int i = 0; i < sgs.size(); i++) {
            SegNode sn = sgs.get(i);
            String curWord = sn.getSrcWord();
            String srcWord = null;
            int pos = sn.getPos();
            // Index of the last input node that has been folded into curWord.
            int index = i;

            boolean isNum = Utility.isAllNum(curWord) || Utility.isAllChineseNum(curWord);
            if (isNum) {
                // Merge the run of numeric nodes that directly follows the current one.
                // NOTE(review): the bound excludes the last node of the list, presumably
                // because it is an end-of-sentence sentinel - confirm against the caller.
                for (int j = i + 1; j < sgs.size() - 1; j++) {
                    String temp = sgs.get(j).getSrcWord();
                    if (Utility.isAllNum(temp) || Utility.isAllChineseNum(temp)) {
                        index = j;
                        curWord += temp;
                    } else {
                        break;
                    }
                }
            }

            if (!isNum) {
                // Not a number: it may still combine with the previous result node
                // (delimiter run, or number + date/time unit). Otherwise it is added as-is.
                SegNode prevsn = wordResult.isEmpty() ? null : wordResult.get(wordResult.size() - 1);
                if (Utility.isDelimiter(curWord)) {
                    if (prevsn != null && Utility.isDelimiter(prevsn.getWord())) {
                        // The previous node is a delimiter too: merge into it.
                        prevsn.setCol(sn.getCol());
                        prevsn.appendWord(curWord);
                        continue;
                    }
                    pos = POSTag.PUNC;
                } else if ((curWord.length() == 1 && "月日時分秒".indexOf(curWord) != -1)
                        || "月份".equals(curWord)) {
                    if (prevsn != null && prevsn.getPos() == -POSTag.NUM) {
                        foldIntoPreviousAsTime(prevsn, sn, curWord);
                        continue;
                    }
                } else if ("年".equals(curWord)) {
                    if (prevsn != null && Utility.isYearTime(prevsn.getSrcWord())) {
                        foldIntoPreviousAsTime(prevsn, sn, curWord);
                        continue;
                    }
                }
            } else if (NumUtil.isNumStrNotNum(curWord)) {
                // The string consists of numeric characters but is not a number:
                // copy the original nodes to the result unchanged.
                for (int k = i; k <= index; k++) {
                    wordResult.add(sgs.get(k));
                }
                // Skip every node that was just copied. Without this the nodes
                // i+1..index were processed (and added) a second time.
                i = index;
                continue;
            } else {
                // A real number. For a form such as "3-4月" (number, delimiter, number)
                // the current element is tagged as a plain number.
                boolean afterNumDelimiter = false;
                int size = wordResult.size();
                if (size > 1) {
                    SegNode prevPrevsn = wordResult.get(size - 2);
                    SegNode prevsn = wordResult.get(size - 1);
                    if (NumUtil.isNumDelimiter(prevPrevsn.getPos(), prevsn.getWord())) {
                        pos = POSTag.NUM;
                        afterNumDelimiter = true;
                    }
                }
                if (!afterNumDelimiter) {
                    // The literal below was "點(diǎn)" in the scraped source; the pinyin
                    // annotation is a conversion artifact that made this branch unreachable
                    // for any non-empty word, so it has been removed.
                    if (curWord.indexOf("點") == curWord.length() - 1) {
                        pos = -POSTag.TIME;
                        srcWord = curWord;
                        curWord = Utility.UNKNOWN_TIME;
                    } else if (curWord.length() > 1) {
                        String last = curWord.substring(curWord.length() - 1);
                        // If the last character is none of the punctuation marks below the
                        // whole word is a number; otherwise the trailing mark is split off.
                        if ("∶·././".indexOf(last) == -1) {
                            pos = -POSTag.NUM;
                            srcWord = curWord;
                            curWord = Utility.UNKNOWN_NUM;
                        } else if (".".equals(last) || "/".equals(last)) {
                            pos = -POSTag.NUM;
                            srcWord = curWord.substring(0, curWord.length() - 1);
                            curWord = Utility.UNKNOWN_NUM;
                            // Step back so the trailing mark is processed again on its own.
                            index--;
                        } else if (curWord.length() > 2) {
                            // NOTE(review): stripping two characters and stepping back two
                            // nodes looks like a leftover from byte-based (double-byte)
                            // arithmetic - confirm the intended behaviour.
                            pos = -POSTag.NUM;
                            srcWord = curWord.substring(0, curWord.length() - 2);
                            curWord = Utility.UNKNOWN_NUM;
                            index -= 2;
                        }
                    }
                }
            }

            int col = index > i ? sgs.get(index).getCol() : sn.getCol();
            SegNode newsn = new SegNode();
            newsn.setCol(col);
            newsn.setRow(sn.getRow());
            newsn.setWord(curWord);
            newsn.setPos(pos);
            newsn.setValue(sn.getValue());
            if (srcWord != null) {
                newsn.setSrcWord(srcWord);
            }
            wordResult.add(newsn);

            // Jump over the merged nodes, but never move backwards: when the step-back
            // above left index < i, assigning i = index made the loop revisit the same
            // node forever and grow the result without bound.
            if (index > i) {
                i = index;
            }
        }
        return wordResult;
    }

    /**
     * Folds a date/time unit into the previous result node and marks that node as a time.
     */
    private static void foldIntoPreviousAsTime(SegNode prevsn, SegNode sn, String curWord) {
        prevsn.setCol(sn.getCol());
        prevsn.setWord(Utility.UNKNOWN_TIME);
        prevsn.setSrcWord(prevsn.getSrcWord() + curWord);
        prevsn.setPos(-POSTag.TIME);
    }

    /**
     * Performs the final adjustment of a segmentation path, mainly merging reduplicated
     * words (ABB, AA, AAB, AABB), place names with a suffix, words followed by 員, and runs
     * of letter/number tokens.
     *
     * <p>The method name (sic) is kept unchanged for existing callers. The splitting of
     * person names ("Rule 1") was already disabled in the original source, so
     * {@code personTagger} is only checked for {@code null}.
     *
     * @param optSegPath   the segmentation path to adjust
     * @param personTagger the person-name tagger; must not be {@code null}
     * @param placeTagger  the place-name tagger whose unknown-word dictionary is consulted for
     *                     place-name suffixes; must not be {@code null}
     * @return a new list of adjusted nodes, or {@code null} if {@code optSegPath} is
     *         {@code null} or empty or either tagger is {@code null}
     */
    public static ArrayList<SegNode> finaAdjust(ArrayList<SegNode> optSegPath, PosTagger personTagger,
            PosTagger placeTagger) {
        if (optSegPath == null || optSegPath.size() == 0 || personTagger == null || placeTagger == null) {
            return null;
        }

        ArrayList<SegNode> result = new ArrayList<SegNode>();
        final int total = optSegPath.size();
        for (int i = 0; i < total; i++) {
            SegNode wr = optSegPath.get(i);
            // The two nodes following wr (relative to the index at the start of this iteration).
            SegNode next = i + 1 < total ? optSegPath.get(i + 1) : null;
            SegNode afterNext = i + 2 < total ? optSegPath.get(i + 2) : null;
            // The merged node for this iteration; null means "no rule applied".
            SegNode merged = null;

            // Rule 2 for overlap words ABB, e.g. 一段段、一片片
            if (wr.getPos() == POSTag.NUM && afterNext != null && next.getLen() == 2
                    && next.getSrcWord().equals(afterNext.getSrcWord())) {
                merged = newNode(wr.getSrcWord() + next.getSrcWord() + afterNext.getSrcWord(), POSTag.NUM);
                i += 2;
            }
            // Rule 3 for overlap words AA
            else if (wr.getLen() == 2 && next != null && wr.getSrcWord().equals(next.getSrcWord())) {
                merged = newNode(wr.getSrcWord() + next.getSrcWord(), POSTag.ADJ);
                if (wr.getPos() == POSTag.VERB || next.getPos() == POSTag.VERB) {
                    merged.setPos(POSTag.VERB);
                }
                if (wr.getPos() == POSTag.NOUN || next.getPos() == POSTag.NOUN) {
                    merged.setPos(POSTag.NOUN);
                }
                i += 1;
                // AAB, e.g. 洗/洗/臉、蒙蒙亮. The null check guards the case where the AA
                // pair is the end of the path; the original read past the end of the list.
                if (afterNext != null && afterNext.getLen() == 2) {
                    if ((merged.getPos() == POSTag.VERB && afterNext.getPos() == POSTag.NOUN)
                            || (merged.getPos() == POSTag.ADJ && afterNext.getPos() == POSTag.ADJ)) {
                        merged.setWord(merged.getWord() + afterNext.getSrcWord());
                        i += 1;
                    }
                }
            }
            // Rule 4: AAB, e.g. 洗/洗澡
            else if (wr.getLen() == 2 && next != null
                    && (wr.getPos() == POSTag.VERB || wr.getPos() == POSTag.ADJ)
                    && next.getLen() == 4
                    && next.getSrcWord().indexOf(wr.getSrcWord()) == 0) {
                merged = newNode(wr.getWord() + next.getSrcWord(), POSTag.ADJ);
                if (wr.getPos() == POSTag.VERB || next.getPos() == POSTag.VERB) {
                    merged.setPos(POSTag.VERB);
                }
                i += 1;
            }
            // uj, ud, uv, uz, ul, ug -> u: collapse the sub-tag, then copy the node below.
            else if (wr.getPos() / 256 == 'u' && wr.getPos() % 256 != 0) {
                wr.setPos('u' * 256);
            }
            // AABB, e.g. 樸樸素素
            else if (wr.getLen() == 2 && afterNext != null && next.getLen() == 4
                    && next.getWord().indexOf(wr.getWord()) == 0
                    && next.getWord().indexOf(afterNext.getWord()) == 0) {
                merged = newNode(wr.getWord() + next.getWord() + afterNext.getWord(), next.getPos());
                i += 2;
            }
            // Place name + suffix
            else if (wr.getPos() == POSTag.NOUN_SPACE && next != null) {
                String suffix = next.getSrcWord();
                boolean matched = true;
                int mergedPos = 0;
                if (placeTagger.getUnknownDict().isExist(suffix, 4)) {
                    mergedPos = POSTag.NOUN_SPACE;
                } else if ("隊".equals(suffix)) {
                    // Literal repaired from "隊(duì)"; the pinyin annotation was a
                    // conversion artifact of the scraped source.
                    mergedPos = POSTag.NOUN_ORG;
                } else if (next.getLen() == 2 && "語文字杯".indexOf(suffix) != -1) {
                    mergedPos = POSTag.NOUN_ZHUAN;
                } else if ("裔".equals(suffix)) {
                    mergedPos = POSTag.NOUN;
                } else {
                    matched = false;
                }
                if (matched) {
                    merged = newNode(wr.getSrcWord() + suffix, mergedPos);
                    i += 1;
                }
            }
            // Verb / verbal noun / noun followed by 員 becomes a noun.
            else if (wr.getPos() == POSTag.VERB || wr.getPos() == POSTag.VERB_NOUN
                    || wr.getPos() == POSTag.NOUN) {
                if (next != null && "員".equals(next.getSrcWord())) {
                    merged = newNode(wr.getSrcWord() + next.getSrcWord(), POSTag.NOUN);
                    i += 1;
                }
            }
            // Runs of letter tokens, dots/dashes and numbers, e.g.
            // www/nx ./w sina/nx; EIM/nx -601/m; SHM/nx -/w 101/m
            else if (wr.getPos() == POSTag.NOUN_LETTER && next != null) {
                merged = newNode(wr.getSrcWord(), POSTag.NOUN_LETTER);
                // Bounded loop: the original "while (true)" read past the end of the list
                // when the run extended to the last node.
                while (i + 1 < total) {
                    SegNode nextSN = optSegPath.get(i + 1);
                    if (nextSN.getPos() == POSTag.NOUN_LETTER || "..--".indexOf(nextSN.getSrcWord()) != -1
                            || (nextSN.getPos() == POSTag.NUM && Utility.isAllNum(nextSN.getSrcWord()))) {
                        // Accumulate on the word that was set above; the original read
                        // getSrcWord(), which is never assigned on this new node in this method.
                        merged.setWord(merged.getWord() + nextSN.getSrcWord());
                        i++;
                    } else {
                        break;
                    }
                }
            }

            // No rule applied: copy the node to the final result unchanged.
            if (merged == null) {
                merged = newNode(wr.getSrcWord(), wr.getPos());
            }
            result.add(merged);
        }
        return result;
    }

    /**
     * Creates a result node carrying only a word and a part-of-speech tag.
     */
    private static SegNode newNode(String word, int pos) {
        SegNode node = new SegNode();
        node.setWord(word);
        node.setPos(pos);
        return node;
    }
}
// (Code-viewer keyboard-shortcut help text removed here; it was not part of the source file.)