?? segment.java
字號:
package com.gftech.ictclas4j.segment;
import java.util.ArrayList;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import com.gftech.ictclas4j.bean.Atom;
import com.gftech.ictclas4j.bean.PersonName;
import com.gftech.ictclas4j.bean.SegGraph;
import com.gftech.ictclas4j.bean.Sentence;
import com.gftech.ictclas4j.bean.WordResult;
import com.gftech.ictclas4j.run.Config;
import com.gftech.ictclas4j.utility.Dictionary;
import com.gftech.ictclas4j.utility.DynamicArray;
import com.gftech.ictclas4j.utility.Utility;
import com.gftech.util.GFString;
public class Segment {
// Dictionary received as the constructor's `dict` argument; handed to GraphGenerate
// and to the unknown-word recognizers.
private Dictionary coreDict;
// Dictionary received as the constructor's `biDict` argument; handed to GraphGenerate.
private Dictionary bigramDict;
// The source text to segment.
private String src;
private int[] wordPosMapTable;// Stores, for each word, its absolute position in the table (row * Utility.MAX_SENTENCE_LEN + col)
// Smoothing parameter passed to GraphGenerate.biGenerate; set to 0.1 in the constructor.
private double smoothParam;
// Holds the construction timestamp (ms) until split() finishes, then the elapsed time in ms.
private long spendTime;
// The segmentation output produced by split(); null when src is null.
private String splitedWord;
// Unknown-word recognizer configured with the "nr" data and TT_PERSON.
private UnknownSeg unPerson;
// Unknown-word recognizer configured with the "tr" data and TT_TRANS_PERSON.
private UnknownSeg unTransPerson;
// Unknown-word recognizer configured with the "ns" data.
private UnknownSeg unPlace;
// Class-wide log4j logger.
static Logger logger = Logger.getLogger(Segment.class);
/**
 * Creates a segmenter for the given text and segments it immediately.
 *
 * <p>The constructor configures log4j, stores the dictionaries, sets up the
 * three unknown-word recognizers and then calls {@code split()}, so the
 * result is available as soon as construction returns.
 *
 * @param src
 *            the text to segment; may be {@code null}, in which case no
 *            segmentation output is produced
 * @param dict
 *            the core dictionary
 * @param biDict
 *            the bigram dictionary
 */
public Segment(String src, Dictionary dict, Dictionary biDict) {
    PropertyConfigurator.configure(Config.LOG4J_CONF);
    this.src = src;
    this.coreDict = dict;
    this.bigramDict = biDict;
    this.smoothParam = 0.1;
    // Start of the timing window; split() replaces this with the elapsed time.
    this.spendTime = System.currentTimeMillis();

    unPerson = new UnknownSeg();
    unTransPerson = new UnknownSeg();
    unPlace = new UnknownSeg();

    // Use '/' rather than '\\' as the separator: Java accepts '/' on Windows
    // as well as on Unix-like systems, whereas '\\' only works on Windows.
    // (Presumably these are file path prefixes resolved by configure().)
    unPerson.configure("data/nr", Utility.TAG_TYPE.TT_PERSON);
    unTransPerson.configure("data/tr", Utility.TAG_TYPE.TT_TRANS_PERSON);
    // NOTE(review): the place recognizer is configured with TT_TRANS_PERSON,
    // the same tag type as unTransPerson. This looks like a copy-paste slip;
    // confirm whether a place-specific tag type was intended.
    unPlace.configure("data/ns", Utility.TAG_TYPE.TT_TRANS_PERSON);

    split();
}
/**
 * Segments {@code src} sentence by sentence and stores the outcome in
 * {@code splitedWord}; also converts {@code spendTime} from a start
 * timestamp into the elapsed time in milliseconds.
 *
 * <p>For each sentence flagged as segmentable the method: splits it into
 * atoms, builds the word graph and the bigram graph, takes the single
 * shortest path, runs the three unknown-word recognizers on that first
 * pass, rebuilds the bigram graph from the optimized word graph, takes the
 * shortest path again, POS-tags the result and appends the formatted
 * output. Sentences not flagged as segmentable are appended verbatim.
 *
 * <p>When {@code src} is {@code null}, {@code splitedWord} is set to
 * {@code null}.
 */
private void split() {
    StringBuilder result = null;
    if (src != null) {
        result = new StringBuilder();
        SentenceSeg ss = new SentenceSeg(src);
        ArrayList<Sentence> sens = ss.getSens();
        for (Sentence sen : sens) {
            logger.debug(sen);
            if (!sen.isSeg()) {
                result.append(sen.getContent());
                continue;
            }

            AtomSeg as = new AtomSeg(sen.getContent());
            ArrayList<Atom> atoms = as.getAtoms();
            for (Atom atom : atoms) {
                logger.info(atom);
            }

            // First pass: word graph -> bigram graph -> shortest path.
            GraphGenerate gg = new GraphGenerate(coreDict, bigramDict);
            ArrayList<SegGraph> sgs = gg.generate(atoms);
            logger.info(sgs);
            wordPosMapTable = buildWordPosMapTable(sgs);
            ArrayList<SegGraph> biSgs = gg.biGenerate(sgs, wordPosMapTable, smoothParam);
            logger.info(biSgs);
            NShortPath nsp = new NShortPath(biSgs, 1);
            int[] bipath = nsp.getNShortPath(true);
            int[] unipath = bipath2unipath(wordPosMapTable, bipath);
            ArrayList<SegGraph> segPath = getSegPath(atoms, sgs, unipath);
            ArrayList<WordResult> rs = generateWord(segPath);
            for (WordResult wr : rs) {
                logger.info(wr);
            }

            // Optimize the segmentation result with the unknown-word recognizers.
            DynamicArray optSeg = new DynamicArray();
            optSeg.setSgs(segPath);
            unPerson.recognition(rs, optSeg, coreDict);
            unTransPerson.recognition(rs, optSeg, coreDict);
            unPlace.recognition(rs, optSeg, coreDict);

            // Second pass over the optimized word graph.
            ArrayList<SegGraph> optsgs = optSeg.getSgs();
            wordPosMapTable = buildWordPosMapTable(optsgs);
            ArrayList<SegGraph> biSgs2 = gg.biGenerate(optsgs, wordPosMapTable, smoothParam);
            logger.info("bisgs2:" + biSgs2);
            NShortPath nsp2 = new NShortPath(biSgs2, 1);
            int[] bipath2 = nsp2.getNShortPath(true);
            int[] unipath2 = bipath2unipath(wordPosMapTable, bipath2);
            ArrayList<SegGraph> segPath2 = getSegPath(atoms, optsgs, unipath2);
            ArrayList<WordResult> rs2 = generateWord(segPath2);
            for (WordResult wr : rs2) {
                logger.info(wr);
            }

            unPerson.roleTag.setType(Utility.TAG_TYPE.TT_NORMAL);
            unPerson.roleTag.posTagging(rs2, coreDict, unPerson.unDict);

            // outputResult returns null when there is nothing to print;
            // appending that directly would insert the literal text "null".
            String formatted = outputResult(adjust(rs2));
            if (formatted != null) {
                result.append(formatted);
            }
        }
    }
    spendTime = System.currentTimeMillis() - spendTime;
    splitedWord = (result == null) ? null : result.toString();
}

/**
 * Builds the word position table for a word graph: entry {@code i} is
 * {@code row * Utility.MAX_SENTENCE_LEN + col} of the i-th graph element.
 */
private int[] buildWordPosMapTable(ArrayList<SegGraph> graph) {
    int[] table = new int[graph.size()];
    for (int i = 0; i < table.length; i++) {
        SegGraph sg = graph.get(i);
        table[i] = sg.getRow() * Utility.MAX_SENTENCE_LEN + sg.getCol();
    }
    return table;
}
/**
 * Formats a list of word results as a space-terminated sequence of
 * {@code word/XY} tokens, where {@code X} and {@code Y} are the high and
 * low bytes of the word's handle interpreted as characters.
 *
 * <p>The first and last entries are removed from {@code wrList} before
 * formatting (presumably the sentence begin/end markers), so the caller's
 * list is modified.
 *
 * @param wrList
 *            the word results to format; modified in place
 * @return the formatted text, or {@code null} when {@code wrList} is
 *         {@code null} or holds two entries or fewer
 */
private String outputResult(ArrayList<WordResult> wrList) {
    if (wrList == null || wrList.size() <= 2) {
        return null;
    }

    // Strip the leading and trailing entries before printing.
    wrList.remove(0);
    wrList.remove(wrList.size() - 1);

    StringBuilder out = new StringBuilder();
    for (int k = 0; k < wrList.size(); k++) {
        WordResult item = wrList.get(k);
        char high = (char) (item.getHandle() / 256);
        char low = (char) (item.getHandle() % 256);
        out.append(item.getWord()).append("/").append(high).append(low).append(" ");
    }
    return out.toString();
}
/**
 * Maps each index of a bigram path to a row number via the word position
 * table and returns the cleaned-up result.
 *
 * <p>For every element {@code b} of {@code bipath}, the value
 * {@code wordPosMapTable[b] / Utility.MAX_SENTENCE_LEN} is stored. The
 * working array has one slot more than {@code bipath}; that trailing slot
 * keeps its default value 0. The array is then passed through
 * {@code Utility.removeInvalid}.
 *
 * @param wordPosMapTable
 *            the word position table
 * @param bipath
 *            indices into {@code wordPosMapTable}
 * @return the output of {@code Utility.removeInvalid}, or {@code null}
 *         when either argument is {@code null}
 */
private int[] bipath2unipath(int[] wordPosMapTable, int[] bipath) {
    if (bipath == null || wordPosMapTable == null) {
        return null;
    }

    int[] rows = new int[bipath.length + 1];
    int k = 0;
    while (k < bipath.length) {
        int encoded = wordPosMapTable[bipath[k]];
        rows[k] = encoded / Utility.MAX_SENTENCE_LEN;
        k++;
    }
    return Utility.removeInvalid(rows);
}
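/**
 * Generates the final word segmentation path.
 *
 * @param sgs
 *            the segmentation path (the unique path derived from the
 *            bigram path)
 * @return the word results for the path
 */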
private ArrayList<WordResult> generateWord(ArrayList<SegGraph> sgs) {
WordResult[] wordResult = null;
int index = 0;
int nPOS = 0;
double fValue = 0;
int j = 0;
boolean isNum = false;
if (sgs != null) {
wordResult = new WordResult[sgs.size()];
for (int i = 0; i < sgs.size(); i++, index++) {
// String curWord = null;
String snum = sgs.get(i).getWord();
nPOS = sgs.get(i).getPos();
fValue = sgs.get(i).getValue();
for (j = i; j < sgs.size() - 1; j++) {
snum += sgs.get(j + 1).getWord();
if (Utility.isAllNum(snum) || Utility.isAllChineseNum(snum)) {
isNum = true;
wordResult[index] = new WordResult();
wordResult[index].setWord(snum);
} else
break;
}
i = j;
if (!isNum) {
wordResult[index] = new WordResult();
wordResult[index].setWord(sgs.get(i).getWord());
// curWord = sgs.get(i).getWord();
} else {
WordResult wr = wordResult[index];
String word = wr.getWord();
if (word.length() == 2 && "第上成±—+∶·./".indexOf(word) != -1 || word.length() == 1
&& "+-./".indexOf(word) != -1) {
// curWord = word;
}
// 是一個數字
else {
if ("--".equals(word) || "—".equals(word) || "-".equals(word))// The
// delimiter
// "--"
{
nPOS = 30464;// 'w'*256;Set the POS with 'w'
} else {// Adding time suffix
String first = word.substring(0, 1);
if (index > 0
&& (Math.abs(wordResult[index - 1].getHandle()) == 27904 || Math
.abs(wordResult[index - 1].getHandle()) == 29696)
&& ("—".equals(first) || "-".equals(first)) && (word.length()) > 1) {// 3-4月
// //27904='m'*256
// 29696='t'*256
// Split the sInitChar from the original word
wordResult[index + 1].setWord(word.substring(1));
wordResult[index + 1].setValue(wordResult[index].getValue());
wordResult[index + 1].setHandle(27904);
wr.setWord(word.substring(0, 1));
wr.setValue(0);
wr.setHandle(30464);// 'w'*256;
// TODO
// Utility.insertGraph(optGraph,sg,false);
}
// int len = word.length();
String atom = sgs.get(i + 1).getWord();
if (atom.length() == 1 && "月日時分秒".indexOf(atom) != -1 || "月份".equals(atom)) {// 2001年
wr.setWord(word + atom);
// curWord = "未##時";
nPOS = -29696;// 't'*256;//Set the POS with
i++;
// 'm'
} else if ("年".equals(atom)) {
if (Utility.isYearTime(word))// strncmp(sAtom,"年",2)==0&&
{// 1998年,
wordResult[index].setWord(word + atom);
// curWord = "未##時";
nPOS = -29696;// Set the POS with 't'
i++;
} else {
// curWord = "未##數";
nPOS = -27904;// Set the POS with 'm'
}
} else {
// 早晨/t 五點/t
if (word.indexOf("點") == word.length() - 1) {
// curWord = "未##時";
nPOS = -29696;// Set the POS with 't'
} else {
if ("∶·./".indexOf(word.substring(word.length() - 1)) == -1
&& !".".equals(word.substring(word.length() - 1))
&& !"/".equals(word.substring(word.length() - 1))) {
// curWord = "未##數";
// 'm'*256;Set the POS with 'm'
nPOS = -27904;
}
// Get rid of .example 1.
else if (word.length() > 1) {
if (".".equals(word.substring(word.length() - 1))
|| !"/".equals(word.substring(word.length() - 1)))
wr.setWord(word.substring(0, word.length() - 1));
else
wr.setWord(word.substring(0, word.length() - 2));
// curWord = "未##數";
nPOS = -27904;// 'm'*256;Set the POS
// with 'm'
}
}
}
}
fValue = 0;
}
}
wordResult[index].setHandle(nPOS);
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -