// span.java
package com.gftech.ictclas4j.bean;
import java.util.ArrayList;
import com.gftech.ictclas4j.utility.ContextStat;
import com.gftech.ictclas4j.utility.Dictionary;
import com.gftech.ictclas4j.utility.Utility;
import com.gftech.ictclas4j.utility.Utility.TAG_TYPE;
public class Span {
// Context statistics used by disamb() and getFrom(); created and loaded by loadContext().
public ContextStat context;
// Current tagging mode. It selects the begin/end tag codes (0/1 for TT_NORMAL,
// 100/101 otherwise) and the post-processing step in posTagging().
TAG_TYPE tagType;
// Candidate tags per word position; each row is terminated by -1.
// Row 0 holds the begin tag (or the tag carried over by reset(true)).
int[][] m_nTags;
// For each (word, candidate) pair, the index of the best candidate of the
// previous word, as computed by disamb().
int[][] m_nBestPrev;
// Running position; getFrom() sets it to the end position of the last word read
// and reset(false) sets it back to 0.
int m_nStartPos;
// Tag chosen for each word by getBestPOS(); the sequence is terminated by -1.
int[] m_nBestTag;
// Number of rows currently in use (row 0 included).
int m_nCurLength;
// Words of the current span; a null entry marks a virtual begin/end word.
String[] m_sWords;
// Cost per candidate tag (differences of logarithms, see getFrom());
// disamb() adds the best predecessor cost on top.
double[][] m_dFrequency;
// NOTE(review): only allocated in the visible code (two ints per entry);
// presumably filled by the recognizers - confirm.
public int[][] m_nUnknownWords;
// Set to 0 by reset(false). NOTE(review): presumably the number of entries
// in m_nUnknownWords - confirm.
public int m_nUnknownIndex;
// Cumulative position of each word; entry i + 1 is entry i plus the length of word i.
public int[] m_nWordPosition;
// NOTE(review): only allocated in the visible code; presumably one score per
// unknown word - confirm.
public double[] m_dWordsPossibility;
/**
 * Creates a span in normal tagging mode ({@code TT_NORMAL}) and allocates
 * all working tables.
 */
public Span() {
    // The mode must be fixed before the begin tag is chosen. Previously the
    // (still null) mode was compared first, so the begin tag was always 100
    // even though the instance ended up in TT_NORMAL mode.
    tagType = Utility.TAG_TYPE.TT_NORMAL;

    m_nTags = new int[Utility.MAX_WORDS_PER_SENTENCE][Utility.MAX_POS_PER_WORD];
    // Begin tag: 0 in TT_NORMAL mode (100 is used for the other modes, see reset).
    m_nTags[0][0] = 0;
    // -1 terminates the candidate list of row 0.
    m_nTags[0][1] = -1;

    m_nBestPrev = new int[Utility.MAX_WORDS_PER_SENTENCE][Utility.MAX_POS_PER_WORD];
    m_nBestTag = new int[Utility.MAX_WORDS_PER_SENTENCE];
    m_sWords = new String[Utility.MAX_WORDS_PER_SENTENCE];
    m_nUnknownWords = new int[Utility.MAX_UNKNOWN_PER_SENTENCE][2];
    m_nWordPosition = new int[Utility.MAX_WORDS_PER_SENTENCE];
    m_dWordsPossibility = new double[Utility.MAX_UNKNOWN_PER_SENTENCE];
    m_dFrequency = new double[Utility.MAX_WORDS_PER_SENTENCE][Utility.MAX_POS_PER_WORD];
}
/**
 * Creates a fresh {@link ContextStat} and loads it from the given file.
 *
 * @param fileName file to load; when {@code null} nothing is created
 * @return the result of {@code ContextStat.load}, or {@code false} when
 *         {@code fileName} is {@code null}
 */
public boolean loadContext(String fileName) {
    if (fileName == null) {
        return false;
    }
    context = new ContextStat();
    return context.load(fileName);
}
/**
 * Selects the tagging mode used by the subsequent calls on this span.
 *
 * @param type the new tagging mode
 */
public void setType(TAG_TYPE type) {
    this.tagType = type;
}
/**
 * Tags the given word list span by span. Each round reads one span with
 * {@code getFrom}, picks the best tag sequence with {@code getBestPOS} and
 * then applies the mode-specific step for the current tagging mode.
 *
 * @param wrList      words to tag; in normal mode their handle (and possibly
 *                    value) is updated in place
 * @param coreDict    core dictionary
 * @param unknownDict dictionary used by the recognition modes
 * @return {@code true} when the list was processed, {@code false} when
 *         {@code wrList} is {@code null}
 */
public boolean posTagging(ArrayList<WordResult> wrList, Dictionary coreDict, Dictionary unknownDict) {
    if (wrList == null) {
        return false;
    }
    int i = 0;
    reset(false);
    // getFrom returns -1 once the end of the list has been reached.
    while (i > -1 && i < wrList.size()) {
        int nStartPos = i; // index in wrList of the first word of this span
        i = getFrom(wrList, nStartPos, coreDict, unknownDict);
        getBestPOS();
        switch (tagType) {
        case TT_NORMAL: // normal POS tagging
            // Store the best tag of every word of the span. The bound is checked
            // before m_nBestTag[j] is read so the index can never run past the span.
            for (int j = 1; j < m_nCurLength && m_nBestTag[j] != -1; j++) {
                // Row j of the span corresponds to word (nStartPos + j - 1) of the list.
                WordResult wr = wrList.get(j + nStartPos - 1);
                wr.setHandle(m_nBestTag[j]);
                // For a word with a positive value that the core dictionary knows,
                // replace the value by the word's frequency under the chosen tag.
                if (wr.getValue() > 0 && coreDict.isExist(wr.getWord(), -1)) {
                    wr.setValue(coreDict.getFrequency(wr.getWord(), m_nBestTag[j]));
                }
            }
            break;
        case TT_PERSON: // person recognition
            PersonRecognize(unknownDict);
            break;
        case TT_PLACE: // place name recognition
        case TT_TRANS_PERSON: // transliterated person
            PlaceRecognize(coreDict, unknownDict);
            break;
        default:
            break;
        }
        // Carry the last tag of this span over as the begin tag of the next one.
        reset();
    }
    return true;
}
/**
 * Prepares row 0 of the working tables for the next span.
 *
 * @param isContinue {@code true} to carry the first tag and cost of the last
 *                   row of the previous span into row 0; {@code false} to
 *                   start over with the begin tag of the current mode
 * @return always {@code true}
 */
public boolean reset(boolean isContinue) {
    if (isContinue) {
        // Continue from the last row of the previous span.
        final int last = m_nCurLength - 1;
        m_nTags[0][0] = m_nTags[last][0];
        m_dFrequency[0][0] = m_dFrequency[last][0];
    } else {
        // Fresh start: begin tag is 0 in normal mode and 100 otherwise.
        m_nTags[0][0] = (tagType != Utility.TAG_TYPE.TT_NORMAL) ? 100 : 0;
        m_nUnknownIndex = 0;
        m_dFrequency[0][0] = 0;
        m_nStartPos = 0;
    }
    // -1 terminates the candidate list of row 0.
    m_nTags[0][1] = -1;
    m_nCurLength = 1;
    m_nWordPosition[1] = m_nStartPos;
    m_sWords[0] = null;
    return true;
}
/**
 * Resets the span while carrying over the last tag of the previous span;
 * shorthand for {@code reset(true)}.
 *
 * @return always {@code true}
 */
public boolean reset() {
    final boolean carryOverLastTag = true;
    return reset(carryOverLastTag);
}
/**
 * For every candidate tag of every word in the current span, finds the
 * candidate of the previous word with the lowest accumulated cost, records
 * its index in {@code m_nBestPrev} and adds that cost to
 * {@code m_dFrequency}.
 *
 * @return always {@code true}
 */
private boolean disamb() {
    // Kept across candidates on purpose: when a previous row has no candidate
    // the last minimum is reused, as before.
    double dMinFee = 0;
    for (int i = 1; i < m_nCurLength; i++) { // for every word
        for (int j = 0; m_nTags[i][j] >= 0; j++) { // for every candidate tag of the word
            // Sentinel meaning "no predecessor chosen yet".
            int nMinCandidate = Utility.MAX_POS_PER_WORD + 1;
            for (int k = 0; m_nTags[i - 1][k] >= 0; k++) { // for every candidate of the previous word
                // Transition cost plus the accumulated cost of the predecessor.
                double dTmp = -Math.log(context.getContextPossibility(0, m_nTags[i - 1][k], m_nTags[i][j]));
                dTmp += m_dFrequency[i - 1][k];
                // Accept the first candidate unconditionally, later ones only when
                // cheaper. The sentinel test uses MAX_POS_PER_WORD instead of the
                // former literal 10, which misfired once the best index exceeded 10.
                if (nMinCandidate > Utility.MAX_POS_PER_WORD || dTmp < dMinFee) {
                    nMinCandidate = k;
                    dMinFee = dTmp;
                }
            }
            m_nBestPrev[i][j] = nMinCandidate; // best predecessor for candidate j
            m_dFrequency[i][j] = m_dFrequency[i][j] + dMinFee;
        }
    }
    return true;
}
/**
 * Runs {@code disamb()} and then walks the span backwards along the
 * recorded best predecessors, storing the chosen tag of every real word in
 * {@code m_nBestTag}. The tag sequence is terminated with -1.
 *
 * @return always {@code true}
 */
public boolean getBestPOS() {
    disamb();

    // Start at the last row with its first candidate and follow the back pointers.
    int candidate = 0;
    int pos = m_nCurLength - 1;
    while (pos > 0) {
        if (m_sWords[pos] != null) { // skip the virtual ending
            m_nBestTag[pos] = m_nTags[pos][candidate];
        }
        candidate = m_nBestPrev[pos][candidate];
        pos--;
    }

    // The terminator replaces the virtual ending when the span has one.
    final boolean endsWithVirtualWord = m_sWords[m_nCurLength - 1] == null;
    final int nEnd = endsWithVirtualWord ? m_nCurLength - 1 : m_nCurLength;
    m_nBestTag[nEnd] = -1;
    return true;
}
/**
 * Reads one span of words from {@code wrList}, starting at {@code index},
 * into the working tables ({@code m_sWords}, {@code m_nTags},
 * {@code m_dFrequency}, {@code m_nWordPosition}). Reading stops after the
 * first word that has exactly one candidate tag (other than the
 * sentence-begin tag), when the list is exhausted, or when the tables are
 * full.
 *
 * @param wrList      the segmented words
 * @param index       index in {@code wrList} of the first word to read
 * @param coreDict    core dictionary
 * @param unknownDict dictionary consulted in the recognition modes
 * @return the index at which the next span starts, or -1 when the end of
 *         {@code wrList} was reached
 */
public int getFrom(ArrayList<WordResult> wrList, int index, Dictionary coreDict, Dictionary unknownDict) {
    int[] aPOS = new int[Utility.MAX_POS_PER_WORD];
    int[] aFreq = new int[Utility.MAX_POS_PER_WORD];
    int nFreq = 0, j, nRetPos = 0;
    boolean bSplit = false; // a word is being split (transliteration recognition)
    int i = 1, nPOSCount;
    String sCurWord; // current word
    int nWordsIndex = i + index - 1;
    // Short-circuit && replaces the former non-short-circuit &.
    for (; i < Utility.MAX_WORDS_PER_SENTENCE && nWordsIndex < wrList.size(); i++) {
        WordResult wr = wrList.get(nWordsIndex);
        String word = wr.getWord();
        if (tagType == Utility.TAG_TYPE.TT_NORMAL || !unknownDict.isExist(word, 44)) {
            m_sWords[i] = word; // store current word
            // NOTE(review): getBytes() uses the platform default charset, so this
            // length depends on the JVM configuration - confirm the intended encoding.
            m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].getBytes().length;
        } else {
            // The word is visited twice (bSplit toggles, and the list index is only
            // advanced while bSplit is false).
            // NOTE(review): both visits store word.substring(2), exactly as before;
            // presumably the first visit was meant to store the leading part and the
            // second the remainder - confirm before changing.
            m_sWords[i] = word.substring(2);
            bSplit = !bSplit;
            m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].length();
        }
        // Move the start position to the end of the current word.
        m_nStartPos = m_nWordPosition[i + 1];
        if (tagType != Utility.TAG_TYPE.TT_NORMAL) {
            // Get the tags from the unknown-word recognition dictionary.
            sCurWord = m_sWords[i];
            if (tagType == Utility.TAG_TYPE.TT_TRANS_PERSON && i > 0
                    && Utility.charType(m_sWords[i - 1]) == Utility.CT_CHINESE) {
                // Fixed: the second test used to compare against the whole m_sWords
                // array, which can never be equal to a String.
                // NOTE(review): as written both assignments leave sCurWord unchanged;
                // presumably other variants of these characters were meant to be
                // normalized here - confirm.
                if (".".equals(m_sWords[i]))
                    sCurWord = ".";
                else if ("-".equals(m_sWords[i]))
                    sCurWord = "-";
            }
            ArrayList<WordItem> wis = unknownDict.getHandle(sCurWord);
            nPOSCount = wis.size() + 1;
            for (j = 0; j < wis.size(); j++) {
                aPOS[j] = wis.get(j).getHandle();
                aFreq[j] = wis.get(j).getFrequency();
                m_nTags[i][j] = aPOS[j];
                m_dFrequency[i][j] = -Math.log((1 + aFreq[j]));
                m_dFrequency[i][j] += Math.log((context.getFrequency(0, aPOS[j]) + nPOSCount));
            }
            if ("始##始".equals(m_sWords[i])) {
                // Sentence-begin marker: tag 100 at no cost.
                m_nTags[i][j] = 100;
                m_dFrequency[i][j] = 0;
                j++;
            } else if ("末##末".equals(m_sWords[i])) {
                // Sentence-end marker: tag 101 at no cost.
                m_nTags[i][j] = 101;
                m_dFrequency[i][j] = 0;
                j++;
            } else {
                // A word known to the core dictionary additionally gets tag 0,
                // weighted by its total frequency over all entries.
                wis = coreDict.getHandle(m_sWords[i]);
                nFreq = 0;
                for (int k = 0; k < wis.size(); k++) {
                    aFreq[k] = wis.get(k).getFrequency();
                    nFreq += aFreq[k];
                }
                if (wis.size() > 0) {
                    m_nTags[i][j] = 0;
                    m_dFrequency[i][j] = -Math.log((double) (1 + nFreq));
                    m_dFrequency[i][j] += Math.log((double) (context.getFrequency(0, 0) + nPOSCount));
                    j++;
                }
            }
        } else { // normal POS tagging
            j = 0;
            if (wr.getHandle() > 0) {
                // The word already carries a single tag and its value.
                m_nTags[i][j] = wr.getHandle();
                m_dFrequency[i][j] = -Math.log(wr.getValue())
                        + Math.log((double) (context.getFrequency(0, m_nTags[i][j]) + 1));
                if (m_dFrequency[i][j] < 0) // a cost below 0 is not permitted
                    m_dFrequency[i][j] = 0;
                j++;
            } else {
                // Retrieve the candidate tags from the core dictionary.
                if (wr.getHandle() < 0) {
                    // A negative handle is stored as its positive counterpart with
                    // the word's own value as cost.
                    m_nTags[i][j] = -wr.getHandle();
                    m_dFrequency[i][j++] = wr.getValue();
                }
                ArrayList<WordItem> wis = coreDict.getHandle(m_sWords[i]);
                nPOSCount = wis.size();
                // j is shared with the entry stored above, so when that entry exists
                // the dictionary item at index 0 is skipped (behavior unchanged).
                for (; j < wis.size(); j++) {
                    aPOS[j] = wis.get(j).getHandle();
                    aFreq[j] = wis.get(j).getFrequency();
                    m_nTags[i][j] = aPOS[j];
                    m_dFrequency[i][j] = -Math.log(1 + aFreq[j])
                            + Math.log(context.getFrequency(0, m_nTags[i][j]) + nPOSCount);
                }
            }
        }
        if (j == 0) {
            // No tag is known: guess from lexical knowledge.
            j = guessPOS(i);
        }
        m_nTags[i][j] = -1; // terminate the candidate list
        // A word with a single candidate ends the span, unless that candidate is the
        // sentence-begin tag. Fixed: the test used to read m_nTags[i][j], which was
        // just set to -1 above, so the sentence-begin check could never take effect.
        if (j == 1 && m_nTags[i][0] != Utility.CT_SENTENCE_BEGIN) {
            i++;
            m_sWords[i] = null;
            break;
        }
        if (!bSplit)
            nWordsIndex++;
    }
    if (nWordsIndex == wrList.size())
        nRetPos = -1; // reached the end of the list
    if (m_nTags[i - 1][1] != -1) {
        // The last word is still ambiguous: append a virtual ending word with the
        // end tag of the current mode.
        if (tagType != Utility.TAG_TYPE.TT_NORMAL)
            m_nTags[i][0] = 101;
        else
            m_nTags[i][0] = 1;
        m_dFrequency[i][0] = 0;
        m_sWords[i] = null; // virtual ending
        m_nTags[i++][1] = -1;
    }
    m_nCurLength = i; // number of rows in use
    if (nRetPos != -1)
        return nWordsIndex + 1; // next start position
    return -1; // reached the end of the list
}
/**
* <pre>
*
* BBCD 343 0.003606
* BBC 2 0.000021
* BBE 125 0.001314
* BBZ 30 0.000315
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -