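// CUnknowWord.java
// (Header residue from the web code viewer this file was copied from; the stray "font size:" label is not part of the program.)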
package com.gftech.ictclas4j.unknown;
import com.gftech.common.GFCommon;
import com.gftech.ictclas4j.segment.CDynamicArray;
import com.gftech.ictclas4j.segment.CSegGraph;
import com.gftech.ictclas4j.tag.CSpan;
import com.gftech.ictclas4j.utility.CDictionary;
import com.gftech.ictclas4j.utility.Final;
import com.gftech.ictclas4j.utility.TagWordResult;
/**
 * Unknown-word recogniser for one category of unknown words (person names,
 * transliterated person names or place names, selected through
 * {@link #Configure(String, Final.TAG_TYPE)}).
 *
 * <p>The recogniser owns a role tagger ({@code CSpan}) and an unknown-word
 * dictionary ({@code CDictionary}). {@link #Recognition} runs the role tagger
 * over a segmentation result and writes the unknown words it reports back
 * into the optimised segmentation graph.
 */
public class CUnknowWord {
    /** POS code used for person names: -('n' * 256 + 'r') == -28274. */
    private static final int POS_PERSON = -('n' * 256 + 'r');

    /** POS code used for place names: -('n' * 256 + 's') == -28275. */
    private static final int POS_PLACE = -('n' * 256 + 's');

    /** Role tagger used to tag the segmentation with unknown-word roles. */
    private CSpan m_roleTag;

    /** The POS assigned to words of the configured unknown-word category. */
    private int m_nPOS;

    /** Placeholder word bytes stored in the graph for a recognised unknown word. */
    private byte[] m_sUnknownFlags = new byte[10];

    /** Unknown-word dictionary. */
    public CDictionary m_dict;

    /**
     * Creates the recogniser together with its role tagger and dictionary.
     *
     * <p>Previously both members were left {@code null}; {@code m_roleTag} is
     * private and was never assigned, so every method of this class failed
     * with a {@link NullPointerException}.
     *
     * NOTE(review): assumes {@code CSpan} and {@code CDictionary} expose
     * accessible no-argument constructors - confirm against those classes.
     */
    public CUnknowWord() {
        m_roleTag = new CSpan();
        m_dict = new CDictionary();
    }

    /**
     * Judges whether a two-character name is a (two-character) given name
     * rather than a name ending in a single-character given name.
     *
     * <p>Two hypotheses are scored in log space: roles (2, 3) for the two
     * characters versus roles (1, 4). Each score is the sum of the two
     * P(Wi|Ti) terms and the role-to-role transition term.
     *
     * @param sName the name bytes; exactly 4 bytes are expected (two 2-byte
     *        characters)
     * @return {@code false} if {@code sName} is {@code null}, is not 4 bytes
     *         long, or the (1, 4) hypothesis scores at least as high as the
     *         (2, 3) hypothesis; {@code true} otherwise
     */
    public boolean IsGivenName(byte[] sName) {
        // Reject null as well as wrong-length input instead of throwing.
        if (sName == null || sName.length != 4) {
            return false;
        }

        // Each character is kept in a 3-byte buffer whose last byte stays 0.
        byte[] sFirstChar = new byte[3];
        byte[] sSecondChar = new byte[3];
        sFirstChar[0] = sName[0];
        sFirstChar[1] = sName[1];
        sSecondChar[0] = sName[2];
        sSecondChar[1] = sName[3];

        // Hypothesis A: roles 2 then 3 (two-character given name).
        double dGivenNamePossibility = 0;
        dGivenNamePossibility += emissionLogRatio(sFirstChar, 2);
        dGivenNamePossibility += emissionLogRatio(sSecondChar, 3);
        dGivenNamePossibility += transitionLogRatio(2, 3);

        // Hypothesis B: roles 1 then 4 (single-character given name).
        double dSingleNamePossibility = 0;
        dSingleNamePossibility += emissionLogRatio(sFirstChar, 1);
        dSingleNamePossibility += emissionLogRatio(sSecondChar, 4);
        dSingleNamePossibility += transitionLogRatio(1, 4);

        // The possibility of being a single given name is at least that of
        // being a 2-char given name => not a given name.
        return !(dSingleNamePossibility >= dGivenNamePossibility);
    }

    /** Returns log(freq(word, role) + 1) - log(freq(role) + 1), i.e. the smoothed P(Wi|Ti) term. */
    private double emissionLogRatio(byte[] sChar, int nRole) {
        return Math.log((double) m_dict.GetFrequency(sChar, nRole) + 1.0)
                - Math.log(m_roleTag.m_context.GetFrequency(0, nRole) + 1.0);
    }

    /** Returns log(contextPossibility(prev, cur) + 1) - log(freq(prev) + 1), the role transition term. */
    private double transitionLogRatio(int nPrevRole, int nCurRole) {
        return Math.log(m_roleTag.m_context.GetContextPossibility(0, nPrevRole, nCurRole) + 1.0)
                - Math.log(m_roleTag.m_context.GetFrequency(0, nPrevRole) + 1.0);
    }

    /**
     * Loads the unknown-word dictionary and recognition context and selects
     * the unknown-word category.
     *
     * @param sConfigFile path prefix; {@code ".dct"} and {@code ".ctx"} are
     *        appended to locate the dictionary and the context file
     * @param type the tagging type; decides the POS code and the placeholder
     *        word used for recognised words
     * @return always {@code true} (the results of the load calls are not
     *         inspected)
     */
    public boolean Configure(String sConfigFile, Final.TAG_TYPE type) {
        // Load the unknown-word dictionary.
        m_dict.Load(sConfigFile + ".dct", false);

        // Load the unknown recognition context.
        m_roleTag.LoadContext(sConfigFile + ".ctx");

        // Set the tagging type.
        m_roleTag.SetTagType(type);

        // NOTE(review): getBytes() below uses the platform default charset,
        // while IsGivenName assumes 2-byte characters. The byte content (and
        // whether 8 bytes are actually available to copy) therefore depends on
        // the platform - confirm the intended encoding and the semantics of
        // GFCommon.bytesCopy before pinning a charset here.
        switch (type) {
        case TT_PERSON:
        case TT_TRANS_PERSON:// Set the special flag for transliterations
            m_nPOS = POS_PERSON;
            GFCommon.bytesCopy(m_sUnknownFlags, "未##人".getBytes(), 0, 8);
            break;
        case TT_PLACE:
            m_nPOS = POS_PLACE;
            GFCommon.bytesCopy(m_sUnknownFlags, "未##地".getBytes(), 0, 8);
            break;
        default:
            m_nPOS = 0;
            break;
        }
        return true;
    }

    /**
     * Unknown word recognition.
     *
     * <p>Tags the segmentation with unknown-recognition roles using the core
     * dictionary and the unknown-word dictionary, then maps each unknown word
     * reported by the role tagger from its start/end offsets onto a range of
     * atoms and stores it in the optimised graph.
     *
     * @param pWordSegResult word segmentation result
     * @param graphOptimum the optimised segmentation graph (updated in place)
     * @param graphSeg the original segmentation graph; supplies atom count
     *        and atom lengths
     * @param dictCore the core dictionary
     * @return always {@code true}
     */
    public boolean Recognition(TagWordResult[] pWordSegResult,
            CDynamicArray graphOptimum, CSegGraph graphSeg, CDictionary dictCore) {
        int nStartPos = 0; // running offset: sum of the lengths of atoms consumed so far
        int j = 0;         // index of the next atom; carried across unknown words
        int nAtomStart;
        int nAtomEnd;
        int nPOSOriginal = 0;
        double dValue = 0;

        // Tag the segmentation with unknown recognition roles according to the
        // core dictionary and the unknown recognition dictionary.
        m_roleTag.POSTagging(pWordSegResult, dictCore, m_dict);

        for (int i = 0; i < m_roleTag.m_nUnknownIndex; i++) {
            // Skip atoms until the running offset reaches the word's start.
            while (j < graphSeg.m_nAtomCount
                    && nStartPos < m_roleTag.m_nUnknownWords[i][0]) {
                nStartPos += graphSeg.m_nAtomLength[j++];
            }
            nAtomStart = j;

            // Consume atoms until the running offset reaches the word's end.
            while (j < graphSeg.m_nAtomCount
                    && nStartPos < m_roleTag.m_nUnknownWords[i][1]) {
                nStartPos += graphSeg.m_nAtomLength[j++];
            }
            nAtomEnd = j;

            if (nAtomStart < nAtomEnd) {
                // NOTE(review): dValue and nPOSOriginal are primitives passed
                // by value, so GetElement cannot write results back into them;
                // dValue is still 0.0 in the comparison below. If GetElement
                // is meant to report the existing element's value, its API
                // needs to return it - confirm against CDynamicArray.
                graphOptimum.GetElement(nAtomStart, nAtomEnd, dValue,
                        nPOSOriginal, null);
                if (dValue > m_roleTag.m_dWordsPossibility[i]) {
                    // Set the element with less frequency.
                    graphOptimum.SetElement(nAtomStart, nAtomEnd,
                            m_roleTag.m_dWordsPossibility[i], m_nPOS,
                            m_sUnknownFlags);
                }
            }
        }
        return true;
    }
}
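// (Residue from the web code viewer this file was copied from; not part of the program.)
// Keyboard shortcut help:
//   Copy code:            Ctrl + C
//   Search code:          Ctrl + F
//   Full-screen mode:     F11
//   Toggle theme:         Ctrl + Shift + D
//   Show shortcuts:       ?
//   Increase font size:   Ctrl + =
//   Decrease font size:   Ctrl + -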