// cseggraph.java
// (code-viewer page residue: font-size control)
package com.gftech.ictclas4j.segment;
import com.gftech.common.GFCommon;
import com.gftech.ictclas4j.utility.CDictionary;
import com.gftech.ictclas4j.utility.Final;
import com.gftech.ictclas4j.utility.Utility;
public class CSegGraph {
// segGraph: The segmentation word graph
// Row first array
public CDynamicArray m_segGraph;
// pAtoms: the buffer for returned segmented atoms
// Such as a Chinese Char, digit, single byte, or delimiters
public byte[][] m_sAtom = new byte[Final.MAX_SENTENCE_LEN][Final.WORD_MAXLENGTH];
// Save the individual length of atom in the array
public int[] m_nAtomLength = new int[Final.MAX_SENTENCE_LEN];
// pAtoms: the POS property
public int[] m_nAtomPOS = new int[Final.MAX_SENTENCE_LEN];
// The count of atoms
public int m_nAtomCount;
public CSegGraph() {
m_segGraph.SetRowFirst(false);
//segGraph: The segmentation word graph
//Row first array
}
// Generate the segmentation word net according the original sentence
// sSentence: the sentence
// dictCore: core dictionary
// boolean bOriginalFreq=false: output original frequency
public boolean GenerateWordNet(byte[] sSentence, CDictionary dictCore,
boolean bOriginalFreq) {
// Gernerate the word net from the sLine, that's list all the possible word
int i=0,j;
int nLen= sSentence.length;
byte[] sWord=new byte[Final.WORD_MAXLENGTH] ;
byte[] sTempWord=new byte[Final.WORD_MAXLENGTH] ;
byte[] sWordMatch=new byte[Final.WORD_MAXLENGTH];
int nWordIndex=0;
int nHandleTemp=0;
int k,nPOS;
int[] nMatchFreq=new int[20];
int[] nMatchHandle=new int[20];
int nTotalFreq;
int nMatchCount=0;
double dValue=0;
m_nAtomCount=0;
m_segGraph.SetEmpty();//Set segmentation graph empty
AtomSegment(sSentence);
//Atomic Segmentation
for(i=0;i<m_nAtomCount;i++)//Init the cost array
{
if(m_nAtomPOS[i]==Final.CT_CHINESE)//The atom is a Chinese Char
{
if(!bOriginalFreq)//Not original frequency
m_segGraph.SetElement(i,i+1,Math.log(Final.MAX_FREQUENCE),0,null);//init the link with the maximum value
else
m_segGraph.SetElement(i,i+1,0,0,m_sAtom[i]);//init the link with the maximum value
}
else//Other atom
{
Utility.strcpy(sWord,m_sAtom[i]);//init the word
dValue=Final.MAX_FREQUENCE;
switch(m_nAtomPOS[i])
{
case Final.CT_INDEX:
case Final.CT_NUM:
nPOS=-27904;//'m'*256
Utility.strcpy(sWord,"未##數".getBytes());
dValue=0;
break;
case Final.CT_DELIMITER:
nPOS=30464;//'w'*256;
break;
case Final.CT_LETTER:
nPOS=-'n'*256-'x';//
dValue=0;
Utility.strcpy(sWord,"未##串".getBytes());
break;
case Final.CT_SINGLE://12021-2129-3121
if(Utility.GetCharCount("+-1234567890".getBytes(),m_sAtom[i])== m_sAtom[i].length)
{
nPOS=-27904;//'m'*256
Utility.strcpy(sWord,"未##數".getBytes());
}
else
{
nPOS=-'n'*256-'x';//
Utility.strcpy(sWord,"未##串".getBytes());
}
dValue=0;
break;
default:
nPOS=m_nAtomPOS[i];//'?'*256;
break;
}
if(!bOriginalFreq)//Not original frequency
m_segGraph.SetElement(i,i+1,0,nPOS,null);//init the link with minimum
else
m_segGraph.SetElement(i,i+1,dValue,nPOS,sWord);//init the link with minimum
}
}
i=0;
while(i<m_nAtomCount)//All the word
{
Utility.strcpy(sWord,m_sAtom[i]);//Get the current atom
j=i+1;
if(Utility.strcmp(sWord,"月".getBytes()) &&Utility.strcmp(m_sAtom[i+1],"份".getBytes()) )//Don't split 月份
j+=1;
while(j<=m_nAtomCount&&dictCore.GetMaxMatch(sWord,sWordMatch, nHandleTemp))
{//Add a condition to control the end of string
//retrieve the dictionary with the word
if(Utility.strcmp(sWordMatch,sWord) )//find the current word
{
nTotalFreq=0;
dictCore.GetHandle(sWord, nMatchCount,nMatchHandle,nMatchFreq);
for(k=0;k<nMatchCount;k++)//Add the frequency
{
nTotalFreq+=nMatchFreq[k];
}
//Adding a rule to exclude some words to be formed.
if( sWord.length==4&&i>=1&&(Utility.IsAllNum( m_sAtom[i-1])||Utility.IsAllChineseNum(m_sAtom[i-1]))
&&(Utility.strncmp(sWord,0,"年".getBytes(),2) ||Utility.strncmp(sWord,0,"月".getBytes(),2) ))
{//1年內、1999年末
if(Utility.CC_Find("末內中底前間初".getBytes(),GFCommon.bytesCopy(sWord,2,sWord.length-2)))
break;
}
if(nMatchCount==1)//The possible word has only one POS, store it
{
if(!bOriginalFreq)//Not original frequency
m_segGraph.SetElement(i,j,-Math.log(nTotalFreq+1)+Math.log(Final.MAX_FREQUENCE),nMatchHandle[0],null);
else
m_segGraph.SetElement(i,j,nTotalFreq,nMatchHandle[0],sWord);
}
else
{
if(!bOriginalFreq)//Not original frequency
m_segGraph.SetElement(i,j,-Math.log(nTotalFreq+1)+Math.log(Final.MAX_FREQUENCE),0,null);
else
m_segGraph.SetElement(i,j,nTotalFreq,0,sWord);
}
}
Utility.strcat(sWord,m_sAtom[j++]);
}
i+=1;//Start from i++;
}
return true;
}
// Segment the atomic members from the original sentence
// sSentence: the sentence
// pAtoms: the buffer for returned segmented atoms
// Such as a Chinese Char, digit, single byte, or delimiters
protected boolean AtomSegment(byte[] sSentence) {
int i=0, j=0,nCurType,nNextType;
//i is the pointer of sentence string
//j is the pointer of pAtoms
byte[] sChar=new byte[3];
sChar[2]=0;//Set the char ending
m_sAtom[j][0]=0;//Set the first word as null
m_nAtomLength[j]=0;
if(Utility.strncmp(sSentence,0,Final.SENTENCE_BEGIN.getBytes(), Final.SENTENCE_BEGIN.length()) )
{
Utility.strcpy(m_sAtom[j],Final.SENTENCE_BEGIN.getBytes());//Set the first word as sentence begining
m_nAtomLength[j]= Final.SENTENCE_BEGIN.length();
m_nAtomPOS[j]=Final.CT_SENTENCE_BEGIN;//init
i+=m_nAtomLength[j];
j+=1;
m_sAtom[j][0]=0;//Set the first word as null
m_nAtomLength[j]=0;
}
for( i=0;i<sSentence.length;i++)
{
if(Utility.strncmp(sSentence,i,Final.SENTENCE_END.getBytes(), Final.SENTENCE_END.length()) )
{
Utility.strcpy(m_sAtom[j],Final.SENTENCE_END.getBytes());//Set the first word as null
m_nAtomLength[j]= Final.SENTENCE_END.length();
m_nAtomPOS[j]=Final.CT_SENTENCE_END;//init
i+=m_nAtomLength[j];
j+=1;
m_sAtom[j][0]=0;//Set the first word as null
m_nAtomLength[j]=0;
continue;
}
sChar[0]= sSentence[i];//Get the char with first byte
sChar[1]=0;//
i+=1;
if(sChar[0]<0)//Two byte char
{
sChar[1]=sSentence[i];//Get the char with second byte
i+=1;//i increased by 1
}
Utility.strcat(m_sAtom[j],sChar);
nCurType=Utility.charType( sChar[0],sChar[1]);
if(sChar[0]=='.'&&(Utility.charType( sSentence[i],sSentence[i+1])==Final.CT_NUM||( sSentence[i]>='0'&& sSentence[i]<='9')))
nCurType=Final.CT_NUM;//Digit after . indicate . as a point in the numeric
m_nAtomPOS[j]=nCurType;
//Record its property, just convience for continuous processing
if(nCurType==Final.CT_CHINESE||nCurType==Final.CT_INDEX||nCurType==Final.CT_DELIMITER||nCurType==Final.CT_OTHER)
{//Chinese char, index number,delimiter and other is treated as atom
m_nAtomLength[j]= m_sAtom[j].length;//Save its length
j+=1;//Skip to next atom
m_sAtom[j][0]=0;//init
}
else
{//Number,single char, letter
nNextType=255;
if(i< sSentence.length)
nNextType=Utility.charType( sSentence[i] ,sSentence[i+1]);
if(nNextType!=nCurType||i== sSentence.length)
//Reaching end or next char type is different from current char
{
m_nAtomLength[j]= m_sAtom[j].length;//Save its length
j+=1;
m_sAtom[j][0]=0;//init
}
}
}
m_nAtomCount=j;//The count of segmentation atoms
return true;
}
}
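// Code-viewer page residue (keyboard shortcut help), kept as comments:
//   Copy code:             Ctrl + C
//   Search code:           Ctrl + F
//   Full-screen mode:      F11
//   Switch theme:          Ctrl + Shift + D
//   Show shortcuts:        ?
//   Increase font size:    Ctrl + =
//   Decrease font size:    Ctrl + -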