?? segtofile(sen).c
字號:
{
/*得到最佳位置*/
p->m_nBestTag[i]=p->m_nTags[i][j];
}
j=p->m_nBestPrev[i][j];
}
nEnd=p->m_nCurLength;
if(p->m_sWords[p->m_nCurLength-1][0]==0)
nEnd=p->m_nCurLength-1;
p->m_nBestTag[nEnd]=-1;
return TRUE;
}
/**************************************************************************************
46. Compute the weight of a run of tagged words: span, start position, length, dictionary
***************************************************************************************/
/*
 * Sums a log-ratio score over the words nStartPos .. nStartPos+nLength-1 of the span.
 *
 * For every word in the range the contribution is
 *     log(GetContextFrequency(p->m_context,0,tag)+1) - log(GetFrequency(dict,word,tag)+1)
 * where tag is the word's entry in p->m_nBestTag.
 *
 * p          span holding m_sWords, m_nBestTag and m_context
 * nStartPos  index of the first word to score
 * nLength    number of consecutive words to score
 * dict       dictionary queried for the word/tag frequency
 *
 * Returns the accumulated score (0 when nLength is not positive).
 */
double ComputePossibility(pSpan p,int nStartPos,int nLength,pDictionary dict)
{
	double dTotal=0;
	double dTerm;
	int nWordFreq;
	int nWord=nStartPos;
	while(nWord<nStartPos+nLength)
	{
		/* Frequency of this word under its chosen tag in the dictionary */
		nWordFreq=GetFrequency(dict,p->m_sWords[nWord],p->m_nBestTag[nWord]);
		/* Add-one smoothed context frequency of the tag versus add-one smoothed word frequency */
		dTerm=log((double)(GetContextFrequency(p->m_context,0,p->m_nBestTag[nWord])+1))-log((double)(nWordFreq+1));
		dTotal+=dTerm;
		nWord++;
	}
	return dTotal;
}
/**************************************************************************************
47. Person-name recognition: span and person-name dictionary
***************************************************************************************/
/*
 * Scans the span's best-tag sequence for person-name patterns and records each hit
 * as an unknown word.
 *
 * The tags in p->m_nBestTag[1..] (terminated by a negative value) are turned into a
 * string, one character per word (tag+'A'), and matched against the fixed pattern
 * table below.  For every accepted match starting at word j with pattern k:
 *   - p->m_nUnknownWords[idx][0] / [1] receive m_nWordPosition[j] and
 *     m_nWordPosition[j+len],
 *   - p->m_dWordsPossibility[idx] receives -log(dFactor[k]) plus
 *     ComputePossibility() over the matched words,
 *   - p->m_nUnknownIndex is advanced.
 *
 * p           span to scan and to record results in
 * personDict  dictionary used for the per-word frequency lookups
 *
 * Always returns TRUE.
 *
 * NOTE(review): m_nUnknownIndex is not checked against the capacity of
 * m_nUnknownWords / m_dWordsPossibility; their sizes are not visible here.
 */
int PersonRecognize(pSpan p,pDictionary personDict)
{
	/* Tag string; slot 0 holds the placeholder 'z', the rest is zero-filled */
	char sPOS[MAX_WORDS_PER_SENTENCE]="z",sPersonName[100];
	/* Person-name patterns; the empty string terminates the table */
	char sPatterns[][5]={ "BBCD","BBC","BBE","BBZ","BCD","BEE","BE","BG",
	"BXD","BZ", "CDCD","CD","EE", "FB", "Y","XD",""};
	/* Weight of each pattern, parallel to sPatterns */
	double dFactor[]={0.003606,0.000021,0.001314,0.000315,0.656624, 0.000021,0.146116,0.009136,
	0.000042,0.038971,0,0.090367,0.000273,0.009157,0.034324,0.009735,0};
	/* Length of each pattern, parallel to sPatterns; 0 terminates the table */
	int nPatternLen[]={4,3,3,3,3,3,2,2,3,2,4,2,2,2,1,2,0};
	int i,k,nPos,nLittleFreqCount;
	int j=1;
	int bMatched=FALSE;
	/* Encode each best tag as tag+'A'.  The index is bounded so that the write of
	   the terminator below can never land outside sPOS. */
	for(i=1;i<MAX_WORDS_PER_SENTENCE-1&&p->m_nBestTag[i]>-1;i++)
		sPOS[i]=p->m_nBestTag[i]+'A';
	sPOS[i]=0;
	while(j<i)
	{
		bMatched=FALSE;
		for(k=0;!bMatched&&nPatternLen[k]>0;k++)
		{
			/* The pattern must match at j, and neither the word before the match nor
			   the word after it may equal the separator literal.
			   NOTE(review): the literal "?¤" looks mis-encoded in this copy of the
			   file; it is reproduced unchanged - confirm the intended bytes. */
			if(strncmp(sPatterns[k],sPOS+j,nPatternLen[k])==0&&strcmp(p->m_sWords[j-1],"?¤")!=0&&strcmp(p->m_sWords[j+nPatternLen[k]],"?¤")!=0)
			{
				/* Pattern "FB" is rejected when the following tag is E, C or G */
				if(strcmp(sPatterns[k],"FB")==0&&(sPOS[j+2]=='E'||sPOS[j+2]=='C'||sPOS[j+2]=='G'))
					continue;
				nPos=j;
				sPersonName[0]=0;
				/* NOTE(review): nLittleFreqCount is accumulated but never read in this function */
				nLittleFreqCount=0;
				/* Walk the matched words, building the candidate name */
				while(nPos<j+nPatternLen[k])
				{
					if(p->m_nBestTag[nPos]<4&&GetFrequency(personDict,p->m_sWords[nPos],p->m_nBestTag[nPos])<LITTLE_FREQUENCY)
						nLittleFreqCount++;
					/* Bounded append: never write past the end of sPersonName */
					strncat(sPersonName,p->m_sWords[nPos],sizeof(sPersonName)-1-strlen(sPersonName));
					nPos+=1;
				}
				/* "CDCD" never produces a result; when the candidate contains foreign
				   characters the scan position also jumps ahead */
				if(strcmp(sPatterns[k],"CDCD")==0)
				{
					if(GetForeignCharCount(sPersonName)>0)
						j+=nPatternLen[k]-1;
					continue;
				}
				/* Record the match as an unknown word with its score */
				p->m_nUnknownWords[p->m_nUnknownIndex][0]=p->m_nWordPosition[j];
				p->m_nUnknownWords[p->m_nUnknownIndex][1]=p->m_nWordPosition[j+nPatternLen[k]];
				p->m_dWordsPossibility[p->m_nUnknownIndex]=-log(dFactor[k])+ComputePossibility(p,j,nPatternLen[k],personDict);
				p->m_nUnknownIndex+=1;
				j+=nPatternLen[k];
				bMatched=TRUE;
			}
		}
		/* No pattern accepted at this position: advance by one word */
		if(!bMatched)
			j+=1;
	}
	return TRUE;
}
/**************************************************************************************
48. Guess candidate tags for a word that has none: span, word index, output candidate count
***************************************************************************************/
/*
 * Appends nTagCount candidate tags from pTags to word slot nWord, starting at
 * sub-index *pnSlot, and advances *pnSlot past them.  The weight stored for each
 * candidate is 1/(GetContextFrequency(p->m_context,0,tag)*nScale+nBias).
 */
static void GuessPOS_AppendTags(pSpan p,int nWord,int *pnSlot,const int *pTags,int nTagCount,int nScale,int nBias)
{
	int t;
	for(t=0;t<nTagCount;t++)
	{
		p->m_nTags[nWord][*pnSlot]=pTags[t];
		p->m_dFrequency[nWord][*pnSlot]=(double)1/(double)(GetContextFrequency(p->m_context,0,pTags[t])*nScale+nBias);
		++*pnSlot;
	}
}

/*
 * Fills the candidate tags (p->m_nTags[nIndex][..]) and their weights
 * (p->m_dFrequency[nIndex][..]) for word nIndex, according to p->m_tagType and the
 * word's byte length / character class.
 *
 * p          span whose word nIndex is being guessed
 * nIndex     index of the word inside the span
 * pSubIndex  receives the number of candidates written (0 for TT_NORMAL and for
 *            any tag type not handled below)
 *
 * Always returns TRUE.
 */
int GuessPOS(pSpan p,int nIndex,int *pSubIndex)
{
	static const int aTagZero[]={0};
	static const int aTagSix[]={6};
	static const int aTagThirty[]={30};
	static const int aTags1to3[]={1,2,3};
	static const int aTags1to4[]={1,2,3,4};
	static const int aTags11to13[]={11,12,13};
	static const int aTags21to23[]={21,22,23};
	static const int aTags41to43[]={41,42,43};
	static const int aLetterUnit[]={1,11};
	static const int aLetterDouble[]={2,3,12,13};
	int nSlot=0,nKind;
	unsigned int nBytes;

/* Shorthand: append a whole tag table to the current word with the given scale/bias */
#define GUESSPOS_ADD(tags,scale,bias) \
	GuessPOS_AppendTags(p,nIndex,&nSlot,(tags),(int)(sizeof(tags)/sizeof((tags)[0])),(scale),(bias))

	if(p->m_tagType==TT_PERSON)
	{
		if(CC_Find("××",p->m_sWords[nIndex]))
		{
			/* Word found by CC_Find for "××": single candidate, tag 6 */
			GUESSPOS_ADD(aTagSix,1,1);
		}
		else
		{
			GUESSPOS_ADD(aTagZero,1,1);
			nBytes=strlen(p->m_sWords[nIndex]);
			if(nBytes>=4)
			{
				/* Tag 0 is entered a second time here, exactly as before */
				GUESSPOS_ADD(aTagZero,1,1);
				GUESSPOS_ADD(aTags11to13,8,0);
			}
			else if(nBytes==2)
			{
				GUESSPOS_ADD(aTagZero,1,1);
				nKind=charType((unsigned char *)p->m_sWords[nIndex]);
				if(nKind==CT_OTHER||nKind==CT_CHINESE)
					GUESSPOS_ADD(aTags1to4,1,1);
				GUESSPOS_ADD(aTags11to13,8,0);
			}
		}
	}
	else if(p->m_tagType==TT_PLACE)
	{
		GUESSPOS_ADD(aTagZero,1,1);
		nBytes=strlen(p->m_sWords[nIndex]);
		if(nBytes>=4)
		{
			GUESSPOS_ADD(aTags11to13,8,0);
		}
		else if(nBytes==2)
		{
			GUESSPOS_ADD(aTagZero,1,1);
			nKind=charType((unsigned char *)p->m_sWords[nIndex]);
			if(nKind==CT_OTHER||nKind==CT_CHINESE)
				GUESSPOS_ADD(aTags1to4,1,1);
			GUESSPOS_ADD(aTags11to13,8,0);
		}
	}
	else if(p->m_tagType==TT_TRANS_PERSON)
	{
		nBytes=strlen(p->m_sWords[nIndex]);
		GUESSPOS_ADD(aTagZero,1,1);
		if(!IsAllChinese((unsigned char *)p->m_sWords[nIndex]))
		{
			if(IsAllLetter((unsigned char *)p->m_sWords[nIndex]))
			{
				/* Order of candidates: 1, 11, then 2, 3, 12, 13 */
				GUESSPOS_ADD(aLetterUnit,1,1);
				GUESSPOS_ADD(aLetterDouble,2,1);
			}
			GUESSPOS_ADD(aTags41to43,8,0);
		}
		else if(nBytes>=4)
		{
			GUESSPOS_ADD(aTags41to43,8,0);
		}
		else if(nBytes==2)
		{
			nKind=charType((unsigned char *)p->m_sWords[nIndex]);
			if(nKind==CT_OTHER||nKind==CT_CHINESE)
			{
				/* Order of candidates: 1, 2, 3, 30, 11, 12, 13, 21, 22, 23 */
				GUESSPOS_ADD(aTags1to3,2,1);
				GUESSPOS_ADD(aTagThirty,8,1);
				GUESSPOS_ADD(aTags11to13,4,1);
				GUESSPOS_ADD(aTags21to23,2,1);
			}
			GUESSPOS_ADD(aTags41to43,8,0);
		}
	}
	/* TT_NORMAL and every other tag type: nothing is guessed */

#undef GUESSPOS_ADD

	*pSubIndex=nSlot;
	return TRUE;
}
/**************************************************************************************
49. Load words into the span: span, word items, start index, core dictionary, unknown-word dictionary
***************************************************************************************/
/*
 * Copies consecutive words from pWordItems, starting at nIndex, into span slots
 * 1, 2, ... of p and fills every slot's candidate tags (m_nTags), weights
 * (m_dFrequency) and byte offsets (m_nWordPosition).
 *
 * Loading stops when
 *   - the item list reaches its empty-string terminator,
 *   - MAX_WORDS_PER_SENTENCE slots have been used, or
 *   - a word ends up with exactly one candidate tag that is not
 *     CT_SENTENCE_BEGIN (an unambiguous word closes the span).
 * When the last loaded word has more than one candidate, an extra empty word is
 * appended as a closing slot (tag 101 outside TT_NORMAL, tag 1 in TT_NORMAL).
 * p->m_nCurLength receives the final slot count.
 *
 * p            span to fill; m_tagType selects the tagging mode
 * pWordItems   input items, terminated by an item whose sWord is empty
 * nIndex       index of the first item to load
 * dictCore     core dictionary
 * dictUnknown  dictionary used in the non-TT_NORMAL modes
 *
 * Returns the index of the next item to load, or -1 when the terminator of
 * pWordItems has been reached.
 *
 * NOTE(review): strcpy into m_sWords[i] and sCurWord is unbounded; the buffer
 * sizes of m_sWords and PWORD_RESULT.sWord are not visible here.
 */
int GetFrom(pSpan p,PWORD_RESULT pWordItems,int nIndex,pDictionary dictCore, pDictionary dictUnknown)
{
	int nCount,aPOS[MAX_POS_PER_WORD],aFreq[MAX_POS_PER_WORD];
	int nFreq=0,j,nRetPos=0,nWordsIndex=0;
	int bSplit=FALSE;
	int k,i=1,nPOSCount;
	char sCurWord[WORD_MAXLENGTH];
	unsigned int nLen;
	/* Span slots start at 1; the first item read is pWordItems[nIndex] */
	nWordsIndex=i+nIndex-1;
	for(;i<MAX_WORDS_PER_SENTENCE&&pWordItems[nWordsIndex].sWord[0]!=0;i++)
	{
		if(p->m_tagType==TT_NORMAL||!IsExist(dictUnknown,pWordItems[nWordsIndex].sWord,44))
		{
			/* Ordinary case: the item becomes one span word */
			strcpy(p->m_sWords[i],pWordItems[nWordsIndex].sWord);
			p->m_nWordPosition[i+1]=p->m_nWordPosition[i]+strlen(p->m_sWords[i]);
		}
		else
		{
			/* The item exists in dictUnknown with handle 44: it is split over two
			   span slots - its first two bytes, then (next iteration) the rest */
			if(!bSplit)
			{
				strncpy(p->m_sWords[i],pWordItems[nWordsIndex].sWord,2);
				p->m_sWords[i][2]=0;
				bSplit=TRUE;
			}
			else
			{
				nLen=strlen(pWordItems[nWordsIndex].sWord+2);
				strncpy(p->m_sWords[i],pWordItems[nWordsIndex].sWord+2,nLen);
				p->m_sWords[i][nLen]=0;
				bSplit=FALSE;
			}
			p->m_nWordPosition[i+1]=p->m_nWordPosition[i]+strlen(p->m_sWords[i]);
		}
		p->m_nStartPos=p->m_nWordPosition[i+1];
		if(p->m_tagType!=TT_NORMAL) {
			strcpy(sCurWord,p->m_sWords[i]);
			/* In TT_TRANS_PERSON mode a lone '.' or '-' that follows a Chinese word is
			   looked up under a replacement literal.
			   NOTE(review): "£?" and "£-" look mis-encoded in this copy of the file;
			   they are reproduced unchanged - confirm the intended bytes. */
			if(p->m_tagType==TT_TRANS_PERSON&&i>0&&charType((unsigned char*)p->m_sWords[i-1])==CT_CHINESE)
			{
				if(p->m_sWords[i][0]=='.'&&p->m_sWords[i][1]==0)
					strcpy(sCurWord,"£?");
				else if(p->m_sWords[i][0]=='-'&&p->m_sWords[i][1]==0)
					strcpy(sCurWord,"£-");
			}
			/* Candidates from the unknown-word dictionary */
			GetHandle(dictUnknown,sCurWord,&nCount,aPOS,aFreq);
			nPOSCount=nCount+1;
			for(j=0;j<nCount;j++)
			{
				p->m_nTags[i][j]=aPOS[j];
				p->m_dFrequency[i][j]=-log((double)(1+aFreq[j]))+log((double)(GetContextFrequency(p->m_context,0,aPOS[j])+nPOSCount));
			}
			/* Sentence markers get the fixed tags 100 and 101 with weight 0.
			   Both comparisons used the identical literal "?##?", which made the
			   tag-101 branch unreachable.  Tag 101 is also the tag given to the
			   closing slot at the end of this function, so the second marker is the
			   sentence-end one.
			   NOTE(review): the two literals are reconstructed as the GBK bytes of
			   the begin marker (CA BC "##" CA BC) and the end marker
			   (C4 A9 "##" C4 A9) - confirm against the project's marker
			   definitions and dictionary encoding. */
			if(strcmp(p->m_sWords[i],"\xCA\xBC##\xCA\xBC")==0)
			{
				p->m_nTags[i][j]=100;
				p->m_dFrequency[i][j]=0;
				j++;
			}
			else if(strcmp(p->m_sWords[i],"\xC4\xA9##\xC4\xA9")==0)
			{
				p->m_nTags[i][j]=101;
				p->m_dFrequency[i][j]=0;
				j++;
			}
			else
			{
				/* A word known to the core dictionary also gets candidate tag 0,
				   weighted by its total core frequency */
				GetHandle(dictCore,p->m_sWords[i],&nCount,aPOS,aFreq);
				nFreq=0;
				for(k=0;k<nCount;k++)
				{
					nFreq+=aFreq[k];
				}
				if(nCount>0)
				{
					p->m_nTags[i][j]=0;
					p->m_dFrequency[i][j]=-log((double)(1+nFreq))+log((double)(GetContextFrequency(p->m_context,0,0)+nPOSCount));
					j++;
				}
			}
		}
		else
		{
			j=0;
			if(pWordItems[nWordsIndex].nHandle>0)
			{
				/* The item already carries a positive handle: single candidate,
				   weight clamped at 0 from below */
				p->m_nTags[i][j]=pWordItems[nWordsIndex].nHandle;
				p->m_dFrequency[i][j]=-log(pWordItems[nWordsIndex].dValue)+log((double)(GetContextFrequency(p->m_context,0,p->m_nTags[i][j])+1));
				if(p->m_dFrequency[i][j]<0)
					p->m_dFrequency[i][j]=0;
				j++;
			}
			else
			{
				if(pWordItems[nWordsIndex].nHandle<0)
				{
					/* Negative handle: its absolute value is the first candidate and
					   dValue is taken as the weight unchanged */
					p->m_nTags[i][j]=-pWordItems[nWordsIndex].nHandle;
					p->m_dFrequency[i][j++]=pWordItems[nWordsIndex].dValue;
				}
				/* Remaining candidates come from the core dictionary; when slot 0 was
				   filled above, the dictionary's entry 0 is skipped (j starts at 1) */
				GetHandle(dictCore,p->m_sWords[i],&nCount,aPOS,aFreq);
				nPOSCount=nCount;
				for(;j<nCount;j++)
				{
					p->m_nTags[i][j]=aPOS[j];
					/* Explicit casts keep the log() arguments double, as in the other branches */
					p->m_dFrequency[i][j]=-log((double)(1+aFreq[j]))+log((double)(GetContextFrequency(p->m_context,0,p->m_nTags[i][j])+nPOSCount));
				}
			}
		}
		/* No candidate at all: fall back to guessing */
		if(j==0)
		{
			GuessPOS(p,i,&j);
		}
		/* Terminate this word's candidate list */
		p->m_nTags[i][j]=-1;
		/* Exactly one candidate that is not the sentence-begin tag: the word is
		   unambiguous, so the span ends here.  The test must read sub-slot 0 (the
		   word's only tag); sub-slot j has just been overwritten with the -1
		   terminator, so testing it never examined the real tag. */
		if(j==1&&p->m_nTags[i][0]!=CT_SENTENCE_BEGIN)
		{
			i++;
			p->m_sWords[i][0]=0;
			break;
		}
		/* While a split is pending the same input item is visited again */
		if(!bSplit)
			nWordsIndex++;
	}
	if(pWordItems[nWordsIndex].sWord[0]==0)
		nRetPos=-1;
	/* Last loaded word is still ambiguous: append an empty closing slot */
	if(p->m_nTags[i-1][1]!=-1)
	{
		if(p->m_tagType!=TT_NORMAL)
			p->m_nTags[i][0]=101;
		else
			p->m_nTags[i][0]=1;
		p->m_dFrequency[i][0]=0;
		p->m_sWords[i][0]=0;
		p->m_nTags[i++][1]=-1;
	}
	p->m_nCurLength=i;
	if(nRetPos!=-1)
		return nWordsIndex+1;
	return -1;
}
/*49.設置標注類型,正常、人名、地名或翻譯*/
void SetTagType(pSpan p,enum TAG_TYPE nType)
{
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -