?? unknowword.cpp
字號(hào):
//////////////////////////////////////////////////////////////////////
//ICTCLAS簡(jiǎn)介:計(jì)算所漢語(yǔ)詞法分析系統(tǒng)ICTCLAS(Institute of Computing Technology, Chinese Lexical Analysis System),
// 功能有:中文分詞;詞性標(biāo)注;未登錄詞識(shí)別。
// 分詞正確率高達(dá)97.58%(973專(zhuān)家評(píng)測(cè)結(jié)果),
// 未登錄詞識(shí)別召回率均高于90%,其中中國(guó)人名的識(shí)別召回率接近98%;
// 處理速度為31.5Kbytes/s。
//著作權(quán): Copyright?2002-2005中科院計(jì)算所 職務(wù)著作權(quán)人:張華平 劉群
//遵循協(xié)議:自然語(yǔ)言處理開(kāi)放資源許可證1.0
//Email: zhanghp@software.ict.ac.cn
//Homepage:www.nlp.org.cn;mtgroup.ict.ac.cn
/****************************************************************************
*
* Copyright (c) 2000, 2001
* Machine Group
* Software Research Lab.
* Institute of Computing Tech.
* Chinese Academy of Sciences
* All rights reserved.
*
* This file is the confidential and proprietary property of
* Institute of Computing Tech. and the posession or use of this file requires
* a written license from the author.
* Filename: UnknowWord.cpp
* Abstract:
* implementation of the CUnknowWord class.
* Author: Kevin Zhang
* (zhanghp@software.ict.ac.cn)
* Date: 2002-4-23
*
* Notes: Unknown words recognition based on Role-tagging
*
****************************************************************************/
#include "stdafx.h"
#include "UnknowWord.h"
#include <string.h>
#include <math.h>
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
CUnknowWord::CUnknowWord()
{
m_sUnknownFlags[0]=0;
}
CUnknowWord::~CUnknowWord()
{
}
//Unknown word recognition
//pWordSegResult:word Segmentation result;graphOptimum: The optimized segmentation graph
//graphSeg: The original segmentation graph
bool CUnknowWord::Recognition(PWORD_RESULT pWordSegResult, CDynamicArray &graphOptimum,CSegGraph &graphSeg,CDictionary &dictCore)
{
int nStartPos=0,j=0,nAtomStart,nAtomEnd,nPOSOriginal;
ELEMENT_TYPE dValue;
m_roleTag.POSTagging(pWordSegResult,dictCore,m_dict);
//Tag the segmentation with unknown recognition roles according the core dictionary and unknown recognition dictionary
for(int i=0;i<m_roleTag.m_nUnknownIndex;i++)
{
while((unsigned int)j<graphSeg.m_nAtomCount&&nStartPos<m_roleTag.m_nUnknownWords[i][0])
{
nStartPos+=graphSeg.m_nAtomLength[j++];
}
nAtomStart=j;
while((unsigned int)j<graphSeg.m_nAtomCount&&nStartPos<m_roleTag.m_nUnknownWords[i][1])
{
nStartPos+=graphSeg.m_nAtomLength[j++];
}
nAtomEnd=j;
if(nAtomStart<nAtomEnd)
{
graphOptimum.GetElement(nAtomStart,nAtomEnd,&dValue,&nPOSOriginal);
if(dValue>m_roleTag.m_dWordsPossibility[i])//Set the element with less frequency
graphOptimum.SetElement(nAtomStart,nAtomEnd,m_roleTag.m_dWordsPossibility[i],m_nPOS,m_sUnknownFlags);
}
}
return true;
}
//Load unknown recognition dictionary
//Load context
//type: Unknown words type (including person,place,transliterion and so on)
bool CUnknowWord::Configure(char *sConfigFile,enum TAG_TYPE type)
{
char sFilename[100];
//Load the unknown recognition dictionary
strcpy(sFilename,sConfigFile);
strcat(sFilename,".dct");
m_dict.Load(sFilename);
//Load the unknown recognition context
strcpy(sFilename,sConfigFile);
strcat(sFilename,".ctx");
m_roleTag.LoadContext(sFilename);
//Set the tagging type
m_roleTag.SetTagType(type);
switch(type)
{
case TT_PERSON:
case TT_TRANS_PERSON://Set the special flag for transliterations
m_nPOS=-28274;//-'n'*256-'r';
strcpy(m_sUnknownFlags,"未##人");
break;
case TT_PLACE:
m_nPOS=-28275;//-'n'*256-'s';
strcpy(m_sUnknownFlags,"未##地");
break;
default :
m_nPOS=0;
break;
}
return true;
}
//Judge whether the name is a given name
bool CUnknowWord::IsGivenName(char *sName)
{
char sFirstChar[3],sSecondChar[3];
ELEMENT_TYPE dGivenNamePossibility=0,dSingleNamePossibility=0;
if(strlen(sName)!=4)
return false;
strncpy(sFirstChar,sName,2);
sFirstChar[2]=0;
strncpy(sSecondChar,sName+2,2);
sSecondChar[2]=0;
//The possibility of P(Wi|Ti)
dGivenNamePossibility+=log((double)m_dict.GetFrequency(sFirstChar,2)+1.0)-log(m_roleTag.m_context.GetFrequency(0,2)+1.0);
dGivenNamePossibility+=log((double)m_dict.GetFrequency(sSecondChar,3)+1.0)-log(m_roleTag.m_context.GetFrequency(0,3)+1.0);
//The possibility of conversion from 2 to 3
dGivenNamePossibility+=log(m_roleTag.m_context.GetContextPossibility(0,2,3)+1.0)-log(m_roleTag.m_context.GetFrequency(0,2)+1.0);
//The possibility of P(Wi|Ti)
dSingleNamePossibility+=log((double)m_dict.GetFrequency(sFirstChar,1)+1.0)-log(m_roleTag.m_context.GetFrequency(0,1)+1.0);
dSingleNamePossibility+=log((double)m_dict.GetFrequency(sSecondChar,4)+1.0)-log(m_roleTag.m_context.GetFrequency(0,4)+1.0);
//The possibility of conversion from 1 to 4
dSingleNamePossibility+=log(m_roleTag.m_context.GetContextPossibility(0,1,4)+1.0)-log(m_roleTag.m_context.GetFrequency(0,1)+1.0);
if(dSingleNamePossibility>=dGivenNamePossibility)//張震||m_dict.GetFrequency(sFirstChar,1)/m_dict.GetFrequency(sFirstChar,2)>=10
//The possibility being a single given name is more than being a 2-char given name
return false;
return true;
}
?? 快捷鍵說(shuō)明
復(fù)制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號(hào)
Ctrl + =
減小字號(hào)
Ctrl + -