?? contextstat.cpp
字號:
//////////////////////////////////////////////////////////////////////
//ICTCLAS簡介:計算所漢語詞法分析系統ICTCLAS(Institute of Computing Technology, Chinese Lexical Analysis System),
// 功能有:中文分詞;詞性標注;未登錄詞識別。
// 分詞正確率高達97.58%(973專家評測結果),
// 未登錄詞識別召回率均高于90%,其中中國人名的識別召回率接近98%;
// 處理速度為31.5Kbytes/s。
//著作權: Copyright © 2002-2005 中科院計算所 職務著作權人:張華平 劉群
//遵循協議:自然語言處理開放資源許可證1.0
//Email: zhanghp@software.ict.ac.cn
//Homepage:www.nlp.org.cn;mtgroup.ict.ac.cn
/****************************************************************************
*
* Copyright (c) 2000, 2001
* Machine Group
* Software Research Lab.
* Institute of Computing Tech.
* Chinese Academy of Sciences
* All rights reserved.
*
* This file is the confidential and proprietary property of
* Institute of Computing Tech. and the possession or use of this file requires
* a written license from the author.
* Filename: ContextStat.cpp
* Abstract:
* implementation of the CContextStat class.
* Author: Kevin Zhang
* (zhanghp@software.ict.ac.cn)
* Date: 2002-1-24
*
* Notes:
*
****************************************************************************/
#include "stdafx.h"
#include "ContextStat.h"
#include "Utility.h"
#include <memory.h>
#include <stdio.h>
#include <string.h>
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
// Creates an empty statistics object: no symbol table and no context items.
// The symbol table is allocated later by SetTableLen() or Load().
CContextStat::CContextStat()
{
    m_nTableLen=0;    // previously left uninitialized although other members read it
    m_pSymbolTable=0; // no symbol buffer yet
    m_pContext=0;     // empty context list
}
// Frees the symbol table and every item of the context list, including
// each item's per-tag frequency array and its square transition matrix.
CContextStat::~CContextStat()
{
    delete [] m_pSymbolTable;

    for(PMYCONTEXT pNode=m_pContext;pNode!=NULL;)
    {
        // Remember the successor before the node itself is destroyed
        PMYCONTEXT pFollowing=pNode->next;

        // Release the matrix row by row, then the row-pointer array
        int nRow=0;
        while(nRow<m_nTableLen)
        {
            delete [] pNode->aContextArray[nRow];
            ++nRow;
        }
        delete [] pNode->aContextArray;

        delete [] pNode->aTagFreq;
        delete pNode;

        pNode=pFollowing;
    }
}
// Copies m_nTableLen symbols from nSymbol into the internal symbol table.
//   nSymbol: source array; must hold at least m_nTableLen ints.
// Returns true on success, false when nSymbol is NULL or the symbol table
// has not been allocated yet (SetTableLen() or Load() allocate it).
bool CContextStat::SetSymbol(int *nSymbol)
{
    if(nSymbol==NULL||m_pSymbolTable==NULL)
        return false;//nothing to copy from / nowhere to copy to
    memcpy(m_pSymbolTable,nSymbol,sizeof(int)*m_nTableLen);
    return true;
}
// Adds nFrequency to the (nPrevSymbol -> nCurSymbol) transition count of the
// context item identified by nKey, creating that item when it does not exist.
//   nKey:        key of the context item to update
//   nPrevSymbol: symbol of the previous position
//   nCurSymbol:  symbol of the current position
//   nFrequency:  amount added to the transition count, to the frequency of
//                the previous symbol and to the item's total frequency
// Returns false when the symbol table is missing or when either symbol
// cannot be located in it; true otherwise.
// Note: as in the original code, a newly created item stays in the list even
// if the symbol lookup fails afterwards.
bool CContextStat::Add(int nKey, int nPrevSymbol, int nCurSymbol, int nFrequency)
{//Add the context symbol to the array
    PMYCONTEXT pRetItem,pNew;
    int nPrevIndex,nCurIndex;
    if(m_pSymbolTable==NULL)//No symbol table to search in
        return false;
    if(!GetItem(nKey,&pRetItem))//Not get it
    {
        pNew=new MYCONTEXT;
        pNew->nKey=nKey;
        pNew->nTotalFreq=0;
        pNew->next=NULL;
        pNew->aContextArray=new int*[m_nTableLen];
        pNew->aTagFreq=new int[m_nTableLen];
        memset(pNew->aTagFreq,0,sizeof(int)*m_nTableLen);
        for(int i=0;i<m_nTableLen;i++)
        {//new buffer for every dimension
            pNew->aContextArray[i]=new int[m_nTableLen];
            memset(pNew->aContextArray[i],0,sizeof(int)*m_nTableLen);//Init the frequency
        }
        if(pRetItem==NULL)
        {//No predecessor: the list is empty OR the new key is smaller than the
         //head's key. Insert in front and keep the existing items linked
         //(the old code overwrote the head and lost the whole list).
            pNew->next=m_pContext;
            m_pContext=pNew;
        }
        else//Link the new item between pRetItem and its next item
        {
            pNew->next=pRetItem->next;
            pRetItem->next=pNew;
        }
        pRetItem=pNew;
    }
    nPrevIndex=BinarySearch(nPrevSymbol,m_pSymbolTable,m_nTableLen);
    if(nPrevSymbol>256&&nPrevIndex==-1)//Not find, just for 'nx' and other uncommon POS
        nPrevIndex=BinarySearch(nPrevSymbol-nPrevSymbol%256,m_pSymbolTable,m_nTableLen);//retry with the low byte cleared
    nCurIndex=BinarySearch(nCurSymbol,m_pSymbolTable,m_nTableLen);
    if(nCurSymbol>256&&nCurIndex==-1)//Not find, just for 'nx' and other uncommon POS
        nCurIndex=BinarySearch(nCurSymbol-nCurSymbol%256,m_pSymbolTable,m_nTableLen);//retry with the low byte cleared
    if(nPrevIndex==-1||nCurIndex==-1)//error finding the symbol
        return false;
    pRetItem->aContextArray[nPrevIndex][nCurIndex]+=nFrequency;//Add the frequency
    pRetItem->aTagFreq[nPrevIndex]+=nFrequency;
    pRetItem->nTotalFreq+=nFrequency;
    return true;
}
// Writes the statistics to two files:
//   sFilename          binary: table length, symbol table, then for every
//                      context item its key, total frequency, per-symbol
//                      frequencies and the transition matrix row by row;
//   sFilename + ".shw" the same data as human-readable text.
// Returns false when sFilename is NULL, the symbol table is missing, either
// file cannot be opened, or a write/close error is reported; true otherwise.
bool CContextStat::Save(char *sFilename)
{
    FILE *fp,*fp1;
    PMYCONTEXT pCur;
    char *sFileShow;
    int i;
    if(sFilename==NULL||m_pSymbolTable==NULL)
        return false;//refuse before truncating an existing file
    if((fp=fopen(sFilename,"wb"))==NULL)
    {
        return false;
    }
    //Build the name of the text file in a buffer sized for the actual path
    //(a fixed char[100] overflowed on long file names)
    sFileShow=new char[strlen(sFilename)+sizeof(".shw")];
    strcpy(sFileShow,sFilename);
    strcat(sFileShow,".shw");
    fp1=fopen(sFileShow,"wb");
    delete [] sFileShow;
    if(fp1==NULL)
    {
        fclose(fp);//do not leak the binary file handle
        return false;
    }
    fwrite(&m_nTableLen,sizeof(m_nTableLen),1,fp);//write the table length
    fwrite(m_pSymbolTable,sizeof(int),m_nTableLen,fp);//write the symbol table
    fprintf(fp1,"Table Len=%d\nSymbol:\n",m_nTableLen);
    for(i=0;i<m_nTableLen;i++)
        fprintf(fp1,"%d ",m_pSymbolTable[i]);
    fprintf(fp1,"\n");
    pCur=m_pContext;
    while(pCur!=NULL)
    {
        fwrite(&pCur->nKey,sizeof(int),1,fp);
        fwrite(&pCur->nTotalFreq,sizeof(int),1,fp);
        fprintf(fp1,"nKey=%d,Total frequency=%d:\n",pCur->nKey,pCur->nTotalFreq);
        fwrite(pCur->aTagFreq,sizeof(int),m_nTableLen,fp);//the every POS frequency
        for(i=0;i<m_nTableLen;i++)
        {
            fwrite(pCur->aContextArray[i],sizeof(int),m_nTableLen,fp);
            fprintf(fp1,"No.%2d=%3d: ",i,m_pSymbolTable[i]);
            for(int j=0;j<m_nTableLen;j++)
                fprintf(fp1,"%5d ",pCur->aContextArray[i][j]);
            fprintf(fp1,"total=%d:\n",pCur->aTagFreq[i]);
        }
        pCur=pCur->next;
    }
    //Report failures instead of silently claiming success; fclose() can
    //also fail while flushing buffered data
    bool bOk=!ferror(fp)&&!ferror(fp1);
    if(fclose(fp)!=0)
        bOk=false;
    if(fclose(fp1)!=0)
        bOk=false;
    return bOk;
}
// Replaces the current statistics with the content of the binary file
// written by Save(): table length, symbol table, then a sequence of context
// records (key, total frequency, per-symbol frequencies, transition matrix).
// Returns false when sFilename is NULL, the file cannot be opened, the
// header (table length / symbol table) is missing or invalid, or a record
// is truncated. After a header failure the object is empty; after a
// truncated record the complete records read so far are kept.
bool CContextStat::Load(char *sFilename)
{
    FILE *fp;
    if(sFilename==NULL||(fp=fopen(sFilename,"rb"))==NULL)
    {
        return false;
    }
    //Free the existing items; this must happen before m_nTableLen is
    //overwritten because the old length gives the number of matrix rows
    PMYCONTEXT pCur=m_pContext,pTemp,pPre=NULL;
    while(pCur!=NULL)
    {//delete the context array
        pTemp=pCur->next;
        for(int i=0;i<m_nTableLen;i++)
            delete [] pCur->aContextArray[i];
        delete [] pCur->aContextArray;
        delete [] pCur->aTagFreq;
        delete pCur;
        pCur=pTemp;
    }
    m_pContext=NULL;//the head pointed at freed memory; an empty file left it dangling
    delete [] m_pSymbolTable;//delete[] of a null pointer is a no-op
    m_pSymbolTable=NULL;
    if(fread(&m_nTableLen,sizeof(m_nTableLen),1,fp)<1||m_nTableLen<0)
    {//read the table length; reject a missing or negative value
        m_nTableLen=0;
        fclose(fp);
        return false;
    }
    m_pSymbolTable=new int[m_nTableLen];//new buffer for symbol
    if(fread(m_pSymbolTable,sizeof(int),m_nTableLen,fp)!=(size_t)m_nTableLen)
    {//read the symbol table; a short read means the header is truncated
        delete [] m_pSymbolTable;
        m_pSymbolTable=NULL;
        m_nTableLen=0;
        fclose(fp);
        return false;
    }
    bool bComplete=true;//cleared when a record stops in the middle
    for(;;)
    {//Read the context records until the key of the next one cannot be read
        int i;
        pCur=new MYCONTEXT;
        pCur->next=NULL;
        if(fread(&pCur->nKey,sizeof(int),1,fp)<1)//No further record
        {
            delete pCur;
            break;
        }
        pCur->aTagFreq=new int [m_nTableLen];
        pCur->aContextArray=new int *[m_nTableLen];
        for(i=0;i<m_nTableLen;i++)
            pCur->aContextArray[i]=NULL;//lets the cleanup below run on a half-built record
        bool bRecordOk=fread(&pCur->nTotalFreq,sizeof(int),1,fp)==1
            &&fread(pCur->aTagFreq,sizeof(int),m_nTableLen,fp)==(size_t)m_nTableLen;//the every POS frequency
        for(i=0;bRecordOk&&i<m_nTableLen;i++)
        {
            pCur->aContextArray[i]=new int[m_nTableLen];
            bRecordOk=fread(pCur->aContextArray[i],sizeof(int),m_nTableLen,fp)==(size_t)m_nTableLen;
        }
        if(!bRecordOk)
        {//Truncated record: discard it rather than link uninitialized data
            for(i=0;i<m_nTableLen;i++)
                delete [] pCur->aContextArray[i];
            delete [] pCur->aContextArray;
            delete [] pCur->aTagFreq;
            delete pCur;
            bComplete=false;
            break;
        }
        if(pPre==NULL)
            m_pContext=pCur;//first record becomes the head
        else
            pPre->next=pCur;//append in file order
        pPre=pCur;
    }
    fclose(fp);
    return bComplete;
}
double CContextStat::GetContextPossibility(int nKey, int nPrev, int nCur)
{
PMYCONTEXT pCur;
int nCurIndex=BinarySearch(nCur,m_pSymbolTable,m_nTableLen);
int nPrevIndex=BinarySearch(nPrev,m_pSymbolTable,m_nTableLen);
if(!GetItem(nKey,&pCur)||nCurIndex==-1||nPrevIndex==-1||pCur->aTagFreq[nPrevIndex]==0||pCur->aContextArray[nPrevIndex][nCurIndex]==0)
return 0.000001;//return a lower value, not 0 to prevent data sparse
int nPrevCurConFreq=pCur->aContextArray[nPrevIndex][nCurIndex];
int nPrevFreq=pCur->aTagFreq[nPrevIndex];
return 0.9*(double)nPrevCurConFreq/(double)nPrevFreq+0.1*(double)nPrevFreq/(double)pCur->nTotalFreq;
//0.9 and 0.1 is a value based experience
}
int CContextStat::GetFrequency(int nKey, int nSymbol)
{//Get the frequency which nSymbol appears
PMYCONTEXT pFound;
int nIndex,nFrequency=0;
if(!GetItem(nKey,&pFound))//Not found such a item
return 0;
nIndex=BinarySearch(nSymbol,m_pSymbolTable,m_nTableLen);
if(nIndex==-1)//error finding the symbol
return 0;
nFrequency=pFound->aTagFreq[nIndex];//Add the frequency
return nFrequency;
}
bool CContextStat::GetItem(int nKey,PMYCONTEXT *pItemRet)
{//Get the item according the nKey
PMYCONTEXT pCur=m_pContext,pPrev=NULL;
if(nKey==0&&m_pContext)
{
*pItemRet=m_pContext;
return true;
}
while(pCur!=NULL&&pCur->nKey<nKey)
{//delete the context array
pPrev=pCur;
pCur=pCur->next;
}
if(pCur!=NULL&&pCur->nKey==nKey)
{//find it and return the current item
*pItemRet=pCur;
return true;
}
*pItemRet=pPrev;
return false;
}
bool CContextStat::SetTableLen(int nTableLen)
{
m_nTableLen=nTableLen;//Set the table len
m_pSymbolTable=new int[nTableLen];//new buffer for symbol
m_pContext=0;//init with empty
return true;
}
// Frees every context item and the symbol table, leaving both the list head
// and the symbol table pointer set to NULL. m_nTableLen is not modified.
// Calling it on an already released object does nothing.
void CContextStat::ReleaseContextStat()
{
    PMYCONTEXT pCur=m_pContext,pTemp;
    while(pCur!=NULL)
    {//delete the context array
        pTemp=pCur->next;//remember the successor before the node is destroyed
        if(pCur->aContextArray!=NULL)
        {//Check the row-pointer array itself first; the old code indexed it
         //before testing it for NULL
            for(int i=0;i<m_nTableLen;i++)
            {
                delete [] pCur->aContextArray[i];//delete[] of a null row is a no-op
                pCur->aContextArray[i]=NULL;
            }
            delete [] pCur->aContextArray;
            pCur->aContextArray=NULL;
        }
        if(pCur->aTagFreq!=NULL)
        {
            delete [] pCur->aTagFreq;
            pCur->aTagFreq=NULL;
        }
        delete pCur;
        pCur=pTemp;
    }
    m_pContext=NULL;
    if(m_pSymbolTable!=NULL)
    {
        delete [] m_pSymbolTable;
        m_pSymbolTable=NULL;
    }
}
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -