?? utility.cpp
字號:
//////////////////////////////////////////////////////////////////////
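//About ICTCLAS: the Institute of Computing Technology, Chinese Lexical Analysis System (ICTCLAS),
// Features: Chinese word segmentation; part-of-speech tagging; unknown (out-of-vocabulary) word recognition.
// Segmentation accuracy reaches 97.58% (result of the 973 Program expert evaluation),
// Recall of unknown-word recognition is above 90% throughout; recall for Chinese personal names is close to 98%;
// Processing speed is 31.5 Kbytes/s.
//Copyright: Copyright (c) 2002-2005 Institute of Computing Technology, Chinese Academy of Sciences. Authors (work made for hire): Zhang Huaping, Liu Qun
//License: Natural Language Processing Open Resource License 1.0
//Email: zhanghp@software.ict.ac.cn
//Homepage:www.nlp.org.cn;mtgroup.ict.ac.cn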
/****************************************************************************
*
* Copyright (c) 2000, 2001
* Machine Group
* Software Research Lab.
* Institute of Computing Tech.
* Chinese Academy of Sciences
* All rights reserved.
*
* This file is the confidential and proprietary property of
* Institute of Computing Tech. and the possession or use of this file requires
* a written license from the author.
* Filename: Utility.c
* Abstract:
* Utility functions for Chinese Language Processing
* Author: Kevin Zhang
* (zhanghp@software.ict.ac.cn)
* Date: 2002-1-8
*
* Notes:
*
****************************************************************************/
#include "stdafx.h"
#include "Utility.h"
#include <stdio.h>
#include <string.h>
/*********************************************************************
*
* Func Name : GB2312_Generate
*
* Description: Generate the GB2312 list file. For every lead byte and
*              every trail byte in 161..254 one line is written:
*              "<lead byte><trail byte>,<lead value>,<trail value>".
*
* Parameters : sFileName: path of the text file to create or overwrite
*
* Returns : bool: true when every line was written and the file was
*           closed without error; false when sFileName is NULL, the
*           file cannot be opened, or any write/flush fails
* Author : Kevin Zhang
* History :
* 1.create 2002-1-8
*********************************************************************/
bool GB2312_Generate(char *sFileName)
{
	FILE *fp;
	unsigned int i,j;
	bool bOk=true;
	if(sFileName==NULL)
		return false;//nothing to open
	if((fp=fopen(sFileName,"wt"))==NULL)
		return false;//fail while opening the file
	for(i=161;i<255&&bOk;i++)
	{
		for(j=161;j<255;j++)
		{
			//%c takes an int; %u matches the unsigned loop counters
			if(fprintf(fp,"%c%c,%u,%u\n",(int)i,(int)j,i,j)<0)
			{
				bOk=false;//stop at the first failed write
				break;
			}
		}
	}
	//buffered data is flushed by fclose, so its result decides success too
	if(fclose(fp)!=0)
		bOk=false;
	return bOk;
}
/*********************************************************************
*
* Func Name : CC_Generate
*
* Description: Generate the Chinese Char list file. For every lead
*              byte in 176..254 and every trail byte in 161..254 one
*              line is written:
*              "<lead byte><trail byte>,<lead value>,<trail value>".
*
* Parameters : sFileName: path of the text file to create or overwrite
*
* Returns : bool: true when every line was written and the file was
*           closed without error; false when sFileName is NULL, the
*           file cannot be opened, or any write/flush fails
* Author : Kevin Zhang
* History :
* 1.create 2002-1-8
*********************************************************************/
bool CC_Generate(char *sFileName)
{
	FILE *fp;
	unsigned int i,j;
	bool bOk=true;
	if(sFileName==NULL)
		return false;//nothing to open
	if((fp=fopen(sFileName,"wt"))==NULL)
		return false;//fail while opening the file
	for(i=176;i<255&&bOk;i++)
	{
		for(j=161;j<255;j++)
		{
			//%c takes an int; %u matches the unsigned loop counters
			if(fprintf(fp,"%c%c,%u,%u\n",(int)i,(int)j,i,j)<0)
			{
				bOk=false;//stop at the first failed write
				break;
			}
		}
	}
	//buffered data is flushed by fclose, so its result decides success too
	if(fclose(fp)!=0)
		bOk=false;
	return bOk;
}
/*********************************************************************
*
* Func Name : CC_Find
*
* Description: Find a Chinese sub-string in a Chinese string. A hit is
*              accepted only at an even byte offset from the start of
*              string, i.e. on a character boundary when string is
*              made up of two-byte characters. Hits at odd offsets
*              (trail byte of one character + lead byte of the next)
*              are skipped and the search continues behind them.
*
* Parameters : string:Null-terminated string to search
*
*              strCharSet:Null-terminated string to search for
*
* Returns : char *: pointer to the first even-offset occurrence of
*           strCharSet inside string; NULL when there is none or when
*           either argument is NULL
* Author : Kevin Zhang
* History :
* 1.create 2002-1-8
*********************************************************************/
char *CC_Find(const char *string, const char *strCharSet)
{
	const char *cp;
	if(string==NULL||strCharSet==NULL)
		return NULL;
	cp=strstr(string,strCharSet);
	while(cp!=NULL&&(cp-string)%2==1)
	{
		//misaligned hit: retry one byte further on; cp+1 is valid because an
		//odd-offset hit implies a non-empty pattern, so *cp is not the terminator
		cp=strstr(cp+1,strCharSet);
	}
	return (char *)cp;//the interface hands back a non-const pointer, like strstr
}
/*********************************************************************
*
* Func Name : charType
*
* Description: Judge the type of sChar or (sChar,sChar+1)
*
*   first byte < 128 : CT_DELIMITER if it is one of "!,.?()[]{}+=
*                      otherwise CT_SINGLE
*   first byte 162   : CT_INDEX
*   first byte 163   : second byte 176..185 -> CT_NUM
*                      second byte 193..218 or 225..250 -> CT_LETTER
*                      anything else -> CT_DELIMITER
*   first byte 161   : CT_DELIMITER
*   first byte 176..247 : CT_CHINESE
*   everything else  : CT_OTHER
*
* Parameters : sChar: pointer to the character to classify; the byte
*              at sChar+1 is read only when the first byte is 163
*
* Returns : int : the type of char (one of the CT_* constants)
* Author : Kevin Zhang
* History :
* 1.create 2002-1-8
*********************************************************************/
int charType(unsigned char *sChar)
{
	const unsigned char lead=sChar[0];
	if(lead<128)
	{
		//strchr also matches the literal's terminating NUL, so a 0 byte
		//is reported as CT_DELIMITER
		return strchr("\042!,.?()[]{}+=",(int)lead)!=NULL ? CT_DELIMITER : CT_SINGLE;
	}
	if(lead==162)
		return CT_INDEX;
	if(lead==163)
	{
		const unsigned char trail=sChar[1];
		if(trail>175&&trail<186)
			return CT_NUM;
		if((trail>=193&&trail<=218)||(trail>=225&&trail<=250))
			return CT_LETTER;
		return CT_DELIMITER;
	}
	if(lead==161)
		return CT_DELIMITER;
	if(lead>=176&&lead<=247)
		return CT_CHINESE;
	return CT_OTHER;
}
/*********************************************************************
*
* Func Name : GetCCPrefix
*
* Description: Get the max Prefix string made up of Chinese Char.
*              A Chinese char is taken to be two bytes whose first
*              byte lies in 176..247; the second byte is not checked.
*              A lead byte that is the very last byte of the sentence
*              has no partner and is not counted, so the result never
*              exceeds the length of the sentence.
*
* Parameters : sSentence: the original sentence which includes Chinese or Non-Chinese char
*
* Returns : the end of the sub-sentence, i.e. the length in bytes of
*           the leading run of Chinese chars (0 for NULL or no prefix)
* Author : Kevin Zhang
* History :
* 1.create 2002-1-8
*********************************************************************/
unsigned int GetCCPrefix(unsigned char *sSentence)
{
	unsigned int nLen,nCurPos=0;
	if(sSentence==NULL)
		return 0;
	nLen=(unsigned int)strlen((const char *)sSentence);
	//nCurPos+1<nLen: both bytes of the char must lie inside the sentence
	while(nCurPos+1<nLen&&sSentence[nCurPos]>175&&sSentence[nCurPos]<248)
	{
		nCurPos+=2;//Get next Chinese Char
	}
	return nCurPos;
}
/*********************************************************************
*
* Func Name : IsAllChinese
*
* Description: Judge whether the string is made up of Chinese chars
*              only. The string is walked two bytes at a time; each
*              pair must start with a byte in 176..247 (the second
*              byte of a pair is not inspected).
*
* Parameters : sString: the null-terminated string to check
*
* Returns : bool: true when the whole string was consumed as such
*           pairs (an empty string gives true); false otherwise,
*           including when a single byte is left over at the end
* Author : Kevin Zhang
* History :
* 1.create 2002-1-24
*********************************************************************/
bool IsAllChinese(unsigned char *sString)
{
	const unsigned int nTotal=strlen((const char *)sString);
	unsigned int nPos=0;
	for(;;)
	{
		if(nPos+1>=nTotal)
			break;//fewer than two bytes left
		if(sString[nPos]<=175||sString[nPos]>=248)
			break;//not a Chinese lead byte
		nPos+=2;
	}
	return nPos>=nTotal;
}
/*********************************************************************
*
* Func Name : IsAllNonChinese
*
* Description: Judge whether the string contains no Chinese char.
*              The string is walked char by char: a byte above 128
*              starts a two-byte char, any other byte is a one-byte
*              char. A char whose first byte lies in 176..247 counts
*              as Chinese.
*
* Parameters : sString: the null-terminated string to check
*
* Returns : bool: false as soon as a Chinese char is met, true when
*           the end of the string is reached without one (an empty
*           string gives true)
* Author : Kevin Zhang
* History :
* 1.create 2002-1-24
*********************************************************************/
bool IsAllNonChinese(unsigned char *sString)
{
	const unsigned int nTotal=strlen((const char *)sString);
	unsigned int nPos;
	//step over two bytes for a double-byte char, one byte otherwise
	for(nPos=0;nPos<nTotal;nPos+=(sString[nPos]>128)?2:1)
	{
		if(sString[nPos]>175&&sString[nPos]<248)
			return false;
	}
	return true;
}
/*********************************************************************
*
* Func Name : IsAllSingleByte
*
* Description: Judge whether the string is made up of single-byte
*              chars only, i.e. every byte is below 128.
*
* Parameters : sString: the null-terminated string to check
*
* Returns : bool: true when no byte of the string is 128 or above
*           (an empty string gives true), false otherwise
* Author : Kevin Zhang
* History :
* 1.create 2002-1-24
*********************************************************************/
bool IsAllSingleByte(unsigned char *sString)
{
	const unsigned int nTotal=strlen((const char *)sString);
	unsigned int nPos;
	for(nPos=0;nPos<nTotal;++nPos)
	{
		if(sString[nPos]>=128)
			return false;//found a byte with the high bit set
	}
	return true;
}
/*********************************************************************
*
* Func Name : IsAllNum
*
* Description: Judge the string is all made up of Num Char
*
*
* Parameters : sSentence: the original sentence which includes Chinese or Non-Chinese char
*
* Returns : the end of the sub-sentence
* Author : Kevin Zhang
* History :
* 1.create 2002-1-24
*********************************************************************/
bool IsAllNum(unsigned char *sString)
{
unsigned int nLen=strlen((const char *)sString),i=0;
char sChar[3];
sChar[2]=0;
if(i<nLen)//Get prefix such as + -
{
sChar[0]=sString[i++];
if(sChar[0]<0)//Get first char
sChar[1]=sString[i++];
else
sChar[1]=0;
if(!strstr("±+—-+",sChar))
{
i=0;
}
}
while(i<nLen-1&&sString[i]==163&&sString[i+1]>175&&sString[i+1]<186)
{
i+=2;
}
if(i<nLen)//Get middle delimiter such as .
{
sChar[0]=sString[i++];
if(sChar[0]<0)//Get first char
sChar[1]=sString[i++];
else
sChar[1]=0;
if(CC_Find("∶·./",sChar)||sChar[0]=='.'||sChar[0]=='/')
{//98.1%
while(i<nLen-1&&sString[i]==163&&sString[i+1]>175&&sString[i+1]<186)
{
i+=2;
}
}
else
{
i-=strlen(sChar);
}
}
if(i>=nLen)
return true;
while(i<nLen&&sString[i]>'0'-1&&sString[i]<'9'+1)
{//single byte number char
i+=1;
}
if(i<nLen)//Get middle delimiter such as .
{
sChar[0]=sString[i++];
if(sChar[0]<0)//Get first char
sChar[1]=sString[i++];
else
sChar[1]=0;
if(CC_Find("∶·./",sChar)||sChar[0]=='.'||sChar[0]=='/')
{//98.1%
while(i<nLen&&sString[i]>'0'-1&&sString[i]<'9'+1)
{
i+=1;
}
}
else
{
i-=strlen(sChar);
?? 快捷鍵說明
復(fù)制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -