?? diction.cpp
字號:
pszOrgPtr ++;
ucLen = *pszCodePtr; // 規則左部
pszCodePtr ++;
strncpy(pszOrgPtr,pszCodePtr,ucLen);
pszOrgPtr[ucLen] = '\0';
pszOrgPtr += ucLen;
pszCodePtr += ucLen;
ucLen = *pszCodePtr; // 規則右部
pszCodePtr ++;
strncpy(pszOrgPtr,pszCodePtr,ucLen);
pszOrgPtr[ucLen] = '\0';
pszOrgPtr += ucLen;
pszCodePtr += ucLen;
}
}
void EncodeWordRule(LPCSTR pszSouCode,LPSTR pszTarCode,int &nTarCodeLen)
// Encodes the rule text of a dictionary entry into a length-prefixed form.
//   pszSouCode   raw rule text from the source dictionary, for example
//                @1:W=chark->sdafasdf@2:W=chark->sdafasdf
//                Each rule starts with '@'; "->" separates the left part of
//                a rule from its optional right part.
//   pszTarCode   caller-supplied buffer that receives the encoded rules
//                (its size is not checked here)
//   nTarCodeLen  receives the number of encoded bytes
// (format requested by xuned on 1997.10.25)
// Encoded layout:
//   rule count (1 byte), then for every rule:
//   left length (1 byte), left part, right length (1 byte), right part
// A rule without "->" is stored with a right length of 0.
// The count and the lengths are each stored in a single byte, so values
// that do not fit in one byte are not represented correctly.
{
    // All scanning pointers refer to the read-only source text, so they are
    // LPCSTR: strstr/strchr on a const string return a const pointer.
    LPCSTR pszSouPtr = pszSouCode;
    int nRuleNum = 0;   // number of rules seen so far

    nTarCodeLen = 1;    // byte 0 is reserved for the rule count
    do {
        nRuleNum ++;
        pszSouPtr ++;   // step over the leading '@' (not verified)

        LPCSTR pszArrow    = strstr(pszSouPtr,"->");
        LPCSTR pszNextRule = strchr(pszSouPtr,'@');
        // An arrow found beyond the next '@' belongs to a later rule.
        if ( pszArrow != NULL && pszNextRule != NULL && pszArrow > pszNextRule )
            pszArrow = NULL;

        // The current rule ends at the next '@' or at the end of the text.
        LPCSTR pszRuleEnd = ( pszNextRule != NULL )
                                ? pszNextRule
                                : pszSouPtr + strlen(pszSouPtr);

        // Left part: everything up to the arrow, or the whole rule if none.
        LPCSTR pszLeftEnd = ( pszArrow != NULL ) ? pszArrow : pszRuleEnd;
        int nLeftLen = (int)(pszLeftEnd - pszSouPtr);
        pszTarCode[nTarCodeLen] = (char)nLeftLen;
        nTarCodeLen ++;
        memcpy(pszTarCode+nTarCodeLen,pszSouPtr,nLeftLen);
        nTarCodeLen += nLeftLen;

        if ( pszArrow != NULL ) {   // the rule has a right part
            LPCSTR pszRight = pszArrow + 2;   // skip "->"
            int nRightLen = (int)(pszRuleEnd - pszRight);
            pszTarCode[nTarCodeLen] = (char)nRightLen;
            nTarCodeLen ++;
            memcpy(pszTarCode+nTarCodeLen,pszRight,nRightLen);
            nTarCodeLen += nRightLen;
            // Keep the buffer NUL-terminated after the final right part;
            // the terminator is not counted in nTarCodeLen.
            if ( pszNextRule == NULL )
                pszTarCode[nTarCodeLen] = '\0';
        } else {                    // no right part: store a zero length
            pszTarCode[nTarCodeLen] = 0;
            nTarCodeLen ++;
        }

        pszSouPtr = pszNextRule;
    } while ( pszSouPtr != NULL );

    *pszTarCode = (UCHAR)nRuleNum;
}
int CDictIndex::EncodeSingleSlot(Slot *pSlot,LPSTR pszSingleSlot)
// Encodes one slot into pszSingleSlot and returns the number of encoded bytes.
// Every encoding starts with a 2-byte code, low byte first:
//   slot without value, name starts with '@' (a rule):
//       RULE_CODE, rule data length (1 byte), rule data from EncodeWordRule
//   slot without value, any other name:
//       index found for the slot name in m_mapNoValueSlot
//   slot with value whose name index equals m_nQualfrCode (a quantifier):
//       ADDQUALFR, value length (1 byte), value text
//   any other slot with value:
//       index found for the value in m_mapSlotValue[name index]
// A failed map lookup only triggers ASSERT.
{
    ObWord* pEntry;
    LPSTR pszOut = pszSingleSlot;   // next byte to write

    if ( pSlot->m_pszSlotValue == NULL ) {
        if ( pSlot->m_pszSlotName[0] == '@' ) {
            // Rule: the length byte is filled in after the rule is encoded.
            *pszOut++ = LOBYTE(RULE_CODE);
            *pszOut++ = HIBYTE(RULE_CODE);
            LPSTR pszRuleLenByte = pszOut++;
            int nRuleCodeLen;
            EncodeWordRule(pSlot->m_pszSlotName,pszOut,nRuleCodeLen);
            *pszRuleLenByte = (BYTE)nRuleCodeLen;
            pszOut += nRuleCodeLen;
        } else {
            // Plain slot that carries no value.
            if ( !m_mapNoValueSlot.Lookup(pSlot->m_pszSlotName,( CObject*& )pEntry) )
                ASSERT( FALSE ); // slot name not found
            *pszOut++ = LOBYTE((WORD)pEntry->m_nIndex);
            *pszOut++ = HIBYTE((WORD)pEntry->m_nIndex);
        }
        return (int)(pszOut - pszSingleSlot);
    }

    if ( !m_mapSlotName.Lookup(pSlot->m_pszSlotName,( CObject*& )pEntry) )
        ASSERT( FALSE ); // slot name not found

    if ( pEntry->m_nIndex == m_nQualfrCode ) {
        // Quantifier slot: the value text itself is stored.
        //   offset 0: low byte of ADDQUALFR
        //   offset 1: high byte of ADDQUALFR
        //   offset 2: length of the quantifier
        //   offset 3 onward: the quantifier text
        int nValueLen = (int)strlen(pSlot->m_pszSlotValue);
        *pszOut++ = LOBYTE(ADDQUALFR);
        *pszOut++ = HIBYTE(ADDQUALFR);
        *pszOut++ = (BYTE)nValueLen;
        strcpy(pszOut,pSlot->m_pszSlotValue);
        pszOut += nValueLen;
    } else {
        // Ordinary slot: store the index of the value within the table
        // selected by the slot name.
        if ( !m_mapSlotValue[pEntry->m_nIndex].Lookup(pSlot->m_pszSlotValue,
                                                      ( CObject*& )pEntry) ) {
            ASSERT( FALSE ); // slot value not found
        }
        *pszOut++ = LOBYTE((WORD)pEntry->m_nIndex);
        *pszOut++ = HIBYTE((WORD)pEntry->m_nIndex);
    }
    return (int)(pszOut - pszSingleSlot);
}
int CDictIndex::EncodeSlotLink(Slot *pFirstSlot,LPSTR pszSlotLink)
// Encodes a linked list of slots.
// Each slot is encoded with EncodeSingleSlot, one directly after the other,
// into pszSlotLink. Returns the total number of encoded bytes; an empty
// list (pFirstSlot == NULL) encodes to 0 bytes.
{
    int nSlotLinkLen = 0;
    for ( Slot *pCurrSlot = pFirstSlot;
          pCurrSlot != NULL;
          pCurrSlot = pCurrSlot->m_pNextSlot ) {
        nSlotLinkLen += EncodeSingleSlot(pCurrSlot,
                                         pszSlotLink+nSlotLinkLen);
    }
    return nSlotLinkLen;
}
int CDictIndex::CalculateSlotNum(Slot *pFirstSlot)
// Counts the slots in a linked list and returns the count.
// Every node is counted (translation rules are not skipped).
// An empty list (pFirstSlot == NULL) yields 0.
{
    int nSlotNum = 0;
    for ( Slot *pCurrSlot = pFirstSlot;
          pCurrSlot != NULL;
          pCurrSlot = pCurrSlot->m_pNextSlot ) {
        nSlotNum ++;
    }
    return nSlotNum;
}
int CDictIndex::EncodeSingleChinesePart(ChinesePart *pCurrChinese,
                                        LPSTR pszSinglePartInfo)
// Encodes one Chinese translation and its attributes into pszSinglePartInfo.
// Layout written:
//   translation length (1 byte), translation text,
//   category index (1 byte, 0 when there is no category),
//   head index (1 byte, 0 when there is no head),
//   slot count (1 byte), encoded slots
// Returns the number of bytes written. A failed map lookup only triggers ASSERT.
{
    LPSTR pszOut = pszSinglePartInfo;   // next byte to write
    ObWord* pEntry;

    // Length-prefixed translation text.
    *pszOut++ = pCurrChinese->m_nChineseLen;
    memcpy(pszOut,pCurrChinese->m_pszChinese,pCurrChinese->m_nChineseLen);
    pszOut += pCurrChinese->m_nChineseLen;

    // Category (part of speech).
    UCHAR ucCate = 0;   // 0 = no category
    if ( pCurrChinese->m_pszCate != NULL ) {
        if ( !m_mapCate.Lookup(pCurrChinese->m_pszCate,( CObject*& )pEntry) ) {
            ASSERT( FALSE ); // category not found
        }
        ucCate = (UCHAR)pEntry->m_nIndex;
    }
    *pszOut++ = ucCate;

    // Head (semantic class).
    UCHAR ucHead = 0;   // 0 = no head
    if ( pCurrChinese->m_pszHead != NULL ) {
        if ( !m_mapHead.Lookup(pCurrChinese->m_pszHead,( CObject*& )pEntry) ) {
            ASSERT( FALSE ); // head not found
        }
        ucHead = (UCHAR)pEntry->m_nIndex;
    }
    *pszOut++ = ucHead;

    // Slot count followed by the encoded slots.
    if ( pCurrChinese->m_pFirstSlot == NULL ) {
        *pszOut++ = (UCHAR)0;
    } else {
        *pszOut++ = CalculateSlotNum(pCurrChinese->m_pFirstSlot);
        pszOut += EncodeSlotLink(pCurrChinese->m_pFirstSlot,pszOut);
    }
    return (int)(pszOut - pszSinglePartInfo);
}
int CDictIndex::EncodeChinesePartInfo(COneWord *pOneWord,
                                      LPSTR pszChinesePartInfo)
// Encodes all Chinese translations of a word, with their related data.
// Each translation is encoded with EncodeSingleChinesePart, one directly
// after the other. Returns the total number of bytes written, which is 0
// when the word has no translation.
{
    int nWritten = 0;
    for ( ChinesePart *pPart = pOneWord->m_pFirstChinese;
          pPart != NULL;
          pPart = pPart->m_pNextPart ) {
        nWritten += EncodeSingleChinesePart(pPart,
                                            pszChinesePartInfo+nWritten);
    }
    return nWritten;
}
int CDictIndex::CalcuChinPartNum(ChinesePart *pFirstChinese)
// Counts the Chinese translations in a linked list and returns the count.
// An empty list (pFirstChinese == NULL) yields 0.
{
    int nTotalNum = 0;
    for ( ChinesePart *pCurrChinese = pFirstChinese;
          pCurrChinese != NULL;
          pCurrChinese = pCurrChinese->m_pNextPart ) {
        nTotalNum ++;
    }
    return nTotalNum;
}
int CDictIndex::EncodeDictRecord(COneWord *pOneWord,LPSTR pszWordInfo)
// Encodes a complete dictionary record for one word into pszWordInfo.
// Layout written:
//   "has original form" flag (1 byte), "has ambiguity" flag (1 byte),
//   then a sequence of sections, each introduced by a flag byte:
//     DICT_FLAG_ORIGIN  (only when an original form exists):
//         word style (1 byte), original length (1 byte), original form
//     DICT_FLAG_AMBIG   (only when an ambiguity exists):
//         ambiguity length (1 byte), ambiguity data
//     DICT_FLAG_CHINESE (always):
//         number of translations (1 byte), encoded translations
// Returns the number of bytes written.
{
    LPSTR pszOut = pszWordInfo;   // next byte to write

    // Two header flags.
    *pszOut++ = (UCHAR)pOneWord->m_bIsExistOrig;
    *pszOut++ = (UCHAR)pOneWord->m_bIsExistAmbig;

    // Original-form section.
    if ( pOneWord->m_bIsExistOrig ) {
        *pszOut++ = (UCHAR)DICT_FLAG_ORIGIN;
        *pszOut++ = (UCHAR)pOneWord->m_nWordStyle;  // kind of word form (past tense, progressive, ...)
        *pszOut++ = (UCHAR)pOneWord->m_nOrigLen;
        memcpy(pszOut,pOneWord->m_pszOrig,pOneWord->m_nOrigLen);
        pszOut += pOneWord->m_nOrigLen;
    }

    // Ambiguity section.
    if ( pOneWord->m_bIsExistAmbig ) {
        *pszOut++ = (UCHAR)DICT_FLAG_AMBIG;
        *pszOut++ = (UCHAR)pOneWord->m_nAmbigLen;
        memcpy(pszOut,pOneWord->m_pszAmbig,pOneWord->m_nAmbigLen);
        pszOut += pOneWord->m_nAmbigLen;
    }

    // Chinese section: the flag is always present, the count may be 0.
    *pszOut++ = (UCHAR)DICT_FLAG_CHINESE;
    if ( pOneWord->m_pFirstChinese == NULL )
        *pszOut++ = 0;
    else
        *pszOut++ = (UCHAR)CalcuChinPartNum(pOneWord->m_pFirstChinese);

    pszOut += EncodeChinesePartInfo(pOneWord,pszOut);
    return (int)(pszOut - pszWordInfo);
}
BOOL CDictIndex::InsertOneWordToIndex(Dictionary *pobDiction,
                                      FILE *fpIndexDat,
                                      CHuffman *pHuffman,
                                      COneWord *pOneWord)
// Old function, no longer used.
// Compresses the English headword with pHuffman, encodes the word's record,
// writes the record to fpIndexDat and inserts the compressed headword plus
// the record's offset information into pobDiction.
// Words without an English headword, and words whose compressed headword is
// longer than DIC_WORD_LEN, are skipped. Always returns TRUE.
{
    // Skip empty entries.
    if ( pOneWord->m_pszEnglish == NULL )
        return TRUE;

    int nTarBuffSize = DIC_WORD_LEN*10;
    LPSTR pszTarBuff = new char[nTarBuffSize];
    // memset takes (destination, fill value, byte count); clear the whole buffer.
    memset(pszTarBuff,0,nTarBuffSize);

    int nTarLen;
    pHuffman->CompressString(pOneWord->m_pszEnglish,
                             pOneWord->m_nEnglishLen,
                             pszTarBuff,nTarLen);

    // A compressed headword longer than DIC_WORD_LEN is too long for the
    // dictionary; such a word is silently skipped.
    if ( nTarLen <= DIC_WORD_LEN ) {
        m_nTotalWordsNum ++;
        int nWordInfoLen = EncodeDictRecord(pOneWord,m_pszWordInfoBuff);
        long lDataSite = WriteIndexData(fpIndexDat,m_pszWordInfoBuff,
                                        nWordInfoLen);
        CompressIndexOffsetInfo(lDataSite,nWordInfoLen,
                                pobDiction->m_pszOffset);
        memset(pobDiction->m_pszWord,0x0,DIC_WORD_LEN);
        memcpy(pobDiction->m_pszWord,pszTarBuff,nTarLen);
        pobDiction->insert();
    }

    // Allocated with new[], so it must be released with delete[].
    delete [] pszTarBuff;
    return TRUE;
}
int EncodeChinFormatCheck(LPSTR pszWordInfo)
// Walks over one encoded Chinese translation starting at pszWordInfo and
// returns its length in bytes. No content is validated and no buffer bound
// is checked; only the length fields are followed.
// Expected layout:
//   translation length (1 byte), translation text, category (1 byte),
//   head (1 byte), slot count (1 byte), slots
// Each slot starts with a 2-byte code (low byte first). ADDQUALFR and
// RULE_CODE slots are followed by a length byte and that many data bytes;
// any other slot consists of the code only.
{
    int nChineseLen = (UCHAR)pszWordInfo[0];
    // Skip the length byte, the text, the category byte and the head byte.
    int nPos = 1 + nChineseLen + 2;

    int nSlotNum = (UCHAR)pszWordInfo[nPos];
    nPos ++;

    for ( int nLeft = nSlotNum; nLeft > 0; nLeft-- ) {
        WORD wSlotCode = MAKEWORD(pszWordInfo[nPos],pszWordInfo[nPos+1]);
        nPos += 2;
        if ( wSlotCode == ADDQUALFR || wSlotCode == RULE_CODE ) {
            // Quantifier or rule: a length byte followed by the payload.
            int nPayloadLen = (BYTE)pszWordInfo[nPos];
            nPos += 1 + nPayloadLen;
        }
        // Ordinary slot: nothing follows the code.
    }
    return nPos;
}
BOOL EncodeResultFormatCheck(LPSTR pszWordInfo,int nWordInfoLen)
// Checks the format of an encoded dictionary record.
//   pszWordInfo   the encoded record
//   nWordInfoLen  length of the record in bytes
// The record is expected to start with two header bytes followed by
// sections introduced by DICT_FLAG_ORIGIN, DICT_FLAG_AMBIG or
// DICT_FLAG_CHINESE. Returns TRUE only when every section starts with a
// known flag and the sections end exactly at nWordInfoLen; otherwise FALSE.
// Length fields inside a section are followed without bounds checks, so an
// overrun is detected only after the section has been walked.
{
    int nNowSite = 2;   // skip the two header flag bytes

    // A record with no room for a section flag cannot be valid.
    if ( nNowSite >= nWordInfoLen )
        return FALSE;

    do {
        switch ( (UCHAR)pszWordInfo[nNowSite] ) {
        case DICT_FLAG_ORIGIN: {
            nNowSite ++;    // section flag
            nNowSite ++;    // word style
            int nOrigLen = (UCHAR)pszWordInfo[nNowSite];
            nNowSite ++;
            nNowSite += nOrigLen;   // original form
            break;
        }
        case DICT_FLAG_AMBIG: {
            nNowSite ++;    // section flag
            int nAmbigLen = (UCHAR)pszWordInfo[nNowSite];
            nNowSite ++;
            nNowSite += nAmbigLen;  // ambiguity data
            break;
        }
        case DICT_FLAG_CHINESE: {
            nNowSite ++;    // section flag
            int nChinNum = (UCHAR)pszWordInfo[nNowSite];
            nNowSite ++;
            for ( int Loop=0;Loop<nChinNum;Loop++ )
                nNowSite += EncodeChinFormatCheck(pszWordInfo+nNowSite);
            break;
        }
        default:
            return FALSE;   // unknown section flag: bad format
        }
    } while ( nNowSite < nWordInfoLen );

    // Running past the end means a length field was inconsistent.
    return ( nNowSite == nWordInfoLen ) ? TRUE : FALSE;
}
BOOL CDictIndex::OutputOneWordToTempDatFile(FILE *fpEnglishInfo,
FILE *fpTranDat,
COneWord *pOneWord)
// 將單詞的詞條內容輸出到中間文件fpTranDat中,
// 同時將對應的單詞的英文和詞條內容在fpTranDat中的位置輸出到fpEnglishInfo中
{
// 如果該詞條為空,則跳過該詞條
if ( pOneWord->m_pszEnglish == NULL )
return TRUE;
m_nTotalWordsNum ++;
int nWordInfoLen;
nWordInfoLen = EncodeDictRecord(pOneWord,m_pszWordInfoBuff);
// 編碼格式檢查
if ( EncodeResultFormatCheck(m_pszWordInfoBuff,nWordInfoLen) == FALSE )
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -