// ftwbrk.cpp
#include "stdafx.h"
#include <stdio.h>
#include <string.h>
#include "ftwbrk.h"
// Character-type table, 96 entries of TCT_* values (declared in ftwbrk.h),
// laid out four entries per source line (indices 0x00..0x5F).
// It has the same contents as TBL_CHTYPE except at:
//   0x33        TCT_THAIVOWEL here, TCT_THAI in TBL_CHTYPE
//   0x40..0x44  additionally ORed with TCT_LEADINGVOWEL here
//   0x4D        TCT_THAI here, TCT_THAIDIAC in TBL_CHTYPE
// NOTE(review): presumably indexed the same way as TBL_CHTYPE (low byte of a
// 0x0Exx character code); no use of this table is visible in this file - confirm.
// NOTE(review): the 0x4D difference is not obviously related to leading
// vowels - confirm it is intentional.
unsigned int TBL_CHTYPE_WITHLEADING[96] =
{
TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, // First element is bogus
TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, TCT_THAICONS,
TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, TCT_THAICONS,
TCT_THAICONS, TCT_THAICONS, TCT_THAICONSLOWER, TCT_THAICONSLOWER,
TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, TCT_THAICONS,
TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, TCT_THAICONS,
TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, TCT_THAICONSUPPER,
TCT_THAICONS, TCT_THAICONSUPPER, TCT_THAICONS, TCT_THAICONSUPPER,
TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, TCT_THAICONS,
TCT_THAICONSLOWER, TCT_THAICONS, TCT_THAICONSLOWER, TCT_THAICONS,
TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, TCT_THAICONS,
TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, TCT_THAIBREAK,
TCT_THAI, TCT_THAIUPPERVOWEL, TCT_THAI, TCT_THAIVOWEL,
TCT_THAIUPPERVOWEL, TCT_THAIUPPERVOWEL, TCT_THAIUPPERVOWEL, TCT_THAIUPPERVOWEL,
TCT_THAILOWERVOWEL, TCT_THAILOWERVOWEL, TCT_THAILOWERVOWEL, TCT_BREAK,
TCT_BREAK, TCT_CONTROL , TCT_CONTROL, TCT_THAI,
// 0x40..0x44: the entries carrying the extra TCT_LEADINGVOWEL flag
TCT_THAI|TCT_LEADINGVOWEL, TCT_THAI|TCT_LEADINGVOWEL, TCT_THAI|TCT_LEADINGVOWEL, TCT_THAI|TCT_LEADINGVOWEL,
TCT_THAI|TCT_LEADINGVOWEL, TCT_THAI, TCT_THAI, TCT_THAIDIAC,
TCT_THAITONE, TCT_THAITONE, TCT_THAITONE, TCT_THAITONE,
TCT_THAITONE, TCT_THAI, TCT_THAI, TCT_THAI,
TCT_THAIDIGIT, TCT_THAIDIGIT, TCT_THAIDIGIT, TCT_THAIDIGIT,
TCT_THAIDIGIT, TCT_THAIDIGIT, TCT_THAIDIGIT, TCT_THAIDIGIT,
TCT_THAIDIGIT, TCT_THAIDIGIT, TCT_THAIBREAK, TCT_THAIBREAK,
TCT_THAILOWERVOWEL, TCT_THAILOWERVOWEL ,TCT_THAILOWERVOWEL, TCT_ENGLISH,
};
// Character-type table, 96 entries of TCT_* values (declared in ftwbrk.h),
// laid out four entries per source line (indices 0x00..0x5F).
// ThaiFunc::ThaiCharType() indexes it with the low byte of the character code,
// and ThaiFunc::InitializeTable() copies all 96 entries into tblCharType.
// Unlike TBL_CHTYPE_WITHLEADING, no entry carries the TCT_LEADINGVOWEL flag.
unsigned int TBL_CHTYPE[96] =
{
TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, // first element is bogus
TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, TCT_THAICONS,
TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, TCT_THAICONS,
TCT_THAICONS, TCT_THAICONS, TCT_THAICONSLOWER, TCT_THAICONSLOWER,
TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, TCT_THAICONS,
TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, TCT_THAICONS,
TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, TCT_THAICONSUPPER,
TCT_THAICONS, TCT_THAICONSUPPER, TCT_THAICONS, TCT_THAICONSUPPER,
TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, TCT_THAICONS,
TCT_THAICONSLOWER, TCT_THAICONS, TCT_THAICONSLOWER, TCT_THAICONS,
TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, TCT_THAICONS,
TCT_THAICONS, TCT_THAICONS, TCT_THAICONS, TCT_THAIBREAK,
TCT_THAI, TCT_THAIUPPERVOWEL, TCT_THAI, TCT_THAI,
TCT_THAIUPPERVOWEL, TCT_THAIUPPERVOWEL, TCT_THAIUPPERVOWEL, TCT_THAIUPPERVOWEL,
TCT_THAILOWERVOWEL, TCT_THAILOWERVOWEL, TCT_THAILOWERVOWEL, TCT_BREAK,
TCT_BREAK, TCT_CONTROL, TCT_CONTROL, TCT_THAI,
// 0x40..0x46: plain TCT_THAI here (no leading-vowel flag)
TCT_THAI, TCT_THAI, TCT_THAI, TCT_THAI,
TCT_THAI, TCT_THAI, TCT_THAI, TCT_THAIDIAC,
TCT_THAITONE, TCT_THAITONE, TCT_THAITONE, TCT_THAITONE,
TCT_THAITONE, TCT_THAIDIAC, TCT_THAI, TCT_THAI,
TCT_THAIDIGIT, TCT_THAIDIGIT, TCT_THAIDIGIT, TCT_THAIDIGIT,
TCT_THAIDIGIT, TCT_THAIDIGIT, TCT_THAIDIGIT, TCT_THAIDIGIT,
TCT_THAIDIGIT, TCT_THAIDIGIT, TCT_THAIBREAK, TCT_THAIBREAK,
TCT_THAILOWERVOWEL, TCT_THAILOWERVOWEL, TCT_THAILOWERVOWEL, TCT_ENGLISH,
};
// Per-byte character classification table: 256 entries of XT_* flag
// combinations (declared in ftwbrk.h), one entry per single-byte character
// code 0x00..0xFF. The /*NN*/ markers give the hexadecimal index of the first
// entry of each group of eight.
//
// Codes 0x00..0x1F are stored as 0 (the XT_CONTROL initialisers are kept
// below only as a comment).
//
// NOTE(review): the 0xA1..0xFB layout (consonants, vowels, tone marks, ten
// digits at 0xF0..0xF9) appears to follow the TIS-620 / Windows code page 874
// arrangement - confirm against the code that indexes this table.
//
// FIX: the 0xF0 group previously contained only seven initialisers, so the
// table had 255 explicit entries; every entry labelled 0xF8 and later sat one
// index too low (0xF9 was classified as XT_PUNCT instead of a digit) and 0xFF
// was implicitly zero-filled. The missing digit entry for 0xF7 is restored so
// that each label matches its real index.
unsigned long THAICHARTYPETABLE[256] =
{
/*00 XT_CONTROL, XT_CONTROL, XT_CONTROL, XT_CONTROL,
XT_CONTROL, XT_CONTROL, XT_CONTROL, XT_CONTROL,
XT_CONTROL, XT_CONTROL, XT_CONTROL, XT_CONTROL,
XT_CONTROL, XT_CONTROL, XT_CONTROL, XT_CONTROL,
10 XT_CONTROL, XT_CONTROL, XT_CONTROL, XT_CONTROL,
XT_CONTROL, XT_CONTROL, XT_CONTROL, XT_CONTROL,
XT_CONTROL, XT_CONTROL, XT_CONTROL, XT_CONTROL,
XT_CONTROL, XT_CONTROL, XT_CONTROL, XT_CONTROL,*/
/*00*/ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
/*10*/ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
/*20*/ XT_WHITESPACE,
       XT_PUNCT,
       XT_PUNCT,
       XT_PUNCT,
       XT_PUNCT,
       XT_PUNCT,
       XT_PUNCT,
       XT_PUNCT,
/*28*/ XT_WRDBEG,
       XT_WRDEND,
       XT_PUNCT,
       XT_PUNCT,
       XT_PUNCT,
       XT_PUNCT,
       XT_PUNCT | XT_BOTH,
       XT_PUNCT,
/*30*/ XT_DIGIT | XT_ENG,
       XT_DIGIT | XT_ENG,
       XT_DIGIT | XT_ENG,
       XT_DIGIT | XT_ENG,
       XT_DIGIT | XT_ENG,
       XT_DIGIT | XT_ENG,
       XT_DIGIT | XT_ENG,
       XT_DIGIT | XT_ENG,
/*38*/ XT_DIGIT | XT_ENG,
       XT_DIGIT | XT_ENG,
       XT_PUNCT,
       XT_PUNCT,
       XT_WRDBEG,
       XT_PUNCT,
       XT_WRDEND,
       XT_SNTEND,
/*40*/ XT_PUNCT,
       XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
/*48*/ XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
/*50*/ XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
/*58*/ XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
       XT_ENG | XT_UCASE,
       XT_WRDBEG,
       XT_PUNCT,
       XT_WRDEND,
       XT_PUNCT,
       XT_PUNCT,
/*60*/ XT_PUNCT,
       XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
/*68*/ XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
/*70*/ XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
/*78*/ XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
       XT_ENG | XT_LCASE,
       XT_WRDBEG,
       XT_PUNCT,
       XT_WRDEND,
       XT_PUNCT,
       0,
/*80*/ XT_THA | XT_CONS,
       XT_THA | XT_ZWIDTH,
       XT_THA | XT_ZWIDTH,
       XT_THA | XT_ZWIDTH,
       XT_THA | XT_ZWIDTH,
       XT_PUNCT,
       XT_THA | XT_ZWIDTH | XT_TONE,
       XT_THA | XT_ZWIDTH | XT_TONE,
/*88*/ XT_THA | XT_ZWIDTH | XT_TONE,
       XT_THA | XT_ZWIDTH | XT_TONE,
       XT_THA | XT_ZWIDTH | XT_DIAC,
       XT_THA | XT_ZWIDTH | XT_TONE,
       XT_THA | XT_ZWIDTH | XT_TONE,
       XT_THA | XT_ZWIDTH | XT_TONE,
       XT_THA | XT_ZWIDTH | XT_TONE,
       XT_THA | XT_ZWIDTH | XT_DIAC,
/*90*/ XT_THA | XT_CONS,
       XT_PUNCT | XT_BOTH | XT_WRDBEG,
       XT_PUNCT | XT_BOTH | XT_WRDEND,
       XT_PUNCT | XT_BOTH | XT_WRDBEG,
       XT_PUNCT | XT_BOTH | XT_WRDEND,
       XT_PUNCT,
       XT_PUNCT,
       XT_PUNCT,
/*98*/ XT_THA | XT_ZWIDTH | XT_DIAC,
       XT_THA | XT_ZWIDTH | XT_DIAC,
       XT_THA | XT_ZWIDTH | XT_DIAC,
       XT_THA | XT_ZWIDTH | XT_TONE,
       XT_THA | XT_ZWIDTH | XT_TONE,
       XT_THA | XT_ZWIDTH | XT_TONE,
       XT_THA | XT_ZWIDTH | XT_TONE,
       XT_THA | XT_ZWIDTH | XT_DIAC,
/*A0*/ XT_WHITESPACE,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
/*A8*/ XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
/*B0*/ XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
/*B8*/ XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
/*C0*/ XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
/*C8*/ XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_CONS,
       XT_THA | XT_SYMBOL,
/*D0*/ XT_THA | XT_FVOW,
       XT_THA | XT_ZWIDTH | XT_DIAC,
       XT_THA | XT_FVOW,
       XT_THA | XT_FVOW,
       XT_THA | XT_ZWIDTH,
       XT_THA | XT_ZWIDTH,
       XT_THA | XT_ZWIDTH,
       XT_THA | XT_ZWIDTH,
/*D8*/ XT_THA | XT_ZWIDTH,
       XT_THA | XT_ZWIDTH,
       XT_THA | XT_ZWIDTH | XT_DIAC,
       XT_HARDSPACE,
       XT_PUNCT,
       0,
       0,
       XT_PUNCT,
/*E0*/ XT_THA | XT_LVOW,
       XT_THA | XT_LVOW,
       XT_THA | XT_LVOW,
       XT_THA | XT_LVOW,
       XT_THA | XT_LVOW,
       XT_THA | XT_FVOW,
       XT_THA,
       XT_THA | XT_ZWIDTH | XT_DIAC,
/*E8*/ XT_THA | XT_ZWIDTH | XT_TONE,
       XT_THA | XT_ZWIDTH | XT_TONE,
       XT_THA | XT_ZWIDTH | XT_TONE,
       XT_THA | XT_ZWIDTH | XT_TONE,
       XT_THA | XT_ZWIDTH | XT_DIAC,
       XT_THA | XT_ZWIDTH | XT_DIAC,
       XT_THA | XT_ZWIDTH | XT_DIAC,
       XT_PUNCT,
/*F0*/ XT_DIGIT | XT_THA,
       XT_DIGIT | XT_THA,
       XT_DIGIT | XT_THA,
       XT_DIGIT | XT_THA,
       XT_DIGIT | XT_THA,
       XT_DIGIT | XT_THA,
       XT_DIGIT | XT_THA,
       XT_DIGIT | XT_THA,   // 0xF7: entry that was missing from the original table
/*F8*/ XT_DIGIT | XT_THA,
       XT_DIGIT | XT_THA,
       XT_PUNCT,
       XT_PUNCT,
       XT_THA | XT_ZWIDTH,
       XT_THA | XT_ZWIDTH,
       XT_THA | XT_ZWIDTH,
       0,
};
//-------------------------------------------------------------------------------
// Implementation of ThaiFunc class
//-------------------------------------------------------------------------------
// Default constructor. Performs no work; in particular it does not fill
// tblCharType - InitializeTable() does that.
ThaiFunc::ThaiFunc()
{
}
// Destructor. Empty: nothing is released here.
ThaiFunc::~ThaiFunc()
{
}
// Reports whether the high byte of the 16-bit code 'ch' is 0x0E,
// i.e. whether ch lies in the range 0x0E00..0x0EFF.
bool ThaiFunc::IsThaiChar(unsigned short ch)
{
    const unsigned int highByteMask = 0xFF00u;
    const unsigned int thaiPage     = 0x0E00u;
    return (ch & highByteMask) == thaiPage;
}
// Returns the TBL_CHTYPE classification for 'ch'.
//
// Only the low byte of ch is used as the table index. TBL_CHTYPE holds 96
// entries (indices 0x00..0x5F), while the low byte can be as large as 0xFF,
// so an unchecked lookup reads past the end of the table for low bytes
// 0x60..0xFF. Such codes now return 0 instead.
//
// NOTE(review): 0 is assumed to mean "no classification flags" - confirm
// against the TCT_* definitions in ftwbrk.h.
unsigned int ThaiFunc::ThaiCharType(unsigned short ch)
{
    const unsigned int index   = (unsigned char)ch;
    const unsigned int entries = sizeof(TBL_CHTYPE) / sizeof(TBL_CHTYPE[0]);

    if (index >= entries)
        return 0;   // outside the table: do not read out of bounds

    return TBL_CHTYPE[index];
}
// Copies the whole TBL_CHTYPE table (96 unsigned int entries) into the
// tblCharType member. No size check on tblCharType is made here.
void ThaiFunc::InitializeTable()
{
    // TBL_CHTYPE is defined in this file as unsigned int[96], so its size in
    // bytes is exactly 96 * sizeof(unsigned int).
    const size_t cbTable = sizeof(TBL_CHTYPE);
    memcpy(tblCharType, TBL_CHTYPE, cbTable);
}
//-------------------------------------------------------------------------------
// Implementation of ThaiLexicon class
//-------------------------------------------------------------------------------
// Constructs an empty lexicon: no trie, no serialized image, all counters
// zero, and a zero-filled dictionary header.
ThaiLexicon::ThaiLexicon()
{
    // Nothing allocated yet.
    m_trie    = NULL;
    m_MemDict = NULL;

    // Word count, serialized size and node count all start at zero.
    m_cWord = 0;
    m_nSize = 0;
    m_cNode = 0;

    memset(static_cast<void*>(&m_DictHeader), 0, sizeof(DICTHEADER));
}
// Releases the serialized dictionary image and every node of the trie.
ThaiLexicon::~ThaiLexicon()
{
    // m_MemDict is allocated in RefreshTrie() with new unsigned char[...], so
    // it must be released with delete[]; scalar delete on an array allocation
    // is undefined behaviour. delete[] on a null pointer is a no-op, so no
    // explicit null check is needed.
    delete[] m_MemDict;

    DestroyTrie(m_trie);
}
// Recursively frees the node pTrie together with everything reachable through
// its child and next links. A null pTrie is accepted and ignored.
void ThaiLexicon::DestroyTrie(LPTRIE pTrie)
{
    if (!pTrie)
        return;

    // Free the deeper level first, then the remaining siblings, and only then
    // the node itself (its links are still needed until this point).
    DestroyTrie(_lptrie(pTrie->child));
    DestroyTrie(_lptrie(pTrie->next));
    delete pTrie;
}
bool ThaiLexicon::AddWord(unsigned char *szNew,unsigned int cb)
{
unsigned int iNext = 0;
LPTRIE pTrie;
if (cb <= 0) return false;
if (m_cWord == 0) // insert the first word?
{
m_trie = new TRIE; // allocate root note;
m_trie->next = m_trie->child = NULL;
m_trie->ch = szNew[iNext];
m_trie->wrap = 0;
m_cNode++;
}
pTrie = m_trie; // init Trie walker
while(iNext < cb)
{
LNewLoop:
if (pTrie->ch == szNew[iNext]) // matched character
{
LCharMatched:
if (++iNext == cb) // last character?
break;
if ((!pTrie->child) || (_lptrie(pTrie->child)->ch > szNew[iNext]))
{
LPTRIE pTmp = new TRIE; // create new node and insert in next level
m_cNode++;
pTmp->ch = szNew[iNext];
pTmp->next = pTrie->child;
pTmp->child = NULL;
pTmp->wrap = false;
pTrie->child = pTmp;
pTrie = _lptrie(pTrie->child);
goto LCharMatched;
}
pTrie = _lptrie(pTrie->child);
goto LNewLoop;
}
else
if ((!pTrie->next) || (_lptrie(pTrie->next)->ch > szNew[iNext]))
{
LPTRIE pTmp = new TRIE; // create new node and insert in this level
m_cNode++;
pTmp->ch = szNew[iNext];
pTmp->next = pTrie->next;
pTmp->child = NULL;
pTmp->wrap = false;
pTrie->next = pTmp;
pTrie = pTmp;
goto LCharMatched; // we need speed more than spaces
}
pTrie = _lptrie(pTrie->next);
goto LNewLoop;
}
if (pTrie->wrap)
return false; // this word already exist
else
{
pTrie->wrap = true;
m_cWord++; // advanced word count
}
return true;
}
// Rebuilds the serialized dictionary image (m_MemDict / m_nSize) from the
// current trie.
//
// Fixes compared with the previous version:
//  * Once m_MemDict existed, the function did nothing (the resize branch was
//    compiled out), so words added after the first refresh never reached the
//    image. The image is now rebuilt whenever there is a trie to build from.
//  * With an empty trie the buffer was allocated with size 0, yet Trie2Mem
//    still writes one terminator byte at offset 0. One extra byte is now
//    allocated so that write stays inside the buffer.
//
// The capacity estimate of 10 bytes per trie node is kept as it was.
// NOTE(review): any pointer into the previous m_MemDict buffer becomes
// invalid after a rebuild - confirm that no caller caches one.
void ThaiLexicon::RefreshTrie()
{
    // An image exists but there is no trie to rebuild it from: keep the
    // existing image untouched (same as the previous behaviour).
    if (m_MemDict && !m_trie)
        return;

    // Allocate the replacement first so the old image survives if the
    // allocation fails.
    unsigned char *pNewImage = new unsigned char[m_cNode*10 + 1];
    if (!pNewImage) // only reachable on toolchains where new returns NULL
        return;

    delete[] m_MemDict; // no-op when no image existed yet
    m_MemDict = pNewImage;
    m_nSize = Trie2Mem(m_trie, 0, false);
}
// Serializes one trie level - the sibling chain starting at pTrie - and,
// recursively, everything below it into m_MemDict.
//
//   pTrie   first node of the sibling chain to write (may be NULL: then only
//           the terminator byte is written)
//   iStart  offset in m_MemDict where this level's index table begins
//   fWrap   when true, ENDWORD is ORed into the first byte of every index
//           entry and into the terminator byte of this level
//
// Layout written at iStart: one 3-byte entry per sibling, followed by a single
// terminator byte (ENDCHILD). Each 3-byte entry holds the offset at which that
// sibling's own data starts. The sibling data is written after the table,
// beginning at iStart + 1 + 3 * cNode.
//
// Returns the offset just past the last byte written for this level and its
// descendants. No bounds check against the size of m_MemDict is made here.
int ThaiLexicon::Trie2Mem(LPTRIE pTrie, int iStart, bool fWrap)
{
int iNext,iCur;
DIFFINDEX cbDiff;
int iNode,cNode = EnumNode(pTrie); // number of nodes in this sibling chain
LPTRIE pTrieWalk = pTrie;
iCur = iStart;
iNext = iStart + 1 + 3 * cNode; // skip the index table: 3 bytes per sibling plus 1 terminator byte
for (iNode = 0; iNode < cNode; iNode++)
{
// Store the offset of this sibling's data as three bytes (b2, b1, b0).
// NOTE(review): presumably b2 is the most significant of the three bytes of
// cbDiff.l - confirm against the DIFFINDEX declaration in ftwbrk.h.
cbDiff.l = iNext;
m_MemDict[iCur++] = (cbDiff.b.b2 | (fWrap?ENDWORD:0)); // store address of node 'iNode'
m_MemDict[iCur++] = cbDiff.b.b1;
m_MemDict[iCur++] = cbDiff.b.b0;
// A run of more than one node without siblings is written by InsertSubWord
// (not visible in this part of the file).
unsigned char nSubWordLen = CountUniqueStr(_lptrie(pTrieWalk));
if (nSubWordLen > 1)
iNext = InsertSubWord(_lptrie(pTrieWalk),nSubWordLen,iNext);
else
{
// Otherwise write the node's character, then either its child level or,
// for a node without children, a combined end marker.
m_MemDict[iNext++] = pTrieWalk->ch;
if (pTrieWalk->child)
iNext = Trie2Mem(_lptrie(pTrieWalk->child),iNext,pTrieWalk->wrap);
else
{
// Written regardless of pTrieWalk->wrap.
m_MemDict[iNext++] = (ENDCHILD | ENDWORD);
}
}
pTrieWalk = _lptrie(pTrieWalk->next);
}
m_MemDict[iCur++] = (ENDCHILD | (fWrap?ENDWORD:0)); // terminator byte of this level's index table
return iNext;
}
// Counts the nodes of the sibling chain that starts at pTrie by following the
// 'next' links. Returns 0 for a null pTrie.
int ThaiLexicon::EnumNode(LPTRIE pTrie)
{
    int nSiblings = 0;
    for (LPTRIE pWalk = pTrie; pWalk; pWalk = _lptrie(pWalk->next))
        ++nSiblings;
    return nSiblings;
}
// Starting at pTrie and descending through 'child' links, counts how many
// consecutive nodes have no 'next' sibling. Counting stops at the first node
// that has a sibling, or when the child chain ends. Returns 0 when pTrie is
// null or has a sibling.
// The counter is an unsigned char, so a run longer than 255 nodes would wrap.
unsigned char ThaiLexicon::CountUniqueStr(LPTRIE pTrie)
{
LPTRIE pTrieWalk = pTrie;
unsigned char nLevel = 0;
while (pTrieWalk && !(pTrieWalk->next))
{
pTrieWalk = _lptrie(pTrieWalk->child); // go one level deeper
nLevel++;
}
return nLevel;
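// NOTE: this copy of the source is cut off here by residue from a web code
// viewer (its keyboard-shortcut help: copy code Ctrl+C, search code Ctrl+F,
// full screen F11, switch theme Ctrl+Shift+D, show shortcuts ?, increase font
// size Ctrl+=, decrease font size Ctrl+-). The closing brace of
// ThaiLexicon::CountUniqueStr and any definitions that followed it are
// missing from this copy.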