?? freqcounter.cpp
字號:
// FreqCounter.cpp: implementation of the CFreqCounter class.
//
//////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "CPT.h"
#include "FreqCounter.h"
#include <math.h>
#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
ChChar CFreqCounter::chars[MAXNUMCHINESECHAR];
UINT CFreqCounter::retvalue=CFreqCounter::InitChineseChars();
CFreqCounter::CFreqCounter(UINT nDepth)
{
m_nSizeArray = 0;
m_aucArray = NULL;
m_tucRoot=NULL;
InitCounterTree(&m_tucRoot, nDepth, 0, MAXNUMCHINESECHAR-1);
}
CFreqCounter::~CFreqCounter()
{
if (m_tucRoot)
ReleaseCounterTree(m_tucRoot);
for (UINT i=0; i<m_nSizeArray; i++)
if (m_aucArray[i].next)
delete m_aucArray[i].next;
if (m_nSizeArray)
delete m_aucArray;
}
// Construct the lookup table for Chinese Chars
UINT CFreqCounter::InitChineseChars()
{
unsigned char i, j;
int k=0;
for (i=0xb0; i<=0xf7; i++)
for (j=0xa1; j<=0xfe; j++)
{
ASSERT(k<MAXNUMCHINESECHAR);
chars[k][0]=i;
chars[k++][1]=j;
}
return 0;
}
void CFreqCounter::InitCounterTree(TUCounter ** tucTree, UINT nDepth, int nLeft, int nRight)
{
TUCounter *p;
int i=(nLeft+nRight)/2;
if (nLeft>nRight || nDepth==0)
return;
p=new TUCounter;
*tucTree = p;
(p->ch)[0]=chars[i][0];
(p->ch)[1]=chars[i][1];
p->counter=0;
p->left=p->right=NULL;
InitCounterTree(&(p->left), nDepth-1, nLeft, i-1);
InitCounterTree(&(p->right), nDepth-1, i+1, nRight);
}
void CFreqCounter::ReleaseCounterTree(TUCounter *root)
{
if (root==NULL)
return;
ReleaseCounterTree(root->left);
ReleaseCounterTree(root->right);
delete root;
}
void CFreqCounter::AddGram(ChChar first)
{
TUCounter * p=m_tucRoot;
TUCounter **q=&m_tucRoot;
while (p)
{
register int i = CompareChineseChar(first, p->ch);
if (i==0)
{
(p->counter)++;
return;
}
else if (i<0)
{
q=&(p->left);
p=p->left;
}
else
{
q=&(p->right);
p=p->right;
}
}
p=new TUCounter;
p->ch[0]=first[0];
p->ch[1]=first[1];
p->counter=1;
p->left=p->right=NULL;
*q=p;
}
void CFreqCounter::AddGram(ChChar first, ChChar second)
{
int l=0, r=m_nSizeArray-1, j, k;
do
{
k=(l+r)/2;
j=CompareChineseChar(first, m_aucArray[k].ch);
if (j<0)
r=k-1;
else if (j>0)
l=k+1;
else
break;
}
while (1);
// must found
if (CFreqCounter * p=m_aucArray[k].next)
p->AddGram(second);
}
void CFreqCounter::AddGram(ChChar first, ChChar second, ChChar third)
{
int l=0, r=m_nSizeArray-1, j, k;
do
{
k=(l+r)/2;
j=CompareChineseChar(first, m_aucArray[k].ch);
if (j<0)
r=k-1;
else if (j>0)
l=k+1;
else
break;
}
while (1);
// must found
if (CFreqCounter * p=m_aucArray[k].next)
p->AddGram(second, third);
}
void CFreqCounter::PostFirstScan()
{
m_nSizeArray=CountUniqueGrams(m_tucRoot);
if (m_nSizeArray)
{
m_aucArray = new AUCounter[m_nSizeArray];
UINT i=OutputTreeToArray(0, m_tucRoot);
}
ReleaseCounterTree(m_tucRoot);
m_tucRoot=NULL;
}
void CFreqCounter::InitSecondScan()
{
for (UINT i=0; i<m_nSizeArray; i++)
m_aucArray[i].next=new CFreqCounter((UINT)(log((double)m_aucArray[i].counter)/2));
}
void CFreqCounter::PostSecondScan()
{
for (UINT i=0; i<m_nSizeArray; i++)
m_aucArray[i].next->PostFirstScan();
}
void CFreqCounter::InitThirdScan()
{
for (UINT i=0; i<m_nSizeArray; i++)
m_aucArray[i].next->InitSecondScan();
}
void CFreqCounter::PostThirdScan()
{
for (UINT i=0; i<m_nSizeArray; i++)
m_aucArray[i].next->PostSecondScan();
}
UINT CFreqCounter::CountUniqueGrams(TUCounter * tucTree)
{
if (tucTree == NULL)
return 0;
UINT l=CountUniqueGrams(tucTree->left), \
r=CountUniqueGrams(tucTree->right);
if (tucTree->counter)
return l+r+1;
else return l+r;
}
UINT CFreqCounter::OutputTreeToArray(UINT left, TUCounter * tuc)
{
if (tuc == NULL)
return 0;
UINT i=OutputTreeToArray(left, tuc->left);
if (tuc->counter)
{
m_aucArray[left+i].ch[0]=tuc->ch[0];
m_aucArray[left+i].ch[1]=tuc->ch[1];
m_aucArray[left+i].counter=tuc->counter;
m_aucArray[left+i].next= NULL;
i++;
}
i += OutputTreeToArray(left+i, tuc->right);
return i;
}
void CFreqCounter::OutputUnigramFrequency(CFile &fout, const void * buf, size_t len)
{
AUCounter *p=new AUCounter[m_nSizeArray];
memmove(p, m_aucArray, m_nSizeArray*sizeof(AUCounter));
qsort(p, m_nSizeArray, sizeof(AUCounter), AUCounter::CompareCounterUnit);
for (UINT i=0; i<m_nSizeArray; i++)
{
char buf1[50];
if (buf)
fout.Write(buf, len);
fout.Write(&(p[i].ch), sizeof(ChChar));
sprintf(buf1, " %u\n", p[i].counter);
fout.Write(buf1, strlen(buf1));
}
delete p;
}
void CFreqCounter::OutputBigramFrequency(CFile &fout, const void * buf, size_t len)
{
char* buf1=new char[len+sizeof(ChChar)];
if (buf)
memmove(buf1, buf, len);
for (UINT i=0; i<m_nSizeArray; i++)
{
memmove(buf1+len, &(m_aucArray[i].ch), sizeof(ChChar));
m_aucArray[i].next->OutputUnigramFrequency(fout, buf1, len+2);
}
delete buf1;
}
void CFreqCounter::OutputTrigramFrequency(CFile &fout, const void * buf, size_t len)
{
char* buf1=new char[len+sizeof(ChChar)];
if (buf)
memmove(buf1, buf, len);
for (UINT i=0; i<m_nSizeArray; i++)
{
memmove(buf1+len, &(m_aucArray[i].ch), sizeof(ChChar));
m_aucArray[i].next->OutputBigramFrequency(fout, buf1, len+2);
}
delete buf1;
}
double AUMutual::moffset=0;
void CFreqCounter::OutputAllMutual(CFile &fout)
{
UINT n=0, i, j, k;
int l, r, m, ret;
ChChar cc;
for (i=0; i<m_nSizeArray; i++)
n+=m_aucArray[i].next->m_nSizeArray;
AUMutual *p=new AUMutual[n];
for (i=0, k=0; i<m_nSizeArray; i++)
for (j=0; j<m_aucArray[i].next->m_nSizeArray; j++, k++)
{
p[k].ch[0][0]=m_aucArray[i].ch[0];
p[k].ch[0][1]=m_aucArray[i].ch[1];
p[k].ch[1][0]=m_aucArray[i].next->m_aucArray[j].ch[0];
p[k].ch[1][1]=m_aucArray[i].next->m_aucArray[j].ch[1];
cc[0]=p[k].ch[1][0];
cc[1]=p[k].ch[1][1];
p[k].c[0]=m_aucArray[i].counter;
p[k].counter=m_aucArray[i].next->m_aucArray[j].counter;
l=0; r=m_nSizeArray-1;
do
{
m=(l+r)/2;
ret = CompareChineseChar(cc, m_aucArray[m].ch);
if (ret < 0)
r = m-1;
else if (ret > 0)
l = m+1;
else
break;
}
while (1);
// must found
p[k].c[1]=m_aucArray[m].counter;
p[k].CalculateMutual();
}
qsort(p, n, sizeof(AUMutual), AUMutual::CompareMutual);
for (i=0; i<n; i++)
{
char buf[100];
fout.Write(p[i].ch, 2*sizeof(ChChar));
sprintf(buf, " %u %u %u %g\n", p[i].c[0], \
p[i].c[1], p[i].counter, p[i].mutual);
fout.Write(buf, strlen(buf));
}
delete p;
}
?? 快捷鍵說明
復(fù)制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -