?? dfa.cs
字號(hào):
?/***************************************************************************************
* KTDictSeg 簡(jiǎn)介: KTDictSeg 是由KaiToo搜索開(kāi)發(fā)的一款基于字典的簡(jiǎn)單中英文分詞算法
* 主要功能: 中英文分詞,未登錄詞識(shí)別,多元歧義自動(dòng)識(shí)別,全角字符識(shí)別能力
* 主要性能指標(biāo):
* 分詞準(zhǔn)確度:90%以上(有待專(zhuān)家的權(quán)威評(píng)測(cè))
* 處理速度: 600KBytes/s
*
* 版本: V1.0 Bata
* Copyright(c) 2007 http://www.kaitoo.com
* 作者:肖波
* 授權(quán): 開(kāi)源GPL
* 公司網(wǎng)站: http://www.kaitoo.com
* 個(gè)人博客: http://blog.csdn.net/eaglet; http://www.cnblogs.com/eaglet
* 聯(lián)系方式: blog.eaglet@gmail.com
* ***************************************************************************************/
using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.Diagnostics;
namespace FTAlgorithm
{
/// <summary>
/// 有窮自動(dòng)機(jī)
/// 單元結(jié)構(gòu)
/// </summary>
class T_DfaUnit
{
public T_DfaUnit Childs; //該節(jié)點(diǎn)的后趨節(jié)點(diǎn)
public T_DfaUnit NextFriend; //該節(jié)點(diǎn)的下一個(gè)伙伴節(jié)點(diǎn)
public String QuitWord; //結(jié)束時(shí)應(yīng)返回的字符串,如果為null,表示沒(méi)有結(jié)束
public object Tag; //對(duì)于技術(shù)字符串的標(biāo)簽
public Char Char; //當(dāng)前字符
public bool NeedTrans; //是否需要轉(zhuǎn)義
}
class T_InnerInfo
{
public int Rank;
public object Tag;
}
/// <summary>
/// 單詞有窮自動(dòng)機(jī)
/// </summary>
class CWordDfa
{
bool m_UseRank ;
Hashtable m_WordsTbl; //存儲(chǔ)需要提取的單詞的表
Hashtable m_FstCharTbl; //首字Hash表,作為有窮自動(dòng)機(jī)的入口
private T_DfaUnit AddChar(T_DfaUnit cur, Char c, String quitWord, bool needTrans, object tag)
{
T_DfaUnit unit = new T_DfaUnit();
unit.Char = c;
unit.NeedTrans = needTrans;
unit.Childs = null;
unit.QuitWord = quitWord;
if (quitWord != null)
{
unit.Tag = tag;
}
unit.NextFriend = null;
if (cur == null)
{
Debug.Assert(m_FstCharTbl[c] == null);
m_FstCharTbl[c] = unit;
}
else
{
if (cur.Childs == null)
{
cur.Childs = unit;
}
else
{
T_DfaUnit friend = cur.Childs;
T_DfaUnit oldFriend = friend;
while (friend != null)
{
oldFriend = friend;
friend = friend.NextFriend;
}
oldFriend.NextFriend = unit;
}
}
return unit;
}
/// <summary>
/// 轉(zhuǎn)義符號(hào)比較
/// </summary>
/// <param name="trans">轉(zhuǎn)義符號(hào)</param>
/// <param name="c">實(shí)際字符</param>
/// <returns>相等返回true</returns>
private bool TransCharEqual(Char trans, Char c)
{
return false;
}
/// <summary>
/// 獲取單詞對(duì)應(yīng)的標(biāo)簽
/// </summary>
/// <param name="word"></param>
/// <returns></returns>
public object GetTag(String word)
{
T_InnerInfo innerInfo = (T_InnerInfo)m_WordsTbl[word];
if (innerInfo != null)
{
return innerInfo.Tag;
}
else
{
return null;
}
}
/// <summary>
/// 獲取單詞對(duì)應(yīng)的權(quán)重級(jí)別
/// </summary>
/// <param name="word">單詞</param>
/// <returns>級(jí)別,未找到單詞,返回0</returns>
public int GetRank(String word)
{
if (!m_UseRank)
{
return 0;
}
T_InnerInfo innerInfo = (T_InnerInfo)m_WordsTbl[word];
if (innerInfo != null)
{
return innerInfo.Rank;
}
else
{
return 0;
}
}
public T_DfaUnit Next(T_DfaUnit cur, Char c)
{
if (cur == null)
{
T_DfaUnit unit = (T_DfaUnit)m_FstCharTbl[c];
if (unit == null)
{
return null;
}
else
{
return unit;
}
}
else
{
T_DfaUnit unit = cur.Childs;
while (unit != null)
{
if (unit.NeedTrans)
{
if (TransCharEqual(unit.Char, c))
{
return cur;
}
}
if (unit.Char == c)
{
return unit;
}
unit = unit.NextFriend;
}
}
return null;
}
/// <summary>
/// 遍歷有窮自動(dòng)機(jī),獲取最后一個(gè)和輸入單詞匹配的單元
/// </summary>
/// <param name="word">單詞</param>
/// <param name="pos">輸出位置</param>
/// <returns>最后一個(gè)匹配單元,如果第一個(gè)字符就不能匹配,返回null</returns>
private T_DfaUnit GetLastMatchUnit(String word, out int pos, object tag)
{
pos = 0;
T_DfaUnit cur = null;
T_DfaUnit last = null;
while (pos < word.Length)
{
last = cur;
cur = Next(cur, word[pos]);
if (cur == null)
{
return last;
}
pos++;
}
cur.QuitWord = word;
cur.Tag= tag;
return cur;
}
/// <summary>
/// 向有窮自動(dòng)機(jī)輸入單詞
/// </summary>
/// <param name="word">單詞</param>
/// <param name="rank">單詞的權(quán)重</param>
public void InsertWordToDfa(String word)
{
InsertWordToDfa(word, 0, null);
}
/// <summary>
/// 向有窮自動(dòng)機(jī)輸入單詞
/// </summary>
/// <param name="word">單詞</param>
/// <param name="tag">標(biāo)簽</param>
public void InsertWordToDfa(String word, object tag)
{
InsertWordToDfa(word, 0, tag);
}
/// <summary>
/// 向有窮自動(dòng)機(jī)輸入單詞
/// </summary>
/// <param name="word">單詞</param>
/// <param name="rank">單詞的權(quán)重</param>
public void InsertWordToDfa(String word, int rank, object tag)
{
if (word == null || word == "")
{
return;
}
if (rank != 0)
{
m_UseRank = true;
}
if (m_WordsTbl[word] != null)
{
return;
}
T_InnerInfo innerInfo = new T_InnerInfo();
innerInfo.Rank = rank;
innerInfo.Tag = tag;
m_WordsTbl[word] = innerInfo;
int pos;
T_DfaUnit unit = GetLastMatchUnit(word, out pos, tag);
bool needTrans = false;
for (int i = pos; i < word.Length; i++)
{
if (!needTrans && word[i] == '\\')
{
if (i == word.Length - 1)
{
//最后一個(gè)字符是轉(zhuǎn)義符號(hào)
throw (new Exception("Last char is trans char!"));
}
//轉(zhuǎn)義
needTrans = true;
continue;
}
if (i == word.Length - 1)
{
unit = AddChar(unit, word[i], word, needTrans, tag);
}
else
{
unit = AddChar(unit, word[i], null, needTrans, tag);
}
needTrans = false;
}
}
public CWordDfa()
{
m_WordsTbl = new Hashtable();
m_FstCharTbl = new Hashtable();
m_UseRank = false;
}
}
}
?? 快捷鍵說(shuō)明
復(fù)制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號(hào)
Ctrl + =
減小字號(hào)
Ctrl + -