?? extractwords.cs
字號:
/***************************************************************************************
* KTDictSeg 簡介: KTDictSeg 是由KaiToo搜索開發的一款基于字典的簡單中英文分詞算法
* 主要功能: 中英文分詞,未登錄詞識別,多元歧義自動識別,全角字符識別能力
* 主要性能指標:
* 分詞準確度:90%以上(有待專家的權威評測)
* 處理速度: 600KBytes/s
*
* 版本: V1.2.02
* Copyright(c) 2007 http://www.kaitoo.com
* 作者:肖波
* 授權: 開源GPL
* 公司網站: http://www.kaitoo.com
* 個人博客: http://blog.csdn.net/eaglet; http://www.cnblogs.com/eaglet
* 聯系方式: blog.eaglet@gmail.com
* ***************************************************************************************/
using System;
using System.Collections.Generic;
using System.Collections;
using System.Text;
using System.Diagnostics;
namespace FTAlgorithm
{
/// <summary>
/// Direction used when matching words.
/// </summary>
public enum T_Direction
{
    /// <summary>Left to right.</summary>
    LeftToRight = 0,

    /// <summary>Right to left.</summary>
    RightToLeft = 1
}
/// <summary>
/// Information about a single word.
/// </summary>
public class T_WordInfo
{
    /// <summary>The word itself.</summary>
    public string Word;

    /// <summary>Position of the word's first character within the full text.</summary>
    public int Position;

    /// <summary>Weight level (rank) of the word.</summary>
    public int Rank;

    /// <summary>Tag object associated with the word.</summary>
    public object Tag;
}
/// <summary>
/// Callback that decides whether a candidate node list should be selected over the
/// current one. In the visible call site (GameTree) it is invoked only when the
/// candidate ties the current best on both depth and space and the minimum space is 0.
/// </summary>
/// <param name="words">The word list passed through from the caller.</param>
/// <param name="pre">The node list currently selected (GameTree passes m_GameNodes).</param>
/// <param name="cur">The candidate node list (GameTree passes the newly built list).</param>
/// <returns>True to select <paramref name="cur"/>; false to keep <paramref name="pre"/>.</returns>
public delegate bool CompareByPosFunc(List<T_WordInfo> words, List<int> pre, List<int> cur);
/// <summary>
/// Callback that confirms or vetoes a candidate node list that already beats the
/// current one. In the visible call site (GameTree) it is invoked only when the
/// recorded minimum space is 0, and its result replaces the default "select" decision.
/// NOTE(review): the name suggests a frequency-based choice; the criterion itself is
/// defined by the subscriber and is not visible here.
/// </summary>
/// <param name="words">The word list passed through from the caller.</param>
/// <param name="pre">The node list currently selected (GameTree passes m_GameNodes).</param>
/// <param name="cur">The candidate node list (GameTree passes the newly built list).</param>
/// <returns>True to select <paramref name="cur"/>; false to keep <paramref name="pre"/>.</returns>
public delegate bool SelectByFreqFunc(List<T_WordInfo> words, List<int> pre, List<int> cur);
/// <summary>
/// 從全文中提取指定的單詞,及其位置
/// </summary>
public class CExtractWords
{
// Word DFA helper (CWordDfa is declared elsewhere); created in the constructor and
// used by GetTag and InsertWordToDfa.
CWordDfa m_WordDfa;
// Node list of the best candidate selected so far by GameTree.
// NOTE(review): not initialized in the constructor; presumably assigned elsewhere before GameTree runs.
List<int> m_GameNodes;
// spaceNum recorded for the currently selected candidate; GameTree prefers smaller values.
int m_MinSpace;
// deep recorded for the currently selected candidate; with equal space, smaller is preferred.
int m_MinDeep;
// Backing field of MatchDirection; set to LeftToRight in the constructor.
T_Direction m_MatchDirection;
// Backing field of CompareByPosEvent; may be null (GameTree checks before invoking).
CompareByPosFunc m_CompareByPos;
// Backing field of SelectByFreqEvent; may be null (GameTree checks before invoking).
SelectByFreqFunc m_SelectByFreq;
/// <summary>
/// Gets or sets the optional callback used to choose between two candidate node
/// lists that tie on depth and space. May be null.
/// </summary>
public CompareByPosFunc CompareByPosEvent
{
    get { return this.m_CompareByPos; }
    set { this.m_CompareByPos = value; }
}
/// <summary>
/// Gets or sets the optional callback that can confirm or veto a candidate node
/// list that would otherwise be selected. May be null.
/// </summary>
public SelectByFreqFunc SelectByFreqEvent
{
    get { return this.m_SelectByFreq; }
    set { this.m_SelectByFreq = value; }
}
/// <summary>
/// Gets or sets the match direction. The constructor sets it to
/// <see cref="T_Direction.LeftToRight"/>.
/// </summary>
public T_Direction MatchDirection
{
    get { return this.m_MatchDirection; }
    set { this.m_MatchDirection = value; }
}
/// <summary>
/// Creates an extractor with a new <c>CWordDfa</c> and a left-to-right match direction.
/// </summary>
public CExtractWords()
{
    this.m_WordDfa = new CWordDfa();
    this.m_MatchDirection = T_Direction.LeftToRight;
}
/// <summary>
/// Returns whatever the underlying <c>CWordDfa.GetTag</c> returns for the given word.
/// </summary>
/// <param name="word">The word to look up.</param>
/// <returns>The tag object reported by the word DFA.</returns>
public object GetTag(String word)
{
    object tag = this.m_WordDfa.GetTag(word);
    return tag;
}
/// <summary>
/// Forwards a word and its tag to the underlying <c>CWordDfa.InsertWordToDfa</c>.
/// </summary>
/// <param name="word">The word to insert.</param>
/// <param name="tag">The tag object to associate with the word.</param>
public void InsertWordToDfa(String word, object tag)
{
    this.m_WordDfa.InsertWordToDfa(word, tag);
}
/// <summary>
/// Compares two node lists word by word and reports whether <paramref name="cur"/>
/// wins: at the first index where the two referenced words differ in length, the
/// list whose word is longer wins.
/// </summary>
/// <param name="words">Word list that the entries of both node lists index into.</param>
/// <param name="pre">Node list currently selected.</param>
/// <param name="cur">Candidate node list.</param>
/// <param name="direction">
/// LeftToRight walks indices upward from 0; RightToLeft walks downward from
/// <c>cur.Count - 1</c>. Any other value performs no comparison.
/// </param>
/// <returns>
/// True if the first differing word is longer in <paramref name="cur"/>; false if it
/// is shorter, or if no difference is found before the walk ends.
/// </returns>
private bool CompareGroup(List<T_WordInfo> words, List<int> pre, List<int> cur, T_Direction direction)
{
    bool forward = direction == T_Direction.LeftToRight;
    int start = forward ? 0 : cur.Count - 1;

    if (!forward && direction != T_Direction.RightToLeft)
    {
        // Unknown direction: nothing is compared.
        return false;
    }

    int step = forward ? 1 : -1;

    // Both lists are read at the same index, so the walk also stops as soon as the
    // index falls outside pre. NOTE(review): in RightToLeft mode with cur longer than
    // pre this stops immediately and yields false without comparing anything.
    for (int index = start; index >= 0 && index < cur.Count && index < pre.Count; index += step)
    {
        int curLength = words[cur[index]].Word.Length;
        int preLength = words[pre[index]].Word.Length;

        if (curLength != preLength)
        {
            return curLength > preLength;
        }
    }

    return false;
}
/// <summary>
/// 博弈樹
/// </summary>
/// <param name="words"></param>
/// <param name="nodes"></param>
/// <param name="init"></param>
/// <param name="begin"></param>
/// <param name="end"></param>
/// <param name="spaceNum"></param>
/// <param name="deep"></param>
/// <returns></returns>
private List<int> GameTree(List<T_WordInfo> words, List<int> nodes, bool init, int begin, int end, ref int spaceNum, ref int deep)
{
if (init)
{
int startPos = ((T_WordInfo)words[begin]).Position;
for (int i = begin; i <= end ; i++)
{
T_WordInfo wordInfo = (T_WordInfo)words[i];
spaceNum = wordInfo.Position - startPos;
deep = 0;
List<int> oneNodes;
if (i == end)
{
oneNodes = new List<int>();
oneNodes.Add(i);
deep++;
}
else
{
oneNodes = GameTree(words, nodes, false, i, end, ref spaceNum, ref deep);
}
if (oneNodes != null)
{
bool select = false;
if (m_MinSpace > spaceNum ||
(m_MinSpace == spaceNum && deep < m_MinDeep))
{
select = true;
if (m_MinSpace == 0)
{
if (SelectByFreqEvent != null)
{
select = SelectByFreqEvent(words, m_GameNodes, oneNodes);
}
}
}
else if (m_MinDeep == deep && m_MinSpace == spaceNum)
{
if (m_CompareByPos != null && m_MinSpace == 0)
{
select = m_CompareByPos(words, m_GameNodes, oneNodes);
}
else
{
select = CompareGroup(words, m_GameNodes, oneNodes, MatchDirection);
}
}
if (select)
{
m_MinDeep = deep;
m_MinSpace = spaceNum;
m_GameNodes.Clear();
foreach (int obj in oneNodes)
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -