// ktdictsegtokenizer.cs
字號:
/**********************************************
* 采用KTDictSeg的Lucene.Net 中文分詞分析器
* 參考 suyuan 的開源代碼修改
* suyuan 的開源代碼出處 http://www.cnblogs.com/suyuan/archive/2008/03/25/1120827.html
*********************************************/
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using KTDictSeg;
using FTAlgorithm;
using Lucene.Net.Analysis;
namespace Lucene.Net.Analysis.KTDictSeg
{
public class KTDictSegTokenizer : Tokenizer
{
    // The KTDictSeg segmenter is expensive to create (it loads a config file and a
    // dictionary from disk), so one instance is shared by every tokenizer.
    private static CSimpleDictSeg m_SimpleDictSeg;

    // Guards the one-time lazy initialization of the shared segmenter above.
    private static readonly object s_InitLock = new object();

    // Words produced by segmenting the whole input, filled once in the constructor.
    private List<T_WordInfo> m_WordList;

    private int m_Position = -1; // Index of the current word within m_WordList.

    /// <summary>
    /// Creates a tokenizer that reads the entire input and segments it with
    /// KTDictSeg (a third-party Chinese word segmentation component).
    /// </summary>
    /// <param name="input">Reader whose full contents are segmented up front.</param>
    public KTDictSegTokenizer(System.IO.TextReader input)
        : base(input)
    {
        // Initialize the shared segmenter exactly once, even when several
        // tokenizers are constructed concurrently (double-checked locking).
        if (m_SimpleDictSeg == null)
        {
            lock (s_InitLock)
            {
                if (m_SimpleDictSeg == null)
                {
                    try
                    {
                        CSimpleDictSeg seg = new CSimpleDictSeg();
                        seg.LoadConfig("KTDictSeg.xml");
                        seg.LoadDict();
                        // Publish only after loading succeeds so other threads
                        // never observe a half-initialized segmenter.
                        m_SimpleDictSeg = seg;
                    }
                    catch
                    {
                        m_SimpleDictSeg = null;
                        // "throw;" (not "throw e;") preserves the original stack trace.
                        throw;
                    }
                }
            }
        }
        m_WordList = m_SimpleDictSeg.SegmentToWordInfos(input.ReadToEnd());
    }

    /// <summary>
    /// Returns the next token, or null when the word list is exhausted.
    /// Implementing Next() is the core of a Lucene.Net tokenizer: each segmented
    /// word becomes one Token — the basic unit of analysis — carrying its start
    /// and end character offsets in the original text.
    /// </summary>
    public override Token Next()
    {
        m_Position++;
        if (m_Position < m_WordList.Count)
        {
            T_WordInfo info = m_WordList[m_Position];
            int start = info.Position;     // Start offset in the source text.
            int length = info.Word.Length; // Word length determines the end offset.
            return new Token(info.Word, start, start + length);
        }
        return null; // End of stream.
    }
}
}
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -