// simpledictseg.cs
{
//按行讀取中文停用詞
string strEngStop = swEngFile.ReadLine();
//如果哈希表中不包括該停用詞則添加到哈希表中
if (!m_EngStopwordTbl.Contains(strEngStop))
{
m_EngStopwordTbl.Add(strEngStop, numEngStop);
numEngStop++;
}
}
swChrFile.Close();
swEngFile.Close();
}
catch
{
throw;
}
}
/// <summary>
/// Saves the Chinese stop words to a text file, one word per line.
/// </summary>
/// <param name="fileName">Path of the file to write; an existing file is overwritten.</param>
/// <remarks>
/// File access exceptions are not handled here; they propagate to the caller.
/// </remarks>
public void SaveChsStopwordDict(String fileName)
{
    // FileMode.Create makes a new file or truncates an existing one.
    // The using blocks release the file handle even when a write throws,
    // which the previous explicit Close() calls did not guarantee.
    using (FileStream fs = new FileStream(fileName, FileMode.Create, FileAccess.Write))
    using (StreamWriter sw = new StreamWriter(fs, Encoding.GetEncoding("UTF-8")))
    {
        // Only the keys (the words themselves) of the stop word table are written.
        foreach (DictionaryEntry entry in m_ChsStopwordTbl)
        {
            sw.WriteLine(entry.Key.ToString());
        }
    }
}
/// <summary>
/// Saves the English stop words to a text file, one word per line.
/// </summary>
/// <param name="fileName">Path of the file to write; an existing file is overwritten.</param>
/// <remarks>
/// File access exceptions are not handled here; they propagate to the caller.
/// </remarks>
public void SaveEngStopwordDict(String fileName)
{
    // FileMode.Create makes a new file or truncates an existing one.
    // The using blocks release the file handle even when a write throws,
    // which the previous explicit Close() calls did not guarantee.
    using (FileStream fs = new FileStream(fileName, FileMode.Create, FileAccess.Write))
    using (StreamWriter sw = new StreamWriter(fs, Encoding.GetEncoding("UTF-8")))
    {
        // Only the keys (the words themselves) of the stop word table are written.
        foreach (DictionaryEntry entry in m_EngStopwordTbl)
        {
            sw.WriteLine(entry.Key.ToString());
        }
    }
}
/// <summary>
/// Adds a Chinese stop word to the in-memory stop word table.
/// </summary>
/// <param name="word">The stop word to add.</param>
public void AddChsStopword(String word)
{
    // A word that is already in the table is left untouched.
    bool alreadyPresent = m_ChsStopwordTbl.Contains(word);
    if (!alreadyPresent)
    {
        // The table's current entry count is stored as the value for the new word.
        m_ChsStopwordTbl.Add(word, m_ChsStopwordTbl.Count);
    }
}
/// <summary>
/// Removes a Chinese stop word from the in-memory stop word table.
/// </summary>
/// <param name="word">The stop word to remove.</param>
public void DelChsStopword(String word)
{
//Original author's note: a word that is not in the table results in no operation
//(this relies on Remove ignoring missing keys - confirm for the table's type)
m_ChsStopwordTbl.Remove(word);
}
/// <summary>
/// Adds an English stop word to the in-memory stop word table.
/// </summary>
/// <param name="word">The stop word to add.</param>
public void AddEngStopword(String word)
{
    // A word that is already in the table is left untouched.
    bool alreadyPresent = m_EngStopwordTbl.Contains(word);
    if (!alreadyPresent)
    {
        // The table's current entry count is stored as the value for the new word.
        m_EngStopwordTbl.Add(word, m_EngStopwordTbl.Count);
    }
}
/// <summary>
/// Removes an English stop word from the in-memory stop word table.
/// </summary>
/// <param name="word">The stop word to remove.</param>
public void DelEngStopword(String word)
{
//Original author's note: a word that is not in the table results in no operation
//(this relies on Remove ignoring missing keys - confirm for the table's type)
m_EngStopwordTbl.Remove(word);
}
#endregion
#region 加載字典
/// <summary>
/// Loads the dictionaries without clearing word frequencies;
/// forwards to <c>LoadDict(false)</c>.
/// </summary>
public void LoadDict()
{
LoadDict(false);
}
/// <summary>
/// Loads the name statistics, the main dictionary and the unknown-word
/// dictionary from files located under <c>m_DictPath</c>.
/// </summary>
/// <param name="clear">
/// When true, the frequency of every loaded word is reset to 0 and the
/// name statistics are cleared after loading.
/// </param>
public void LoadDict(bool clear)
{
    String nameTrafficFile = m_DictPath + "Name.dct";
    String mainDictFile = m_DictPath + "Dict.dct";
    String unknownWordsFile = m_DictPath + "UnknownWords.dct";

    // Name prefix/suffix statistics table.
    m_MatchNameRule.LoadNameTraffic(nameTrafficFile);

    // Main dictionary, also handed to the dictionary manager.
    m_Dict = Dict.LoadFromBinFileEx(mainDictFile);
    m_DictMgr.Dict = m_Dict;

    foreach (T_DictStruct entry in m_Dict.Dicts)
    {
        if (clear)
        {
            entry.Frequency = 0;
        }

        // Register the word with the extractor (the entry itself is the tag)
        // and pass its Pos value to m_POS.
        m_ExtractWords.InsertWordToDfa(entry.Word, entry);
        m_POS.AddWordPos(entry.Word, entry.Pos);
    }

    // Unknown-word statistics dictionary: start with an empty one when no file exists.
    if (!File.Exists(unknownWordsFile))
    {
        m_UnknownWordsDict = new T_DictFile();
    }
    else
    {
        m_UnknownWordsDict = Dict.LoadFromBinFileEx(unknownWordsFile);
    }
    m_UnknownWordsDictMgr.Dict = m_UnknownWordsDict;

    if (clear)
    {
        m_MatchNameRule.ClearNameTraffic();
    }

    m_MatchNameRule.TrafficUnknownWordHandle = TrafficUnknownWord;
}
/// <summary>
/// Writes the name statistics, the main dictionary and the unknown-word
/// dictionary to their files under <c>m_DictPath</c>.
/// </summary>
public void SaveDict()
{
    m_MatchNameRule.SaveNameTraffic(m_DictPath + "Name.dct");

    // Before saving, copy the frequency held by the extractor's tag for
    // each word back into the dictionary entry.
    foreach (T_DictStruct entry in m_Dict.Dicts)
    {
        T_DictStruct tagged = (T_DictStruct)m_ExtractWords.GetTag(entry.Word);
        if (tagged == null)
        {
            // No tag for this word: keep the entry's frequency as it is.
            continue;
        }

        entry.Frequency = tagged.Frequency;
    }

    Dict.SaveToBinFileEx(m_DictPath + "Dict.dct", m_Dict);
    Dict.SaveToBinFileEx(m_DictPath + "UnknownWords.dct", m_UnknownWordsDict);
}
#endregion
#region 分詞屬性
bool m_MatchName;
/// <summary>
/// Gets or sets whether Chinese personal names are matched.
/// </summary>
public bool MatchName
{
    get { return m_MatchName; }
    set { m_MatchName = value; }
}
T_Direction m_MatchDirection;
/// <summary>
/// Gets or sets the matching direction.
/// Per the original documentation, the default is left-to-right,
/// i.e. forward matching.
/// </summary>
public T_Direction MatchDirection
{
    get { return m_MatchDirection; }
    set { m_MatchDirection = value; }
}
bool m_FilterStopWords;
/// <summary>
/// Gets or sets whether stop words are filtered.
/// </summary>
/// <remarks>
/// Setting this to true while either stop word table is empty calls
/// <c>LoadStopwordsDict</c> with the files named by
/// <c>CHS_STOP_WORD_FILENAME</c> and <c>ENG_STOP_WORD_FILENAME</c>
/// under <c>m_DictPath</c>.
/// </remarks>
public bool FilterStopWords
{
    get { return m_FilterStopWords; }
    set
    {
        // The table counts are only inspected when filtering is being switched on.
        if (value && (m_ChsStopwordTbl.Count == 0 || m_EngStopwordTbl.Count == 0))
        {
            LoadStopwordsDict(m_DictPath + CHS_STOP_WORD_FILENAME, m_DictPath + ENG_STOP_WORD_FILENAME);
        }

        m_FilterStopWords = value;
    }
}
#endregion
#region 分詞
/// <summary>
/// Appends a word to the end of the given result list.
/// </summary>
/// <param name="word">The word to append.</param>
/// <param name="arr">The list that receives the word.</param>
private void InsertWordToArray(String word, List<String> arr)
{
arr.Add(word);
}
/// <summary>
/// 預分詞
/// </summary>
/// <param name="str">要分詞的句子</param>
/// <returns>預分詞后的字符串輸出</returns>
private List<String> PreSegment(String str)
{
ArrayList initSeg = new ArrayList();
if (!CRegex.GetSingleMatchStrings(str, PATTERNS, true, ref initSeg))
{
return new List<String>();
}
List<String> retWords = new List<String>();
int i = 0;
m_ExtractWords.MatchDirection = MatchDirection;
while (i < initSeg.Count)
{
String word = (String)initSeg[i];
if (word == "")
{
word = " ";
}
if (i < initSeg.Count - 1)
{
bool mergeOk = false;
if (((word[0] >= '0' && word[0] <= '9') ||(word[0] >= '0' && word[0] <= '9')) &&
((word[word.Length - 1] >= '0' && word[word.Length - 1] <= '9') ||
(word[word.Length - 1] >= '0' && word[word.Length - 1] <= '9'))
)
{
//合并浮點數
word = MergeFloat(initSeg, i, ref i);
mergeOk = true;
}
else if ((word[0] >= 'a' && word[0] <= 'z') ||
(word[0] >= 'A' && word[0] <= 'Z')
)
{
//合并成英文專業名詞
String specialEnglish = MergeEnglishSpecialWord(m_ExtractWords, initSeg, i, ref i);
if (specialEnglish != null)
{
InsertWordToArray(specialEnglish, retWords);
continue;
}
//合并郵件地址
if ((String)initSeg[i + 1] != "")
{
if (((String)initSeg[i + 1])[0] == '@')
{
word = MergeEmail(initSeg, i, ref i);
mergeOk = true;
}
}
}
if (mergeOk)
{
InsertWordToArray(word, retWords);
continue;
}
}
if (word[0] < 0x4e00 || word[0] > 0x9fa5)
{
//英文或符號,直接加入
InsertWordToArray(word, retWords);
}
else
{
List<T_WordInfo> words = m_ExtractWords.ExtractFullTextMaxMatch(word);
int lastPos = 0;
bool lstIsName = false; //前一個詞是人名
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -