// File: simpledictseg.cs
// (code-viewer page residue: "Font size:" control label)
foreach (T_WordInfo wordInfo in words)
{
if (lastPos < wordInfo.Position)
{
/*
String unMatchWord = word.Substring(lastPos, wordInfo.Position - lastPos);
InsertWordToArray(unMatchWord, retWords);
*/
//中間有未匹配詞,將單個字逐個加入
for (int j = lastPos; j < wordInfo.Position; j++)
{
InsertWordToArray(word[j].ToString(), retWords);
}
}
lastPos = wordInfo.Position + wordInfo.Word.Length ;
//統計中文姓名的后綴
if (AutoStudy && lstIsName)
{
T_DictStruct wordDict = (T_DictStruct)wordInfo.Tag;
if ((wordDict.Pos & (int)T_POS.POS_A_NR) == 0)
{
m_MatchNameRule.AddBefore(wordInfo.Word);
}
lstIsName = false;
}
//統計中文姓名的前綴
//如總統,主席等
if ((((T_DictStruct)wordInfo.Tag).Pos & (int)T_POS.POS_A_NR) != 0)
{
if (wordInfo.Word.Length > 1 && wordInfo.Word.Length <= 4 && retWords.Count > 0 && AutoStudy && !lstIsName)
{
T_DictStruct wordDict = (T_DictStruct)wordInfo.Tag;
m_MatchNameRule.AddBefore(retWords[retWords.Count - 1]);
}
lstIsName = true;
}
InsertWordToArray(wordInfo.Word, retWords);
}
if (lastPos < word.Length)
{
//尾部有未匹配詞,將單個字逐個加入
for (int j = lastPos; j < word.Length; j++)
{
InsertWordToArray(word[j].ToString(), retWords);
}
//InsertWordToArray(word.Substring(lastPos, word.Length - lastPos), retWords);
}
}
i++;
}
return retWords;
}
/// <summary>
/// Records one sighting of a candidate unknown word and, once its count
/// exceeds UnknownWordsThreshold (and AutoInsertUnknownWords is on),
/// promotes it into the main dictionary.
/// </summary>
/// <param name="word">Candidate word; only lengths 2 and 3 are counted.</param>
/// <param name="Pos">Part-of-speech flag OR-ed into the candidate's entry.</param>
private void TrafficUnknownWord(String word, T_POS Pos)
{
    // Only two- and three-character candidates are tracked.
    bool trackable = word.Length > 1 && word.Length <= 3;
    if (!trackable)
    {
        return;
    }

    T_DictStruct candidate = m_UnknownWordsDictMgr.GetWord(word);
    if (candidate == null)
    {
        // First sighting: create the entry with a count of 1.
        m_UnknownWordsDictMgr.InsertWord(word, 1, (int)Pos);
        return;
    }

    // A blocked unknown word is marked by Pos == 0 and is never counted.
    if (candidate.Pos == 0)
    {
        return;
    }

    candidate.Pos |= (int)Pos;
    candidate.Frequency++;

    bool promote = candidate.Frequency > UnknownWordsThreshold && AutoInsertUnknownWords;
    if (!promote)
    {
        return;
    }

    T_DictStruct existing = m_DictMgr.GetWord(word);
    if (existing != null)
    {
        // Already in the main dictionary: merge flags and counts into it.
        existing.Pos |= candidate.Pos;
        existing.Frequency += candidate.Frequency;
    }
    else
    {
        // New to the main dictionary: register it with the dictionary,
        // the matcher and the part-of-speech table.
        m_DictMgr.InsertWord(word, candidate.Frequency, candidate.Pos);
        m_ExtractWords.InsertWordToDfa(word, candidate);
        m_POS.AddWordPos(word, candidate.Pos);
    }

    // Restart the candidate's count after it has been promoted/merged.
    candidate.Frequency = 0;
}
/// <summary>
/// Recovers unknown words: every run of consecutive tokens that
/// m_POS.IsUnknowOneCharWord accepts is merged into a single word.
/// When AutoStudy is on, candidate words are also reported to
/// TrafficUnknownWord so their frequency can be tracked.
/// </summary>
/// <param name="words">Token list to post-process; it is not modified.</param>
/// <returns>A new list in which runs of unknown single-character tokens are merged.</returns>
private List<String> RecoverUnknowWord(List<String> words)
{
    List<String> retWords = new List<String>();
    int i = 0;
    // Index just past the tokens already covered by a look-ahead scan, so a
    // run of single characters is not rescanned from each of its members.
    int j = 0;
    while (i < words.Count)
    {
        String w = words[i];
        if (i == words.Count - 1)
        {
            // The final token has nothing after it to merge with.
            retWords.Add(w);
            break;
        }
        if (m_POS.IsUnknowOneCharWord(w))
        {
            // Merge this token with the unknown single-character tokens that follow it.
            String word = w;
            i++;
            while (m_POS.IsUnknowOneCharWord(words[i]))
            {
                word += words[i];
                i++;
                if (i >= words.Count)
                {
                    break;
                }
            }
            if (AutoStudy)
            {
                TrafficUnknownWord(word, T_POS.POS_A_NZ);
                // Join all following single characters in U+4E00..U+9FA5 onto the
                // merged word, assume the result is an unknown word, and count it.
                if (j < i && w[0] >= 0x4e00 && w[0] <= 0x9fa5)
                {
                    j = i;
                    if (j < words.Count)
                    {
                        String longWord = word;
                        while (words[j].Length == 1 && words[j][0] >= 0x4e00 && words[j][0] <= 0x9fa5)
                        {
                            longWord += words[j];
                            j++;
                            if (j >= words.Count)
                            {
                                break;
                            }
                        }
                        // Count the extended word only if the scan appended something.
                        // Otherwise longWord equals word, which was already counted
                        // above, and the same sighting would be counted twice.
                        if (longWord.Length > word.Length)
                        {
                            TrafficUnknownWord(longWord, T_POS.POS_A_NZ);
                        }
                    }
                }
            }
            retWords.Add(word);
            // i already points at the first token after the merged run.
            continue;
        }
        else
        {
            if (AutoStudy)
            {
                // Join this single character with the single characters in
                // U+4E00..U+9FA5 that follow it, assume the result is an
                // unknown word, and count it.
                if (j <= i && w.Length == 1 && w[0] >= 0x4e00 && w[0] <= 0x9fa5)
                {
                    j = i + 1;
                    String word = w;
                    if (j < words.Count)
                    {
                        while (words[j].Length == 1 && words[j][0] >= 0x4e00 && words[j][0] <= 0x9fa5)
                        {
                            word += words[j];
                            j++;
                            if (j >= words.Count)
                            {
                                break;
                            }
                        }
                        TrafficUnknownWord(word, T_POS.POS_A_NZ);
                    }
                }
            }
            // The token itself is passed through unchanged.
            retWords.Add(w);
        }
        i++;
    }
    return retWords;
}
/// <summary>
/// Segments the text without filtering stop words: pre-segments it,
/// applies the rules in m_Rules at each position, then merges unknown
/// single-character tokens via RecoverUnknowWord.
/// </summary>
/// <param name="str">Text to segment.</param>
/// <returns>The resulting word list, stop words included.</returns>
private List<String> SegmentNoStopWord(String str)
{
    List<String> tokens = PreSegment(str);
    List<String> merged = new List<String>();

    for (int pos = 0; pos < tokens.Count; )
    {
        bool handled = false;
        foreach (IRule rule in m_Rules)
        {
            // The name-matching rule is skipped when name matching is off.
            if (!m_MatchName && rule is MatchName)
            {
                continue;
            }

            int advanceTo = rule.ProcRule(tokens, pos, merged);
            if (advanceTo > 0)
            {
                // The first rule returning a positive index wins; resume there.
                pos = advanceTo;
                handled = true;
                break;
            }
        }

        if (!handled)
        {
            // No rule applied: pass the token through as-is.
            merged.Add(tokens[pos]);
            pos++;
        }
    }

    List<String> recovered = RecoverUnknowWord(merged);
    if (!AutoStudy)
    {
        return recovered;
    }

    // Self-learning: bump the frequency of every word the matcher has a tag for.
    foreach (String token in recovered)
    {
        T_DictStruct entry = (T_DictStruct)m_ExtractWords.GetTag(token);
        if (entry != null)
        {
            entry.Frequency++;
        }
    }
    return recovered;
}
/// <summary>
/// Periodically saves the latest dictionary and statistics: when AutoStudy
/// is on and more than AutoSaveInterval seconds have passed since
/// m_LastSaveTime, the timestamp is refreshed and SaveDict is called.
/// </summary>
private void SaveDictOnTime()
{
    if (AutoStudy)
    {
        double elapsedSeconds = (DateTime.Now - m_LastSaveTime).TotalSeconds;
        if (elapsedSeconds > AutoSaveInterval)
        {
            // The timestamp is updated before saving, as in the original order.
            m_LastSaveTime = DateTime.Now;
            SaveDict();
        }
    }
}
/// <summary>
/// Segments the text and returns a word-info entry (word plus position)
/// for each resulting word.
/// </summary>
/// <param name="str">Text to segment.</param>
/// <returns>
/// One T_WordInfo per kept word. Position is the summed length of all
/// preceding words, including any that were filtered out as stop words.
/// </returns>
public List<T_WordInfo> SegmentToWordInfos(String str)
{
    // Save the dictionary if the auto-save interval has elapsed.
    SaveDictOnTime();

    List<String> tokens = SegmentNoStopWord(str);
    List<T_WordInfo> infos = new List<T_WordInfo>();
    int offset = 0;

    for (int k = 0; k < tokens.Count; k++)
    {
        String token = tokens[k];

        // A word is dropped only when filtering is on and either stop-word
        // table has a non-null entry for it.
        bool isStopWord = m_FilterStopWords
            && (m_ChsStopwordTbl[token] != null || m_EngStopwordTbl[token] != null);

        if (!isStopWord)
        {
            T_WordInfo info = new T_WordInfo();
            info.Word = token;
            info.Position = offset;
            infos.Add(info);
        }

        // Dropped words still advance the position counter.
        offset += token.Length;
    }

    return infos;
}
/// <summary>
/// Segments the text and returns only the list of words.
/// </summary>
/// <param name="str">Text to segment.</param>
/// <returns>
/// The segmented words. When m_FilterStopWords is set, words with a non-null
/// entry in either stop-word table are left out.
/// </returns>
public List<String> Segment(String str)
{
    // Save the dictionary if the auto-save interval has elapsed.
    SaveDictOnTime();

    List<String> tokens = SegmentNoStopWord(str);
    if (m_FilterStopWords)
    {
        List<String> kept = new List<String>();
        foreach (String token in tokens)
        {
            // Keep the word only when neither table lists it.
            if (m_ChsStopwordTbl[token] == null && m_EngStopwordTbl[token] == null)
            {
                kept.Add(token);
            }
        }
        return kept;
    }

    return tokens;
}
#endregion
}
}
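// Keyboard shortcuts (code-viewer page residue, not part of the program):
//   Copy code:           Ctrl + C
//   Search code:         Ctrl + F
//   Full-screen mode:    F11
//   Toggle theme:        Ctrl + Shift + D
//   Show shortcuts:      ?
//   Increase font size:  Ctrl + =
//   Decrease font size:  Ctrl + -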