// document.cs
// (Residue from the web source viewer this file was copied from: "Font size:" control label.)
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
namespace ClusterUsingKmeans
{
/// <summary>
/// A text document represented as a set of <see cref="Word"/> entries keyed by
/// word text. Each word carries a frequency and a feature value
/// (<c>CharacterValue</c>) that the similarity methods compare between documents.
/// </summary>
public class Document
{
    // Matches any string that contains at least one character in the range
    // U+4E00..U+9FA5. Built once and shared instead of once per token.
    private static readonly Regex ChineseCharacter = new Regex("[\u4e00-\u9fa5]");

    // Numerator of the log ratio used for the feature value
    // (0x5f5e100 == 100,000,000).
    // NOTE(review): presumably the total size of the reference corpus - confirm.
    private const long ReferenceTotal = 0x5f5e100;

    // Two documents are treated as unrelated (similarity 0) when the shared
    // words make up this fraction or less of either document's vocabulary.
    private const double MinOverlapRatio = 0.1;

    // Dictionary object obtained from LoadDict.getDict(); its "sogou" member is
    // consulted for every word when feature values are computed.
    LoadDict LD = LoadDict.getDict();

    // Initialized here so that a Document created with the parameterless
    // constructor is an empty, usable document instead of one whose every
    // member throws NullReferenceException.
    private Dictionary<string, Word> MyWord = new Dictionary<string, Word>();

    private string _fileName;
    private int _length;

    /// <summary>Creates an empty document with no words.</summary>
    public Document()
    {
    }

    /// <summary>
    /// Builds a document from a file: reads its text, segments it into tokens,
    /// counts word frequencies, computes feature values and drops words that
    /// cannot be weighted.
    /// </summary>
    /// <param name="fileName">Path passed to <c>IOControl.ReadFileUsingDefault</c>.</param>
    public Document(string fileName)
    {
        this._fileName = fileName;
        string strFile = IOControl.ReadFileUsingDefault(fileName);
        // Length is measured in characters of the raw text, not in tokens.
        _length = strFile.Length;
        string[] tempWord = WordSegment.SegmentWord(strFile);

        CountWordFrequencies(tempWord);
        ComputeCharacterValues();
    }

    /// <summary>Name of the file this document was built from.</summary>
    public string FileName
    {
        get { return _fileName; }
        set { _fileName = value; }
    }

    /// <summary>Length of the document text in characters.</summary>
    public int Length
    {
        get { return _length; }
        set { _length = value; }
    }

    /// <summary>
    /// Similarity to <paramref name="Doc"/>: the dot product of the feature
    /// values of the shared words divided by the product of their Euclidean norms.
    /// </summary>
    /// <param name="Doc">Document to compare with.</param>
    /// <returns>
    /// The similarity, or 0 when the shared words are 10% or less of either
    /// document's vocabulary, when either document is empty, or when the
    /// denominator is zero.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="Doc"/> is null.</exception>
    public double SimilitudeValueToDocumentUsingCos(Document Doc)
    {
        if (Doc == null)
        {
            throw new ArgumentNullException("Doc");
        }

        double dot;
        double selfSquares;
        double otherSquares;
        if (!TryAccumulateSharedTerms(Doc, out dot, out selfSquares, out otherSquares))
        {
            return 0;
        }

        double denominator = Math.Sqrt(selfSquares) * Math.Sqrt(otherSquares);
        // All shared feature values are zero: report "no similarity" rather than NaN (0/0).
        if (denominator == 0)
        {
            return 0;
        }
        return dot / denominator;
    }

    /// <summary>
    /// Similarity to <paramref name="Doc"/> using the generalized Jaccard form
    /// <c>dot / (|a|^2 + |b|^2 - dot)</c> over the feature values of the shared words.
    /// </summary>
    /// <param name="Doc">Document to compare with.</param>
    /// <returns>
    /// The similarity, or 0 when the shared words are 10% or less of either
    /// document's vocabulary, when either document is empty, or when the
    /// denominator is zero.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="Doc"/> is null.</exception>
    public double SimilitudeValueToDocumentUsingGeneralizedJaccardCoefficient(Document Doc)
    {
        if (Doc == null)
        {
            throw new ArgumentNullException("Doc");
        }

        double dot;
        double selfSquares;
        double otherSquares;
        if (!TryAccumulateSharedTerms(Doc, out dot, out selfSquares, out otherSquares))
        {
            return 0;
        }

        double denominator = selfSquares + otherSquares - dot;
        // Guard against 0/0 when every shared feature value is zero.
        if (denominator == 0)
        {
            return 0;
        }
        return dot / denominator;
    }

    /// <summary>Removes the word stored under <paramref name="key"/>.</summary>
    /// <returns>true if a word was removed; false if the key was not present.</returns>
    public bool deleteWord(string key)
    {
        return MyWord.Remove(key);
    }

    /// <summary>
    /// Adds a new <see cref="Word"/> created from <paramref name="key"/>;
    /// does nothing if the key is already present.
    /// </summary>
    public void addWord(string key)
    {
        if (!MyWord.ContainsKey(key))
        {
            MyWord.Add(key, new Word(key));
        }
    }

    /// <summary>
    /// Adds <paramref name="word"/> under its <c>Key</c>; does nothing if that
    /// key is already present.
    /// </summary>
    public void addWord(Word word)
    {
        if (!MyWord.ContainsKey(word.Key))
        {
            MyWord.Add(word.Key, word);
        }
    }

    /// <summary>Returns the word stored under <paramref name="key"/>, or null if there is none.</summary>
    public Word getWordByKey(string key)
    {
        Word found;
        // TryGetValue leaves "found" as null when the key is missing.
        MyWord.TryGetValue(key, out found);
        return found;
    }

    /// <summary>Returns the live key collection of the underlying word dictionary.</summary>
    public Dictionary<string, Word>.KeyCollection getAllWordKeys()
    {
        return MyWord.Keys;
    }

    // Counts how often each token occurs. Only tokens containing at least one
    // character matched by ChineseCharacter are admitted as new words.
    private void CountWordFrequencies(string[] tokens)
    {
        foreach (string token in tokens)
        {
            Word known;
            if (MyWord.TryGetValue(token, out known))
            {
                known.WordFrequency += 1;
            }
            else if (ChineseCharacter.IsMatch(token))
            {
                Word added = new Word(token);
                added.WordFrequency = 1;
                addWord(added);
            }
        }
    }

    // Assigns each word its feature value:
    //   (frequency / document length) * |ln(ReferenceTotal / lexicon count)|
    // and removes words that have no usable lexicon entry.
    private void ComputeCharacterValues()
    {
        // Removal is deferred: the dictionary must not be modified while it is enumerated.
        List<string> unusable = new List<string>();

        foreach (KeyValuePair<string, Word> entry in MyWord)
        {
            if (!LD.sogou.ContainsKey(entry.Key))
            {
                unusable.Add(entry.Key);
                continue;
            }

            double termFrequency = ((double)entry.Value.WordFrequency) / ((double)_length);
            long lexiconCount = int.Parse(LD.sogou[entry.Key].ToString());
            if (lexiconCount <= 0)
            {
                // A zero or negative count would make the logarithm Infinity/NaN and
                // poison every later similarity; treat the word as not weightable.
                unusable.Add(entry.Key);
                continue;
            }

            double weight = Math.Abs(Math.Log(((double)ReferenceTotal) / ((double)lexiconCount)));
            entry.Value.CharacterValue = termFrequency * weight;
        }

        foreach (string key in unusable)
        {
            deleteWord(key);
        }
    }

    // Accumulates, over the words present in both documents, the dot product and
    // the two sums of squared feature values. Returns false when the overlap is
    // too small (or either document is empty) for a similarity to be reported.
    // NOTE(review): the squared sums cover only the shared words, not each
    // document's full vocabulary - confirm this is the intended normalization.
    private bool TryAccumulateSharedTerms(Document other, out double dot, out double selfSquares, out double otherSquares)
    {
        dot = 0;
        selfSquares = 0;
        otherSquares = 0;

        // An empty vocabulary would make the overlap ratio 0/0 (NaN), which
        // slips past the threshold test below.
        if (MyWord.Count == 0 || other.MyWord.Count == 0)
        {
            return false;
        }

        double shared = 0;
        foreach (KeyValuePair<string, Word> entry in MyWord)
        {
            Word theirs;
            if (!other.MyWord.TryGetValue(entry.Key, out theirs))
            {
                continue;
            }

            double mine = (double)entry.Value.CharacterValue;
            double yours = (double)theirs.CharacterValue;
            dot += mine * yours;
            selfSquares += mine * mine;
            otherSquares += yours * yours;
            shared += 1;
        }

        return (shared / ((double)MyWord.Count)) > MinOverlapRatio
            && (shared / ((double)other.MyWord.Count)) > MinOverlapRatio;
    }
}
}
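// Keyboard shortcuts (residue from the web source viewer this file was copied from):
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full-screen mode:   F11
//   Switch theme:       Ctrl + Shift + D
//   Show shortcuts:     ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -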