// vsm.java
// (Leftover page label, apparently from a web code viewer: "Font size:")
package yus.baseline;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
/**
 * Counts words in a fixed set of text files and derives two per-document
 * feature tables from those counts: one ranked by an information-gain style
 * score and one ranked by a tf-idf style weight.
 *
 * <p>All counters live in string-keyed maps:
 * <ul>
 * <li>{@code "<file>/<word>"} - occurrences of a word in one file;</li>
 * <li>{@code "total/<word>"} - occurrences of a word in all files;</li>
 * <li>{@code "totalWords/<file>"} and {@code "totalWords/allFiles"} - number of
 * counted words per file and overall.</li>
 * </ul>
 */
public class Vsm {
    /** Corpus file names, "1.txt" through "100.txt". */
    final static String[] FILES = numberedFiles(100);
    /** Stop-word list, one entry per line, resolved against the working directory. */
    final static String StopWordFile = "stop_words_ch.txt";
    /** Number of top-ranked words kept as features for every file. */
    final static int Dimensionality = 100;
    /** Placeholder score used when a selected word has no per-file value. */
    final static double MIN = -9999.0;

    /**
     * Runs the whole pipeline (counting, scoring, both feature selections) and
     * prints the elapsed wall-clock time. The feature tables are not used further.
     *
     * @param args ignored
     * @throws IOException if the stop-word file or a corpus file cannot be read
     */
    public static void main(String[] args) throws IOException {
        long startTime = System.currentTimeMillis();
        Map<String, Integer> wordInFileHM = new HashMap<String, Integer>(); // N(f,w)
        Map<String, Integer> wordHM = new HashMap<String, Integer>(); // N(w)
        // Word totals per file and over all files.
        Map<String, Integer> totalWordHM = new HashMap<String, Integer>();
        // Information-gain score per word and per file/word pair.
        Map<String, Double> valueHM = new HashMap<String, Double>();
        List<String> wordList = statistic(wordInFileHM, wordHM, totalWordHM,
                new ArrayList<String>());
        computePlusValue(valueHM, wordInFileHM, wordHM, totalWordHM, wordList);
        selectFeatureByIG(wordList, valueHM);
        selectFeatureByTFIDF(wordList, wordInFileHM, totalWordHM);
        long endTime = System.currentTimeMillis();
        System.out.println("The spending time is: "
                + String.valueOf(endTime - startTime) + " ms");
    }

    /**
     * Ranks the words by their corpus-wide score in {@code valueHM} and builds,
     * for every file, a table of the top {@link #Dimensionality} words.
     *
     * @param wordList candidate words; each must have a score under its own key
     * @param valueHM scores keyed by word and by {@code "<file>/<word>"}
     * @return {@code [file][rank] = {word, score as text}}; the score is the
     *         per-file value or {@link #MIN} when none is stored. When fewer than
     *         {@code Dimensionality} words exist the remaining rows stay
     *         {@code null}.
     */
    private static String[][][] selectFeatureByIG(List<String> wordList,
            Map<String, Double> valueHM) {
        String[] ranked = wordList.toArray(new String[wordList.size()]);
        px(ranked, valueHM);
        String[][][] featurePlus = new String[FILES.length][Dimensionality][2];
        // Never index past the words that actually exist.
        int kept = Math.min(Dimensionality, ranked.length);
        for (int m = 0; m < FILES.length; m++) {
            for (int n = 0; n < kept; n++) {
                Double perFile = valueHM.get(FILES[m] + "/" + ranked[n]);
                featurePlus[m][n][0] = ranked[n];
                featurePlus[m][n][1] = String.valueOf(
                        perFile == null ? MIN : perFile.doubleValue());
            }
        }
        return featurePlus;
    }

    /**
     * Weights every file/word pair as {@code count * log(totalWords / nj)},
     * where {@code nj} is the number of files containing the word, ranks the
     * words by the sum of their weights and builds, for every file, a table of
     * the top {@link #Dimensionality} words.
     *
     * <p>NOTE(review): the logarithm's numerator is the total word count
     * ({@code "totalWords/allFiles"}), not the number of files as in textbook
     * IDF. Kept as found - confirm whether that is intended.
     *
     * @param wordList candidate words
     * @param wordInFileHM per-file word counts keyed by {@code "<file>/<word>"}
     * @param totalWordHM word totals; only {@code "totalWords/allFiles"} is read
     * @return {@code [file][rank] = {word, weight as text}}; the weight is the
     *         file's own value, or {@code "0"} when the word does not occur in
     *         the file. When fewer than {@code Dimensionality} words exist the
     *         remaining rows stay {@code null}.
     */
    private static String[][][] selectFeatureByTFIDF(List<String> wordList,
            Map<String, Integer> wordInFileHM, Map<String, Integer> totalWordHM) {
        String[] ranked = wordList.toArray(new String[wordList.size()]);
        int fileLen = FILES.length;
        int wordLen = ranked.length;

        // nj[n]: number of files in which word n occurs (array still in input order).
        int[] nj = new int[wordLen];
        for (int n = 0; n < wordLen; n++) {
            for (int j = 0; j < fileLen; j++) {
                if (wordInFileHM.get(FILES[j] + "/" + ranked[n]) != null) {
                    nj[n]++;
                }
            }
        }

        Integer totalWords = totalWordHM.get("totalWords" + "/" + "allFiles");
        Map<String, Double> tfidfHM = new HashMap<String, Double>();
        for (int m = 0; m < fileLen; m++) {
            for (int n = 0; n < wordLen; n++) {
                Integer tf = wordInFileHM.get(FILES[m] + "/" + ranked[n]);
                double subValue = 0;
                // A word missing from the file (or a missing grand total) weighs 0.
                if (tf != null && totalWords != null) {
                    subValue = tf.intValue()
                            * Math.log(totalWords.doubleValue() / nj[n]);
                }
                addScore(tfidfHM, ranked[n], subValue);
                tfidfHM.put(FILES[m] + "/" + ranked[n], subValue);
            }
        }

        px(ranked, tfidfHM);

        String[][][] featureVSM = new String[fileLen][Dimensionality][2];
        int kept = Math.min(Dimensionality, wordLen);
        for (int m = 0; m < fileLen; m++) {
            for (int n = 0; n < kept; n++) {
                String key = FILES[m] + "/" + ranked[n];
                featureVSM[m][n][0] = ranked[n];
                if (wordInFileHM.get(key) == null) {
                    featureVSM[m][n][1] = String.valueOf(0);
                } else {
                    // Use this file's own weight; the summed value only drives the ranking.
                    featureVSM[m][n][1] = String.valueOf(tfidfHM.get(key));
                }
            }
        }
        return featureVSM;
    }

    /**
     * Sorts {@code waitSelect} in place by descending score.
     *
     * <p>Words with equal scores keep their relative input order. NaN scores
     * compare as larger than any number and therefore come first.
     *
     * @param waitSelect words to reorder
     * @param valueHM score of every word; a missing entry raises
     *        {@link NullPointerException} as soon as two words are compared
     */
    private static void px(String[] waitSelect, final Map<String, Double> valueHM) {
        Arrays.sort(waitSelect, new Comparator<String>() {
            @Override
            public int compare(String left, String right) {
                // Arguments swapped to obtain descending order.
                return Double.compare(valueHM.get(right), valueHM.get(left));
            }
        });
    }

    /**
     * Computes the information-gain style score of every file/word pair.
     *
     * <p>With wf = count of the word in the file, w = count of the word overall,
     * nwf = other words in the file and nf = other words overall, the score is
     * {@code wf*log(wf/w*F) + nwf*log(nwf/nf*F)}, F being the number of files.
     * A term whose leading factor is 0 contributes 0. Each score is stored under
     * {@code "<file>/<word>"} and added to the running sum stored under the word.
     *
     * @param valueHM receives the scores
     * @param wordInFileHM per-file word counts keyed by {@code "<file>/<word>"}
     * @param wordHM overall word counts keyed by {@code "total/<word>"}
     * @param totalWordHM word totals keyed by {@code "totalWords/<file>"} and
     *        {@code "totalWords/allFiles"}; a missing entry counts as 0, so files
     *        without any counted word no longer fail
     * @param wordList words to score
     */
    private static void computePlusValue(Map<String, Double> valueHM,
            Map<String, Integer> wordInFileHM, Map<String, Integer> wordHM,
            Map<String, Integer> totalWordHM, List<String> wordList) {
        int fileLen = FILES.length;
        double allWords = count(totalWordHM, "totalWords" + "/" + "allFiles");
        for (int i = 0; i < fileLen; i++) {
            double fileWords = count(totalWordHM, "totalWords" + "/" + FILES[i]);
            for (String word : wordList) {
                double wf = count(wordInFileHM, FILES[i] + "/" + word);
                double w = count(wordHM, "total" + "/" + word);
                double nwf = fileWords - wf;
                double nf = allWords - w;
                double subValue = xLogRatio(wf, w, fileLen)
                        + xLogRatio(nwf, nf, fileLen);
                addScore(valueHM, word, subValue);
                valueHM.put(FILES[i] + "/" + word, subValue);
            }
        }
    }

    /**
     * Reads every corpus file from {@code <working dir>/text/desFile/}, counts
     * the words that survive stop-word filtering and symbol stripping, and
     * returns the words whose overall count exceeds the threshold applied by
     * {@link #delSmallWord}.
     *
     * <p>Each whitespace-separated token is split on "/" and only the first part
     * is used (presumably "word/tag" segmenter output - confirm against the data).
     * The stop-word test is applied before leading symbols are stripped.
     *
     * <p>NOTE(review): files are decoded with the platform default charset while
     * {@link #isSymbol} inspects GBK bytes - confirm the corpus encoding.
     *
     * @param wordInFileHM receives per-file word counts
     * @param wordHM receives overall word counts
     * @param totalWordHM receives word totals per file and overall
     * @param wordList receives every distinct word in first-seen order
     * @return a new list holding the sufficiently frequent words of {@code wordList}
     * @throws IOException if the stop-word file or a corpus file cannot be read
     */
    private static List<String> statistic(Map<String, Integer> wordInFileHM,
            Map<String, Integer> wordHM, Map<String, Integer> totalWordHM,
            List<String> wordList) throws IOException {
        Map<String, Integer> stopWordHM = new HashMap<String, Integer>();
        getStopWordTable(stopWordHM);
        // Constant-time "already listed?" check instead of scanning wordList per token.
        Set<String> seen = new HashSet<String>(wordList);
        String path = new File("").getAbsolutePath() + "/text/desFile/";
        for (int i = 0; i < FILES.length; i++) {
            BufferedReader br = new BufferedReader(new InputStreamReader(
                    new FileInputStream(path + FILES[i])));
            try {
                String line;
                while ((line = br.readLine()) != null) {
                    if ("".equals(line.trim())) {
                        continue;
                    }
                    StringTokenizer tokenizer = new StringTokenizer(line);
                    while (tokenizer.hasMoreTokens()) {
                        String[] parts = tokenizer.nextToken().trim().split("/");
                        // A token made only of "/" splits into an empty array.
                        if (parts.length == 0 || stopWordHM.containsKey(parts[0])) {
                            continue;
                        }
                        // Normalise once so the maps and the list share the same key.
                        String word = delQJFH(parts[0]).trim();
                        if (word.length() == 0) {
                            continue;
                        }
                        addToDic(wordInFileHM, FILES[i], word);
                        addToDic(wordHM, "total", word);
                        addToDic(totalWordHM, "totalWords", FILES[i]);
                        addToDic(totalWordHM, "totalWords", "allFiles");
                        if (seen.add(word)) {
                            wordList.add(word);
                        }
                    }
                }
            } finally {
                br.close();
            }
        }
        return delSmallWord(wordList, wordHM);
    }

    /**
     * Keeps the words whose overall count is greater than 10.
     *
     * @param wordList candidate words
     * @param wordHM overall word counts keyed by {@code "total/<word>"}; a word
     *        without an entry is dropped
     * @return a new list with the retained words, in input order
     */
    private static List<String> delSmallWord(List<String> wordList,
            Map<String, Integer> wordHM) {
        int threshold = 10;
        List<String> list = new ArrayList<String>();
        for (String word : wordList) {
            Integer total = wordHM.get("total/" + word);
            if (total != null && total.intValue() > threshold) {
                list.add(word);
            }
        }
        return list;
    }

    /**
     * Removes leading characters for which {@link #isSymbol} is true.
     *
     * @param s text to strip; {@code null} yields an empty string
     * @return the remaining text, possibly empty
     * @throws UnsupportedEncodingException if the JVM has no GBK charset; this is
     *         reported instead of silently turning every word into ""
     */
    private static String delQJFH(String s) throws UnsupportedEncodingException {
        if (s == null) {
            return "";
        }
        while (isSymbol(s)) {
            s = s.substring(1);
        }
        return s;
    }

    /**
     * Tells whether the GBK encoding of the first character of {@code s} starts
     * with byte 0xA1 (-95 as a signed byte).
     *
     * @param s text to inspect
     * @return {@code false} for an empty string
     * @throws UnsupportedEncodingException if the JVM has no GBK charset
     */
    private static boolean isSymbol(String s)
            throws UnsupportedEncodingException {
        if (s.length() == 0) {
            return false;
        }
        // Only the first character decides, so only it needs encoding.
        byte[] b = s.substring(0, 1).getBytes("GBK");
        return b.length > 0 && b[0] == (byte) 0xA1;
    }

    /**
     * Increments the counter stored under {@code file + "/" + word}, starting at
     * 1 when the key is new.
     *
     * @param hm counter map to update
     * @param file first key component
     * @param word second key component
     */
    private static void addToDic(Map<String, Integer> hm, String file,
            String word) {
        String key = file + "/" + word;
        Integer current = hm.get(key);
        hm.put(key, current == null ? 1 : current.intValue() + 1);
    }

    /**
     * Loads {@link #StopWordFile}: every trimmed line becomes a key, mapped to
     * its zero-based line number.
     *
     * @param stopWordHM receives the stop words
     * @throws IOException if the file cannot be read
     */
    private static void getStopWordTable(Map<String, Integer> stopWordHM)
            throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(
                new FileInputStream(StopWordFile)));
        try {
            int i = 0;
            String line;
            while ((line = br.readLine()) != null) {
                stopWordHM.put(line.trim(), i++);
            }
        } finally {
            br.close();
        }
    }

    /** Returns the names "1.txt" .. "<count>.txt". */
    private static String[] numberedFiles(int count) {
        String[] names = new String[count];
        for (int i = 0; i < count; i++) {
            names[i] = (i + 1) + ".txt";
        }
        return names;
    }

    /** Returns the counter stored under {@code key}, or 0 when there is none. */
    private static int count(Map<String, Integer> counters, String key) {
        Integer value = counters.get(key);
        return value == null ? 0 : value.intValue();
    }

    /** Adds {@code delta} to the score under {@code key}, starting from {@code delta} when absent. */
    private static void addScore(Map<String, Double> scores, String key,
            double delta) {
        Double current = scores.get(key);
        scores.put(key, current == null ? delta : current.doubleValue() + delta);
    }

    /** Returns {@code x * log(x / denominator * scale)}, taking the term as 0 when x is 0. */
    private static double xLogRatio(double x, double denominator, int scale) {
        if (x == 0) {
            return 0;
        }
        return x * Math.log(x / denominator * scale);
    }
}
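/*
 * Leftover text, apparently from the web code viewer this file was copied from
 * (keyboard shortcut help):
 *   Copy code            Ctrl + C
 *   Search code          Ctrl + F
 *   Full-screen mode     F11
 *   Switch theme         Ctrl + Shift + D
 *   Show shortcuts       ?
 *   Increase font size   Ctrl + =
 *   Decrease font size   Ctrl + -
 */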