// textfreq.java
// (Residue of the code-viewer page this file was copied from: "Font size:" control.)
package textfreqnew;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
//import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
//import java.io.PrintWriter;
import java.util.*;
//import java.lang.*;
import java.lang.Math;
//import textfreq.CountedSet;
//import java.util.StringTokenizer;
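/*
 * File overview: uses the word base to compute the term frequency of every word in each
 * text, then the document frequency of every word in the word base, then derives the
 * weight of each word in each document from its term frequency and document frequency,
 * and stores the weights in the specified text file.
 * (Only the term-frequency step is currently active in the code.)
 */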
/**
 * Mutable occurrence counter used as a map value.
 *
 * <p>A new counter starts at one, i.e. it already accounts for the occurrence
 * that caused it to be created. Callers bump the count by incrementing
 * {@link #i} directly.
 */
class Counter {
    /** Current count; package-visible so callers can increment it in place. */
    int i;

    /** Creates a counter that has already seen one occurrence. */
    Counter() {
        this.i = 1;
    }

    /** @return the current count rendered as a decimal string */
    @Override
    public String toString() {
        return String.valueOf(this.i);
    }
}
/**
 * Computes per-document term frequencies for a directory tree of text files.
 *
 * <p>Each regular file found (recursively) under the input directory is read line by
 * line. Every distinct line is treated as one term, and the number of times it occurs
 * in that file is written as {@code term:count} records (terminated by {@code \r\n}) to
 * a file with the same name in the output directory. Input and output both use the
 * platform default charset.
 *
 * <p>NOTE(review): output files are named after the input file's simple name only, so two
 * input files with the same name in different sub-directories overwrite each other.
 *
 * <p>An earlier draft (previously kept here as commented-out code) also loaded a word base
 * into {@link #hm2}, accumulated document frequencies in {@link #hm3} and computed weights
 * {@code W = tf * ln(N / df)}. That stage was never completed (its author noted that the
 * weight file was being overwritten for every document). The fields {@link #hm1},
 * {@link #hm2}, {@link #hm3}, {@link #N}, {@link #d1}, {@link #d2} and {@link #W} are
 * retained only so that existing references keep compiling; this class no longer touches them.
 */
public class TextFreq{
    /** Legacy output location, used when no output directory is supplied. */
    private static final String DEFAULT_OUTPUT_PREFIX = "E:\\experiment\\textfreq\\";

    /** Files collected by {@link #read(File[])}, in discovery order. */
    private List<File> reads = new ArrayList<File>();

    /** Legacy: term to term-frequency map. Unused; counting now uses a per-file local map. */
    public static Map hm1 = new HashMap();
    /** Legacy: word base (vocabulary). Unused by the active code. */
    public static Map hm2 = new HashMap();
    /** Legacy: term to document-frequency map. Unused by the active code. */
    public static Map hm3 = new HashMap();
    /** Legacy: numerator of the unfinished IDF formula; presumably the corpus size - TODO confirm. */
    public static int N = 1560;
    /** Legacy: term frequency operand of the unfinished weighting stage. */
    public Double d1;
    /** Legacy: document frequency operand of the unfinished weighting stage. */
    public Double d2;
    /** Legacy: result of the unfinished weighting stage. */
    public Double W;

    /** Output directory, or {@code null} to use the legacy hard-coded location. */
    private final File outputDir;

    /** Creates an instance that writes to the legacy hard-coded output location. */
    public TextFreq() {
        this(null);
    }

    /**
     * Creates an instance that writes its result files into {@code outputDir}.
     *
     * @param outputDir existing directory to receive the per-document count files,
     *                  or {@code null} for the legacy hard-coded location
     */
    public TextFreq(File outputDir) {
        this.outputDir = outputDir;
    }

    /**
     * Collects every file under the hard-coded training directory and writes the term
     * frequencies of each one.
     *
     * @param args ignored
     * @throws FileNotFoundException declared for backward compatibility; I/O failures are
     *                               reported per file and do not propagate
     */
    public static void main(String[] args) throws FileNotFoundException {
        String filedest = "E:\\experiment\\trainstopwordnew1";
        File inputDir = new File(filedest);
        File[] entries = inputDir.listFiles();
        if (entries == null) {
            // listFiles() yields null when the path is not a directory or cannot be read.
            System.err.println("Input directory not found or not readable: " + filedest);
            return;
        }
        TextFreq f = new TextFreq();
        // First pass: collect the files; second pass: read their contents.
        f.read(entries);
        if (f.getReads() != null) {
            f.read(f.getReads());
        }
    }

    /**
     * Recursively collects the regular files among {@code files} into {@link #getReads()}.
     * File contents are not read here.
     *
     * @param files directory entries to scan; {@code null} (what {@link File#listFiles()}
     *              returns for a missing or unreadable directory) is treated as empty
     */
    public void read(File[] files) {
        if (files == null) {
            return;
        }
        for (File entry : files) {
            if (entry.isDirectory()) {
                this.read(entry.listFiles());
            } else {
                this.reads.add(entry);
            }
        }
    }

    /**
     * Counts the lines of every file in {@code files} and writes one {@code term:count}
     * file per input file. A failure on one file is reported on standard output and does
     * not stop the remaining files from being processed.
     *
     * @param files files to process; {@code null} is treated as empty
     * @throws FileNotFoundException declared for backward compatibility; I/O failures are
     *                               reported per file and do not propagate
     */
    public void read(List<File> files) throws FileNotFoundException {
        if (files == null) {
            return;
        }
        for (File file : files) {
            System.out.println("讀取文件"+file.getName()+"內容");
            try {
                // Count first, open the output afterwards: an unreadable input must not
                // truncate a previously written result file.
                Map<String, Integer> counts = countLines(file);
                writeCounts(counts, outputFileFor(file));
            } catch (IOException e) {
                System.out.println("Failed to compute term frequencies for " + file.getPath());
                e.printStackTrace();
            }
        }
    }

    /** Returns how often each distinct line occurs in {@code file}; always closes the reader. */
    private static Map<String, Integer> countLines(File file) throws IOException {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        BufferedReader reader =
                new BufferedReader(new InputStreamReader(new FileInputStream(file)));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                Integer seen = counts.get(line);
                counts.put(line, seen == null ? 1 : seen + 1);
            }
        } finally {
            reader.close();
        }
        return counts;
    }

    /** Writes {@code counts} to {@code target} as {@code term:count\r\n} records; always closes the writer. */
    private static void writeCounts(Map<String, Integer> counts, File target) throws IOException {
        PrintWriter out = new PrintWriter(target);
        try {
            for (Map.Entry<String, Integer> entry : counts.entrySet()) {
                out.write(entry.getKey() + ":" + entry.getValue() + "\r\n");
            }
            // PrintWriter swallows I/O errors; surface them so the caller can report the file.
            if (out.checkError()) {
                throw new IOException("Could not write " + target.getPath());
            }
        } finally {
            out.close();
        }
    }

    /** Maps an input file to its result file (same simple name, in the output location). */
    private File outputFileFor(File input) {
        if (outputDir == null) {
            return new File(DEFAULT_OUTPUT_PREFIX + input.getName());
        }
        return new File(outputDir, input.getName());
    }

    /** @return the files collected so far by {@link #read(File[])} */
    public List<File> getReads() {
        return reads;
    }

    /** @param reads replacement for the list of collected files */
    public void setReads(List<File> reads) {
        this.reads = reads;
    }
}
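/*
 * Residue of the code-viewer page this file was copied from (not part of the program).
 * Keyboard shortcuts:
 *   Copy code            Ctrl + C
 *   Search code          Ctrl + F
 *   Full-screen mode     F11
 *   Toggle theme         Ctrl + Shift + D
 *   Show shortcuts       ?
 *   Increase font size   Ctrl + =
 *   Decrease font size   Ctrl + -
 */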