?? generateshortbnc.java
字號(hào):
package ijp.assignment1.utils;
import java.io.BufferedReader;
import java.io.EOFException;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;
/**
* Generates a serialised Map containing a shortened version of the British
* National Corpus containing only words which occur over 100 times in the corpus.
* It is not necessary to alter this file to complete AIPJ assignment 1.
*
* <p> This class may come in useful for MSc projects. </p>
*
* @author Judy Robertson
*/
public class GenerateShortBNC {

    /**
     * Frequency cut-off: only words whose corpus frequency is strictly
     * greater than this value are kept in the shortened map.
     */
    private static final int MIN_FREQUENCY = 100;

    /**
     * A map data structure for relating words with their frequency in the
     * BNC. Keys are the lower-cased words (String); each value is the
     * BNCWord object holding that word and its frequency.
     */
    public Map bnc;

    /**
     * Reads the BNC word frequencies file
     * <code>data/all_num_o5.txt</code> (resolved against the
     * <code>user.dir</code> system property) and fills {@link #bnc}.
     * The file is formatted as lines of:
     * 1: frequency
     * 2: word
     * 3: pos
     * 4: number of files the word occurs in
     * Only the word and frequency are required here.
     *
     * <p> I/O failures are reported with a stack trace and leave the map
     * holding whatever was read up to that point. The reader is always
     * closed, whether or not reading succeeded. </p>
     */
    private void readinBNC() {
        File source = new File(System.getProperty("user.dir") + File.separator
                + "data" + File.separator + "all_num_o5.txt");
        BufferedReader in = null;
        try {
            in = new BufferedReader(new FileReader(source));
            String line;
            // Process each line as it is read; there is no need to hold the
            // whole file in memory first.
            while ((line = in.readLine()) != null) {
                addLine(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Parses one line of the frequency file and, if the word is frequent
     * enough, stores it in {@link #bnc}.
     *
     * <p> Lines with fewer than two space-separated fields (for example
     * blank lines) and lines whose first field is not an integer are
     * skipped, so that values from a previous line can never be reused for
     * a malformed one. If the same word qualifies on more than one line,
     * the later entry replaces the earlier one. </p>
     *
     * @param line a single line of text from the frequency file
     */
    private void addLine(String line) {
        StringTokenizer tokens = new StringTokenizer(line, " ");
        if (tokens.countTokens() < 2) {
            return;
        }
        String frequencyField = tokens.nextToken();
        // Use a fixed locale so the map keys do not depend on the default
        // locale of the machine that generates the file.
        String word = tokens.nextToken().toLowerCase(java.util.Locale.ENGLISH);

        int frequency;
        try {
            frequency = Integer.parseInt(frequencyField);
        } catch (NumberFormatException e) {
            System.err.println(
                    "Skipping BNC line with non-numeric frequency: " + line);
            return;
        }

        if (frequency > MIN_FREQUENCY) {
            BNCWord entry = new BNCWord();
            entry.setWord(word);
            entry.setFrequency(frequency);
            bnc.put(word, entry);
        }
    }

    /**
     * This writes out the BNC words as serialised objects to
     * <code>data/bncobjects.dat</code> (resolved against the
     * <code>user.dir</code> system property). I/O failures are reported
     * with a stack trace; the output file is always closed.
     */
    public void writeBNC() {
        File target = new File(System.getProperty("user.dir") + File.separator
                + "data" + File.separator + "bncobjects.dat");
        FileOutputStream fileOut = null;
        try {
            fileOut = new FileOutputStream(target);
            ObjectOutput objectOut = new ObjectOutputStream(fileOut);
            objectOut.writeObject(bnc);
            // Push everything buffered by the object stream down to the
            // file stream before the file stream is closed below.
            objectOut.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Closing the file stream releases the file handle even when
            // constructing or writing to the object stream failed.
            if (fileOut != null) {
                try {
                    fileOut.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Reads in the text file containing the BNC information, filters out
     * infrequent words and then writes out a map containing BNC words.
     *
     * @param args No arguments needed
     */
    public static void main(String[] args) {
        GenerateShortBNC g = new GenerateShortBNC();
        g.readinBNC();
        g.writeBNC();
    }

    /**
     * Constructs a new GenerateShortBNC object and initialises the data
     * structure
     */
    public GenerateShortBNC() {
        bnc = new HashMap();
    }
}
?? 快捷鍵說明
復(fù)制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號(hào)
Ctrl + =
減小字號(hào)
Ctrl + -