// addcollocation.java.bak
// (Code-viewer residue, not part of the program: "Font size:" control label.)
import java.util.*;
import java.io.*;
/**
 * Marks collocations in the original corpus.
 *
 * <p>{@link #main} reads the corpus file {@code #.txt} line by line, looks every entry of
 * {@code collocation_seg.txt} up in each line, inserts {@code #^} markers in front of both
 * phrases of a matching entry, and writes the annotated line to {@code #_$.txt}.
 * {@link #addFactor} then appends degree letters taken from {@code factor_seg.txt} to a marker.
 *
 * <p>A collocation table line is read as whitespace-separated tokens. The first phrase is made
 * of the tokens up to the first token that starts with {@code '?'}. The second phrase is parsed
 * from the text that starts five characters after the first {@code "?"} in the line and ends at
 * the first token starting with {@code +}, {@code -}, {@code *} or {@code ~}; that character
 * becomes the marker letter.
 *
 * <p>NOTE(review): the {@code '?'} / {@code "?"} literals may be placeholders for a character
 * that was lost when this file was transcoded - confirm against the real data files.
 */
public class AddCollocation
{
    /** Clause-ending punctuation that bounds the search for the second phrase. */
    private static final String[] CLAUSE_ENDS = {",", "。", "?", "!", ";"};

    /** {@link #matchBefore} result: no factor phrase ends at the inspected position. */
    private static final int NO_MATCH = 0;
    /** {@link #matchBefore} result: the factor phrase ends exactly at the inspected position. */
    private static final int DIRECT_MATCH = 1;
    /** {@link #matchBefore} result: the factor phrase is followed by a structural particle. */
    private static final int PARTICLE_MATCH = 2;

    /** One parsed line of the factor table: the phrase and its degree letter. */
    private static final class Factor
    {
        final String phrase;
        final char level;

        Factor(String phrase, char level)
        {
            this.phrase = phrase;
            this.level = level;
        }
    }

    /**
     * Annotates every line of {@code #.txt} with the collocations listed in
     * {@code collocation_seg.txt} and writes the result to {@code #_$.txt}.
     *
     * @param args unused
     * @throws IOException if an input file cannot be opened or the output cannot be written
     */
    public static void main(String[] args) throws IOException
    {
        File fin = new File("#.txt");
        File fin2 = new File("collocation_seg.txt");
        FileWriter fout = new FileWriter("#_$.txt");
        try
        {
            Scanner scan = new Scanner(fin);
            try
            {
                // Loaded on the first corpus line so that, as before, an empty corpus never
                // touches the collocation table; afterwards the table is reused instead of
                // being re-read for every corpus line.
                List<String> collocations = null;
                while (scan.hasNextLine())
                {
                    // One corpus line; markers are inserted into this buffer in place.
                    StringBuffer str1 = new StringBuffer(scan.nextLine());
                    if (collocations == null)
                    {
                        collocations = readLines(fin2);
                    }
                    for (String entry : collocations)
                    {
                        markCollocation(str1, entry);
                    }
                    fout.write(str1.toString() + "\n");
                }
            }
            finally
            {
                scan.close();
            }
        }
        finally
        {
            // Closed on every path so the output is flushed even when processing fails.
            fout.close();
        }
    }

    /**
     * Appends degree letters from {@code factor_seg.txt} to the first {@code #^} marker of
     * {@code str1}.
     *
     * <p>Each factor line is a phrase followed by a token starting with one of
     * {@code a b c d n 0 A}; any token starting with one of those characters ends the phrase.
     * Phrases whose first character is not {@code 'A'} are looked for immediately before
     * {@code index}. Every time one is found its degree letter is inserted and the search is
     * repeated further to the left, until nothing matches. Phrases starting with {@code 'A'}
     * are looked for (without their first five characters) after position
     * {@code index + 2 + slen}, on the first pass only; the first hit inserts its degree letter
     * and ends that pass.
     *
     * <p>Blank factor lines, and {@code 'A'} phrases shorter than five characters, are skipped.
     *
     * <p>NOTE(review): the letter is always inserted at {@code indexOf("#^") + 3}, i.e. after
     * the first marker in the whole line, not necessarily the marker of the phrase being
     * examined - confirm this is intended for lines containing several collocations.
     *
     * @param str1  line buffer, modified in place
     * @param index right edge of the text inspected for a preceding factor ({@code main}
     *              passes the index of the last space before the phrase)
     * @param slen  length of the phrase, used to locate the text following it
     * @throws IOException if {@code factor_seg.txt} cannot be opened
     */
    public static void addFactor(StringBuffer str1,int index,int slen)throws IOException
    {
        int idx = str1.indexOf("#^");
        // Read once per call and closed immediately; previously a new Scanner was opened for
        // every pass and never closed.
        List<String> factorLines = readLines(new File("factor_seg.txt"));

        int directHits;
        int particleHits;
        int consumed = 0;      // characters to the left of index already claimed by factors
        int longest;           // length of the longest phrase matched in the current pass
        char level = 'E';      // degree letter of that phrase; kept across passes
        boolean firstPass = true;
        do
        {
            directHits = 0;
            particleHits = 0;
            longest = 0;
            for (String line : factorLines)
            {
                Factor factor = parseFactor(line);
                if (factor == null)
                {
                    continue;
                }
                if (factor.phrase.charAt(0) != 'A')
                {
                    // Is the text immediately before the word an influence factor?
                    int kind = matchBefore(str1, index - consumed, factor.phrase);
                    if (kind == DIRECT_MATCH)
                    {
                        directHits++;
                    }
                    else if (kind == PARTICLE_MATCH)
                    {
                        particleHits++;
                    }
                    if (kind != NO_MATCH && factor.phrase.length() > longest)
                    {
                        longest = factor.phrase.length();
                        level = factor.level;
                    }
                }
                else if (firstPass && factor.phrase.length() >= 5)
                {
                    // Is the text immediately after the word an influence factor?
                    if (matchesAfter(str1, index + 2 + slen, factor.phrase.substring(5)))
                    {
                        str1.insert(idx + 3, factor.level);
                        break;
                    }
                }
            }
            firstPass = false;
            if (directHits > 0)
            {
                str1.insert(idx + 3, level);
                consumed += 2 + longest;
            }
            else if (particleHits > 0)
            {
                str1.insert(idx + 3, level);
                consumed += 7 + longest;
            }
        }
        while (directHits > 0 || particleHits > 0);
    }

    /** Inserts {@code #^} markers into {@code line} for every occurrence of one table entry. */
    private static void markCollocation(StringBuffer line, String entry) throws IOException
    {
        // Marker letter (+, -, *, ~). It keeps the last first-phrase token's initial when the
        // second half of the entry consists of a single token.
        char letter = 'E';

        String first;
        Scanner head = new Scanner(entry).useDelimiter("\\s+");
        try
        {
            if (!head.hasNext())
            {
                return; // blank table line
            }
            first = head.next();
            while (head.hasNext())
            {
                String token = head.next();
                letter = token.charAt(0);
                if (letter == '?')
                {
                    break;
                }
                first = first + " " + token;
            }
        }
        finally
        {
            head.close();
        }

        int idx = line.indexOf(first);
        if (idx == -1)
        {
            return;
        }

        // The second phrase depends only on the table entry, so it is parsed once per entry
        // rather than once per occurrence.
        int separator = entry.indexOf("?");
        if (separator + 5 > entry.length())
        {
            return; // entry has no second phrase
        }
        String second;
        Scanner tail = new Scanner(entry.substring(separator + 5)).useDelimiter("\\s+");
        try
        {
            if (!tail.hasNext())
            {
                return; // entry has no second phrase
            }
            second = tail.next();
            while (tail.hasNext())
            {
                String token = tail.next();
                letter = token.charAt(0);
                if (letter == '+' || letter == '-' || letter == '*' || letter == '~')
                {
                    break;
                }
                second = second + " " + token;
            }
        }
        finally
        {
            tail.close();
        }

        while (idx != -1)
        {
            // The second phrase must occur in the same clause, at or after the first phrase.
            String clause = line.substring(idx, clauseEnd(line, idx));
            int offset = clause.indexOf(second);
            if (offset != -1)
            {
                // Drop the '#' of a marker that already sits directly in front of either phrase.
                if (removeHashBefore(line, idx))
                {
                    idx--;
                }
                if (removeHashBefore(line, idx + offset))
                {
                    offset--;
                }
                line.insert(idx + offset, "#^" + letter);
                int space = line.substring(0, idx).lastIndexOf(" ");
                addFactor(line, space, first.length());
                space = line.substring(0, idx + offset).lastIndexOf(" ");
                addFactor(line, space, second.length());
                line.insert(idx, "#^" + letter);
                System.out.println(second);
            }
            idx = line.indexOf(first, idx + 5);
        }
    }

    /**
     * Returns the index of the nearest clause-ending punctuation at or after {@code from}, or
     * the length of {@code line} when there is none. A hit at index 0 is also mapped to the
     * line length, as in the original implementation.
     */
    private static int clauseEnd(StringBuffer line, int from)
    {
        int end = -1;
        for (String mark : CLAUSE_ENDS)
        {
            int pos = line.indexOf(mark, from);
            if (pos != -1 && (end == -1 || pos < end))
            {
                end = pos;
            }
        }
        return end <= 0 ? line.length() : end;
    }

    /**
     * Walks left from {@code pos - 1} over marker characters and deletes the {@code '#'} found
     * right before them, if any. Stops at the start of the buffer instead of reading index -1.
     *
     * @return {@code true} if a character was deleted
     */
    private static boolean removeHashBefore(StringBuffer line, int pos)
    {
        int probe = pos - 1;
        while (probe >= 0 && isMarkerChar(line.charAt(probe)))
        {
            probe--;
        }
        if (probe >= 0 && line.charAt(probe) == '#')
        {
            line.deleteCharAt(probe);
            return true;
        }
        return false;
    }

    /** Tells whether {@code ch} is one of the polarity or degree letters that follow a '#'. */
    private static boolean isMarkerChar(char ch)
    {
        return ch == '+' || ch == '-' || ch == '*' || ch == '~' || ch == 'a' || ch == 'b'
                || ch == 'c' || ch == 'd' || ch == 'n' || ch == '0';
    }

    /** Parses one factor table line; returns {@code null} for a line without tokens. */
    private static Factor parseFactor(String line)
    {
        Scanner tokens = new Scanner(line).useDelimiter("\\s+");
        try
        {
            if (!tokens.hasNext())
            {
                return null;
            }
            String phrase = tokens.next();
            char level = 'E'; // degree of the influence factor: a, b, c, d, n or 0
            while (tokens.hasNext())
            {
                String token = tokens.next();
                level = token.charAt(0);
                if (level != 'a' && level != 'b' && level != 'c' && level != 'd'
                        && level != 'n' && level != '0' && level != 'A')
                {
                    phrase = phrase + " " + token;
                }
                else
                {
                    if (level == 'A')
                    {
                        level = token.charAt(1);
                    }
                    break;
                }
            }
            return new Factor(phrase, level);
        }
        finally
        {
            tokens.close();
        }
    }

    /**
     * Checks whether {@code phrase} ends at {@code end}, either directly or followed by one of
     * the particles 的/u, 地/u, 得/u.
     *
     * <p>NOTE(review): the particle window is {@code phrase.length() + 5} characters long while
     * the candidates are {@code phrase.length() + 4} long, so the particle case cannot match as
     * written. The separator inside the literals probably lost a space - confirm against the
     * corpus format before changing either side.
     */
    private static int matchBefore(StringBuffer line, int end, String phrase)
    {
        if (safeSubstring(line, end - phrase.length(), end).equals(phrase))
        {
            return DIRECT_MATCH;
        }
        String window = safeSubstring(line, end - 5 - phrase.length(), end);
        if (window.equals(phrase + " 的/u") || window.equals(phrase + " 地/u")
                || window.equals(phrase + " 得/u"))
        {
            return PARTICLE_MATCH;
        }
        return NO_MATCH;
    }

    /**
     * Checks whether {@code phrase} starts at {@code start}, either directly or preceded by one
     * of the particles 的/u, 地/u, 得/u.
     *
     * <p>NOTE(review): same length mismatch as in {@link #matchBefore} - the particle window is
     * one character longer than the candidates, so that case cannot match as written.
     */
    private static boolean matchesAfter(StringBuffer line, int start, String phrase)
    {
        if (safeSubstring(line, start, start + phrase.length()).equals(phrase))
        {
            return true;
        }
        String window = safeSubstring(line, start, start + 5 + phrase.length());
        return window.equals("的/u " + phrase) || window.equals("地/u " + phrase)
                || window.equals("得/u " + phrase);
    }

    /** Like {@code substring}, but yields {@code ""} when the range is outside the buffer. */
    private static String safeSubstring(StringBuffer line, int start, int end)
    {
        if (start < 0 || end > line.length() || start > end)
        {
            return "";
        }
        return line.substring(start, end);
    }

    /** Reads all lines of {@code file} and closes it again. */
    private static List<String> readLines(File file) throws FileNotFoundException
    {
        List<String> lines = new ArrayList<String>();
        Scanner in = new Scanner(file);
        try
        {
            while (in.hasNextLine())
            {
                lines.add(in.nextLine());
            }
        }
        finally
        {
            in.close();
        }
        return lines;
    }
}
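/*
 * Code-viewer residue (keyboard-shortcut legend), not part of the program:
 *   Copy code:           Ctrl + C
 *   Search code:         Ctrl + F
 *   Full-screen mode:    F11
 *   Toggle theme:        Ctrl + Shift + D
 *   Show shortcuts:      ?
 *   Increase font size:  Ctrl + =
 *   Decrease font size:  Ctrl + -
 */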