// Utility.java
package org.ictclas4j.utility;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import org.ictclas4j.bean.Dictionary;
import org.ictclas4j.bean.PersonName;
import org.ictclas4j.segment.PosTagger;
public class Utility {
// Number of Chinese characters, including 5 empty positions between 3756-3761.
public static final int CC_NUM = 6768;
public static final int WORD_MAXLENGTH = 100;
// WT_* codes. NOTE(review): presumably "word type" categories; no use is
// visible in this part of the file -- confirm against callers.
public static final int WT_DELIMITER = 0;
public static final int WT_CHINESE = 1;
public static final int WT_OTHER = 2;
// CT_* codes: character classes. The values from CT_SINGLE to CT_OTHER are
// the possible results of charType(String).
public static final int CT_SENTENCE_BEGIN = 1;// Sentence begin
public static final int CT_SENTENCE_END = 4;// Sentence ending
public static final int CT_SINGLE = 5;// Single byte (first byte below 128, not a listed delimiter)
public static final int CT_DELIMITER = CT_SINGLE + 1;// Delimiter (listed ASCII punctuation, or lead byte 161/163)
public static final int CT_CHINESE = CT_SINGLE + 2;// Chinese char (lead byte 176-247)
public static final int CT_LETTER = CT_SINGLE + 3;// Letter (lead byte 163, second byte 193-218 or 225-250)
public static final int CT_NUM = CT_SINGLE + 4;// Number (lead byte 163, second byte 176-185)
public static final int CT_INDEX = CT_SINGLE + 5;// Index symbol (lead byte 162)
public static final int CT_OTHER = CT_SINGLE + 12;// Other (anything not matched above)
public static final int MAX_WORDS = 650;
public static final int MAX_SEGMENT_NUM = 10;
public static final String POSTFIX_SINGLE = "壩邦堡杯城池村單島道堤店洞渡隊法峰府岡港閣宮溝國海號河湖環集江獎礁角街井郡坑口礦里嶺樓路門盟廟弄牌派坡鋪旗橋區渠泉人山省市水寺塔臺灘壇堂廳亭屯灣文屋溪峽縣線鄉巷型洋窯營嶼語園苑院閘寨站鎮州莊族陂庵町";
public static final String[] POSTFIX_MUTIPLE = { "半島", "草原", "城市", "大堤", "大公國", "大橋", "地區", "帝國", "渡槽", "港口",
"高速公路", "高原", "公路", "公園", "共和國", "谷地", "廣場", "國道", "海峽", "胡同", "機場", "集鎮", "教區", "街道", "口岸", "碼頭", "煤礦",
"牧場", "農場", "盆地", "平原", "丘陵", "群島", "沙漠", "沙洲", "山脈", "山丘", "水庫", "隧道", "特區", "鐵路", "新村", "雪峰", "鹽場", "鹽湖",
"漁場", "直轄市", "自治區", "自治縣", "自治州", "" };
public static final String TRANS_ENGLISH = "·—阿埃艾愛安昂敖奧澳笆芭巴白拜班邦保堡鮑北貝本比畢彼別波玻博勃伯泊卜布才采倉查差柴徹川茨慈次達大戴代丹旦但當道德得的登迪狄蒂帝丁東杜敦多額俄厄鄂恩爾伐法范菲芬費佛夫福弗甫噶蓋干岡哥戈革葛格各根古瓜哈海罕翰汗漢豪合河赫亨侯呼胡華霍基吉及加賈堅簡杰金京久居君喀卡凱坎康考柯科可克肯庫奎拉喇萊來蘭郎朗勞勒雷累楞黎理李里莉麗歷利立力連廉良列烈林隆盧虜魯路倫侖羅洛瑪馬買麥邁曼茅茂梅門蒙盟米蜜密敏明摩莫墨默姆木穆那娜納乃奈南內尼年涅寧紐努諾歐帕潘畔龐培佩彭皮平潑普其契恰強喬切欽沁泉讓熱榮肉儒瑞若薩塞賽桑瑟森莎沙山善紹舍圣施詩石什史士守斯司絲蘇素索塔泰坦湯唐陶特提汀圖土吐托陀瓦萬王旺威韋維魏溫文翁沃烏吾武伍西錫希喜夏相香歇謝辛新牙雅亞彥堯葉依伊衣宜義因音英雍尤于約宰澤增詹珍治中仲朱諸卓孜祖佐伽婭尕腓滕濟嘉津賴蓮琳律略慕妮聶裴浦奇齊琴茹珊衛欣遜札哲智茲芙汶迦珀琪梵斐胥黛";
public static final String TRANS_RUSSIAN = "·阿安奧巴比彼波布察茨大德得丁杜爾法夫伏甫蓋格哈基加堅捷金卡科可克庫拉萊蘭勒雷里歷利連列盧魯羅洛馬梅蒙米姆娜涅寧諾帕潑普奇齊喬切日薩色山申什斯索塔坦特托娃維文烏西希謝亞耶葉依伊以扎佐柴達登蒂戈果海赫華霍吉季津柯理琳瑪曼穆納尼契欽丘桑沙舍泰圖瓦萬雅卓茲";
public static final String TRANS_JAPANESE = "安奧八白百邦保北倍本比濱博步部彩菜倉昌長朝池赤川船淳次村大代島稻道德地典渡爾繁飯風福岡高工宮古谷關廣桂貴好浩和合河黑橫恒宏后戶荒繪吉紀佳加見健江介金今進井靜敬靖久酒菊俊康可克口梨理里禮栗麗利立涼良林玲鈴柳隆鹿麻瑪美萌彌敏木納南男內鳥寧朋片平崎齊千前淺橋琴青清慶秋丘曲泉仁忍日榮若三森紗杉山善上伸神圣石實矢世市室水順司松泰桃藤天田土萬望尾未文武五舞西細夏憲相小孝新星行雄秀雅亞巖楊洋陽遙野也葉一伊衣逸義益櫻永由有佑宇羽郁淵元垣原遠月悅早造則澤增扎宅章昭沼真政枝知之植智治中忠仲竹助椎子佐阪坂堀荻菅薰浜瀨鳩筱";
// Translation type
public static final int TT_ENGLISH = 0;
public static final int TT_RUSSIAN = 1;
public static final int TT_JAPANESE = 2;
// Separator type
public static final String SEPERATOR_C_SENTENCE = "。!?:;…";
public static final String SEPERATOR_C_SUB_SENTENCE = "、,()“”‘’";
public static final String SEPERATOR_E_SENTENCE = "!?:;";
public static final String SEPERATOR_E_SUB_SENTENCE = ",()\"'";
public static final String SEPERATOR_LINK = "\n\r ";
// Sentence begin and ending string
public static final String SENTENCE_BEGIN = "始##始";
public static final String SENTENCE_END = "末##末";
// Separator between two words
public static final String WORD_SEGMENTER = "@";
public static final int MAX_WORDS_PER_SENTENCE = 120;
public static final int MAX_UNKNOWN_PER_SENTENCE = 200;
public static final int MAX_POS_PER_WORD = 20;
public static final int LITTLE_FREQUENCY = 6;
// Tag categories: normal, person, place and transliterated person.
// NOTE(review): presumably selects which tagging model is applied -- the
// consumers are not visible in this part of the file; confirm against callers.
public enum TAG_TYPE {
TT_NORMAL, TT_PERSON, TT_PLACE, TT_TRANS_PERSON
};
public static final int MAX_FREQUENCE = 2079997;// 7528283+329805
// 2079997 = 1993123 + 86874
public static final int MAX_SENTENCE_LEN = 2000;
public static final double INFINITE_VALUE = 10000.00;
// Smoothing parameter
public static final double SMOOTH_PARAM = 0.1;
public static final String UNKNOWN_PERSON = "未##人";
public static final String UNKNOWN_SPACE = "未##地";
public static final String UNKNOWN_NUM = "未##數";
public static final String UNKNOWN_TIME = "未##時";
public static final String UNKNOWN_LETTER = "未##串";
/**
 * Writes one line for every pair (i, j) with i and j each running from 161
 * to 254 inclusive. Each line is the decimal value of i immediately followed
 * by the decimal value of j, then a comma, i, a comma and j, for example
 * {@code 161161,161,161}.
 *
 * NOTE(review): the first column is the two decimal numbers concatenated,
 * not the double-byte character they would encode -- confirm this is the
 * intended file format.
 *
 * @param fileName path of the file to create or overwrite
 * @return {@code true} if the file was opened and every line was written;
 *         {@code false} if it could not be opened, is not writable, or a
 *         write error occurred
 */
public static boolean gbGenerate(String fileName) {
    File file = new File(fileName);
    PrintWriter out;
    try {
        out = new PrintWriter(new FileOutputStream(file));
    } catch (FileNotFoundException e) {
        // The file could not be created or opened: report failure to the caller
        // instead of claiming success.
        e.printStackTrace();
        return false;
    }
    try {
        if (!file.canWrite()) {
            return false;// fail while opening the file
        }
        for (int i = 161; i < 255; i++) {
            for (int j = 161; j < 255; j++) {
                out.println("" + i + j + "," + i + "," + j);
            }
        }
        // PrintWriter never throws on write; checkError() flushes and reports
        // whether any write failed.
        return !out.checkError();
    } finally {
        // Always release the file handle, including on the early return above.
        out.close();
    }
}
/**
 * Generates the Chinese character list file: one line for every pair (i, j)
 * with i running from 176 to 254 and j from 161 to 254 inclusive. Each line
 * is the decimal value of i immediately followed by the decimal value of j,
 * then a comma, i, a comma and j, for example {@code 176161,176,161}.
 *
 * NOTE(review): the first column is the two decimal numbers concatenated,
 * not the double-byte character they would encode -- confirm this is the
 * intended file format.
 *
 * Author: Kevin Zhang. History: 1. created 2002-1-8.
 *
 * @param fileName path of the file to create or overwrite
 * @return {@code true} if the file was opened and every line was written;
 *         {@code false} if it could not be opened or a write error occurred
 */
public static boolean CC_Generate(String fileName) {
    File file = new File(fileName);
    PrintWriter out;
    try {
        out = new PrintWriter(new FileOutputStream(file));
    } catch (FileNotFoundException e) {
        // The file could not be created or opened: report failure to the caller
        // instead of claiming success.
        e.printStackTrace();
        return false;
    }
    try {
        for (int i = 176; i < 255; i++) {
            for (int j = 161; j < 255; j++) {
                out.println("" + i + j + "," + i + "," + j);
            }
        }
        // PrintWriter never throws on write; checkError() flushes and reports
        // whether any write failed.
        return !out.checkError();
    } finally {
        out.close();
    }
}
/**
 * Tests whether {@code strCharSet} occurs in {@code string} starting at an
 * even byte offset.
 *
 * The offset of the match is obtained from {@code strstr}; a match at an odd
 * offset is rejected. NOTE(review): presumably the even-offset rule keeps a
 * match from straddling two double-byte characters, which only holds if
 * {@code string} consists entirely of two-byte characters -- confirm.
 * {@code strstr} is assumed to return the byte offset of the match, or -1
 * when there is none.
 *
 * Earlier behaviour returned {@code true} when the pattern was absent or
 * when either argument was {@code null}; both cases now return {@code false}.
 *
 * Author: Kevin Zhang. History: 1. created 2002-1-8.
 *
 * @param string     bytes to search in
 * @param strCharSet bytes to search for
 * @return {@code true} only if a match is found at an even offset
 */
public static boolean CC_Find(final byte[] string, final byte[] strCharSet) {
    if (string == null || strCharSet == null) {
        return false;
    }
    int index = strstr(string, strCharSet);
    // Not found (negative) or found at an odd offset both count as "no match".
    return index >= 0 && index % 2 == 0;
}
/**
 * Classifies the first character of {@code str} into one of the CT_* codes.
 *
 * The string is converted with {@link String#getBytes()} (platform default
 * charset) and only its first two bytes are inspected. NOTE(review): the
 * byte ranges look like GB2312/GBK rows -- confirm the expected charset.
 *
 * Author: Kevin Zhang. History: 1. created 2002-1-8.
 *
 * @param str the text whose leading character is classified
 * @return CT_DELIMITER, CT_SINGLE, CT_INDEX, CT_NUM, CT_LETTER or
 *         CT_CHINESE; CT_OTHER for null, empty or unmatched input
 */
public static int charType(String str) {
    if (str == null || str.length() == 0) {
        return CT_OTHER;
    }

    byte[] raw = str.getBytes();
    byte first = raw[0];
    byte second = 0;
    if (raw.length > 1) {
        second = raw[1];
    }

    int lead = getUnsigned(first);
    if (lead < 128) {
        // Single-byte range: a handful of punctuation marks count as delimiters.
        boolean listed = "\"!,.?()[]{}+=".indexOf((char) first) != -1;
        return listed ? CT_DELIMITER : CT_SINGLE;
    }
    if (lead == 162) {
        return CT_INDEX;
    }
    if (lead == 163) {
        int trail = getUnsigned(second);
        if (trail > 175 && trail < 186) {
            return CT_NUM;
        }
        boolean inFirstLetterRange = trail >= 193 && trail <= 218;
        boolean inSecondLetterRange = trail >= 225 && trail <= 250;
        if (inFirstLetterRange || inSecondLetterRange) {
            return CT_LETTER;
        }
        // Everything else with lead byte 163 is treated as a delimiter.
        return CT_DELIMITER;
    }
    if (lead == 161) {
        return CT_DELIMITER;
    }
    if (lead >= 176 && lead <= 247) {
        return CT_CHINESE;
    }
    return CT_OTHER;
}
/**
 * Measures the longest leading run of Chinese characters in a byte sequence.
 *
 * Starting at offset 0, the scan advances two bytes at a time for as long as
 * the byte at the current offset, read as unsigned, lies in 176..247.
 *
 * NOTE(review): only the lead byte is tested, so when the final byte of the
 * array is such a lead byte the result is one more than the array length.
 *
 * Author: Kevin Zhang. History: 1. created 2002-1-8.
 *
 * @param sSentence the sentence bytes, possibly mixing Chinese and
 *                  non-Chinese characters
 * @return the offset at which the leading Chinese run ends
 */
public static int getCCPrefix(byte[] sSentence) {
    int pos = 0;
    for (; pos < sSentence.length; pos += 2) {
        int lead = getUnsigned(sSentence[pos]);
        if (lead <= 175 || lead >= 248) {
            break;// first byte that does not start a Chinese character
        }
    }
    return pos;
}
/**
 * Judges whether every character of the string is a Chinese character.
 *
 * Each character is converted on its own with {@link String#getBytes()}
 * (platform default charset):
 * - a character that encodes to fewer than two bytes is not Chinese
 *   (previously such characters were skipped, so plain ASCII text was
 *   reported as all Chinese);
 * - a character that encodes to exactly two bytes must have a lead byte,
 *   read as unsigned, in 176..247;
 * - longer encodings are accepted unchecked, as before.
 *
 * NOTE(review): the byte range looks like the GB2312/GBK Chinese rows, so
 * the result depends on the platform charset -- confirm. The earlier code
 * also re-tested the lead byte against 161..252, which could never fail once
 * the 176..247 test passed; a second-byte check may have been intended.
 *
 * Author: Kevin Zhang. History: 1. created 2002-1-24.
 *
 * @param str the string to examine
 * @return {@code true} if no character fails the checks above (including
 *         the empty string); {@code false} for {@code null}
 */
public static boolean isAllChinese(String str) {
    if (str == null) {
        return false;
    }
    for (int i = 0; i < str.length(); i++) {
        byte[] b = str.substring(i, i + 1).getBytes();
        if (b.length < 2) {
            // Single-byte character: cannot be a Chinese character.
            return false;
        }
        if (b.length == 2) {
            int lead = getUnsigned(b[0]);
            if (lead <= 175 || lead >= 248) {
                return false;
            }
        }
    }
    return true;
}
/**
 * Judges whether a byte sequence contains no Chinese character.
 *
 * The scan looks at one lead byte per step: a byte whose unsigned value lies
 * in 176..247 makes the result {@code false}. After a negative byte (high
 * bit set) the scan skips two bytes, otherwise one.
 *
 * Author: Kevin Zhang. History: 1. created 2002-1-24.
 *
 * @param sString the sentence bytes, possibly mixing Chinese and
 *                non-Chinese characters
 * @return {@code false} as soon as a lead byte in 176..247 is met,
 *         {@code true} otherwise
 */
public static boolean isAllNonChinese(byte[] sString) {
    for (int pos = 0; pos < sString.length;) {
        int lead = getUnsigned(sString[pos]);
        if (lead > 175 && lead < 248) {
            return false;
        }
        // A negative byte starts a two-byte character; step over both bytes.
        pos += (sString[pos] < 0) ? 2 : 1;
    }
    return true;
}
/**
 * Judges whether the string is made up entirely of single-byte characters,
 * i.e. characters whose code is below 128.
 *
 * The earlier implementation compared each Java {@code byte} with 128.
 * A {@code byte} is signed and never exceeds 127, so that test could not
 * fail and every non-null string was reported as single-byte. Characters
 * are now compared directly, which also removes the dependency on the
 * platform charset.
 *
 * Author: Kevin Zhang. History: 1. created 2002-1-24.
 *
 * @param str the string to examine
 * @return {@code true} if every character is below 128 (including the empty
 *         string); {@code false} otherwise or when {@code str} is
 *         {@code null}
 */
public static boolean isAllSingleByte(String str) {
    if (str == null) {
        return false;
    }
    for (int i = 0; i < str.length(); i++) {
        if (str.charAt(i) >= 128) {
            return false;
        }
    }
    return true;
}
/***************************************************************************
*
* Func Name : isAllNum
*
* Description: Judge whether the string is made up entirely of numeric
* characters
*
*
* Parameters : str: the string to examine
*
* Returns : boolean
*
* Author : Kevin Zhang
*
* History : 1.create 2002-1-24
**************************************************************************/
public static boolean isAllNum(String str) {
if (str != null) {
int i = 0;
String temp = str + " ";
// 判斷開頭是否是+-之類的符號
if ("±+—-+".indexOf(temp.substring(0, 1)) != -1)
i++;
/** 如果是全角的0123456789 字符* */
while (i < str.length() && "0123456789".indexOf(str.substring(i, i + 1)) != -1)
i++;
// Get middle delimiter such as .
if (i < str.length()) {
String s = str.substring(i, i + 1);
if ("∶·./".indexOf(s) != -1 || ".".equals(s) || "/".equals(s)) {// 98.1%
i++;
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -