?? segmenter.pl

?? 中文分詞算法。Perl語言編寫。wordlist.txt為詞庫。
?? PL
字號:
#!/usr/bin/perl

# Read in the lexicon.
#
# wordlist.txt is read line by line; every (chomped) line becomes a key of
# %cwords with value 1, meaning "this is a word".  For every entry whose
# length is 6 the first 4 units are additionally stored with value 2 unless
# that key already exists; segmentline() reads a value of 2 as "not a word by
# itself, but the start of a longer one".
#
# NOTE(review): lengths are counted in bytes here, so 6 and 4 presumably mean
# three and two characters of a two-byte encoding -- confirm against the
# encoding of wordlist.txt.
#
# NOTE(review): the original carried disabled (commented-out) code that
# registered the 4-, 6- and 8-unit prefixes of entries of length 8 and 10 in
# the same way.  It stays disabled here.
#
# A lexical read variable is used instead of $_ so that loading this file
# does not clobber the caller's $_.
open(my $wordlist_fh, '<', "wordlist.txt")
    or die "Can't open wordlist.txt: $!\n";
while (my $entry = <$wordlist_fh>) {
    chomp $entry;
    $cwords{$entry} = 1;
    next unless length($entry) == 6;

    # Mark the leading part as a word start, never downgrading a real word.
    my $prefix = substr($entry, 0, 4);
    $cwords{$prefix} = 2 unless exists $cwords{$prefix};
}
close($wordlist_fh);

# Number characters.
#
# $numbers is cut into consecutive 2-unit pieces and each piece becomes a key
# of %cnumbers (value 1); allnum() and segmentline() look characters up there.
# NOTE(review): the 2-unit step presumably equals one character of a two-byte
# source encoding -- confirm the encoding this file is saved in.
$numbers = join "",
    "零○一二三四五六七八九十百千萬億０１２３４５６７８９．點第",
    "多半數幾倆卅兩壹貳叁肆伍陸柒捌玖拾伯仟";
$cnumbers{$_} = 1 foreach unpack("(a2)*", $numbers);

# Wide (full-width) ASCII letters plus the two punctuation marks that close
# each row.
#
# Every consecutive 2-unit piece of $wascii becomes a key of %cascii
# (value 1); allwascii() and segmentline() look characters up there.
# NOTE(review): the 2-unit step presumably equals one character of a two-byte
# source encoding -- confirm the encoding this file is saved in.
$wascii = join "",
    "ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ．",
    "ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ－";
$cascii{$_} = 1 foreach unpack("(a2)*", $wascii);

# Characters used to transliterate foreign names.
#
# Every consecutive 2-unit piece of $foreign becomes a key of %cforeign
# (value 1); repeated characters simply hit the same key again.
# allforeign() and segmentline() look characters up there.
# NOTE(review): the 2-unit step presumably equals one character of a two-byte
# source encoding -- confirm the encoding this file is saved in.
$foreign = join "",
    "阿克拉加內亞斯貝巴爾姆愛蘭尤利西詹喬伊費杰羅納布可夫福赫勒柯特",
    "勞倫坦史芬尼根登都伯林伍泰胥黎俄科索沃金森奧霍瓦茨普蒂塞維大利",
    "格萊德岡薩雷墨哥弗庫澳馬哈多茲戈烏奇切諾戴里諸塞吉基延科達塔博",
    "卡雅來莫波艾哈邁蓬安盧什比摩曼乃休合賴米那迪凱萊溫帕桑佩蒙博托",
    "謝格澤洛及希卜魯匹齊茲印古埃努烈達累法賈圖喀土穆腓基冉休蓋耶沙",
    "遜賓麥華萬";
$cforeign{$_} = 1 foreach unpack("(a2)*", $foreign);

# Chinese surnames.
#
# Two tables are built the same way, by cutting a string into consecutive
# 2-unit pieces and storing each piece as a hash key with value 1:
#   %csurname         from $surname
#   %uncommoncsurname from $uncommonsurname
# add_ChineseNames() consults both, and is stricter about what may follow an
# entry of the second table.
# NOTE(review): the 2-unit step presumably equals one character of a two-byte
# source encoding -- confirm the encoding this file is saved in.
$surname = join "",
    "艾安敖白班包寶保鮑貝畢邊卞柏卜蔡曹岑柴昌常陳成程遲池褚楚",
    "儲淳崔戴刀鄧狄刁丁董竇杜端段樊范方房斐費豐封馮鳳伏福傅蓋甘",
    "高戈耿龔宮勾茍辜谷古顧官關管桂郭韓杭郝禾何賀赫衡洪侯胡花",
    "華黃霍稽姬吉紀季賈簡翦姜江蔣焦晉金靳荊居康柯空孔匡鄺況賴藍",
    "郎朗勞樂雷冷黎李理厲利勵連廉練良梁廖林凌劉柳隆龍樓婁盧呂魯",
    "陸路倫羅洛駱麻馬麥滿茅毛梅孟米苗繆閔明莫牟穆倪聶牛鈕農潘龐",
    "裴彭皮樸平蒲溥浦戚祁齊錢強喬秦丘邱仇裘屈瞿權冉饒任榮容阮",
    "瑞芮薩賽沙單商邵佘申沈盛石史壽舒斯宋蘇孫邰譚談湯唐陶滕",
    "田佟仝屠涂萬汪王危韋魏衛蔚溫聞翁巫鄔伍武吳奚習夏鮮冼",
    "項蕭解謝辛邢幸熊徐許宣薛荀顏閻言嚴彥晏燕楊陽姚葉蟻易殷銀尹",
    "應英游尤於魚虞俞余禹喻郁尉元袁岳云臧曾查翟詹湛張章招趙甄",
    "鄭鐘周諸朱竺祝莊卓宗鄒祖左";
$uncommonsurname = "車和全時水同文席于";

$csurname{$_}         = 1 foreach unpack("(a2)*", $surname);
$uncommoncsurname{$_} = 1 foreach unpack("(a2)*", $uncommonsurname);

# Two-character surnames.  Each one is registered as a surname and also as a
# lexicon word (value 1 in %cwords) so that it is segmented as a single unit.
foreach my $compound (
    "東郭", "公孫", "皇甫", "慕容", "歐陽", "單于",
    "司空", "司馬", "司徒", "澹臺", "諸葛",
) {
    $csurname{$compound} = 1;
    $cwords{$compound}   = 1;
}

# Characters and punctuation that must not be taken as part of a given name.
#
# Every consecutive 2-unit piece of $notname becomes a key of %cnotname
# (value 1); add_ChineseNames() rejects a candidate token found there.
# NOTE(review): the 2-unit step presumably equals one character of a two-byte
# source encoding -- confirm the encoding this file is saved in.
$notname = join "",
    "的說對在和是被最所那這有將會與於他為",
    "、：，。★〖〗（）⊙～【】—·？！“”　";
$cnotname{$_} = 1 foreach unpack("(a2)*", $notname);


# add_ChineseNames($tmpline)
#
# Takes one line whose tokens are separated by whitespace and returns the line
# with personal names glued together.  A token that is a key of %csurname or
# %uncommoncsurname is followed by a look at the next two tokens (t2, t3):
#
#   1. t2 and t3 are each length 2, pass isChinese() and are not in %cnotname:
#      surname + t2 + t3 become one token.
#   2. only t2 qualifies as above: surname + t2 become one token.
#   3. t2 is length 4, passes isChinese() and is not in %cnotname:
#      surname + t2 become one token.  For a surname that is only in
#      %uncommoncsurname this is done only when t2 is not itself a lexicon
#      word (value 1 in %cwords).
#   4. otherwise the three tokens and their separators are copied unchanged.
#
# Side effect: every name that is glued is stored in %cwords with value 1, and
# its leading "surname + 2 units" part is stored with value 2 (word start) so
# that segmentline() can recognise the name later.  The value-2 marker is only
# written when the key does not exist yet, mirroring the lexicon loader;
# previously it overwrote an existing value 1 and thereby removed a real word.
#
# The two following tokens are consumed whether or not a name was found, so a
# surname appearing as t2 or t3 is not examined on its own.
#
# NOTE(review): lengths are byte counts; 2 and 4 presumably mean one and two
# characters of a two-byte encoding -- confirm.
sub add_ChineseNames {
    my ($tmpline) = @_;
    my $tlen    = length($tmpline);
    my $newline = "";
    my $pos     = 0;

    # Consume the maximal run of whitespace ($want_space == 1) or of
    # non-whitespace ($want_space == 0) starting at $pos and return it.
    my $take_run = sub {
        my ($want_space) = @_;
        my $run = "";
        while ($pos < $tlen) {
            my $ch = substr($tmpline, $pos, 1);
            last if ($ch =~ /\s/ ? 1 : 0) != $want_space;
            $run .= $ch;
            $pos++;
        }
        return $run;
    };

    while ($pos < $tlen) {
        my $lead = substr($tmpline, $pos, 1);

        # Whitespace between tokens is copied through one unit at a time.
        if ($lead =~ /\s/) {
            $newline .= $lead;
            $pos++;
            next;
        }

        my $currtoken = $take_run->(0);
        my $common    = defined($csurname{$currtoken});
        my $uncommon  = defined($uncommoncsurname{$currtoken});
        unless ($common or $uncommon) {
            $newline .= $currtoken;
            next;
        }

        # Found a surname: fetch the next two tokens and their separators.
        my $spaces     = $take_run->(1);
        my $currtoken2 = $take_run->(0);
        my $spaces2    = $take_run->(1);
        my $currtoken3 = $take_run->(0);

        my $t2_usable = (isChinese($currtoken2)
                         and !defined($cnotname{$currtoken2}));
        my $t2_single = ($t2_usable and length($currtoken2) == 2);
        my $t2_double = ($t2_usable and length($currtoken2) == 4);
        my $t3_single = (isChinese($currtoken3)
                         and length($currtoken3) == 2
                         and !defined($cnotname{$currtoken3}));

        # Is t2 already a full lexicon word?  Only matters for uncommon surnames.
        my $t2_is_word = (exists($cwords{$currtoken2})
                          and $cwords{$currtoken2} == 1);

        if ($t2_single and $t3_single) {
            my $short = $currtoken . $currtoken2;
            $newline .= $short . $currtoken3;
            $cwords{$short . $currtoken3} = 1;
            $cwords{$short} = 2 unless exists $cwords{$short};   # word start
        }
        elsif ($t2_single) {
            $newline .= $currtoken . $currtoken2 . $spaces2 . $currtoken3;
            $cwords{$currtoken . $currtoken2} = 1;
        }
        elsif ($t2_double and ($common or !$t2_is_word)) {
            my $short = $currtoken . substr($currtoken2, 0, 2);
            $newline .= $currtoken . $currtoken2 . $spaces2 . $currtoken3;
            $cwords{$currtoken . $currtoken2} = 1;
            $cwords{$short} = 2 unless exists $cwords{$short};   # word start
        }
        else {
            $newline .= $currtoken . $spaces . $currtoken2 . $spaces2 . $currtoken3;
        }
    }

    return $newline;
}


#sub cword_start {
#    my($tword) = @_;
#    if (grep(/^$tword/, @cwordlist) > 0) {
#	return 1;
#    } else {
#	return 0;
#    }
#}

# isChinese($cchar)
#
# Returns 1 when every unit of $cchar has a value of 128 or more, 0 as soon as
# one unit is below 128.  An empty string therefore yields 1; callers pair
# this test with a length check.
#
# The loop index is lexical.  It used to be the package variable $b, which is
# the variable sort() uses for its comparison block, so calling this function
# clobbered it.
sub isChinese {
    my ($cchar) = @_;
    for (my $idx = 0; $idx < length($cchar); $idx++) {
        return 0 if unpack("C", substr($cchar, $idx, 1)) < 128;
    }
    return 1;
}


# allnum($localnum)
#
# Returns 1 when every consecutive 2-unit piece of $localnum is a key of
# %cnumbers, otherwise 0.  An empty string yields 1.  A trailing single unit
# is looked up on its own and fails unless such a key exists.
#
# The argument and the loop index are lexical; they used to be written to the
# package globals $localnum and $k, the latter shared with allwascii() and
# allforeign().
sub allnum {
    my ($localnum) = @_;
    for (my $offset = 0; $offset < length($localnum); $offset += 2) {
        return 0 unless defined $cnumbers{ substr($localnum, $offset, 2) };
    }
    return 1;
}

# allwascii($localstr)
#
# Returns 1 when every consecutive 2-unit piece of $localstr is a key of
# %cascii, otherwise 0.  An empty string yields 1.  A trailing single unit is
# looked up on its own and fails unless such a key exists.
#
# The argument and the loop index are lexical; they used to be written to the
# package globals $localstr and $k, both shared with allforeign().
sub allwascii {
    my ($localstr) = @_;
    for (my $offset = 0; $offset < length($localstr); $offset += 2) {
        return 0 unless defined $cascii{ substr($localstr, $offset, 2) };
    }
    return 1;
}

# allforeign($localstr)
#
# Returns 1 when every consecutive 2-unit piece of $localstr is a key of
# %cforeign, otherwise 0.  An empty string yields 1.  A trailing single unit
# is looked up on its own and fails unless such a key exists.
#
# The argument and the loop index are lexical; they used to be written to the
# package globals $localstr and $k, both shared with allwascii().
sub allforeign {
    my ($localstr) = @_;
    for (my $offset = 0; $offset < length($localstr); $offset += 2) {
        return 0 unless defined $cforeign{ substr($localstr, $offset, 2) };
    }
    return 1;
}


# segmentline($line)
#
# Splits the non-ASCII runs of $line into words separated by single spaces
# and returns the result after passing it through add_ChineseNames().
#
# The line is scanned left to right.  A unit below 128 is copied through
# unchanged.  A unit of 128 or more starts a 2-unit character, which is either
# appended to the word being accumulated or starts a new word.  It is appended
# when one of the following holds, tested in this order:
#   - accumulated word + character is a lexicon word (%cwords value 1);
#   - the accumulated word passes allnum() and the character is in %cnumbers;
#   - the accumulated word passes allwascii() and the character is in %cascii;
#   - the accumulated word passes allforeign(), the character is in %cforeign,
#     and the character together with the one after it is neither a lexicon
#     word nor a word start;
#   - accumulated word + character is a word start (%cwords value 2) and
#     adding the next character gives a lexicon word or another word start.
#
# Fixes relative to the earlier version:
#   - the empty prototype "()" was removed; the function takes one argument;
#   - a word still being accumulated when the line ends is now emitted (it
#     was dropped unless an ASCII unit such as "\n" followed it);
#   - the word-start lookahead is skipped when nothing follows on the line
#     (the lookup used to fall back on the word start itself and accept it);
#   - all working variables are lexical instead of package globals.
#
# NOTE(review): the 2-unit step presumably equals one character of a two-byte
# encoding in which every byte is >= 128 -- confirm the input encoding.
sub segmentline {
    my ($line) = @_;

    my $chinaccum = "";                 # word currently being built
    my $outline   = "";                 # segmented output so far
    my $linelen   = length($line);

    for (my $i = 0; $i < $linelen; $i++) {
        my $char1 = substr($line, $i, 1);

        if (unpack("C", $char1) <= 127) {
            # Plain ASCII: emit any pending word, then the unit itself.
            if ($chinaccum ne "") {
                $outline .= $chinaccum . " ";
                $chinaccum = "";
            }
            $outline .= $char1;
            next;
        }

        my $chinchar = substr($line, $i, 2);

        if ($chinaccum eq "") {
            $outline .= " " unless $i == 0;
            $chinaccum = $chinchar;
        }
        else {
            my $joined      = $chinaccum . $chinchar;
            my $joined_kind = $cwords{$joined} || 0;   # 1 word, 2 word start

            my $extend = ($joined_kind == 1);
            $extend ||= (allnum($chinaccum) and defined($cnumbers{$chinchar}));
            $extend ||= (allwascii($chinaccum) and defined($cascii{$chinchar}));

            if (!$extend and allforeign($chinaccum)
                and defined($cforeign{$chinchar}))
            {
                # Do not swallow a character that begins a lexicon entry.
                my $ahead_kind = $cwords{ substr($line, $i, 4) } || 0;
                $extend = ($ahead_kind != 1 and $ahead_kind != 2);
            }

            if (!$extend and $joined_kind == 2 and $i + 2 < $linelen) {
                # A word start only counts if the following character
                # continues it.
                my $longer_kind =
                    $cwords{ $joined . substr($line, $i + 2, 2) } || 0;
                $extend = ($longer_kind == 1 or $longer_kind == 2);
            }

            if ($extend) {
                $chinaccum .= $chinchar;
            }
            else {
                $outline .= $chinaccum . " ";
                $chinaccum = $chinchar;   # start anew
            }
        }
        $i++;   # step over the second unit of the character
    }

    # Emit the word still pending at end of line.
    $outline .= $chinaccum if $chinaccum ne "";

    return add_ChineseNames($outline);
}

1;
?? 文件大小 359 K
?? 上傳用戶 guo25621286
?? 所屬分類人工智能/神經網絡
??? 相關標簽

#wordlist #Perl #txt #分
?? 快捷鍵說明

復制代碼 Ctrl + C
搜索代碼 Ctrl + F
全屏模式 F11
切換主題 Ctrl + Shift + D
顯示快捷鍵 ?
增大字號 Ctrl + =
減小字號 Ctrl + -
亚洲欧美第一页_禁久久精品乱码_粉嫩av一区二区三区免费野_久草精品视频

?? segmenter.pl

?? 快捷鍵說明