?? lib_splitword_full.php
字號:
<?
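/*******************************
// DedeCMS word segmentation algorithm - www.dedecms.com
// Author: IT柏拉圖 QQ:2500875
// This program is provided for testing only; it does not include
// ambiguity resolution or word (part-of-speech) tagging.
// The tagging dictionary is nevertheless loaded by default, so those
// features can be added by yourself if needed.
// The dictionary was compiled by the DedeCMS organisation; commercial use is strictly prohibited.
********************************/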
class SplitWord
{
var $TagDic = Array();
var $RankDic = Array();
var $OneNameDic = Array();
var $TwoNameDic = Array();
var $SourceString = '';
var $ResultString = '';
var $SplitChar = ' '; //分隔符
var $SplitLen = 4; //保留詞長度
var $EspecialChar = "和|的|是";
var $NewWordLimit = "在|的|與|或|就|你|我|他|她|有|了|是|其|能|對|地";
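// Commonly used measure words (units) can be added here as needed.
// The program checks whether a word's first character is one of these units and
// whether the previous word is a numeral; if so, the two are joined into one word.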
var $CommonUnit = "年|月|日|時|分|秒|點|元|百|千|萬|億|位|輛";
var $CnNumber = "%|+|-|0|1|2|3|4|5|6|7|8|9|.";
var $CnSgNum = "一|二|三|四|五|六|七|八|九|十|百|千|萬|億|數";
var $MaxLen = 13; //詞典最大 7 中文字,這里的數值為字節數組的最大索引
var $MinLen = 3; //最小 2 中文字,這里的數值為字節數組的最大索引
var $CnTwoName = "端木 南宮 譙笪 軒轅 令狐 鐘離 閭丘 長孫 鮮于 宇文 司徒 司空 上官 歐陽 公孫 西門 東門 左丘 東郭 呼延 慕容 司馬 夏侯 諸葛 東方 赫連 皇甫 尉遲 申屠";
var $CnOneName = "趙錢孫李周吳鄭王馮陳褚衛蔣沈韓楊朱秦尤許何呂施張孔曹嚴華金魏陶姜戚謝鄒喻柏水竇章云蘇潘葛奚范彭郎魯韋昌馬苗鳳花方俞任袁柳酆鮑史唐費廉岑薛雷賀倪湯滕殷羅畢郝鄔安常樂于時傅皮卡齊康伍余元卜顧孟平黃穆蕭尹姚邵堪汪祁毛禹狄米貝明臧計伏成戴談宋茅龐熊紀舒屈項祝董粱杜阮藍閔席季麻強賈路婁危江童顏郭梅盛林刁鐘徐邱駱高夏蔡田樊胡凌霍虞萬支柯咎管盧莫經房裘繆干解應宗宣丁賁鄧郁單杭洪包諸左石崔吉鈕龔程嵇邢滑裴陸榮翁荀羊於惠甄魏加封芮羿儲靳汲邴糜松井段富巫烏焦巴弓牧隗谷車侯宓蓬全郗班仰秋仲伊宮寧仇欒暴甘鈄厲戎祖武符劉姜詹束龍葉幸司韶郜黎薊薄印宿白懷蒲臺從鄂索咸籍賴卓藺屠蒙池喬陰郁胥能蒼雙聞莘黨翟譚貢勞逄姬申扶堵冉宰酈雍郤璩桑桂濮牛壽通邊扈燕冀郟浦尚農溫別莊晏柴翟閻充慕連茹習宦艾魚容向古易慎戈廖庚終暨居衡步都耿滿弘匡國文寇廣祿闕東毆殳沃利蔚越夔隆師鞏厙聶晁勾敖融冷訾辛闞那簡饒空曾沙須豐巢關蒯相查后江游竺";
//------------------------------
//php4構造函數
//------------------------------
/**
 * PHP 4 style constructor (a method named after the class).
 *
 * Does nothing on its own; it forwards to __construct().
 */
function SplitWord()
{
    $this->__construct();
}
//------------------------------
//php5構造函數
//------------------------------
/**
 * PHP 5 constructor: builds the surname lookup tables and loads the word
 * dictionary from "dededic.csv" in the same directory as this file.
 *
 * Each dictionary line is split on single spaces. Field 0 is the word,
 * field 1 is stored in $TagDic and field 2 in $RankDic, where entries are
 * bucketed by the word's byte length. Line endings are stripped before
 * splitting, and lines with fewer than three fields are skipped.
 *
 * If the dictionary cannot be opened, an E_USER_WARNING is raised and the
 * dictionary tables are left empty.
 */
function __construct()
{
    // Single-character surnames: the list is consumed two bytes at a time.
    // NOTE(review): this assumes a double-byte encoding such as GBK -- confirm
    // the encoding this file is saved in.
    $oneNames = $this->CnOneName;
    $oneLen = strlen($oneNames);
    // "$i + 1 < $oneLen" keeps the second byte in range for odd-length lists.
    for ($i = 0; $i + 1 < $oneLen; $i += 2) {
        $this->OneNameDic[$oneNames[$i] . $oneNames[$i + 1]] = 1;
    }

    // Two-character surnames are space separated.
    foreach (explode(" ", $this->CnTwoName) as $n) {
        $this->TwoNameDic[$n] = 1;
    }

    // The raw lists are not needed once the lookup tables exist.
    unset($this->CnOneName);
    unset($this->CnTwoName);

    // Preload the dictionary so that lookups during segmentation are fast.
    $dicfile = dirname(__FILE__) . "/dededic.csv";
    $fp = is_readable($dicfile) ? fopen($dicfile, 'r') : false;
    if ($fp === false) {
        trigger_error("SplitWord: cannot open dictionary file " . $dicfile, E_USER_WARNING);
        return;
    }

    // Compare against false explicitly so a falsy line (e.g. "0") does not end the loop.
    while (($line = fgets($fp, 256)) !== false) {
        $ws = explode(' ', rtrim($line, "\r\n"));
        if (count($ws) < 3) {
            continue; // malformed line: not enough fields
        }
        $this->TagDic[$ws[0]] = $ws[1];
        $this->RankDic[strlen($ws[0])][$ws[0]] = $ws[2];
    }
    fclose($fp);
}
//--------------------------
//析放資源
//--------------------------
/**
 * Releases the $QuickDic file handle, if there is one.
 *
 * $QuickDic is neither declared nor assigned in the visible class, so it
 * is normally unset. The handle is therefore closed only when it really is
 * an open resource, instead of relying on error suppression.
 */
function Clear()
{
    if (isset($this->QuickDic) && is_resource($this->QuickDic)) {
        fclose($this->QuickDic);
    }
}
//----------------------------
//設置源字符串
//----------------------------
/**
 * Sets the text to be segmented and clears any previous result.
 *
 * The text is passed through ReviseString() and trimmed before being
 * stored in $SourceString; $ResultString is reset to an empty string.
 *
 * @param string $str Raw text to segment.
 */
function SetSource($str)
{
    $revised = $this->ReviseString($str);
    $this->SourceString = trim($revised);
    $this->ResultString = "";
}
//-----------------------------
//檢查字符串是否不存在中文
//-----------------------------
/**
 * Tells whether a token starts with a single-byte character.
 *
 * Only the first byte is inspected, because the coarse split has already
 * separated single-byte and double-byte runs.
 *
 * @param string $str Token to test.
 * @return bool|string "" for an empty token, false when the first byte is
 *                     greater than 0x80, true otherwise.
 */
function NotGBK($str)
{
    if ($str == "") {
        return "";
    }
    $leadByte = ord($str[0]);
    return !($leadByte > 0x80);
}
//-----------------------------
//RMM分詞算法
//-----------------------------
/**
 * Segments text with the reverse maximum matching (RMM) approach.
 *
 * The source text is normalised by ReviseString() and split on spaces
 * into coarse tokens. The tokens are processed from last to first and
 * prepended to $ResultString:
 *  - single-byte tokens (per NotGBK()) are kept whole; a purely numeric
 *    token is glued to the word after it when that word starts with one
 *    of the $CommonUnit units;
 *  - tokens starting with "《", or whose first two bytes fall in the
 *    punctuation range, are kept whole;
 *  - other tokens of at most $SplitLen bytes are kept whole, except that
 *    a trailing $EspecialChar word is split off and a token starting with
 *    a unit is merged with the token before it;
 *  - longer tokens are segmented by RunRMM().
 *
 * @param string $str Text to segment. When empty, the text previously
 *                    given to SetSource() is used.
 * @return string Segmented text with $SplitChar after each word, or ""
 *                when there is no source text.
 */
function SplitRMM($str="")
{
    if ($str != "") $this->SetSource(trim($str));
    if ($this->SourceString == "") return "";

    // Coarse split of the text.
    $this->SourceString = $this->ReviseString($this->SourceString);
    $spwords = explode(" ", $this->SourceString);
    $spc = $this->SplitChar;

    // The alternations are grouped so the anchor applies to every
    // alternative, not just to the first (or last) one.
    $unitPrefixRe = '/^(?:' . str_replace('/', '\/', $this->CommonUnit) . ')/';
    $especialTailRe = '/(?:' . str_replace('/', '\/', $this->EspecialChar) . ')$/';

    for ($i = count($spwords) - 1; $i >= 0; $i--) {
        $word = $spwords[$i];
        if (trim($word) == "") continue;

        if ($this->NotGBK($word)) {
            if (preg_match('/[^0-9.+\-]/', $word)) {
                // Not a pure number: keep as its own word.
                $this->ResultString = $word . $spc . $this->ResultString;
            } else {
                // Pure number: look at the word that follows it in the result.
                $cut = strpos($this->ResultString, " ");
                $nextword = ($cut === false) ? "" : substr($this->ResultString, 0, $cut);
                if (preg_match($unitPrefixRe, $nextword)) {
                    $this->ResultString = $word . $this->ResultString;
                } else {
                    $this->ResultString = $word . $spc . $this->ResultString;
                }
            }
            continue;
        }

        // substr() avoids reading a missing second byte of a one-byte token.
        $c = substr($word, 0, 2);
        $n = hexdec(bin2hex($c));
        if ($c == "《" || ($n > 0xA13F && $n < 0xAA40)) {
            // Book-title mark or punctuation: keep the token whole.
            $this->ResultString = $word . $spc . $this->ResultString;
            continue;
        }

        if (strlen($word) > $this->SplitLen) {
            // Ordinary phrase: run dictionary matching on it.
            $this->ResultString = $this->RunRMM($word) . $spc . $this->ResultString;
            continue;
        }

        // Short phrase: split off a trailing special word, if any text precedes it.
        if (preg_match($especialTailRe, $word, $regs) && strlen($word) > strlen($regs[0])) {
            $word = substr($word, 0, -strlen($regs[0])) . $spc . $regs[0];
        }
        // A phrase led by a unit is merged with the token before it.
        if ($i == 0 || !preg_match($unitPrefixRe, $word)) {
            $this->ResultString = $word . $spc . $this->ResultString;
        } else {
            $this->ResultString = $spwords[$i - 1] . $word . $spc . $this->ResultString;
            $i--; // the previous token has been consumed
        }
    }
    return $this->ResultString;
}
//對全中文字符串進行逆向匹配方式分解
/**
 * Segments one run of double-byte text by reverse dictionary matching.
 *
 * Scanning starts at the last byte and moves towards the start. At each
 * position the longest candidate (up to $MaxLen + 1 bytes) is tried first,
 * shrinking two bytes at a time; if nothing matches, the last two bytes
 * are emitted as a word. Once the position is at or below $MinLen, the
 * remaining head of the string is handled as a special case. The words
 * are collected last-to-first and handed to ParOther().
 *
 * @param string $str Run of double-byte text.
 * @return string The value returned by ParOther() for the collected words.
 */
function RunRMM($str)
{
    $words = array();
    $pos = strlen($str) - 1;

    while ($pos >= 0) {
        // Head of the string: too short for the general matching below.
        if ($pos <= $this->MinLen) {
            if ($pos == 1) {
                $words[] = substr($str, 0, 2);
            } else {
                $head = substr($str, 0, $this->MinLen + 1);
                if ($this->IsWord($head)) {
                    $words[] = $head;
                } else {
                    $words[] = substr($str, 2, 2);
                    $words[] = substr($str, 0, 2);
                }
            }
            break;
        }

        // Longest candidate first, never reaching before the string start.
        $span = ($pos >= $this->MaxLen) ? $this->MaxLen : $pos;
        $found = false;
        while ($span >= 0) {
            $candidate = substr($str, $pos - $span, $span + 1);
            if ($this->IsWord($candidate)) {
                $words[] = $candidate;
                $pos = $pos - $span - 1;
                $found = true;
                break;
            }
            $span = $span - 2;
        }

        // No dictionary hit: emit the last two bytes as a word.
        if (!$found && $pos > 1) {
            $words[] = $str[$pos - 1] . $str[$pos];
            $pos = $pos - 2;
        }
    }

    return $this->ParOther($words);
}
//
//進行名字識別和其它數詞識別
//
/**
 * Joins a reversed word list into a string, merging numerals and names.
 *
 * The array is walked from its last element to its first, so element
 * $i - 1 is the word that follows element $i in the text. Each word is
 * appended with $SplitChar in front of it, then:
 *  - a word containing a $CnSgNum numeral absorbs the following word when
 *    that word starts with a $CommonUnit unit; otherwise it absorbs every
 *    directly following numeral word;
 *  - a 4-byte word found in $TwoNameDic, or a 2-byte word found in
 *    $OneNameDic, absorbs up to two following 2-byte words.
 * One leading separator is removed from the result.
 *
 * @param array $WordArray Words in reverse text order.
 * @return string The joined words.
 */
function ParOther($WordArray)
{
    $spc = $this->SplitChar;
    $numeralRe = '/' . str_replace('/', '\/', $this->CnSgNum) . '/';
    // Grouped so that "^" applies to every unit, not only the first one.
    $unitPrefixRe = '/^(?:' . str_replace('/', '\/', $this->CommonUnit) . ')/';
    $rsStr = "";

    for ($i = count($WordArray) - 1; $i >= 0; $i--) {
        $word = $WordArray[$i];
        $rsStr .= $spc . $word;

        // Numerals and measure words.
        if (preg_match($numeralRe, $word)) {
            if ($i > 0 && preg_match($unitPrefixRe, $WordArray[$i - 1])) {
                $rsStr .= $WordArray[$i - 1];
                $i--;
            } else {
                while ($i > 0 && preg_match($numeralRe, $WordArray[$i - 1])) {
                    $rsStr .= $WordArray[$i - 1];
                    $i--;
                }
            }
            continue;
        }

        // Two-character or one-character surname.
        $isSurname = (strlen($word) == 4 && isset($this->TwoNameDic[$word]))
            || (strlen($word) == 2 && isset($this->OneNameDic[$word]));
        if ($isSurname) {
            // Attach up to two following 2-byte words as the given name.
            for ($taken = 0; $taken < 2 && $i > 0 && strlen($WordArray[$i - 1]) == 2; $taken++) {
                $rsStr .= $WordArray[$i - 1];
                $i--;
            }
        }
    }

    // Drop one leading separator. A plain prefix test is used so that
    // a separator containing regex metacharacters cannot break a pattern.
    if ($spc !== "" && strncmp($rsStr, $spc, strlen($spc)) === 0) {
        $rsStr = (string)substr($rsStr, strlen($spc));
    }
    return $rsStr;
}
//---------------------------------
//判斷詞典里是否存在某個詞
//---------------------------------
/**
 * Checks whether a word exists in the dictionary.
 *
 * $RankDic is bucketed by byte length, so the lookup goes straight to the
 * bucket for this word's length.
 *
 * @param string $okWord Candidate word.
 * @return bool True when the word is in $RankDic.
 */
function IsWord($okWord)
{
    $slen = strlen($okWord);
    // $MaxLen is the highest byte index of a word, so the longest word is
    // $MaxLen + 1 bytes. Comparing the length against $MaxLen itself would
    // reject every word of maximum length.
    if ($slen > $this->MaxLen + 1) {
        return false;
    }
    return isset($this->RankDic[$slen][$okWord]);
}
//------------------------------
//整理字符串(對標點符號,中英文混排等初步處理)
//------------------------------
/**
 * Normalises text before segmentation (the coarse split).
 *
 * The string is scanned byte by byte and $SplitChar is inserted wherever
 * the character class changes. $prechar tracks the previous class:
 * 0 = whitespace, 1 = alphanumeric, 2 = double-byte text, 3 = symbol.
 *  - bytes below 33 act as separators ("\r" and "\n" are dropped);
 *  - other bytes below 0x81 are alphanumeric when they are letters,
 *    digits or one of @ . % # : / \ & _ - , otherwise symbols;
 *  - bytes from 0x81 up are read as the first byte of a two-byte
 *    character; characters listed in $CnNumber go through GetAlabNum(),
 *    punctuation is separated from its neighbours, and "《" / "》" get
 *    extra spaces around the title they enclose.
 *
 * @param string $str Raw text.
 * @return string Normalised text, or '' for empty input.
 */
function ReviseString($str)
{
    $spc = $this->SplitChar;
    $slen = strlen($str);
    if ($slen == 0) return '';

    $okstr = '';
    $prechar = 0; // 0 = whitespace, 1 = alphanumeric, 2 = double-byte text, 3 = symbol
    // $CnNumber is a "|"-separated list of characters. It is matched
    // literally because entries such as "+" and "." are regex metacharacters.
    $wideNumbers = explode('|', $this->CnNumber);

    for ($i = 0; $i < $slen; $i++) {
        $ch = $str[$i];
        $code = ord($ch);

        if ($code < 0x81) {
            if ($code < 33) {
                // Whitespace / control character.
                if ($prechar != 0 && $ch != "\r" && $ch != "\n") $okstr .= $spc;
                $prechar = 0;
            } else if (preg_match('~[^0-9a-zA-Z@.%#:/\\\\&_-]~', $ch)) {
                // Symbol: always separated from what precedes it.
                $okstr .= ($prechar == 0) ? $ch : $spc . $ch;
                $prechar = 3;
            } else {
                // Alphanumeric (including @ . % # : / \ & _ -).
                // NOTE(review): the old code had an extra branch here testing the
                // character against the literal pattern "@#%:", which a single
                // character can never match; it was dead and is dropped.
                $okstr .= ($prechar == 2 || $prechar == 3) ? $spc . $ch : $ch;
                $prechar = 1;
            }
            continue;
        }

        // Double-byte character: separate it from a preceding
        // alphanumeric or symbol.
        if ($prechar != 0 && $prechar != 2) $okstr .= $spc;
        if (!isset($str[$i + 1])) continue; // lone lead byte at the end is dropped

        $c = $ch . $str[$i + 1];
        $i++; // the second byte has been consumed

        if (in_array($c, $wideNumbers, true)) {
            $okstr .= $this->GetAlabNum($c);
            $prechar = 2;
            continue;
        }

        $n = hexdec(bin2hex($c));
        if ($n > 0xA13F && $n < 0xAA40) {
            // Punctuation range.
            if ($c == "《") {
                if ($prechar != 0) $okstr .= $spc . " 《";
                else $okstr .= " 《";
                $prechar = 2;
            } else if ($c == "》") {
                $okstr .= "》 ";
                $prechar = 3;
            } else {
                if ($prechar != 0) $okstr .= $spc . $c;
                else $okstr .= $c;
                $prechar = 3;
            }
        } else {
            $okstr .= $c;
            $prechar = 2;
        }
    }
    return $okstr;
}
//-----------------------------------------
//嘗試識別新詞,字符串參數為已經分詞處理的串
//----------------------------------------
/**
 * Tries to recognise new words in already segmented text.
 *
 * Consecutive 2-byte words that contain no ASCII letters or digits and do
 * not match $NewWordLimit are collected. When such a run is broken by
 * another word, the collected words are joined into one new word if the
 * joined form is longer than 2 bytes and at most $maxlen bytes; otherwise
 * they are kept separate. The new words found are stored in $this->m_nws,
 * and their spaced-out occurrences in the text are replaced by the joined
 * form.
 *
 * @param string $spwords Segmented text, words separated by spaces.
 * @param int    $maxlen  Maximum byte length of a new word.
 * @return string The text with new words merged.
 */
function FindNewWord($spwords,$maxlen=6)
{
    $okstr = '';
    $newword = '';
    // Must start as an array: string-keyed assignment on a string is not
    // converted to an array on PHP 7.1+ and raises an error on PHP 8.
    $nws = array();

    foreach (explode(' ', $spwords) as $w) {
        $w = trim($w);
        if (strlen($w) == 2 && !preg_match("/[0-9a-zA-Z]/", $w) && !preg_match("/" . $this->NewWordLimit . "/", $w)) {
            $newword .= " " . $w;
            continue;
        }
        if ($newword != "") {
            $nw = str_replace(' ', '', $newword);
            if (strlen($nw) > 2 && strlen($nw) <= $maxlen) {
                $okstr .= ' ' . $nw;
                $nws[$nw] = 0;
            } else {
                $okstr .= ' ' . $newword;
            }
            $newword = '';
        }
        $okstr .= ' ' . $w;
    }
    // A run still pending at the end of the text is appended unmerged.
    if ($newword != "") $okstr .= $newword;
    $okstr = preg_replace("/ {1,}/", " ", $okstr);

    if (count($nws) > 0) {
        $this->m_nws = $nws;
        foreach ($nws as $k => $unused) {
            // Rebuild the spaced-out form of the new word ("a b c").
            $spaced = "";
            $klen = strlen($k);
            for ($i = 0; $i < $klen; $i++) {
                $spaced .= " " . $k[$i];
                if (ord($k[$i]) > 0x80 && isset($k[$i + 1])) {
                    $spaced .= $k[$i + 1];
                    $i++;
                }
                $spaced .= " ";
            }
            $spaced = preg_replace("/ {1,}/", " ", $spaced);
            $okstr = str_replace($spaced, " " . $k . " ", $okstr);
            $okstr = str_replace($k . " ", " " . $k . " ", $okstr);
            $okstr = str_replace(" " . $k, " " . $k . " ", $okstr);
        }
    }
    return $okstr;
}
//----------------------------------------------
//除去字串中的重復詞,生成索引字符串,字符串參數為已經分詞處理的串
//--------------------------------------------------
/**
 * Builds an index string from segmented text by removing duplicate words.
 *
 * Words shorter than 2 bytes, words made only of digits, ":" and "-"
 * (numbers and dates), and 2-byte words whose first byte is above 0x80
 * are skipped. The remaining words are counted and emitted once each,
 * most frequent first, separated by spaces.
 *
 * @param string $okstr Segmented text, words separated by spaces.
 * @param int    $ilen  Length limit for the result; words are added only
 *                      while the result stays below it. -1 means no limit.
 * @return string The index string, or "" when no word qualifies.
 */
function GetIndexText($okstr,$ilen=-1)
{
    if ($okstr == "") return "";

    // Word => occurrence count. Must be an array from the start: using a
    // string here breaks on PHP 7.1 and later.
    $wks = array();
    foreach (explode(" ", $okstr) as $w) {
        $w = trim($w);
        // Skip anything shorter than 2 bytes.
        if (strlen($w) < 2) continue;
        // Skip numbers and dates.
        if (!preg_match('/[^0-9:-]/', $w)) continue;
        // Skip single double-byte characters.
        if (strlen($w) == 2 && ord($w[0]) > 0x80) continue;
        if (isset($wks[$w])) $wks[$w]++;
        else $wks[$w] = 1;
    }

    arsort($wks);
    $okstr = "";
    foreach ($wks as $w => $v) {
        if ($ilen != -1 && (strlen($okstr) + strlen($w) + 1) >= $ilen) break;
        $okstr .= $w . " ";
    }
    return trim($okstr);
}
//
//把全角數字轉為半角數字
//
/**
 * Maps a numeral character to its half-width counterpart.
 *
 * The input is compared (loosely) with each entry of a lookup table; on
 * the first hit the character at the same position of the half-width
 * string is returned. Input that is not in the table is returned as is.
 *
 * NOTE(review): as shown, both tables hold the same ASCII characters.
 * Presumably the lookup table originally held full-width forms that were
 * lost when the file was transcoded -- confirm against the original file.
 *
 * @param string $fnum Character to convert.
 * @return string The mapped character, or $fnum when there is no mapping.
 */
function GetAlabNum($fnum)
{
    $halfWidth = "0123456789+-%.";
    $table = array("0","1","2","3","4","5","6","7","8","9","+","-","%",".");
    foreach ($table as $pos => $candidate) {
        if ($candidate == $fnum) {
            return $halfWidth[$pos];
        }
    }
    return $fnum;
}
}//End Class
?>
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -