<?php
/*
 * 基于Unicode编码词典的php分词器
 * 1、只适用于php5,必要函数 iconv
 * 2、本程序是使用RMM逆向匹配算法进行分词的,词库需要特别编译,本类里提供了 MakeDict() 方法
 * 3、简单操作流程: SetSource -> StartAnalysis -> Get***Result
 * 4、对主词典使用特殊格式进行编码, 不需要载入词典到内存操作
 *
 * Copyright IT柏拉图 QQ: 2500875 Email: 2500875#qq.com
 *
 * @version 2.0
 *
 */
  14. namespace app\common\services\wordanalysis;
  // Global constants used by the analyzer.
  // _SP_ is the byte pair 0xFF 0xFE; LoadDict() uses it as a separator between
  // words inside a UCS-2BE buffer.
  define('_SP_', "\xFF\xFE");
  // UCS2 is the iconv encoding name for the analyzer's internal strings.
  define('UCS2', 'ucs-2be');
  18. class PhpAnalysis
  19. {
  // Modulus applied by _get_index(); MakeDict() writes this many 8-byte
  // header slots at the start of the compiled dictionary.
  public $mask_value = 0xFFFF;

  // Input and output charsets (only utf-8, gbk/gb2312/gb18030 and big5 are handled).
  public $sourceCharSet = 'utf-8';
  public $targetCharSet = 'utf-8';

  // Result type: 1 = everything, 2 = dictionary words plus single CJK
  // characters and English, 3 = dictionary words and English.
  // The getters in this class only special-case the value 2 (they drop
  // segments of type 3 and 5).
  public $resultType = 1;

  // Sentences shorter than this many bytes are not split;
  // notSplitLen = n (Chinese characters) * 2 + 1.
  public $notSplitLen = 5;

  // Convert English words to lower case.
  public $toLower = false;

  // Use maximum-segmentation mode to disambiguate two-word combinations.
  public $differMax = false;

  // Try to merge adjacent single characters into a new word.
  public $unitWord = true;

  // Load the dictionaries directly from the constructor.
  public static $loadInit = true;

  // Disambiguate by preferring popular words (not read by any method of this class).
  public $differFreq = false;

  // Source text converted to UCS-2BE.
  private $sourceString = '';

  // Add-on dictionary: $addonDic[<section letter>][<UCS-2BE word>] = byte length.
  public $addonDic = array();
  public $addonDicFile = 'dict/words_addons.dic';

  // Main dictionary. $mainDicHand is the open file handle (false when closed),
  // $mainDicInfos caches buckets already read from the file.
  public $dicStr = '';
  public $mainDic = array();
  public $mainDicHand = false;
  public $mainDicInfos = array();
  public $mainDicFile = 'dict/base_dic_full.dic';

  // Whether to load the whole dictionary up front (slower load, faster
  // parsing) or fetch entries on demand. Stored by the constructor only.
  private $isLoadAll = false;

  // Longest main-dictionary word in bytes (characters * 2).
  private $dicWordMax = 14;

  // Coarse segmentation result (list of array('w' => text, 't' => type)).
  private $simpleResult = array();

  // Final segmentation result. Declared as an array because every method
  // that touches it indexes it as one.
  private $finallyResult = array();

  // Reset by SetSource(). Declared explicitly so the assignment there does
  // not create a dynamic property (deprecated since PHP 8.2).
  public $finallyIndex = array();

  // Whether LoadDict() has already run.
  public $isLoadDic = false;

  // New words recognised or merged during analysis, and their printable list.
  public $newWords = array();
  public $foundWordStr = '';

  // Time spent in LoadDict(), in seconds.
  public $loadTime = 0;
  /**
   * Create an analyzer and optionally hand it a first text.
   *
   * The dictionary paths are made absolute relative to this file, the text
   * and charsets are stored through SetSource(), and the dictionaries are
   * loaded immediately when the static $loadInit flag is true.
   *
   * @param string $source_charset charset of $source
   * @param string $target_charset charset used for returned results
   * @param bool   $load_all       stored in $isLoadAll
   * @param string $source         text to analyze (may be empty)
   *
   * @return void
   */
  public function __construct($source_charset='utf-8', $target_charset='utf-8', $load_all=true, $source='')
  {
      $baseDir = dirname(__FILE__);
      $this->addonDicFile = $baseDir.'/'.$this->addonDicFile;
      $this->mainDicFile = $baseDir.'/'.$this->mainDicFile;

      $this->SetSource($source, $source_charset, $target_charset);
      $this->isLoadAll = $load_all;

      if (self::$loadInit) {
          $this->LoadDict();
      }
  }
  /**
   * Destructor: closes the main dictionary handle when one is held.
   */
  public function __destruct()
  {
      if ($this->mainDicHand === false) {
          return;
      }
      @fclose($this->mainDicHand);
  }
  /**
   * Hash a key to its bucket index in the main dictionary.
   *
   * Bytes are consumed from the last to the first; each step computes
   * hash * 33 XOR byte and keeps the low 31 bits.
   *
   * @param string $key
   * @return int value in the range 0 .. $mask_value - 1
   */
  private function _get_index( $key )
  {
      $hash = 0x238f13af;
      for ($pos = strlen($key) - 1; $pos >= 0; $pos--) {
          $hash = ($hash + ($hash << 5)) ^ ord($key[$pos]);
          $hash &= 0x7fffffff;
      }
      return $hash % $this->mask_value;
  }
  /**
   * Look a word up in the main dictionary file.
   *
   * The file starts with one 8-byte slot per bucket (data offset, data
   * length, entry count) followed by serialized bucket arrays. Buckets that
   * were read once are cached in $mainDicInfos.
   *
   * @param string $key  word in UCS-2BE
   * @param string $type 'word' returns the entry stored for $key; any other
   *                     value returns the whole bucket containing $key
   * @return mixed the entry or bucket, or false when the dictionary cannot
   *               be read or the bucket does not contain $key
   */
  public function GetWordInfos( $key, $type='word' )
  {
      if (!$this->mainDicHand) {
          // Binary mode: the file holds packed integers and serialized data.
          $this->mainDicHand = fopen($this->mainDicFile, 'rb');
          if (!$this->mainDicHand) {
              // Without a handle the fseek()/fread() calls below would fail hard.
              return false;
          }
      }

      $keynum = $this->_get_index($key);
      if (isset($this->mainDicInfos[$keynum])) {
          $data = $this->mainDicInfos[$keynum];
      } else {
          fseek($this->mainDicHand, $keynum * 8, SEEK_SET);
          $dat = fread($this->mainDicHand, 8);
          if ($dat === false || strlen($dat) < 8) {
              // Truncated or foreign file: there is no complete header slot.
              return false;
          }
          $arr = unpack('I1s/n1l/n1c', $dat);
          if ($arr === false || $arr['l'] == 0) {
              return false;
          }
          fseek($this->mainDicHand, $arr['s'], SEEK_SET);
          // NOTE(review): unserialize() must only ever see a trusted, locally
          // built dictionary file.
          $data = @unserialize(fread($this->mainDicHand, $arr['l']));
          $this->mainDicInfos[$keynum] = $data;
      }

      if (!is_array($data) || !isset($data[$key])) {
          return false;
      }
      return ($type == 'word' ? $data[$key] : $data);
  }
  /**
   * Set the text to analyze and the charsets to use.
   *
   * The text is converted to UCS-2BE and stored in $sourceString. All
   * results of a previous analysis are cleared, and so is the previous text:
   * after a call that returns false there is nothing left to analyze.
   *
   * @param string $source         text to analyze
   * @param string $source_charset charset of $source; names starting with
   *                               "utf", "gb" or "big" are accepted, in any case
   * @param string $target_charset charset used for returned results
   *
   * @return bool true when the text was converted, false when it is empty,
   *              the charset is not supported or the conversion failed
   */
  public function SetSource( $source, $source_charset='utf-8', $target_charset='utf-8' )
  {
      $this->sourceCharSet = strtolower($source_charset);
      $this->targetCharSet = strtolower($target_charset);
      $this->simpleResult = array();
      $this->finallyResult = array();
      // Kept for callers that may read it; no method of this class does.
      $this->finallyIndex = array();
      // Never keep the text of an earlier call around.
      $this->sourceString = '';

      if ((string)$source === '') {
          return false;
      }

      // Test the lower-cased name so that 'UTF-8', 'GBK', ... are accepted too.
      $charset = $this->sourceCharSet;
      if (strncmp($charset, 'utf', 3) === 0) {
          $utf8 = $source;
      } elseif (strncmp($charset, 'gb', 2) === 0) {
          $utf8 = iconv('gb18030', 'utf-8', $source);
      } elseif (strncmp($charset, 'big', 3) === 0) {
          $utf8 = iconv('big5', 'utf-8', $source);
      } else {
          return false;
      }

      $converted = ($utf8 === false) ? false : iconv('utf-8', UCS2, $utf8);
      if ($converted === false) {
          return false;
      }

      $this->sourceString = $converted;
      return true;
  }
  /**
   * Choose which segments the final-result getters return.
   *
   * @param int $rstype 1 = everything, 2 = leave out symbol segments
   *                    (types 3 and 5)
   *
   * @return void
   */
  public function SetResultType( $rstype )
  {
      $this->resultType = $rstype;
  }
  /**
   * Open the main dictionary and load the add-on dictionary into memory.
   *
   * The add-on file is made of section headers ("x:" where x is a single
   * byte) and lines of comma-separated UTF-8 words belonging to the current
   * section. Every word is stored as
   * $addonDic[section][word in UCS-2BE] = byte length.
   *
   * @param string $maindic optional path of another main dictionary; ignored
   *                        when empty or when the file does not exist
   *
   * @return void
   */
  public function LoadDict( $maindic='' )
  {
      $startt = microtime(true);

      if ($maindic != '' && file_exists($maindic)) {
          $this->mainDicFile = $maindic;
      }

      // The main dictionary is only opened; entries are read on demand by
      // GetWordInfos(), which retries the open if it failed here.
      $this->mainDicHand = fopen($this->mainDicFile, 'rb');

      $lines = file($this->addonDicFile);
      if ($lines === false) {
          // file() has already raised its warning; there is nothing to iterate.
          $lines = array();
      }

      $section = '';
      foreach ($lines as $line) {
          $line = trim($line);
          if ($line == '') {
              continue;
          }
          if (substr($line, 1, 1) == ':') {
              $section = substr($line, 0, 1);
              continue;
          }
          // Convert word by word. Joining the words and splitting the
          // converted buffer on the bytes 0xFF 0xFE could cut inside a pair
          // of characters whose adjacent bytes happen to be FF and FE.
          foreach (explode(',', $line) as $word) {
              $ucs = iconv('utf-8', UCS2, $word);
              if ($ucs === false) {
                  continue;
              }
              $this->addonDic[$section][$ucs] = strlen($ucs);
          }
      }

      $this->loadTime = microtime(true) - $startt;
      $this->isLoadDic = true;
  }
  /**
   * Tell whether a word exists in the main dictionary.
   *
   * @param string $word word in UCS-2BE
   * @return bool
   */
  public function IsWord( $word )
  {
      return $this->GetWordInfos($word) !== false;
  }
  /**
   * Build the "/<entry[1]><entry[0]>" suffix for a word from its dictionary
   * entry.
   *
   * @param string $word word in UCS-2BE
   * @return string the suffix, or "/s" for words shorter than 4 bytes and
   *                for words without a usable dictionary entry
   */
  public function GetWordProperty($word)
  {
      if (strlen($word) >= 4) {
          $infos = $this->GetWordInfos($word);
          if (isset($infos[1])) {
              return "/{$infos[1]}{$infos[0]}";
          }
      }
      return '/s';
  }
  /**
   * Register information for a word (normally a newly recognised one).
   *
   * A word seen for the first time is stored with a count of 1; a word that
   * is already registered has its count and its 'c' value incremented.
   * Words shorter than 4 bytes are ignored.
   *
   * @param string $word  word in UCS-2BE
   * @param array  $infos array('c' => frequency, 'm' => part of speech)
   * @return void
   */
  public function SetWordInfos($word, $infos)
  {
      if (strlen($word) < 4) {
          return;
      }

      if (!isset($this->mainDicInfos[$word])) {
          $this->newWords[$word] = 1;
          $this->mainDicInfos[$word] = $infos;
          return;
      }

      $this->newWords[$word]++;
      $this->mainDicInfos[$word]['c']++;
  }
  /**
   * Run the analysis on the text given to SetSource().
   *
   * The UCS-2BE text is first cut into coarse runs stored in $simpleResult
   * (array('w' => text, 't' => type)); CJK and English runs are refined by
   * _deep_analysis() and everything is finally flattened by
   * _sort_finally_result(). Run types:
   *   1 CJK text, 2 English / digits / the symbols . @ # % + -,
   *   3 ANSI symbol, 4 digits only, 5 non-ANSI symbol or unsupported
   *   character, 13 book title, 21 book title kept for further segmentation.
   *
   * @param bool $optimize whether refined CJK runs go through _optimize_result()
   * @return void
   */
  public function StartAnalysis($optimize=true)
  {
      if (!$this->isLoadDic) {
          $this->LoadDict();
      }
      $this->simpleResult = $this->finallyResult = array();

      // Work on a copy terminated by a UCS-2 space so the last run is always
      // flushed; the stored text itself must not grow on every call.
      $source = $this->sourceString.chr(0).chr(32);
      $slen = strlen($source);

      // Full-width (U+FF00..U+FF5E) to half-width (0x20..0x7E) table.
      $sbcArr = array();
      for ($i = 0xFF00; $i < 0xFF5F; $i++) {
          $sbcArr[$i] = 0x20 + ($i - 0xFF00);
      }

      $onstr = '';   // run being collected
      $lastc = 1;    // type of that run
      $s = 0;        // next free index in $simpleResult
      $ansiWordPattern = '/[0-9a-z@#%\+\.-]/i';

      for ($i = 0; $i + 1 < $slen; $i++) {
          // One UCS-2BE code unit per iteration.
          $c = $source[$i].$source[++$i];
          $cn = hexdec(bin2hex($c));
          $cn = isset($sbcArr[$cn]) ? $sbcArr[$cn] : $cn;

          // ANSI character
          if ($cn < 0x80) {
              if (preg_match($ansiWordPattern, chr($cn))) {
                  if ($lastc != 2 && $onstr !== '') {
                      $this->_flush_pending_segment($onstr, $lastc, $s, $optimize);
                      $onstr = '';
                  }
                  $lastc = 2;
                  $onstr .= chr(0).chr($cn);
                  continue;
              }

              if ($onstr !== '') {
                  $this->_flush_pending_segment($onstr, $lastc, $s, $optimize);
              }
              $onstr = '';
              $lastc = 3;
              if ($cn < 31) {
                  // Control characters are dropped.
                  continue;
              }
              $this->simpleResult[$s]['w'] = chr(0).chr($cn);
              $this->simpleResult[$s]['t'] = 3;
              $s++;
              continue;
          }

          // Regular text (the ranges accepted as CJK)
          if (($cn > 0x3FFF && $cn < 0x9FA6) || ($cn > 0xF8FF && $cn < 0xFA2D)
              || ($cn > 0xABFF && $cn < 0xD7A4) || ($cn > 0x3040 && $cn < 0x312B)) {
              if ($lastc != 1 && $onstr !== '') {
                  $this->_flush_pending_segment($onstr, $lastc, $s, $optimize);
                  $onstr = '';
              }
              $lastc = 1;
              $onstr .= $c;
              continue;
          }

          // Special symbol. $lastc is deliberately left alone here: the
          // book-title branch below still passes it to _deep_analysis().
          if ($onstr !== '') {
              $this->_flush_pending_segment($onstr, $lastc, $s, $optimize);
          }
          $onstr = '';

          // Book title: text between U+300A and U+300B
          if ($cn == 0x300A) {
              $tmpw = '';
              $n = 1;
              $isok = false;
              $ew = chr(0x30).chr(0x0B);
              while (true) {
                  if (!isset($source[$i + $n + 1])) {
                      break;
                  }
                  $w = $source[$i + $n].$source[$i + $n + 1];
                  // Strict comparison is required: "0\x0B" is a numeric string
                  // on PHP 8, so == would also accept "0\t", "0\n", "0\f",
                  // "0\r" and "0 " (U+3009, U+300A, U+300C, U+300D, U+3020).
                  if ($w === $ew) {
                      $this->simpleResult[$s]['w'] = $c;
                      $this->simpleResult[$s]['t'] = 5;
                      $s++;

                      $this->simpleResult[$s]['w'] = $tmpw;
                      // Test before marking the title as seen, otherwise it
                      // would never be reported as a new word.
                      $isNewTitle = !isset($this->newWords[$tmpw]);
                      $this->newWords[$tmpw] = 1;
                      if ($isNewTitle) {
                          $this->foundWordStr .= $this->_out_string_encoding($tmpw).'/nb, ';
                          $this->SetWordInfos($tmpw, array('c'=>1, 'm'=>'nb'));
                      }
                      $this->simpleResult[$s]['t'] = 13;
                      $s++;

                      // Maximum-segmentation mode: segment the title as well.
                      // NOTE(review): $lastc is the type of the run before the
                      // title, so the title is only dictionary-segmented when
                      // that type is 1 - confirm this is intended.
                      if ($this->differMax) {
                          $this->simpleResult[$s]['w'] = $tmpw;
                          $this->simpleResult[$s]['t'] = 21;
                          $this->_deep_analysis($tmpw, $lastc, $s, $optimize);
                          $s++;
                      }

                      $this->simpleResult[$s]['w'] = $ew;
                      $this->simpleResult[$s]['t'] = 5;
                      $s++;

                      // Continue after the closing mark.
                      $i = $i + $n + 1;
                      $isok = true;
                      $lastc = 5;
                      break;
                  }

                  $n = $n + 2;
                  $tmpw .= $w;
                  if (strlen($tmpw) > 60) {
                      // Too long to be a title.
                      break;
                  }
              }
              if (!$isok) {
                  // No closing mark: keep the opening mark as a plain symbol.
                  $this->simpleResult[$s]['w'] = $c;
                  $this->simpleResult[$s]['t'] = 5;
                  $s++;
                  $lastc = 5;
              }
              continue;
          }

          $lastc = 5;
          if ($cn == 0x3000) {
              // Ideographic space is dropped.
              continue;
          }
          $this->simpleResult[$s]['w'] = $c;
          $this->simpleResult[$s]['t'] = 5;
          $s++;
      }

      // Flatten the coarse and refined results.
      $this->_sort_finally_result();
  }

  /**
   * Store the collected run as coarse segment $s and refine it.
   *
   * A run of type 2 without any letter or one of @ # % + is re-typed as 4
   * (digits only); such runs are not refined any further.
   *
   * @param string $onstr    the collected run (UCS-2BE)
   * @param int    $lastc    run type, updated when the run is re-typed
   * @param int    $s        segment index, incremented on return
   * @param bool   $optimize passed through to _deep_analysis()
   * @return void
   */
  private function _flush_pending_segment($onstr, &$lastc, &$s, $optimize)
  {
      $this->simpleResult[$s]['w'] = $onstr;
      if ($lastc == 2 && !preg_match('/[a-z@#%\+]/i', iconv(UCS2, 'utf-8', $onstr))) {
          $lastc = 4;
      }
      $this->simpleResult[$s]['t'] = $lastc;
      if ($lastc != 4) {
          $this->_deep_analysis($onstr, $lastc, $s, $optimize);
      }
      $s++;
  }
  /**
   * Refine one coarse run.
   *
   * Runs of type 1 (CJK) are segmented: very short ones are either merged
   * with a preceding number when they start with a unit word, or kept
   * whole; longer ones go to _deep_analysis_cn(). Runs of any other type
   * are kept whole, lower-cased when $toLower is set.
   *
   * @param string $str      the run in UCS-2BE; may be shortened in place
   *                         when a unit word is split off
   * @param int    $ctype    run type
   * @param int    $spos     index of the run in $simpleResult
   * @param bool   $optimize passed through to _deep_analysis_cn()
   * @return void
   */
  private function _deep_analysis( &$str, $ctype, $spos, $optimize=true )
  {
      // Anything but CJK text is stored as a single word.
      if ($ctype != 1) {
          $this->finallyResult[$spos][] = $this->toLower ? strtolower($str) : $str;
          return;
      }

      $slen = strlen($str);

      // Normal length: dictionary segmentation.
      if ($slen >= $this->notSplitLen || $slen >= 5) {
          $this->_deep_analysis_cn($str, $ctype, $spos, $slen, $optimize);
          return;
      }

      // At most two characters from here on.
      $lastType = ($spos > 0) ? $this->simpleResult[$spos-1]['t'] : 0;
      $wholeIsUnit = isset($this->addonDic['u'][$str]);
      $startsWithUnit = $wholeIsUnit || isset($this->addonDic['u'][substr($str, 0, 2)]);

      if ($lastType != 4 || !$startsWithUnit) {
          $this->finallyResult[$spos][] = $str;
          return;
      }

      // Number followed by a unit word: merge the unit into the number.
      $str2 = '';
      if (!$wholeIsUnit && isset($this->addonDic['s'][substr($str, 2, 2)])) {
          // The second character is listed in section 's': keep it apart.
          $str2 = substr($str, 2, 2);
          $str = substr($str, 0, 2);
      }

      $ww = $this->simpleResult[$spos - 1]['w'].$str;
      $this->simpleResult[$spos - 1]['w'] = $ww;
      $this->simpleResult[$spos - 1]['t'] = 4;
      if (!isset($this->newWords[$ww])) {
          $this->foundWordStr .= $this->_out_string_encoding($ww).'/mu, ';
          $this->SetWordInfos($ww, array('c'=>1, 'm'=>'mu'));
      }

      // The run now lives in the previous segment.
      $this->simpleResult[$spos]['w'] = '';
      if ($str2 != '') {
          $this->finallyResult[$spos-1][] = $ww;
          $this->finallyResult[$spos-1][] = $str2;
      }
  }
  /**
   * Segment a CJK run by reverse maximum matching against the dictionary.
   *
   * The run is scanned from its end; at every position the longest
   * dictionary word ending there (at most $dicWordMax bytes) is taken, or a
   * single character when no word matches. A run of fewer than 11 bytes
   * that directly follows the opening quote U+201C is first recorded whole
   * as a new word.
   *
   * @param string $str      the run in UCS-2BE
   * @param int    $lastec   run type (not used)
   * @param int    $spos     index of the run in $simpleResult
   * @param int    $slen     byte length of $str
   * @param bool   $optimize whether to post-process with _optimize_result()
   * @return void
   */
  private function _deep_analysis_cn( &$str, $lastec, $spos, $slen, $optimize=true )
  {
      $openQuote = chr(0x20).chr(0x1C);
      $pieces = array();

      $followsQuote = $spos > 0 && $slen < 11 && $this->simpleResult[$spos-1]['w'] == $openQuote;
      if ($followsQuote) {
          $pieces[] = $str;
          if (!isset($this->newWords[$str])) {
              $this->foundWordStr .= $this->_out_string_encoding($str).'/nq, ';
              $this->SetWordInfos($str, array('c'=>1, 'm'=>'nq'));
          }
          if (!$this->differMax) {
              $this->finallyResult[$spos][] = $str;
              return;
          }
      }

      // $pos is the offset of the second byte of the character under the cursor.
      for ($pos = $slen - 1; $pos > 0; $pos -= 2) {
          $single = $str[$pos-1].$str[$pos];

          // First character of the run: nothing left to match against.
          if ($pos <= 2) {
              $pieces[] = $single;
              break;
          }

          $matched = false;
          // From here on $pos is the end offset (exclusive) of the candidates.
          $pos = $pos + 1;
          for ($len = $this->dicWordMax; $len > 1; $len -= 2) {
              if ($pos < $len) {
                  continue;
              }
              $candidate = substr($str, $pos - $len, $len);
              if (strlen($candidate) <= 2) {
                  // Down to a single character: restore the cursor.
                  $pos = $pos - 1;
                  break;
              }
              if ($this->IsWord($candidate)) {
                  $pieces[] = $candidate;
                  $pos = $pos - $len + 1;
                  $matched = true;
                  break;
              }
          }

          // No dictionary word ends here.
          if (!$matched) {
              $pieces[] = $single;
          }
      }

      if (count($pieces) == 0) {
          return;
      }

      // The pieces were collected back to front.
      $this->finallyResult[$spos] = array_reverse($pieces);

      // Disambiguation, new words, numerals, person names, ...
      if ($optimize) {
          $this->_optimize_result($this->finallyResult[$spos], $spos);
      }
  }
  /**
   * Post-process the words of one segmented CJK run.
   *
   * Adjacent words are merged by these rules, tried in this order:
   *   - number word (section 'c') + unit word (section 'u')      -> '/mu'
   *   - leading word from section 'n' (usually a surname) + next -> '/nr'
   *   - word + suffix from section 'a' (place names etc.)        -> '/na'
   *   - two single characters, when $unitWord is set             -> '/ms'
   * Before that, a leading unit word is attached to the preceding coarse
   * segment when that segment is a number. With $differMax set, the
   * unmerged parts are kept next to some merged words, wrapped in
   * "(" ... ")".
   *
   * Run types: 1 CJK text, 2 English / digits / symbols, 3 ANSI symbol,
   * 4 digits only, 5 non-ANSI symbol or unsupported character.
   *
   * @param array $smarr words of the run (UCS-2BE); replaced by the result
   * @param int   $spos  index of the run in $simpleResult
   * @return void
   */
  private function _optimize_result( &$smarr, $spos )
  {
      $newarr = array();
      $prePos = $spos - 1;
      $arlen = count($smarr);
      $i = $j = 0;

      // Number in the previous coarse segment + unit word at the start of this run.
      if ($prePos > -1 && !isset($this->finallyResult[$prePos])) {
          $lastw = $this->simpleResult[$prePos]['w'];
          $lastt = $this->simpleResult[$prePos]['t'];
          if (($lastt == 4 || isset($this->addonDic['c'][$lastw])) && isset($this->addonDic['u'][$smarr[0]])) {
              $merged = $lastw.$smarr[0];
              $this->simpleResult[$prePos]['w'] = $merged;
              $this->simpleResult[$prePos]['t'] = 4;
              if (!isset($this->newWords[$merged])) {
                  $this->foundWordStr .= $this->_out_string_encoding($merged).'/mu, ';
                  $this->SetWordInfos($merged, array('c'=>1, 'm'=>'mu'));
              }
              $smarr[0] = '';
              $i++;
          }
      }

      for (; $i < $arlen; $i++) {
          // Last word: nothing to pair it with.
          if (!isset($smarr[$i+1])) {
              $newarr[$j] = $smarr[$i];
              break;
          }
          $cw = $smarr[$i];
          $nw = $smarr[$i+1];
          $ischeck = false;

          // Number word + unit word
          if (isset($this->addonDic['c'][$cw]) && isset($this->addonDic['u'][$nw])) {
              // Maximum segmentation keeps the parts as well.
              if ($this->differMax) {
                  $newarr[$j] = chr(0).chr(0x28);
                  $j++;
                  $newarr[$j] = $cw;
                  $j++;
                  $newarr[$j] = $nw;
                  $j++;
                  $newarr[$j] = chr(0).chr(0x29);
                  $j++;
              }
              $newarr[$j] = $cw.$nw;
              if (!isset($this->newWords[$newarr[$j]])) {
                  $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/mu, ';
                  $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'mu'));
              }
              $j++; $i++; $ischeck = true;
          }
          // Leading word (usually a surname)
          else if (isset($this->addonDic['n'][$cw])) {
              $is_rs = false;
              // Pronouns, conjunctions and very frequent words are not names.
              if (strlen($nw) == 4) {
                  list($pos, $freq) = $this->_lookup_pos_freq($nw);
                  if ($pos !== null && ($pos == 'r' || $pos == 'c' || $freq > 500)) {
                      $is_rs = true;
                  }
              }
              if (!isset($this->addonDic['s'][$nw]) && strlen($nw) < 5 && !$is_rs) {
                  $newarr[$j] = $cw.$nw;
                  // Try to take a third, single-character word as well.
                  if (strlen($nw) == 2 && isset($smarr[$i+2]) && strlen($smarr[$i+2]) == 2 && !isset($this->addonDic['s'][$smarr[$i+2]])) {
                      $newarr[$j] .= $smarr[$i+2];
                      $i++;
                  }
                  if (!isset($this->newWords[$newarr[$j]])) {
                      $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'nr'));
                      $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/nr, ';
                  }
                  // Guard against a wrong merge: keep the parts too.
                  if (strlen($nw) == 4) {
                      $j++;
                      $newarr[$j] = chr(0).chr(0x28);
                      $j++;
                      $newarr[$j] = $cw;
                      $j++;
                      $newarr[$j] = $nw;
                      $j++;
                      $newarr[$j] = chr(0).chr(0x29);
                  }
                  $j++; $i++; $ischeck = true;
              }
          }
          // Suffix word (place names etc.)
          else if (isset($this->addonDic['a'][$nw])) {
              $is_rs = false;
              // Adjectives, pronouns, conjunctions and very frequent words
              // are not used as the head of a suffixed word.
              if (strlen($cw) > 2) {
                  list($pos, $freq) = $this->_lookup_pos_freq($cw);
                  if ($pos !== null && ($pos == 'a' || $pos == 'r' || $pos == 'c' || $freq > 500)) {
                      $is_rs = true;
                  }
              }
              if (!isset($this->addonDic['s'][$cw]) && !$is_rs) {
                  $newarr[$j] = $cw.$nw;
                  if (!isset($this->newWords[$newarr[$j]])) {
                      $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/na, ';
                      $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'na'));
                  }
                  $i++; $j++; $ischeck = true;
              }
          }
          // New-word guess: two adjacent single characters
          else if ($this->unitWord) {
              if (strlen($cw) == 2 && strlen($nw) == 2
                  && !isset($this->addonDic['s'][$cw]) && !isset($this->addonDic['t'][$cw]) && !isset($this->addonDic['a'][$cw])
                  && !isset($this->addonDic['s'][$nw]) && !isset($this->addonDic['c'][$nw])) {
                  $newarr[$j] = $cw.$nw;
                  // Try to take a third word that is a suffix or a unit.
                  if (isset($smarr[$i+2]) && strlen($smarr[$i+2]) == 2 && (isset($this->addonDic['a'][$smarr[$i+2]]) || isset($this->addonDic['u'][$smarr[$i+2]]))) {
                      $newarr[$j] .= $smarr[$i+2];
                      $i++;
                  }
                  if (!isset($this->newWords[$newarr[$j]])) {
                      $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/ms, ';
                      $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'ms'));
                  }
                  $i++; $j++; $ischeck = true;
              }
          }

          // No rule applied
          if (!$ischeck) {
              $newarr[$j] = $cw;
              // Two-word disambiguation in maximum-segmentation mode: also emit
              // dictionary words made of $cw plus a prefix of $nw.
              if ($this->differMax && !isset($this->addonDic['s'][$cw]) && strlen($cw) < 5 && strlen($nw) < 7) {
                  $slen = strlen($nw);
                  for ($y = 2; $y <= $slen - 2; $y = $y + 2) {
                      $nhead = substr($nw, $y - 2, 2);
                      $nfont = $cw.substr($nw, 0, $y - 2);
                      if ($this->IsWord($nfont.$nhead)) {
                          if (strlen($cw) > 2) $j++;
                          $newarr[$j] = $nfont.$nhead;
                      }
                  }
              }
              $j++;
          }
      }

      $smarr = $newarr;
  }

  /**
   * Fetch the part of speech and the frequency of a dictionary word.
   *
   * Entries written by MakeDict() are plain lists: GetWordProperty() prints
   * element 1 as the tag and element 0 after it, so element 1 is taken as
   * the part of speech and element 0 as the frequency. Entries using the
   * 'm' / 'c' keys documented on SetWordInfos() are accepted as well.
   *
   * @param string $word word in UCS-2BE
   * @return array array(part of speech or null, frequency)
   */
  private function _lookup_pos_freq($word)
  {
      $infos = $this->GetWordInfos($word);
      $pos = null;
      $freq = 0;
      if (is_array($infos)) {
          if (isset($infos['m'])) {
              $pos = $infos['m'];
          } elseif (isset($infos[1])) {
              $pos = $infos[1];
          }
          if (isset($infos['c'])) {
              $freq = $infos['c'];
          } elseif (isset($infos[0])) {
              $freq = $infos[0];
          }
      }
      return array($pos, $freq);
  }
  /**
   * Flatten the coarse and refined results into $finallyResult.
   *
   * Every coarse segment contributes either its refined words (type 20) or,
   * when it has none, itself with its own type. Segments with empty text
   * and unrefined segments of type 21 are left out.
   *
   * @return void
   */
  private function _sort_finally_result()
  {
      $flat = array();
      foreach ($this->simpleResult as $k => $v) {
          if (empty($v['w'])) {
              continue;
          }

          $refined = isset($this->finallyResult[$k]) ? $this->finallyResult[$k] : array();
          if (count($refined) > 0) {
              foreach ($refined as $w) {
                  if (!empty($w)) {
                      $flat[] = array('w' => $w, 't' => 20);
                  }
              }
              continue;
          }

          if ($v['t'] != 21) {
              $flat[] = array('w' => $v['w'], 't' => $v['t']);
          }
      }
      $this->finallyResult = $flat;
  }
  /**
   * Convert a UCS-2BE string to the target charset.
   *
   * @param string $str string in UCS-2BE
   * @return string|false the string in utf-8 or gb18030 as selected by
   *                      _source_result_charset(), or in big5 for every
   *                      other target charset
   */
  private function _out_string_encoding( &$str )
  {
      $target = $this->_source_result_charset();
      $utf8 = iconv(UCS2, 'utf-8', $str);

      switch ($target) {
          case 1:
              return $utf8;
          case 2:
              return iconv('utf-8', 'gb18030', $utf8);
          default:
              return iconv('utf-8', 'big5', $utf8);
      }
  }
  /**
   * Return the final result as one string.
   *
   * Every word is preceded by $spword, so the string starts with a
   * separator. With result type 2, segments of type 3 and 5 are left out.
   *
   * @param string $spword        separator written before each word
   * @param bool   $word_meanings append the GetWordProperty() suffix to each word
   * @return string
   */
  public function GetFinallyResult($spword=' ', $word_meanings=false)
  {
      $rsstr = '';
      foreach ($this->finallyResult as $v) {
          $isSymbol = ($v['t'] == 3 || $v['t'] == 5);
          if ($this->resultType == 2 && $isSymbol) {
              continue;
          }

          $m = $word_meanings ? $this->GetWordProperty($v['w']) : '';
          $w = $this->_out_string_encoding($v['w']);
          if ($w == ' ') {
              continue;
          }
          $rsstr .= $spword.$w.$m;
      }
      return $rsstr;
  }
  /**
   * Return the text of the coarse segments, without their types.
   *
   * Empty segments and single spaces are left out.
   *
   * @return array list of strings in the target charset
   */
  public function GetSimpleResult()
  {
      $rearr = array();
      foreach ($this->simpleResult as $v) {
          if (empty($v['w'])) {
              continue;
          }
          $w = $this->_out_string_encoding($v['w']);
          if ($w == ' ') {
              continue;
          }
          $rearr[] = $w;
      }
      return $rearr;
  }
  /**
   * Return the coarse segments together with their types.
   *
   * Types: 1 CJK text, 2 ANSI words (full-width included), 3 ANSI
   * punctuation (full-width included), 4 digits (full-width included),
   * 5 CJK punctuation or unrecognised character. Single spaces are left
   * out; the original segment indexes are kept as keys.
   *
   * @return array array(index => array('w' => text, 't' => type))
   */
  public function GetSimpleResultAll()
  {
      $rearr = array();
      foreach ($this->simpleResult as $k => $v) {
          $w = $this->_out_string_encoding($v['w']);
          if ($w == ' ') {
              continue;
          }
          $rearr[$k] = array('w' => $w, 't' => $v['t']);
      }
      return $rearr;
  }
  /**
   * Count how often each word occurs in the final result.
   *
   * With result type 2, segments of type 3 and 5 are left out; single
   * spaces are always left out.
   *
   * @return array array(word => count), highest count first
   */
  public function GetFinallyIndex()
  {
      $counts = array();
      foreach ($this->finallyResult as $v) {
          if ($this->resultType == 2 && ($v['t'] == 3 || $v['t'] == 5)) {
              continue;
          }
          $w = $this->_out_string_encoding($v['w']);
          if ($w == ' ') {
              continue;
          }
          $counts[$w] = isset($counts[$w]) ? $counts[$w] + 1 : 1;
      }
      arsort($counts);
      return $counts;
  }
  /**
   * Return the most frequent words of the final result as keywords.
   *
   * Words are taken in the order of GetFinallyIndex(). One-byte words,
   * two-byte words containing anything but ASCII letters and digits, and
   * words shorter than four bytes without an ASCII letter (e.g. a single
   * Chinese character) are skipped.
   *
   * @param int $num maximum number of keywords
   * @return string keywords separated by ","
   */
  public function GetFinallyKeywords( $num = 10 )
  {
      $picked = array();
      foreach ($this->GetFinallyIndex() as $k => $v) {
          // Stop as soon as the limit is reached, so at most $num are returned.
          if (count($picked) >= $num) {
              break;
          }

          // Purely numeric words come back as integer array keys.
          $k = (string)$k;
          $len = strlen($k);

          // Words one byte long
          if ($len == 1) {
              continue;
          }
          // Two-byte words that are not plain ASCII letters or digits
          if ($len == 2 && preg_match('/[^0-9a-zA-Z]/', $k)) {
              continue;
          }
          // Single Chinese characters
          if ($len < 4 && !preg_match('/[a-zA-Z]/', $k)) {
              continue;
          }

          $picked[] = $k;
      }
      return implode(',', $picked);
  }
  /**
   * Classify the target charset by the prefix of its name.
   *
   * @return int 1 for "utf...", 2 for "gb...", 3 for "big...", 4 otherwise
   */
  private function _source_result_charset()
  {
      $codes = array('utf' => 1, 'gb' => 2, 'big' => 3);
      foreach ($codes as $prefix => $code) {
          if (strncmp($this->targetCharSet, $prefix, strlen($prefix)) === 0) {
              return $code;
          }
      }
      return 4;
  }
  /**
   * Compile a text dictionary into the binary format read by GetWordInfos().
   *
   * Source lines have the form "word,field2,field3" in UTF-8; lines starting
   * with "@" and lines with fewer than three fields are skipped. The target
   * file starts with $mask_value slots of 8 bytes (data offset, data length,
   * entry count) followed by the serialized buckets.
   * PHP needs enough memory to hold the whole dictionary.
   *
   * NOTE(review): the offset is packed with the machine-dependent "I"
   * format, so a dictionary is only portable between machines with the same
   * integer size and byte order.
   *
   * @param string $source_file UTF-8 text dictionary
   * @param string $target_file file to write; defaults to the main dictionary
   * @return void
   */
  public function MakeDict( $source_file, $target_file='' )
  {
      $target_file = ($target_file == '' ? $this->mainDicFile : $target_file);

      $in = fopen($source_file, 'rb');
      if ($in === false) {
          // Give up before the target is touched: continuing would replace
          // it with an empty dictionary.
          return;
      }

      $allk = array();
      while (($line = fgets($in, 512)) !== false) {
          if ($line[0] == '@') {
              continue;
          }
          $fields = explode(',', $line);
          if (count($fields) < 3) {
              // Blank or malformed line.
              continue;
          }
          list($w, $r, $a) = $fields;
          $a = trim($a);
          $w = iconv('utf-8', UCS2, $w);
          if ($w === false || $w === '') {
              continue;
          }
          $allk[$this->_get_index($w)][$w] = array($r, $a);
      }
      fclose($in);

      $out = fopen($target_file, 'wb');
      if ($out === false) {
          return;
      }

      $header = array();
      $alldat = '';
      // The data area starts right after the header.
      $start_pos = $this->mask_value * 8;
      foreach ($allk as $k => $v) {
          $dat = serialize($v);
          $dlen = strlen($dat);
          if ($dlen > 0xFFFF) {
              // The length is stored in 16 bits; a longer bucket cannot be read back.
              trigger_error("MakeDict: bucket {$k} is {$dlen} bytes long and does not fit the 16-bit length field", E_USER_WARNING);
          }
          $alldat .= $dat;
          $header[$k] = array($start_pos, $dlen, count($v));
          $start_pos += $dlen;
      }
      unset($allk);

      for ($i = 0; $i < $this->mask_value; $i++) {
          $slot = isset($header[$i]) ? $header[$i] : array(0, 0, 0);
          fwrite($out, pack('Inn', $slot[0], $slot[1], $slot[2]));
      }
      fwrite($out, $alldat);
      fclose($out);
  }
  /**
   * Export the entries of the main dictionary as text.
   *
   * One line "word,entry[0],entry[1]" is written per dictionary entry, the
   * word converted to UTF-8.
   *
   * @param string $targetfile file to write
   * @return bool true on success, false when the dictionary or the target
   *              file cannot be opened
   */
  public function ExportDict( $targetfile )
  {
      if (!$this->mainDicHand) {
          $this->mainDicHand = fopen($this->mainDicFile, 'rb');
      }
      if (!$this->mainDicHand) {
          // Checked before the target is opened so it is not truncated for nothing.
          return false;
      }

      $fp = fopen($targetfile, 'w');
      if ($fp === false) {
          return false;
      }

      // The header holds exactly $mask_value slots (0 .. $mask_value - 1).
      for ($i = 0; $i < $this->mask_value; $i++) {
          fseek($this->mainDicHand, $i * 8, SEEK_SET);
          $dat = fread($this->mainDicHand, 8);
          if ($dat === false || strlen($dat) < 8) {
              // Truncated header: there is nothing more to read.
              break;
          }
          $arr = unpack('I1s/n1l/n1c', $dat);
          if ($arr === false || $arr['l'] == 0) {
              continue;
          }

          fseek($this->mainDicHand, $arr['s'], SEEK_SET);
          $data = @unserialize(fread($this->mainDicHand, $arr['l']));
          if (!is_array($data)) {
              continue;
          }
          foreach ($data as $k => $v) {
              $w = iconv(UCS2, 'utf-8', $k);
              fwrite($fp, "{$w},{$v[0]},{$v[1]}\n");
          }
      }

      fclose($fp);
      return true;
  }
  1077. }
  1078. ?>