scwsカスタム辞書


CleverCode は scws による単語分割(分かち書き)の効率が高いことに気づき、カスタム辞書の使い方を調べた。
1 scwsのインストール
インストールの詳細:http://blog.csdn.net/clevercode/article/details/52204124.
2 カスタム辞書を追加していない場合
2.1 phpコード
# vim parseWord.php
set_dict($dictPath);

    //      
	$myDictPath = ini_get('scws.default.fpath').'/mydict.xdb';
	if(file_exists($myDictPath))
	{
	    //$cws->add_dict($myDictPath);
	}
	$cws->set_ignore(true);

	$utf8Str = iconv("GBK","UTF-8//IGNORE",$str);
	$cws->send_text($utf8Str);
	$resArr = array();
	while($tmp = $cws->get_result())
	{
		$resArr[] = $tmp;
	}
	$cws->close();
	
	return $resArr;
}

/**
 * Demo entry point: segments a sample string with parse() and prints both
 * the input and the segmentation result.
 *
 * The result array is rendered with print_r() and converted from UTF-8 to
 * GBK (dropping unconvertible characters via //IGNORE) before being echoed.
 *
 * NOTE(review): the literal assigned to $key and the label text inside the
 * echoed strings contain only whitespace here; they look like they lost
 * their original non-ASCII content — confirm against the original script.
 */
function start()
{
	// Sample text handed to the segmenter.
	$key = '        ';
	
	$words_array = parse($key); 
    // Passing true makes print_r() return the dump instead of printing it.
    $str = print_r($words_array,true);
	echo '

:'.$key.'

'."\r
"; echo '

:'.iconv("UTF-8","GBK//IGNORE",$str); } start(); ?>


2.2 「リオ五輪 洪荒の力」を単語分割した結果

:Array ( [0] => Array ( [0] => Array ( [word] => [off] => 0 [len] => 6 [idf] => 15.119999885559 [attr] => ns ) [1] => Array ( [word] => [off] => 6 [len] => 6 [idf] => 4.8800001144409 [attr] => n ) [2] => Array ( [word] => [off] => 12 [len] => 6 [idf] => 8.0500001907349 [attr] => n ) [3] => Array ( [word] => [off] => 18 [len] => 3 [idf] => 0 [attr] => r ) [4] => Array ( [word] => [off] => 21 [len] => 3 [idf] => 0 [attr] => n ) ) )


3カスタム辞書の追加
3.1辞書の生成
# vim genMyDict.php
init();

        $this->deleteOldFile();

        $words = $this->getMyWordData();

        $this->write2File($words, $this->myNewDictTxt);

        $this->genMyDict();
    }/*}}}*/

    /**
     * Compute the dictionary file locations and store them on the instance.
     *
     * All three paths live in the directory named by the
     * `scws.default.fpath` ini setting:
     *  - myNewDictTxt: text file the word list is written to
     *  - myNewDictXdb: freshly generated xdb file
     *  - myDictXdb:    final dictionary file (mydict.xdb)
     */
    private function init()
    {/*{{{*/
        $dictDir = ini_get('scws.default.fpath');

        $this->myNewDictTxt = "{$dictDir}/myNewDict.txt";
        $this->myNewDictXdb = "{$dictDir}/myNewDict.xdb";
        $this->myDictXdb    = "{$dictDir}/mydict.xdb";
    }/*}}}*/

    /**
     * Supply the custom words that are written into the dictionary.
     *
     * NOTE(review): both literals consist solely of spaces here; they look
     * like words whose original text was lost — confirm against the
     * original script before relying on this list.
     *
     * @return array list of word strings
     */
    function getMyWordData()
    {/*{{{*/
        return array('    ', '    ');
    }/*}}}*/

    /**
     * Remove the intermediate dictionary files left over from an earlier run.
     *
     * Deletes the word-list text file and the freshly generated xdb file;
     * `rm -f` keeps this silent when a file does not exist.
     *
     * @return void
     */
    public function deleteOldFile()
    {/*{{{*/
        $this->msgLog('INFO',"     ");

        // The paths derive from an ini setting, so quote them: an unescaped
        // path containing spaces or shell metacharacters would otherwise be
        // split or interpreted by the shell.
        exec('rm -f '.escapeshellarg($this->myNewDictTxt));
        exec('rm -f '.escapeshellarg($this->myNewDictXdb));
    }/*}}}*/

    /**
     * Append the given words to a dictionary source text file.
     *
     * Each word is converted from GBK to UTF-8, trimmed, and written as one
     * tab-separated line: word, two numeric columns fixed at 10.00, and the
     * attribute "n". Words that are empty after trimming are skipped.
     *
     * @param array  $words words to write (GBK-encoded strings)
     * @param string $path  file the lines are appended to
     * @return void
     */
    private function write2File(array $words, $path)
    {/*{{{*/
        foreach($words as $word)
        {
            $utf8Word = trim(mb_convert_encoding($word, 'utf-8', 'gbk'));
            if($utf8Word === '')
            {
                continue;
            }

            $line = sprintf("%s\t%.2f\t%.2f\t%.2s\n", $utf8Word, 10.00, 10.00, "n");
            // Log in GBK, matching the encoding used for console output.
            $this->msgLog("INFO",mb_convert_encoding($line, 'gbk', 'utf-8'));
            file_put_contents($path, $line, FILE_APPEND);
        }
    }/*}}}*/

    /**
     * Compile the word-list text file into an xdb dictionary and install it.
     *
     * Runs scws-gen-dict (located in ../bin relative to the
     * `scws.default.fpath` directory), moves the generated file over the
     * dictionary file, then removes the intermediate files.
     *
     * @return void
     */
    private function genMyDict()
    {/*{{{*/
        $path = ini_get('scws.default.fpath');

        // Quote everything passed to the shell so paths with spaces or
        // metacharacters are treated as single arguments.
        $genTool = escapeshellarg("$path/../bin/scws-gen-dict");
        $newTxt  = escapeshellarg($this->myNewDictTxt);
        $newXdb  = escapeshellarg($this->myNewDictXdb);
        $dictXdb = escapeshellarg($this->myDictXdb);

        $this->msgLog('INFO'," myNewDict.xdb");
        exec("$genTool -c utf8 -i $newTxt -o $newXdb");

        $this->msgLog('INFO'," ");
        exec("mv $newXdb $dictXdb");

        $this->msgLog('INFO'," ");
        exec("rm -f $newTxt");
        exec("rm -f $newXdb");
    }/*}}}*/

    /**
     * Print a timestamped log line to standard output.
     *
     * Output is produced only when $this->isLogStdOut is truthy. The line
     * has the form "<unix time>, [<level>]: <message>\r\n".
     *
     * @param string $level  INFO/WARNING/ERROR
     * @param string $logStr message text
     * @access public
     * @return void
     */
    public function msgLog($level,$logStr)
    {/*{{{*/
        if($this->isLogStdOut)
        {
            $t = time();
            $logHdr = $t.", [".$level."]: ";
            $logStr = $logHdr.$logStr."\r\n";
            echo $logStr;
        }
    }/*}}}*/

}/*}}}*/

/**
 * Script entry point: build and install the custom dictionary.
 */
function start()
{
    $myDict = new MyDict();
    $myDict->run();
}

start();
?>

辞書生成後の結果
[画像: scws カスタム辞書 図1]
3.2カスタム辞書の追加
parseWord.php の該当行(元のコードでは13行目)のコメントを外して $cws->add_dict($myDictPath); を有効にし、再度 php parseWord.php を実行します。以下のように、「リオ五輪」も「洪荒の力」も、それぞれ1つの完全な単語として扱われます。

:Array ( [0] => Array ( [0] => Array ( [word] => [off] => 0 [len] => 12 [idf] => 10 [attr] => n ) [1] => Array ( [word] => [off] => 12 [len] => 12 [idf] => 10 [attr] => n ) ) )