scwsカスタム辞書
1 scwsのインストール
インストールの詳細は次の記事を参照してください: http://blog.csdn.net/clevercode/article/details/52204124
2 カスタム辞書を追加していない場合
2.1 phpコード
# vim parseWord.php
set_dict($dictPath);
//
$myDictPath = ini_get('scws.default.fpath').'/mydict.xdb';
if(file_exists($myDictPath))
{
//$cws->add_dict($myDictPath);
}
$cws->set_ignore(true);
$utf8Str = iconv("GBK","UTF-8//IGNORE",$str);
$cws->send_text($utf8Str);
$resArr = array();
while($tmp = $cws->get_result())
{
$resArr[] = $tmp;
}
$cws->close();
return $resArr;
}
/**
 * Demo entry point: segments a hard-coded key with parse() and prints
 * both the key and the print_r() dump of the segmentation result.
 *
 * The dump is converted from UTF-8 to GBK (dropping unconvertible
 * characters) before it is echoed.
 *
 * @return void
 */
function start()
{
    $input = ' ';
    $segments = parse($input);
    $dump = print_r($segments, true);

    // First line: the key itself, followed by the original line terminators.
    echo ' :'.$input.'
'."\r
";

    // Second part: the segmentation dump, re-encoded for a GBK terminal.
    $gbkDump = iconv("UTF-8", "GBK//IGNORE", $dump);
    echo ' :'.$gbkDump;
}
start();
?>
2.2 「リオ五輪 洪荒の力」(里约奥运洪荒之力)を分かち書き(分詞)した結果
:
:Array
(
[0] => Array
(
[0] => Array
(
[word] =>
[off] => 0
[len] => 6
[idf] => 15.119999885559
[attr] => ns
)
[1] => Array
(
[word] =>
[off] => 6
[len] => 6
[idf] => 4.8800001144409
[attr] => n
)
[2] => Array
(
[word] =>
[off] => 12
[len] => 6
[idf] => 8.0500001907349
[attr] => n
)
[3] => Array
(
[word] =>
[off] => 18
[len] => 3
[idf] => 0
[attr] => r
)
[4] => Array
(
[word] =>
[off] => 21
[len] => 3
[idf] => 0
[attr] => n
)
)
)
3カスタム辞書の追加
3.1辞書の生成
# vim genMyDict.php
init();
$this->deleteOldFile();
$words = $this->getMyWordData();
$this->write2File($words, $this->myNewDictTxt);
$this->genMyDict();
}/*}}}*/
/**
 * Derives the working file paths from the scws.default.fpath ini setting.
 *
 * Sets three properties, all located directly under that directory:
 *  - myNewDictTxt: myNewDict.txt
 *  - myNewDictXdb: myNewDict.xdb
 *  - myDictXdb:    mydict.xdb
 *
 * @return void
 */
private function init()
{/*{{{*/
    $dictDir = ini_get('scws.default.fpath');

    $this->myDictXdb    = $dictDir.'/mydict.xdb';
    $this->myNewDictXdb = $dictDir.'/myNewDict.xdb';
    $this->myNewDictTxt = $dictDir.'/myNewDict.txt';
}/*}}}*/
//
/**
 * Supplies the hard-coded list of custom words for the dictionary.
 *
 * NOTE(review): both entries are single-space strings as the file stands,
 * and write2File() skips entries that are empty after trim() - confirm the
 * intended words were not lost from these literals.
 *
 * @return array list of word strings
 */
function getMyWordData()
{/*{{{*/
    return array(' ', ' ');
}/*}}}*/
/**
 * Removes the intermediate files left by a previous run
 * (the paths held in myNewDictTxt and myNewDictXdb).
 *
 * Missing files are not an error: rm is invoked with -f.
 *
 * @return void
 */
function deleteOldFile()
{/*{{{*/
    $this->msgLog('INFO'," ");
    // Quote each path so that a directory containing spaces or shell
    // metacharacters is passed to rm as exactly one argument.
    exec('rm -f '.escapeshellarg($this->myNewDictTxt));
    exec('rm -f '.escapeshellarg($this->myNewDictXdb));
}/*}}}*/
//
/**
 * Appends one dictionary source line per non-blank word to $path.
 *
 * Each word is converted from GBK to UTF-8 and trimmed; words that are
 * empty after trimming are skipped. A line consists of four tab-separated
 * fields - the word, 10.00, 10.00 and "n" - followed by a line break.
 * Every line is logged (converted back to GBK) before it is appended.
 *
 * @param array  $words words to write, GBK encoded
 * @param string $path  file the lines are appended to
 * @return void
 */
private function write2File(array $words, $path)
{/*{{{*/
    foreach ($words as $rawWord) {
        $entry = trim(mb_convert_encoding($rawWord, 'utf-8', 'gbk'));
        if ($entry === '') {
            // Nothing left after trimming: do not emit a line for it.
            continue;
        }

        $dictLine = sprintf("%s\t%.2f\t%.2f\t%.2s
", $entry, 10.00, 10.00, "n");

        $this->msgLog("INFO", mb_convert_encoding($dictLine, 'gbk', 'utf-8'));
        file_put_contents($path, $dictLine, FILE_APPEND);
    }
}/*}}}*/
//
/**
 * Builds the xdb dictionary from the text file and swaps it in.
 *
 * Steps:
 *  1. Run scws-gen-dict (located at <scws.default.fpath>/../bin) to turn
 *     myNewDictTxt into myNewDictXdb.
 *  2. Only if that succeeded, move myNewDictXdb over myDictXdb.
 *  3. Remove the intermediate files in every case.
 *
 * Failures are reported through msgLog() with level ERROR; the existing
 * dictionary is left untouched when generation fails, so a broken build
 * can never replace a working dictionary.
 *
 * @return void
 */
private function genMyDict()
{/*{{{*/
    $path = ini_get('scws.default.fpath');

    // Shell-quote every path once; they are reused by several commands.
    $newTxt = escapeshellarg($this->myNewDictTxt);
    $newXdb = escapeshellarg($this->myNewDictXdb);

    $this->msgLog('INFO'," myNewDict.xdb");
    $genCmd = escapeshellarg("$path/../bin/scws-gen-dict")." -c utf8 -i $newTxt -o $newXdb";
    exec($genCmd, $genOutput, $genStatus);

    if ($genStatus === 0 && file_exists($this->myNewDictXdb)) {
        $this->msgLog('INFO'," ");
        exec("mv $newXdb ".escapeshellarg($this->myDictXdb), $mvOutput, $mvStatus);
        if ($mvStatus !== 0) {
            $this->msgLog('ERROR', "failed to replace {$this->myDictXdb} (mv exit status $mvStatus)");
        }
    } else {
        // Do not move anything: the output is missing or possibly incomplete.
        $this->msgLog('ERROR', "scws-gen-dict failed (exit status $genStatus); existing dictionary left untouched");
    }

    $this->msgLog('INFO'," ");
    exec("rm -f $newTxt");
    exec("rm -f $newXdb");
}/*}}}*/
/**
 * Echoes a log line when output logging is enabled.
 *
 * The line has the form "<unix timestamp>, [<level>]: <message>" followed
 * by a carriage return and a line break. Nothing is printed when
 * $this->isLogStdOut is falsy.
 *
 * @param string $level INFO/WARNING/ERROR
 * @param string $logStr message text
 * @access public
 * @return void
 */
public function msgLog($level,$logStr)
{/*{{{*/
    if (!$this->isLogStdOut) {
        return;
    }

    $stamp = time();
    echo $stamp.", [".$level."]: ".$logStr."\r
";
}/*}}}*/
}/*}}}*/
/**
 * Entry point of genMyDict.php: creates a MyDict instance and runs it.
 *
 * @return void
 */
function start()
{
    $generator = new MyDict();
    $generator->run();
}
start();
?>
辞書生成後の結果
3.2カスタム辞書の追加
parseWord.php の13行目のコメントを外して $cws->add_dict($myDictPath); を有効にし、再度 php parseWord.php を実行します。以下のように、「リオ五輪」と「洪荒の力」がそれぞれ1つの完全な単語として認識されます。
:
:Array
(
[0] => Array
(
[0] => Array
(
[word] =>
[off] => 0
[len] => 12
[idf] => 10
[attr] => n
)
[1] => Array
(
[word] =>
[off] => 12
[len] => 12
[idf] => 10
[attr] => n
)
)
)