PHPでtidyと正規表現を用いて、収集したコンテンツのレイアウトの問題を解決する
/**
 * Normalises the markup of scraped HTML so it can be republished with a
 * uniform layout.
 *
 * Steps: repair the fragment with tidy, strip presentational tags and
 * attributes with a fixed list of regular expressions, make sure leading text
 * sits inside a paragraph, then repair once more so that any tag the regex
 * pass left unbalanced is closed again.
 *
 * NOTE(review): in the reviewed copy every tag-matching pattern had lost its
 * HTML: nine entries shared the empty key "//i" (so they collapsed into one
 * no-op), and the surviving "/[ ]*" and "/.*" patterns erased the whole text,
 * which made the function return an empty string for any input. The tag names
 * and replacement tags used below are a reconstruction -- confirm them
 * against the intended list before relying on the exact output.
 *
 * Requires the tidy extension for the repair passes; when it is not loaded
 * the regex clean-up is still applied to the raw input.
 *
 * @param string $content HTML fragment, UTF-8 encoded.
 * @return string Cleaned and trimmed HTML fragment.
 */
function removeFormat($content) {
    $content = trim((string) $content);
    if ($content === '') {
        return '';
    }

    $content = trim(removeFormatRepair($content));

    // Applied in order: whole tags first, then attributes of the remaining
    // tags, then whitespace clean-up. Single-quoted so that \x{3000}, \r and
    // \n reach PCRE untouched.
    $replaces = array(
        // Presentational inline tags are dropped, their text is kept.
        '/<font\b[^>]*>/i' => '',
        '/<\/font>/i' => '',
        '/<strong\b[^>]*>/i' => '',
        '/<\/strong>/i' => '',
        '/<span\b[^>]*>/i' => '',
        '/<\/span>/i' => '',
        // Generic containers become paragraphs.
        '/<div\b[^>]*>/i' => '<p>',
        '/<\/div>/i' => '</p>',
        // HTML comments.
        '/<!--.*?-->/s' => '',
        // Presentational attributes. The attribute must be preceded by
        // whitespace and followed by the closing ">" of the same tag, so
        // query strings ("?id=3") and names such as "data-id" are not touched.
        // width/height are deliberately left in place (they were disabled in
        // the original list as well).
        '/\s+(?:style|class|id|lang|border|face)\s*=\s*(?:"[^"]*"|\'[^\']*\')(?=[^<>]*>)/i' => '',
        // A line break starts a new paragraph; the second tidy pass repairs
        // the nesting where the break was not inside a paragraph.
        '/<br\b[^>]*>[ ]*/i' => '</p><p>',
        // Embedded frames are removed together with their content.
        '/<iframe\b[^>]*>.*?<\/iframe>/is' => '',
        // Non-breaking space entities become plain spaces.
        '/&nbsp;/i' => ' ',
        // Drop paragraph attributes and any leading half-width space,
        // full-width space (U+3000) or line break inside the paragraph.
        '/<p\b[^>]*>[ \x{3000}\r\n]*/ui' => '<p>',
    );

    foreach ($replaces as $pattern => $replacement) {
        $result = preg_replace($pattern, $replacement, $content);
        // preg_replace() returns null on failure (e.g. invalid UTF-8 with the
        // /u modifier); keep the text from the previous step in that case
        // instead of wiping it.
        if ($result !== null) {
            $content = $result;
        }
    }

    // Text that precedes the first paragraph gets its own opening <p>; the
    // final repair pass closes it. The offset threshold of 6 is kept from the
    // original code -- NOTE(review): its rationale is not visible here.
    $firstParagraph = strpos($content, '<p>');
    if ($firstParagraph !== false && $firstParagraph > 6) {
        $content = '<p>' . $content;
    }

    return trim(removeFormatRepair($content));
}

/**
 * Repairs an HTML fragment with tidy and returns the body content only.
 *
 * Returns the input unchanged when the tidy extension is not loaded or when
 * tidy reports a failure, so callers never lose the text.
 *
 * @param string $html HTML fragment, UTF-8 encoded.
 * @return string Repaired fragment, or $html unchanged as a fallback.
 */
function removeFormatRepair($html) {
    if (!function_exists('tidy_repair_string')) {
        return $html;
    }

    $config = array(
        'output-html' => true,    // emit HTML rather than XHTML
        'show-body-only' => true, // return only the content of <body>
        'wrap' => 0,              // 0 disables tidy's line wrapping
    );

    $repaired = tidy_repair_string($html, $config, 'utf8');

    return is_string($repaired) ? $repaired : $html;
}