Webページのタイトル・キーワード・説明文(description)・本文を取得し、文字コードを自動的に変換します。
4539 ワード
/**
 * Fetch a web page with cURL and extract its metadata and plain-text body.
 *
 * Only responses with HTTP status 200 and a text/html Content-Type are
 * parsed; for anything else the array is returned with just 'httpcode' and
 * 'content_type' filled in. A transport-level cURL failure is reported as
 * httpcode 505 (pre-existing sentinel value, kept for callers).
 *
 * @param string $url    Address of the page to download.
 * @param string $encode Target encoding for title, description, keywords and body.
 * @return array Keys: content_type, charset, title, description, keywords,
 *               body (tag-stripped by clhtml(), then passed through addslashes()),
 *               httpcode.
 */
function htmload($url,$encode='UTF-8'){
    $pageinfo = array(
        'content_type' => '',
        'charset'      => '',
        'title'        => '',
        'description'  => '',
        'keywords'     => '',
        'body'         => '',
        'httpcode'     => 200,
    );

    $ch = curl_init();
    if ($ch === false) {
        $pageinfo['httpcode'] = 505;
        return $pageinfo;
    }
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // NOTE(review): TLS host/peer verification is switched off, so HTTPS
    // responses are not authenticated. Left as-is because enabling it would
    // change which sites can be fetched - confirm whether this is intended.
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    curl_setopt($ch, CURLOPT_TIMEOUT, 8);
    curl_setopt($ch, CURLOPT_FILETIME, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_URL, $url);

    $store = curl_exec($ch);
    $failed = ($store === false) || curl_errno($ch) !== 0;
    $httpcode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $contentType = (string) curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
    // Everything needed from the handle has been read; release it here so
    // that no return path below can leak it.
    curl_close($ch);

    if ($failed) {
        $pageinfo['httpcode'] = 505;
        return $pageinfo;
    }

    $pageinfo['httpcode'] = $httpcode;
    $pageinfo['content_type'] = $contentType;
    if (intval($httpcode) !== 200 || stripos($contentType, 'text/html') === false) {
        return $pageinfo;
    }

    // Charset announced in the HTTP Content-Type header, if any.
    if (preg_match('#charset=["\']?([^\s;"\']+)#i', $contentType, $matches)) {
        $pageinfo['charset'] = trim($matches[1]);
    }

    // Remove markup that could produce false matches for the extraction
    // patterns below. A step that fails inside PCRE (preg_replace returns
    // null) is skipped instead of wiping the whole document.
    $strip = array(
        '/<script\b[^>]*>.*?<\/script>/si',
        '/<link\s+[^>]+>/si',
        '/<!--.*?-->/s',
        '/<style\b[^>]*>.*?<\/style>/si',
        '/&nbsp;/',
    );
    foreach ($strip as $pattern) {
        $cleaned = preg_replace($pattern, '', $store);
        if ($cleaned !== null) {
            $store = $cleaned;
        }
    }

    // Fall back to the charset declared in a <meta> tag
    // (covers both <meta charset="..."> and the http-equiv form).
    if ($pageinfo['charset'] == '' && preg_match('@<meta[^>]+charset=["\']?([\w\-]+)@i', $store, $matches)) {
        $pageinfo['charset'] = trim($matches[1]);
    }

    // NOTE(review): the "/" removal in description and keywords is kept from
    // the original code; its purpose is not evident - confirm.
    if (preg_match('/<meta\s+name=\"description\"\s+content=\"(.*)\"\s?\/?>/iU', $store, $matches)) {
        $desc = str_replace("/", '', trim($matches[1]));
        if ($desc !== '') {
            $pageinfo['description'] = get_encoding($desc, $encode);
        }
    }

    if (preg_match('/<meta\s+name=\"keywords\"\s+content=\"(.*)\"\s?\/?>/iU', $store, $matches)) {
        $keywords = str_replace("/", '', trim($matches[1]));
        if ($keywords !== '') {
            // Converted like title and description so that every text field
            // of the result shares the same encoding.
            $pageinfo['keywords'] = get_encoding($keywords, $encode);
        }
    }

    if (preg_match('/<title\b[^>]*>(.*?)<\/title>/si', $store, $matches)) {
        $title = trim($matches[1]);
        if ($title !== '') {
            $pageinfo['title'] = get_encoding($title, $encode);
        }
    }

    if (preg_match('/<body\b[^>]*>(.*?)<\/body>/si', $store, $matches)) {
        $body = (string) clhtml($matches[1]);
        if ($body !== '') {
            // Convert first, escape second: in GBK/Shift_JIS/Big5 the byte
            // 0x5C can be the second half of a character, so addslashes() on
            // the raw bytes would corrupt the text.
            $pageinfo['body'] = addslashes(get_encoding($body, $encode));
        }
    }

    return $pageinfo;
}
/**
 * Reduce an HTML fragment to bare text: no markup, no entities, no whitespace.
 *
 * Steps, in order: every whitespace character is deleted, <script> elements
 * are dropped together with their content, all remaining tags are stripped,
 * and character entity references (&name; / &#123;) are deleted, not decoded.
 *
 * @param string $document HTML fragment to clean.
 * @return string Cleaned text; '' when the input is empty or a regex step fails.
 */
function clhtml($document){
    $document = preg_replace('/\s+/', '', (string) $document);
    if ($document === null) {
        return '';
    }
    $document = trim($document);
    if ($document === '') {
        return $document;
    }

    // Whitespace is already gone at this point, so no separate CR/LF pattern
    // is needed. The previous class "[/r/n/s+]" used forward slashes instead
    // of backslashes and therefore deleted the letters r, n, s plus "/" and
    // "+" from the text itself.
    $search = array(
        "'<script[^>]*?>.*?</script>'si", // script elements including their code
        "'<[/!]*?[^<>]*?>'si",            // any other tag, closing tag or declaration
        "'&#?\w+;'",                      // named and numeric entity references
    );

    $cleaned = preg_replace($search, '', $document);

    // preg_replace() yields null on an internal PCRE error (e.g. backtrack
    // limit); report that as empty text instead of leaking null to callers.
    return $cleaned === null ? '' : $cleaned;
}
/**
 * Convert a string of unknown encoding to the requested encoding.
 *
 * The source encoding is guessed with mb_detect_encoding() from a fixed
 * candidate list. When none of the candidates fits, the data is returned
 * unchanged rather than converted from a wrong guess.
 *
 * @param string $data Text whose encoding is not known in advance.
 * @param string $to   Name of the target encoding (e.g. 'UTF-8').
 * @return string The converted text, or the original bytes if detection failed.
 */
function get_encoding($data,$to){
    $data = (string) $data;
    if ($data === '') {
        return '';
    }

    $encode_arr = array('UTF-8','ASCII','GBK','GB2312','BIG5','JIS','eucjp-win','sjis-win','EUC-JP');

    // Strict mode: only accept an encoding in which the bytes are fully
    // valid. Non-strict detection may name an encoding that merely looks
    // close, which garbles the text during conversion.
    $encoded = mb_detect_encoding($data, $encode_arr, true);
    if ($encoded === false) {
        // Passing false on to mb_convert_encoding() raises a ValueError on
        // PHP 8, so hand the input back untouched.
        return $data;
    }

    return mb_convert_encoding($data, $to, $encoded);
}
コードの一部はネット上のものを参考にしていますが、ネット上のコードにはたいていバグがあります。そこで私が修正と最適化を行いました。いつの日か、私も検索エンジンを作れるようになるかもしれません。