Perlクローラー(Webスクレイピング)の2つのテクニック
2850 ワード
<pre name="code" class="cpp">jrhmpt01:/root/lwp# cat data.html
<div class="m-page J-ajax-page">
<a class="changePage" page="1" href="javascript:void(0);"> </a> <a class="changePage" page="11" href="javascript:void(0);"> </a> <a class="changePage" page="11" href="javascript:void(0);">11</a> <a class="cur" href="javascript:void(0)">12</a> <span class="page_info">12/12</span>
</div>
<div class="m-page J-ajax-page">
<a class="changePage" page="1" href="javascript:void(0);"> </a> <a class="changePage" page="11" href="javascript:void(0);"> </a> <a class="changePage" page="11" href="javascript:void(0);">11</a> <a class="cur" href="javascript:void(0)">12</a> <span class="page_info">12/12</span>
</div>
jrhmpt01:/root/lwp# cat c1.pl
use strict;
use warnings;
use LWP::UserAgent;
use DBI;
use POSIX;
use Data::Dumper;
use HTML::TreeBuilder;
use HTML::TreeBuilder::XPath;

# Technique 1: extract the text of selected elements with an XPath query.
#
# Parses the local file data.html and prints the text content of every
# <a class="changePage"> element found under <body>.

# User agent set up the way it would be for fetching a live page. This
# example works on a local file, so $ua is not used further below.
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->agent("Mozilla/8.0");

my $tree = HTML::TreeBuilder::XPath->new;

# parse_file returns undef (with $! set) when the file cannot be opened;
# fail loudly instead of silently querying an empty tree.
$tree->parse_file("data.html")
    or die "cannot parse data.html: $!";

# findvalues returns one string per matching node (list context).
my @title = $tree->findvalues('/html/body//a[@class="changePage"]');
print "\@title is @title\n";
jrhmpt01:/root/lwp# perl c1.pl
@title is 11 11
my @title= $tree->findvalue('/html/body//a[@class="changePage"]');
上記の XPath は、body 配下にある a 要素のうち、class 属性が "changePage" であるものをすべて選択します。
jrhmpt01:/root/lwp# cat c1.pl
use strict;
use warnings;
use LWP::UserAgent;
use DBI;
use POSIX;
use Data::Dumper;
use HTML::TreeBuilder;
use HTML::TreeBuilder::XPath;

# Technique 2: walk elements by tag name and read one of their attributes.
#
# Parses the local file data.html, collects every <a> element and prints
# the value of its "page" attribute, one per line.

# User agent set up the way it would be for fetching a live page. This
# example works on a local file, so $ua is not used further below.
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->agent("Mozilla/8.0");

my $tree = HTML::TreeBuilder::XPath->new;

# parse_file returns undef (with $! set) when the file cannot be opened;
# fail loudly instead of silently iterating over an empty tree.
$tree->parse_file("data.html")
    or die "cannot parse data.html: $!";

my @links = $tree->find_by_tag_name('a');

for my $link (@links) {
    # attr() yields undef for <a> elements that carry no "page" attribute
    # (e.g. the current-page link), so those are skipped.
    my $page = $link->attr('page');

    # Test for presence rather than truthiness so that page="0" would
    # still be reported.
    next unless defined $page && length $page;

    print "\$_ is $page\n";
}
jrhmpt01:/root/lwp# perl c1.pl
$_ is 1
$_ is 11
$_ is 11
$_ is 1
$_ is 11
$_ is 11
すべての a タグを取得し、それぞれの page 属性の値を出力します。