Perl クローラーの2つのテクニック

2850 ワード

<pre name="code" class="cpp">jrhmpt01:/root/lwp# cat data.html 
     <div class="m-page J-ajax-page">
        <a class="changePage" page="1" href="javascript:void(0);">  </a> <a class="changePage" page="11" href="javascript:void(0);">   </a>  <a class="changePage" page="11" href="javascript:void(0);">11</a>  <a class="cur" href="javascript:void(0)">12</a> <span class="page_info">12/12</span>
     </div>

    <div class="m-page J-ajax-page">
        <a class="changePage" page="1" href="javascript:void(0);">  </a> <a class="changePage" page="11" href="javascript:void(0);">   </a>  <a class="changePage" page="11" href="javascript:void(0);">11</a>  <a class="cur" href="javascript:void(0)">12</a> <span class="page_info">12/12</span>
     </div>
	 
	 

jrhmpt01:/root/lwp# cat c1.pl 
# c1.pl (XPath version): read the saved page data.html and collect the text
# of every pagination link, i.e. every <a class="changePage"> under <body>.
use strict;
use warnings;

use LWP::UserAgent;
use DBI;
use POSIX;
use Data::Dumper;
use HTML::TreeBuilder;
use HTML::TreeBuilder::XPath;

# HTTP client setup. Nothing in this listing issues a request through $ua;
# the page is read from a local file below.
my $ua = LWP::UserAgent->new;
$ua->timeout(10);             # request timeout, in seconds
$ua->env_proxy;               # take proxy settings from the environment
$ua->agent("Mozilla/8.0");

# Parse the local copy of the page. parse_file returns undef when the file
# cannot be opened, so fail loudly instead of querying an empty tree.
my $tree = HTML::TreeBuilder::XPath->new;
$tree->parse_file("data.html")
    or die "cannot parse data.html: $!";

# One string per matching <a> element: the text content of each link whose
# class attribute is exactly "changePage".
my @title = $tree->findvalues('/html/body//a[@class="changePage"]');
print "\@title is @title
";

jrhmpt01:/root/lwp# perl c1.pl
@title is 11 11

# Note: my @title= $tree->findvalue('/html/body//a[@class="changePage"]');
# The XPath selects, under body, every a element whose @class="changePage".

jrhmpt01:/root/lwp# cat c1.pl
# c1.pl (attribute version): read the saved page data.html and print the
# "page" attribute of every <a> element that carries one.
use strict;
use warnings;

use LWP::UserAgent;
use DBI;
use POSIX;
use Data::Dumper;
use HTML::TreeBuilder;
use HTML::TreeBuilder::XPath;

# HTTP client setup. Nothing in this listing issues a request through $ua;
# the page is read from a local file below.
my $ua = LWP::UserAgent->new;
$ua->timeout(10);             # request timeout, in seconds
$ua->env_proxy;               # take proxy settings from the environment
$ua->agent("Mozilla/8.0");

# Parse the local copy of the page. parse_file returns undef when the file
# cannot be opened, so fail loudly instead of walking an empty tree.
my $tree = HTML::TreeBuilder::XPath->new;
$tree->parse_file("data.html")
    or die "cannot parse data.html: $!";

# Every <a> element in the document.
#@urlall ,
my @pages = $tree->find_by_tag_name('a');

for my $anchor (@pages) {
    # attr() yields undef for anchors without a "page" attribute (for
    # example the class="cur" link); skip those. Testing defined/length
    # rather than truthiness keeps a legitimate page="0" from being dropped.
    my $page_attr = $anchor->attr('page');
    next unless defined $page_attr && length $page_attr;

    print "\$_ is $page_attr
";
}

jrhmpt01:/root/lwp# perl c1.pl
$_ is 1
$_ is 11
$_ is 11
$_ is 1
$_ is 11
$_ is 11

# Note: a , page -- for each a element, the value of its page attribute is printed.