【Pythonクローラー】第15回課題

1374 ワード

from urllib.parse import urljoin

import requests
from lxml import etree
url = 'http://www.ygdy8.com'
# Seconds to wait for each HTTP response; without a timeout requests.get()
# can block indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 10
# Accumulates every listing-page URL discovered by collect_page_urls().
url_list = []


def fetch_tree(page_url):
    """Download *page_url* and return it parsed as an lxml HTML tree.

    The response body is decoded as gb2312 before parsing.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.encoding = 'gb2312'
    return etree.HTML(response.text)


def parse_page_total(pager_texts):
    """Return the page count read from a listing page's pager text nodes.

    The count is taken from the second text node: everything before the
    first '/' with all whitespace (including non-breaking spaces) removed.
    NOTE(review): the exact pager markup is not visible here; this keeps the
    same node/field selection the script has always used.

    Raises:
        ValueError: if there are fewer than two text nodes or the selected
            field is not an integer.
    """
    if len(pager_texts) < 2:
        raise ValueError('pager text not found')
    count_field = pager_texts[1].split('/')[0]
    # str.split() with no argument splits on any Unicode whitespace, so this
    # strips plain spaces, non-breaking spaces, tabs and newlines alike.
    return int(''.join(count_field.split()))


def build_page_urls(menu_url, list_id, page_total):
    """Return the URLs of pages 1..page_total of one category listing.

    Each page file is named ``<list_id><n>.html`` and is resolved relative
    to *menu_url*, so the result is correct whether or not the menu URL
    contains the word "index" and whether *list_id* is a bare file prefix
    or an absolute path.
    """
    return [
        urljoin(menu_url, list_id + str(page) + '.html')
        for page in range(1, page_total + 1)
    ]


def collect_page_urls(base_url=url):
    """Crawl the home-page menu and append every listing-page URL to url_list.

    For each of the first menu links with a non-blank label, the category
    page is fetched, its pager is read to learn the page count and the
    page-file prefix, and one URL per page is appended to ``url_list``.
    Categories whose page cannot be fetched or has no usable pager are
    skipped with a printed notice instead of aborting the whole crawl.

    Returns:
        The module-level ``url_list`` (also mutated in place).

    Raises:
        requests.RequestException: if the home page itself cannot be fetched.
    """
    home = fetch_tree(base_url)
    menu_links = home.xpath(
        '//div[@class="contain"][1]/ul/li[position()<10]/a')
    for link in menu_links:
        a_text = link.xpath('text()')
        a_href = link.xpath('@href')
        # Skip anchors without a visible label or without a target.
        if not a_text or not a_text[0].strip() or not a_href:
            continue

        # urljoin handles both site-relative and absolute hrefs.
        menu_url = urljoin(base_url, a_href[0])
        print(a_text[0], menu_url)

        try:
            menu = fetch_tree(menu_url)
        except requests.RequestException as err:
            print('skipped, request failed:', err)
            continue

        pager_texts = menu.xpath(
            '//div[@class="co_content8"]/div[@class="x"]//text()')
        pager_hrefs = menu.xpath(
            '//div[@class="co_content8"]/div[@class="x"]//a/@href')
        if not pager_hrefs:
            print('skipped, no pager links found')
            continue
        try:
            page_total = parse_page_total(pager_texts)
        except ValueError as err:
            print('skipped, unreadable pager:', err)
            continue
        print(page_total)

        # The first pager link is expected to point at page 2; dropping the
        # trailing "2.html" leaves the prefix shared by all page files.
        list_id = pager_hrefs[0].replace('2.html', '')
        print(list_id)

        url_list.extend(build_page_urls(menu_url, list_id, page_total))
        print(len(url_list))
    return url_list


if __name__ == '__main__':
    collect_page_urls()