[Python Crawler 04] Evil Animated GIFs

Code
# -*- coding:utf8 -*-

'''
http://www.xieemanhuaba.com/xieedontaitu/

sudo -H pip install requests beautifulsoup4 lxml
'''

import os               # path, makedirs
import requests         # fetch HTML pages over HTTP
import urllib           # urlretrieve, for downloading image files
from bs4 import BeautifulSoup  # parse the fetched HTML
import re               # regular expressions, for the JS-generated link

CURR_DIR = os.path.dirname(os.path.abspath(__file__))
FOLDER = 'xeba'

def downloadUrl(url, path):
    # Download url to path, skipping files that are already on disk.
    print url, path
    if os.path.exists(path): return
    mkdir(os.path.dirname(path))
    urllib.urlretrieve(url, path)

def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)

def fixUrl(url):
    # Turn a site-relative link into an absolute one.
    return 'http://www.xieemanhuaba.com' + url

headers = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}

url = 'http://www.xieemanhuaba.com/xieedontaitu/'
html = requests.get(url, headers=headers)
soup = BeautifulSoup(html.text, 'lxml')
# The div with class 'kg' holds the total entry count in its <span>.
item = soup.find('div', class_='kg')
count = int(item.span.text)
print u'%d entries in total.' % count


# item.a is the first link inside the 'kg' block; take its title and URL.
title = item.a.get('title')
url = fixUrl(item.a.get('href'))
print title, url

def parsePage(url, subpage=True):
    # Fetch one chapter page, save its image, then visit its numbered subpages.
    print 'url=', url
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, 'lxml')
    item = soup.find('li', id='imgshow')
    try:
        # Some chapters carry no image at all; item.img is then None,
        # so the attribute access below raises (see the afterword).
        title, img_url = item.img.get('alt'), item.img.get('src')
        print title, img_url
        downloadUrl(img_url, os.path.join(CURR_DIR, FOLDER, os.path.basename(img_url)))
    except Exception as e:
        print u'Failed to fetch the image: %s' % e

    if subpage:
        pagelist = soup.find('ul', class_='pagelist').find_all('li')
        if len(pagelist):
            # The first pager item holds the total number of subpages.
            pagecount = int(re.findall(r"\d+", pagelist[0].a.text)[0])
            print u'subpage count:', pagecount
            baseurl = url.replace('.html', '')
            for index in xrange(2, pagecount+1):
                # Subpages are named chapter_2.html, chapter_3.html, and so on.
                nexturl = '%s_%d.html' % (baseurl, index)
                print nexturl
                parsePage(nexturl, subpage=False)
        # The link to the next index page is written by JavaScript
        # ("var str = ..."), so match it in the raw source with a regex
        # (the exact pattern is assumed here).
        url = re.findall(r'var str = "(\S+)"', html.text)
        if len(url):
            print u'next index page:', url[0]
Afterword
The link to the next index page is generated with JavaScript, so a regular expression is used to match it. Some pages contain no animation at all, so be careful to handle that exceptional case.
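
Both points can be seen in a minimal, self-contained sketch (Python 2, like the listing above). The sample HTML and the var str pattern are assumptions for illustration, not the site's actual markup:

# -*- coding:utf8 -*-
import re
from bs4 import BeautifulSoup

# Made-up page source: the next link lives only in a <script> block,
# and this particular page has no <img> inside li#imgshow.
html = u'''
<li id="imgshow"></li>
<script>var str = "/xieedontaitu/123.html";</script>
'''

soup = BeautifulSoup(html, 'lxml')
item = soup.find('li', id='imgshow')
try:
    # item.img is None on image-less pages, so this raises AttributeError.
    print item.img.get('src')
except Exception as e:
    print u'no image on this page: %s' % e

# BeautifulSoup cannot follow JavaScript-generated links, so fall back to
# a regular expression over the raw page source.
match = re.findall(r'var str = "(\S+)"', html)
if match:
    print u'next page:', match[0]

If the matched link is site-relative, as assumed here, it would still need to go through fixUrl() before being fetched.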