Web Scraping with BeautifulSoup4 (Hierarchical Pages)



Following on from Web Scraping with BeautifulSoup4 (Sequential Pages), this is a memo of the code I wrote for pages that are organized hierarchically.

Key point

Build and process lists in order, categories → pages → target files; that way it is easy to resume even if the run is interrupted partway through.

Code

scraper.py
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

try:
    # Python 3
    from urllib import request
    from urllib.error import URLError
except ImportError:
    # Python 2
    import urllib2 as request
    from urllib2 import URLError

from bs4 import BeautifulSoup
import time, os, codecs, string, json

domain = 'http://hoge.com'
wait_sec = 3
headers = { 'User-Agent' : 'Mozilla/5.0' }
cwd = os.getcwd()
result_file = cwd + '/result_url.txt'
category_file = cwd + '/category.txt'
page_file = cwd + '/page.txt'

def fetchSoup(url):
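    # Wait wait_sec seconds, then fetch the URL and return a BeautifulSoup object (None on failure)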
    time.sleep(wait_sec)

    req = request.Request(url, headers = headers)
    try:
        print('open {url}'.format(url = url))
        response = request.urlopen(req)
        print('ok')
        body = response.read()
        return BeautifulSoup(body, 'lxml')
    except URLError as e:
        print('error: {reason}'.format(reason = e.reason))
        return None

def getUrl(src):
    return '{domain}{src}'.format(domain = domain, src = src)

def extractUrlFromTags(tags):
    result = []
    for tag in tags:
        if tag.name == 'a':
            result.append(getUrl(tag['href']))
        elif tag.name == 'img':
            result.append(getUrl(tag['src']))
    return result

def saveUrl(file_name, url_list):
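    # Append the collected URLs to file_name, one per line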
    with codecs.open(file_name, 'a', 'utf-8') as f:
        f.write('{list}\n'.format(list = '\n'.join(url_list)))

def deleteFirstLine(file_name):
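    # Drop the first line of file_name (the URL that has just been processed)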
    with codecs.open(file_name, 'r', 'utf-8') as f:
        content = f.read()
        content = content[content.find('\n') + 1:]
    with codecs.open(file_name, 'w', 'utf-8') as f:
        f.write(content)

def fetchAllCategories():
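    # Follow the paginated category index and append every category URL to category_file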
    page = 1
    while True:
        url = '{domain}/category_{page}/'.format(domain = domain, page = page)
        soup = fetchSoup(url)
        categories = soup.find('div', id = 'list').find_all('a')
        url_list = extractUrlFromTags(categories)
        if len(url_list):
            saveUrl(category_file, url_list)
        page_list_last = soup.find('div', class_ = 'pagenation').find_all('a')[-1].string
        if page_list_last not in ['>', '>>']:
            break
        page += 1

def fetchCategory():
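    # Return the first unprocessed category URL (builds category_file first if it does not exist yet)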
    if not os.path.exists(category_file):
        fetchAllCategories()
    with codecs.open(category_file, 'r', 'utf-8') as f:
        result = f.readline().rstrip('\n')
    return result

def fetchAllPages():
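    # For every remaining category, collect its page URLs into page_file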
    category = fetchCategory()
    while category != '':
        soup = fetchSoup(category)
        pages = soup.find_all('a', class_ = 'page')
        url_list = extractUrlFromTags(pages)
        if len(url_list):
            saveUrl(page_file, url_list)
        deleteFirstLine(category_file)
        category = fetchCategory()

def fetchPage():
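    # Return the first unprocessed page URL (builds page_file first when needed)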
    if not os.path.exists(page_file) or fetchCategory() != '':
        fetchAllPages()
    with codecs.open(page_file, 'r', 'utf-8') as f:
        result = f.readline().rstrip('\n')
    return result

def fetchTargets():
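    # For every remaining page, save the URLs of the target images to result_file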
    page = fetchPage()
    while page != '':
        soup = fetchSoup(page)
        targets = soup.find_all('img', class = 'target')
        url_list = extractUrlFromTags(targets)
        if len(url_list):
            saveUrl(result_file, url_list)
        deleteFirstLine(page_file)
        page = fetchPage()

fetchTargets()
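
Because category.txt and page.txt are consumed from the top with deleteFirstLine() and results are appended to result_url.txt as they are found, the script can simply be rerun after an interruption and it will carry on from the URLs that remain in those files.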

Handy tricks

When names and the like are categorized by letters of the alphabet:

alphabet_l = list(string.ascii_lowercase)
alphabet_u = list(string.ascii_uppercase)
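
For example, a rough sketch of looping over the letters to build per-letter index URLs (the /list_<letter>/ URL pattern here is a made-up assumption, not taken from any real site):

for letter in alphabet_l:
    # hypothetical per-letter index page, e.g. http://hoge.com/list_a/
    url = '{domain}/list_{letter}/'.format(domain = domain, letter = letter)
    soup = fetchSoup(url)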

For processing variables and other data extracted from a script tag:

data = json.loads(json_string)
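
For instance, a minimal sketch of pulling a JavaScript variable out of a script tag and parsing it (the variable name items and the embedded-script format are assumptions for illustration):

import re

# assumes the page contains something like: <script>var items = [...];</script>
script = soup.find('script', text = re.compile('var items'))
match = re.search(r'var items = (\[.*?\]);', script.string, re.S)
if match:
    data = json.loads(match.group(1))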

To run in the background on a VPS or similar so the job keeps going even after you log off:

$ nohup python scraper.py < /dev/null &
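
(With this invocation, anything the script prints is appended to nohup.out in the current directory unless stdout is redirected elsewhere.)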

To check on the process that is still running:

$ ps x