新潟県の Go To EATの加盟店一覧をスクレイピングしてCSVに変換


前回公開した記事

新潟県の Go To EATの加盟店一覧のPDFをCSVに変換
https://qiita.com/barobaro/items/74fb5bdedbf1ae7267a0

では加盟店一覧のPDFをCSVに変換していましたが、現在はそのPDFが見つからないため、今回は加盟店検索ページ（https://niigata-gte.com/shop/）をスクレイピングして一覧表を作成します。

スクレイピング

import re
import time

import requests
from bs4 import BeautifulSoup

# First page of the shop listing; later pages are discovered by following the
# "next" pagination link found on each fetched page.
url = "https://niigata-gte.com/shop/"

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# Seconds to wait for a response; without a timeout requests.get can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 30

# Seconds to pause between successive page requests.
REQUEST_INTERVAL = 3

# Latitude/longitude as they appear in a Google Maps link: "@<lat>,<lng>,<zoom>z".
_GPS_RE = re.compile(r"(?<=@)(.+?),(.+?)(?=,\d{1,2}z)")

# URL of a numbered listing page, embedded in the pagination link's onclick.
_NEXT_PAGE_RE = re.compile(r"https://niigata-gte\.com/shop/page/\d+/")


def _parse_shop(shop):
    """Convert one shop element (``div.cont``) into a dict of column -> value.

    Keys are inserted in a fixed order because that order later determines
    the column order of the DataFrame/CSV. Optional fields (service icons,
    homepage, latitude/longitude) are simply omitted when absent.
    """
    data = {}

    # Text looks like "<label>:<code>"; keep only the part after the colon.
    data["取扱店コード"] = (
        shop.select_one("div.no").get_text(strip=True).split(":", 1)[-1]
    )

    span = shop.select("div.tag > span")

    data["エリア"] = span[0].get_text(strip=True)
    data["ジャンル"] = span[1].get_text(strip=True)

    # An optional third span holds icons; each icon's alt text becomes a
    # column flagged with "○".
    if len(span) > 2:
        data.update({img.get("alt"): "○" for img in span[2].select("img")})

    h4 = shop.select_one("h4")

    data["店舗名"] = h4.get_text(strip=True)

    link = h4.a.get("href") if h4.a else None
    if link:
        data["ホームページ"] = link

    p_add = shop.select_one("p.add")

    # The first child is the text "<postcode> <address>". Tolerate a missing
    # address part instead of failing on tuple unpacking.
    head = p_add.contents[0] if p_add.contents else ""
    parts = head.split(maxsplit=1)
    postcode = parts[0] if parts else ""
    address = parts[1] if len(parts) > 1 else ""

    # Extract latitude/longitude from the Google Maps link, if there is one.
    map_link = p_add.select_one("a")
    href = map_link.get("href") if map_link else None
    gps = _GPS_RE.search(href) if href else None

    if gps:
        data["緯度"] = float(gps.group(1))
        data["経度"] = float(gps.group(2))

    data["郵便番号"] = postcode.strip()
    data["所在地"] = address.strip()

    data["電話番号"] = shop.select_one("p.tel").get_text(strip=True)

    return data


def _next_page_url(soup):
    """Return the URL of the next listing page, or None if it cannot be found.

    The URL is taken from the ``onclick`` attribute of the link inside
    ``li.next``. None is returned when that element, its link, its
    ``onclick`` attribute, or a matching URL is missing.
    """
    tag = soup.select_one("li.next")
    if tag is None or tag.a is None:
        return None

    m = _NEXT_PAGE_RE.search(tag.a.get("onclick") or "")
    return m.group(0) if m else None


result = []

# Pages already requested; guards against pagination that points back to a
# page we have seen, which would otherwise loop forever and duplicate rows.
visited = {url}

while True:

    r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()

    soup = BeautifulSoup(r.content, "html.parser")

    result.extend(_parse_shop(shop) for shop in soup.select("div#result > div.cont"))

    next_url = _next_page_url(soup)

    # Stop on the last page, and also when no usable/new URL could be
    # extracted: re-requesting the same URL would never terminate.
    if next_url is None or next_url in visited:
        break

    visited.add(next_url)
    url = next_url

    time.sleep(REQUEST_INTERVAL)

# Bare expression: shows the collected records when run as a notebook cell;
# it has no effect when run as a plain script.
result

CSVに変換

import pandas as pd

# One row per scraped shop; keys that a record lacks become empty cells.
df = pd.DataFrame(result)

# Number the rows from 1 rather than from pandas' default 0.
df.index = df.index + 1

# "utf_8_sig" writes a UTF-8 byte-order mark at the start of the file.
df.to_csv(path_or_buf="niigata.csv", encoding="utf_8_sig")