新型コロナウイルス感染症に関する相模原市発表資料(発生状況等)のPDFをCSVに変換


import datetime
import pathlib
import re
from urllib.parse import urljoin

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup


def fetch_file(url, dir="."):

    r = requests.get(url)
    r.raise_for_status()

    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p


url = "https://www.city.sagamihara.kanagawa.jp/shisei/koho/1019191.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

r = requests.get(url, headers=headers)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

tag = soup.find(
    "a", href=re.compile(".pdf$"), onclick=re.compile("新型コロナウイルス感染症による新たな患者の確認")
)

link = urljoin(url, tag.get("href"))

path_pdf = fetch_file(link)

with pdfplumber.open(path_pdf) as pdf:

    dfs = []

    for page in pdf.pages:

        if page.page_number == 1:

            # cropでテキスト取得
            crop = page.within_bbox((400, 44, page.width, 60))
            update = crop.extract_text()

        for table in page.extract_tables():

            df_tmp = pd.DataFrame(table)

            row, col = df_tmp.shape

            # 列が11

            if col == 11:

                # 表の一番先頭に未満が含まれない

                if "未満" not in table[0][0]:

                    dfs.append(df_tmp)

df = (
    pd.concat(dfs)
    .iloc[1:]
    .set_axis(
        ["症例No.", "年代", "性別", "職業等", "場所", "居住地", "症状", "発症日", "陽性判明日", "感染経路等", "備考"],
        axis=1,
    )
)

df

# 前後の空白文字、正規化
for col in df.select_dtypes(include=object).columns:
    df[col] = df[col].str.replace("\s", "").str.normalize("NFKC")

dt_now = datetime.datetime.now()


def str2date(s: pd.Series) -> pd.Series:

    df = (
        s.str.extract("(\d{1,2})月(\d{1,2})日")
        .rename(columns={0: "month", 1: "day"})
        .fillna(0)
        .astype(int)
    )

    df["year"] = dt_now.year

    tmp = pd.to_datetime(df, errors="coerce")

    df["year"] = df["year"].mask(tmp > dt_now, df["year"] - 1)

    return pd.to_datetime(df, errors="coerce")


df["発症日YMD"] = str2date(df["発症日"])

df["陽性判明日YMD"] = str2date(df["陽性判明日"])

y, m, d = map(int, re.findall("\d+", update))

dt_update = datetime.datetime(2018 + y, m, d)


df.to_csv(f'sagamihara{dt_update.strftime("%Y%m%d")}.csv', encoding="utf_8_sig")