# 沖縄県の新型コロナウイルス陽性者一覧をデータラングリング
# (Data wrangling of Okinawa Prefecture's list of COVID-19 positive cases)


import datetime
import pathlib
import re

from urllib.parse import urljoin

import requests
import pandas as pd
from bs4 import BeautifulSoup

# HTTP request headers used by fetch_soup; the User-Agent value is a
# browser-style identification string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}


def fetch_soup(url, parser="html.parser", timeout=30):
    """Download *url* and return its content parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        parser: Parser name handed to BeautifulSoup.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the script forever.

    Returns:
        The BeautifulSoup object built from the raw response bytes.

    Raises:
        requests.HTTPError: If ``raise_for_status`` rejects the response.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    # Pass bytes (not r.text) so BeautifulSoup detects the encoding itself.
    return BeautifulSoup(r.content, parser)


def fetch_file(url, dir=".", timeout=30):
    """Download *url* into directory *dir* and return the local path.

    The local file name is the last path component of *url*.  The target
    directory is created when it does not exist yet.

    Args:
        url: Address of the file to download.
        dir: Destination directory (the name shadows the builtin but is
            kept because callers may pass it by keyword).
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the script forever.

    Returns:
        ``pathlib.Path`` of the written file.

    Raises:
        requests.HTTPError: If ``raise_for_status`` rejects the response.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    # Send the same headers as fetch_soup so both requests present the same
    # User-Agent to the server.
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p


def str2date(s: pd.Series) -> pd.Series:
    """Parse Japanese ``YYYY年M月D日`` dates contained in a string Series.

    Args:
        s: Series of strings; the date may appear anywhere in each value.

    Returns:
        A datetime Series with the same index as *s*.  Entries without a
        ``YYYY年M月D日`` match (including missing values) and entries whose
        numbers do not form a real calendar date become ``NaT``.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape.
    parts = s.str.extract(r"(\d{4})年(\d{1,2})月(\d{1,2})日")
    parts = parts.rename(columns={0: "year", 1: "month", 2: "day"})

    # Unmatched rows have missing captures.  The placeholder is the string
    # "0" so it has the same type as the captured digits; astype(int) then
    # converts every cell in one step.
    parts = parts.fillna("0").astype(int)

    # A year/month/day of 0 is not a valid date, so errors="coerce" turns
    # the placeholder rows into NaT.
    return pd.to_datetime(parts, errors="coerce")

# Scraping: find the CSV download link on the press-release page
url = "https://www.pref.okinawa.lg.jp/site/hoken/kansen/soumu/press/20200214_covid19_pr1.html"

soup = fetch_soup(url)

# Anchor whose text starts with "陽性者一覧" and whose href ends in ".csv".
# The dot is escaped so that only a literal ".csv" suffix matches, and
# `string=` is the current name of the legacy `text=` filter.
tag = soup.find(
    "a",
    class_=None,
    string=re.compile(r"^陽性者一覧"),
    href=re.compile(r"\.csv$"),
)

# find() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on tag.get below.
if tag is None:
    raise RuntimeError(f"CSV link for 陽性者一覧 not found on {url}")

# The href may be relative, so resolve it against the page URL.
link = urljoin(url, tag.get("href"))

# Data wrangling: download the CSV and derive the parsed date columns

p = fetch_file(link)
df = pd.read_csv(p, encoding="cp932")

confirmed_dates = str2date(df["確定日"])
onset_dates = str2date(df["発病日"])

df["確定日YMD"] = confirmed_dates
df["発病日YMD"] = onset_dates

# Keep the raw 発病日 text only on rows where no date could be parsed from it;
# rows with a parsed date get a missing value.
df["状況"] = df["発病日"].mask(onset_dates.notna())

# Bare expression with no effect when run as a script (presumably a leftover
# notebook cell display).
df

# Flag the rows that must be excluded from the cleaned output

df["drop"] = False

# 確定陽性者 (case number) is not a decimal number.  isdecimal() is used
# rather than isnumeric() because isnumeric() also accepts characters such
# as "①" or "一" that the later conversion to int cannot handle.
df["drop"] = df["drop"].where(df["確定陽性者"].str.isdecimal(), True)

# 性別 (sex) is not one of 男性 / 女性 / 非公表
df["drop"] = df["drop"].where(df["性別"].isin(["男性", "女性", "非公表"]), True)

# 年齢 (age) holds the marker "欠番" (missing number)
df["drop"] = df["drop"].mask(df["年齢"] == "欠番", True)

# Cleaned output: keep the rows not flagged for removal, drop the helper
# column, and index the result by the integer case number in ascending order

df1 = (
    df.loc[~df["drop"]]
    .drop(columns="drop")
    .astype({"確定陽性者": int})
    .set_index("確定陽性者")
    .sort_index()
)

df1.to_csv("output.csv", encoding="utf_8_sig")

# Rows flagged for removal

df2 = df[df["drop"]].copy()

# Case numbers of the removed rows

# Keep only values made of decimal digits.  A missing case number is a float
# NaN, which has no isdecimal(), hence the isinstance guard.
missing_num = sorted(
    int(i) for i in df2["確定陽性者"].to_list() if isinstance(i, str) and i.isdecimal()
)

# Current time in JST (UTC+9), with the timezone info stripped afterwards
JST = datetime.timezone(datetime.timedelta(hours=9))
dt_now = datetime.datetime.now(tz=JST).replace(tzinfo=None)

# Write the four-line summary report, one entry per line
with open("report.txt", "w") as fw:
    fw.writelines(
        entry + "\n"
        for entry in (
            f'Report created at: {dt_now.strftime("%H:%M:%S")} JST',
            f"Total cases: {len(df)}",
            f"Missing cases: {len(missing_num)}",
            f"Missing case id: {missing_num}",
        )
    )