新型コロナウイルス感染症に関する相模原市発表資料(発生状況等)のPDFをCSVに変換
14951 ワード
import datetime
import pathlib
import re
from urllib.parse import urljoin
import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup
def fetch_file(url, dir="."):
    """Download ``url`` and save the response body under ``dir``.

    The local file name is the last component of the URL's *path*, so a
    query string or fragment (``file.pdf?ver=2``) does not leak into the
    saved name. Missing parent directories are created.

    Args:
        url: Address of the file to download.
        dir: Directory the file is written to (defaults to the current one).

    Returns:
        ``pathlib.Path`` of the saved file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    from urllib.parse import urlparse

    r = requests.get(url)
    r.raise_for_status()

    # Prefer the path component only; fall back to the previous behaviour
    # when the URL has no path segment to name the file after.
    filename = (
        pathlib.PurePosixPath(urlparse(url).path).name
        or pathlib.PurePath(url).name
    )

    p = pathlib.Path(dir, filename)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode="wb") as fw:
        fw.write(r.content)

    return p
# Index page that links to the PDF announcement.
url = "https://www.city.sagamihara.kanagawa.jp/shisei/koho/1019191.html"

# A browser-like User-Agent is sent with the page request.
# NOTE(review): presumably the site rejects the default client UA - confirm.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

r = requests.get(url, headers=headers)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# Find the anchor whose href ends in ".pdf" (literal dot) and whose onclick
# attribute contains the announcement title.
tag = soup.find(
    "a",
    href=re.compile(r"\.pdf$"),
    onclick=re.compile("新型コロナウイルス感染症による新たな患者の確認"),
)
if tag is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError(f"PDF link not found on {url}")

# The href may be relative, so resolve it against the page URL.
link = urljoin(url, tag.get("href"))

path_pdf = fetch_file(link)
# Collect every 11-column table from the PDF into ``dfs`` and grab the text
# of a strip on the first page into ``update`` (parsed as a date later on).
with pdfplumber.open(path_pdf) as pdf:
    dfs = []

    for page in pdf.pages:
        if page.page_number == 1:
            # Read text from a cropped region: (x0, top, x1, bottom).
            crop = page.within_bbox((400, 44, page.width, 60))
            update = crop.extract_text()

        for table in page.extract_tables():
            df_tmp = pd.DataFrame(table)

            # Only tables with exactly 11 columns are of interest.
            if df_tmp.shape[1] != 11:
                continue

            # Skip tables whose very first cell contains "未満". An empty
            # cell may come back as None, so treat it as an empty string.
            first_cell = table[0][0] or ""
            if "未満" not in first_cell:
                dfs.append(df_tmp)
# Stack the collected tables, drop the first row (presumably the header row
# of the first table - confirm) and label the 11 columns.
column_names = [
    "症例No.",
    "年代",
    "性別",
    "職業等",
    "場所",
    "居住地",
    "症状",
    "発症日",
    "陽性判明日",
    "感染経路等",
    "備考",
]

stacked = pd.concat(dfs)
df = stacked.iloc[1:].set_axis(column_names, axis=1)

# Bare expression kept from the original (shows the frame in a notebook cell).
df
# Remove every whitespace character and apply NFKC normalization to all
# object-dtype (text) columns. ``regex=True`` is required: since pandas 2.0
# ``str.replace`` treats the pattern as a literal string by default, which
# would leave the whitespace untouched.
for col in df.select_dtypes(include=object).columns:
    df[col] = df[col].str.replace(r"\s", "", regex=True).str.normalize("NFKC")
# Reference "now": supplies the year and the future-date cut-off below.
dt_now = datetime.datetime.now()


def str2date(s: pd.Series) -> pd.Series:
    """Convert strings containing ``<month>月<day>日`` into timestamps.

    The text carries no year, so the current year (from ``dt_now``) is
    assumed first. Any resulting date that lies after ``dt_now`` is moved
    to the previous year.

    Args:
        s: Series of strings to parse.

    Returns:
        Series of datetimes; entries without a month/day match, or with an
        invalid date, become ``NaT``.
    """
    parts = (
        s.str.extract(r"(\d{1,2})月(\d{1,2})日")
        .rename(columns={0: "month", 1: "day"})
        # Non-matching rows become month=0/day=0, which coerces to NaT below.
        .fillna(0)
        .astype(int)
    )

    parts["year"] = dt_now.year

    # First pass with the current year, only to detect dates in the future.
    provisional = pd.to_datetime(parts, errors="coerce")

    parts["year"] = parts["year"].mask(provisional > dt_now, parts["year"] - 1)

    return pd.to_datetime(parts, errors="coerce")
# Add parsed date columns next to the original text columns.
df["発症日YMD"] = str2date(df["発症日"])
df["陽性判明日YMD"] = str2date(df["陽性判明日"])

# The text cropped from the first PDF page must contain exactly three
# numbers, read as year, month and day.
numbers = re.findall(r"\d+", update or "")
if len(numbers) != 3:
    raise ValueError(f"unexpected date text in PDF header: {update!r}")
y, m, d = map(int, numbers)

# NOTE(review): 2018 + y presumably converts a Reiwa-era year to the
# Gregorian calendar - confirm against the PDF header.
dt_update = datetime.datetime(2018 + y, m, d)

# utf_8_sig writes a BOM at the start of the file.
df.to_csv(f'sagamihara{dt_update.strftime("%Y%m%d")}.csv', encoding="utf_8_sig")
Author And Source
この問題について(新型コロナウイルス感染症に関する相模原市発表資料(発生状況等)のPDFをCSVに変換)、我々は、より多くの情報をここで見つけました: https://qiita.com/barobaro/items/55ad358ad7ef4a07c65f 。著者帰属:元の著者の情報は、元のURLに含まれています。著作権は原作者に属します。
Content is automatically searched and collected through network algorithms. If there is a violation, please contact us. We will adjust (correct author information, or delete content) as soon as possible.