JFLの試合結果から得点ランキングを作成

19949 ワード

pandas スクレイピング Python データラングリング Python テキストリンク

スクレイピング

import time
import unicodedata
from urllib.parse import urljoin
import re

import requests
from bs4 import BeautifulSoup

def cleaning(info, team, html):

    result = []

    for trs in html:

        data = [i.get_text(strip=True) for i in trs.select("th, td")]

        # 時間の分を除去後、延長時間を計算
        data[0] = eval(data[0].rstrip("分"))

        # 選手名のPKを削除
        data[2] = re.sub("\(.+\)", "", unicodedata.normalize("NFKC", data[2])).strip()

        result.append(info + [team] + data)

    return result

def scraping(n, url):

    r = requests.get(url)
    r.raise_for_status()

    soup = BeautifulSoup(r.content, "html5lib")

    # 節
    score_season = soup.select_one(
        "div.score-header > h2.score-meta > span.score-season"
    ).get_text(strip=True)

    score_season = int(score_season.strip("第節"))

    # print(score_season)

    # 日時
    score_date = (
        soup.select_one("div.score-header > h2.score-meta > span.score-date")
        .get_text(strip=True)
        .split()
    )

    # print(score_date)

    # チーム名
    score_table = soup.select_one("table.score-table")

    home_team = score_table.select_one("th.score-team1").get_text(strip=True)
    away_team = score_table.select_one("th.score-team2").get_text(strip=True)

    # print(home_team, away_team)

    # 試合情報
    game_info = [n, score_season] + score_date + [home_team, away_team]

    # 得点
    tag = soup.find("h3", text="得　点")

    # 得点のテーブルか確認
    if tag:

        table_home = [
            trs
            for trs in tag.parent.select(
                "div.score-frame > div.score-left > table > tbody > tr"
            )
        ]
        home_data = cleaning(game_info, home_team, table_home)

        table_away = [
            trs
            for trs in tag.parent.select(
                "div.score-frame > div.score-right > table > tbody > tr"
            )
        ]
        away_data = cleaning(game_info, away_team, table_away)

        score_data = home_data + away_data

        return score_data

    return None

url = "http://www.jfl.or.jp/jfl-pc/view/s.php?a=1542&f=2020A001_spc.html"

r = requests.get(url)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html5lib")

links = [urljoin(url, link.get("href")) for link in soup.select("td.detail-link > a") if link.text == "詳細"]

result = []

for i, link in enumerate(links):

    score_data = scraping(i, link)

    if score_data:

        result.extend(score_data)

    time.sleep(1)

データラングリング

import pandas as pd

df = pd.DataFrame(result, columns=["試合", "節", "日付", "時刻", "ホーム", "アウェイ", "チーム名", "時間", "背番号", "選手名"])

df

df["得点"] = 1

# ゴール数ランキング
pv_goal = df.pivot_table(
    values="得点", index=["選手名", "チーム名", "背番号"], aggfunc=sum, fill_value=0
).drop(["オウンゴール"]).reset_index()

pv_goal["背番号"] = pv_goal["背番号"].astype(int)

# ランキング
pv_goal["順位"] = pv_goal["得点"].rank(ascending=False, method="min").astype(int)

# チーム
jfl_2020 = [
    "Ｈｏｎｄａ ＦＣ",
    "ソニー仙台ＦＣ",
    "東京武蔵野シティＦＣ",
    "テゲバジャーロ宮崎",
    "ホンダロックＳＣ",
    "ヴェルスパ大分",
    "ＦＣ大阪",
    "ＭＩＯびわこ滋賀",
    "ヴィアティン三重",
    "ＦＣマルヤス岡崎",
    "鈴鹿ポイントゲッターズ",
    "ラインメール青森",
    "奈良クラブ",
    "松江シティＦＣ",
    "いわきＦＣ",
    "高知ユナイテッドＳＣ",
]

team = {name: i for i, name in enumerate(jfl_2020, 1)}

pv_goal["チームID"] = pv_goal["チーム名"].map(team)

# 順位・チーム名・選手名で昇順
pv_goal.sort_values(["順位", "チームID", "背番号"], ascending=[True, True, True], inplace=True)

pv_goal.drop(["チームID", "背番号"], axis=1, inplace=True)

pv_goal.set_index("順位", inplace=True)

pv_goal.to_csv("goal.csv")

JFLの試合結果から得点ランキングを作成

スクレイピング

データラングリング

ランキング

Author And Source