miniProject4 - MovieStar: 1. DBの作成(スクレイピング)

9278 ワード

パッケージとDB接続(init db.py)

import requests
from bs4 import BeautifulSoup

from pymongo import MongoClient

# Open a connection to the MongoDB server on localhost:27017 and pick the
# database that the rest of this script reads and writes through ``db``.
client = MongoClient(host='localhost', port=27017)
db = client['moviestarDB']
(NAVER映画、映画人ランキングページ)<= スクレイピング対象

1.DBに保存する映画人のソースURLを取得する

def get_urls():
    """Collect the detail-page URLs of the people listed on the ranking page.

    Downloads the NAVER movie people-ranking page, walks the rows of the
    ranking table and resolves each row's title link against the site root.

    Returns:
        list: Absolute URLs (str), in the order the rows appear on the page.
        Rows without a ``td.title > a`` link, or whose link carries no
        ``href``, are skipped.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    from urllib.parse import urljoin

    base_url = 'https://movie.naver.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

    # A timeout keeps the script from hanging forever on a stalled
    # connection; raise_for_status() stops an error page from being parsed
    # as if it were the ranking table (which would silently yield no URLs).
    data = requests.get(
        'https://movie.naver.com/movie/sdb/rank/rpeople.nhn',
        headers=headers,
        timeout=10,
    )
    data.raise_for_status()

    soup = BeautifulSoup(data.text, 'html.parser')

    urls = []
    for tr in soup.select('#old_content > table > tbody > tr'):
        # Full selector of the link inside a row:
        # old_content > table > tbody > tr:nth-child(X) > td.title > a
        a = tr.select_one('td.title > a')
        if a is None or not a.get('href'):
            # Not every row of the table carries a person link.
            continue
        # urljoin avoids the double slash that plain concatenation produces
        # when the href already starts with "/".
        urls.append(urljoin(base_url, a['href']))

    return urls

2.各映画人のページから写真、名前、最新作品情報を取得してDBに保存する

def insert_star(url):
    """Scrape one person's detail page and insert the result into ``db.mystar``.

    The stored document has the keys ``name``, ``img_url``, ``recent``,
    ``url`` and ``like`` (always initialised to 0).

    Args:
        url: URL of the person's detail page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page does not contain the name link, the poster
            image (with a ``src``) or the recent-work link. Nothing is
            inserted in that case.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    data = requests.get(url, headers=headers, timeout=10)
    data.raise_for_status()

    soup = BeautifulSoup(data.text, 'html.parser')

    # CSS selectors of the three fields on the detail page:
    #   name:        ... > div.mv_info.character > h3 > a
    #   img_url:     ... > div.poster > img
    #   recent_work: ... > div.mv_info.character > dl > dd > a:nth-child(1)
    name_tag = soup.select_one(
        '#content > div.article > div.mv_info_area > div.mv_info.character > h3 > a')
    img_tag = soup.select_one(
        '#content > div.article > div.mv_info_area > div.poster > img')
    recent_tag = soup.select_one(
        '#content > div.article > div.mv_info_area > div.mv_info.character > dl > dd > a:nth-child(1)')

    # select_one() returns None when nothing matches; report which field is
    # missing instead of failing later with an opaque AttributeError.
    missing = []
    if name_tag is None:
        missing.append('name')
    if img_tag is None or img_tag.get('src') is None:
        missing.append('img_url')
    if recent_tag is None:
        missing.append('recent')
    if missing:
        raise ValueError(
            '{}: could not find {} on the page'.format(url, ', '.join(missing)))

    name = name_tag.text
    doc = {
        'name': name,
        'img_url': img_tag['src'],
        'recent': recent_tag.text,
        'url': url,
        'like': 0
    }

    db.mystar.insert_one(doc)
    print('완료!', name)

3.既存のコレクションを削除し、スクレイピングしてデータベースに保存する

def insert_all():
    """Rebuild the ``mystar`` collection from the people-ranking page.

    The URLs are fetched *before* the existing collection is dropped, so a
    failure while downloading the ranking page leaves the stored data
    untouched. If no URLs are found, the existing collection is kept as well.
    """
    urls = get_urls()
    if not urls:
        # Nothing to insert; dropping now would only wipe the existing data.
        print('No URLs found; keeping the existing mystar collection.')
        return

    db.mystar.drop()  # Remove every document of the old mystar collection.
    for url in urls:
        insert_star(url)


# Run it: drop the mystar collection and refill it from the ranking page.
insert_all()

moviestarDBの「mystar」コレクション