Google News clipping [extracting only the Hangul words from the full HTML]


Let's build the most interesting scraper yet!

First, the required libraries have to be installed and imported.
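A minimal installation sketch, assuming pip is available (the package names below are inferred from the imports that follow; KoNLPy additionally needs a Java runtime, and a chromedriver binary matching your Chrome version has to sit next to the script):

# pip install beautifulsoup4 selenium requests konlpy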
from bs4 import BeautifulSoup
from selenium import webdriver
from time import sleep
import requests
import re
import urllib.parse
import konlpy.tag

# Build a Google News search URL for a sample keyword.
keyword_input = "EX"
keyword = urllib.parse.quote(keyword_input)
base_url = "https://news.google.com"
search_url = base_url + "/search?q=" + keyword + "&hl=ko&gl=KR&ceid=KR%3Ako"
print("URL combined with the search term: ", search_url)

# Selenium driver used to render each article page, plus shared state for the helpers below.
driver = webdriver.Chrome('./chromedriver')  # path to a local chromedriver binary
page_ko = []                  # Hangul words extracted from the most recently visited page
Okt = konlpy.tag.Okt()        # KoNLPy's Okt morphological analyzer
page_c = 1                    # running counter used for the output file names

def google_news_clipping_keyword(keyword_input, limit=5):
    global page_c
    # Build the Google News search URL from the keyword.
    keyword = urllib.parse.quote(keyword_input)
    url = base_url + "/search?q=" + keyword + "&hl=ko&gl=KR&ceid=KR%3Ako"

    # Fetch the search result page and parse it.
    resp = requests.get(url)
    html_src = resp.text
    soup = BeautifulSoup(html_src, "html.parser")

    # Each result card sits in a div with Google's generated class name "xrnccd";
    # these obfuscated class names change from time to time and may need updating.
    news_items = soup.select('div[class="xrnccd"]')

    links = []; agencies = []; reporting_dates = []; reporting_times = []

    for item in news_items[:limit]:
        # Article link: the href is a relative path starting with ".", so drop the first character.
        link = item.find('a', attrs={'class': 'VDXfz'}).get('href')
        news_link = base_url + link[1:]
        links.append(news_link)

        # News agency name.
        news_agency = item.find('a', attrs={'class': 'wEwyrc AVN2gc uQIVzc Sksgp'}).text
        agencies.append(news_agency)

        # The <time> element carries an ISO-8601 'datetime' attribute such as "2021-01-15T09:30:00Z";
        # split it into date and time and drop the trailing "Z".
        news_reporting = item.find('time', attrs={'class': 'WW6dff uQIVzc Sksgp'})
        news_reporting_datetime = news_reporting.get('datetime').split("T")
        news_reporting_date = news_reporting_datetime[0]
        news_reporting_time = news_reporting_datetime[1][:-1]
        reporting_dates.append(news_reporting_date)
        reporting_times.append(news_reporting_time)

        # Render the article page, keep only its Hangul words, then filter and save them.
        url_html_out_korean(news_link)
        kotext_rm_stopwords(page_ko)
        print(page_c)
        page_c += 1

    result = {'link': links, 'agency': agencies,
              'date': reporting_dates, 'time': reporting_times}
    return result

def url_html_out_korean(url):
    """Load the page with Selenium and keep only the runs of Hangul syllables."""
    global page_ko
    driver.get(url)
    sleep(6)  # crude wait for the page to finish rendering
    page = driver.page_source
    # U+AC00 (가) to U+D7A3 (힣) covers all precomposed Hangul syllables.
    page_ko = re.compile('[가-힣]+').findall(page)
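# A quick illustration of what the Hangul-only pattern keeps (the sample string is my own):
#   re.findall('[가-힣]+', '<p>삼성전자 Q3 실적: 매출 증가</p>')
#   -> ['삼성전자', '실적', '매출', '증가']
# HTML tags, Latin text, digits and punctuation are all dropped; only contiguous Hangul runs survive.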

def kotext_rm_stopwords(text):
    """POS-tag the Hangul words, keep only the nouns, drop stopwords and write them to a file."""
    Noun_words = []
    Okt_morphs = Okt.pos(' '.join(text))  # returns (token, POS-tag) tuples
    for word, pos in Okt_morphs:
        if pos == 'Noun':
            Noun_words.append(word)

    # Remove every occurrence of each stopword.
    unique_Noun_words = set(Noun_words)
    for word in unique_Noun_words:
        if word in stopwords:
            while word in Noun_words:
                Noun_words.remove(word)

    # One output file per article, e.g. "<search term>1.txt".
    with open(search_word + str(page_c) + ".txt", 'w', encoding="UTF8") as f:
        f.write(' '.join(Noun_words))
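# For reference, Okt.pos() output looks roughly like this (the sample sentence is my own,
# and the exact tokenization depends on the KoNLPy/Okt version):
#   Okt.pos('삼성전자가 매출을 발표했다')
#   -> [('삼성전자', 'Noun'), ('가', 'Josa'), ('매출', 'Noun'), ('을', 'Josa'), ('발표', 'Noun'), ('했다', 'Verb')]
# which is why only the 'Noun' entries are kept above.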

# Interactive run: the search term itself is used as the only stopword.
search_word = input('Enter a search term: ')
page_count = input('Enter the number of pages: ')
stopwords = [search_word]

news = google_news_clipping_keyword(search_word, int(page_count))
print(news['link'])
print(news['agency'])
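For each clipped article the script writes the filtered nouns to a text file named after the search term and the running page counter (e.g. 삼성전자1.txt for the first article when searching for 삼성전자), and google_news_clipping_keyword returns the links, news agencies, dates and times as parallel lists, two of which are printed at the end.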