[Online Roaming Great Demon King] Saving in Various Formats


Complete code
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import sys
import os
import requests
import pandas as pd

f_name = os.getcwd()
fc_name = os.getcwd()
fx_name = os.getcwd()

while True:
    try:
        site = input("사이트를 입력하세요\n")
        if site.find("naver") != -1 or site.find("네이버") != -1:
            site = "https://search.naver.com/search.naver"
            f_name += "\\py_site2\\naver.txt"
            fc_name += "\\py_site2\\naver.csv"
            fx_name += "\\py_site2\\naver.xls"
            break
        elif site.find("daum") != -1 or site.find("다음") != -1:
            site = "https://search.daum.net/search"
            f_name += "\\py_site2\\daum.txt"
            fc_name += "\\py_site2\\daum.csv"
            fx_name += "\\py_site2\\daum.xls"
            break
        elif site.find("google") != -1 or site.find("구글") != -1:
            site = "https://www.google.co.kr/search"
            f_name += "\\py_site2\\google.txt"
            fc_name += "\\py_site2\\google.csv"
            fx_name += "\\py_site2\\google.xls"
            break 
        else:
            print("지원하지 않는 사이트입니다.")
    except:
        print(" 오류가 발생하였습니다.")


query_txt = input('크롤링할 키워드는 무엇입니까\n')


if site == "https://search.naver.com/search.naver":
    params = {'where':'kin', 'query': query_txt}
    resp = requests.get(site, params = params)


    full_html = resp.content
    soup = BeautifulSoup(full_html, 'html.parser')

    content_list = soup.select('ul.lst_total > li')
    no =1
    numbers = []
    questions=[]
    answers=[]
    
    for i in content_list:
    
        numbers.append(no)
        print('번호: ', no)

        question = i.find('div', 'question_group').get_text()
        questions.append(question)
        print('질문: ', question.strip())

        answer = i.find('div', 'answer_group').get_text()
        answers.append(answer)
        print('답변: ', answer.strip())
   
        no += 1
    
    DB = pd.DataFrame()
    DB['번호'] = numbers
    DB['질문'] = questions
    DB['답변'] = answers

    DB.to_csv(fc_name, encoding='utf-8-sig')
    DB.to_excel(fx_name)

    f = open(f_name, 'a', encoding='UTF-8')
    f.write(str(numbers))
    f.write(str(questions))
    f.write(str(answers))
    f.close()

elif site == "https://search.daum.net/search":   
    params = {'w':'blog', 'q': query_txt}
    resp = requests.get(site, params = params)


    full_html = resp.content
    soup = BeautifulSoup(full_html, 'html.parser')

    content_list = soup.select('ul.list_info > li')
    no =1
    numbers = []
    titles=[]
    bodies=[]

    
    for i in content_list:
    
        numbers.append(no)
        print('번호: ', no)

        title = i.find('div', 'wrap_tit mg_tit').get_text()
        titles.append(title)
        print('제목: ', title.strip())

        body = i.find('p', 'f_eb desc').get_text()
        bodies.append(body)
        print('본문: ', body.strip())
   
        no += 1
    
    DB = pd.DataFrame()
    DB['번호'] = numbers
    DB['제목'] = titles
    DB['본문'] = bodies

    DB.to_csv(fc_name, encoding='utf-8-sig')
    DB.to_excel(fx_name)

    f = open(f_name, 'a', encoding='UTF-8')
    f.write(str(numbers))
    f.write(str(titles))
    f.write(str(bodies))
    f.close()
        
elif site == "https://www.google.co.kr/search":
    params = {'q': query_txt, 'tbm':'nws'}
    resp = requests.get(site, params = params)

    time.sleep(1)
    full_html = resp.content
    soup = BeautifulSoup(full_html, 'html.parser')

    content_list = soup.select('div#main div.ZINbbc')
    no =1
    numbers = []
    titles=[]
    bodies=[]
    
    
    for i in content_list:
        title = i.find('div', class_ ='BNeawe vvjwJb AP7Wnd')
        if title is None:
            continue
            
        numbers.append(no)
        print('번호: ', no)
        
        titles.append(title.get_text())
        print('제목: ', title.get_text().strip())
        
        body = i.find('div', class_ ='BNeawe s3v9rd AP7Wnd').get_text()
        bodies.append(body)
        print('본문: ', body.strip())
        
        no += 1

    DB = pd.DataFrame()
    DB['번호'] = numbers
    DB['제목'] = titles
    DB['본문'] = bodies

    DB.to_csv(fc_name, encoding='utf-8-sig')
    DB.to_excel(fx_name)

    f = open(f_name, 'a', encoding='UTF-8')
    f.write(str(numbers))
    f.write(str(titles))
    f.write(str(bodies))
    f.close()
Intro
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import sys
import os
import requests
import pandas as pd

f_name = os.getcwd()
fc_name = os.getcwd()
fx_name = os.getcwd()

while True:
    try:
        site = input("사이트를 입력하세요\n")
        if site.find("naver") != -1 or site.find("네이버") != -1:
            site = "https://search.naver.com/search.naver"
            f_name += "\\py_site2\\naver.txt"
            fc_name += "\\py_site2\\naver.csv"
            fx_name += "\\py_site2\\naver.xls"
            break
        elif site.find("daum") != -1 or site.find("다음") != -1:
            site = "https://search.daum.net/search"
            f_name += "\\py_site2\\daum.txt"
            fc_name += "\\py_site2\\daum.csv"
            fx_name += "\\py_site2\\daum.xls"
            break
        elif site.find("google") != -1 or site.find("구글") != -1:
            site = "https://www.google.co.kr/search"
            f_name += "\\py_site2\\google.txt"
            fc_name += "\\py_site2\\google.csv"
            fx_name += "\\py_site2\\google.xls"
            break 
        else:
            print("지원하지 않는 사이트입니다.")
    except:
        print(" 오류가 발생하였습니다.")


query_txt = input('크롤링할 키워드는 무엇입니까\n')
This is not much different from what we did before. This time, though, we also create a CSV file and an XLS file, so let's set up a file name for each of them.
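If you would rather not hard-code the Windows-style backslashes, the same three paths can be built with os.path.join; a minimal sketch for the Naver case (py_site2 is the same output folder used above):

import os

out_dir = os.path.join(os.getcwd(), 'py_site2')
os.makedirs(out_dir, exist_ok=True)            # create the output folder if it is missing

f_name  = os.path.join(out_dir, 'naver.txt')   # raw text dump
fc_name = os.path.join(out_dir, 'naver.csv')   # CSV output
fx_name = os.path.join(out_dir, 'naver.xls')   # Excel output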
NAVER
if site == "https://search.naver.com/search.naver":
    params = {'where':'kin', 'query': query_txt}
    resp = requests.get(site, params = params)


    full_html = resp.content
    soup = BeautifulSoup(full_html, 'html.parser')

    content_list = soup.select('ul.lst_total > li')
    no =1
    numbers = []
    questions=[]
    answers=[]
    
    for i in content_list:
    
        numbers.append(no)
        print('번호: ', no)

        question = i.find('div', 'question_group').get_text()
        questions.append(question)
        print('질문: ', question.strip())

        answer = i.find('div', 'answer_group').get_text()
        answers.append(answer)
        print('답변: ', answer.strip())
   
        no += 1
    
    DB = pd.DataFrame()
    DB['번호'] = numbers
    DB['질문'] = questions
    DB['답변'] = answers

    DB.to_csv(fc_name, encoding='utf-8-sig')
    DB.to_excel(fx_name)

    f = open(f_name, 'a', encoding='UTF-8')
    f.write(str(numbers))
    f.write(str(questions))
    f.write(str(answers))
    f.close()
This is quite different from the previous code. Because the search URL follows a simple rule, we can fire off the search with requests right away, and the select call can be written concisely.
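To see what requests does with that params dictionary, you can build the request without sending it and look at the final URL; a small sketch (the keyword here is just an example):

import requests

site = 'https://search.naver.com/search.naver'
params = {'where': 'kin', 'query': '파이썬'}    # 'kin' is the Knowledge iN tab

# requests encodes the dictionary into the query string for us
prepared = requests.Request('GET', site, params=params).prepare()
print(prepared.url)
# https://search.naver.com/search.naver?where=kin&query=%ED%8C%8C%EC%9D%B4%EC%8D%AC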
<ul class = 'lst_total'>
    <li>
        <div class = 'question_group'> 제목1 </div>
        <div class = 'answer_group'> 본문 1 </div>
    </li>
    <li>
        <div class = 'question_group'> 제목2 </div>
        <div class = 'answer_group'> 본문 2 </div>
    </li>
    <li>
        <div class = 'question_group'> 제목3 </div>
        <div class = 'answer_group'> 본문 3 </div>
    </li>
    <li>
        <div class = 'question_group'> 제목4 </div>
        <div class = 'answer_group'> 본문 4 </div>
    </li>
    ....
</ul>
If we write content_list = soup.select('ul.lst_total > li'), it gathers exactly those items, and all that's left is to walk through them in a for loop.
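Here is that select/for pattern as a tiny, self-contained sketch on the simplified markup above (not the real Naver page):

from bs4 import BeautifulSoup

html = '''
<ul class="lst_total">
  <li><div class="question_group">제목1</div><div class="answer_group">본문 1</div></li>
  <li><div class="question_group">제목2</div><div class="answer_group">본문 2</div></li>
</ul>
'''
soup = BeautifulSoup(html, 'html.parser')

# 'ul.lst_total > li' picks every <li> that is a direct child of the <ul>
for no, li in enumerate(soup.select('ul.lst_total > li'), start=1):
    question = li.find('div', 'question_group').get_text().strip()
    answer = li.find('div', 'answer_group').get_text().strip()
    print(no, question, answer)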
After that we just create the CSV and XLS files.
Next up is Daum.
elif site == "https://search.daum.net/search":   
    params = {'w':'blog', 'q': query_txt}
    resp = requests.get(site, params = params)


    full_html = resp.content
    soup = BeautifulSoup(full_html, 'html.parser')

    content_list = soup.select('ul.list_info > li')
    no =1
    numbers = []
    titles=[]
    bodies=[]

    
    for i in content_list:
    
        numbers.append(no)
        print('번호: ', no)

        title = i.find('div', 'wrap_tit mg_tit').get_text()
        titles.append(title)
        print('제목: ', title.strip())

        body = i.find('p', 'f_eb desc').get_text()
        bodies.append(body)
        print('본문: ', body.strip())
   
        no += 1
    
    DB = pd.DataFrame()
    DB['번호'] = numbers
    DB['제목'] = titles
    DB['본문'] = bodies

    DB.to_csv(fc_name, encoding='utf-8-sig')
    DB.to_excel(fx_name)

    f = open(f_name, 'a', encoding='UTF-8')
    f.write(str(numbers))
    f.write(str(titles))
    f.write(str(bodies))
    f.close()
        
This one is much the same as well; hardly anything changes.
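One detail worth knowing about i.find('div', 'wrap_tit mg_tit'): when you pass a class string to find, BeautifulSoup compares it against the tag's whole class attribute, so it only matches when the attribute is exactly that string. A CSS selector with the classes joined by dots is the more forgiving alternative; a minimal sketch:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div class="wrap_tit mg_tit">제목</div>', 'html.parser')

# exact match against the full class attribute
print(soup.find('div', 'wrap_tit mg_tit').get_text())

# matches both classes regardless of order or any extra classes on the tag
print(soup.select_one('div.wrap_tit.mg_tit').get_text())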
Google
elif site == "https://www.google.co.kr/search":
    params = {'q': query_txt, 'tbm':'nws'}
    resp = requests.get(site, params = params)

    time.sleep(1)
    full_html = resp.content
    soup = BeautifulSoup(full_html, 'html.parser')

    content_list = soup.select('div#main div.ZINbbc')
    no =1
    numbers = []
    titles=[]
    bodies=[]
    
    
    for i in content_list:
        title = i.find('div', class_ ='BNeawe vvjwJb AP7Wnd')
        if title is None:
            continue
            
        numbers.append(no)
        print('번호: ', no)
        
        titles.append(title.get_text())
        print('제목: ', title.get_text().strip())
        
        body = i.find('div', class_ ='BNeawe s3v9rd AP7Wnd').get_text()
        bodies.append(body)
        print('본문: ', body.strip())
        
        no += 1

    DB = pd.DataFrame()
    DB['번호'] = numbers
    DB['제목'] = titles
    DB['본문'] = bodies

    DB.to_csv(fc_name, encoding='utf-8-sig')
    DB.to_excel(fx_name)

    f = open(f_name, 'a', encoding='UTF-8')
    f.write(str(numbers))
    f.write(str(titles))
    f.write(str(bodies))
    f.close()
Google is a little more complicated. First, soup.select('div#main div.ZINbbc') works just the way we learned last time: the # part selects by id. Oh, and one thing: the space in a selector stands for a descendant tag, so if a class name itself contains spaces, just cut everything after the first token and write only that part in the selector.
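As a small illustration of that rule, here is the same idea on a made-up snippet (the second class token is invented for the example, not copied from Google):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div id="main"><div class="ZINbbc xpd">기사</div></div>', 'html.parser')

# A space in a selector means "descendant", so pasting the raw class
# string 'ZINbbc xpd' into it would not work. Keep only the first token...
print(soup.select('div#main div.ZINbbc'))

# ...or join every token with dots, which matches the same tag:
print(soup.select('div#main div.ZINbbc.xpd'))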
The for loop is also more involved: content_list[0] does not contain the div.BNeawe title, so find() returns None there. The fix for such cases is to skip the item with continue whenever the value is None (nothing was found). After that, everything is exactly the same as before.
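If you do not like scattering continue checks around, the same guard fits nicely in a tiny helper; a sketch (get_text_or_blank is a made-up name, not part of the original code):

def get_text_or_blank(tag):
    # return the stripped text, or '' when find() handed us None
    return tag.get_text().strip() if tag is not None else ''

# inside the loop it would read roughly like this:
# title = get_text_or_blank(i.find('div', class_='BNeawe vvjwJb AP7Wnd'))
# if not title:
#     continue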
Openpyxl
import openpyxl

wb = openpyxl.Workbook()

sheet_1 = wb.active
sheet_2 = wb.create_sheet("매출현황")
sheet_1.title = '총매출현황'

wb.save('c:\\py_temp\\py_site\\test3.xlsx')
This is how you create a brand-new xlsx file: build the workbook in memory first, then save it.
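The fresh workbook can also be filled with data before saving; a short sketch using openpyxl's append(), which writes one row per call (the file name and rows here are made up):

import openpyxl

wb = openpyxl.Workbook()
sheet_1 = wb.active
sheet_1.title = '총매출현황'

sheet_1.append(['번호', '제목', '본문'])       # header row
sheet_1.append([1, '첫 번째 글', '본문 1'])    # one data row

wb.save('c:\\py_temp\\py_site\\test3_append.xlsx')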
import openpyxl

wb = openpyxl.load_workbook('c:\\py_temp\\py_site\\test3.xlsx')
sheet_1 = wb['총매출현황']
sheet_1['A1'] = '첫번째 cell'
sheet_1['A2'] = '두번째 cell'

wb.save('c:\\py_temp\\py_site\\test3.xlsx')
Loading the existing file like this lets you modify it. But do we really need to use these?
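For this crawler, probably not: DataFrame.to_excel already writes the file for us, using an Excel engine such as openpyxl behind the scenes. Where openpyxl (or pandas' ExcelWriter on top of it) earns its keep is finer control, for example putting several result tables into one workbook on separate sheets; a minimal sketch with made-up data and file name:

import pandas as pd

naver = pd.DataFrame({'번호': [1, 2], '질문': ['질문 1', '질문 2'], '답변': ['답변 1', '답변 2']})
daum  = pd.DataFrame({'번호': [1, 2], '제목': ['제목 1', '제목 2'], '본문': ['본문 1', '본문 2']})

# one workbook, one sheet per portal
with pd.ExcelWriter('search_results.xlsx', engine='openpyxl') as writer:
    naver.to_excel(writer, sheet_name='naver', index=False)
    daum.to_excel(writer, sheet_name='daum', index=False)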