selenium+PhantomJS爬取(豆弁読書)
4191 ワード
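When scraping with Python, a plain request that only sets 'User-Agent' and 'data' does not always return the content shown in the browser; in that case selenium can be used to drive a browser, and selenium then provides the rendered page source.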
#
from selenium import webdriver
import time
from lxml import etree
import pymysql
import re
#
def my_browers(url, page):
    """Load ``url`` in a PhantomJS browser and hand the page source to ``parse_html``.

    Args:
        url: Address of the page to fetch.
        page: Result offset the caller used to build ``url``. It is not used
            here and is kept only so existing callers keep working.
    """
    # Start a PhantomJS instance from the locally installed executable.
    browers = webdriver.PhantomJS(
        executable_path=r'd:\Desktop\pythonjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')
    try:
        browers.get(url)
        # Pause before reading the DOM -- presumably so the page's scripts can
        # finish rendering the result list.
        time.sleep(2)
        html = browers.page_source
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # PhantomJS process running in the background.
        browers.quit()
    # Parse after the browser is released so it is not held open during DB work.
    parse_html(html)
#
def _first_or_empty(node, path):
    """Return the first result of evaluating XPath ``path`` on ``node``, or ''."""
    found = node.xpath(path)
    return found[0] if found else ''


def parse_html(html):
    """Extract every book entry from a search-result page and store it.

    For each ``div`` whose class contains ``sc-bZQynM`` a dict with the keys
    ``pic``, ``book_name``, ``book_url``, ``score_book`` and ``book_detail``
    is built (missing fields become ``''``), printed, and passed to
    ``insert_mysql``.

    Args:
        html: Page source as a string.
    """
    html = etree.HTML(html)
    books = html.xpath('//div[contains(@class,"sc-bZQynM")]')
    for book in books:
        # NOTE: every per-book path starts with ``.//``. A path starting with
        # ``//`` is evaluated from the document root even when called on a
        # sub-element, which would give every book the first match on the page.
        book_name = _first_or_empty(book, './/div[@class="title"]/a/text()')
        # Quotes are removed, and trailing backslashes stripped, so the title
        # is safe to embed inside a quoted SQL string literal.
        book_name = book_name.replace('"', '').replace("'", '')
        # Only a *trailing* backslash would escape the closing quote; strip
        # those instead of blindly dropping the last character.
        book_name = book_name.rstrip('\\')

        # Single quotes are removed from the description for the same reason.
        book_detail = _first_or_empty(
            book, './/div[@class="meta abstract"]/text()').replace("'", '')

        book_dict = {
            'pic': _first_or_empty(book, './/img/@src'),
            'book_name': book_name,
            'book_url': _first_or_empty(book, './/div[@class="title"]/a/@href'),
            'score_book': _first_or_empty(book, './/span[@class="rating_nums"]/text()'),
            'book_detail': book_detail,
        }
        print(book_dict)
        # Persist the record.
        insert_mysql(book_dict)
#
def insert_mysql(book_dict):
    """Insert one scraped book record into the ``python_book`` table.

    Args:
        book_dict: Mapping with the keys ``pic``, ``book_name``, ``book_url``,
            ``score_book`` and ``book_detail``.

    Raises:
        KeyError: If one of the expected keys is missing from ``book_dict``.
    """
    # Placeholders let the driver quote/escape the scraped values, so quotes
    # or backslashes in them cannot break or alter the statement.
    sql = ("insert into python_book (pic,book_name,book_url,score,book_detail) "
           "VALUE (%s,%s,%s,%s,%s)")
    # Read the values before connecting so a missing key cannot leak a connection.
    params = (
        book_dict['pic'],
        book_dict['book_name'],
        book_dict['book_url'],
        book_dict['score_book'],
        book_dict['book_detail'],
    )
    # Keyword arguments: newer PyMySQL releases no longer accept positional ones.
    conn = pymysql.connect(host='localhost', user='root', password='root',
                           database='test', charset='utf8')
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql, params)
        conn.commit()
    finally:
        # Release the connection even if the insert fails.
        conn.close()
if __name__ == '__main__':
    # Fetch 199 result pages; the ``start`` offset advances by 15 per page.
    for page_index in range(199):
        print('================= {} ========================'.format(page_index + 1))
        offset = 15 * page_index
        my_browers(
            'https://book.douban.com/subject_search?search_text=python&cat=1001&start={}'.format(offset),
            offset,
        )