selenium+PhantomJS爬取(豆弁読書)

4191 ワード

	    Python       ;
	       request  ‘User-Agent’  ‘data’                 ,     ,      ,         ,        ,                 ;  selenium                ,    selenium         ;
# Import dependencies
from selenium import webdriver
import time
from lxml import etree
import pymysql
import re

# Load a page in the browser
def my_browers(url, page):
    """Load ``url`` in a PhantomJS-driven browser and parse the rendered HTML.

    The page source is read after a fixed two-second pause and handed to
    ``parse_html``. The browser is always shut down before returning, even
    if loading the page fails.

    Args:
        url: Address of the page to load.
        page: Accepted for caller compatibility; not used by this function.
    """
    # NOTE(review): PhantomJS support was deprecated in later Selenium
    # releases -- confirm the installed Selenium version still ships it.
    browers = webdriver.PhantomJS(executable_path=r'd:\Desktop\pythonjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')
    try:
        browers.get(url)

        # Fixed pause before reading the source, presumably so that the
        # page's scripts have time to render the result list.
        time.sleep(2)

        html = browers.page_source
    finally:
        # Without this, every call leaves a PhantomJS process running.
        browers.quit()

    # Parsing only needs the captured HTML, so it runs after the browser
    # has already been released.
    parse_html(html)
    
# Parse the page content
def _first_match(node, path):
    """Return the first result of evaluating XPath ``path`` on ``node``, or ''."""
    matches = node.xpath(path)
    return matches[0] if matches else ''


def parse_html(html):
    """Extract each book entry from a rendered result page and store it.

    For every ``div`` whose class contains ``sc-bZQynM`` a dict with the keys
    ``pic``, ``book_name``, ``book_url``, ``score_book`` and ``book_detail``
    is built, printed, and passed to ``insert_mysql``. A field that is not
    found is stored as an empty string.

    Args:
        html: HTML source of the page as a string.
    """
    tree = etree.HTML(html)

    # One element per book in the result list.
    books = tree.xpath('//div[contains(@class,"sc-bZQynM")]')

    for book in books:
        # All per-book paths start with ".//" so they are evaluated relative
        # to the current book element. A leading "//" searches the whole
        # document and would return the first book's data for every entry.
        book_name = _first_match(book, './/div[@class="title"]/a/text()')
        # Quotes are removed so the title cannot terminate a quoted SQL
        # string literal early.
        book_name = book_name.replace('"', '').replace("'", '')
        # A trailing backslash would escape the closing quote of a quoted SQL
        # literal; drop only trailing backslashes, leaving the rest intact.
        book_name = book_name.rstrip('\\')

        # Only single quotes are removed from the description.
        book_detail = _first_match(book, './/div[@class="meta abstract"]/text()')
        book_detail = book_detail.replace("'", '')

        book_dict = {
            'pic': _first_match(book, './/img/@src'),
            'book_name': book_name,
            'book_url': _first_match(book, './/div[@class="title"]/a/@href'),
            'score_book': _first_match(book, './/span[@class="rating_nums"]/text()'),
            'book_detail': book_detail,
        }
        print(book_dict)

        # Persist the record.
        insert_mysql(book_dict)

# Save to the database
def insert_mysql(book_dict):
    """Insert one book record into the ``python_book`` table.

    Opens a connection to the ``test`` database on ``localhost``, runs a
    single parameterized INSERT, commits, and closes the connection.

    Args:
        book_dict: Mapping with the keys ``pic``, ``book_name``, ``book_url``,
            ``score_book`` and ``book_detail``.

    Raises:
        KeyError: If one of the expected keys is missing from ``book_dict``.
    """
    # Placeholders let the driver escape the values. Building the statement
    # by string interpolation breaks on quotes/backslashes in scraped text
    # and allows SQL injection.
    sql = (
        "insert into python_book (pic,book_name,book_url,score,book_detail) "
        "VALUE (%s,%s,%s,%s,%s)"
    )
    params = (
        book_dict['pic'],
        book_dict['book_name'],
        book_dict['book_url'],
        book_dict['score_book'],
        book_dict['book_detail'],
    )

    # Keyword arguments: newer PyMySQL releases reject positional ones.
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        database='test',
        charset='utf8',
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql, params)
        conn.commit()
    finally:
        # Release the connection even when the insert fails; this function
        # is called once per book, so leaked connections would pile up.
        conn.close()


if __name__ == '__main__':
    # Walk 199 result pages; the `start` query parameter advances by 15 per
    # page, and the banner shows the 1-based page number.
    for page_number, page in enumerate(range(0, 199 * 15, 15), start=1):
        print('=================   {} ========================'.format(page_number))
        base_url = 'https://book.douban.com/subject_search?search_text=python&cat=1001&start={}'.format(page)
        my_browers(base_url, page)