# Python web crawler: Tmall Supermarket


# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
import time

# Wall-clock start; the elapsed time is printed at the very end of the script.
s = time.time()

browser = webdriver.Chrome()
# Alternative drivers, kept for reference (disabled):
#   browser = webdriver.PhantomJS()
#   option = webdriver.ChromeOptions()
#   option.add_argument('headless')
#   option.add_argument('no-sandbox')
#   option.add_argument('disable-dev-shm-usage')
#   browser = webdriver.Chrome(chrome_options=option)

# Every explicit wait in this script gives up after 10 seconds.
wait = WebDriverWait(browser, 10)

browser.get('https://www.tmall.com/')

# Optional cookie injection (disabled; the cookie list was never filled in):
#   print(browser.get_cookies())
#   for i in cookies:
#       if 'expiry' in i:
#           del i['expiry']
#       browser.add_cookie(i)

# Snapshot the open windows BEFORE clicking so the newly opened one can be
# identified reliably instead of assuming it is already at index 1.
tmall0 = browser.current_window_handle  # home-page window; not used again below
handles0 = browser.window_handles

# First link of the home page's main navigation bar.
tmiu = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'#content > div.main-nav > div > div > div > a:nth-child(1) > img')))
tmiu.click()

# The click opens a new window; wait until the driver actually reports it.
# Reading window_handles[1] immediately could raise IndexError.
wait.until(EC.new_window_is_opened(handles0))
list_0 = [handle for handle in browser.window_handles if handle not in handles0][-1]
browser.switch_to.window(list_0)

# time.sleep(20)  # optional manual pause (disabled)

# NOTE(review): the keyword typed here is a single space -- the intended
# search term may have been lost from the source; confirm before running.
search_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#mq')))
search_box.send_keys(' ')

search_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'#mallSearch > form > fieldset > div > button')))
search_button.click()

# Third option in the result list's sort bar.
sort_option = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'#filterForm > div > ul.filter-sort > li:nth-child(3) > a > span')))
sort_option.click()

# Handle of the result-list window; list_click() switches back to it.
tmall = browser.current_window_handle


# (output template, CSS selector, terminator) for each value scraped from a
# product detail page, in the order they are written to the output file.
_DETAIL_FIELDS = (
    ('title:{}', '#J_DetailMeta > div.tm-clear > div.tb-property > div > div.tb-detail-hd > h1', "~"),
    ('price:{}', '#J_PromoPrice > dd > div > span', "~"),
    ('num:{}', '#J_DetailMeta > div.tm-clear > div.tb-property > div > ul > li.tm-ind-item.tm-ind-sellCount > div > span.tm-count', "~"),
    (' :{}', '#J_ItemRates > div > span.tm-count', "\n"),
)


def _scrape_detail(txt):
    """Print each detail-page field and append it to the open file ``txt``.

    Operates on whatever window the driver is currently focused on.
    Raises TimeoutException if any field does not become clickable in time.
    """
    for template, selector, terminator in _DETAIL_FIELDS:
        element = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
        text = element.text
        print(text)
        txt.write(template.format(text) + terminator)


def list_click(i):
    """Scrape the product at zero-based position ``i`` of the result list.

    Appends one "~"-separated record to '03 .txt': the list-page summary
    value and the link's href, followed by the detail-page fields.  The
    product is opened in a new window which is always closed again, and the
    driver is returned to the result-list window afterwards.

    A TimeoutException at any step is treated as "skip this item": the
    partial record is terminated with a newline and the function returns
    normally.  Other exceptions propagate.
    """
    # CSS nth-child is 1-based, hence i + 1.
    item = '#J_ProductList > li:nth-child({})'.format(i + 1)

    # The context manager closes the file even if an unexpected error
    # escapes; utf-8 is explicit so non-ASCII text never depends on the
    # platform's default encoding.
    with open('03 .txt', "a", encoding="utf-8") as txt:
        try:
            browser.switch_to.window(tmall)
            link = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, item + ' > div > h3 > a')))

            # Read the list-page data before clicking, so a timeout here
            # cannot leave a freshly opened window behind.
            tnum = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, item + ' > div > div.item-summary > div.item-sum > strong')))
            print(' :{}'.format(tnum.text))
            txt.write(' :{}'.format(tnum.text)+"~")

            href = link.get_attribute('href')
            print(href)
            txt.write('href:{}'.format(href)+"~")

            # Wait for the new window to be registered; taking
            # window_handles[-1] right after the click could still be the
            # list window, which would then be scraped and closed.
            known_handles = browser.window_handles
            link.click()
            wait.until(EC.new_window_is_opened(known_handles))
            detail = [handle for handle in browser.window_handles if handle not in known_handles][-1]
            browser.switch_to.window(detail)

            try:
                _scrape_detail(txt)
            finally:
                # Close the detail window on success AND on failure so
                # timed-out items do not pile up open windows.
                browser.close()
        except TimeoutException:
            # Best effort: end the partial record and move on to the next item.
            txt.write('\n')
        browser.switch_to.window(tmall)


# Scrape the same list positions on three consecutive result pages.
for _page in range(3):
    for i in range(8,10):
        list_click(i)
        # NOTE(review): the source's line breaks were lost here; the pause is
        # assumed to follow each item rather than each page -- confirm.
        time.sleep(1)
    button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'#content > div.main > div > div.list-bottom > div > div > a.page-next')))
    button.click()

# Report total run time in seconds, measured from the script's start time `s`.
e = time.time()
print(" {}".format(e-s))