Pythonクローラー実践 — 京東(JD.com)の商品リストをスクレイピングする

11210 ワード

タイトル:Pythonクローラー実践 — 京東(JD.com)の商品リストをスクレイピングする
まず、このクローラーには小さな欠陥があります。中国語のキーワード検索に対応しておらず、またクロール中に処理が止まってしまうことがあります。具体的な原因は不明で、今後改善する予定です。
import urllib.request
import random
import re
# Pool of browser User-Agent strings; one entry is picked at random for each
# request. Every entry must be a bare header *value* (no "user-agent:" prefix)
# and entries must be comma-separated -- adjacent string literals without a
# comma are silently concatenated into a single string.
uapools = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0',
]
def UA(uapools):
    """Install a global urllib opener with a randomly chosen User-Agent.

    Args:
        uapools: Non-empty sequence of User-Agent header values to pick from.

    Returns:
        None. The effect is global: after this call, every
        ``urllib.request.urlopen`` request is sent through the new opener.

    Raises:
        IndexError: If ``uapools`` is empty (raised by ``random.choice``).
    """
    chosen_agent = random.choice(uapools)
    agent_opener = urllib.request.build_opener()
    # Assigning addheaders replaces the opener's default header list.
    agent_opener.addheaders = [('User-Agent', chosen_agent)]
    urllib.request.install_opener(agent_opener)
# Crawl JD search-result pages for one keyword and print the title of every
# product found on each page.

# The keyword never changes between pages, so quote it once up front.
keyname = 'Amani'
key = urllib.request.quote(keyname)

# NOTE(review): both patterns below are reconstructions. The originals
# contained HTML tags that were stripped when this listing was extracted.
# The id pattern assumes each search result is an <li> carrying a numeric
# data-sku attribute; the title pattern assumes the product page wraps the
# title in <div class="sku-name"> (consistent with the sample output, where
# a title contains a nested <a> tag). Confirm both against the live pages.
pat = r'<li[^>]*?data-sku="(\d+)"'
titlepat = r'<div class="sku-name">(.*?)</div>'
id_regex = re.compile(pat, re.S)
title_regex = re.compile(titlepat, re.S)

# Seconds to wait on a single request. Without a timeout urlopen can block
# indefinitely, which would stall the whole crawl.
fetch_timeout = 10

for i in range(2, 101):
    print(' =====' + str(i) + '   =====')
    url = ("https://search.jd.com/Search?keyword=" + key
           + "&page=" + str((i * 2) - 1)
           + "&s=" + str((i - 1) * 50 + 6))

    # Switch to a fresh random User-Agent for this page and its items.
    UA(uapools)
    try:
        data = urllib.request.urlopen(
            url, timeout=fetch_timeout).read().decode('utf-8', 'ignore')
    except OSError as err:
        # URLError and socket timeouts are OSError subclasses; skip the page
        # instead of aborting the whole crawl.
        print('skip page', url, err)
        continue

    idlist = id_regex.findall(data)
    print(idlist)
    for thisid in idlist:
        thisurl = "https://item.jd.com/" + thisid + '.html'
        try:
            # The script decodes item pages as GBK and search pages as UTF-8.
            itemdata = urllib.request.urlopen(
                thisurl, timeout=fetch_timeout).read().decode('gbk', 'ignore')
        except OSError as err:
            print('skip item', thisurl, err)
            continue

        title = title_regex.findall(itemdata)
        if not title:
            # No title found on this page; move on to the next product.
            continue
        title = title[0]
        print(' =', str(title))

# Sample output captured by the original author (some characters appear to
# have been lost when the listing was extracted):
#  = (ARMANI) 4# 30ml( )
#  = EA7 EMPORIO ARMANI T WHITE-1100 L
#  = Armani Exchange/ T 8NZT84 1200 L
#  = EMPORIO ARMANI UNDERWEAR ( ) 302402-9A282 NAYGRY-10735 U
#  = <a href="//item.jd.com/51112579543.html" target="_blank"> (GIORGIO ARMANI) t ea7 6Z1TP5 1JTUZ L</a>
#  = ARMANI EXCHANGE/ ax t 1562 M
#  = EA7 EMPORIO ARMANI 6GPB32-PNC3Z BLACK-2206 XL
#  = ARMANI EXCHANGE T 3GZTAW-ZJE6Z BLACK-1200 XL
#  = Armani Exchange/ T 8NZT84 1200 M
#  = Armani POLO T XL
#  = (ARMANI) 3.5 ( )
#  ... (output truncated)