Pythonクローラー：大衆点評（Dianping）グルメランキング

3625 ワード

import requests
from bs4 import BeautifulSoup
import re

def getHTMLText(url, timeout=10):
    """Fetch ``url`` with HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body, or an empty string when the request
        fails or the server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        # Decode with the encoding detected from the body rather than the
        # one taken from the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "no page". Only request
        # failures are swallowed, so Ctrl-C and programming errors surface.
        return ""

def getStockList(lst, stockURL, city_lst, infodict):
    """Scrape one listing page and collect its names and addresses.

    The page is fetched with ``getHTMLText`` and parsed with BeautifulSoup.
    Names come from the ``<h4>`` elements (the first two and last two are
    skipped); addresses come from ``<a>`` elements whose ``href`` matches
    the search-category URL pattern below.

    Args:
        lst: List that receives every name found on this page (appended).
        stockURL: URL of the listing page to scrape.
        city_lst: List that receives every address found on this page.
        infodict: Dict updated with ``name -> address`` for entries that
            could be paired by position on this page.

    Returns:
        None. All results are delivered by mutating the arguments.
    """
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    headings = soup.find_all("h4")
    # Raw string so ``\d`` is a regex digit class rather than an invalid
    # string escape; dots are escaped so they only match literal dots.
    link_pattern = re.compile(
        r"http://www\.dianping\.com/search/category/33/0/r\d{4}"
    )
    address_links = soup.find_all("a", href=link_pattern)

    # Names found on *this* page only. ``lst`` keeps growing across calls,
    # so indexing it from 0 would pair this page's addresses with names
    # from the first page that was ever scraped.
    page_names = [heading.text for heading in headings[2:-2]]
    for name in page_names:
        lst.append(name)
        print(name)

    for position, link in enumerate(address_links):
        parts = link.text.split()
        if len(parts) < 2:
            # Link text does not have the two expected tokens; skip it but
            # keep the positional alignment with the names.
            continue
        address = parts[0] + parts[1]
        city_lst.append(address)
        if position >= len(page_names):
            # More address links than names: nothing to pair this one with.
            continue
        infodict[page_names[position]] = address
        print(address)



def main(output_file='E:/dzdpmspm.txt', last_page=50):
    """Crawl the listing pages and append the collected results to a file.

    The first page is fetched from the base URL; pages 2 through
    ``last_page`` are fetched from the paginated URL. After each of those
    pages one line is appended to ``output_file`` and a progress percentage
    is printed on the current console line.

    Args:
        output_file: Path of the UTF-8 text file that results are appended
            to.
        last_page: Number of the last page to crawl (inclusive).

    Returns:
        None.
    """
    first_page_url = 'http://www.dianping.com/search/category/33/10/r3300'
    slist = []
    clist = []
    infoDict = {}
    getStockList(slist, first_page_url, clist, infoDict)
    for n in range(2, last_page + 1):
        page_url = ("http://www.dianping.com/search/category/33/10/r3300p"
                    + str(n) + "?aid=91959818%2C93071129")
        getStockList(slist, page_url, clist, infoDict)
        # NOTE: infoDict accumulates across pages, so each appended line is
        # a snapshot of everything collected so far, not just this page.
        with open(output_file, 'a', encoding='utf-8') as f:
            f.write(str(infoDict.items()) + '\n')
        # "\r" returns to the start of the line so the percentage is
        # overwritten in place instead of scrolling.
        print("\r :{:.2f}%".format(n * 100 / last_page), end="")


if __name__ == "__main__":
    main()