豆瓣(Douban)クローラー実践 - Python版

2611 ワード

豆瓣(Douban)へのログイン(認証コードなし版):
import requests

# Log in to Douban with a form POST (no captcha handling), save the returned
# page to douban.html, and print the status code and response cookies.

# Older login entry point, kept for reference only:
#starturl = "https://www.douban.com/accounts/login"
loginurl = "https://accounts.douban.com/login"
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            }
# Login form fields. 'form_email' and 'form_password' hold placeholder values
# here; replace them with real credentials before running.
fromdata ={'source':'None',
           'redir':'https://shanghai.douban.com/',
           'form_email':'yourAccount',
           'form_password':'password',
           'login':'  '}

# The context manager closes the session even when the request or the file
# write raises, which a trailing s.close() alone would not do.
with requests.Session() as s:
    s.headers.update(headers)
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    resp = s.post(loginurl, data=fromdata, timeout=10)

    # Store the decoded body re-encoded as UTF-8, whatever the server sent.
    with open('douban.html','wb') as f:
        f.write(resp.text.encode('utf-8'))

    print(resp.status_code)
    # NOTE(review): resp.cookies holds only the cookies set by this response;
    # cookies accumulated across redirects live in s.cookies — confirm which
    # one is wanted here.
    print(resp.cookies)

豆瓣映画 TOP 250 クローラー
import requests
from bs4 import BeautifulSoup

def getContent(bsItem):
    """Extract one movie record from a parsed list entry.

    Args:
        bsItem: A BeautifulSoup tag for a single movie entry (the caller
            passes each ``<div class="info">`` element).

    Returns:
        A list ``[link, titles, rating, votes]`` where ``link`` is the href
        of the first ``<a>``, ``titles`` is a list holding the primary title
        and the alternate title (four spaces when there is none), ``rating``
        is the text of the ``rating_num`` span and ``votes`` is the text of
        the span matched by empty ``class``/``property`` filters.

    Raises:
        IndexError: If the entry contains no ``<span class="title">``.
    """
    # Read from the parameter, not from a same-named global loop variable,
    # so the function works no matter what the caller calls its variable.
    link = bsItem.find('a')['href']

    # Copy into a plain list so the parser's own result object is not mutated.
    film = list(bsItem.find_all('span', {'class': 'title'}))
    film[0] = film[0].string
    if len(film) > 1:
        # The alternate title is rendered as "\xa0/\xa0Name"; strip the
        # non-breaking spaces and the slash separator.
        film[1] = film[1].string.replace(u'\xa0', '').replace(r'/', '')
    else:
        film.append('    ')

    rating = bsItem.find('span', {'class': 'rating_num'}).string
    # Empty-string filters: presumably meant to pick the span that has
    # neither a class nor a property attribute (the vote count) — confirm
    # against the current page markup.
    votes = bsItem.find('span', {'class': '', 'property': ''}).string

    return [link, film, rating, votes]

# Crawl the Douban Top 250 movie list, 25 entries per page, and write one
# record per line (the str() of getContent's result) to doubanfilm.txt.

starturl = 'https://movie.douban.com/top250'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            }
# 'start' is the offset of the first entry on the requested page.
params={'start':0}

# Both context managers release their resources even if a request or a
# parse step raises part-way through the crawl.
with requests.Session() as s, open('doubanfilm.txt','w',encoding='utf-8') as f:
    s.headers.update(headers)
    # Offsets 0, 25, ..., 225 cover the 250 entries.
    for curpage in range(0, 250, 25):
        params['start'] = curpage
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        resp = s.get(starturl,params=params,timeout=10)
        bs = BeautifulSoup(resp.text,'html.parser')
        for item in bs.find_all('div',{"class":'info'}):
            f.write(str(getContent(item))+'\n')

print('bug end')

Python、あなたのおかげで人生はすばらしい!