新浪微博データの抽出
&&&&
&&&&
# -*- coding: utf-8 -*
from bs4 import BeautifulSoup
import requests
from xlwt import Workbook
import time
import json
import sys
import csv
# Python 2 only: force the process-wide default string encoding to UTF-8 so
# implicit str/unicode conversions of the scraped text do not fall back to
# ASCII. `reload` is not a builtin on Python 3 (calling it raises NameError)
# and the default encoding there is already UTF-8, so the hack is skipped.
if sys.version_info[0] < 3:
    reload(sys)  # re-exposes setdefaultencoding, which site.py removes at startup
    sys.setdefaultencoding('utf-8')
def weibo(url, timeout=10):
    """Fetch *url* with the fixed browser-like headers and return the decoded JSON.

    Args:
        url: Full URL of the endpoint to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the scrape forever.

    Returns:
        The response body parsed by ``response.json()``.

    Raises:
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx status.
        requests.exceptions.Timeout: no answer arrived within *timeout* seconds.
        ValueError: the body is not valid JSON.
    """
    # NOTE(review): the Cookie below is a hard-coded session credential copied
    # from a browser session; it presumably expires and must then be refreshed
    # by hand. Consider loading it from configuration instead of source code.
    headers = {
        "Cookie":"_T_WM=cff5bb8be0f4876163913084ff9c62f0; ALF=1526623461; SCF=ApMI3mluv9yH6yKz4i7-HMlHojzPtQULc5G0xlrri-Ne18lmXmEFvULlwx0CKS_sw3NN27MeOjlDlndONngzHPI.; SUB=_2A25321onDeRhGeNN7FsX9CrIzzqIHXVVJGZvrDV6PUJbktAKLUbCkW1NSSw_ojRtzxFp7XG4qFOB5nNMnhzPu_2a; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhfooAg0xuUTUnWRX292Cbe5JpX5K-hUgL.Fo-0S0.cShBXShq2dJLoIpnLxKqL1-BL12-LxKML1K.LB.xke0np1hqt; SUHB=0KABS255VQmWlt; H5_INDEX=0_all; H5_INDEX_TITLE=%E6%9D%8E%E7%99%BD_38186; WEIBOCN_FROM=1110006030; M_WEIBOCN_PARAMS=featurecode%3D20000320%26luicode%3D10000011%26lfid%3D100103type%253D3%2526q%253D%25E4%25B8%25AD%25E5%2585%25B4%25E5%2588%25B6%25E8%25A3%2581%2526t%253D0%26fid%3D100103type%253D2%2526q%253D%25E4%25B8%25AD%25E5%2585%25B4%25E5%2588%25B6%25E8%25A3%2581%26uicode%3D10000011",
        "Referer":"https://m.weibo.cn/p/100103type%3D2%26q%3D%E4%B8%AD%E5%85%B4%E5%88%B6%E8%A3%81?type=wb&queryVal=%E4%B8%AD%E5%85%B4%E5%88%B6%E8%A3%81&featurecode=20000320&luicode=10000011&lfid=100103type%3D3%26q%3D%E4%B8%AD%E5%85%B4%E5%88%B6%E8%A3%81%26t%3D0&title=%E4%B8%AD%E5%85%B4%E5%88%B6%E8%A3%81",
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36",
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Surface HTTP failures explicitly instead of as an opaque JSON decode error.
    response.raise_for_status()
    return response.json()
def tiqu(page):
    """Return the posts found on one page of the hard-coded Weibo search.

    Args:
        page: Page number appended to the search URL's ``page=`` parameter.

    Returns:
        A list of ``[user, blog, source]`` lists, one per post, where ``user``
        is the author's screen name, ``blog`` is the first whitespace-delimited
        token of the post text (empty string when the text is blank) and
        ``source`` is the post's ``source`` field. The list is empty when the
        response carries no ``data``/``cards``.
    """
    # The search term is percent-encoded directly into the URL; only the page
    # number varies between calls.
    url = "https://m.weibo.cn/api/container/getIndex?type=wb&queryVal=%E4%B8%AD%E5%85%B4%E5%88%B6%E8%A3%81&featurecode=20000320&luicode=10000011&lfid=100103type%3D3%26q%3D%E4%B8%AD%E5%85%B4%E5%88%B6%E8%A3%81%26t%3D0&title=%E4%B8%AD%E5%85%B4%E5%88%B6%E8%A3%81&containerid=100103type%3D2%26q%3D%E4%B8%AD%E5%85%B4%E5%88%B6%E8%A3%81&page="+str(page)
    payload = weibo(url)

    # A response without 'data' or 'cards' yields no rows instead of a KeyError,
    # so one empty page does not abort a multi-page run.
    cards = (payload.get('data') or {}).get('cards') or []

    datalist = []
    for card in cards:
        # Cards that have no 'card_group' contribute nothing.
        for entry in card.get('card_group') or []:
            mblog = entry.get('mblog')
            if not mblog:
                # Group entries without a post payload are skipped.
                continue
            user = mblog['user']['screen_name']
            # Keep only the first token, as before, but tolerate blank text
            # (the bare split()[0] raised IndexError on it).
            tokens = mblog['text'].split()
            blog = tokens[0] if tokens else ''
            source = mblog['source']
            datalist.append([user, blog, source])
    return datalist
if __name__ == "__main__":
    # Excel workbook with a single sheet that receives one row per post.
    book = Workbook(encoding='utf-8')
    sheet1 = book.add_sheet('Sheet 1')

    # Header row: column 0 = user, column 1 = blog, column 2 = source.
    for column, heading in enumerate(('user', 'blog', 'source')):
        sheet1.write(0, column, heading)

    # Gather the rows of search pages 1 through 199 into one flat list.
    datalist = []
    for page in range(1, 200):
        datalist.extend(tiqu(page))

    # Data rows start at sheet row 1, directly below the header.
    for row_number, record in enumerate(datalist, 1):
        title, blog, source = record[0], record[1], record[2]
        sheet1.write(row_number, 0, title)
        sheet1.write(row_number, 1, blog)
        sheet1.write(row_number, 2, source)

    # Write the workbook to the current working directory.
    book.save("weibo2.xls")
&&&&