import requests
import pymongo
from bs4 import BeautifulSoup
import getNews
# Lazy connection (connect=False defers the handshake until first use).
client = pymongo.MongoClient(host='localhost', port=27017, connect=False)
pengpai = client['pengpai']
# Drop any previous scrape so each run starts with a fresh collection.
# Collection.drop() is a no-op when the collection does not exist, so the
# original existence check via list_collection_names() was unnecessary.
pengpai_news = pengpai['pengpai_news']
pengpai_news.drop()
# Seed the crawl with the channel landing page, then add the AJAX
# pagination endpoints (pageidx 1..29) the site loads incrementally.
urlList = ['https://www.thepaper.cn/channel_90077']
url_1 = 'https://www.thepaper.cn/'
url_2 = 'load_index.jsp?nodeids=90069,&channelID=90077&topCids=,5922202,5934344,5934605,5934601,5934698&pageidx='
url_3 = '&lastTime=1581492637041'
# extend() with a generator instead of materializing a list and
# appending one element at a time.
urlList.extend(url_1 + url_2 + str(n) + url_3 for n in range(1, 30))
num = 0  # running count of articles processed
for url in urlList:
    # Fetch the listing page; without a timeout a hung server would
    # block the scraper forever.
    news_data = requests.get(url, timeout=10)
    news_data.encoding = "utf-8"
    soup = BeautifulSoup(news_data.text, 'lxml')
    for item in soup.select('.news_li'):
        # Guard clause: skip placeholder/ad entries that carry no headline.
        if len(item.select('h2')) == 0:
            continue
        title = item.select('h2')[0].text.strip()
        text_href = "https://www.thepaper.cn/" + item.select('a')[0]['href']
        pic_src = "https://www.thepaper.cn/" + item.select('img')[0]['src']
        num += 1
        try:
            # Full article body fetched by the project-local helper.
            news_info = getNews.getNewsTxt(text_href)
        except Exception as exc:
            # The original bare `except:` only printed "bug" and then fell
            # through, inserting a stale news_info from the previous item
            # (or raising NameError on the first failure). Log and skip
            # this article instead.
            print("bug", exc)
            continue
        data = {
            'title': title,
            'title_link': text_href,
            'pic_link': pic_src,
            'news_info': news_info
        }
        print(" %d " % (num), data)
        pengpai_news.insert_one(data)