Using a Python crawler to batch-download image galleries from mm131.net
'''
Author: Valve
Bulk-downloads image galleries from www.mm131.net, filtered by post date,
an optional list of search keywords, and a blacklist of unwanted keywords.
Galleries that are already complete on disk are skipped; incomplete ones are deleted and fetched again.
'''
import requests
import shutil
import os
from bs4 import BeautifulSoup
import random
import datetime
import re
today = datetime.date.today()
###########################################################################
StartDate = datetime.date(2000,4,23)    # earliest post date to download
EndDate = datetime.date(2020,4,25)      # latest post date to download
# e.g. to fetch only galleries posted between 2019-3-1 and 2020-4-2, adjust StartDate and EndDate accordingly
SearchTerm = []       # keywords a gallery title must contain; leave the list empty to download everything
BlackList = []        # galleries whose title contains any of these keywords are skipped
SavePath = 'C:/Users/Jared/Pictures/Saved Pictures/get/mm131/'   # local folder the galleries are saved into
###########################################################################
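# Illustrative configuration (the keyword values below are placeholders, not from the original script):
# to grab only galleries posted in March 2020 whose title mentions one keyword while avoiding another,
# one might set, for example:
#   StartDate  = datetime.date(2020, 3, 1)
#   EndDate    = datetime.date(2020, 3, 31)
#   SearchTerm = ['keyword-to-keep']
#   BlackList  = ['keyword-to-skip']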
url = 'https://www.mm131.net/xinggan/'    # listing page to start crawling from
PicSerUrl = 'https://img1.mmmw.net/pic/'  # image server that hosts the actual pictures
RefererLink = 'https://www.mm131.net'     # Referer header sent with page requests
# pool of User-Agent strings; one is picked at random for every request
my_headers = [
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
'Opera/9.25 (Windows NT 5.1; U; en)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
"Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "
]
if not os.path.exists(SavePath):
    os.mkdir(SavePath)
if url == 'https://www.mm131.net/xinggan/':
    PageNo = 1
else:
    PageNo = int(url[37:-5]) + 1   # list pages look like .../xinggan/list_6_<N>.html; slice out <N>
NextUrl = url
PostDate = EndDate   # initialised so the end-of-range check after the article loop is defined even if a page yields no galleries
while NextUrl:
    head = {'Referer': RefererLink, 'user-agent': random.choice(my_headers)}
    html = requests.get(NextUrl, headers=head)
    if html.status_code != 200:   # no more listing pages
        break
    else:
        PageNo += 1
        NextUrl = 'https://www.mm131.net/xinggan/list_6_' + str(PageNo) + '.html'
    bs = BeautifulSoup(html.content, 'lxml')
    MainBox = bs.find('dl', class_='list-left public-box')
    AllArticles = MainBox.find_all('dd', class_='')
    for article in AllArticles:
        head = {'Referer': RefererLink, 'user-agent': random.choice(my_headers)}
        GroupLinkTemp = article.find('a')
        GroupLink = GroupLinkTemp['href']
        GroupNameTemp_0 = GroupLinkTemp.find('img')
        if GroupNameTemp_0 is None:   # skip <dd> entries that carry no thumbnail
            continue
        GroupNameTemp = GroupNameTemp_0['alt']   # gallery title taken from the thumbnail's alt text
        RGName = ''.join( e for e in GroupNameTemp if e.isalnum() )   # title reduced to alphanumerics for use in folder names
        Ghtml = requests.get(GroupLink, headers=head)
        Gbs = BeautifulSoup(Ghtml.content, 'lxml')
        NumTemp = Gbs.find('span', class_='page-ch')
        if NumTemp is None:   # no page counter on the gallery page; skip it
            continue
        Num = int( NumTemp.text[1:-1] )   # the counter text has one non-digit character on each side of the page count
        WantMark = 1
        if len(SearchTerm) > 0:
            for OnlyWant in SearchTerm:
                pattern = re.compile('.*' + OnlyWant + '.*')
                if re.match(pattern, GroupNameTemp):
                    WantMark = 1
                    break
                else:
                    WantMark = 0
        if not WantMark:
            print(GroupNameTemp, 'does not match any search term, skipping')
            continue
        # filter out blacklisted titles
        BlackMark = 0
        for black in BlackList:
            pattern = re.compile('.*' + black + '.*')
            if re.match(pattern, GroupNameTemp):
                BlackMark = 1
                print(GroupNameTemp, 'contains a blacklisted keyword, skipping')
                break
        if BlackMark:
            continue
        DateBox = Gbs.find('div', class_='content-msg')
        PostDateTemp = DateBox.text[5:15]   # publication date in YYYY-MM-DD form
        year = int(PostDateTemp[0:4])
        month = int(PostDateTemp[5:7])
        day = int(PostDateTemp[8:])
        PostDate = datetime.date(year, month, day)
        RPostDate = ''.join( e for e in PostDateTemp if e.isnumeric() )   # date digits only, used in the folder name
        if PostDate > EndDate:
            print(GroupNameTemp, 'was posted after EndDate, skipping')
            continue
        elif PostDate < StartDate:   # listing runs newest-first, so the rest of this page is older; stop scanning it
            break
        # walk GroupLink backwards to pull the gallery number out of '.../<GroupNo>.html'
        n = -1
        while 1:
            if GroupLink[n] == '.':
                end = n
            elif GroupLink[n] == '/':
                start = n
                break
            n -= 1
        GroupNo = GroupLink[start+1:end]   # gallery number, reused to build the image URLs
        GroupName = RPostDate + '-' + RGName
        DirPath = SavePath + GroupName + '/'
        pattern = re.compile('.*' + RGName + '.*')
        AllDirName = os.listdir(SavePath)
        jump = ex = 0
        for DirName in AllDirName:
            if re.match(pattern, DirName):
                if len(os.listdir(SavePath + DirName)) >= Num:
                    print(SavePath + DirName, ': already downloaded in full, skipping')
                    jump = 1
                    break
                else:
                    shutil.rmtree(SavePath + DirName)
                    print(SavePath + DirName, ': incomplete download, deleting it and fetching again')
                    break
        if jump:
            continue
        os.mkdir(DirPath)
        j = 1
        while j <= Num:
            ImageUrl = PicSerUrl + str(GroupNo) + '/' + str(j) + '.jpg'
            head = {'Referer': 'https://m.mm131.net/', 'user-agent': random.choice(my_headers)}
            pic = requests.get(ImageUrl, headers=head)
            with open(DirPath + GroupName + '-' + str(j) + '.jpg', 'wb') as f:
                f.write(pic.content)
            print('【mm131】', GroupName, '【' + str(j) + '/' + str(Num) + '】')
            j += 1
    if PostDate < StartDate:   # the last gallery seen was already older than StartDate; stop paging entirely
        break
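The script calls requests.get once per listing page, gallery page, and image, with no timeout and no retry, so a single slow or dropped connection aborts the whole run. Below is a minimal sketch of a more forgiving fetch helper, reusing the same requests library and header scheme; the helper name, retry count, and timeout are illustrative and not part of the original script.

def fetch(url, headers, retries=3, timeout=10):
    # Hypothetical helper: try a GET a few times with a timeout instead of failing outright.
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            pass
    return None

Calls such as pic = requests.get(ImageUrl, headers=head) could then become pic = fetch(ImageUrl, head), with a check that the result is not None before writing the file.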