Using a Python Crawler


'''
Author: Valve
Description: crawls image galleries from mm131.net, keeping only galleries whose titles match
the search terms, skipping blacklisted titles, and downloading only posts within the date range.
'''
import requests,shutil
import os
from bs4 import BeautifulSoup
import random
import datetime
import re
today = datetime.date.today()

###########################################################################
StartDate = datetime.date(2000,4,23) # earliest post date to download
EndDate = datetime.date(2020,4,25) # latest post date to download
# e.g. to download only galleries posted between 2019-03-01 and 2020-04-02, change StartDate and EndDate accordingly
SearchTerm = ['  '] # only galleries whose titles contain one of these terms are downloaded; leave the list empty to download everything
BlackList = ['   ','   ','   '] # galleries whose titles contain any of these terms are skipped
SavePath = 'C:/Users/Jared/Pictures/Saved Pictures/get/mm131/' # directory the galleries are saved into
###########################################################################

url = 'https://www.mm131.net/xinggan/'  # list page to start from (the category root or any list_6_<n>.html page)
PicSerUrl = 'https://img1.mmmw.net/pic/'
RefererLink = 'https://www.mm131.net'
my_headers = [
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "
]
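# every request below rebuilds its headers with a Referer and a user agent picked at random
# from the pool above, so successive requests do not all present the same client string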

if not os.path.exists(SavePath):
    os.mkdir(SavePath)
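# work out the current list page number: the category root counts as page 1, while a
# list_6_<n>.html URL carries its page number at a fixed offset in the string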
if url == 'https://www.mm131.net/xinggan/':
    PageNo = 1
else:
    PageNo = int(url[37:-5])
NextUrl = url
PostDate = EndDate  # fallback so the date check at the bottom of the loop works even if no gallery gets parsed
while NextUrl:
    head = {'Referer':RefererLink ,'user-agent': random.choice(my_headers)}
    html = requests.get( NextUrl, headers=head )
    if html.status_code != 200:  # the next list page does not exist, so the listing is exhausted
        break
    else:
        PageNo += 1        
        NextUrl = 'https://www.mm131.net/xinggan/list_6_'+str(PageNo)+'.html'
    bs = BeautifulSoup(html.content, 'lxml')
    MainBox = bs.find('dl',class_='list-left public-box')
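    # the list box just located holds one <dd> per gallery: its <a> is the gallery link and
    # the alt text of the <img> inside it is the gallery title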

    AllArticles = MainBox.find_all('dd',class_='')
    for article in AllArticles:
        head = {'Referer':RefererLink,'user-agent': random.choice(my_headers)}
        GroupLinkTemp = article.find('a')
        GroupLink = GroupLinkTemp['href']
        GroupNameTemp_0 = GroupLinkTemp.find('img')
        if GroupNameTemp_0 is None:  # some <dd> entries carry no thumbnail; skip them
            continue
        GroupNameTemp = GroupNameTemp_0['alt']        
        RGName = ''.join( e for e in GroupNameTemp if e.isalnum() )

        Ghtml = requests.get(GroupLink,headers=head)
        Gbs = BeautifulSoup(Ghtml.content, 'lxml')
        NumTemp = Gbs.find('span',class_='page-ch')
        if NumTemp is None:
            continue
        Num = int( NumTemp.text[1:-1] )  # the page counter text holds the image count between its first and last character

        WantMark = 1
        if len(SearchTerm) > 0:
            for OnlyWant in SearchTerm:
                pattern = re.compile('.*'+OnlyWant+'.*')
                if re.match(pattern,GroupNameTemp):
                    WantMark = 1
                    break
                else:
                    WantMark = 0
        if not WantMark:
            print(GroupNameTemp,'does not match any search term, skipping')
            continue

        #filter black list
        BlackMark = 0
        for black in BlackList:
            pattern = re.compile('.*'+black+'.*')
            if re.match(pattern,GroupNameTemp):
                BlackMark = 1
                print(GroupNameTemp,'matches the blacklist, skipping')
                break
        if BlackMark :
            continue
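        # the post date sits near the start of the content-msg text; the slice below pulls
        # out its YYYY-MM-DD form for the date-range check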

        DateBox = Gbs.find('div',class_='content-msg')
        PostDateTemp = DateBox.text[5:15]

        year = int(PostDateTemp[0:4])
        month = int(PostDateTemp[5:7])
        day = int(PostDateTemp[8:])
        PostDate = datetime.date(year,month,day)

        RPostDate = ''.join( e for e in PostDateTemp if e.isnumeric() )

        # list pages are ordered newest first, so a gallery newer than EndDate is simply skipped,
        # while the first gallery older than StartDate means nothing further can be in range
        if PostDate > EndDate:
            print(GroupNameTemp,'was posted after EndDate, skipping')
            continue
        elif PostDate < StartDate:
            break

        # the gallery id is the file name of the gallery link, e.g. .../xinggan/5511.html -> 5511
        GroupNo = GroupLink.rsplit('/', 1)[-1].rsplit('.', 1)[0]

        GroupName = RPostDate+'-'+RGName
        DirPath = SavePath+GroupName+'/'
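        # resume logic: a folder that already matches this gallery title and holds at least Num
        # files is treated as complete and skipped; an incomplete one is deleted and redownloaded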

        pattern = re.compile('.*'+RGName+'.*')
        AllDirName = os.listdir(SavePath)
        jump = 0
        for DirName in AllDirName:
            if re.match(pattern,DirName):
                if len(os.listdir(SavePath+DirName)) >= Num:
                    print(SavePath+DirName,': already contains the full gallery, skipping')
                    jump = 1
                    break
                else:
                    shutil.rmtree(SavePath+DirName)
                    print(SavePath+DirName,': exists but is incomplete, deleting it and downloading again')
                    break
        if jump:
            continue

        os.mkdir(DirPath)
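        # image files live on the separate picture server as <GroupNo>/<j>.jpg; the Referer is
        # set to the mobile site, which the image server appears to expect before serving files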

        j = 1
        while j <= Num:
            ImageUrl = PicSerUrl+str(GroupNo)+'/'+str(j)+'.jpg'
            head = {'Referer':'https://m.mm131.net/','user-agent': random.choice(my_headers)}
            pic = requests.get(ImageUrl,headers = head)
            with open(DirPath+GroupName+'-'+str(j)+'.jpg','wb') as f:
                f.write(pic.content)
            print('【mm131】',GroupName,'【'+str(j)+'/'+str(Num)+'】')
            j += 1

    if PostDate < StartDate:
        break
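
One note on the download step: the loop above writes whatever the picture server returns straight to disk, even when the response is an error page, and it sends requests back-to-back. A rough, optional sketch of a gentler per-image fetch (not part of the original script; the fetch_image name, the 10-second timeout and the half-second pause are arbitrary choices):

import time
import requests

def fetch_image(url, referer, agent, timeout=10):
    """Fetch one image; return its bytes, or None if the request fails or is not a 200."""
    head = {'Referer': referer, 'user-agent': agent}
    try:
        pic = requests.get(url, headers=head, timeout=timeout)
    except requests.RequestException:
        return None
    if pic.status_code != 200:
        return None
    time.sleep(0.5)  # brief pause between images so the server is not hammered
    return pic.content

The download loop would then call fetch_image(ImageUrl, 'https://m.mm131.net/', random.choice(my_headers)) and write the file only when the result is not None.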