PythonクローラーでCtrip(携程)の航空券情報を取得し、メールで通知する
15906 ワード
背景:航空券を買うためにずっと手作業で検索していましたが、クローラーを使えばこの手間を解決できます。「PythonでCtrip(携程)の航空券情報を取得する過程の記録(前編)」という記事の説明が非常に詳しく、その手順をたどることで基本的な仕組みをいくつか理解できました。以下は、上海から西安までの4/29〜5/2の航空券を調べるプログラムです:
メール送信プログラム(出典を見つけたら載せます):
航空券の検索プログラムとメール送信プログラムを1つに統合します。
次にcrontabで定期実行タスクを設定し、20分ごとに実行します。基本フォーマット:
so,
まだいろいろな問題は残っていますが、引き続き勉強中です。
#coding:utf-8
import urllib2
from lxml import etree
import json
import random
import sys
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# the process-wide default str<->unicode codec can be switched to UTF-8.
# The order matters: the reload must come before the call.
reload(sys)
sys.setdefaultencoding('utf8')
def get_json2(date,rk,CK,r):
    """Query the SHA -> SIA flight search endpoint for *date* and print cheap flights.

    Args:
        date: Departure date string, e.g. '2017-04-29'.
        rk: Query token produced by get_parameter().
        CK: Query token produced by get_parameter().
        r: Query token produced by get_parameter().

    Prints one line "<lp> <dt> <at>" for every entry of the response's 'fis'
    list whose 'lp' value is below 600 (presumably lowest price, departure
    time and arrival time -- confirm against the API).

    Raises:
        urllib2.URLError: if the request fails.
        ValueError: if the response body is not valid JSON.
        KeyError: if the response has no 'fis' list or an entry lacks a field.
    """
    url = ("http://flights.ctrip.com/domesticsearch/search/SearchFirstRouteFlights"
           "?DCity1=SHA&ACity1=SIA&SearchType=S&DDate1=%s"
           "&IsNearAirportRecommond=0&rk=%s&CK=%s&r=%s" % (date, rk, CK, r))
    headers = {
        'Host': "flights.ctrip.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0",
        # The Referer is the same booking page get_parameter() scrapes for this date.
        'Referer': "http://flights.ctrip.com/booking/hrb-sha-day-1.html?ddate1=%s" % date,
    }
    req = urllib2.Request(url, headers=headers)
    res = urllib2.urlopen(req)
    try:
        content = res.read()
    finally:
        # Always release the HTTP connection, even if read() fails.
        res.close()
    dict_content = json.loads(content, encoding="gb2312")
    for flight in dict_content['fis']:
        if flight[u'lp'] < 600:
            # Single-argument print(...) prints the same text as the original
            # "print a, b, c" statement.
            print("%s %s %s" % (flight[u'lp'], flight[u'dt'], flight[u'at']))
def get_parameter(date):
    """Scrape the booking page for *date* and derive the search query tokens.

    Args:
        date: Departure date string, e.g. '2016-05-13'.

    Returns:
        Tuple (rk, CK, r) built from the whitespace-separated tokens of the
        first <script> in the page body:
          * CK -- characters [-34:-2] of the 4th token, with the character at
                  index 13 moved to index 5;
          * rk -- a random number in [0, 10) printed with 15 decimals, followed
                  by characters [18:24] of the last token;
          * r  -- the last token from index 27 up to its final 3 characters.
    """
    page_url = 'http://flights.ctrip.com/booking/hrb-sha-day-1.html?ddate1=%s' % date
    html = urllib2.urlopen(page_url).read()
    script_text = etree.HTML(html).xpath('''//body/script[1]/text()''')[0]
    tokens = script_text.split()

    raw_ck = tokens[3][-34:-2]
    CK = raw_ck[0:5] + raw_ck[13] + raw_ck[5:13] + raw_ck[14:]

    last_token = tokens[-1]
    rk_suffix = last_token[18:24]
    rk = "%.15f" % (random.random() * 10) + rk_suffix
    r = last_token[27:-3]
    return rk, CK, r
if __name__ == '__main__':
    # Check every departure date of interest; a separator line follows each day.
    for date in ('2017-04-29', '2017-04-30', '2017-05-01', '2017-05-02'):
        rk, CK, r = get_parameter(date)
        get_json2(date, rk, CK, r)
        print("-----")
メール送信プログラム(出典を見つけたら載せます):
# -*- coding: utf-8 -*-
from email import encoders
from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr
import smtplib
def _format_addr(s):
    """Return the 'Name <addr>' string *s* formatted for a mail header.

    The display name is encoded as a UTF-8 header word; a unicode address is
    encoded to UTF-8 bytes, any other address is passed through unchanged.
    """
    display_name, address = parseaddr(s)
    if isinstance(address, unicode):
        address = address.encode('utf-8')
    encoded_name = Header(display_name, 'utf-8').encode()
    return formataddr((encoded_name, address))
# Interactive mail test: prompt for the account details, then send one
# fixed plain-text message through the given SMTP server on port 25.
from_addr = raw_input('From: ')
password = raw_input('Password: ')
to_addr = raw_input('To: ')
smtp_server = raw_input('SMTP server: ')

msg = MIMEText('Not just fly fight...', 'plain', 'utf-8')
# The header templates need a "<%s>" placeholder for the address; without
# it the % operator raises "TypeError: not all arguments converted".
msg['From'] = _format_addr(u'Air <%s>' % from_addr)
msg['To'] = _format_addr(u'126.Air <%s>' % to_addr)
msg['Subject'] = Header(u'flight……', 'utf-8').encode()

server = smtplib.SMTP(smtp_server, 25)
try:
    server.set_debuglevel(1)  # 1 prints the SMTP dialogue; use 0 to silence it
    server.login(from_addr, password)
    server.sendmail(from_addr, [to_addr], msg.as_string())
    server.quit()
finally:
    # Drop the socket even when login/sendmail fails; harmless after quit().
    server.close()
航空券の検索プログラムとメール送信プログラムを1つに統合します。
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib2
from lxml import etree
import json
import random
from email import encoders
from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr
import smtplib
import sys
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# the process-wide default str<->unicode codec can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')
# Mail settings used by get_json2() when it sends a price alert.
# The values shown are masked placeholders; each trailing comment notes the
# interactive prompt that the constant replaces.
from_addr = "****@126.com"  # sender address (replaces raw_input('From: '))
password = "******"  # sender account password (replaces raw_input('Password: '))
to_addr = "********@qq.com"  # recipient address (replaces raw_input('To: '))
smtp_server = "smtp.126.com"  # SMTP host (replaces raw_input('SMTP server: '))
def _format_addr(s):
    """Build a mail header value from a 'Name <addr>' string.

    Splits *s* into display name and address, encodes the name as a UTF-8
    header word, and encodes the address to UTF-8 only when it is unicode.
    """
    parsed = parseaddr(s)
    header_name = Header(parsed[0], 'utf-8').encode()
    mailbox = parsed[1]
    if isinstance(mailbox, unicode):
        return formataddr((header_name, mailbox.encode('utf-8')))
    return formataddr((header_name, mailbox))
def _send_price_alert(flight):
    """Mail the 'lp', 'dt' and 'dpbn' fields of one flight entry to to_addr.

    Uses the module-level from_addr, password, to_addr and smtp_server.
    """
    price = flight[u'lp']
    body = '%r at %s in %s' % (price, flight[u'dt'], flight[u'dpbn'])
    msg = MIMEText(body, 'plain', 'utf-8')
    # The header templates need a "<%s>" placeholder for the address; without
    # it the % operator raises "TypeError: not all arguments converted".
    msg['From'] = _format_addr(u'Air <%s>' % from_addr)
    msg['To'] = _format_addr(u'126.Air <%s>' % to_addr)
    msg['Subject'] = Header(u'flight…%r ' % price, 'utf-8').encode()

    server = smtplib.SMTP(smtp_server, 25)
    try:
        server.set_debuglevel(0)
        server.login(from_addr, password)
        server.sendmail(from_addr, [to_addr], msg.as_string())
        server.quit()
    finally:
        # Drop the socket even when login/sendmail fails; harmless after quit().
        server.close()


def get_json2(date,rk,CK,r):
    """Query SHA -> SIA flights for *date*, print cheap ones and mail the cheapest.

    Args:
        date: Departure date string, e.g. '2017-04-29'.
        rk: Query token produced by get_parameter().
        CK: Query token produced by get_parameter().
        r: Query token produced by get_parameter().

    For every entry of the response's 'fis' list whose 'lp' value is below
    600, prints "<lp> <dt> <at> <dpbn>" (presumably lowest price, departure
    time, arrival time and departure airport -- confirm against the API).
    Entries with 'lp' at or below 450 additionally trigger one alert mail each.

    Raises:
        urllib2.URLError: if the request fails.
        ValueError: if the response body is not valid JSON.
        KeyError: if the response has no 'fis' list or an entry lacks a field.
        smtplib.SMTPException: if sending an alert mail fails.
    """
    url = ("http://flights.ctrip.com/domesticsearch/search/SearchFirstRouteFlights"
           "?DCity1=SHA&ACity1=SIA&SearchType=S&DDate1=%s"
           "&IsNearAirportRecommond=0&rk=%s&CK=%s&r=%s" % (date, rk, CK, r))
    headers = {
        'Host': "flights.ctrip.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0",
        # The Referer is the same booking page get_parameter() scrapes for this date.
        'Referer': "http://flights.ctrip.com/booking/hrb-sha-day-1.html?ddate1=%s" % date,
    }
    req = urllib2.Request(url, headers=headers)
    res = urllib2.urlopen(req)
    try:
        content = res.read()
    finally:
        # Always release the HTTP connection, even if read() fails.
        res.close()
    dict_content = json.loads(content, encoding="gb2312")
    for flight in dict_content['fis']:
        price = flight[u'lp']
        if price < 600:
            # Single-argument print(...) prints the same text as the original
            # "print a, b, c, d" statement.
            print("%s %s %s %s" % (price, flight[u'dt'], flight[u'at'], flight[u'dpbn']))
        if price <= 450:
            _send_price_alert(flight)
def get_parameter(date):
    """Fetch the booking page for *date* and extract the search query tokens.

    Args:
        date: Departure date string, e.g. '2016-05-13'.

    Returns:
        Tuple (rk, CK, r). All three come from the whitespace-separated words
        of the first <script> element in the page body: CK is a 32-character
        slice of the 4th word with the character at index 13 moved to index 5;
        rk is a random number in [0, 10) with 15 decimals followed by
        characters [18:24] of the last word; r is the last word from index 27
        up to its final 3 characters.
    """
    html = urllib2.urlopen(
        'http://flights.ctrip.com/booking/hrb-sha-day-1.html?ddate1=%s' % date
    ).read()
    words = etree.HTML(html).xpath('''//body/script[1]/text()''')[0].split()

    ck_src = words[3][-34:-2]
    CK = ck_src[:5] + ck_src[13] + ck_src[5:13] + ck_src[14:]

    tail = words[-1]
    rk_tail = tail[18:24]
    random_prefix = "%.15f" % (random.random() * 10)
    rk = random_prefix + rk_tail
    r = tail[27:-3]
    return rk, CK, r
if __name__ == '__main__':
    # Look up each departure date in turn; a separator line follows each day.
    for date in ('2017-04-29', '2017-04-30', '2017-05-01'):
        rk, CK, r = get_parameter(date)
        get_json2(date, rk, CK, r)
        print("-----")
次にcrontabで定期実行タスクを設定し、20分ごとに実行します。基本フォーマット:
* * * * * command
so,
0,20,40 * * * * python ~/test.py
まだいろいろな問題は残っていますが、引き続き勉強中です。