Python + Selenium による身分証明書情報のオンライン解析・スクレイピング
3905 ワード
メモとして:
身分証明書番号から地域を検索できるサイトは、ネット上に数多くあります。ここでは、データベース内の身分証明書番号を照会し、その情報を識別します。主に次のサイトを利用します: http://www.gpsspg.com/sfz/
スクリプト:
身分証明書番号から地域を検索できるサイトは、ネット上に数多くあります。ここでは、データベース内の身分証明書番号を照会し、その情報を識別します。主に次のサイトを利用します: http://www.gpsspg.com/sfz/
スクリプト:
#-*- coding: utf-8 -*-
# python 3.5.0
import sqlalchemy
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
class IDCARD(object):
    """Look up ID-card numbers on www.gpsspg.com/sfz/ and store the results.

    The ID numbers are read from the ``[dbo].[ClientInfoAll]`` table, each one
    is queried through a Selenium-driven browser, the result table on the page
    is parsed, and all rows are written to the ``newidcard`` table.

    The instance owns a browser process; call :meth:`close` (or use the
    instance as a context manager) when finished with it.
    """

    # Columns of the result frame and of the ``newidcard`` table.
    _COLUMNS = ('cardid', 'province', 'city', 'district', 'AreaCode',
                'birthday', 'lunarday', 'sex', 'age', 'horoscope', 'lng', 'lat')

    def __init__(self, d):
        """Create the database engine and start the headless browser.

        ``d`` is a mapping with the keys ``'user'``, ``'pwd'``, ``'inst'`` and
        ``'db'`` that are substituted into the ``mssql+pymssql`` URL.
        """
        self._engine = sqlalchemy.create_engine(
            "mssql+pymssql://%s:%s@%s/%s" % (d['user'], d['pwd'], d['inst'], d['db']))
        self._loginurl = "http://www.gpsspg.com/sfz/"
        # A visible Chrome session can be used instead of PhantomJS, e.g.
        # webdriver.Chrome('D:/Python35/selenium/webdriver/chromedriver.exe').
        self._chromedriver = 'D:/Python35/mypy/phantomjs/bin/phantomjs.exe'
        self._driver = webdriver.PhantomJS(self._chromedriver)

    def close(self):
        """Shut down the browser process started in ``__init__``."""
        self._driver.quit()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Never suppress the exception; only make sure the browser goes away.
        self.close()
        return False

    @staticmethod
    def _split_padded(text, sep, count):
        """Split ``text`` on ``sep`` into exactly ``count`` fields.

        Missing trailing fields become ``''`` and surplus fields are dropped,
        so callers can unpack the result without risking an ``IndexError``.
        """
        parts = text.split(sep)
        parts.extend([''] * (count - len(parts)))
        return parts[:count]

    @staticmethod
    def _split_profile(text):
        """Split a ``sex(age)horoscope`` cell into ``(sex, age, horoscope)``.

        Parts that are absent (no parentheses in ``text``) come back as ``''``.
        """
        sex, _, rest = text.partition('(')
        age, _, horoscope = rest.partition(')')
        return sex, age, horoscope

    def get_data_from_db(self):
        """Return the ``identityCard`` column of ``ClientInfoAll`` as a list."""
        sql = """SELECT identityCard FROM [dbo].[ClientInfoAll]"""
        frame = pd.read_sql_query(sql, self._engine)
        return frame['identityCard'].tolist()

    def set_url(self, cardid):
        """Load the query page for ``cardid`` in the browser."""
        self._loginurl = r"http://www.gpsspg.com/sfz/?q=%s" % cardid
        self._driver.get(self._loginurl)

    def get_date(self, cardid, df):
        """Parse the currently loaded result page and append one row to ``df``.

        The row always contains ``cardid``. The remaining fields are filled
        only when the ID shown on the page equals ``cardid``; otherwise they
        stay ``None``. ``df`` is modified in place and nothing is returned.
        """
        table = self._driver.find_element(By.XPATH, "//table[@class='tabs']/tbody")

        def cell(row, col):
            # Stripped text of the cell at the given 1-based row/column.
            xpath = ".//tr[%d]/td[%d]" % (row, col)
            return table.find_element(By.XPATH, xpath).text.strip()

        record = dict.fromkeys(self._COLUMNS)
        record['cardid'] = cardid
        # Only trust the page when it echoes the ID that was asked for.
        if cardid == cell(1, 2):
            # Region cell holds space-separated "province city district".
            (record['province'], record['city'],
             record['district']) = self._split_padded(cell(2, 2), ' ', 3)
            record['AreaCode'] = cell(2, 4)
            record['birthday'] = cell(3, 2)
            record['lunarday'] = cell(3, 4)
            (record['sex'], record['age'],
             record['horoscope']) = self._split_profile(cell(4, 2))
            # Coordinate cell is comma-separated; the first value is stored as
            # lng and the second as lat (order presumed from the page layout).
            record['lng'], record['lat'] = self._split_padded(cell(4, 4), ',', 2)
        # Row labels start at 1 and grow with the frame.
        df.loc[df.shape[0] + 1] = record

    def get_alldata(self):
        """Query every ID from the database and save the results.

        All rows are collected in one DataFrame and written to the
        ``newidcard`` table, replacing the table if it already exists.
        """
        list_idcards = self.get_data_from_db()
        df = pd.DataFrame(columns=list(self._COLUMNS))
        for idcard in list_idcards:
            print(" :%s" % idcard)
            self.set_url(idcard)
            self.get_date(idcard, df)
        df.to_sql('newidcard', self._engine, if_exists='replace', index=False)
if __name__ == "__main__":
    # Database connection settings; the blank values are placeholders that
    # must be filled in before running.
    conn = {'user':' ','pwd':' ','inst':' ','db':' '}
    idcard = IDCARD(conn)
    try:
        idcard.get_alldata()
    finally:
        # Always terminate the browser process, even when the run fails,
        # so no orphaned driver process is left behind.
        idcard._driver.quit()