Pythonクローラー実践 --- Seleniumを利用して表1をスクレイピングする

24903 ワード

                    ,                     ,     sql server 。       !               :        ,                    。             。         ,       python。             ,       selenium   。  ,selenium  webdriver     ,      ,        。           !
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup
import json
from selenium import webdriver
import pyodbc
# ---------------------------------------------------------------------------
# Ctrip / Trip.com hotel scraper.
#
# Reads the pending hotels from DownHotelListLive (dstatus=1), loads each
# hotel's Chinese page (the stored URL) and its English trip.com page with
# Selenium, slices the interesting fields out of the rendered HTML and stores
# them in the CtripHotel* tables.  A hotel that was stored completely is
# marked DStatus=4.
#
# Every statement uses pyodbc "?" parameter markers, so scraped text that
# contains quotes can neither break nor inject into the SQL.  Each hotel is
# written in one transaction: a failure rolls the hotel back and re-raises.
#
# The fixed slice offsets (for example [32:-7]) depend on the exact markup the
# site served when the script was written; they are kept unchanged.
# ---------------------------------------------------------------------------

# NOTE(review): credentials are embedded in source exactly as in the original
# script; they should be moved to configuration or the environment.
CONNECTION_STRING = "DRIVER={SQL Server};SERVER=WIN-20160105DRP;DATABASE=CtripData;UID=sa;PWD=123"

# Language labels written to the [Language] columns.
# NOTE(review): three of the inserts (CN rooms, EN rooms, CN intro) had their
# label blanked by text extraction; they are assumed to use the same two
# labels as the other inserts -- confirm against the original post.
LANG_CN = "中国語"
LANG_EN = "英語"

# NOTE(review): every literal below was a non-ASCII keyword (or an HTML token)
# in the original post and reached this file as a single space / line break.
# They are reproduced as extracted and MUST be restored before the script is
# used; as they stand several of them give wrong results (for example the
# space token also deletes the spaces between English words).
AREA_SUFFIX_KEYWORD = ' '          # if present, last 2 chars of the area are cut
CITY_SUFFIX_KEYWORD = ' '          # if present, last 2 chars of the city are cut
FREE_WIFI_KEYWORD = ' '            # counted per sub-room of a CN base room
NON_SMOKING_KEYWORD = ' '          # must be present in the sub-room cell ...
NON_SMOKING_EXCLUDE_KEYWORD = ' '  # ... while this one must be absent
ADD_BED_SERVICE_NAME = ' '         # thisDetailInfoName of the add-bed entry
AREA_SERVICE_NAME = ' '            # thisDetailInfoName of the room-area entry
LINE_BREAK_TOKEN = '\n'            # presumably "<br/>" / "<br>" originally
SPACE_TOKEN = ' '                  # presumably a non-breaking/full-width space


# ----------------------------- text helpers --------------------------------

def text_after_last_gt(markup):
    """Return what follows the last '>' in *markup* (all of it if none)."""
    return markup.rpartition('>')[2]


def text_after_first_gt(markup):
    """Return what follows the first '>' in *markup* ('' if there is none)."""
    return markup.partition('>')[2]


def text_before_first_lt(markup):
    """Return what precedes the first '<' in *markup* (all of it if none)."""
    return markup.partition('<')[0]


def slice_until_lt(markup, start):
    """Return markup[start:i] for the first '<' at index i >= start, else ''."""
    end = markup.find('<', start)
    return markup[start:end] if end != -1 else ''


def brand_intro_text(markup):
    """Return the text between the last '>' that lies at least 12 characters
    before the end of *markup* and the final 12 characters ('' if no '>')."""
    pos = markup.rfind('>', 0, max(len(markup) - 11, 0))
    return markup[pos + 1:-12] if pos != -1 else ''


# ------------------------- browser / database helpers ----------------------

def _fetch_page_source(url):
    """Load *url* in a fresh Chrome instance and return the rendered HTML."""
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Always release the browser, even when the page load fails.
        driver.quit()


def _execute(cnxn, sql, *params):
    """Run one parameterized statement on a short-lived cursor."""
    cursor = cnxn.cursor()
    try:
        cursor.execute(sql, *params)
    finally:
        cursor.close()


def _fetch_row(cnxn, sql, *params):
    """Run one parameterized query and return its first row (or None)."""
    cursor = cnxn.cursor()
    try:
        cursor.execute(sql, *params)
        return cursor.fetchone()
    finally:
        cursor.close()


# ------------------------------ page parsing -------------------------------

def _cn_base_rooms(page_cn):
    """Return the base-room rows of the Chinese page's room list."""
    room_list = page_cn.find_all("div", {"class": "hroom_list"})[0]
    return room_list.find_all("div", {"class": "hroom_tr J_baseRoomlist "})


def _cn_room_detail(room):
    """Return the JSON object stored in a base room's data-text attribute."""
    base = room.find_all("div", {"class": "hroom_col hroom_col_type"})[0].find("dl", {"class": "hroom_base"})
    raw = base.find("dd", {"class": "hroom_base_txt J_hroom_base_detail"}).get("data-text")
    return json.loads(str(raw))


def _parse_hotel_summary(page_cn):
    """Return the CtripHotelList values scraped from the Chinese page, in
    column order from HotelTag to PriceInfo."""
    cont = page_cn.find("div", {"class": "cont"})
    cont_in = cont.find_all("div", {"class": "cont_in"})[0]
    htl_info = (cont_in.find("div", {"class": "cont_main"})
                .find("div", {"class": "htl_info_com"})
                .find("div", {"class": "htl_info"}))
    head = htl_info.find_all("div")[0]

    # Hotel tags: one <span> per tag; an absent tag box yields an empty string.
    tags_box = head.find("div", {"class": "htl_info_tags"})
    tag_spans = tags_box.find_all("span") if tags_box is not None else []
    hotel_tags = ','.join(str(span)[32:-7] for span in tag_spans)

    # Names: the English name sits in <span class="ename">, the Chinese name
    # is the text of the <h1> before that span.
    name_h1 = head.find("h1", {"class": "name"})
    name_en = str(name_h1.find("span", {"class": "ename"}))[20:-7].strip()
    name_markup = str(name_h1)
    # NOTE(review): str.isalpha() is also true for CJK characters -- confirm
    # that this test really separates "English only" from "has Chinese name".
    if name_markup[35].isalpha():
        name_cn = ''
    else:
        name_cn = text_before_first_lt(name_markup[33:]).strip()

    # Breadcrumb: [1] country, [2] city, last link area.
    crumbs = cont.find("div", {"class": "path_bar2"}).find("div", {"itemprop": "breadcrumb"}).find_all("a")
    country = text_after_last_gt(str(crumbs[1])[:-6])
    city = text_after_last_gt(str(crumbs[2])[:-6])
    area = text_after_last_gt(str(crumbs[-1])[:-4])
    if AREA_SUFFIX_KEYWORD in area:
        area = area[:-2]
    if CITY_SUFFIX_KEYWORD in city:
        city = city[:-2]

    address_span = head.find("div", {"class": "adress"}).find("span", {"class": "address_text"})
    address_en = str(address_span)[27:-7]

    # The hidden input holds "<latitude>|<longitude>".
    coordinate = page_cn.find("input", {"id": "hotelCoordinate"}).get("value").split('|')
    latitude, longitude = coordinate[0], coordinate[1]

    # Review count; hotels without a review summary are stored with 0.
    summary = cont_in.find("div", {"class": "cont_aside"}).find("div", {"class": "cmt_summary c-2"})
    if summary is not None:
        score_link = summary.find("div", {"class": "cmt_summary_hd"}).find("a", {"id": "commnet_score"})
        comment_num = int(text_after_last_gt(str(score_link)[:-12]))
    else:
        comment_num = 0

    price_span = (htl_info.find("div", {"class": "price_box"})
                  .find("div", {"class": "J_price_info"})
                  .find("div", {"class": "staring_price"})
                  .find("div", {"class": "detail_price"})
                  .find("span", {"class": "price"}))
    price = str(price_span)[20:-7]
    if not price.isdigit():
        price = ''

    return (hotel_tags, name_en, name_cn, country, city, area, address_en,
            latitude, longitude, comment_num, price)


# ------------------------------- persistence -------------------------------

def _store_cn_rooms(cnxn, hid, page_cn):
    """Insert one CtripHotelRoomList row per base room of the Chinese page."""
    for room in _cn_base_rooms(page_cn):
        detail = _cn_room_detail(room)
        room_code = int(detail['comment_baseroomId'])
        room_name = detail['comment_baseroomName']

        sub_rooms = room.find("div", {"class": "hroom_tr_cols"}).find_all("div", {"class": "hroom_tr_col J_subRoomlist"})
        free_wifi = 0
        non_smoking = 0
        for sub_room in sub_rooms:
            network = str(sub_room.find("div", {"class": "hroom_col hroom_col_network"}))
            if FREE_WIFI_KEYWORD in network:
                free_wifi += 1
            if NON_SMOKING_KEYWORD in network and NON_SMOKING_EXCLUDE_KEYWORD not in network:
                non_smoking += 1

        # First occurrence wins, like list.index() in the original lookup.
        services = {}
        for item in detail['thisBaseRoomServiceDetailList']:
            services.setdefault(item['thisDetailInfoName'], item['thisDetailInfoVal'])
        add_bed_info = services.get(ADD_BED_SERVICE_NAME, '')
        area_info = services.get(AREA_SERVICE_NAME, '')

        _execute(
            cnxn,
            "insert into CtripHotelRoomList([Hid],[RoomCode],[RoomNameCN],[FreeWifi],[SmokingInfo],"
            "[AreaInfo],[AddBedInfo],[Language],[UpdateDate],[AddDate]) "
            "values(?,?,?,?,?,?,?,?,getdate(),getdate());",
            hid, room_code, room_name, free_wifi, non_smoking, area_info, add_bed_info, LANG_CN)


def _store_en_rooms(cnxn, hid, page_en):
    """Insert one CtripHotelRoomList row per room type of the English page."""
    room_types = page_en.find_all("div", {"class": "m-hotel-type", "id": "room_table"})[0].find_all("div", {"class": "hotel-type__list"})
    for room_type in room_types:
        room_code = int(room_type.find("table").get("data-roomid"))
        content = room_type.find("td", {"class": "h-type"}).find("div", {"class": "h-type__cnt"})
        room_name = text_after_first_gt(str(content.find("a", {"class": "h-type__name is-link"}))[:-4])

        amenities = str(room_type.find("td", {"colspan": "6"}))
        # The original halves the number of 'Free' occurrences.
        free_wifi = amenities.count('Free') // 2
        non_smoking = amenities.count('Non-smoking')

        first_item = content.find("ul", {"class": "o-fi-txt"}).find("li")
        area_info = str(first_item)[50:-5] if first_item is not None else ''

        _execute(
            cnxn,
            "insert into CtripHotelRoomList([Hid],[RoomCode],[RoomNameEN],[FreeWifi],[SmokingInfo],"
            "[AreaInfo],[UpdateDate],[AddDate],[Language]) "
            "values(?,?,?,?,?,?,getdate(),getdate(),?);",
            hid, room_code, room_name, free_wifi, non_smoking, area_info, LANG_EN)


def _store_cn_intro(cnxn, hid, page_cn):
    """Insert the Chinese hotel description, if the page has one."""
    blocks = page_cn.find_all("div", {"class": "htl_room_txt text_3l J_tabHeightConShift_1"})
    if not blocks:
        return
    intro = str(blocks[0].find("div"))[5:-6]
    intro = intro.replace(LINE_BREAK_TOKEN, '').replace(SPACE_TOKEN, '')
    _execute(
        cnxn,
        "insert into CtripHotelDesc([Hid],[Language],[Intro],[AddDate],[UpdateDate]) "
        "values(?,?,?,getdate(),getdate());",
        hid, LANG_CN, intro)


def _store_en_brief(cnxn, hid, page_en):
    """Insert the English hotel brief ('|'-joined highlights plus body text)."""
    wrapper = (page_en.find("div", {"class": "p-hotel-details"})
               .find("div", {"class": "l-inner"})
               .find("div", {"class": "m-hotel-brief"})
               .find("div", {"class": "brief-wrapper"}))
    highlights = [str(strong)[8:-9].strip()
                  for strong in wrapper.find("p", {"class": "brief-prompt"}).find_all("strong")]
    body = str(wrapper.find("div", {"class": "brief-cnt"}))[45:-6]
    brief = '|'.join(highlights) + '|' + body
    brief = brief.replace(LINE_BREAK_TOKEN, '').replace('>', '').replace(SPACE_TOKEN, '')
    # No manual quote doubling: the value is passed as a bound parameter.
    _execute(
        cnxn,
        "insert into CtripHotelDesc([Hid],[Language],[Intro],[UpdateDate],[AddDate]) "
        "values(?,?,?,getdate(),getdate());",
        int(hid), LANG_EN, brief)


def _store_cn_booking_info(cnxn, hid, page_cn):
    """Insert the Chinese booking/policy table as '|'-joined names and values."""
    table = page_cn.find_all("div", {"class": "htl_info_table detail_con_2 J_tabHeightConShift_1"})[0]
    rows = table.find("table").find("tbody").find_all("tr")
    if len(rows) > 4:
        # The original skips the last row of tables with more than four rows.
        rows = rows[:-1]

    names = []
    values = []
    for row in rows:
        names.append(str(row.find('th'))[4:-5])
        entries = []
        for item in row.find_all("td")[0].find_all("ul")[0].find_all("li"):
            item_markup = str(item)
            span = item.find("span")
            lead = slice_until_lt(item_markup, 50)
            middle = ''
            tail = ''
            if span is not None:
                span_markup = str(span)
                if '>' in span_markup:
                    middle = text_after_first_gt(span_markup)[:-7]
                if len(item_markup) > 80:
                    tail = slice_until_lt(item_markup, 78)
            entries.append(lead.strip() + middle.strip() + tail.strip())
        values.append(','.join(entries))

    _execute(
        cnxn,
        "insert into CtripHotelBookInfo([Hid],[Language],[TName],[TInfo],[UpdateDate],[AddDate]) "
        "values(?,?,?,?,getdate(),getdate());",
        hid, LANG_CN, '|'.join(names), '|'.join(values))


def _store_en_facilities(cnxn, hid, page_en):
    """Insert the English facility groups as '|'-joined names and values."""
    groups = (page_en.find("div", {"class": "c-hotel-facility"})
              .find("div", {"class": "c-hotel-facility__wrapper"})
              .find("div", {"class": "c-hotel-facility__normal"})
              .find_all("div", {"class": "c-hotel-facility__normal-item u-clearfix"}))
    names = []
    values = []
    for group in groups:
        content = group.find("div", {"class": "c-hotel-facility__normal-cnt"})
        names.append(text_after_last_gt(str(content.find("p"))[:-4]))
        values.append(','.join(str(item.find("span"))[6:-7]
                               for item in content.find_all("li", {"class": "u-power"})))

    _execute(
        cnxn,
        "insert into CtripHotelBookInfo([Hid],[Language],[TName],[TInfo],[UpdateDate],[AddDate]) "
        "values(?,?,?,?,getdate(),getdate());",
        hid, LANG_EN, '|'.join(names), '|'.join(values))


def _store_cn_brand(cnxn, page_cn):
    """Insert the hotel brand name and brand introduction, if present."""
    blocks = page_cn.find_all("div", {"class": "group_brand htl_room_txt text_3l"})
    if not blocks:
        return
    block = blocks[0]
    brand_name = str(block.find("p").find("b"))[3:-4]
    _execute(
        cnxn,
        "insert into CtripHotelBrand([Language],[BrandName],[Intro],[UpdateDate],[AddDate]) "
        "values(?,?,?,getdate(),getdate());",
        LANG_CN, brand_name, brand_intro_text(str(block)))


def _store_cn_room_facilities(cnxn, page_cn):
    """Insert one CtripHotelFacilities row per base room of the Chinese page."""
    for room in _cn_base_rooms(page_cn):
        detail = _cn_room_detail(room)
        names = []
        values = []
        for item in detail['thisBaseRoomRoomInfoDetailsList']:
            names.append(item['thisDetailInfoName'])
            # Room-info values are lists; service values below are strings.
            values.append(','.join(item['thisDetailInfoVal']))
        room_code = int(detail['comment_baseroomId'])
        for item in detail['thisBaseRoomServiceDetailList']:
            names.append(item['thisDetailInfoName'])
            values.append(item['thisDetailInfoVal'])

        # Take Hid and Rid from the SAME row (the original called fetchone()
        # twice and therefore mixed two rows).
        # NOTE(review): RoomCode alone can match several rows (the CN and EN
        # inserts use the same table); confirm which row is intended.
        row = _fetch_row(cnxn, "SELECT Hid,Rid from CtripHotelRoomList where RoomCode = ?", room_code)
        hid, rid = row[0], row[1]

        _execute(
            cnxn,
            "insert into CtripHotelFacilities([Hid],[Rid],[Language],[TName],[TInfo],[RoomCode],"
            "[UpdateDate],[AddDate]) values(?,?,?,?,?,?,getdate(),getdate());",
            hid, rid, LANG_CN, '|'.join(names), '|'.join(values), room_code)
        print(rid)


def _process_hotel(cnxn, pending):
    """Scrape and store one pending hotel row (columns ID, HotelCode, URL)."""
    # NOTE(review): no parser is named, as in the original, so bs4 picks the
    # best one installed; pin it if the slice offsets must be reproducible.
    page_cn = BeautifulSoup(_fetch_page_source(pending.URL))
    url_en = 'https://www.trip.com/hotels/london-hotel-detail-' + str(pending.HotelCode) + '/'
    page_en = BeautifulSoup(_fetch_page_source(url_en))

    hotel_code = str(pending.HotelCode)
    _execute(
        cnxn,
        "INSERT INTO CtripHotelList([HCode],[HotelTag],[NameEN],[NameCN],[CountryName],[CityName],"
        "[AreaName],[AddrEN],[Latitude],[Longitude],[DianpingNum],[PriceInfo],[SourceURL],"
        "[UpdateDate],[AddDate]) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,getdate(),getdate())",
        hotel_code, *_parse_hotel_summary(page_cn), pending.URL)
    hid = _fetch_row(cnxn, "SELECT Hid from CtripHotelList where HCode=?", hotel_code)[0]

    _store_cn_rooms(cnxn, hid, page_cn)
    _store_en_rooms(cnxn, hid, page_en)
    _store_cn_intro(cnxn, hid, page_cn)
    _store_en_brief(cnxn, hid, page_en)
    _store_cn_booking_info(cnxn, hid, page_cn)
    _store_en_facilities(cnxn, hid, page_en)
    _store_cn_brand(cnxn, page_cn)
    _store_cn_room_facilities(cnxn, page_cn)

    _execute(cnxn, "update downhotellistLive set DStatus=4 where id=?", pending.ID)


def main():
    """Process every hotel whose dstatus is 1, one transaction per hotel."""
    cnxn = pyodbc.connect(CONNECTION_STRING)
    try:
        cursor = cnxn.cursor()
        try:
            cursor.execute("SELECT ID,HotelCode,URL from DownHotelListLive where dstatus=1")
            pending_hotels = cursor.fetchall()
        finally:
            cursor.close()

        for pending in pending_hotels:
            try:
                _process_hotel(cnxn, pending)
                cnxn.commit()
            except Exception:
                # Do not leave a half-written hotel behind; it stays at
                # dstatus=1 and is retried on the next run.
                cnxn.rollback()
                raise
    finally:
        cnxn.close()


if __name__ == "__main__":
    main()
転載元:https://www.cnblogs.com/kanziliang/p/9438132.html