GlidedSky Crawler Practice Site: Basic 1


GlidedSky Crawler Practice: The First Challenge
I recently came across a good crawler practice site.
GlidedSky is a nice crawler practice site, but you have to register before you can use it. The first challenge asks you to sum all the numbers shown on a web page. Because the page layout is uniform, it can be solved in many ways; when fetching the page with a GET request, you need to include a User-Agent and your logged-in cookie in the headers. Below, the numbers are extracted with re, bs4, XPath, CSS selectors, and Selenium automation (for background on Selenium automated testing, see the Selenium documentation).
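For reference, the selectors used throughout the code below imply that each number on the challenge page sits in its own div with class col-md-1 inside a .row container. Here is a minimal self-contained sketch of the summing idea, using made-up sample markup inferred from those selectors (the real page may differ slightly):

from bs4 import BeautifulSoup

# `sample` is made-up markup inferred from the selectors used below, not the real page
sample = '<div class="row"><div class="col-md-1">123</div><div class="col-md-1">456</div></div>'
soup = BeautifulSoup(sample, 'lxml')
print(sum(int(d.text.strip()) for d in soup.find_all('div', class_='col-md-1')))  # 579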
import requests
import re
from fake_useragent import UserAgent  # random User-Agent for the request headers
from lxml import etree
from bs4 import BeautifulSoup

def get_html(url):
    # build the request headers; GlidedSky only serves the challenge to logged-in users
    headers = {
        'User-Agent': UserAgent().random,
        'cookie': 'your cookie'}  # paste the cookie from your logged-in browser session
    try:
        # request the page
        response = requests.get(url, headers=headers)
        # raise an exception for any non-200 status
        response.raise_for_status()
        # use the detected encoding to avoid garbled text
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        # return None on network or HTTP errors
        return None
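A side note, not part of the original script: if you plan to request several pages, a requests.Session carries the cookie and User-Agent across calls automatically. A minimal sketch, with get_html_with_session as a hypothetical helper name:

import requests
from fake_useragent import UserAgent

def get_html_with_session(url, cookie):
    # hypothetical alternative to get_html above; `cookie` is your logged-in cookie string
    session = requests.Session()
    session.headers.update({'User-Agent': UserAgent().random, 'cookie': cookie})
    response = session.get(url)
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    return response.text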

def html_infos(html):
    # re: the pattern is inferred from the page structure targeted below,
    # where each number sits in its own <div class="col-md-1">
    num = 0
    infos = re.findall(r'<div class="col-md-1">(.*?)</div>', html, re.S)
    for info in infos:
        num += int(info.strip())
    print(num)

    # bs4
    num = 0
    soup = BeautifulSoup(html, 'lxml')
    infos = soup.find_all('div', class_="col-md-1")
    for info in infos:
        num += int(info.text.strip())
    print(num)

    # xpath
    num = 0
    infos_txt = etree.HTML(html)  # parse the html text into an element tree
    infos = infos_txt.xpath('//div[@class="col-md-1"]')
    for info in infos:
        num += int(info.text.strip())
    print(num)

    # css selector (requires the cssselect package)
    num = 0
    infos_txt = etree.HTML(html)
    infos = infos_txt.cssselect('.row>.col-md-1')
    for info in infos:
        num += int(info.text.strip())
    print(num)

if __name__ == '__main__':
    url = 'http://www.glidedsky.com/level/web/crawler-basic-1'
    # fetch the challenge page, then sum the numbers four different ways
    html = get_html(url)
    html_infos(html)

-------------------------------------------------------------------------------------

# selenium automation
from selenium import webdriver
from selenium.webdriver.common.by import By  # locate elements in the HTML DOM
from selenium.webdriver.support.ui import WebDriverWait  # explicit waits
from selenium.webdriver.support import expected_conditions as EC  # wait conditions

# launch the browser
driver = webdriver.Chrome()
# open the login page
url = 'http://www.glidedsky.com/login'
driver.get(url)
# maximize the window
driver.maximize_window()
# wait until the login form fields are present
WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, 'form-control')))

# log in: selenium fills out the real login form
user = driver.find_element_by_id('email')
user.click()
user.send_keys('your account email')  # placeholder
password = driver.find_element_by_id('password')
password.click()
password.send_keys('your password')  # placeholder
# click the login button
login = driver.find_element_by_css_selector(
    '#app > main > div.container > div > div > div > div.card-body > form > div.form-group.row.mb-0 > div > button')
login.click()

# open the first challenge from the level table
spider1 = driver.find_element_by_xpath('//*[@id="app"]/main/div[1]/div/div/table/tbody/tr[1]/td[1]/a')
spider1.click()
# follow the link to the page that holds the numbers
url1 = driver.find_element_by_xpath('//*[@id="app"]/main/div[1]/div/div/div/div/a').get_attribute('href')
driver.get(url1)

# sum the numbers on the page
num = 0
infos = driver.find_elements_by_class_name('col-md-1')
for info in infos:
    num += int(info.text.strip())
print(num)
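One compatibility note: the find_element_by_* helpers used above belong to the Selenium 3 API and were removed in Selenium 4. A minimal sketch of the same lookups with the current By-based API, reusing the locators from the script above:

from selenium import webdriver
from selenium.webdriver.common.by import By

# Selenium 4 style: only the locator calls change
driver = webdriver.Chrome()
driver.get('http://www.glidedsky.com/login')
user = driver.find_element(By.ID, 'email')
password = driver.find_element(By.ID, 'password')
# ...log in and navigate exactly as above, then:
infos = driver.find_elements(By.CLASS_NAME, 'col-md-1')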