Python外国語定期刊行物論文情報(機械計器工業)
26448 ワード
NSTL国家科学技術図書文献センター2017機械計器工業全定期刊行物論文情報
コードは雑に書いてあるので、細かい点は気にしないでください。
最初のステップは、すべての定期刊行物のリンクを取得します.
第2歩は、各定期刊行物の2017年の論文リンクをすべて取得することです.
第三歩、論文情報の詳細ページのソースコードを取得する
ステップ4、ソースコードの解析
転載先:https://www.cnblogs.com/zhangtianyuan/p/9199324.html
コードは雑に書いてあるので、細かい点は気にしないでください。
最初のステップは、すべての定期刊行物のリンクを取得します.
#coding=utf-8
# Step 1: collect the link of every journal from the NSTL category search
# page (category TH) and store each raw href in the MongoDB collection
# "journal_urls".
# FIX(review): the pasted code had lost all indentation and used the
# pymongo-2.x `collection.insert`, deprecated since pymongo 3.0; restored
# runnable structure and switched to `insert_one`.
import time
from selenium import webdriver
from lxml import etree
from pymongo import MongoClient

client = MongoClient("IP", 27017)  # NOTE(review): "IP" is a placeholder host
db = client["nstl"]
collection = db["journal_urls"]
db.authenticate("", "")  # NOTE(review): empty credentials — confirm auth setup

driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")
driver.get('https://www.nstl.gov.cn/facade/search/clcSearch.do?&lan=eng&clc=TH')

html = driver.page_source
tree = etree.HTML(html)
# Total number of result pages (47 at the time the original was written).
count = int(tree.xpath("//span[@id='totalPages1']/text()")[0])

for i in range(count):
    # Re-parse whatever page is currently displayed.
    html = driver.page_source
    tree = etree.HTML(html)
    # One journal link per result row.
    for href in tree.xpath("//div[@class='s2listtd2']/span/a/@href"):
        collection.insert_one({'url': href})
    # Last page reached: nothing left to click.
    if i == count - 1:
        break
    # Page links are 1-based, so the next page's label is i + 2.
    driver.find_element_by_xpath('//div[@id="page"]/div//a[text()="%s"]' % str(i + 2)).click()
    # The click loads content asynchronously: poll until the source changes.
    while True:
        time.sleep(1)
        if driver.page_source != html:
            break
driver.close()
第2歩は、各定期刊行物の2017年の論文リンクをすべて取得することです.
#coding=utf-8
# Step 2: for every journal URL stored by step 1, open the journal page,
# expand its 2017 volume tree, and store the href of every 2017 article
# in the MongoDB collection "journalArticle2017_urls".
# FIX(review): the original parsed the stale first-page tree (`tree`)
# inside the per-issue loop instead of the freshly rendered page
# (`wen_tree`), so every issue stored the same article links. Also
# replaced the bare `except:`, removed the unused `requests` import, and
# switched deprecated `insert` to `insert_one`.
import time
from selenium import webdriver
from lxml import etree
from pymongo import MongoClient

client = MongoClient("IP", 27017)  # NOTE(review): "IP" is a placeholder host
db = client["nstl"]
collection1 = db["journal_urls"]
collection2 = db["journalArticle2017_urls"]
db.authenticate("", "")

driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")

for item in collection1.find({}, {"url": 1, "_id": 0}):
    # The stored href wraps the real journal URL; strip the wrapper.
    # TODO(review): confirm the [29:-4] slice offsets against stored data.
    driver.get(item['url'][29:-4])
    html = driver.page_source
    tree = etree.HTML(html)
    # When a 2018 node exists the 2017 node is collapsed: expand it and
    # open its first volume entry.
    if tree.xpath("//div[@id='year_2018']"):
        driver.find_element_by_xpath("//div[@id='year_2017']").click()
        time.sleep(1)
        driver.find_element_by_xpath("//div[@id='volumeUl_2017']/div[@class='ltreebom2']").click()
    # All volume/issue entries under the 2017 node.
    issues = tree.xpath("//div[@id='volumeUl_2017']//div[@class='ltreebom3']/a")
    for i in range(1, len(issues) + 1):
        wen_html = driver.page_source
        wen_tree = etree.HTML(wen_html)  # parse the page actually on screen
        for href in wen_tree.xpath("//div[@class='s2listtd2']/a/@href"):
            collection2.insert_one({'url': href})
        # No further issue to click after the last one.
        if i == len(issues):
            break
        # Advance to the next issue; keep the original best-effort
        # semantics (stop on any failure to locate/click the node).
        try:
            driver.find_element_by_xpath("//div[@id='volumeUl_2017']//div[@class='ltreebom3'][%s]" % str(i + 1)).click()
        except Exception:
            break
        # Poll until the content changes after the click.
        while True:
            time.sleep(1)
            if driver.page_source != wen_html:
                break
driver.close()
第三歩、論文情報の詳細ページのソースコードを取得する
#coding=utf-8
# Step 3: fetch the full-view (detail) page of every article collected in
# step 2 and store its rendered HTML in the MongoDB collection
# "journalArticle2017_codes" for offline parsing in step 4.
# FIX(review): the original wait loop compared against the undefined name
# `html` (it was only assigned inside commented-out requests code), which
# raises NameError on the first iteration. Snapshot the previous page
# source before navigating instead. Dead commented-out requests code and
# the unused requests/lxml imports were removed; deprecated `insert`
# replaced with `insert_one`.
import time
from selenium import webdriver
from pymongo import MongoClient

client = MongoClient("IP", 27017)  # NOTE(review): "IP" is a placeholder host
db = client["nstl"]
collection = db["journalArticle2017_urls"]
collection1 = db["journalArticle2017_codes"]
db.authenticate("", "")

driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")

for item in collection.find({}, {"url": 1, "_id": 0}):
    # Rebuild the full-view URL from pieces of the stored href (SEQNO and
    # subDocType). TODO(review): confirm slice offsets against the data.
    url = ("https://www.nstl.gov.cn/facade/search/toFullView.do?checkedSEQNO="
           + item['url'][23:-11] + "&subDocType=" + item['url'][-8:-3])
    previous = driver.page_source  # snapshot before navigating
    driver.get(url)
    # Wait (up to ~100 s) until the displayed page differs from the
    # previous one, i.e. the detail page has actually rendered.
    for _ in range(100):
        time.sleep(1)
        if driver.page_source != previous:
            break
    # Persist the rendered source for later parsing.
    collection1.insert_one({'html': driver.page_source})
driver.close()
ステップ4、ソースコードの解析
#coding=utf-8
# Step 4: parse the detail-page HTML stored by step 3 and write one
# structured document per article into "journalArticle2017_data".
# FIX(review): `title[0]` was indexed unconditionally (IndexError on pages
# without a title span) — now guarded like every other field; the unused
# counter `n` was removed; deprecated `insert` replaced with `insert_one`;
# the twelve copy-pasted label lookups collapsed into one table-driven loop.
from pymongo import MongoClient
from lxml import etree

client = MongoClient("IP", 27017)  # NOTE(review): "IP" is a placeholder host
db = client["nstl"]
collection1 = db["journalArticle2017_codes"]
collection2 = db["journalArticle2017_data"]
db.authenticate("", "")

# Field labels as they appear on the detail page.
# NOTE(review): the original Chinese label texts were garbled in
# transcription — every label except ISSN reads '【 】:' here, so most
# XPaths below match the same node. Restore the real labels before use.
zzdw, km, ma, cbn, j, q, qy, zy, zys, flh, gjc, yz, wz = u'【 】:', u'【 】:', u'【ISSN】:', u'【 】:', u'【 】:', u'【 】:', u'【 】:', u'【 】:', u'【 】:', u'【 】:', u'【 】:', u'【 】:', u'【 】:'

# (output key, page label) pairs for the single-valued fields.
SCALAR_FIELDS = (
    ('organization', zzdw), ('journal_name', km), ('issn', ma),
    ('publication_year', cbn), ('volume', j), ('issue', q),
    ('page_start', qy), ('page_end', zy), ('page_count', zys),
    ('clc', flh), ('language', yz), ('summary', wz),
)

for item in collection1.find({}, {"html": 1, "_id": 0}):
    tree = etree.HTML(item["html"])
    dc = {}
    title = tree.xpath("//span[@name='title']/text()")
    if title:
        dc['title'] = title[0]
    # Authors are kept as the full list (there may be several).
    author = tree.xpath("//a[starts-with(@href,'javascript:searchByAuthor')]/text()")
    if author:
        dc['author'] = author
    # Each scalar field is the text following the div carrying its label.
    for key, label in SCALAR_FIELDS:
        values = tree.xpath("//div[text()='%s']/following-sibling::*/text()" % label)
        if values:
            dc[key] = values[0]
    # NOTE(review): only the first keyword is kept, matching the original
    # behavior — consider storing the whole list like 'author'.
    keywords = tree.xpath("//div[text()='%s']/following-sibling::*/span/a/text()" % gjc)
    if keywords:
        dc['keywords'] = keywords[0]
    collection2.insert_one(dc)
転載先:https://www.cnblogs.com/zhangtianyuan/p/9199324.html