This article walks through Python code that crawls this site's e-book information and stores it in a database.
It is an entry-level crawler: it grabs each book's name, description, and download addresses, and saves them to the database.
Database utility class: DBUtil.py
import pymysql

class DBUtils(object):
    def connDB(self):  # open a connection and a cursor
        conn = pymysql.connect(host='192.168.251.114', port=3306, user='root',
                               passwd='b6f3g2', db='yangsj', charset='utf8')
        cur = conn.cursor()
        return (conn, cur)

    def exeUpdate(self, conn, cur, sql):  # run an INSERT/UPDATE statement and commit
        sta = cur.execute(sql)
        conn.commit()
        return sta

    def exeDelete(self, conn, cur, IDs):  # demo: delete rows by space-separated ids
        sta = 0
        for eachID in IDs.split(' '):
            sta += cur.execute("delete from students where Id=%d" % (int(eachID)))
        conn.commit()
        return sta

    def exeQuery(self, cur, sql):  # run a SELECT; the caller fetches rows from cur
        effect_row = cur.execute(sql)
        return (effect_row, cur)

    def connClose(self, conn, cur):  # close the cursor, then the connection
        cur.close()
        conn.close()

if __name__ == '__main__':
    dbUtil = DBUtils()
    conn, cur = dbUtil.connDB()
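One caveat worth flagging: exeUpdate runs whatever SQL string it is given, and the callers below build those strings with % formatting, so a quote inside a book title would break the statement (the scraper strips single quotes for exactly this reason). As a minimal sketch, assuming the same pymysql connection, the values can instead be passed as a parameter tuple and escaped by the driver; exe_update_safe is a hypothetical helper, not part of the original code:

def exe_update_safe(conn, cur, sql, params):
    # hypothetical variant of DBUtils.exeUpdate: pymysql escapes each item in
    # params itself, so quotes inside book titles cannot corrupt the statement
    sta = cur.execute(sql, params)
    conn.commit()
    return sta

# usage (the placeholder is %s for every column type):
# exe_update_safe(conn, cur,
#     "insert into book (bookName,bookUrl,bookInfo) values (%s,%s,%s)",
#     (book.bookName, book.downLoadUrl, book.mainInfo))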
Book operations file: bookOpe.py
from DBUtil import DBUtils
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging

logging.basicConfig(
    level=logging.INFO
)

class BookOperator(object):
    def __addBook(self, book):  # insert one book row
        logging.info("add book:%s" % book.bookName)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        insertBookSql = ("insert into book (bookName,bookUrl,bookInfo) values ('%s','%s','%s');"
                         % (book.bookName, book.downLoadUrl, book.mainInfo))
        dbUtil.exeUpdate(conn, cur, insertBookSql)
        dbUtil.connClose(conn, cur)

    def __selectLastBookId(self):  # fetch the id of the most recently inserted book
        logging.info("selectLastBookId ")
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        selectLastBookSql = "select id from book order by id desc limit 1"
        effect_row, cur = dbUtil.exeQuery(cur, selectLastBookSql)
        bookId = cur.fetchone()[0]
        dbUtil.connClose(conn, cur)
        return bookId

    def __addBookDownLoadInfos(self, downLoadInfos, bookId):  # insert the book's download links
        logging.info("add bookId:%s" % bookId)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        for downLoadinfo in downLoadInfos:
            insertBookDownLoadInfo = ("insert into book_down_url (bookId,downName,downUrl) values ('%s','%s','%s');"
                                      % (bookId, downLoadinfo.downName, downLoadinfo.downUrl))
            dbUtil.exeUpdate(conn, cur, insertBookDownLoadInfo)
        dbUtil.connClose(conn, cur)

    def addBookInfo(self, book):  # insert the book, then its download links
        logging.info("add bookInfo:%s" % book.bookName)
        self.__addBook(book)
        bookId = self.__selectLastBookId()
        self.__addBookDownLoadInfos(book.downLoadInfos, bookId)

if __name__ == '__main__':
    bookope = BookOperator()
    book = Book("aaa", "yang", "cccc")
    book.addDownLoadUrl(DownLoadInfo("aaa.html", " "))
    bookope.addBookInfo(book)
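A second caveat: addBookInfo recovers the new book's id by re-querying for the highest id, which can pick up someone else's row if two inserts ever run side by side. pymysql already exposes the AUTO_INCREMENT value generated on the current connection via cursor.lastrowid, so a sketch of an alternative (the function name is hypothetical, and it uses the parameterized form shown earlier) would be:

from DBUtil import DBUtils

def add_book_returning_id(book):
    # hypothetical replacement for __addBook + __selectLastBookId
    dbUtil = DBUtils()
    conn, cur = dbUtil.connDB()
    cur.execute("insert into book (bookName,bookUrl,bookInfo) values (%s,%s,%s)",
                (book.bookName, book.downLoadUrl, book.mainInfo))
    conn.commit()
    bookId = cur.lastrowid  # AUTO_INCREMENT id assigned to the row just inserted
    dbUtil.connClose(conn, cur)
    return bookId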
Book information file: bookInfo.py
import sys
sys.encoding = "utf8"  # kept from the original; this assignment has no real effect

class Book(object):
    # one scraped book: descriptive text, source URL, title, and download links
    def __init__(self, mainInfo, downLoadUrl, bookName):
        self.mainInfo = mainInfo
        self.downLoadUrl = downLoadUrl
        self.bookName = bookName
        self.downLoadInfos = []

    def addDownLoadUrl(self, downloadInfo):
        self.downLoadInfos.append(downloadInfo)

    def print_book_info(self):
        print("bookName :%s" % (self.bookName))

class DownLoadInfo(object):
    # a single download link: URL plus its display name
    def __init__(self, downUrl, downName):
        self.downUrl = downUrl
        self.downName = downName

    def print_down_info(self):
        print("downLoad %s - %s" % (self.downUrl, self.downName))
Page parsing file for jb51.net: FiveOneJobFetch.py
import requests
from bs4 import BeautifulSoup
import sys
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging

sys.encoding = "utf8"  # kept from the original; this assignment has no real effect

class PageFetch(object):
    host = "https://www.jb51.net/"  # site root; the original "//..." lacks a scheme, which requests rejects
    category = "books/"  # book-list section of the site

    def __init__(self, pageUrl):
        self.pageUrl = pageUrl  # relative URL of the list page, e.g. list152_1.html
        self.url = PageFetch.host + PageFetch.category + pageUrl

    def __getPageContent(self):  # instance variant; unused, kept from the original
        req = requests.get(self.url)
        if req.status_code == 200:
            req.encoding = "gb2312"
            return req.text
        else:
            return ""

    @staticmethod
    def getPageContent(url):  # fetch a page and decode it as gb2312
        req = requests.get(url)
        if req.status_code == 200:
            req.encoding = "gb2312"
            return req.text
        else:
            return ""

    def __getMaxPageNumAndUrl(self):
        # follow the pager until the last page; in a URL like list45_2.html,
        # the "2" is the page number
        fetchUrl = self.pageUrl
        maxPageNum = 0
        maxLink = ""
        while maxLink == "":
            url = PageFetch.host + PageFetch.category + fetchUrl
            reqContent = PageFetch.getPageContent(url)
            soup = BeautifulSoup(reqContent, "html.parser")
            for ul in soup.select(".plist"):
                print(ul)  # debug output of the pagination block
                maxPageNum = ul.select("strong")[0].text
                alink = ul.select("a")
                if alink[-1]['href'] == "#":  # on the last page the "next" link is dead
                    maxLink = alink[1]['href']
                else:
                    fetchUrl = alink[-1]['href']
        return maxPageNum, maxLink

    def __formatPage(self, pageNum):
        # build the relative URL for page pageNum+1, e.g. list45_2.html
        lineBeginSite = self.pageUrl.index("_") + 1
        docBeginSite = self.pageUrl.index(".")
        return self.pageUrl[:lineBeginSite] + str(pageNum + 1) + self.pageUrl[docBeginSite:]

    def getBookPageList(self):
        # absolute URLs of every list page in this category
        shortPageList = []
        maxPageNum, urlPattern = self.__getMaxPageNumAndUrl()
        for i in range(int(maxPageNum)):
            shortPageList.append(self.host + self.category + self.__formatPage(i))
        return shortPageList

    @staticmethod
    def getDownloadPage(url):  # collect the per-book detail-page URLs on one list page
        downPage = []
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        for a in soup.select(".cur-cat-list .btn-dl"):
            downPage.append(PageFetch.host + a['href'])
        return downPage

    @staticmethod
    def getBookInfo(url):  # parse one detail page into a Book with its download links
        logging.info(" url:%s" % url)
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        mainInfo = (soup.select("#soft-intro"))[0].text.replace(" :", "").replace("'", "")
        title = (soup.select("dl dt h1"))[0].text.replace("'", "")
        book = Book(mainInfo, url, title)
        for ul in soup.select(".ul_Address"):
            for li in ul.select("li"):
                downLoadInfo = DownLoadInfo(li.select("a")[0]['href'], li.select("a")[0].text)
                book.addDownLoadUrl(downLoadInfo)
        return book

if __name__ == '__main__':
    p = PageFetch("list152_1.html")
    shortPageList = p.getBookPageList()
    downPage = []
    for page in shortPageList:
        downLoadPage = PageFetch.getDownloadPage(page)
        downPage = downPage + downLoadPage
    print("===============================================")
    for bookDownLoadPage in downPage:
        book = PageFetch.getBookInfo(bookDownLoadPage)
        print(book.bookName + ":%s" % book.downLoadUrl)
        for d in book.downLoadInfos:
            print("%s - %s" % (d.downUrl, d.downName))
    # p = PageFetch("list977_1.html")
    # p = p.getMaxPageNumAndUrl()
    # print(p)
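The slicing in __formatPage deserves a worked example: for pageUrl = "list152_1.html", index("_")+1 is 8 and index(".") is 9, so the page number sits between those two cuts and pageNum+1 is spliced in between them. The same logic as a standalone check:

def format_page(page_url, page_num):
    # standalone copy of PageFetch.__formatPage's slicing, for illustration
    line_begin = page_url.index("_") + 1   # position just after the underscore
    doc_begin = page_url.index(".")        # position of the extension dot
    return page_url[:line_begin] + str(page_num + 1) + page_url[doc_begin:]

assert format_page("list152_1.html", 0) == "list152_1.html"
assert format_page("list152_1.html", 4) == "list152_5.html"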
Main script: copy all of the files above into the same folder and run this file.
from FiveOneJobFetch import PageFetch
from bookInfo import Book
from bookInfo import DownLoadInfo
from bookOpe import BookOperator

def main(url):
    p = PageFetch(url)
    shortPageList = p.getBookPageList()
    bookOperator = BookOperator()
    downPage = []
    for page in shortPageList:
        downLoadPage = PageFetch.getDownloadPage(page)
        downPage = downPage + downLoadPage
    for bookDownLoadPage in downPage:
        book = PageFetch.getBookInfo(bookDownLoadPage)
        bookOperator.addBookInfo(book)
    print("done: " + url)

if __name__ == '__main__':
    urls = ["list152_35.html", "list300_2.html", "list476_6.html", "list977_2.html", "list572_5.html", "list509_2.html", "list481_1.html", "list576_1.html", "list482_1.html", "list483_1.html", "list484_1.html"]
    for url in urls:
        main(url)
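main() issues one request per list page and one per book page with no pause between them, which real sites tend to throttle. A gentler variant, as a sketch (the one-second delay and the function name are assumptions, not part of the original):

import time
from FiveOneJobFetch import PageFetch
from bookOpe import BookOperator

def main_politely(url, delay_seconds=1.0):
    # same flow as main(), with a pause before every HTTP request
    p = PageFetch(url)
    bookOperator = BookOperator()
    for page in p.getBookPageList():
        time.sleep(delay_seconds)
        for bookDownLoadPage in PageFetch.getDownloadPage(page):
            time.sleep(delay_seconds)
            book = PageFetch.getBookInfo(bookDownLoadPage)
            bookOperator.addBookInfo(book)
    print("done: " + url)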
Database tables: the book information table and the download-address table. (Note the tables are declared utf8mb4 while DBUtil.py connects with charset='utf8'; the difference only matters for characters outside MySQL's 3-byte utf8.)
CREATE TABLE `book` (
`id` INT(11) NOT NULL AUTO_INCREMENT,
`bookName` VARCHAR(200) NULL DEFAULT NULL,
`bookUrl` VARCHAR(500) NULL DEFAULT NULL,
`bookInfo` TEXT NULL,
PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=2936;
CREATE TABLE `book_down_url` (
`id` INT(11) NOT NULL AUTO_INCREMENT,
`bookId` INT(11) NOT NULL DEFAULT '0',
`downName` VARCHAR(200) NOT NULL DEFAULT '0',
`downUrl` VARCHAR(2000) NOT NULL DEFAULT '0',
PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=44441;
Git repository: https://git.oschina.net/yangsj/BookFetch/tree/master