【python 3 2時間入門】入門ノート03:簡単なクローラー+マルチスレッドクローラー
9424 ワード
原文のリンク:http://www.cnblogs.com/the-fool/p/11054115.html
ターゲットページをローカルに保存する
1、クローラーのコードはネット上のものを修正したもので、現在安定して動作しています。必要なデータを正確に取得できれば十分です。データ量は大きくないため、まだマルチスレッド化していません。
2、分割戦略は照会条件によって分類し、マルチスレッドを循環的に起動する.
1、単スレッド単純爬虫類(二次整理)
2、マルチスレッド爬虫(第二回整理)
ここは安全なキューを利用してスレッドの安全を保証し、まずアドレスをキューに入れます(ネットワークから抜粋します).
ターゲットページをローカルに保存する
1、クローラーのコードはネット上のものを修正したもので、現在安定して動作しています。必要なデータを正確に取得できれば十分です。データ量は大きくないため、まだマルチスレッド化していません。
2、分割戦略は照会条件によって分類し、マルチスレッドを循環的に起動する.
1、単スレッド単純爬虫類(二次整理)
import datetime
import json
import os
import time
import urllib.parse
import urllib.request
# ,
def getHtml(url, values):
    """GET *url* with *values* URL-encoded as the query string.

    Returns the response body decoded as UTF-8.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'
    headers = {'User-Agent': user_agent}
    data = urllib.parse.urlencode(values)
    # BUG FIX: the original built `headers` but never sent them; wrap the
    # URL in a Request so the User-Agent actually reaches the server.
    req = urllib.request.Request(url + '?' + data, headers=headers)
    response_result = urllib.request.urlopen(req).read()
    html = response_result.decode('utf-8')
    return html
#
def requestCnblogs(index):
    """Request one page of results; *index* is the page number."""
    print(' ')
    params = {
        'param1': '',
        'param2': '',
        'param3': '308',
        'page': index,
    }
    return getHtml('http://xxx xxx.com/', params)
#print(requestCnblogs(1))
# IO
def writeToTxt(html, file_path):
    """Write *html* to *file_path* as UTF-8, printing the path first.

    On IOError the failure is reported rather than raised (best-effort,
    matching the original behaviour).
    """
    print(file_path)
    try:
        # `with` closes the handle even if write() raises; the original
        # leaked the file object on a failed write.
        with open(file_path, "w+", encoding='utf-8') as fp:
            fp.write(html)
    except IOError:
        print("fail to open file")
#
def createFile():
    """Ensure the output folder exists and return its path.

    Uses os.makedirs(exist_ok=True) so the call also succeeds when the
    parent directory is missing — os.mkdir raised FileNotFoundError there.
    """
    # date = datetime.datetime.now().strftime('%Y-%m-%d')
    path = r'P:\Users' + '/foldername'
    os.makedirs(path, exist_ok=True)
    return path
#
def saveBlogs():
i=1;
while 1==1:
try:
print('request for '+str(i)+'...')
blogs = requestCnblogs(i)
#
path = createFile()
writeToTxt(blogs,path+'/filenames'+ str(i) +'.txt')
print(' '+ str(i) +' ')
i = i + 1;
except IOError:
print("sleep 10min and retry")
return 'success'
#
result = saveBlogs()
print(result)
併発爬虫類:https://www.cnblogs.com/huohuohuo1/p/9064759.html 2、マルチスレッド爬虫(第二回整理)
ここは安全なキューを利用してスレッドの安全を保証し、まずアドレスをキューに入れます(ネットワークから抜粋します).
# coding=utf-8
# Idiom fix: one import per line (PEP 8) instead of a comma-chained import.
import threading
import queue
import time
import urllib
from urllib import request

baseUrl = 'http://www.pythontab.com/html/pythonjichu/'
urlQueue = queue.Queue()
# Seed the thread-safe queue with pages 2..9 before any worker starts.
for i in range(2, 10):
    url = baseUrl + str(i) + '.html'
    urlQueue.put(url)
    # print(url)
def fetchUrl(urlQueue):
    """Worker loop: pull URLs from *urlQueue* until it is empty.

    Each URL is fetched; on HTTP 200 the body is decoded as UTF-8 and
    printed. Fetch errors skip only the failing URL so one bad page
    cannot kill the worker thread.
    """
    while True:
        try:
            # Non-blocking get: an empty queue means this worker is done.
            url = urlQueue.get_nowait()
        except queue.Empty:  # narrowed from `except Exception`
            break
        print('Current Thread Name %s, Url: %s ' % (threading.current_thread().name, url))
        try:
            response = urllib.request.urlopen(url)
            responseCode = response.getcode()
        except Exception:
            # Network failure for this URL only; move on to the next one.
            continue
        if responseCode == 200:
            html = response.read().decode('utf-8')
            time.sleep(1)  # throttle so we do not hammer the server
            print(html)
if __name__ == '__main__':
    startTime = time.time()
    # Spawn a fixed pool of workers that all drain the same queue.
    threadNum = 4
    threads = [threading.Thread(target=fetchUrl, args=(urlQueue,))
               for _ in range(threadNum)]
    for worker in threads:
        worker.start()
    # join() blocks the main thread until every worker has finished.
    for worker in threads:
        worker.join()
    endTime = time.time()
    print('Done, Time cost: %s ' % (endTime - startTime))
3、自分で改善した版です。整理はしていませんが、実際に使用しています。# coding=utf-8
# Idiom fix: one import per line (PEP 8) instead of comma-chained imports.
import threading
import queue
import time
import urllib
import urllib.parse
import urllib.request
import os
import datetime
import json
from urllib import request

# Base URL of the detail pages; worker threads share this queue.
baseUrl = 'http://www.xxxxxxxxx.cn/xxx/402/'
urlQueue = queue.Queue()
def writeToTxt(html, file_path):
    """Write *html* (one downloaded page) to *file_path* as UTF-8."""
    print(file_path)
    try:
        # `with` closes the handle even if write() raises; the original
        # leaked the file object on a failed write.
        with open(file_path, "w+", encoding='utf-8') as fp:
            fp.write(html)
    except IOError:
        print("fail to open file")
#
def createFiles():
    """Ensure the output folder exists and return its path.

    Uses os.makedirs(exist_ok=True) so the call also succeeds when the
    parent directory is missing — os.mkdir raised FileNotFoundError there.
    """
    # date = datetime.datetime.now().strftime('%Y-%m-%d')
    path = r'P:\Users3' + '/402'
    os.makedirs(path, exist_ok=True)
    return path
# Enqueue detail pages 1..880 for the worker threads to fetch.
for page in range(1, 881):
    url = baseUrl + str(page) + "/"
    urlQueue.put(url)
    # print(url)
def fetchUrl(urlQueue):
    """Worker loop: download each queued URL and save it to a .txt file.

    BUG FIX: the original named the output file after urlQueue.qsize(),
    which is the *remaining* queue length and races between the 4 worker
    threads, producing wrong and colliding file names. The page id is now
    derived from the URL itself (URLs look like <base>/<page>/).
    """
    while True:
        try:
            # Non-blocking get: an empty queue means this worker is done.
            url = urlQueue.get_nowait()
        except queue.Empty:  # narrowed from `except Exception`
            break
        # Last path segment is the page number put on the queue.
        page_id = url.rstrip('/').rsplit('/', 1)[-1]
        print('Current Thread Name %s, Url: %s ' % (threading.current_thread().name, url))
        try:
            response = urllib.request.urlopen(url)
            responseCode = response.getcode()
        except Exception:
            # Skip unreachable pages; keep the worker alive.
            continue
        if responseCode == 200:
            html = response.read().decode('utf-8')
            path = createFiles()
            writeToTxt(html, path + '/filename' + page_id + '.txt')
if __name__ == '__main__':
    startTime = time.time()
    # Fan out over a fixed pool of workers sharing the one queue.
    threadNum = 4
    threads = [threading.Thread(target=fetchUrl, args=(urlQueue,))
               for _ in range(threadNum)]
    for worker in threads:
        worker.start()
    # join() waits until every worker has drained its share of the queue.
    for worker in threads:
        worker.join()
    endTime = time.time()
    print('Done, Time cost: %s ' % (endTime - startTime))
def saveBlogs():
    """Sequentially download pages starting at 51 and save each one.

    BUG FIX: the IOError handler printed "sleep 10min and retry" but
    never slept; it now backs off 10 minutes before retrying.

    NOTE(review): requestCnblogs is not defined in this script section —
    it comes from the earlier single-thread version; confirm it is in
    scope before running.
    """
    i = 51  # resume point; original note mentions 873 (total pages?)
    while True:
        try:
            print('request for ' + str(i) + '...')
            blogs = requestCnblogs(i)
            path = createFiles()
            writeToTxt(blogs, path + '/nongyeyinhang' + str(i) + '.txt')
            print(' ' + str(i) + ' ')
            i = i + 1
        except IOError:
            print("sleep 10min and retry")
            time.sleep(600)  # actually back off, as the message promises
    return 'success'  # unreachable; kept for interface compatibility
個人メモ:ダウンロードしたファイルを処理してデータベースに書き込む(Javaコード):package com.zzt.spider;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
/**
 * Post-processing step: read the text files the Python crawler saved and
 * insert the parsed bank records into a MySQL table.
 *
 * @author ZX
 */
public class ReadSpiderData3 {

    public static void main(String[] args) {
        // Folder the crawler wrote its pages into.
        File fileDir = new File("P:\\Users3\\102");
        if (!fileDir.exists()) {
            return;
        }
        String[] list = fileDir.list();
        for (String str : list) {
            readTxt("P:\\Users3\\102\\" + str);
            // return;
        }
        // NOTE(review): this Scanner is never read; it looks like an attempt
        // to keep the console window open — confirm before deleting.
        Scanner sc = new Scanner(System.in);
    }

    /**
     * Parse one downloaded file and print the data lines of its
     * "SWIFT CODE" section (lines 1..83 after the marker).
     */
    public static void readTxt(String path) {
        // try-with-resources closes the reader even when an exception
        // escapes mid-file; the original leaked the handle in that case.
        try (BufferedReader br = new BufferedReader(new FileReader(new File(path)))) {
            String line;
            int isVaribales = -1; // 1 while inside a "SWIFT CODE" section
            int lineCount = -1;   // data-line counter within the section
            while ((line = br.readLine()) != null) {
                if (line.contains("SWIFT CODE")) {
                    isVaribales = 1;
                }
                if (isVaribales == 1) {
                    lineCount++;
                    if (lineCount >= 1 && lineCount < 84) {
                        // Skip blank lines (the dead `line == null` check
                        // was removed: readLine() already guarantees non-null
                        // inside this loop).
                        if ("".equals(line.trim())) {
                            continue;
                        }
                        System.out.println(line);
                        // insertBank(code, name, phone, addr, "170");
                    }
                }
                // NOTE(review): contains("") is always true, so the section
                // flag is reset on every line; the original end-marker text
                // was probably lost when this snippet was pasted — restore
                // it before relying on this parser.
                if (line.contains("")) {
                    isVaribales = -1;
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Insert one bank record into SP_BANK_DETILS_S2 using a parameterized
     * statement (safe against SQL injection).
     */
    public static void insertBank(String BANK_CODE, String BANK_NAME, String BANK_PHONE, String BANK_ADDR, String BANK_NO) {
        Connection connerction = createConn();
        String sql = "insert into SP_BANK_DETILS_S2 (BANK_CODE,BANK_NAME,BANK_PHONE,BANK_ADDR,BANK_NO) values(?,?,?,?,?)";
        PreparedStatement pstmt = null;
        try {
            pstmt = connerction.prepareStatement(sql);
            pstmt.setString(1, BANK_CODE);
            pstmt.setString(2, BANK_NAME);
            pstmt.setString(3, BANK_PHONE);
            pstmt.setString(4, BANK_ADDR);
            pstmt.setString(5, BANK_NO);
            pstmt.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            // Close in finally so the connection is released even when
            // executeUpdate throws (the original leaked it on failure).
            closeConn(null, pstmt, connerction);
        }
    }

    /**
     * Open a JDBC connection to the payrecdb MySQL database.
     *
     * @return the connection, or null if the driver or connect step failed
     */
    private static Connection createConn() {
        Connection conn = null;
        try {
            Class.forName("com.mysql.jdbc.Driver");
            conn = DriverManager.getConnection("jdbc:mysql://192.168.0.100:3306/payrecdb?characterEncoding=utf8", "name", "pwd");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return conn;
    }

    /**
     * Quietly close JDBC resources in result-set, statement, connection
     * order; null arguments are skipped.
     */
    private static void closeConn(ResultSet rs, PreparedStatement stmt, Connection conn) {
        try {
            if (rs != null)
                rs.close();
            if (stmt != null)
                stmt.close();
            if (conn != null)
                conn.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
:https://www.cnblogs.com/the-fool/p/11054115.html