Ruby版ネットワーククローラー
以前書いたクローラーは、データを取得してMySQLデータベースに保存し、マルチスレッドをサポートします。
#! /usr/local/bin/ruby
require 'hpricot'
require "open-uri"
require 'net/http'
require 'CompanyLink'
require 'parser/CompanyParser'
require 'parser/CompanyParser2'
require 'export/CsvExport'
require 'export/MysqlExport'
# Web crawler for search.china.alibaba.com: fetches company profile and
# contact pages, parses them via CompanyLink's parser, and hands the
# assembled record to a pluggable exporter (CSV or MySQL).
class AlibabaWebCatcher
  # Writer so callers may assign `catcher.exporter = ...`. The test code
  # uses this form, but previously only setExporter existed, so the
  # assignment raised NoMethodError.
  attr_writer :exporter

  def initialize
    # The exporter must be injected before collecting; otherwise records
    # are parsed but never persisted (a warning is printed instead).
    @exporter = nil
  end

  # Legacy Java-style setter, kept for backward compatibility.
  def setExporter(export = nil)
    @exporter = export
  end

  # Collect by given company site: fetch the introduce page and the
  # contact page, parse both, then export the assembled company record.
  # All errors are reported and swallowed so one bad company does not
  # abort a whole listing run.
  def collectByCompanyLink(companySite)
    companyLink = CompanyLink.new(companySite)
    companyHostUri = companyLink.hostUri
    companyIntroduceUri = companyLink.introduceUri
    companyContactUri = companyLink.contactUri
    begin
      puts 'Collect company : ' + companySite
      http = Net::HTTP.new(companyHostUri, 80)
      puts 'Opening Page: http://' + companyHostUri + companyIntroduceUri
      resp, data = http.get(companyIntroduceUri, nil)
      if resp.message == "OK"
        puts 'Opened Page: http://' + companyHostUri + companyIntroduceUri
        companyLink.parser.parseProfile(data)
      else
        puts 'Opening Page: http://' + companyHostUri + companyIntroduceUri + ' failed'
      end
      puts 'Opening Page: http://' + companyHostUri + companyContactUri
      resp2, data2 = http.get(companyContactUri, nil)
      if resp2.message == 'OK'
        puts 'Opened Page: http://' + companyHostUri + companyContactUri
        companyLink.parser.parseContact(data2)
      else
        puts 'Opening Page: http://' + companyHostUri + companyContactUri + ' failed'
      end
      if @exporter.nil?
        puts 'Please provide exporter for export function.'
      else
        @exporter.doExport(companyLink.parser.getCompany)
      end
    rescue StandardError => err
      # was `rescue Exception`, which also swallows SignalException/SystemExit
      puts 'Exception happened caused by : ' + err.to_s
      #raise err
    ensure
      puts 'Collect company : ' + companySite + " finished."
    end
  end

  # Collect every company listed on a search-result page, then recurse
  # into the next page (if any) until the listing is exhausted.
  def collectByCompanyList(companyListUrl)
    companySearchHostUri = getCompanySearchHost
    # strip scheme/host: keep only the path part after "alibaba.com"
    companySearchUri = companyListUrl.split(/alibaba.com/)[1]
    begin
      http = Net::HTTP.new(companySearchHostUri, 80)
      puts 'Opening Page: http://' + companySearchHostUri + companySearchUri
      resp, data = http.get(companySearchUri, nil)
      if resp.message == "OK"
        puts 'Opened Page: http://' + companySearchHostUri + companySearchUri
        # parse company list
        parseCompanySites(data).each do |companySite|
          collectByCompanyLink(companySite)
        end
        nextPage = getNextSearchPage(data)
        collectByCompanyList(nextPage) unless nextPage.nil?
      else
        puts 'Opening Page: http://' + companySearchHostUri + companySearchUri + ' failed'
      end
    rescue StandardError => err
      puts 'Exception happened caused by : ' + err.to_s
      raise err
    end
  end

  # Collect by keywords: build the search uri and walk the result pages
  # sequentially via collectByCompanyList.
  def collectByKeywords(searchKeywords)
    puts 'Searching ... ' + searchKeywords
    companySearchHostUri = getCompanySearchHost
    companySearchUri = getCompanySearchUri(searchKeywords)
    collectByCompanyList(companySearchHostUri + companySearchUri)
  end

  # Multi-threaded variant: determine the total page count, then fetch
  # every result page in its own thread. MRI threads only give I/O
  # concurrency, which fits this network-bound workload.
  def collectByKeywordsByThread(searchKeywords)
    puts 'Searching ... ' + searchKeywords
    companySearchHostUri = getCompanySearchHost
    companySearchUri = getCompanySearchUri(searchKeywords)
    http = Net::HTTP.new(companySearchHostUri, 80)
    puts 'Opening Page: http://' + companySearchHostUri + companySearchUri
    resp, data = http.get(companySearchUri, nil)
    if resp.message == "OK"
      puts 'Opened Page: http://' + companySearchHostUri + companySearchUri
      totalPage = getTotalPage(data)
      secondPage = getNextSearchPage(data)
      # template uri like '/company/<keywords>/2.html?...'; each thread
      # substitutes its own page number below
      uri = '/company' + secondPage.split(/\/company/)[1]
      threads = []
      (1..totalPage).each do |i|
        # Pass host and page number as Thread.new arguments so each worker
        # owns its copies — the original mutated the shared `uri` and read
        # the shared loop variable `i` inside the threads, a race that made
        # workers fetch each other's pages.
        threads << Thread.new(companySearchHostUri, i) do |page, pageNo|
          begin
            h = Net::HTTP.new(page, 80)
            pageUri = uri.gsub(/\d+\.html/, pageNo.to_s + '.html')
            puts "Opening Page: http://#{page}#{pageUri}"
            resp_t, data_t = h.get(pageUri, nil)
            if resp_t.message == "OK"
              puts "Opened Page: http://#{page}#{pageUri}"
              # parse company list
              parseCompanySites(data_t).each do |companySite|
                collectByCompanyLink(companySite)
              end
            else
              puts "Opening Page: http://#{page}#{pageUri} failed."
            end
          rescue StandardError => err
            puts 'Exception thrown out when got page ' + pageNo.to_s + ' since ' + err.to_s
            #raise
          end
        end
      end
      threads.each { |t| t.join }
    else
      puts 'Opening Page: http://' + companySearchHostUri + companySearchUri + ' failed'
    end
  end

  protected

  # Extract the company-site hrefs from a search-result page.
  def parseCompanySites(companyListHtml)
    companySites = []
    doc = Hpricot(companyListHtml)
    doc.search('//div[@class="content"]/div[@class="info"]/span[@class="m undline"]/a').each do |item|
      companyHref = item.attributes['href']
      puts ' : ' + companyHref
      companySites << companyHref
    end
    companySites
  end

  private

  # Host serving the company search pages.
  def getCompanySearchHost
    'search.china.alibaba.com'
  end

  # Build the search uri for the given keywords. The province filter is
  # hard-coded (GBK percent-encoded: Jiangsu, Zhejiang, Shanghai).
  # NOTE(review): URI.escape is removed in modern Ruby (>= 3.0); fine for
  # the legacy Ruby this hpricot-era script targets.
  def getCompanySearchUri(keywords)
    '/search/company_search.htm?filt=y&categoryId=0&maxCatId=&isNoRepost=false&descend_order=&show_fid=&cat_field=' +
      '&tradeType=&searchType=&pageSize=30&sm=&seType=&townId=0&onlineStatus=all&memberlevel=' +
      '&province=%BD%AD%CB%D5%2C%D5%E3%BD%AD%2C%C9%CF%BA%A3&city=&biztype=&established_year=' +
      '&keywords=' + URI.escape(keywords)
  end

  # Href of the "next page" link, or nil when on the last page.
  def getNextSearchPage(data)
    doc = Hpricot(data)
    nextPageHref = doc.at('//div[@class="pages"]/div[@class="list_offer_pages"]/h1/b/a[text()=" "]')
    nextPageHref.attributes['href'] unless nextPageHref.nil?
  end

  # Total number of result pages, derived from the total-item count shown
  # on the page (30 items per page); 0 when the count cannot be found.
  # NOTE(review): the literal spaces in this regex look like stripped
  # non-ASCII text from the original page markup — verify against a live
  # result page before relying on it.
  def getTotalPage(data)
    totalItems = data.scan(/ \s<span class="red sm">(\d+)<\/span>\s /)[0]
    if totalItems.nil?
      0
    else
      count = totalItems.last.to_i
      totalPage = (count % 30 == 0) ? count / 30 : (count / 30 + 1)
      puts ' ' + totalItems.last + ' ' + totalPage.to_s + ' '
      totalPage
    end
  end
end
使い方は簡単です。キーワードを入力すればいいです。
#! /usr/local/bin/ruby
require 'test/unit'
require 'AlibabaWebCatcher'
require 'export/CsvExport'
require 'export/MysqlExport'
# Smoke test for AlibabaWebCatcher. Note: test_download performs a live
# crawl against search.china.alibaba.com, so it needs network access.
class AlibabaWebCatcherTest < Test::Unit::TestCase
  # Build a fresh catcher with a MySQL exporter before each test.
  def setup
    @webCatcher = AlibabaWebCatcher.new
    #@webCatcher.setExporter(CsvExport.new('test.csv'))
    # BUG FIX: the original assigned `@webCatcher.exporter = ...`, but
    # AlibabaWebCatcher defines no `exporter=` writer (only setExporter),
    # so setup raised NoMethodError before any test ran.
    @webCatcher.setExporter(MysqlExport.new)
  end

  def teardown
    @webCatcher = nil
  end

  # End-to-end crawl by keyword; success means no exception escaped.
  def test_download
    assert_not_nil(@webCatcher)
    #@webCatcher.collectByCompanyShortName('qianbusha')
    @webCatcher.collectByKeywords(' ')
    assert(true, 'Should go here')
  end
end