Python crawler, lesson 1: building a search engine
from BeautifulSoup import *
from urlparse import urljoin
ignorewords=set(['the','of','to','and','a','in','is','it'])
Since our search engine is keyword-based, we ignore common words such as conjunctions and articles.
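Concretely, a word is simply dropped whenever it appears in that set, e.g. (a quick illustration, not part of the crawler itself):

>>> [w for w in 'the rise of the machines'.split(' ') if w not in ignorewords]
['rise', 'machines']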
The code below is the crawler itself; it saves page text into our SQLite database. Don't worry if you can't follow every line yet; it is enough to know what each function does.
from sqlite3 import dbapi2 as sqlite
import urllib2
import re  # needed by separatewords below
class crawler:
    def __init__(self,dbname):
        # Connect to the database; dbname is a filename such as 'xxx.db'
        self.con=sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        self.con.commit()
    # Return the rowid of an entry, inserting it first if it is not present
    def getentryid(self,table,field,value,createnew=True):
        cur=self.con.execute(
            "select rowid from %s where %s='%s'" % (table,field,value))
        res=cur.fetchone()
        if res==None:
            cur=self.con.execute(
                "insert into %s (%s) values ('%s')" % (table,field,value))
            return cur.lastrowid
        else:
            return res[0]
    # Index an individual page
    def addtoindex(self,url,soup):
        if self.isindexed(url): return
        print 'Indexing',url
        # Get words
        text=self.gettextonly(soup)
        words=self.separatewords(text)
        # Get URL id
        urlid=self.getentryid('urllist','url',url)
        # Link each word to this url, recording its position in the page
        for i in range(len(words)):
            word=words[i]
            if word in ignorewords: continue
            wordid=self.getentryid('wordlist','word',word)
            self.con.execute("insert into wordlocation(urlid,wordid,location) \
                values (%d,%d,%d)" % (urlid,wordid,i))
    # Extract the text from a page (no tags), recursing into child elements
    def gettextonly(self,soup):
        v=soup.string
        if v==None:
            c=soup.contents
            resulttext=''
            for t in c:
                subtext=self.gettextonly(t)
                resulttext+=subtext+'\n'
            return resulttext
        else:
            return v.strip()
    def separatewords(self,text):
        splitter=re.compile('\\W*')
        return [s.lower() for s in splitter.split(text) if s!='']
    def isindexed(self,url):
        u=self.con.execute(
            "select rowid from urllist where url='%s'" % url).fetchone()
        if u!=None:
            # if crawled
            v=self.con.execute(
                'select * from wordlocation where urlid=%d' % u[0]).fetchone()
            if v!=None: return True
        return False
    # Record a link between two pages (left as a stub in this version)
    def addlinkref(self,urlFrom,urlTo,linkText):
        pass
    # Starting with a list of pages, do a breadth-first crawl to the given
    # depth, indexing pages as we go
    def crawl(self,pages,depth=2):
        for i in range(depth):
            newpages=set()
            for page in pages:
                try:
                    c=urllib2.urlopen(page)
                except:
                    print "Could not open",page
                    continue
                soup=BeautifulSoup(c.read())
                self.addtoindex(page,soup)
                links=soup('a')
                for link in links:
                    if 'href' in dict(link.attrs):
                        url=urljoin(page,link['href'])
                        if url.find("'")!=-1: continue
                        url=url.split('#')[0]  # remove location portion
                        if url[0:4]=='http' and not self.isindexed(url):
                            newpages.add(url)
                        linkText=self.gettextonly(link)
                        self.addlinkref(page,url,linkText)
                self.dbcommit()
            pages=newpages
    # Create the database tables and indices
    def createindextables(self):
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer,toid integer)')
        self.con.execute('create table linkwords(wordid,linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit()
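One caveat before moving on: the listing splices values straight into its SQL strings, which is why crawl skips any URL containing a single quote. sqlite3 also accepts parameterized queries for values; a minimal sketch of how the lookup inside getentryid could use one (an aside, not part of the original listing):

cur=self.con.execute(
    "select rowid from %s where %s=?" % (table,field),(value,))  # value passed safely as a parameter

Table and column names still have to be interpolated, but the value itself no longer needs quoting or escaping.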
That aside, the crawler is done. Next, list the pages we want it to crawl:

pagelist=[['http://en.xjtu.edu.cn/'],
    ['http://www.lib.xjtu.edu.cn/'],
    ['http://en.wikipedia.org/wiki/Xi%27an_Jiaotong_University']]
Build the database:

mycrawler=crawler('searchindex.db')
mycrawler.createindextables()
And crawl:

mycrawler.crawl(pagelist[0])
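To confirm the crawl actually populated the index, you can peek at the tables createindextables set up (a quick sanity check, not from the original listing):

from sqlite3 import dbapi2 as sqlite
con=sqlite.connect('searchindex.db')
print con.execute('select count(*) from urllist').fetchone()[0],'urls indexed'
print con.execute('select count(*) from wordlocation').fetchone()[0],'word locations recorded'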
Now for the search engine itself:

class searcher:
    def __init__(self,dbname):
        self.con=sqlite.connect(dbname)

    def __del__(self):
        self.con.close()
    # Find every url that contains all of the query words, returning the
    # location of each word on each page
    def getmatchrows(self,q):
        # Strings to build the query
        fieldlist='w0.urlid'
        tablelist=''
        clauselist=''
        wordids=[]
        # Split the words by spaces
        words=q.split(' ')
        tablenumber=0
        for word in words:
            # Get the word ID
            wordrow=self.con.execute(
                "select rowid from wordlist where word='%s'" % word).fetchone()
            if wordrow!=None:
                wordid=wordrow[0]
                wordids.append(wordid)
                if tablenumber>0:
                    tablelist+=','
                    clauselist+=' and '
                    clauselist+='w%d.urlid=w%d.urlid and ' % (tablenumber-1,tablenumber)
                fieldlist+=',w%d.location' % tablenumber
                tablelist+='wordlocation w%d' % tablenumber
                clauselist+='w%d.wordid=%d' % (tablenumber,wordid)
                tablenumber+=1
        # Create the query from the separate parts
        fullquery='select %s from %s where %s' % (fieldlist,tablelist,clauselist)
        print fullquery
        cur=self.con.execute(fullquery)
        rows=[row for row in cur]
        return rows,wordids
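    # For a two-word query, getmatchrows prints SQL of this shape (the rowids
    # 10 and 17 here are illustrative):
    #   select w0.urlid,w0.location,w1.location
    #   from wordlocation w0,wordlocation w1
    #   where w0.wordid=10 and w0.urlid=w1.urlid and w1.wordid=17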
    def geturlname(self,id):
        return self.con.execute(
            "select url from urllist where rowid=%d" % id).fetchone()[0]
    # Normalize scores to the range 0-1, where 1 is always the best
    def normalizescores(self,scores,smallIsBetter=0):
        vsmall=0.00001  # avoid division by zero
        if smallIsBetter:
            minscore=min(scores.values())
            return dict([(u,float(minscore)/max(vsmall,l)) for (u,l)\
                in scores.items()])
        else:
            maxscore=max(scores.values())
            if maxscore==0:
                maxscore=vsmall
            return dict([(u,float(c)/maxscore) for (u,c) in scores.items()])
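    # e.g. normalizescores({1:4,2:2}) -> {1:1.0,2:0.5}, while with
    # smallIsBetter=1, normalizescores({1:4,2:2},1) -> {1:0.5,2:1.0}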
    # Score methods
    # Score pages by how often the query words appear on them
    def frequencyscore(self,rows):
        counts=dict([(row[0],0) for row in rows])
        for row in rows:
            counts[row[0]]+=1
        return self.normalizescores(counts)
    # Score pages by how close to the top of the page the query words appear
    def locationscore(self,rows):
        locations=dict([(row[0],1000000) for row in rows])
        for row in rows:
            loc=sum(row[1:])
            if loc<locations[row[0]]: locations[row[0]]=loc
        return self.normalizescores(locations,smallIsBetter=1)
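    # The original listing breaks off above, but e.query(...) is called below,
    # so a query method is needed. A minimal sketch, assuming results are
    # ranked by frequencyscore alone (the weighting is an assumption, chosen
    # to match the score/URL output shown at the end):
    def getscoredlist(self,rows,wordids):
        totalscores=dict([(row[0],0) for row in rows])
        # only frequencyscore is wired in here; more (weight,scores) pairs
        # can be appended to this list
        weights=[(1.0,self.frequencyscore(rows))]
        for (weight,scores) in weights:
            for url in totalscores:
                totalscores[url]+=weight*scores[url]
        return totalscores

    def query(self,q):
        rows,wordids=self.getmatchrows(q)
        scores=self.getscoredlist(rows,wordids)
        rankedscores=sorted([(score,url) for (url,score) in scores.items()],reverse=1)
        for (score,urlid) in rankedscores[0:10]:
            print '%f\t%s' % (score,self.geturlname(urlid))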
Connect the search engine to the database:

e=searcher('searchindex.db')
And run a search:

e.query('xjtu college')
With that, our first search engine is built. The ranked output:

1.000000 http://en.xjtu.edu.cn/XJTU_Introduction/Introduction.htm
0.941176 http://en.xjtu.edu.cn/info/1044/1683.htm
0.705882 http://en.xjtu.edu.cn/Schools_and_Colleges.htm
0.529412 http://en.xjtu.edu.cn/info/1044/1681.htm
0.470588 http://en.xjtu.edu.cn/Education/Undergraduate_Education.htm
0.382353 http://en.xjtu.edu.cn/XJTU_News/News.htm
0.382353 http://en.xjtu.edu.cn/Campus_Life/Student_Bodies.htm
0.294118 http://en.xjtu.edu.cn/XJTU_News/Teaching_and_learning.htm
0.294118 http://en.xjtu.edu.cn/info/1044/1572.htm
0.279412 http://en.xjtu.edu.cn/info/1044/1571.htm