Python Crawler, Lesson 1: Building a Search Engine


from BeautifulSoup import *
from urlparse import urljoin

ignorewords=set(['the','of','to','and','a','in','is','it'])

Since our search engine is keyword-based, we ignore articles, conjunctions, and similar stop words.
The following code is the crawler itself. It stores each page's text in our sqlite database. It's fine if you can't read all of it; you just need to know what these functions do.
from sqlite3 import dbapi2 as sqlite
import urllib2
import re
class crawler:
    def __init__(self,dbname):
        self.con=sqlite.connect(dbname)
        # connect to the database; dbname is the database file name, e.g. 'xxx.db'
    def __del__(self):
        self.con.close()
    def dbcommit(self):
        self.con.commit()
    
    def getentryid(self,table,field,value,createnew=True):
        cur=self.con.execute(
            "select rowid from %s where %s='%s'" %(table,field,value))
        res=cur.fetchone()
        if res==None:
            cur=self.con.execute(
                "insert into %s (%s) values ('%s')" % (table,field,value))
            return cur.lastrowid
        else:
            return res[0]
    
    
    def addtoindex(self,url,soup):
        if self.isindexed(url): return
        print 'Indexing',url
        
        #Get words
        text=self.gettextonly(soup)
        words=self.separatewords(text)
        
        #Get URL id
        urlid=self.getentryid('urllist','url',url)
        
        # Link word to url
        for i in range(len(words)):
            word=words[i]
            if word in ignorewords: continue
            wordid=self.getentryid('wordlist','word',word)
            self.con.execute("insert into wordlocation(urlid,wordid,location) \
            values(%d,%d,%d)" % (urlid,wordid,i))
            
            
    
    def gettextonly(self,soup):
        v=soup.string
        if v==None:
            c=soup.contents
            resulttext=''
            for t in c:
                subtext=self.gettextonly(t)
                resulttext+=subtext+'\n'
            return resulttext
        else:
            return v.strip()

    def separatewords(self,text):
        splitter=re.compile('\\W*')
        return [s.lower() for s in splitter.split(text) if s!='']

    def isindexed(self,url):
        u=self.con.execute(
            "select rowid from urllist where url='%s'" % url).fetchone()
        if u!=None:
            # the url has been crawled; check that it actually has words indexed
            v=self.con.execute(
                'select * from wordlocation where urlid=%d' % u[0]).fetchone()
            if v!=None: return True
        return False

    def addlinkref(self,urlFrom,urlTo,linkText):
        pass

    def crawl(self,pages,depth=2):
        for i in range(depth):
            newpages=set()
            for page in pages:
                try:
                    c=urllib2.urlopen(page)
                except:
                    print "Could not open",page
                    continue
                soup=BeautifulSoup(c.read())
                self.addtoindex(page,soup)

                links=soup('a')
                for link in links:
                    if 'href' in dict(link.attrs):
                        url=urljoin(page,link['href'])
                        if url.find("'")!=-1: continue
                        url=url.split('#')[0] # remove location portion
                        if url[0:4]=='http' and not self.isindexed(url):
                            newpages.add(url)
                        linkText=self.gettextonly(link)
                        self.addlinkref(page,url,linkText)
                self.dbcommit()
            pages=newpages

    def createindextables(self):
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer,toid integer)')
        self.con.execute('create table linkwords(wordid,linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit()
There we go, the crawler is done. Next, let's write down the pages we want it to crawl.
pagelist=[['http://en.xjtu.edu.cn/'],
          ['http://www.lib.xjtu.edu.cn/'],
          ['http://en.wikipedia.org/wiki/Xi%27an_Jiaotong_University']]
Build the database:
mycrawler=crawler('searchindex.db')
mycrawler.createindextables()
Crawl:
mycrawler.crawl(pagelist[0])
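If you want to confirm that the crawl actually populated the index, a quick row count works. This check is my own addition, not part of the original tutorial:

# sanity check (my addition): how many pages and word occurrences were indexed?
print mycrawler.con.execute('select count(*) from urllist').fetchone()[0],'urls'
print mycrawler.con.execute('select count(*) from wordlocation').fetchone()[0],'word locations'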
The search engine:
class searcher:
    def __init__(self,dbname):
        self.con=sqlite.connect(dbname)
    
    def __del__(self):
        self.con.close()
    
    def getmatchrows(self,q):
        # Strings to build the query
        fieldlist='w0.urlid'
        tablelist=''  
        clauselist=''
        wordids=[]

        # Split the words by spaces
        words=q.split(' ')  
        tablenumber=0

        for word in words:
            #Get the word ID
            wordrow=self.con.execute(
                "select rowid from wordlist where word='%s'" % word).fetchone()
            if wordrow!=None:
                wordid=wordrow[0]
                wordids.append(wordid)
                if tablenumber>0:
                    tablelist+=','
                    clauselist+=' and '
                    clauselist+='w%d.urlid=w%d.urlid and ' % (tablenumber-1,tablenumber)
                fieldlist+=',w%d.location' % tablenumber
                tablelist+='wordlocation w%d' % tablenumber      
                clauselist+='w%d.wordid=%d' % (tablenumber,wordid)
                tablenumber+=1

        # Create the query from the separate parts
        fullquery='select %s from %s where %s' % (fieldlist,tablelist,clauselist)
        print fullquery
        cur=self.con.execute(fullquery)
        rows=[row for row in cur]

        return rows,wordids
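getmatchrows builds one self-join of wordlocation per query word, so each returned row is a URL that contains every word, together with each word's location on that page. For a two-word query, the fullquery it prints looks like this (the word ids 10 and 17 are invented for illustration):

select w0.urlid,w0.location,w1.location from wordlocation w0,wordlocation w1 where w0.wordid=10 and w0.urlid=w1.urlid and w1.wordid=17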
    
    def geturlname(self,id):
        return self.con.execute(
            "select url from urllist where rowid=%d" % id).fetchone()[0]
    
    def normalizescores(self,scores,smallIsBetter=0):
        vsmall=0.00001 # avoid division by zero
        if smallIsBetter:
            minscore=min(scores.values())
            return dict([(u,float(minscore)/max(vsmall,l)) for (u,l)\
                        in scores.items()])
        else:
            maxscore=max(scores.values())
            if maxscore==0:
                maxscore=vsmall
            return dict([(u,float(c)/maxscore) for (u,c) in scores.items()])

    # scoring methods
    def frequencyscore(self,rows):
        counts=dict([(row[0],0) for row in rows])
        for row in rows:
            counts[row[0]]+=1
        return self.normalizescores(counts)
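A quick worked example of the scoring pipeline, with made-up rows (each tuple is (urlid, location)): urlid 1 matches twice and urlid 2 once, so counts becomes {1: 2, 2: 1} and normalizescores divides by the maximum:

# fabricated rows for illustration only
rows=[(1,5),(1,20),(2,7)]
s=searcher('searchindex.db')
print s.frequencyscore(rows) # {1: 1.0, 2: 0.5}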
    
    def locationscore(self,rows):
        locations=dict([(row[0],1000000) for row in rows])
        for row in rows:
            loc=sum(row[1:])
            # keep the smallest combined location; words near the top score better
            if loc<locations[row[0]]: locations[row[0]]=loc
        return self.normalizescores(locations,smallIsBetter=1)
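The listing above stops at the scoring functions, but the e.query(...) call below also needs methods that combine the scores and print a ranked list. Here is a minimal sketch that weights only frequencyscore; the 1.0 weight and the choice of scorer are my assumptions, matching the output format shown at the end:

    def getscoredlist(self,rows,wordids):
        totalscores=dict([(row[0],0) for row in rows])
        # weighted combination of scoring functions; frequency only, as an assumption
        weights=[(1.0,self.frequencyscore(rows))]
        for (weight,scores) in weights:
            for url in totalscores:
                totalscores[url]+=weight*scores[url]
        return totalscores

    def query(self,q):
        rows,wordids=self.getmatchrows(q)
        scores=self.getscoredlist(rows,wordids)
        rankedscores=sorted([(score,url) for (url,score) in scores.items()],reverse=1)
        for (score,urlid) in rankedscores[0:10]:
            print '%f\t%s' % (score,self.geturlname(urlid))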
Connect the search engine to the database:
e=searcher('searchindex.db')
Search:
e.query('xjtu college')
And with that, our first search engine is built. The results:
1.000000	http://en.xjtu.edu.cn/XJTU_Introduction/Introduction.htm
0.941176	http://en.xjtu.edu.cn/info/1044/1683.htm
0.705882	http://en.xjtu.edu.cn/Schools_and_Colleges.htm
0.529412	http://en.xjtu.edu.cn/info/1044/1681.htm
0.470588	http://en.xjtu.edu.cn/Education/Undergraduate_Education.htm
0.382353	http://en.xjtu.edu.cn/XJTU_News/News.htm
0.382353	http://en.xjtu.edu.cn/Campus_Life/Student_Bodies.htm
0.294118	http://en.xjtu.edu.cn/XJTU_News/Teaching_and_learning.htm
0.294118	http://en.xjtu.edu.cn/info/1044/1572.htm
0.279412	http://en.xjtu.edu.cn/info/1044/1571.htm