Screen scraping 3

3237 ワード

Use Beautiful Soup
from urllib import urlopen
from bs4 import BeautifulSoup as BS

# Download the Python job board page. The response is closed explicitly so
# the connection is released even if the read fails.
response = urlopen("http://www.python.org/community/jobs/")
try:
    text = response.read()
finally:
    response.close()

# NOTE(review): the bytes are decoded as GBK with undecodable bytes dropped;
# confirm this matches the encoding the page is actually served in.
# The parser is named explicitly so the resulting tree does not depend on
# which optional parsers happen to be installed.
soup = BS(text.decode('gbk', 'ignore'), 'html.parser')

# Collect "title (url)" strings in a set so repeated entries appear once.
jobs = set()
for header in soup('h2'):
    # Calling a tag is shorthand for find_all(): the <a> tags inside this
    # header whose CSS class is "reference".
    links = header('a', 'reference')
    if not links:
        continue  # no matching link under this header
    link = links[0]
    jobs.add('%s (%s)' % (link.string, link['href']))
        
print '\n'.join(sorted(jobs, key = lambda s: s.lower()))

Storing the entries in a set eliminates duplicates, and the names are printed in sorted (case-insensitive) order.
soup('h2') returns a list of all h2 elements.
header('a', 'reference') returns a list of the a elements inside the header that have the CSS class "reference".