Pythonで新浪(Sina)ブログをバックアップする方法


本稿では、Pythonで新浪ブログをバックアップする方法を実例で紹介します。皆様の参考になれば幸いです。具体的な内容は以下の通りです。
Python 2.7.2.2で実装しており、IDE上での実行を推奨します。

# -*- coding:UTF-8 -*- #
'''
Created on 2011-12-18
@author: Ahan
'''
import re
import sys
import os
import time
import socket
import locale
import datetime
import codecs
from urllib import urlopen
#       
#        
# Regular expressions used to scrape the blog pages.
# NOTE(review): several patterns contain runs of plain spaces where a
# non-ASCII label presumably stood in the original script -- confirm against
# the original before relying on them.

# Link located on the blog home page by the main script; group 1 is an
# absolute "http:" URL, which the main script then fetches as the first
# listing page.
pattern1=u"""<a href="(http:.*?)">    </a>"""
prog1 = re.compile(pattern1)
# One post entry in a listing page: group 1 is the title attribute and
# group 2 the href. The main script passes group 2 as the URL and group 1
# as the file name to save_to_file.
pattern2=u"""<a title="(.*?)" target="_blank" href="(.*?)">.*?</a>"""
prog2=re.compile(pattern2)
# Link the main script follows to reach the next listing page; group 1 is
# its href.
pattern3=u"""<a href="([^"]+)" title="[^"]+">   """
prog3=re.compile(pattern3)
# Region of a post page kept by _filter: from an HTML comment ending in
# "begin" up to the first following HTML comment. [\s\S]*? lazily matches
# any character, newlines included.
pattern4=u"""<!--     begin -->[\\s\\S]*?<!--      -->"""
prog4=re.compile(pattern4)
# Image reference inside a post: group 1 is the whole src/real_src pair,
# group 2 the trailing real_src attribute and group 3 the real_src URL,
# which save_to_file downloads.
pattern5=u"""(src="[^"]+"( real_src ="([^"]+)"))"""
prog5=re.compile(pattern5)
def read_date_from_url(url):
  """ Unicode     url        
  """
  try:
    data = ""
    request = urlopen(url)
    while True:
      s = request.read(1024)
      if not s:
        break
      data += s
    return unicode(data)
  except:
    print '       '
    print "Unexpected error:", sys.exc_info()[0],sys.exc_info()[1]
    return None
  finally:
    if request:
      request.close()
def save_to_file(url,filename,blog_address):
  """url     ,filename        ,     html
  """
  #              
  if os.path.exists(blog_address)==False:
    os.makedirs(blog_address)
  #           
  filename=ReplaceBadCharOfFileName(filename)
  file_no=0
  while os.path.isfile(blog_address+'/'+filename+'.html')==True:
    filename=filename+'('+file_no.__str__()+')'
    file_no+=1
  text = read_date_from_url(url)
  text=_filter(text)
  #        
  result=prog5.findall(text)
  i=1
  for pic in result:
    folder=blog_address+'/'+filename+'/'
    pic_name='image'+i.__str__()+'.gif' 
    if os.path.exists(folder)==False:
      os.makedirs(folder)
    try:
      url_file = urlopen(pic[2])
      pic_file = codecs.open(folder+pic_name,'wb')
      while True:
        s = url_file.read(1024)
        if not s:
          break
        pic_file.write(s)
      pic_file.close()
      url_file.close()
    except:
      print ' ,            ,      ...'
      print "Unexpected error:", sys.exc_info()[0],sys.exc_info()[1]
    else:
      print '      ...'
      #          
      text=text.replace(pic[0],unicode("src=\"" + filename + "/" + pic_name + "\"" + pic[1]),1)
      i=i+1
  blog_file = codecs.open(blog_address+'/'+filename+'.html','wb')
  blog_file.write(text)
  blog_file.close()
#          
def _filter(t):
  """Return the part of page text *t* matched by prog4, wrapped in HTML.

  Args:
    t: text of a downloaded post page.

  Returns:
    A unicode string holding the matched region inside a minimal
    <html><head></head><body>...</body></html> skeleton.

  Raises:
    Exception: if prog4 does not match anywhere in *t*.
  """
  result=prog4.search(t)
  if result is None:
    raise Exception(' ,       ……')
  # The closing tag was previously misspelled "</dody>", which produced
  # malformed HTML.
  return u'<html><head></head><body>' + unicode(result.group()) + u'</body></html>'
#            
def ReplaceBadCharOfFileName(filename):
  """Return *filename* with a fixed set of unwanted substrings removed.

  Removed, in this order: the HTML entity "&nbsp;", then the single
  characters \\ / : * ? < > | & and ;.
  """
  # "&nbsp;" has to go first: once "&" and ";" are stripped individually the
  # entity could no longer be recognised and "nbsp" would be left behind.
  unwanted = ("&nbsp;", "\\", "/", ":", "*", "?", "<", ">", "|", "&", ";")
  cleaned = filename
  for fragment in unwanted:
    cleaned = cleaned.replace(fragment, "")
  return cleaned
#   
if __name__ == '__main__':
  #    
  blog_no=1#    
  begin=1#    
  end=0#    
  page=0#  
  saved=0#       
  timeout = 60*5#    5  
  socket.setdefaulttimeout(timeout)#     socket       。           socket,     
  blog_address=raw_input("         (        ,         http://blog.sina.com.cn/jiangafu,    jiangafu):")
  blog_address=blog_address.replace('\r','')
  begin=raw_input('      :')  
  begin=locale.atoi(begin)
  while begin<=0:
    begin=raw_input('     0  :')
    begin=locale.atoi(begin)
  end=raw_input('      (      0):')
  end=locale.atoi(end)
  while end<0:
    end=raw_input('       0  :')
    end=locale.atoi(end)
  if end==0:
    print '       :http://blog.sina.com.cn/'+blog_address+',   '+begin.__str__()+'        '
  else:
    print '       :http://blog.sina.com.cn/'+blog_address+',   '+begin.__str__()+'   '\
       +end.__str__()+'    '
  starttime = datetime.datetime.now()
  text=read_date_from_url('http://blog.sina.com.cn/'+blog_address)
  time.sleep(0.5)
  #  “    ” url
  result = prog1.search(text)
  if result is not None:
    print '      :' , result.group(1)
    text=read_date_from_url(result.group(1))
    time.sleep(0.4)
  else:
    print '          '
    #      
    sys.exit()
  #          ,  、  、   
  while True:
    page+=1
    print '     ' , page , ' '
    #           
    result=prog2.findall(text)
    #          
    for blog in result: 
      if blog_no < begin:
        blog_no += 1
      elif end != 0 and blog_no > end:
        break
      else:
        try:
          save_to_file(blog[1],unicode(blog[0]),blog_address)
        except:
          print ' ,   ',blog_no,'   ',blog[0],'        ,  ...'
          blog_no += 1
          print "Unexpected error:", sys.exc_info()[0],sys.exc_info()[1]
        else:
          print '      ', blog_no, '   :', blog[0]
          blog_no += 1
          saved += 1
          time.sleep(0.4)
    #        
    result = prog3.search(text)
    if result is not None:
      text = read_date_from_url(result.group(1))
    else:
      print '      '
      break
  print '          ',saved,'   '
  print '   :',datetime.datetime.now() - starttime
  raw_input('      ...')

Python関連の内容にさらに興味のある方は、当サイトの特集をご覧ください：「Pythonデータ構造とアルゴリズム教程」「Python Socketプログラミング技術のまとめ」「Python関数使用テクニックのまとめ」「Python文字列操作テクニックのまとめ」「Python入門と階段の経典教程」「Pythonファイルとディレクトリ操作の概要」
本稿の内容が、皆様のPythonプログラム設計のお役に立てば幸いです。