Python正規表現でHTMLタグをフィルタまたは置換する方法

2701 ワード

Python

python正規表現のキー:
python正規表現エスケープ:
. 改行文字以外の任意の文字に一致 / \w 英字・数字・アンダースコア・漢字に一致 / \s 任意の空白文字に一致 / \d 数字に一致 / \b 単語の先頭または末尾に一致 / ^ 文字列の先頭に一致 / $ 文字列の末尾に一致 / \W 英字・数字・アンダースコア・漢字以外の任意の文字に一致 / \S 空白文字以外の任意の文字に一致 / \D 数字以外の任意の文字に一致 / \B 単語の先頭でも末尾でもない位置に一致 / [^x] x 以外の任意の文字に一致
[^aeiou] a、e、i、o、u 以外の任意の文字に一致します
一般的なPython正規表現の量指定子 (コード/構文と説明):
* 0回以上の繰り返し / + 1回以上の繰り返し / ? 0回または1回 / {n} n回の繰り返し / {n,} n回以上の繰り返し / {n,m} n回からm回の繰り返し
Python正規表現の名前付きグループについて: 名前付きグループは (?P<name>...) と書きます。この記事では、疑問符で始まるアサーション、すなわち肯定後読み (?<=...)、肯定先読み (?=...)、否定後読み (?<!...)、否定先読み (?!...) にも触れています。
Python正規表現によるHTMLタグサンプルコードの除去(フィルタリング)

#-*- coding:utf-8 -*-
import re

def filter_tags(htmlstr):
    """Strip HTML markup from *htmlstr* and return the remaining text.

    The steps run in this order: CDATA sections, <script> elements and
    <style> elements are removed together with their contents; <br> tags
    become newlines; every remaining tag and every HTML comment is dropped;
    runs of newlines are collapsed to a single newline; finally character
    entities are decoded by replaceCharEntity().

    @param htmlstr  HTML source text.
    @return  The text with the markup removed.
    """
    # Element bodies use a non-greedy ".*?" with re.S so that content which
    # itself contains '<', '>' or newlines (typical for scripts) is removed
    # up to the nearest closing marker.
    re_cdata = re.compile(r'//<!\[CDATA\[.*?//\]\]>', re.I | re.S)  # JS-style CDATA
    re_script = re.compile(r'<\s*script[^>]*>.*?<\s*/\s*script\s*>', re.I | re.S)
    re_style = re.compile(r'<\s*style[^>]*>.*?<\s*/\s*style\s*>', re.I | re.S)
    re_br = re.compile(r'<br\s*/?>', re.I)  # line break tag
    re_h = re.compile(r'</?\w+[^>]*>')  # any opening or closing tag
    re_comment = re.compile(r'<!--.*?-->', re.S)  # HTML comment

    s = re_cdata.sub('', htmlstr)  # drop CDATA sections
    s = re_script.sub('', s)  # drop <script> elements
    s = re_style.sub('', s)  # drop <style> elements
    s = re_br.sub('\n', s)  # turn <br> into a newline
    s = re_h.sub('', s)  # drop the remaining tags
    s = re_comment.sub('', s)  # drop HTML comments

    # Collapse consecutive newlines left behind by the removed markup.
    blank_line = re.compile(r'\n+')
    s = blank_line.sub('\n', s)

    s = replaceCharEntity(s)  # decode character entities
    return s

def replaceCharEntity(htmlstr):
    """Replace HTML character entities in *htmlstr* with plain characters.

    Named (``&lt;``) and numeric (``&#60;``) forms are recognised for the
    entities listed in CHAR_ENTITIES.  Any other entity is removed from the
    text.

    @param htmlstr  Text that may contain character entities.
    @return  The text with the entities replaced.
    """
    CHAR_ENTITIES = {
        'nbsp': ' ', '160': ' ',
        'lt': '<', '60': '<',
        'gt': '>', '62': '>',
        'amp': '&', '38': '&',
        'quot': '"', '34': '"',
    }

    # "&", an optional "#", then the entity name or number, then ";".
    re_charEntity = re.compile(r'&#?(?P<name>\w+);')

    def decode(match):
        # Entities missing from the table are replaced by the empty string.
        return CHAR_ENTITIES.get(match.group('name'), '')

    # A single pass never rescans text it has already decoded, so "&amp;lt;"
    # becomes "&lt;" rather than being decoded a second time into "<".
    return re_charEntity.sub(decode, htmlstr)


def repalce(s, re_exp, repl_string):
    """Return *s* with every match of the compiled pattern replaced.

    @param s            Text to process.
    @param re_exp       Compiled regular expression object.
    @param repl_string  Replacement passed to the pattern's sub().
    @return  The substituted text.
    """
    # NOTE: the function name is a misspelling of "replace"; it is kept
    # unchanged so existing callers continue to work.
    substituted = re_exp.sub(repl_string, s)
    return substituted


if __name__ == '__main__':
    # Read the sample page, strip its markup and print the remaining text.
    # open() inside "with" closes the file handle; the previous file() call
    # exists only on Python 2 and left the handle open.
    with open('test.html') as html_file:
        s = html_file.read()
    news = filter_tags(s)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(news)

datetimeとpytzでタイムゾーンを変換する

標準化されたシーケンスデータ