Python の文字列から句読点を取り除くベストプラクティス

15862 ワード

Python文字列の句読点削除のベストプラクティス
方法1:str.isalnum : S.isalnum() -> bool Return True if all characters in S are alphanumeric and there is at least one character in S, False otherwise.
>>> string = "Special $#! characters   spaces 888323"
>>> ''.join(e for e in string if e.isalnum())
'Specialcharactersspaces888323'

特徴:
  • str.isalnum は英数字しか残さないため影響範囲が広く、スペースや(Python 2 のバイト文字列では)中国語などの文字もまとめて削除されてしまいます。
  • 方法2:
    string.punctuation
    import re, string
    
    s ="string. With. Punctuation?" # Sample string 
    
    # Python 2: translate() with an identity table, deleting the punctuation characters:
    out = s.translate(string.maketrans("",""), string.punctuation)
    
    # Python 2.6+ (str only): passing None as the table skips building the identity mapping:
    out = s.translate(None, string.punctuation)
    
    # Set-membership filter: keep only the characters that are not punctuation:
    exclude = set(string.punctuation)
    out = ''.join(ch for ch in s if ch not in exclude)
    
    # Replace each punctuation character with "" one at a time (interactive session):
    >>> for c in string.punctuation:
    			s = s.replace(c,"")
    >>> s
    'string With Punctuation'
    
    # Regular expression: a character class built from string.punctuation:
    out = re.sub('[%s]' % re.escape(string.punctuation), '', s)
    ## re.escape: backslash-escapes the regex metacharacters in the string so they are matched literally
    
    # Unicode:
    # string.punctuation contains ASCII only; a more correct (but slower) approach is to use the unicodedata module:
    from unicodedata import category
    s = u'String — with - «Punctuation »...'
    out = re.sub('[%s]' % re.escape(string.punctuation), '', s)
    print 'Stripped', out
    # Output: u'Stripped String \u2014 with  \xabPunctuation \xbb'
    out = ''.join(ch for ch in s if category(ch)[0] != 'P')
    print 'Stripped', out
    # Output: u'Stripped String  with  Punctuation '
    
    
    # For Python 3 str or Python 2 unicode values, str.translate() only takes a dictionary; codepoints (integers) are looked up in that mapping and anything mapped to None is removed.
    # To remove (some?) punctuation then, use:
    import string
    remove_punct_map = dict.fromkeys(map(ord, string.punctuation))
    s.translate(remove_punct_map)
    
    
    # Your method doesn't work in Python 3, as the translate method doesn't accept the second argument any more. 
    import unicodedata
    import sys
    tbl = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))
    def remove_punctuation(text):
        """Return ``text`` with every code point that is a key of ``tbl`` removed.

        ``tbl`` maps each code point whose Unicode category starts with 'P'
        to ``None``; ``translate`` drops characters mapped to ``None``.
        """
        stripped = text.translate(tbl)
        return stripped
    

    方法3:
    re
    import re
    s ="string. With. Punctuation?"
    s = re.sub(r'[^\w\s]','',s)
    

    テスト:
    import re, string, timeit
    
    s ="string. With. Punctuation"
    
    exclude = set(string.punctuation)
    table = string.maketrans("","")
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    
    def test_set(s):
        """Strip punctuation by keeping only the characters not in ``exclude``."""
        kept = [char for char in s if char not in exclude]
        return ''.join(kept)
    
    def test_re(s):
        """Strip punctuation by substituting matches of the precompiled ``regex`` with ''."""
        cleaned = regex.sub('', s)
        return cleaned
    
    def test_trans(s):
        """Strip punctuation with the two-argument form of ``translate``.

        ``table`` is the translation table and ``string.punctuation`` is the
        set of characters to delete. As the article notes, Python 3's
        ``translate`` no longer accepts the second argument.
        """
        chars_to_delete = string.punctuation
        return s.translate(table, chars_to_delete)
    
    def test_repl(s):
    	for c in string.punctuation:
    		s=s.replace(c,"")
    	return s
    
    print"sets :",timeit.Timer('f(s)', 'from __main__ import s,test_set as f').timeit(1000000)
    print"regex :",timeit.Timer('f(s)', 'from __main__ import s,test_re as f').timeit(1000000)
    print"translate :",timeit.Timer('f(s)', 'from __main__ import s,test_trans as f').timeit(1000000)
    print"replace :",timeit.Timer('f(s)', 'from __main__ import s,test_repl as f').timeit(1000000)
    
    出力:
    # sets : 19.8566138744
    # regex : 6.86155414581
    # translate : 2.12455511093
    # replace : 28.4436721802
    

    作者:Chihwei_hsu ソース:http://chihweihsu.com GitHub:https://github.com/HsuChihwei