Pythonでテキストから中国語の文字と中国語の記号を除去する

17854 ワード

中国語の文字や中国語の記号を削除する方法は、ネット上では Python 2 向けの書き方が多いため、ここでは Python 3 での書き方を以下に記録しておく。
ポイント
-      
- zhon.hanzi を使用する(事前に `pip3 install zhon` でインストールしておく)
#!/usr/bin/env python3
# encoding: utf-8
# coding style: pep8
# ====================================================
#   Copyright (C)2020 All rights reserved.
#
#   Author        : xxx
#   Email         : [email protected]
#   File Name     : check_chinese_and_symbol.py
#   Last Modified : 2020-06-03 18:11
#   Description   :
#
# ====================================================

import sys
# import os

import re

from zhon.hanzi import punctuation
from zhon.hanzi import characters

def lm_find_unchinese(file):
    """Return ``file`` with Chinese characters and Chinese punctuation removed.

    Args:
        file: The text to clean. Despite the name, this is the string that
            the regular expressions are applied to, not a path.

    Returns:
        The text with every character in U+4E00..U+9FA5 and every character
        listed in ``zhon.hanzi.punctuation`` deleted.
    """
    hanzi_re = re.compile(r'[\u4e00-\u9fa5]')
    symbol_re = re.compile('[{}]'.format(punctuation))
    # First drop the ideographs, then the Chinese punctuation marks.
    without_hanzi = hanzi_re.sub("", file)
    return symbol_re.sub("", without_hanzi)


def lm_find_chinese(m_str):
    """Extract the Chinese characters contained in ``m_str``.

    The extracted text is printed with a ``chinese:`` label and also
    returned, so callers can use the result instead of only seeing it on
    stdout.

    Args:
        m_str: The text to scan.

    Returns:
        A string made of the characters of ``m_str`` that fall in
        U+4E00..U+9FA5, in their original order ("" when there are none).
    """
    # Deleting everything outside the range leaves only the ideographs.
    pattern = re.compile(r'[^\u4e00-\u9fa5]')
    chinese = pattern.sub('', m_str)
    print("chinese:", chinese)
    return chinese

def lm_find_chinese_symbol(m_str):
    """Find every Chinese punctuation mark in ``m_str``.

    The matches are printed with a ``chinese symbols:`` label and also
    returned, so callers can use the result instead of only seeing it on
    stdout.

    Args:
        m_str: The text to scan.

    Returns:
        A list with one single-character string per occurrence of a
        character from ``zhon.hanzi.punctuation``, in order of appearance.
    """
    t_symbol = re.findall("[{}]".format(punctuation), m_str)
    print("chinese symbols:", t_symbol)
    return t_symbol

def lm_find_chinese_and_symbol(m_str):
    """Run the three finders on ``m_str`` in sequence.

    Calls ``lm_find_unchinese``, ``lm_find_chinese`` and
    ``lm_find_chinese_symbol`` in that order. Whatever they return is
    discarded, so the only visible effect is what they print.

    Args:
        m_str: The text to scan.
    """
    finders = (lm_find_unchinese, lm_find_chinese, lm_find_chinese_symbol)
    for finder in finders:
        finder(m_str)

def lm_delete_chinese_and_symbol(m_str):
    """Placeholder: only prints a message, ``m_str`` is not used.

    Args:
        m_str: Accepted but ignored by the current implementation.
    """
    message = "delete chinese and symbol"
    print(message)
    
#     
def test():
    """Strip Chinese text from ``./CenterCrop.frag`` in place.

    Reads the file, prints the Chinese punctuation and Chinese characters
    found in it, then overwrites the file with the cleaned text returned by
    ``lm_find_unchinese``.

    Raises:
        OSError: If the file cannot be opened for reading and writing.
    """
    # NOTE(review): no encoding is passed, so the platform default is used;
    # confirm that it matches the encoding of the target file.
    # The context manager closes the file even if a step below raises.
    with open("./CenterCrop.frag", "r+") as fp:
        content = fp.read()
        print("        :", re.findall("[{}]".format(punctuation), content))
        print("    :", re.findall("[{}]".format(characters), content))
        lm_find_chinese(content)
        unchinese = lm_find_unchinese(content)
        # Rewind and empty the file before writing the shorter cleaned text,
        # otherwise the tail of the old content would remain.
        fp.seek(0, 0)
        fp.truncate()
        fp.write(unchinese)

def main(argv=None):
    """Script entry point; currently only prints ``main``.

    Args:
        argv: Accepted but not used by the current implementation.

    Returns:
        None, which ``sys.exit`` treats as a successful exit status.
    """
    banner = "main"
    print(banner)

# Run main() only when this file is executed as a script (not on import);
# its return value is passed to sys.exit() as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
  • 複数ファイルのテキストから中国語の文字と中国語の記号を一括削除する
  • #!/usr/bin/env python3
    # encoding: utf-8
    # coding style: pep8
    # ====================================================
    #   Copyright (C)2020 All rights reserved.
    #
    #   Author        : xxx
    #   Email         : [email protected]
    #   File Name     : lm_find_files.py
    #   Last Modified : 2020-06-04 11:03
    #   Description   :
    #
    # ====================================================
    
    import sys
    import os
    
    import check_chinese_and_symbol as chinese
    
    def lm_find_files(path, target, result):
        """
        Basic Description:
            Recursively collect the files under ``path`` whose extension is
            listed in ``target``.
        Detailed Description:
            Directories whose name starts with "." are not descended into.
            Matching files are stored as normalised absolute paths.
        Args:
            path: Directory to search.
            target: Extension(s) to match, including the leading dot. Either
                a single string such as ".json" or a collection such as
                [".frag", ".vert"].
            result: List that matching paths are appended to (mutated in
                place).
        Returns:
            result: The same list object that was passed in.
        Raises:
            OSError: If ``path`` (or a sub-directory) cannot be listed.
        """
        # With a bare string, ``in`` is a substring test, so a file with no
        # extension ("") or with ".js" would wrongly match ".json". Wrap a
        # single extension so the check below is always an exact match.
        extensions = (target,) if isinstance(target, str) else target
        for entry in os.listdir(path):
            npath = os.path.join(path, entry)
            if os.path.isfile(npath):
                if os.path.splitext(npath)[1] in extensions:
                    result.append(os.path.normpath(os.path.abspath(npath)))
            elif os.path.isdir(npath) and not entry.startswith('.'):
                lm_find_files(npath, extensions, result)
        return result
        
    def main(argv=None):
        """Remove Chinese text from every shader/Lua file under a directory.

        The directory is taken from ``sys.argv[1]``. Every ``.frag``,
        ``.vert`` and ``.lua`` file found by ``lm_find_files`` is printed and
        then overwritten in place with the text returned by
        ``chinese.lm_find_unchinese``.

        Args:
            argv: Accepted but not used; the directory always comes from
                ``sys.argv``.

        Returns:
            2 when no directory argument was given, otherwise None.
        """
        if len(sys.argv) < 2:
            # Report the missing argument instead of failing with IndexError.
            print("usage: lm_find_files.py <directory>", file=sys.stderr)
            return 2
        result = []
        lm_find_files(sys.argv[1], [".frag", ".vert", ".lua"], result)
        for f in result:
            print(f)
            # NOTE(review): no encoding is passed, so the platform default is
            # used; confirm that it matches the encoding of the target files.
            # The context manager closes the file even if a step below raises.
            with open(f, "r+") as fp:
                content = fp.read()
                unchinese = chinese.lm_find_unchinese(content)
                # Rewind and empty the file before writing the shorter text,
                # otherwise the tail of the old content would remain.
                fp.seek(0, 0)
                fp.truncate()
                fp.write(unchinese)
    
    # Run main() only when this file is executed as a script (not on import);
    # its return value is passed to sys.exit() as the process exit status.
    if __name__=="__main__":
        sys.exit(main())