漢字処理に関するPythonユーティリティ


詳細
1.文字列に漢字が含まれているかどうかを判断します.
 
 
def has_hz(text):
    """Return True if ``text`` contains at least one Hanzi-like wide character.

    A character counts when its Unicode East Asian Width class is ``W``
    (wide) or ``F`` (fullwidth); this covers CJK ideographs, kana and
    fullwidth punctuation.  Narrow, halfwidth, neutral and ambiguous
    characters (ASCII, control characters, accented Latin letters, ...) do
    not count, so ordinary non-CJK text no longer reports a false positive.

    Args:
        text: The string to inspect.  Only unicode text is examined; the
            items of an encoded byte string are skipped, so such input
            always yields False.

    Returns:
        bool: True when a wide/fullwidth character is present, else False.
    """
    # Function-scope import: no module-level import of unicodedata is visible.
    import unicodedata

    try:
        text_type = unicode  # noqa: F821 -- Python 2 text type
    except NameError:
        text_type = str  # Python 3: every str is unicode text

    return any(
        isinstance(ch, text_type)
        and unicodedata.east_asian_width(ch) in ('W', 'F')
        for ch in text
    )

ユニットテスト:
        assert not has_hz("")
        assert not has_hz("  ")
        assert not has_hz("123")
        assert not has_hz(u"123abc")
        # The CJK sample characters had been lost from the two literals below.
        assert has_hz(u"123abc漢字")
        assert has_hz(u"漢字")

 
2.指定した長さごとに改行記号(\n)を挿入します。長さの計算では、漢字1文字を2文字分として数えます。
 
try:
    _HZ_TEXT_TYPE = unicode  # noqa: F821 -- Python 2 text type
except NameError:
    _HZ_TEXT_TYPE = str  # Python 3: every str is unicode text

# East Asian Width classes counted as two columns.  'A' (ambiguous) keeps
# the two-column width the previous implementation gave it -- presumably
# the output is rendered with CJK fonts; confirm.  Neutral ('N': control
# characters, most non-CJK scripts) and halfwidth ('H') characters are one
# column wide.
_HZ_WIDE_CLASSES = frozenset(('W', 'F', 'A'))


def _hz_char_width(ch):
    """Return the display width (1 or 2 columns) of the single item ``ch``."""
    # Function-scope import: no module-level import of unicodedata is visible.
    import unicodedata

    if (isinstance(ch, _HZ_TEXT_TYPE)
            and unicodedata.east_asian_width(ch) in _HZ_WIDE_CLASSES):
        return 2
    # Non-text items (e.g. the pieces of an encoded byte string) count as 1.
    return 1


def get_hz_string_width(text):
    """Return the display width of ``text`` in columns.

    Wide, fullwidth and ambiguous-width characters (e.g. Hanzi) count as
    two columns; every other character counts as one.

    Args:
        text: The string to measure.

    Returns:
        int: Total width in columns; 0 for an empty string.
    """
    return sum(_hz_char_width(ch) for ch in text)


def get_hz_sub_string(text, startat, sub_len=None):
    """Return the part of ``text`` starting at display column ``startat``.

    Positions and lengths are measured in display columns (a Hanzi counts
    as two), not in characters.

    Examples:
        get_hz_sub_string(record, 0, 44)  # roughly columns 0 to 43
        get_hz_sub_string(record, 44)     # column 44 to the end

    Args:
        text: The source string.
        startat: Zero-based display column of the first character to keep.
            A character is kept once the running column position is at
            least ``startat``.
        sub_len: Target width of the result in columns, or None to take
            everything up to the end of ``text``.

    Returns:
        The selected substring.  Collection stops only after the collected
        width has reached ``sub_len``, so the result can exceed ``sub_len``
        by one column when a two-column character straddles the limit.
    """
    picked = []
    picked_width = 0  # running width of ``picked``; avoids re-measuring it
    pos = 0
    for ch in text:
        ch_width = _hz_char_width(ch)
        if pos >= startat:
            picked.append(ch)
            picked_width += ch_width
        pos += ch_width
        if sub_len is not None and picked_width >= sub_len:
            break
    return ''.join(picked)

def insert_line_feed(my_str, interval, line_feed="\n"):
    """Insert ``line_feed`` into ``my_str`` every ``interval`` display columns.

    Widths are measured with ``get_hz_string_width`` (a Hanzi counts as two
    columns) and segments are cut with ``get_hz_sub_string``.

    Args:
        my_str: The text to break up.
        interval: Target segment width in display columns; must be >= 1.
        line_feed: Separator placed between consecutive segments.

    Returns:
        The segments joined by ``line_feed``; "" when ``my_str`` is empty.
        No separator is appended after the last segment.

    Raises:
        ValueError: If ``interval`` is smaller than 1 and ``my_str`` is
            not empty.
    """
    if len(my_str) == 0:
        return ""
    if interval < 1:
        raise ValueError("interval must be a positive integer")

    total_width = get_hz_string_width(my_str)
    segments = []
    pos_start = 0
    # Walk until the whole width is consumed rather than precomputing a
    # segment count: a segment may come back wider than ``interval``, and a
    # fixed count would then append empty segments (trailing line feeds).
    while pos_start < total_width:
        segment = get_hz_sub_string(my_str, pos_start, interval)
        if not segment:
            break  # defensive: never loop without making progress
        segments.append(segment)
        pos_start += get_hz_string_width(segment)
    return line_feed.join(segments)

ユニットテスト:
        assert insert_line_feed("",1)==""
        assert insert_line_feed("1",1)=="1"
        assert insert_line_feed("1234567890",5)=="12345\n67890"
        assert insert_line_feed(u"漢字1漢字234567890",5)==u"漢字1\n漢字2\n34567\n890"
        # A wide character that straddles the limit stays on the current line.
        assert insert_line_feed(u"漢字1漢字234567890",4)==u"漢字\n1漢字\n2345\n6789\n0"

 
3.指定された長さでテキストブロックを折り返します(Wordの自動折り返しのように)。あわせて、末尾の空白行を取り除きます。
 
def wrap_text_block(text, line_length, do_trim=True):
    """Wrap every line of ``text`` to ``line_length`` display columns.

    The text is split on "\n" and each line is wrapped independently with
    ``insert_line_feed``, so existing line breaks (including blank lines
    inside the text) are preserved.

    Args:
        text: The text block to wrap.
        line_length: Target line width in display columns, passed to
            ``insert_line_feed`` as its interval.
        do_trim: When true, trailing whitespace -- and with it any trailing
            blank lines -- is stripped from ``text`` before wrapping.

    Returns:
        The wrapped lines joined with "\n", as a unicode string.
    """
    if do_trim:
        # rstrip() already removes every trailing blank line, so no
        # separate scan for blank lines at the end is needed.
        text = text.rstrip()

    wrapped_lines = []
    for line in text.split('\n'):
        wrapped_lines.extend(insert_line_feed(line, line_length).split('\n'))
    # The u'' prefix keeps the result unicode even for plain str input.
    return u'' + '\n'.join(wrapped_lines)
 
ユニットテスト:
        assert wrap_text_block("",1)==""
        assert wrap_text_block("",1,do_trim=False)==""

        assert wrap_text_block(u"漢字1234",2)==u"漢\n字\n12\n34"
        # Trailing whitespace is trimmed before wrapping.
        assert wrap_text_block(u"漢字12345  ",2)==u"漢\n字\n12\n34\n5"
        assert wrap_text_block(u"漢字1\n234",2)==u"漢\n字\n1\n23\n4"
        assert wrap_text_block(u"漢字1\n2345  ",2)==u"漢\n字\n1\n23\n45"