Chinese word segmentation tools

The examples below run the same sample sentence through five Python libraries for Chinese word segmentation (jieba, thulac, pkuseg, pyhanlp, and snownlp): punctuation is stripped with a regular expression first, then the segmented tokens are printed.

jieba
import jieba
import re

sentence = '            ,             。          ,   ,       。'
sentence = re.sub('[,。?!、]', '', sentence)
# jieba.cut returns a generator; list() materializes the tokens
result = list(jieba.cut(sentence))
print(result)
# ['  ', '  ', '   ', '  ', ' ', '  ', ' ', ' ', '  ', ' ', ' ', '  ', '  ', ' ', '  ', '  ', '  ', ' ', '  ', ' ', '  ', '  ', ' ', '  ', ' ', '  ', '  ']
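
jieba also ships a full mode and a search-engine mode, and jieba.lcut is a convenience wrapper that returns a list directly. A minimal sketch, assuming "我来到北京清华大学" as a stand-in sample sentence (the phrase is only an illustration):

import jieba

text = '我来到北京清华大学'  # assumed sample sentence, any short Chinese phrase works

# Precise mode (default); lcut is cut wrapped in list()
print(jieba.lcut(text))

# Full mode: every dictionary word that matches, overlaps included
print(jieba.lcut(text, cut_all=True))

# Search-engine mode: long words are re-split for recall-oriented indexing
print(jieba.lcut_for_search(text))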

thulac
import thulac
import re

sentence = '            ,             。          ,   ,       。'
sentence = re.sub('[,。?!、]', '', sentence)

# seg_only=True switches off POS tagging so that only segmentation is performed
thu = thulac.thulac(seg_only=True)

# text=True returns one space-separated string instead of a list of pairs
result = thu.cut(sentence, text=True)

print(result.split())
# ['  ', ' ', ' ', ' ', '  ', '  ', ' ', '  ', ' ', ' ', '  ', ' ', ' ', ' ', ' ', '  ', ' ', '  ', '  ', '  ', ' ', '  ', ' ', '  ', '  ', ' ', ' ', ' ', ' ', '  ', '  ']
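
Without seg_only=True, thulac also attaches a part-of-speech tag to each token, and cut() then returns [word, tag] pairs instead of a flat string. A minimal sketch, again with an assumed stand-in sentence:

import thulac

text = '我来到北京清华大学'  # assumed sample sentence for illustration

# Default model: segmentation plus POS tagging
thu = thulac.thulac()

# Each item is a [word, tag] pair
for word, tag in thu.cut(text):
    print(word, tag)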

pkuseg
import pkuseg
import re

sentence = '            ,             。          ,   ,       。'
sentence = re.sub('[,。?!、]', '', sentence)

# Load the default (mixed-domain) model
pk_useg = pkuseg.pkuseg()
# cut() returns a plain list of tokens
result = pk_useg.cut(sentence)

print(result)
# ['  ', ' ', ' ', '  ', '  ', '  ', '  ', ' ', ' ', '  ', ' ', ' ', '  ', '  ', ' ', '  ', '  ', '  ', ' ', '  ', ' ', '  ', '  ', ' ', ' ', ' ', ' ', '  ', '  ']
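
pkuseg is built around switchable domain models and user dictionaries, both selected through constructor arguments. A sketch under the assumption that the 'medicine' model can be downloaded and that a word list named my_dict.txt (one word per line) exists locally; postag=True likewise fetches its tagging model on first use:

import pkuseg

text = '患者服用阿司匹林后症状缓解'  # assumed sample sentence for illustration

# Domain-specific model plus a custom word list (my_dict.txt is hypothetical)
seg = pkuseg.pkuseg(model_name='medicine', user_dict='my_dict.txt')
print(seg.cut(text))

# postag=True makes cut() return (word, tag) tuples instead of plain strings
seg_pos = pkuseg.pkuseg(postag=True)
print(seg_pos.cut(text))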

pyhanlp
from pyhanlp import HanLP
import re

sentence = '            ,             。          ,   ,       。'
sentence = re.sub('[,。?!、]', '', sentence)

# HanLP.segment returns a list of Term objects; .word holds the token
result = HanLP.segment(sentence)

print([term.word for term in result])
# ['   ', ' ', '   ', '  ', ' ', '  ', ' ', ' ', '  ', ' ', ' ', '  ', '  ', ' ', '  ', '  ', '  ', ' ', '  ', ' ', '  ', '  ', ' ', '  ', ' ', '  ', '  ']
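
Each Term returned by HanLP.segment also carries a part-of-speech tag in its nature attribute, so the same call doubles as a POS tagger. A minimal sketch with an assumed stand-in sentence:

from pyhanlp import HanLP

text = '我来到北京清华大学'  # assumed sample sentence for illustration

for term in HanLP.segment(text):
    # term.word is the token, term.nature its part-of-speech tag
    print(term.word, term.nature)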

snownlp
from snownlp import SnowNLP
import re

sentence = '            ,             。          ,   ,       。'
sentence = re.sub('[,。?!、]', '', sentence)

result = SnowNLP(sentence)

# The words property holds the segmented tokens
print(result.words)
# ['  ', ' ', ' ', ' ', '  ', '   ', '  ', ' ', ' ', '  ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '  ', '  ', '  ', ' ', '  ', ' ', '  ', '  ', ' ', ' ', ' ', ' ', '  ', '  ']
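
SnowNLP exposes more than the token list: the same object can report part-of-speech tags and a pinyin rendering of the input. A minimal sketch with an assumed stand-in sentence:

from snownlp import SnowNLP

text = '我来到北京清华大学'  # assumed sample sentence for illustration

s = SnowNLP(text)

# tags is a generator of (word, tag) pairs
print(list(s.tags))

# pinyin gives a pinyin reading of the input
print(s.pinyin)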