Chinese word segmentation tools

The examples below run the same sample sentence through five Python libraries for Chinese word segmentation (jieba, thulac, pkuseg, pyhanlp, and snownlp): punctuation is stripped with a regular expression first, then the segmented tokens are printed.

jieba
import jieba
import re

sentence = '            ,             。          ,   ,       。'
sentence = re.sub('[,。?!、]', '', sentence)
# jieba.cut returns a generator; list() materializes the tokens
result = list(jieba.cut(sentence))
print(result)
# ['  ', '  ', '   ', '  ', ' ', '  ', ' ', ' ', '  ', ' ', ' ', '  ', '  ', ' ', '  ', '  ', '  ', ' ', '  ', ' ', '  ', '  ', ' ', '  ', ' ', '  ', '  ']
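
jieba also ships a full mode and a search-engine mode, and jieba.lcut is a convenience wrapper that returns a list directly. A minimal sketch, assuming "我来到北京清华大学" as a stand-in sample sentence (the phrase is only an illustration):

import jieba

text = '我来到北京清华大学'  # assumed sample sentence, any short Chinese phrase works

# Precise mode (default); lcut is cut wrapped in list()
print(jieba.lcut(text))

# Full mode: every dictionary word that matches, overlaps included
print(jieba.lcut(text, cut_all=True))

# Search-engine mode: long words are re-split for recall-oriented indexing
print(jieba.lcut_for_search(text))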

thulac
import thulac
import re

sentence = '            ,             。          ,   ,       。'
sentence = re.sub('[,。?!、]', '', sentence)

# seg_only=True switches off POS tagging so that only segmentation is performed
thu = thulac.thulac(seg_only=True)

# text=True returns one space-separated string instead of a list of pairs
result = thu.cut(sentence, text=True)

print(result.split())
# ['  ', ' ', ' ', ' ', '  ', '  ', ' ', '  ', ' ', ' ', '  ', ' ', ' ', ' ', ' ', '  ', ' ', '  ', '  ', '  ', ' ', '  ', ' ', '  ', '  ', ' ', ' ', ' ', ' ', '  ', '  ']
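
Without seg_only=True, thulac also attaches a part-of-speech tag to each token, and cut() then returns [word, tag] pairs instead of a flat string. A minimal sketch, again with an assumed stand-in sentence:

import thulac

text = '我来到北京清华大学'  # assumed sample sentence for illustration

# Default model: segmentation plus POS tagging
thu = thulac.thulac()

# Each item is a [word, tag] pair
for word, tag in thu.cut(text):
    print(word, tag)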

pkuseg
import pkuseg
import re

sentence = '            ,             。          ,   ,       。'
sentence = re.sub('[,。?!、]', '', sentence)

# Load the default (mixed-domain) model
pk_useg = pkuseg.pkuseg()
# cut() returns a plain list of tokens
result = pk_useg.cut(sentence)

print(result)
# ['  ', ' ', ' ', '  ', '  ', '  ', '  ', ' ', ' ', '  ', ' ', ' ', '  ', '  ', ' ', '  ', '  ', '  ', ' ', '  ', ' ', '  ', '  ', ' ', ' ', ' ', ' ', '  ', '  ']
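
pkuseg is built around switchable domain models and user dictionaries, both selected through constructor arguments. A sketch under the assumption that the 'medicine' model can be downloaded and that a word list named my_dict.txt (one word per line) exists locally; postag=True likewise fetches its tagging model on first use:

import pkuseg

text = '患者服用阿司匹林后症状缓解'  # assumed sample sentence for illustration

# Domain-specific model plus a custom word list (my_dict.txt is hypothetical)
seg = pkuseg.pkuseg(model_name='medicine', user_dict='my_dict.txt')
print(seg.cut(text))

# postag=True makes cut() return (word, tag) tuples instead of plain strings
seg_pos = pkuseg.pkuseg(postag=True)
print(seg_pos.cut(text))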

pyhanlp
from pyhanlp import HanLP
import re

sentence = '            ,             。          ,   ,       。'
sentence = re.sub('[,。?!、]', '', sentence)

# HanLP.segment returns a list of Term objects; .word holds the token
result = HanLP.segment(sentence)

print([term.word for term in result])
# ['   ', ' ', '   ', '  ', ' ', '  ', ' ', ' ', '  ', ' ', ' ', '  ', '  ', ' ', '  ', '  ', '  ', ' ', '  ', ' ', '  ', '  ', ' ', '  ', ' ', '  ', '  ']
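
Each Term returned by HanLP.segment also carries a part-of-speech tag in its nature attribute, so the same call doubles as a POS tagger. A minimal sketch with an assumed stand-in sentence:

from pyhanlp import HanLP

text = '我来到北京清华大学'  # assumed sample sentence for illustration

for term in HanLP.segment(text):
    # term.word is the token, term.nature its part-of-speech tag
    print(term.word, term.nature)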

snownlp
from snownlp import SnowNLP
import re

sentence = '            ,             。          ,   ,       。'
sentence = re.sub('[,。?!、]', '', sentence)

result = SnowNLP(sentence)

# The words property holds the segmented tokens
print(result.words)
# ['  ', ' ', ' ', ' ', '  ', '   ', '  ', ' ', ' ', '  ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '  ', '  ', '  ', ' ', '  ', ' ', '  ', '  ', ' ', ' ', ' ', ' ', '  ', '  ']
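
SnowNLP exposes more than the token list: the same object can report part-of-speech tags and a pinyin rendering of the input. A minimal sketch with an assumed stand-in sentence:

from snownlp import SnowNLP

text = '我来到北京清华大学'  # assumed sample sentence for illustration

s = SnowNLP(text)

# tags is a generator of (word, tag) pairs
print(list(s.tags))

# pinyin gives a pinyin reading of the input
print(s.pinyin)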