【python】pyltpのインストール

36927 ワード

1、インストールと設定
1.1、Ubuntu 18.04 でのインストール
git clone https://github.com/HIT-SCIR/pyltp
git submodule init
git submodule update
python setup.py install

1.2、Windows 10 でのインストール
git clone https://github.com/hit-scir/pyltp #   pyltp  
git clone https://github.com/hit-scir/ltp #   ltp
#     ltp  pyltp ltp  
cd pyltp
python setup.py build
python setup.py install

2、モデルのダウンロード（ダウンロードアドレス：https://github.com/hit-scir/ltp）
3、応用
自然言語処理はコンピュータ科学分野と人工知能分野の重要な方向である。人工知能は、知能のシミュレーション、延伸、拡張のための理論、方法、技術及び応用システムを研究・開発する新しい技術科学である。
3.1、文分割（SentenceSplitter）
>>> from pyltp import SentenceSplitter
>>> sentences = "                             。\
...        、      、            、  、                。"
>>> sents = SentenceSplitter.split(sentences)
>>> sents
<pyltp.VectorOfString object at 0x0000018D23343570>
>>> print('\n'.join(sents))
（分割された各文が 1 行ずつ出力される）

3.2、分詞
>>> import os
>>> from pyltp import Segmentor
>>> LTP_DATA_DIR = r"C:\Software\ltp_data_v3.4.0" # ltp    
>>> cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model') #       
>>> segmentor = Segmentor()
>>> segmentor.load(cws_model_path)
>>> sentence = "                             。"
>>> words = segmentor.segment(sentence)
>>> type(words)
<class 'pyltp.VectorOfString'>
>>> " ".join(words)
'                                            。'
>>> segmentor.release()

3.3、カスタム辞書の使用（D:\server\userdict.txt に 1 行 1 語で登録する。例：人工知能）
>>> import os
>>> from pyltp import Segmentor
>>> LTP_DATA_DIR = r"C:\Software\ltp_data_v3.4.0" # ltp    
>>> cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model') #       
>>> segmentor = Segmentor()
>>> segmentor.load_with_lexicon(cws_model_path, r'D:\server\userdict.txt')
[INFO] 2019-10-14 13:59:41 loaded 1 lexicon entries
>>> sentence = "                             。"
>>> words = segmentor.segment(sentence)
>>> type(words)
<class 'pyltp.VectorOfString'>
>>> " ".join(words)
'                                            。'
>>> segmentor.release()

3.4、品詞表記
>>> import os
>>> from pyltp import Postagger
>>> from pyltp import Segmentor
>>> LTP_DATA_DIR = r"C:\Software\ltp_data_v3.4.0" # ltp    
>>> cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model') #       
>>> pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model') #       
>>> sentence = "                             。"
>>> segmentor = Segmentor() #   
>>> segmentor.load(cws_model_path)
>>> words = list(segmentor.segment(sentence))
>>> words
['  ', '  ', '  ', ' ', '   ', '  ', '  ', ' ', \
'    ', '  ', ' ', ' ', '  ', '  ', '  ', '。']
>>> postagger = Postagger() #     
>>> postagger.load(pos_model_path)
>>> postags = postagger.postag(words)
>>> list(postags)
['n', 'n', 'v', 'v', 'n', 'n', 'n', 'p', 'n', 'n', 'nd', 'u', 'm', 'a', 'n', 'wp']
>>> segmentor.release()
>>> postagger.release()

3.5名前付きエンティティ識別
>>> import os
>>> LTP_DATA_DIR = r"C:\Software\ltp_data_v3.4.0"
>>> ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
>>> from pyltp import NamedEntityRecognizer
>>> recognizer = NamedEntityRecognizer()
>>> recognizer.load(ner_model_path)
>>> words = ['  ', '  ', '  ', ' ', '   ', '  ', '  ', ' ', \
... '    ', '  ', ' ', ' ', '  ', '  ', '  ', '。']
>>> postags = ['n', 'n', 'v', 'v', 'n', 'n', 'n', 'p', 'n', 'n', 'nd', 'u', 'm', 'a', 'n', 'wp']
>>> nertags = recognizer.recognize(words, postags)
>>> list(nertags)
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
>>> recognizer.release()

3.6,依存構文解析
>>> import os
>>> from pyltp import Parser
>>> LTP_DATA_DIR = r"C:\Software\ltp_data_v3.4.0"
>>> par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
>>> parser = Parser()
>>> parser.load(par_model_path)
>>> words = ['  ', '  ', '  ', ' ', '   ', '  ', '  ', ' ', \
... '    ', '  ', ' ', ' ', '  ', '  ', '  ', '。']
>>> postags = ['n', 'n', 'v', 'v', 'n', 'n', 'n', 'p', 'n', 'n', 'nd', 'u', 'm', 'a', 'n', 'wp']
>>> arcs = parser.parse(words, postags)
>>> print("\t".join("%d:%s" %(arc.head, arc.relation)for arc in arcs))
2:ATT   3:ATT   4:SBV   0:HED   7:ATT   7:ATT   4:VOB   15:LAD  
10:ATT  11:ATT  15:ATT  11:RAD  15:ATT  15:ATT  7:COO   4:WP
>>> parser.release()

3.7、意味役割付与（セマンティックロールラベリング）
>>> import os
>>> LTP_DATA_DIR = r"C:\Software\ltp_data_v3.4.0"  # ltp       
>>> srl_model_path = os.path.join(LTP_DATA_DIR, 'pisrl_win.model')  #             ,     `srl`。            ,       。
>>>
>>> from pyltp import SementicRoleLabeller
>>> labeller = SementicRoleLabeller() #      
>>> labeller.load(srl_model_path)  #     
>>>
>>> words = ['  ', ' ', '  ', ' ']
>>> postags = ['nh', 'r', 'r', 'v']
# arcs            
>>> roles = labeller.label(words, postags, arcs)  #       
>>>
>>> #     
>>> for role in roles:
...     print(role.index, "".join(
...         ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
[dynet] random seed: 1676210130
[dynet] allocating memory: 2000MB
[dynet] memory allocation done.
3 A0:(1,1)ADV:(2,2)
>>> labeller.release()  #    

4、参考リンク
4.1、簡書