2-gram条件下での英語テキストの分詞処理
3149 ワード
#coding=utf-8
import re
# 2-gram (bigram) tokenizer for an English lecture transcript.
#
# NOTE(review): the loop conditions of the original listing were lost
# (the lines read "while i1:" and "while j' ' + lecture_list[j])"), so the
# pairing logic is rebuilt from the statements that survived: i starts at 0,
# j starts at 1, both advance together, and
# lecture_list[i] + ' ' + lecture_list[j] is collected -- i.e. one entry per
# pair of adjacent words. Confirm against the intended behaviour.

# Input file read by main(); its bytes are decoded as GBK.
LECTURE_PATH = '/home/zheng/firstproject/lecture.txt'

# A word is a run of ASCII letters and apostrophes, so "don't" stays whole.
WORD_PATTERN = re.compile(r"[A-Za-z']+")


def extract_words(text):
    """Return the words found in *text*, in order of appearance.

    A word is a maximal run of ASCII letters and apostrophes; digits,
    punctuation and whitespace act as separators.
    """
    # str() keeps every token in the interpreter's native string type
    # (byte str on Python 2, str on Python 3). The pattern only matches
    # ASCII characters, so the conversion cannot lose data.
    return [str(word) for word in WORD_PATTERN.findall(text)]


def make_bigrams(words):
    """Return the 2-grams of *words* as 'first second' strings.

    Each word is paired with its immediate successor, so a list of n words
    yields n - 1 bigrams. Fewer than two words yields an empty list.
    """
    # zip stops at the shorter sequence, which drops the unpaired last word.
    return [first + ' ' + second for first, second in zip(words, words[1:])]


def main():
    """Read the lecture file, build its bigram list and print it."""
    # Binary mode plus an explicit decode behaves the same on Python 2 and 3;
    # calling .decode() on text-mode data fails on Python 3.
    with open(LECTURE_PATH, 'rb') as lecture:
        content = lecture.read().strip().decode('gbk')
    lecture_list = extract_words(content)
    new_list = make_bigrams(lecture_list)
    # A single parenthesised argument prints identically on Python 2 and 3.
    print(new_list)


if __name__ == '__main__':
    main()