2-gramによる英語テキストの単語分割処理

3149 ワード

#coding=utf-8
import re
# Step 1: read the lecture text and split it into a list of English words.
# Step 2: join every pair of neighbouring words into a 2-gram and print them.
#
# NOTE(review): the original loop headers were damaged during extraction
# (the text between '<' and '>' was lost).  The loop below is reconstructed
# from the surviving fragment "' ' + lecture_list[j]" and from the fact that
# i and j started at 0 and 1 and were incremented together -- i.e. each word
# is paired with the word that follows it.  Confirm against the original.
def extract_words(text):
    """Return the English words found in *text*, in order of appearance.

    A word is a maximal run of ASCII letters and apostrophes; digits,
    punctuation and whitespace act as separators.
    """
    return re.findall(r"([A-Za-z']+)", text)


def make_bigrams(words):
    """Return the 2-grams of *words* as strings of the form 'w1 w2'.

    Each word is paired with its immediate successor, so a list of n words
    produces n - 1 entries.  Fewer than two words produce an empty list.
    """
    # zip() stops at the shorter sequence, so no explicit bounds check or
    # index bookkeeping is needed (and short inputs cannot raise IndexError).
    return [left + ' ' + right for left, right in zip(words, words[1:])]


# Guarded so that the helpers above can be imported without touching the
# hard-coded input file; running the file as a script behaves as before.
if __name__ == '__main__':
    # Read raw bytes and decode them once at the I/O boundary.  Binary mode
    # plus bytes.decode() works on both Python 2 and Python 3.  The text is
    # not re-encoded to UTF-8 afterwards: the regex only keeps ASCII letters
    # and apostrophes, so matching on the decoded text is sufficient.
    with open('/home/zheng/firstproject/lecture.txt', 'rb') as lecture:
        content = lecture.read().decode('gbk')
    lecture_list = extract_words(content)
    new_list = make_bigrams(lecture_list)
    # Single-argument print(...) is valid in both Python 2 and Python 3.
    print(new_list)