【python】入門学習(十)
24885 ワード
#入門学習シリーズの内容は、『Pythonプログラミング入門(第3版)』の学習ノートです。
テキストファイルの情報を集計し、出現頻度の高い上位10単語を出力します。
より完全なコード:
テキストファイルの情報を集計し、出現頻度の高い上位10単語を出力します。
#text.py
#
# Characters that survive normalization: the 26 lowercase ASCII letters,
# the space, the hyphen and the apostrophe.
# Built from a single string so that no letter can be lost to a missing
# comma (adjacent string literals such as 'p' 'q' silently merge into 'pq').
keep = set("abcdefghijklmnopqrstuvwxyz -'")
#
def normalize(s):
    """Return the lower-cased form of s with disallowed characters removed.

    A character of the lower-cased input is kept only if it is a member of
    the module-level ``keep`` set; all other characters are dropped.
    """
    return ''.join(ch for ch in s.lower() if ch in keep)
#
def file_stats(fname):
    """Print the character, line and word counts of the file ``fname``.

    The line count is the number of newline characters in the file, and the
    word count is the number of whitespace-separated tokens that remain
    after the text has been passed through ``normalize``.
    """
    # Use a context manager so the file handle is closed once it is read.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    num_words = len(normalize(s).split())
    print("The file %s has:" % fname)
    print(" %s characters" % num_chars)
    print(" %s lines" % num_lines)
    print(" %s words" % num_words)
#
def make_freq_dict(s):
    """Count how often each word occurs in s.

    The text is first passed through ``normalize`` and then split on
    whitespace. Returns a dictionary mapping each word to the number of
    times it appears.
    """
    counts = {}
    for word in normalize(s).split():
        counts[word] = counts.get(word, 0) + 1
    return counts
#
def file_stats2(fname):
    """Print statistics for the file ``fname`` and its 10 most frequent words.

    Reports the number of characters, the number of newline characters
    (lines) and the total number of words as counted by ``make_freq_dict``,
    followed by a ranking of the most frequent words.
    """
    # Use a context manager so the file handle is closed once it is read.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    d = make_freq_dict(s)
    num_words = sum(d.values())
    # Sort (count, word) pairs in descending order: highest count first,
    # ties broken by reverse alphabetical order of the word.
    lst = sorted(((count, word) for word, count in d.items()), reverse=True)
    print("The file %s has:" % fname)
    print(" %s characters" % num_chars)
    print(" %s lines" % num_lines)
    print(" %s words" % num_words)
    print("\nThe top 10 most frequent words are:")
    # Only the first 10 entries are shown, matching the heading above.
    for i, (count, word) in enumerate(lst[:10], start=1):
        print('%2s. %4s %s' % (i, count, word))
>>> file_stats2('a.txt')
The file a.txt has:
12927 characters
297 lines
1645 words
The top 10 most frequent words are:
1. 62 to
2. 62 the
3. 47 is
4. 42 a
5. 41 of
6. 40 it
7. 36 that
8. 35 and
9. 32 as
10. 24 so
より完全なコード:
#text.py
#
# Characters that survive normalization: the 26 lowercase ASCII letters,
# the space, the hyphen and the apostrophe.
# Built from a single string so that no letter can be lost to a missing
# comma (adjacent string literals such as 'p' 'q' silently merge into 'pq').
keep = set("abcdefghijklmnopqrstuvwxyz -'")
#
def normalize(s):
    """Lower-case s and strip every character that is not in ``keep``.

    Returns a new string made of the characters of ``s.lower()`` that are
    members of the module-level ``keep`` set, in their original order.
    """
    kept_chars = [ch for ch in s.lower() if ch in keep]
    return ''.join(kept_chars)
#
def file_stats(fname):
    """Print the character, line and word counts of the file ``fname``.

    The line count is the number of newline characters in the file, and the
    word count is the number of whitespace-separated tokens that remain
    after the text has been passed through ``normalize``.
    """
    # Use a context manager so the file handle is closed once it is read.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    num_words = len(normalize(s).split())
    print("The file %s has:" % fname)
    print(" %s characters" % num_chars)
    print(" %s lines" % num_lines)
    print(" %s words" % num_words)
#
def make_freq_dict(s):
    """Build a word-frequency table for s.

    The text is passed through ``normalize`` and split on whitespace; the
    returned dictionary maps every resulting word to its occurrence count.
    """
    freq = {}
    for token in normalize(s).split():
        freq[token] = freq.get(token, 0) + 1
    return freq
#
def file_stats2(fname):
    """Print extended statistics for the file ``fname``.

    Reports the number of characters, newline characters (lines), total
    words, words that occur exactly once, distinct words and the average
    length of the distinct words, followed by the 10 most frequent words.
    Word counts come from ``make_freq_dict``.
    """
    # Use a context manager so the file handle is closed once it is read.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    d = make_freq_dict(s)
    num_different_words = len(d)
    num_words = sum(d.values())
    # Guard against a file with no words, which would otherwise divide by 0.
    if num_different_words:
        words_average_length = sum(len(w) for w in d) / num_different_words
    else:
        words_average_length = 0.0
    num_once = sum(1 for count in d.values() if count == 1)
    # Sort (count, word) pairs in descending order: highest count first,
    # ties broken by reverse alphabetical order of the word.
    lst = sorted(((count, word) for word, count in d.items()), reverse=True)
    print("The file %s has:" % fname)
    print(" %s characters" % num_chars)
    print(" %s lines" % num_lines)
    print(" %s words" % num_words)
    print(" %s words appreance one time" % num_once)
    print(" %s different words" % num_different_words)
    print(" %s average length" % words_average_length)
    print("\nThe top 10 most frequent words are:")
    for i, (count, word) in enumerate(lst[:10], start=1):
        print('%2s. %4s %s' % (i, count, word))
def main():
    """Print the statistics report for the file 'a.txt'."""
    file_stats2('a.txt')


# Run the report only when this file is executed as a script.
if __name__ == '__main__':
    main()
>>> ================================ RESTART ================================
>>>
The file a.txt has:
12927 characters
297 lines
1645 words
515 words appreance one time
699 different words
6.539341917024321 average length
The top 10 most frequent words are:
1. 62 to
2. 62 the
3. 47 is
4. 42 a
5. 41 of
6. 40 it
7. 36 that
8. 35 and
9. 32 as
10. 24 so