【python】入門学習(十)

24885 ワード

#入門学習シリーズの内容は、『Pythonプログラミング入門(第3版)』の学習ノートです。
テキスト文書の情報を集計し、出現頻度の高い上位10単語を出力します。
#text.py

#     

# Characters that survive normalize(): the lowercase ASCII letters, space,
# hyphen and apostrophe.  Building the set from a string avoids the
# missing-comma pitfall of a long literal, where adjacent string literals
# ('p' 'q') are silently concatenated into the single element 'pq'.
keep = set("abcdefghijklmnopqrstuvwxyz -'")

#       

def normalize(s):
    """Return a lower-cased copy of s containing only characters in `keep`.

    Every character of ``s.lower()`` that is not a member of the
    module-level ``keep`` set is dropped; the remaining characters stay in
    their original order.
    """
    return ''.join(ch for ch in s.lower() if ch in keep)



#        

def file_stats(fname):
    """Print the character, line and word counts of the given file.

    fname -- path of the text file to read.

    Lines are counted as the number of newline characters in the file;
    words are the whitespace-separated pieces of the normalized text.
    """
    # A context manager guarantees the file handle is closed after reading.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    num_words = len(normalize(s).split())
    print("The file %s has:" % fname)
    print(" %s characters" % num_chars)
    print(" %s lines" % num_lines)
    print(" %s words" % num_words)


def make_freq_dict(s):
    """Return a dictionary mapping each word of s to its number of occurrences.

    s is first passed through normalize() and then split on whitespace, so
    the keys are the normalized words.
    """
    d = {}
    for w in normalize(s).split():
        d[w] = d.get(w, 0) + 1
    return d


def file_stats2(fname):
    """Print statistics for the given file plus its 10 most frequent words.

    fname -- path of the text file to read.

    Words are ranked by descending count; words with equal counts are
    ordered in reverse alphabetical order (the (count, word) tuples are
    sorted in descending order).
    """
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    d = make_freq_dict(s)
    num_words = sum(d.values())
    lst = sorted(((d[w], w) for w in d), reverse=True)
    print("The file %s has:" % fname)
    print(" %s characters" % num_chars)
    print(" %s lines" % num_lines)
    print(" %s words" % num_words)
    print("\nThe top 10 most frequent words are:\n")
    # Only the first ten entries are shown, matching the header above.
    for i, (count, word) in enumerate(lst[:10], start=1):
        print('%2s. %4s %s' % (i, count, word))
>>> file_stats2('a.txt')

The file a.txt has:

  12927 characters

  297 lines

  1645 words



The top 10 most frequent words are:

 1.   62 to

 2.   62 the

 3.   47 is

 4.   42 a

 5.   41 of

 6.   40 it

 7.   36 that

 8.   35 and

 9.   32 as

10.   24 so

 
より完全なコード:
#text.py

#     

# Characters that survive normalize(): the lowercase ASCII letters, space,
# hyphen and apostrophe.  Building the set from a string avoids the
# missing-comma pitfall of a long literal, where adjacent string literals
# ('p' 'q') are silently concatenated into the single element 'pq'.
keep = set("abcdefghijklmnopqrstuvwxyz -'")

#       

def normalize(s):
    """Return a lower-cased copy of s containing only characters in `keep`.

    Every character of ``s.lower()`` that is not a member of the
    module-level ``keep`` set is dropped; the remaining characters stay in
    their original order.
    """
    return ''.join(ch for ch in s.lower() if ch in keep)



#        

def file_stats(fname):
    """Print the character, line and word counts of the given file.

    fname -- path of the text file to read.

    Lines are counted as the number of newline characters in the file;
    words are the whitespace-separated pieces of the normalized text.
    """
    # A context manager guarantees the file handle is closed after reading.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    num_words = len(normalize(s).split())
    print("The file %s has:" % fname)
    print(" %s characters" % num_chars)
    print(" %s lines" % num_lines)
    print(" %s words" % num_words)


def make_freq_dict(s):
    """Return a dictionary mapping each word of s to its number of occurrences.

    s is first passed through normalize() and then split on whitespace, so
    the keys are the normalized words.
    """
    d = {}
    for w in normalize(s).split():
        d[w] = d.get(w, 0) + 1
    return d


def file_stats2(fname):
    """Print extended statistics for the given file and its 10 most frequent words.

    fname -- path of the text file to read.

    Besides the character, line and word totals this reports how many words
    occur exactly once, how many distinct words there are, and the average
    length of the distinct words.  Words are ranked by descending count;
    words with equal counts are ordered in reverse alphabetical order.
    """
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    d = make_freq_dict(s)
    num_different_words = len(d)
    num_words = sum(d.values())
    # A file without any words would otherwise divide by zero here.
    if num_different_words:
        words_average_length = sum(len(w) for w in d) / num_different_words
    else:
        words_average_length = 0.0
    num_once = sum(1 for count in d.values() if count == 1)
    lst = sorted(((d[w], w) for w in d), reverse=True)
    print("The file %s has:" % fname)
    print(" %s characters" % num_chars)
    print(" %s lines" % num_lines)
    print(" %s words" % num_words)
    print(" %s words appreance one time" % num_once)
    print(" %s different words" % num_different_words)
    print(" %s average length" % words_average_length)
    print("\nThe top 10 most frequent words are:\n")
    for i, (count, word) in enumerate(lst[:10], start=1):
        print('%2s. %4s %s' % (i, count, word))


def main():
    """Run file_stats2 on the sample file 'a.txt'."""
    file_stats2('a.txt')


if __name__ == '__main__':
    main()
>>> ================================ RESTART ================================

>>> 

The file a.txt has:

  12927 characters

  297 lines

  1645 words

  515 words appreance one time

  699 different words

  6.539341917024321 average length



The top 10 most frequent words are:

 1.   62 to

 2.   62 the

 3.   47 is

 4.   42 a

 5.   41 of

 6.   40 it

 7.   36 that

 8.   35 and

 9.   32 as

10.   24 so