【python】入門学習(十)

24885 ワード

python

#入門学習シリーズは、『Pythonプログラミング入門(第3版)』の学習ノートです。
テキストファイルの情報を集計し、出現頻度の高い上位10単語を出力します。

#text.py

# Set of characters to keep

# Characters that survive normalization: the lowercase ASCII letters plus
# space, hyphen and apostrophe.  Building the set from one string avoids the
# implicit-concatenation trap of a hand-written literal list, where a missing
# comma turns 'p' 'q' into the single element 'pq' and drops both letters.
keep = set("abcdefghijklmnopqrstuvwxyz -'")

# Normalize a string

def normalize(s):
    """Return s lowercased, with every character not in ``keep`` removed."""
    return ''.join(ch for ch in s.lower() if ch in keep)



# Basic file statistics

def file_stats(fname):
    """Print character, line and word counts for the text file ``fname``.

    Lines are counted as the number of newline characters, so a final line
    without a trailing newline is not included.  Words are the
    whitespace-separated tokens of the normalized text.
    """
    # The context manager closes the file even if read() raises.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    num_words = len(normalize(s).split())
    print("The file %s has:" % fname)
    print("  %s characters" % num_chars)
    print("  %s lines" % num_lines)
    print("  %s words" % num_words)



# Build the word-frequency dictionary

def make_freq_dict(s):
    """Map each word of the normalized text of s to its occurrence count."""
    counts = {}
    for word in normalize(s).split():
        # get() supplies 0 the first time a word is seen.
        counts[word] = counts.get(word, 0) + 1
    return counts



# File statistics plus the most frequent words

def file_stats2(fname):
    """Print size statistics and the 10 most frequent words of ``fname``.

    Lines are counted as the number of newline characters.  Word counts come
    from make_freq_dict(), i.e. from the normalized text.
    """
    # The context manager closes the file even if read() raises.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    d = make_freq_dict(s)
    num_words = sum(d.values())
    # Sort (count, word) pairs in descending order: highest count first,
    # ties broken by the word in reverse alphabetical order.
    lst = sorted(((count, word) for word, count in d.items()), reverse=True)
    print("The file %s has:" % fname)
    print("  %s characters" % num_chars)
    print("  %s lines" % num_lines)
    print("  %s words" % num_words)
    print("\nThe top 10 most frequent words are:")
    # Slice to 10 entries so the listing matches its heading.
    for i, (count, word) in enumerate(lst[:10], 1):
        print('%2s. %4s %s' % (i, count, word))

>>> file_stats2('a.txt')

The file a.txt has:

  12927 characters

  297 lines

  1645 words



The top 10 most frequent words are:

 1.   62 to

 2.   62 the

 3.   47 is

 4.   42 a

 5.   41 of

 6.   40 it

 7.   36 that

 8.   35 and

 9.   32 as

10.   24 so

さらに完全なコード:

#text.py

# Set of characters to keep

# Characters that survive normalization: the lowercase ASCII letters plus
# space, hyphen and apostrophe.  Building the set from one string avoids the
# implicit-concatenation trap of a hand-written literal list, where a missing
# comma turns 'p' 'q' into the single element 'pq' and drops both letters.
keep = set("abcdefghijklmnopqrstuvwxyz -'")

# Normalize a string

def normalize(s):
    """Return s lowercased, with every character not in ``keep`` removed."""
    kept_chars = [ch for ch in s.lower() if ch in keep]
    return ''.join(kept_chars)



# Basic file statistics

def file_stats(fname):
    """Print character, line and word counts for the text file ``fname``.

    Lines are counted as the number of newline characters, so a final line
    without a trailing newline is not included.  Words are the
    whitespace-separated tokens of the normalized text.
    """
    # The context manager closes the file even if read() raises.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    num_words = len(normalize(s).split())
    print("The file %s has:" % fname)
    print("  %s characters" % num_chars)
    print("  %s lines" % num_lines)
    print("  %s words" % num_words)



# Build the word-frequency dictionary

def make_freq_dict(s):
    """Map each word of the normalized text of s to its occurrence count."""
    tally = {}
    for token in normalize(s).split():
        # get() supplies 0 the first time a word is seen.
        tally[token] = tally.get(token, 0) + 1
    return tally



# File statistics, extra word metrics and the top-10 words

def file_stats2(fname):
    """Print extended statistics and the 10 most frequent words of ``fname``.

    Reported values: character count, line count (number of newline
    characters), total words, words that occur exactly once, number of
    distinct words, and the average length of the distinct words.  Word
    counts come from make_freq_dict(), i.e. from the normalized text.
    """
    # The context manager closes the file even if read() raises.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    d = make_freq_dict(s)
    # Each key is one distinct word, so the dictionary size is the count.
    num_different_words = len(d)
    num_words = sum(d.values())
    # Guard the division: a file without any words has no average length.
    if num_different_words:
        words_average_length = sum(len(w) for w in d) / num_different_words
    else:
        words_average_length = 0.0
    num_once = sum(1 for count in d.values() if count == 1)
    # Sort (count, word) pairs in descending order: highest count first,
    # ties broken by the word in reverse alphabetical order.
    lst = sorted(((count, word) for word, count in d.items()), reverse=True)
    print("The file %s has:" % fname)
    print("  %s characters" % num_chars)
    print("  %s lines" % num_lines)
    print("  %s words" % num_words)
    print("  %s words appreance one time" % num_once)
    print("  %s different words" % num_different_words)
    print("  %s average length" % words_average_length)
    print("\nThe top 10 most frequent words are:")
    for i, (count, word) in enumerate(lst[:10], 1):
        print('%2s. %4s %s' % (i, count, word))



def main():
    """Print the file_stats2 report for 'a.txt'."""
    target = 'a.txt'
    file_stats2(target)


# Run the report only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

>>> ================================ RESTART ================================

>>> 

The file a.txt has:

  12927 characters

  297 lines

  1645 words

  515 words appreance one time

  699 different words

  6.539341917024321 average length



The top 10 most frequent words are:

 1.   62 to

 2.   62 the

 3.   47 is

 4.   42 a

 5.   41 of

 6.   40 it

 7.   36 that

 8.   35 and

 9.   32 as

10.   24 so

jQuery hashプラグイン

Modernizr学習ノート