機械学習実戦-素朴ベイズノート


  • python 3の符号化および復号化の問題encodeおよびdecode
  • encodeおよびdecode問題によるpython 3における「r」および「rb」問題
  • range(50)はリストではなくrangeオブジェクトを返すため、リスト形式で得るにはlist(range(50))が必要です。
  • set()の|並列動作
  • python正規表現,reのpatternおよびmatch searchの方法
  • import numpy as np
    
    def loadDataSet():
        """Return a toy corpus as (postingList, classVec).

        postingList: six tokenized posts (lists of words).
        classVec: parallel labels, 1 = abusive, 0 = not.
        """
        sentences = [
            'my dog has flea problems help please',
            'maybe not take him to dog park stupid',
            'my dalmation is so cute I love him',
            'stop posting stupid worthless garbage',
            'mr licks ate my steak how to stop him',
            'quit buying worthless dog food stupid',
        ]
        # Tokenize on whitespace; yields the same word lists as the literal table.
        postingList = [sentence.split() for sentence in sentences]
        classVec = [0, 1, 0, 1, 0, 1]    # 1 is abusive, 0 not
        return postingList, classVec
    #        
    def createVocabList(dataSet):
        """Build a deduplicated vocabulary list from a list of tokenized documents.

        Order of the returned list is unspecified (set iteration order).
        """
        # Union all documents' tokens in one shot, then materialize as a list.
        return list(set().union(*dataSet))
    # Build the vocabulary from the toy corpus and inspect its type and contents.
    listOfPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOfPosts)
    print(type(myVocabList))
    print(myVocabList)
    
    ['posting', 'so', 'my', 'take', 'ate', 'not', 'help', 'mr', 'cute', 'please', 'quit', 'I', 'licks', 'has', 'park', 'love', 'buying', 'is', 'dalmation', 'him', 'how', 'steak', 'stupid', 'stop', 'dog', 'flea', 'problems', 'worthless', 'food', 'maybe', 'garbage', 'to']
    
    # Demo: set() removes duplicate elements.
    a = set([1,2,3,1])
    a
    {1, 2, 3}
    
    # Set-of-words model: convert a document into a binary word-presence vector.
    def setOfWord2Vec(vocablist, inputSet):
        """Return a 0/1 vector marking which vocabulary words occur in inputSet.

        Args:
            vocablist: vocabulary as a list of words; output positions follow it.
            inputSet: iterable of words from one document.
        Returns:
            list of ints, len(vocablist); 1 where the word appears, else 0.
            Out-of-vocabulary words are reported via print and otherwise ignored.
        """
        # Build the word -> position map once: O(1) lookups instead of the
        # original list.index, which rescans the vocabulary for every word.
        wordIndex = {word: i for i, word in enumerate(vocablist)}
        returnVec = [0] * len(vocablist)
        for word in inputSet:
            if word in wordIndex:
                returnVec[wordIndex[word]] = 1
            else:
                print("the word: {0} is not in vocablist".format(word))
        return returnVec
    # Vectorize the 4th post ('stop posting stupid worthless garbage').
    print(setOfWord2Vec(myVocabList, listOfPosts[3]))
    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0]
    
    # Demo: index()/find() on strings vs lists. NOTE: this shadows the builtin 'str'.
    str = 'abca'
    lis = ['a','b','a']
    print(str.index('a'))
    print(str.index('a',1))
    print(str.find('a'))
    print(str.find('a',2))
    # rfind() searches from the right; returns -1 when not found
    print(str.rfind('a'))
    # rindex() searches from the right; raises ValueError when not found
    print(str.rindex('a'))
    # lists have no find() method -- the next line raises AttributeError (intentional demo)
    print(lis.find('a'))
    0
    3
    0
    3
    3
    3
    
    
    
    ---------------------------------------------------------------------------
    
    AttributeError                            Traceback (most recent call last)
    
     in ()
          8 print(str.rindex('a'))
          9 #list  find  
    ---> 10 print(lis.find('a'))
    
    
    AttributeError: 'list' object has no attribute 'find'
    
    def trainNB0(trainMatrix, trainCategory):
        """Train a naive Bayes word model from set-of-words vectors.

        Args:
            trainMatrix: sequence of equal-length 0/1 word vectors, one per doc.
            trainCategory: parallel sequence of labels (1 = abusive, 0 = not).
        Returns:
            (p0Vect, p1Vect, pAbusive): per-word conditional probabilities
            p(w|c=0) and p(w|c=1) as numpy arrays (raw, NOT log -- classifyNB
            takes the log), and the class prior p(c=1).
        """
        numTrainDocs = len(trainMatrix)
        numWords = len(trainMatrix[0])
        # Class prior p(c=1): fraction of abusive documents.
        pAbusive = sum(trainCategory) / float(numTrainDocs)
        # Laplace (add-one) smoothing: counts start at 1 and denominators at 2
        # so an unseen word never produces a zero probability (which would
        # zero out the whole product / send the log to -inf).
        p0Num = np.ones(numWords)
        p1Num = np.ones(numWords)
        p0Denom = 2.0
        p1Denom = 2.0
        for i in range(numTrainDocs):
            if trainCategory[i] == 1:
                # Accumulate word counts and total words for the abusive class.
                p1Num += trainMatrix[i]
                p1Denom += sum(trainMatrix[i])
            else:
                p0Num += trainMatrix[i]
                p0Denom += sum(trainMatrix[i])
        # Normalize counts into conditional probabilities p(w|c).
        p1Vect = p1Num/p1Denom
        p0Vect = p0Num/p0Denom
        return p0Vect, p1Vect, pAbusive
    # Train on the full toy corpus and inspect the learned probabilities.
    trainMat = []
    for postinDoc in listOfPosts:
        trainMat.append(setOfWord2Vec(myVocabList, postinDoc))
    p0V,p1V,pAb = trainNB0(trainMat, listClasses)
    print(trainMat)
    print(p0V)
    print(p1V)
    print(pAb)
    [[0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1], [0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0], [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0]]
    [ 0.03846154  0.07692308  0.15384615  0.03846154  0.07692308  0.03846154
      0.07692308  0.07692308  0.07692308  0.07692308  0.03846154  0.07692308
      0.07692308  0.07692308  0.03846154  0.07692308  0.03846154  0.07692308
      0.07692308  0.11538462  0.07692308  0.07692308  0.03846154  0.07692308
      0.07692308  0.07692308  0.07692308  0.03846154  0.03846154  0.03846154
      0.03846154  0.07692308]
    [ 0.0952381   0.04761905  0.04761905  0.0952381   0.04761905  0.0952381
      0.04761905  0.04761905  0.04761905  0.04761905  0.0952381   0.04761905
      0.04761905  0.04761905  0.0952381   0.04761905  0.0952381   0.04761905
      0.04761905  0.0952381   0.04761905  0.04761905  0.19047619  0.0952381
      0.14285714  0.04761905  0.04761905  0.14285714  0.0952381   0.0952381
      0.0952381   0.0952381 ]
    0.5
    
    # Learned class prior: half the toy documents are labeled abusive.
    pAb
    0.5
    
    # Naive Bayes classifier: pick the class with the larger log-posterior.
    def classifyNB(vec2Classify, p0Vect, p1Vect, pClass1):
        """Classify a 0/1 word vector: return 1 (abusive) or 0 (not)."""
        # Work in log space so products of tiny probabilities do not
        # underflow; multiplication becomes addition of logs.
        logP1 = np.log(p1Vect)
        logP0 = np.log(p0Vect)
        scoreAbusive = sum(vec2Classify * logP1) + np.log(pClass1)
        scoreNormal = sum(vec2Classify * logP0) + np.log(1 - pClass1)
        return 1 if scoreAbusive > scoreNormal else 0
    def testingNB():
        """End-to-end smoke test: train on the toy corpus, classify two sample posts."""
        posts, labels = loadDataSet()
        vocab = createVocabList(posts)
        # Vectorize every training post against the shared vocabulary.
        trainMat = [setOfWord2Vec(vocab, doc) for doc in posts]
        p0V, p1V, pAb = trainNB0(trainMat, labels)
        # Classify two held-out example posts (expected: 0 then 1).
        for entry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
            docVec = np.array(setOfWord2Vec(vocab, entry))
            print('{0} classified as : {1}'.format(entry, classifyNB(docVec, p0V, p1V, pAb)))
    testingNB()
    ['love', 'my', 'dalmation'] classified as : 0
    ['stupid', 'garbage'] classified as : 1
    
    def textParse(bigString):
        """Tokenize raw text into lowercase words longer than 2 characters.

        Args:
            bigString: the raw text, or None/empty.
        Returns:
            list of lowercase tokens with len > 2; [] for None/empty input.
            (The original guarded the split but not the return, so a None
            input raised NameError on the undefined listOfTokens.)
        """
        import re
        if not bigString:
            return []
        # Split on runs of non-word characters; '+' avoids the empty-string
        # tokens that '*' would produce.
        listOfTokens = re.split(r'\W+', bigString)
        return [tok.lower() for tok in listOfTokens if len(tok) > 2]
    def spamTest():
        """Hold-out validation of the naive Bayes spam filter on the email corpus.

        Reads email\\spam\\1..25.txt (label 1) and email\\ham\\1..25.txt
        (label 0), holds out 10 random documents, trains on the other 40,
        and prints the test error rate. Requires the email/ corpus on disk.
        """
        docList = []
        classList = []
        fullText = []
        # Files are numbered 1..25 in each folder (there is no 0.txt), so
        # 2 * 25 = 50 documents -- matching the range(50) index list below.
        # The original range(26) tried to open the nonexistent 0.txt.
        for i in range(1, 26):
            # 'with' closes each handle; the original leaked open files.
            # NOTE(review): opened in text mode; ham\6.txt reportedly has an
            # encoding quirk ('r' vs 'rb') -- confirm against the corpus.
            with open('email\\spam\\%d.txt' % i, 'r') as fp:
                wordList = textParse(fp.read())
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(1)
            with open('email\\ham\\%d.txt' % i, 'r') as fp:
                wordList = textParse(fp.read())
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(0)

        vocabList = createVocabList(docList)
        # list() is required: Python 3 range objects do not support del.
        trainingSet = list(range(50))
        testSet = []
        # Randomly hold out 10 document indices for testing.
        for _ in range(10):
            randIndex = int(np.random.uniform(0, len(trainingSet)))
            testSet.append(trainingSet[randIndex])
            del trainingSet[randIndex]
        trainingMat = []
        trainingClasses = []
        for docIndex in trainingSet:
            trainingMat.append(setOfWord2Vec(vocabList, docList[docIndex]))
            trainingClasses.append(classList[docIndex])
        p0V, p1V, pAb = trainNB0(np.array(trainingMat), np.array(trainingClasses))
        errorCount = 0
        for docIndex in testSet:
            wordVector = setOfWord2Vec(vocabList, docList[docIndex])
            if classifyNB(np.array(wordVector), p0V, p1V, pAb) != classList[docIndex]:
                errorCount += 1
        print('the error rate is : ', float(errorCount) / len(testSet))
    # Run the spam-filter experiment (requires the email/ corpus on disk).
    spamTest()
    the error rate is :  0.1
    
    # Demo: gbk-encoding strings; '\0xab' is a NUL character followed by literal 'xab'.
    'ab'.encode('gbk')
    '\0xab'.encode('gbk')
    b'\x00xab'
    
    # Investigate the encoding of the problematic ham\6.txt with chardet
    # (an earlier run reportedly guessed windows-125x at ~0.73 confidence -- verify).
    import chardet
    f = open('email\\ham\\6.txt','rb')
    chardet.detect(f.read())
    {'confidence': 1.0, 'encoding': 'ascii', 'language': ''}
    
    # Python 3: range() is lazy; list() materializes it into an actual list.
    print(list(range(50)))
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]