機械学習実戦-素朴ベイズノート


  • python 3の符号化および復号化の問題encodeおよびdecode
  • encodeおよびdecode問題によるpython 3における「r」および「rb」問題
  • range(50)はリストではなくrangeオブジェクトを返すため、リスト形式で得るにはlist(range(50))が必要です。
  • set()の|並列動作
  • python正規表現,reのpatternおよびmatch searchの方法
  • import numpy as np
    
    def loadDataSet():
        """Return a toy corpus as (postingList, classVec).

        postingList: six tokenized posts (lists of words).
        classVec: parallel labels, 1 = abusive, 0 = not.
        """
        sentences = [
            'my dog has flea problems help please',
            'maybe not take him to dog park stupid',
            'my dalmation is so cute I love him',
            'stop posting stupid worthless garbage',
            'mr licks ate my steak how to stop him',
            'quit buying worthless dog food stupid',
        ]
        # Tokenize on whitespace; yields the same word lists as the literal table.
        postingList = [sentence.split() for sentence in sentences]
        classVec = [0, 1, 0, 1, 0, 1]    # 1 is abusive, 0 not
        return postingList, classVec
    #        
    def createVocabList(dataSet):
        """Build a deduplicated vocabulary list from a list of tokenized documents.

        Order of the returned list is unspecified (set iteration order).
        """
        # Union all documents' tokens in one shot, then materialize as a list.
        return list(set().union(*dataSet))
    # Build the vocabulary from the toy corpus and inspect its type and contents.
    listOfPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOfPosts)
    print(type(myVocabList))
    print(myVocabList)
    
    ['posting', 'so', 'my', 'take', 'ate', 'not', 'help', 'mr', 'cute', 'please', 'quit', 'I', 'licks', 'has', 'park', 'love', 'buying', 'is', 'dalmation', 'him', 'how', 'steak', 'stupid', 'stop', 'dog', 'flea', 'problems', 'worthless', 'food', 'maybe', 'garbage', 'to']
    
    # Demo: set() removes duplicate elements.
    a = set([1,2,3,1])
    a
    {1, 2, 3}
    
    # Set-of-words model: convert a document into a binary word-presence vector.
    def setOfWord2Vec(vocablist, inputSet):
        """Return a 0/1 vector marking which vocabulary words occur in inputSet.

        Args:
            vocablist: vocabulary as a list of words; output positions follow it.
            inputSet: iterable of words from one document.
        Returns:
            list of ints, len(vocablist); 1 where the word appears, else 0.
            Out-of-vocabulary words are reported via print and otherwise ignored.
        """
        # Build the word -> position map once: O(1) lookups instead of the
        # original list.index, which rescans the vocabulary for every word.
        wordIndex = {word: i for i, word in enumerate(vocablist)}
        returnVec = [0] * len(vocablist)
        for word in inputSet:
            if word in wordIndex:
                returnVec[wordIndex[word]] = 1
            else:
                print("the word: {0} is not in vocablist".format(word))
        return returnVec
    # Vectorize the 4th post ('stop posting stupid worthless garbage').
    print(setOfWord2Vec(myVocabList, listOfPosts[3]))
    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0]
    
    # Demo: index()/find() on strings vs lists. NOTE: this shadows the builtin 'str'.
    str = 'abca'
    lis = ['a','b','a']
    print(str.index('a'))
    print(str.index('a',1))
    print(str.find('a'))
    print(str.find('a',2))
    # rfind() searches from the right; returns -1 when not found
    print(str.rfind('a'))
    # rindex() searches from the right; raises ValueError when not found
    print(str.rindex('a'))
    # lists have no find() method -- the next line raises AttributeError (intentional demo)
    print(lis.find('a'))
    0
    3
    0
    3
    3
    3
    
    
    
    ---------------------------------------------------------------------------
    
    AttributeError                            Traceback (most recent call last)
    
     in ()
          8 print(str.rindex('a'))
          9 #list  find  
    ---> 10 print(lis.find('a'))
    
    
    AttributeError: 'list' object has no attribute 'find'
    
    def trainNB0(trainMatrix, trainCategory):
        """Train a naive Bayes word model from set-of-words vectors.

        Args:
            trainMatrix: sequence of equal-length 0/1 word vectors, one per doc.
            trainCategory: parallel sequence of labels (1 = abusive, 0 = not).
        Returns:
            (p0Vect, p1Vect, pAbusive): per-word conditional probabilities
            p(w|c=0) and p(w|c=1) as numpy arrays (raw, NOT log -- classifyNB
            takes the log), and the class prior p(c=1).
        """
        numTrainDocs = len(trainMatrix)
        numWords = len(trainMatrix[0])
        # Class prior p(c=1): fraction of abusive documents.
        pAbusive = sum(trainCategory) / float(numTrainDocs)
        # Laplace (add-one) smoothing: counts start at 1 and denominators at 2
        # so an unseen word never produces a zero probability (which would
        # zero out the whole product / send the log to -inf).
        p0Num = np.ones(numWords)
        p1Num = np.ones(numWords)
        p0Denom = 2.0
        p1Denom = 2.0
        for i in range(numTrainDocs):
            if trainCategory[i] == 1:
                # Accumulate word counts and total words for the abusive class.
                p1Num += trainMatrix[i]
                p1Denom += sum(trainMatrix[i])
            else:
                p0Num += trainMatrix[i]
                p0Denom += sum(trainMatrix[i])
        # Normalize counts into conditional probabilities p(w|c).
        p1Vect = p1Num/p1Denom
        p0Vect = p0Num/p0Denom
        return p0Vect, p1Vect, pAbusive
    # Train on the full toy corpus and inspect the learned probabilities.
    trainMat = []
    for postinDoc in listOfPosts:
        trainMat.append(setOfWord2Vec(myVocabList, postinDoc))
    p0V,p1V,pAb = trainNB0(trainMat, listClasses)
    print(trainMat)
    print(p0V)
    print(p1V)
    print(pAb)
    [[0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1], [0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0], [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0]]
    [ 0.03846154  0.07692308  0.15384615  0.03846154  0.07692308  0.03846154
      0.07692308  0.07692308  0.07692308  0.07692308  0.03846154  0.07692308
      0.07692308  0.07692308  0.03846154  0.07692308  0.03846154  0.07692308
      0.07692308  0.11538462  0.07692308  0.07692308  0.03846154  0.07692308
      0.07692308  0.07692308  0.07692308  0.03846154  0.03846154  0.03846154
      0.03846154  0.07692308]
    [ 0.0952381   0.04761905  0.04761905  0.0952381   0.04761905  0.0952381
      0.04761905  0.04761905  0.04761905  0.04761905  0.0952381   0.04761905
      0.04761905  0.04761905  0.0952381   0.04761905  0.0952381   0.04761905
      0.04761905  0.0952381   0.04761905  0.04761905  0.19047619  0.0952381
      0.14285714  0.04761905  0.04761905  0.14285714  0.0952381   0.0952381
      0.0952381   0.0952381 ]
    0.5
    
    # Learned class prior: half the toy documents are labeled abusive.
    pAb
    0.5
    
    # Naive Bayes classifier: pick the class with the larger log-posterior.
    def classifyNB(vec2Classify, p0Vect, p1Vect, pClass1):
        """Classify a 0/1 word vector: return 1 (abusive) or 0 (not)."""
        # Work in log space so products of tiny probabilities do not
        # underflow; multiplication becomes addition of logs.
        logP1 = np.log(p1Vect)
        logP0 = np.log(p0Vect)
        scoreAbusive = sum(vec2Classify * logP1) + np.log(pClass1)
        scoreNormal = sum(vec2Classify * logP0) + np.log(1 - pClass1)
        return 1 if scoreAbusive > scoreNormal else 0
    def testingNB():
        """End-to-end smoke test: train on the toy corpus, classify two sample posts."""
        posts, labels = loadDataSet()
        vocab = createVocabList(posts)
        # Vectorize every training post against the shared vocabulary.
        trainMat = [setOfWord2Vec(vocab, doc) for doc in posts]
        p0V, p1V, pAb = trainNB0(trainMat, labels)
        # Classify two held-out example posts (expected: 0 then 1).
        for entry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
            docVec = np.array(setOfWord2Vec(vocab, entry))
            print('{0} classified as : {1}'.format(entry, classifyNB(docVec, p0V, p1V, pAb)))
    testingNB()
    ['love', 'my', 'dalmation'] classified as : 0
    ['stupid', 'garbage'] classified as : 1
    
    def textParse(bigString):
        """Tokenize raw text into lowercase words longer than 2 characters.

        Args:
            bigString: the raw text, or None/empty.
        Returns:
            list of lowercase tokens with len > 2; [] for None/empty input.
            (The original guarded the split but not the return, so a None
            input raised NameError on the undefined listOfTokens.)
        """
        import re
        if not bigString:
            return []
        # Split on runs of non-word characters; '+' avoids the empty-string
        # tokens that '*' would produce.
        listOfTokens = re.split(r'\W+', bigString)
        return [tok.lower() for tok in listOfTokens if len(tok) > 2]
    def spamTest():
        """Hold-out validation of the naive Bayes spam filter on the email corpus.

        Reads email\\spam\\1..25.txt (label 1) and email\\ham\\1..25.txt
        (label 0), holds out 10 random documents, trains on the other 40,
        and prints the test error rate. Requires the email/ corpus on disk.
        """
        docList = []
        classList = []
        fullText = []
        # Files are numbered 1..25 in each folder (there is no 0.txt), so
        # 2 * 25 = 50 documents -- matching the range(50) index list below.
        # The original range(26) tried to open the nonexistent 0.txt.
        for i in range(1, 26):
            # 'with' closes each handle; the original leaked open files.
            # NOTE(review): opened in text mode; ham\6.txt reportedly has an
            # encoding quirk ('r' vs 'rb') -- confirm against the corpus.
            with open('email\\spam\\%d.txt' % i, 'r') as fp:
                wordList = textParse(fp.read())
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(1)
            with open('email\\ham\\%d.txt' % i, 'r') as fp:
                wordList = textParse(fp.read())
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(0)

        vocabList = createVocabList(docList)
        # list() is required: Python 3 range objects do not support del.
        trainingSet = list(range(50))
        testSet = []
        # Randomly hold out 10 document indices for testing.
        for _ in range(10):
            randIndex = int(np.random.uniform(0, len(trainingSet)))
            testSet.append(trainingSet[randIndex])
            del trainingSet[randIndex]
        trainingMat = []
        trainingClasses = []
        for docIndex in trainingSet:
            trainingMat.append(setOfWord2Vec(vocabList, docList[docIndex]))
            trainingClasses.append(classList[docIndex])
        p0V, p1V, pAb = trainNB0(np.array(trainingMat), np.array(trainingClasses))
        errorCount = 0
        for docIndex in testSet:
            wordVector = setOfWord2Vec(vocabList, docList[docIndex])
            if classifyNB(np.array(wordVector), p0V, p1V, pAb) != classList[docIndex]:
                errorCount += 1
        print('the error rate is : ', float(errorCount) / len(testSet))
    # Run the spam-filter experiment (requires the email/ corpus on disk).
    spamTest()
    the error rate is :  0.1
    
    # Demo: gbk-encoding strings; '\0xab' is a NUL character followed by literal 'xab'.
    'ab'.encode('gbk')
    '\0xab'.encode('gbk')
    b'\x00xab'
    
    # Investigate the encoding of the problematic ham\6.txt with chardet
    # (an earlier run reportedly guessed windows-125x at ~0.73 confidence -- verify).
    import chardet
    f = open('email\\ham\\6.txt','rb')
    chardet.detect(f.read())
    {'confidence': 1.0, 'encoding': 'ascii', 'language': ''}
    
    # Python 3: range() is lazy; list() materializes it into an actual list.
    print(list(range(50)))
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]