Python 3.6に基づく『機械学習実戦』- FP-growthコード解説

11282 ワード

私は数学科の大学院生で、2017年末に初めてPythonと機械学習に触れて勉強を始めました。まだ初心者ですので、皆さんとの交流を歓迎します。
ここでは主にコードを解説します。理論については、次の3冊の本をお勧めします:
『機械学習実戦中国語版』
『機械学習』周志華
『統計学習方法』李航
以上の3冊のうち、1冊目はPython 2に基づくコード実装が中心です。残りの2冊は主に1冊目で省略されている理論を補うもので、理論の大部分を詳しく説明しています。
ブログには『機械学習実戦』の理論についての解説が数多くありますが、質にはばらつきがあります。良い記事も結局は上記の3冊を参考にしており、ネット上には電子版の本もたくさんあります。
質の低いブログを読むよりも、上記の3冊を読むほうが得るものは多いでしょう。
正直に言うと、勉強は焦らず落ち着いて取り組むことが大切です。毎日少しずつ読み、毎日少しずつ理解していけば、その積み重ねがやがて大きな力になります。
OS:windows 8.1
pythonバージョン:python 3.6
実行環境:spyder(anaconda)
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 27 21:39:50 2018

@author: Lelouch_C.C
"""

# FP-tree node class definition
class treeNode:
    """One node of an FP-tree.

    Attributes:
        name: the label given to the node at construction.
        count: running occurrence counter, increased through ``inc``.
        parent: the node passed as ``parentNode`` (``None`` for a root).
        nodeLink: reference to another node; starts as ``None`` and is
            filled in by the tree-building helpers in this file.
        children: dict of child nodes; starts empty.
    """

    def __init__(self, nameValue, numOccur, parentNode):
        self.name, self.count = nameValue, numOccur
        self.parent = parentNode
        self.nodeLink = None
        self.children = {}

    def inc(self, numOccur):
        """Add ``numOccur`` to this node's counter."""
        self.count += numOccur

    def disp(self, ind=1):
        """Print this node and its whole subtree, one node per line.

        Each line is indented by two spaces per level; ``ind`` is the
        level of this node and every child is printed one level deeper.
        """
        indent = '  ' * ind
        print(indent, self.name, ' ', self.count)
        deeper = ind + 1
        for node in self.children.values():
            node.disp(deeper)

"""
if __name__=='__main__':
    rootNode=treeNode('pyramid',9,None)
    rootNode.children['eye']=treeNode('eye',13,None)
    print('rootNode.disp()=',rootNode.disp())
    rootNode.children['phoenix']=treeNode('phoenix',3,None) 
    print('rootNode.disp()=',rootNode.disp())
#"""

#
def createTree(dataSet, minSup=1):
    """Build an FP-tree and its header table from weighted transactions.

    Args:
        dataSet: dict mapping each transaction (an iterable of hashable
            items, e.g. the frozensets produced by ``createInitSet``) to
            the number of times that transaction occurs.
        minSup: minimum total count an item needs in order to be kept.

    Returns:
        ``(retTree, headerTable)`` where ``retTree`` is the root
        ``treeNode`` (named ``'Null Set'``) and ``headerTable`` maps each
        frequent item to ``[total count, first node of its nodeLink chain]``.
        ``(None, None)`` when no item reaches ``minSup``.
    """
    # First pass: total support of every single item.
    support = {}
    for trans, count in dataSet.items():
        for item in trans:
            support[item] = support.get(item, 0) + count

    # Keep only frequent items; the second slot will hold the head of the
    # item's node-link chain once nodes are inserted.
    headerTable = {item: [total, None]
                   for item, total in support.items() if total >= minSup}
    if not headerTable:
        return None, None

    # One global insertion order shared by ALL transactions: support
    # descending, ties broken by the item's string form.  Sorting each
    # transaction by support alone leaves ties to set-iteration order, so
    # equal-support items could be inserted in different orders for
    # different transactions, producing a larger and run-dependent tree.
    ranked = sorted(headerTable,
                    key=lambda item: (-headerTable[item][0], str(item)))
    rank = {item: position for position, item in enumerate(ranked)}

    retTree = treeNode('Null Set', 1, None)
    # Second pass: insert the frequent items of each transaction, in rank order.
    for tranSet, count in dataSet.items():
        orderedItems = sorted((item for item in tranSet if item in rank),
                              key=rank.__getitem__)
        if orderedItems:
            updateTree(orderedItems, retTree, headerTable, count)
    return retTree, headerTable

def updateTree(items, inTree, headerTable, count):
    """Insert one ordered transaction into the FP-tree rooted at ``inTree``.

    Args:
        items: the transaction's items, already in insertion order.
            An empty sequence is a no-op.
        inTree: the ``treeNode`` under which the path is inserted.
        headerTable: dict mapping item -> ``[count, first node or None]``;
            the second slot is updated when a new node is created.
        count: how many occurrences this transaction represents.
    """
    # Walk down iteratively instead of recursing on items[1:], which copied
    # the remaining list at every level.
    node = inTree
    for item in items:
        child = node.children.get(item)
        if child is not None:
            # The path already exists: just add this transaction's count.
            child.inc(count)
        else:
            # New branch: create the node and hook it into the header
            # table's chain of nodes carrying the same item.
            child = treeNode(item, count, node)
            node.children[item] = child
            entry = headerTable[item]
            if entry[1] is None:
                entry[1] = child
            else:
                updateHeader(entry[1], child)
        node = child

def updateHeader(nodeToTest, targetNode):
    """Append ``targetNode`` to the end of a ``nodeLink`` chain.

    Starting at ``nodeToTest``, follow ``nodeLink`` references until a node
    whose ``nodeLink`` is ``None`` is reached, then point that node's
    ``nodeLink`` at ``targetNode``.
    """
    tail = nodeToTest
    while True:
        successor = tail.nodeLink
        if successor is None:
            break
        tail = successor
    tail.nodeLink = targetNode

def loadSimpDat():
    """Return the small hard-coded sample data: a list of six transactions,
    each a list of single-character item names."""
    transactions = (
        ('r', 'z', 'h', 'j', 'p'),
        ('z', 'y', 'x', 'w', 'v', 'u', 't', 's'),
        ('z',),
        ('r', 'x', 'n', 'o', 's'),
        ('y', 'r', 'x', 'z', 'q', 't', 'p'),
        ('y', 'z', 'x', 'e', 'q', 's', 't', 'm'),
    )
    # Hand back fresh lists so callers may mutate the result freely.
    return [list(transaction) for transaction in transactions]
#createInitSet() 
def createInitSet(dataSet):
    """Convert a list of transactions into the dict form used by createTree.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable items.

    Returns:
        dict mapping ``frozenset(transaction)`` to the number of times that
        item set occurs in ``dataSet``.
    """
    retDict = {}
    for trans in dataSet:
        key = frozenset(trans)
        # Accumulate instead of assigning 1: identical transactions must add
        # up, otherwise their support is undercounted.
        retDict[key] = retDict.get(key, 0) + 1
    return retDict
"""
if __name__=='__main__':
    simpDat=loadSimpDat()
    print('simpDat=',simpDat)
    initSet=createInitSet(simpDat)
    print('initSet=',initSet)
    myFPtree,myHeaderTab=createTree(initSet,3)
    myFPtree.disp()
    #  :   fp-        ,                  。
    #        (z5,r3,x4,y3,s3,t3)
    #     {z,r},            ,  z1->r1
    #     {z,x,y,s,t},            ,
    #               z2->r1
    #                 ->x1->...
    #(  :x     y,s,t        ,    ,           )
#"""

"""
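Steps for mining frequent itemsets from an FP-tree:
(1) Extract the conditional pattern bases from the FP-tree;
(2) Use the conditional pattern bases to build a conditional FP-tree;
(3) Repeat steps (1) and (2) recursively until no further conditional tree can be built.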
"""

# Find the prefix paths (conditional pattern bases) in the FP-tree
def ascendTree(leafNode, prefixPath):
    """Collect node names from ``leafNode`` upward into ``prefixPath``.

    Follows ``parent`` references and appends each node's ``name`` to
    ``prefixPath`` (in place, starting node first).  The topmost node, whose
    ``parent`` is ``None``, is not appended.
    """
    current = leafNode
    while current.parent is not None:
        prefixPath.append(current.name)
        current = current.parent


def findPrefixPath(basePat, treeNode):
    """Collect the conditional pattern base reachable from a node-link chain.

    Args:
        basePat: the item whose pattern base is being collected (not used
            by the computation itself).
        treeNode: first node of the chain to follow via ``nodeLink``
            (e.g. ``headerTable[item][1]``); ``None`` yields an empty result.

    Returns:
        dict mapping ``frozenset(prefix items)`` to the summed count of the
        chain nodes reached through that prefix.  Nodes whose path holds
        nothing besides the node itself contribute no entry.
    """
    condPats = {}
    node = treeNode
    while node is not None:
        prefixPath = []
        ascendTree(node, prefixPath)      # prefixPath[0] is the node itself
        if len(prefixPath) > 1:
            key = frozenset(prefixPath[1:])
            # Accumulate: two chain nodes can have prefix paths made of the
            # same items in a different order, which collapse to the same
            # frozenset.  Plain assignment would drop one node's count.
            condPats[key] = condPats.get(key, 0) + node.count
        node = node.nodeLink
    return condPats
"""
if __name__=='__main__':
    simpDat=loadSimpDat()
    initSet=createInitSet(simpDat)
    myFPtree,myHeaderTab=createTree(initSet,3)
    #print('myFPtree=',myFPtree)                #myFPtree      
    #print('myHeaderTab=',myHeaderTab)
    print('x      :',findPrefixPath('x', myHeaderTab['x'][1]))
    print('z      :',findPrefixPath('z', myHeaderTab['z'][1]))
    print('r      :',findPrefixPath('r', myHeaderTab['r'][1]))
    print('t      :',findPrefixPath('t', myHeaderTab['t'][1]))
#"""

# Recursively mine the FP-tree for frequent itemsets
def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    """Recursively mine frequent itemsets from an FP-tree.

    Args:
        inTree: root of the tree being mined (not read directly; the
            traversal goes through ``headerTable``).
        headerTable: dict mapping item -> ``[count, first node of chain]``
            as returned by ``createTree``.
        minSup: minimum support passed on to ``createTree`` for every
            conditional tree.
        preFix: set of items already fixed on this recursion path; it is
            copied, never modified.
        freqItemList: output list; every frequent itemset found is appended
            to it as a set.

    Progress is printed to stdout at each step.
    """
    # Visit items from least to most frequent.  Sort on the numeric count:
    # sorting on str([count, node]) compared counts as text ("10" < "3") and
    # broke ties by object address.
    bigL = [item for item, _ in sorted(headerTable.items(),
                                       key=lambda p: p[1][0])]
    for basePat in bigL:
        # Every header item, joined with the current prefix, is frequent.
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        print('finalFrequent Item: ', newFreqSet)
        freqItemList.append(newFreqSet)
        # Conditional pattern base: prefix paths leading to basePat.
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        print('condPattBases :', basePat, condPattBases)
        # Conditional FP-tree built from those paths.
        myCondTree, myHead = createTree(condPattBases, minSup)
        print('head from conditional tree: ', myHead)
        if myHead is not None:
            # Frequent items remain in the conditional tree: recurse with the
            # extended prefix to find the larger itemsets.
            print('conditional tree for: ', newFreqSet)
            myCondTree.disp(1)
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
"""
if __name__=='__main__':
    simpDat=loadSimpDat()
    initSet=createInitSet(simpDat)
    myFPtree,myHeaderTab=createTree(initSet,3)
    myFreqItems=[]
    mineTree(myFPtree,myHeaderTab,3,set([]),myFreqItems)
    print('myFreqItems=',myFreqItems)

#"""
"""
#  :            
if __name__=='__main__':
    parsedDat=[line.split() for line in open('kosarak.dat').readlines()] #      
    initSet=createInitSet(parsedDat)                            #        
    myFPtree,myHeaderTab=createTree(initSet,100000)
    #  FP ,          10          
    myFreqList=[]                                  #                  
    mineTree(myFPtree,myHeaderTab,100000,set([]),myFreqList)
    print('len(myFreqList)=',len(myFreqList))
    #                 10          :
    print('myFreqList=',myFreqList)

#"""