aprioriAll


データセットのテスト

seq1 = [           [30], [90]          ]
seq2 = [ [10, 20], [30], [40, 60, 70]  ]
seq3 = [         [30, 50, 70],         ]
seq4 = [      [30], [40, 70], [90]     ]
seq5 = [              [90],            ]
dataSet = [seq1, seq2, seq3, seq4, seq5]
min_support=0.25
import sys
sys.path.append('E:/myscripts')

import itertools
import pandas as pd
from apriori import apriori

1、Sort Phase


プロセス略、整理されたサンプルデータを直接使用

2、Litemset Phase


litemsetを検索し、aprioriアルゴリズムを直接使用します.主な違いは、サポート度を計算する際に、1人の顧客customerが同じアイテムセット(itemset)を購入した場合、サポート頻度は1回のみ計算されます.これはaprioriアルゴリズムではサポート度がトランザクション(transaction)にとって望ましいが,シーケンスモードの計算では大きなアイテムセットのサポート度がクライアント(customer)にとって望ましいためである.
def createLs1(dataSet, min_support):
    '''
    Using  algorithm apriorito mining large 1-sequences 
    `Ls` for Large Sequence
    '''
    n = len(dataSet)
    flattenSet = list(itertools.chain(*dataSet))
    flatten_n = len(flattenSet)
    
    # Transform the min_support to litemset_support
    min_support_new = min_support * n /flatten_n
    litemsets = apriori(flattenSet, min_support=min_support_new)
    mapping = {v: k for k, v in enumerate(litemsets)}
    
    # Transform the litemset_support to sequence_support
    supportLs1 = {(mapping[k],): v *flatten_n / n 
                  for k, v in litemsets.items()}
    
    return mapping, supportLs1
  • テスト
  • mapping, supportLs1 = createLs1(dataSet, min_support=min_support)
    mapping
    
    {frozenset({30}): 0,
     frozenset({70}): 1,
     frozenset({90}): 2,
     frozenset({40}): 4,
     frozenset({40, 70}): 3}
    
    supportLs1
    
    {(0,): 0.8, (1,): 0.6, (2,): 0.6, (3,): 0.4, (4,): 0.4}
    
    Ls1 = [list(k) for k in supportLs1]
    Ls1
    
    [[2], [0], [3], [1], [4]]
    

    3、Transformation Phase

    def seqMapping(seq, mapping):
        '''
        Mapping litemsets to integer objects, for treating litemsets as
        single entities, and reducing the time required 
        '''
        newSeq = []
        for iSet in seq:
            newSet = [v for k, v in mapping.items() if k <= set(iSet)]
            if newSet != []:
                newSeq.append(newSet)
        return newSeq
    
    def transform(dataSet, mapping):
        '''
        Transform each customer sequence into an alternative representation.
        '''
        transformDS = []
        for seq in dataSet:
            newSeq = seqMapping(seq, mapping)
            if newSeq != []:
                transformDS.append(newSeq)
        return transformDS                    
    
  • テスト
  • transformDS  = transform(dataSet, mapping)
    for seq in transformDS :
        print(seq)
    
    [[0], [2]]
    [[0], [1, 4, 3]]
    [[0, 1]]
    [[0], [1, 4, 3], [2]]
    [[2]]
    

    4、Sequence Phase


    候補シーケンスの生成
    def seqGen(seqA, seqB):
        '''
        Generate candidate k+1 sequences with two large k-sequences
        '''
        newA, newB = seqA.copy(), seqB.copy()
        if seqA[:-1] == seqB[:-1]:
            newA.append(seqB[-1])
            newB.append(seqA[-1])
            return [newA, newB]
    
    def CsGen(Ls):
        '''
        Generate all candidate k+1 sequences from large k-sequences
        '''
        Cs = []
        for seqA, seqB in itertools.combinations(Ls, 2):
            newSeqs = seqGen(seqA, seqB)
            if newSeqs != None:
                Cs.extend(newSeqs)
        return [seq for seq in Cs if seq[1:] in Ls] #  Pruning  
    
  • テスト
  • testLs = [
        [1, 2, 3], 
        [1, 2, 4],
        [1, 3, 4],
        [1, 3, 5],
        [2, 3, 4]]
    CsGen(testLs)
    
    [[1, 2, 3, 4]]
    

    サブシーケンス判定
    def isSubSeq(seq, cusSeq):
        '''
        Check if a sequence is contained in a customer sequence.
        '''
        nSeq, nCusSeq = len(seq), len(cusSeq)
        if nSeq > nCusSeq:
            return False 
        if nSeq == 1:        
            return any([seq[0] in i for i in cusSeq])
        if nSeq > 1 :
            head = [seq[0] in i for i in cusSeq]
            if any(head):
                split = head.index(True)
                return isSubSeq(seq[1:], cusSeq[split + 1:]) # Recursion
            else:
                return False
    
  • テスト
  • seq = [3, 4, 8]
    cusSeq = [[7], [3, 8], [9], [4, 5, 6], [8]]
    isSubSeq(seq, cusSeq)
    
    True
    

    頻繁にkシーケンスが生成されます.このステップは反復的に実行する必要があります.
    def calcSupport(transformDS, Cs, min_support):
        '''
        Return: 1. a list of large-sequences
                2. a dictionary of `large-sequence: support` pairs
        '''
        supportLsk = {}; n = len(transformDS)
        if len(Cs) >= 1:
            for seq in Cs:
                support = sum([isSubSeq(seq, cusSeq) for cusSeq in transformDS]\
                             ) / n
                if support >= min_support:
                    supportLsk.update({tuple(seq): support})
        return [list(k) for k in supportLsk], supportLsk       
    
  • テスト
  • Cs2 = CsGen(Ls1)
    Ls2, supportLs2 = calcSupport(transformDS, Cs2, min_support)
    print(Ls2)
    print(supportLs2)
    
    [[0, 1], [0, 3], [0, 2], [0, 4]]
    {(0, 1): 0.4, (0, 3): 0.4, (0, 2): 0.4, (0, 4): 0.4}
    

    5、Maximal Phase

  • より迅速にサブシーケンスを検索するアルゴリズムは、R.Agrawal and R.Srikant.を参照することができる.Mining sequential patterns. Research Report RJ 9910, IBM Almaden Research Center, San Jose, California, Oc tober 1994.

  • 大きなシーケンスのアイテムセットを元のショッピングバスケットに変換してシーケンスを最大化する必要があります.
    tr_mapping = {v: k for k, v in mapping.items()}
    
    Ls = Ls1 + Ls2
    Ls = [[tr_mapping[k] for k in seq] for  seq in Ls ]
    
    supportLs = {}
    supportLs.update(supportLs1); supportLs.update(supportLs2)
    supportLs = {tuple([tr_mapping[i] for i in k]):v for k, v in \
                 supportLs.items()}
    
    print(supportLs)
    
    {(frozenset({40}),): 0.4, (frozenset({30}), frozenset({70})): 0.4, (frozenset({70}),): 0.6, (frozenset({30}), frozenset({40, 70})): 0.4, (frozenset({30}), frozenset({40})): 0.4, (frozenset({40, 70}),): 0.4, (frozenset({30}), frozenset({90})): 0.4, (frozenset({30}),): 0.8, (frozenset({90}),): 0.6}
    

    シーケンスの最大化で保持する必要があるのは、あるシーケンスの非空の真のサブシーケンス(非空の真のサブセットのように、ここではシーケンス自体を保持する)です.このステップは、Transformationフェーズでサブシーケンスを判断する方法と似ています.これは、アイテムセットがマッピングされたため、少し修正された点とは異なります.
    def isSubSeq2(seq, cusSeq):
        nSeq, nCusSeq = len(seq), len(cusSeq)
        
        if nSeq > nCusSeq:
            return False 
        if nSeq == 1:        
            return any([seq[0].issubset(i) for i in cusSeq])
        if nSeq > 1 :
            head = [seq[0].issubset(i) for i in cusSeq]
            if any(head):
                split = head.index(True)
                return isSubSeq2(seq[1:], cusSeq[split:]) # Recursion
            else:
                return False           
    
    def notProperSubSeq(seq, cusSeq):
        '''
        Return True if `seq` is not proper sub sequence of `cusSeq`
        '''
        if seq == cusSeq:
            return True
        else:
            return not isSubSeq2(seq, cusSeq)
    

    代替シーケンスの最大化されたシーケンスを保持
    def maxLs(Ls, supportLs):
        LsCopy = Ls.copy()
        lenL, lenC = len(Ls), len(LsCopy)
        while lenC > 1 and lenL > 1:
            if LsCopy[lenC - 1] in Ls:
                mask = [notProperSubSeq(seq, LsCopy[lenC - 1]) for seq in Ls]
                Ls = list(itertools.compress(Ls, mask))
                lenL = len(Ls)
                
            lenC -= 1
            
        supportLs = {tuple(seq): supportLs[tuple(seq)] for seq in Ls} 
        return Ls, supportLs
    
    Ls, supportLs = maxLs(Ls, supportLs)
    supportLs
    
    {(frozenset({30}), frozenset({40, 70})): 0.4,
     (frozenset({30}), frozenset({90})): 0.4}
    

    aprioriAll

    def aprioriAll(dataSet, min_support=0.4):
        '''
        Proceeding aprioriall algorithm to mining sequential patterns
        
        Refer to:    
        Agrawal,R.,Srikant,R.,Institute of Electric and Electronic 
        Engineer et al. Mining sequential patterns[C]. Proceedings 
        of the Eleventh International Conference on Data Engineering,
        Washington DC, USA: IEEE Computer Society,1995:3-14.
        '''
        # Litemset Phase
        mapping, supportLs1 = createLs1(dataSet, min_support)
        Ls1 = [list(k) for k in supportLs1]
        
        # Transformation Phase
        transformDS  = transform(dataSet, mapping)
        
        # Sequence Phase
        LsList = [Ls1]; supportLs = supportLs1.copy()
        k = 1
        while k >= 1 and len(LsList[-1]) > 1:
            Csk = CsGen(LsList[-1])
            Lsk, supportLsk = calcSupport(transformDS, Csk, min_support)
            if len(Lsk) > 0:
                LsList.append(Lsk); supportLs.update(supportLsk)
                k += 1
            else:
                break
                
        Ls = list(itertools.chain(*LsList))
        tr_mapping = {v: k for k, v in mapping.items()}
        Ls = [[tr_mapping[k] for k in seq] for  seq in Ls ]
        supportLs = {tuple([tr_mapping[i] for i in k]):v 
                     for k, v in supportLs.items()}
        
        # Maximal Phase
        Ls, supportLs = maxLs(Ls, supportLs)
           
        return pd.DataFrame(list(supportLs.items()), 
                            columns=['sequence', 'support'])
    
    aprioriAll(dataSet, min_support=0.25)
    

    sequence
    support
    0
    ((30), (40, 70))
    0.4
    1
    ((30), (90))
    0.4
    testSet = [
        [[1, 5], [2], [3], [4] ],
        [[1], [3], [4], [3, 5] ],
        [[1], [2], [3], [4]    ],
        [[1], [3], [5]         ],
        [[4], [5]              ]
        ]
    
    import sys
    sys.path.append('E:/myscripts')
    from aprioriAll import aprioriAll
    
    aprioriAll(testSet, min_support=0.4)
    

    sequence
    support
    0
    ((4), (5))
    0.4
    1
    ((1), (2), (3), (4))
    0.4
    2
    ((1), (3), (5))
    0.4

    適用

    import os, sys
    import itertools
    import pandas as pd
    
    sys.path.append('E:/myscripts')
    
    from aprioriAll import aprioriAll
    
    os.chdir('Q:/data')
    transactions = pd.read_csv('Transactions.csv')
    
    '''
    Sort phase: The database is sorted, with customer-id as the major key 
    and transaction-time as the minor key. This step implicitly 
    converts the original transaction database into a database of 
    customer sequences.
    '''
    
    '
    Sort phase: The database is sorted, with customer-id as the major key
    and transaction-time as the minor key. This step implicitly
    converts the original transaction database into a database of
    customer sequences.
    '
    def aggFunc(*args):
        agg = itertools.chain(*args)
        return list(agg)
    
    baskets = transactions['Model']\
        .groupby([transactions['OrderNumber'], transactions['LineNumber']])\
        .apply(aggFunc)
    baskets.head()
    
    OrderNumber  LineNumber
    SO51176      1                     [Road-250]
                 2             [Road Bottle Cage]
    SO51177      1                 [Touring-2000]
                 2                    [Sport-100]
    SO51178      1                 [Mountain-200]
    Name: Model, dtype: object
    
    dataSet = list(baskets.groupby(level=0).apply(list))
    dataSet[:3]
    
    [[['Road-250'], ['Road Bottle Cage']],
     [['Touring-2000'], ['Sport-100']],
     [['Mountain-200'], ['Mountain Bottle Cage'], ['Water Bottle']]]
    
    aprioriAll(dataSet, min_support=0.05).head()
    

    sequence
    support
    0
    ((Short-Sleeve Classic Jersey),)
    0.072312
    1
    ((Long-Sleeve Logo Jersey),)
    0.077252
    2
    ((Patch kit),)
    0.141614
    3
    ((Road-750),)
    0.067890
    4
    ((Road Bottle Cage),)
    0.080075