aprioriAll

54753 ワード

データ科学に精通する

データセットのテスト

seq1 = [           [30], [90]          ]
seq2 = [ [10, 20], [30], [40, 60, 70]  ]
seq3 = [         [30, 50, 70],         ]
seq4 = [      [30], [40, 70], [90]     ]
seq5 = [              [90],            ]
dataSet = [seq1, seq2, seq3, seq4, seq5]
min_support=0.25

import sys
sys.path.append('E:/myscripts')

import itertools
import pandas as pd
from apriori import apriori

1、Sort Phase

プロセス略、整理されたサンプルデータを直接使用

2、Litemset Phase

litemsetを検索し、aprioriアルゴリズムを直接使用します.主な違いは、サポート度を計算する際に、1人の顧客customerが同じアイテムセット(itemset)を購入した場合、サポート頻度は1回のみ計算されます.これはaprioriアルゴリズムではサポート度がトランザクション(transaction)にとって望ましいが,シーケンスモードの計算では大きなアイテムセットのサポート度がクライアント(customer)にとって望ましいためである.

def createLs1(dataSet, min_support):
    '''
    Using  algorithm apriorito mining large 1-sequences 
    `Ls` for Large Sequence
    '''
    n = len(dataSet)
    flattenSet = list(itertools.chain(*dataSet))
    flatten_n = len(flattenSet)
    
    # Transform the min_support to litemset_support
    min_support_new = min_support * n /flatten_n
    litemsets = apriori(flattenSet, min_support=min_support_new)
    mapping = {v: k for k, v in enumerate(litemsets)}
    
    # Transform the litemset_support to sequence_support
    supportLs1 = {(mapping[k],): v *flatten_n / n 
                  for k, v in litemsets.items()}
    
    return mapping, supportLs1

テスト

mapping, supportLs1 = createLs1(dataSet, min_support=min_support)
mapping

{frozenset({30}): 0,
 frozenset({70}): 1,
 frozenset({90}): 2,
 frozenset({40}): 4,
 frozenset({40, 70}): 3}

supportLs1

{(0,): 0.8, (1,): 0.6, (2,): 0.6, (3,): 0.4, (4,): 0.4}

Ls1 = [list(k) for k in supportLs1]
Ls1

[[2], [0], [3], [1], [4]]

3、Transformation Phase

def seqMapping(seq, mapping):
    '''
    Mapping litemsets to integer objects, for treating litemsets as
    single entities, and reducing the time required 
    '''
    newSeq = []
    for iSet in seq:
        newSet = [v for k, v in mapping.items() if k <= set(iSet)]
        if newSet != []:
            newSeq.append(newSet)
    return newSeq

def transform(dataSet, mapping):
    '''
    Transform each customer sequence into an alternative representation.
    '''
    transformDS = []
    for seq in dataSet:
        newSeq = seqMapping(seq, mapping)
        if newSeq != []:
            transformDS.append(newSeq)
    return transformDS

テスト

transformDS  = transform(dataSet, mapping)
for seq in transformDS :
    print(seq)

[[0], [2]]
[[0], [1, 4, 3]]
[[0, 1]]
[[0], [1, 4, 3], [2]]
[[2]]

4、Sequence Phase

候補シーケンスの生成

def seqGen(seqA, seqB):
    '''
    Generate candidate k+1 sequences with two large k-sequences
    '''
    newA, newB = seqA.copy(), seqB.copy()
    if seqA[:-1] == seqB[:-1]:
        newA.append(seqB[-1])
        newB.append(seqA[-1])
        return [newA, newB]

def CsGen(Ls):
    '''
    Generate all candidate k+1 sequences from large k-sequences
    '''
    Cs = []
    for seqA, seqB in itertools.combinations(Ls, 2):
        newSeqs = seqGen(seqA, seqB)
        if newSeqs != None:
            Cs.extend(newSeqs)
    return [seq for seq in Cs if seq[1:] in Ls] #  Pruning

テスト

testLs = [
    [1, 2, 3], 
    [1, 2, 4],
    [1, 3, 4],
    [1, 3, 5],
    [2, 3, 4]]
CsGen(testLs)

[[1, 2, 3, 4]]

サブシーケンス判定

def isSubSeq(seq, cusSeq):
    '''
    Check if a sequence is contained in a customer sequence.
    '''
    nSeq, nCusSeq = len(seq), len(cusSeq)
    if nSeq > nCusSeq:
        return False 
    if nSeq == 1:        
        return any([seq[0] in i for i in cusSeq])
    if nSeq > 1 :
        head = [seq[0] in i for i in cusSeq]
        if any(head):
            split = head.index(True)
            return isSubSeq(seq[1:], cusSeq[split + 1:]) # Recursion
        else:
            return False

テスト

seq = [3, 4, 8]
cusSeq = [[7], [3, 8], [9], [4, 5, 6], [8]]
isSubSeq(seq, cusSeq)

True

頻繁にkシーケンスが生成されます.このステップは反復的に実行する必要があります.

def calcSupport(transformDS, Cs, min_support):
    '''
    Return: 1. a list of large-sequences
            2. a dictionary of `large-sequence: support` pairs
    '''
    supportLsk = {}; n = len(transformDS)
    if len(Cs) >= 1:
        for seq in Cs:
            support = sum([isSubSeq(seq, cusSeq) for cusSeq in transformDS]\
                         ) / n
            if support >= min_support:
                supportLsk.update({tuple(seq): support})
    return [list(k) for k in supportLsk], supportLsk

テスト

Cs2 = CsGen(Ls1)
Ls2, supportLs2 = calcSupport(transformDS, Cs2, min_support)
print(Ls2)
print(supportLs2)

[[0, 1], [0, 3], [0, 2], [0, 4]]
{(0, 1): 0.4, (0, 3): 0.4, (0, 2): 0.4, (0, 4): 0.4}

5、Maximal Phase

より迅速にサブシーケンスを検索するアルゴリズムは、R.Agrawal and R.Srikant.を参照することができる.Mining sequential patterns. Research Report RJ 9910, IBM Almaden Research Center, San Jose, California, Oc tober 1994.

大きなシーケンスのアイテムセットを元のショッピングバスケットに変換してシーケンスを最大化する必要があります.

tr_mapping = {v: k for k, v in mapping.items()}

Ls = Ls1 + Ls2
Ls = [[tr_mapping[k] for k in seq] for  seq in Ls ]

supportLs = {}
supportLs.update(supportLs1); supportLs.update(supportLs2)
supportLs = {tuple([tr_mapping[i] for i in k]):v for k, v in \
             supportLs.items()}

print(supportLs)

{(frozenset({40}),): 0.4, (frozenset({30}), frozenset({70})): 0.4, (frozenset({70}),): 0.6, (frozenset({30}), frozenset({40, 70})): 0.4, (frozenset({30}), frozenset({40})): 0.4, (frozenset({40, 70}),): 0.4, (frozenset({30}), frozenset({90})): 0.4, (frozenset({30}),): 0.8, (frozenset({90}),): 0.6}

シーケンスの最大化で保持する必要があるのは、あるシーケンスの非空の真のサブシーケンス(非空の真のサブセットのように、ここではシーケンス自体を保持する)です.このステップは、Transformationフェーズでサブシーケンスを判断する方法と似ています.これは、アイテムセットがマッピングされたため、少し修正された点とは異なります.

def isSubSeq2(seq, cusSeq):
    nSeq, nCusSeq = len(seq), len(cusSeq)
    
    if nSeq > nCusSeq:
        return False 
    if nSeq == 1:        
        return any([seq[0].issubset(i) for i in cusSeq])
    if nSeq > 1 :
        head = [seq[0].issubset(i) for i in cusSeq]
        if any(head):
            split = head.index(True)
            return isSubSeq2(seq[1:], cusSeq[split:]) # Recursion
        else:
            return False           

def notProperSubSeq(seq, cusSeq):
    '''
    Return True if `seq` is not proper sub sequence of `cusSeq`
    '''
    if seq == cusSeq:
        return True
    else:
        return not isSubSeq2(seq, cusSeq)

代替シーケンスの最大化されたシーケンスを保持

def maxLs(Ls, supportLs):
    LsCopy = Ls.copy()
    lenL, lenC = len(Ls), len(LsCopy)
    while lenC > 1 and lenL > 1:
        if LsCopy[lenC - 1] in Ls:
            mask = [notProperSubSeq(seq, LsCopy[lenC - 1]) for seq in Ls]
            Ls = list(itertools.compress(Ls, mask))
            lenL = len(Ls)
            
        lenC -= 1
        
    supportLs = {tuple(seq): supportLs[tuple(seq)] for seq in Ls} 
    return Ls, supportLs

Ls, supportLs = maxLs(Ls, supportLs)
supportLs

{(frozenset({30}), frozenset({40, 70})): 0.4,
 (frozenset({30}), frozenset({90})): 0.4}

aprioriAll

def aprioriAll(dataSet, min_support=0.4):
    '''
    Proceeding aprioriall algorithm to mining sequential patterns
    
    Refer to:    
    Agrawal,R.,Srikant,R.,Institute of Electric and Electronic 
    Engineer et al. Mining sequential patterns[C]. Proceedings 
    of the Eleventh International Conference on Data Engineering,
    Washington DC, USA: IEEE Computer Society,1995:3-14.
    '''
    # Litemset Phase
    mapping, supportLs1 = createLs1(dataSet, min_support)
    Ls1 = [list(k) for k in supportLs1]
    
    # Transformation Phase
    transformDS  = transform(dataSet, mapping)
    
    # Sequence Phase
    LsList = [Ls1]; supportLs = supportLs1.copy()
    k = 1
    while k >= 1 and len(LsList[-1]) > 1:
        Csk = CsGen(LsList[-1])
        Lsk, supportLsk = calcSupport(transformDS, Csk, min_support)
        if len(Lsk) > 0:
            LsList.append(Lsk); supportLs.update(supportLsk)
            k += 1
        else:
            break
            
    Ls = list(itertools.chain(*LsList))
    tr_mapping = {v: k for k, v in mapping.items()}
    Ls = [[tr_mapping[k] for k in seq] for  seq in Ls ]
    supportLs = {tuple([tr_mapping[i] for i in k]):v 
                 for k, v in supportLs.items()}
    
    # Maximal Phase
    Ls, supportLs = maxLs(Ls, supportLs)
       
    return pd.DataFrame(list(supportLs.items()), 
                        columns=['sequence', 'support'])

aprioriAll(dataSet, min_support=0.25)

sequence
support
0
((30), (40, 70))
0.4
1
((30), (90))
0.4

testSet = [
    [[1, 5], [2], [3], [4] ],
    [[1], [3], [4], [3, 5] ],
    [[1], [2], [3], [4]    ],
    [[1], [3], [5]         ],
    [[4], [5]              ]
    ]

import sys
sys.path.append('E:/myscripts')
from aprioriAll import aprioriAll

aprioriAll(testSet, min_support=0.4)

sequence
support
0
((4), (5))
0.4
1
((1), (2), (3), (4))
0.4
2
((1), (3), (5))
0.4

適用

import os, sys
import itertools
import pandas as pd

sys.path.append('E:/myscripts')

from aprioriAll import aprioriAll

os.chdir('Q:/data')
transactions = pd.read_csv('Transactions.csv')

'''
Sort phase: The database is sorted, with customer-id as the major key 
and transaction-time as the minor key. This step implicitly 
converts the original transaction database into a database of 
customer sequences.
'''

'
Sort phase: The database is sorted, with customer-id as the major key 
and transaction-time as the minor key. This step implicitly 
converts the original transaction database into a database of 
customer sequences.
'

def aggFunc(*args):
    agg = itertools.chain(*args)
    return list(agg)

baskets = transactions['Model']\
    .groupby([transactions['OrderNumber'], transactions['LineNumber']])\
    .apply(aggFunc)
baskets.head()

OrderNumber  LineNumber
SO51176      1                     [Road-250]
             2             [Road Bottle Cage]
SO51177      1                 [Touring-2000]
             2                    [Sport-100]
SO51178      1                 [Mountain-200]
Name: Model, dtype: object

dataSet = list(baskets.groupby(level=0).apply(list))
dataSet[:3]

[[['Road-250'], ['Road Bottle Cage']],
 [['Touring-2000'], ['Sport-100']],
 [['Mountain-200'], ['Mountain Bottle Cage'], ['Water Bottle']]]

aprioriAll(dataSet, min_support=0.05).head()

sequence
support
0
((Short-Sleeve Classic Jersey),)
0.072312
1
((Long-Sleeve Logo Jersey),)
0.077252
2
((Patch kit),)
0.141614
3
((Road-750),)
0.067890
4
((Road Bottle Cage),)
0.080075

jackson 2.9.9.3逆シーケンス化バグ

yamlプロファイルの作成時の問題の概要