aprioriAll
54753 ワード
データセットのテスト
seq1 = [ [30], [90] ]
seq2 = [ [10, 20], [30], [40, 60, 70] ]
seq3 = [ [30, 50, 70], ]
seq4 = [ [30], [40, 70], [90] ]
seq5 = [ [90], ]
dataSet = [seq1, seq2, seq3, seq4, seq5]
min_support=0.25
import sys
sys.path.append('E:/myscripts')
import itertools
import pandas as pd
from apriori import apriori
1、Sort Phase
プロセス略、整理されたサンプルデータを直接使用
2、Litemset Phase
litemsetを検索し、aprioriアルゴリズムを直接使用します.主な違いは、サポート度を計算する際に、1人の顧客customerが同じアイテムセット(itemset)を購入した場合、サポート頻度は1回のみ計算されます.これはaprioriアルゴリズムではサポート度がトランザクション(transaction)にとって望ましいが,シーケンスモードの計算では大きなアイテムセットのサポート度がクライアント(customer)にとって望ましいためである.
def createLs1(dataSet, min_support):
    '''
    Mine the large 1-sequences (litemsets) with the apriori algorithm.
    `Ls` stands for "Large Sequence".

    Because sequence support is counted per customer while apriori
    counts per transaction, the threshold is rescaled before mining
    and the resulting supports are rescaled back afterwards.

    Returns
    -------
    mapping : dict
        litemset (frozenset) -> integer id.
    supportLs1 : dict
        1-tuple of litemset id -> customer-level support.
    '''
    n_customers = len(dataSet)
    transactions = list(itertools.chain(*dataSet))
    n_transactions = len(transactions)
    # Rescale the customer-level support threshold to transaction level.
    adjusted_support = min_support * n_customers / n_transactions
    litemsets = apriori(transactions, min_support=adjusted_support)
    mapping = {itemset: lid for lid, itemset in enumerate(litemsets)}
    # Rescale transaction-level supports back to customer level.
    supportLs1 = {(mapping[itemset],): sup * n_transactions / n_customers
                  for itemset, sup in litemsets.items()}
    return mapping, supportLs1
mapping, supportLs1 = createLs1(dataSet, min_support=min_support)
mapping
{frozenset({30}): 0,
frozenset({70}): 1,
frozenset({90}): 2,
frozenset({40}): 4,
frozenset({40, 70}): 3}
supportLs1
{(0,): 0.8, (1,): 0.6, (2,): 0.6, (3,): 0.4, (4,): 0.4}
Ls1 = [list(k) for k in supportLs1]
Ls1
[[2], [0], [3], [1], [4]]
3、Transformation Phase
def seqMapping(seq, mapping):
    '''
    Replace every itemset of `seq` by the list of ids of the litemsets
    it contains, treating litemsets as single entities to reduce the
    time required by later phases.  Itemsets containing no litemset
    are dropped from the result.
    '''
    newSeq = []
    for itemset in seq:
        items = set(itemset)
        ids = [lid for litemset, lid in mapping.items() if litemset <= items]
        if ids:
            newSeq.append(ids)
    return newSeq
def transform(dataSet, mapping):
    '''
    Transform each customer sequence into its litemset-id
    representation; customer sequences that map to nothing are
    dropped entirely.
    '''
    mapped = (seqMapping(seq, mapping) for seq in dataSet)
    return [newSeq for newSeq in mapped if newSeq]
transformDS = transform(dataSet, mapping)
for seq in transformDS :
print(seq)
[[0], [2]]
[[0], [1, 4, 3]]
[[0, 1]]
[[0], [1, 4, 3], [2]]
[[2]]
4、Sequence Phase
候補シーケンスの生成
def seqGen(seqA, seqB):
    '''
    Join two large k-sequences into candidate (k+1)-sequences.

    The pair is joinable only when both share the same (k-1)-prefix;
    otherwise None is returned.
    '''
    if seqA[:-1] != seqB[:-1]:
        return None
    # Extend each sequence with the other's final element.
    return [seqA + [seqB[-1]], seqB + [seqA[-1]]]
def CsGen(Ls):
    '''
    Generate all candidate (k+1)-sequences from the large k-sequences.

    Every pair of joinable k-sequences is joined via `seqGen`, then a
    candidate is kept only if its tail (the candidate minus its first
    element) is itself a large k-sequence.  The head (candidate minus
    its last element) is one of the joined sequences, so it is large
    by construction and needs no check.
    '''
    Cs = []
    for seqA, seqB in itertools.combinations(Ls, 2):
        newSeqs = seqGen(seqA, seqB)
        # `is not None` (identity) instead of `!= None` (equality).
        if newSeqs is not None:
            Cs.extend(newSeqs)
    return [seq for seq in Cs if seq[1:] in Ls]  # Pruning
testLs = [
[1, 2, 3],
[1, 2, 4],
[1, 3, 4],
[1, 3, 5],
[2, 3, 4]]
CsGen(testLs)
[[1, 2, 3, 4]]
サブシーケンス判定
def isSubSeq(seq, cusSeq):
    '''
    Check whether `seq` is contained in the customer sequence `cusSeq`.

    Each element of `seq` must be a member of a distinct itemset of
    `cusSeq`, and the matched itemsets must occur in order (greedy
    first-match is sufficient for this containment test).

    Fix: an empty `seq` now returns True (the empty sequence is
    contained in every sequence); the original fell through all
    branches and returned None, which would break `sum()` callers.
    '''
    if not seq:
        return True
    if len(seq) > len(cusSeq):
        return False
    for split, itemset in enumerate(cusSeq):
        if seq[0] in itemset:
            if len(seq) == 1:
                return True
            # Remaining elements must match strictly later itemsets.
            return isSubSeq(seq[1:], cusSeq[split + 1:])
    return False
seq = [3, 4, 8]
cusSeq = [[7], [3, 8], [9], [4, 5, 6], [8]]
isSubSeq(seq, cusSeq)
True
頻繁にkシーケンスが生成されます.このステップは反復的に実行する必要があります.
def calcSupport(transformDS, Cs, min_support):
    '''
    Compute the support of each candidate sequence over the
    transformed customer sequences.

    Return: 1. a list of large-sequences
            2. a dictionary of `large-sequence: support` pairs
    '''
    n = len(transformDS)
    supportLsk = {}
    for candidate in Cs:
        # Support = fraction of customers whose sequence contains it.
        hits = sum(isSubSeq(candidate, cusSeq) for cusSeq in transformDS)
        support = hits / n
        if support >= min_support:
            supportLsk[tuple(candidate)] = support
    return [list(key) for key in supportLsk], supportLsk
Cs2 = CsGen(Ls1)
Ls2, supportLs2 = calcSupport(transformDS, Cs2, min_support)
print(Ls2)
print(supportLs2)
[[0, 1], [0, 3], [0, 2], [0, 4]]
{(0, 1): 0.4, (0, 3): 0.4, (0, 2): 0.4, (0, 4): 0.4}
5、Maximal Phase
大きなシーケンスのアイテムセットを元のショッピングバスケットに変換してシーケンスを最大化する必要があります.
tr_mapping = {v: k for k, v in mapping.items()}
Ls = Ls1 + Ls2
Ls = [[tr_mapping[k] for k in seq] for seq in Ls ]
supportLs = {}
supportLs.update(supportLs1); supportLs.update(supportLs2)
supportLs = {tuple([tr_mapping[i] for i in k]):v for k, v in \
supportLs.items()}
print(supportLs)
{(frozenset({40}),): 0.4, (frozenset({30}), frozenset({70})): 0.4, (frozenset({70}),): 0.6, (frozenset({30}), frozenset({40, 70})): 0.4, (frozenset({30}), frozenset({40})): 0.4, (frozenset({40, 70}),): 0.4, (frozenset({30}), frozenset({90})): 0.4, (frozenset({30}),): 0.8, (frozenset({90}),): 0.6}
シーケンスの最大化で保持する必要があるのは、他のシーケンスの非空の真のサブシーケンスになっていないシーケンス(非空の真のサブセットの考え方と同様だが、ここではシーケンス自体は保持する)です.このステップのサブシーケンス判定は、Transformationフェーズでの判定方法と似ていますが、アイテムセットが整数にマッピングされているため、判定を少し修正している点が異なります.
def isSubSeq2(seq, cusSeq):
    '''
    Check whether `seq` (a list of frozensets) is contained in
    `cusSeq` (a list of frozensets), using subset tests per itemset.

    NOTE(review): unlike `isSubSeq`, the recursion restarts at the
    matching position (`split`, not `split + 1`), so consecutive
    elements of `seq` may match the same itemset of `cusSeq` — this
    is the deliberate modification for the Maximal phase mentioned
    in the surrounding text; confirm it matches the intended
    containment definition.
    '''
    nSeq = len(seq)
    if nSeq > len(cusSeq):
        return False
    if nSeq == 1:
        return any(seq[0].issubset(itemset) for itemset in cusSeq)
    if nSeq > 1:
        for split, itemset in enumerate(cusSeq):
            if seq[0].issubset(itemset):
                return isSubSeq2(seq[1:], cusSeq[split:])  # Recursion
        return False
def notProperSubSeq(seq, cusSeq):
    '''
    Return True if `seq` is NOT a proper subsequence of `cusSeq`
    (a sequence counts as "not proper" with respect to itself).
    '''
    return True if seq == cusSeq else not isSubSeq2(seq, cusSeq)
代替シーケンスの最大化されたシーケンスを保持
def maxLs(Ls, supportLs):
    '''
    Keep only the maximal sequences: walk the candidate list from the
    end and, for each candidate still surviving, discard every proper
    subsequence of it.  The matching supports are rebuilt from the
    surviving sequences.
    '''
    candidates = Ls.copy()
    idx = len(candidates) - 1
    # Stop early once only a single sequence survives.
    while idx >= 1 and len(Ls) > 1:
        cand = candidates[idx]
        if cand in Ls:
            Ls = [seq for seq in Ls if notProperSubSeq(seq, cand)]
        idx -= 1
    supportLs = {tuple(seq): supportLs[tuple(seq)] for seq in Ls}
    return Ls, supportLs
Ls, supportLs = maxLs(Ls, supportLs)
supportLs
{(frozenset({30}), frozenset({40, 70})): 0.4,
(frozenset({30}), frozenset({90})): 0.4}
aprioriAll
def aprioriAll(dataSet, min_support=0.4):
    '''
    Mine sequential patterns with the AprioriAll algorithm.

    Refer to:
    Agrawal,R.,Srikant,R.,Institute of Electric and Electronic
    Engineer et al. Mining sequential patterns[C]. Proceedings
    of the Eleventh International Conference on Data Engineering,
    Washington DC, USA: IEEE Computer Society,1995:3-14.
    '''
    # Litemset Phase: large 1-sequences and the itemset -> id mapping.
    mapping, supportLs1 = createLs1(dataSet, min_support)
    Ls1 = [list(key) for key in supportLs1]
    # Transformation Phase: rewrite customer sequences as litemset ids.
    transformDS = transform(dataSet, mapping)
    # Sequence Phase: iteratively grow large k-sequences.
    LsList = [Ls1]
    supportLs = dict(supportLs1)
    while len(LsList[-1]) > 1:
        Csk = CsGen(LsList[-1])
        Lsk, supportLsk = calcSupport(transformDS, Csk, min_support)
        if not Lsk:
            break
        LsList.append(Lsk)
        supportLs.update(supportLsk)
    # Translate litemset ids back to the original itemsets.
    tr_mapping = {v: k for k, v in mapping.items()}
    Ls = [[tr_mapping[i] for i in seq]
          for seq in itertools.chain(*LsList)]
    supportLs = {tuple(tr_mapping[i] for i in key): sup
                 for key, sup in supportLs.items()}
    # Maximal Phase: keep only maximal sequences.
    Ls, supportLs = maxLs(Ls, supportLs)
    return pd.DataFrame(list(supportLs.items()),
                        columns=['sequence', 'support'])
aprioriAll(dataSet, min_support=0.25)
sequence
support
0
((30), (40, 70))
0.4
1
((30), (90))
0.4
testSet = [
[[1, 5], [2], [3], [4] ],
[[1], [3], [4], [3, 5] ],
[[1], [2], [3], [4] ],
[[1], [3], [5] ],
[[4], [5] ]
]
import sys
sys.path.append('E:/myscripts')
from aprioriAll import aprioriAll
aprioriAll(testSet, min_support=0.4)
sequence
support
0
((4), (5))
0.4
1
((1), (2), (3), (4))
0.4
2
((1), (3), (5))
0.4
適用
import os, sys
import itertools
import pandas as pd
sys.path.append('E:/myscripts')
from aprioriAll import aprioriAll
os.chdir('Q:/data')
transactions = pd.read_csv('Transactions.csv')
'''
Sort phase: The database is sorted, with customer-id as the major key
and transaction-time as the minor key. This step implicitly
converts the original transaction database into a database of
customer sequences.
'''
'
Sort phase: The database is sorted, with customer-id as the major key
and transaction-time as the minor key. This step implicitly
converts the original transaction database into a database of
customer sequences.
'
def aggFunc(*args):
    '''Concatenate any number of iterables into one flat list.'''
    return list(itertools.chain.from_iterable(args))
baskets = transactions['Model']\
.groupby([transactions['OrderNumber'], transactions['LineNumber']])\
.apply(aggFunc)
baskets.head()
OrderNumber LineNumber
SO51176 1 [Road-250]
2 [Road Bottle Cage]
SO51177 1 [Touring-2000]
2 [Sport-100]
SO51178 1 [Mountain-200]
Name: Model, dtype: object
dataSet = list(baskets.groupby(level=0).apply(list))
dataSet[:3]
[[['Road-250'], ['Road Bottle Cage']],
[['Touring-2000'], ['Sport-100']],
[['Mountain-200'], ['Mountain Bottle Cage'], ['Water Bottle']]]
aprioriAll(dataSet, min_support=0.05).head()
sequence
support
0
((Short-Sleeve Classic Jersey),)
0.072312
1
((Long-Sleeve Logo Jersey),)
0.077252
2
((Patch kit),)
0.141614
3
((Road-750),)
0.067890
4
((Road Bottle Cage),)
0.080075