@@ -61,40 +61,37 @@ def generate_new_combinations(old_combinations):
6161 # early exit from for-loop skips else clause just below
6262 break
6363 else :
64- yield from candidate
64+ yield candidate
6565 j = j + 1
6666
6767
def generate_supports_and_itemsets(X, is_sparse, combin, min_support):
    """Stream supports and itemsets for candidates meeting ``min_support``.

    For every candidate itemset drawn from ``combin`` whose absolute support
    (row count) is at least ``min_support``, yields the support followed by
    each item of the itemset, flattened.  The very last yielded value is the
    total number of candidates processed, so the consumer can recover it as
    the final element of the flattened stream.

    Parameters
    ----------
    X : 2d array or sparse matrix
        One-hot transaction matrix (rows = transactions, columns = items).
        When ``is_sparse`` is true, ``X`` is accessed through its
        ``data``/``indices``/``indptr`` attributes — assumes CSC layout so
        that ``indptr`` slices select a column; TODO confirm against caller.
    is_sparse : bool
        Selects the sparse or the dense counting path.
    combin : iterable of itemsets
        Candidate itemsets (sequences of column indices).
    min_support : int
        Minimum absolute support (number of rows) a candidate must reach.
    """
    processed = 0
    if is_sparse:
        # Per-row hit counter, allocated once and zeroed per candidate.
        hits = np.empty(X.shape[0], dtype=int)
        for itemset in combin:
            processed += 1
            hits[:] = 0
            for item in itemset:
                start, stop = X.indptr[item], X.indptr[item + 1]
                # Drop explicitly stored zeros before counting.
                # Much faster than X[:, item].toarray() or
                # X.getcol(item).indices.
                stored = np.nonzero(X.data[start:stop])[0]
                hits[X.indices[start:stop][stored]] += 1
            # A row supports the itemset iff it contains every item of it.
            support = np.count_nonzero(hits == len(itemset))
            if support >= min_support:
                yield support
                yield from itemset
    else:
        for itemset in combin:
            processed += 1
            mask = np.ones(X.shape[0], dtype=bool)
            for item in itemset:
                np.logical_and(mask, X[:, item], out=mask)
            support = np.count_nonzero(mask)
            if support >= min_support:
                yield support
                yield from itemset
    # return the total of processed itemsets as last element
    yield processed
9895
9996
10097def apriori (df , min_support = 0.5 , use_colnames = False , max_len = None , verbose = 0 ,
@@ -223,37 +220,24 @@ def _support(_x, _n_rows, _is_sparse):
223220 support_dict = {1 : support [support >= min_support ]}
224221 itemset_dict = {1 : ary_col_idx [support >= min_support ].reshape (- 1 , 1 )}
225222 max_itemset = 1
226- rows_count = float (X .shape [0 ])
227-
228- all_ones = np .ones ((int (rows_count ), 1 ))
229223
230224 while max_itemset and max_itemset < (max_len or float ('inf' )):
231225 next_max_itemset = max_itemset + 1
232226
233227 combin = generate_new_combinations (itemset_dict [max_itemset ])
234- combin = np .fromiter (combin , dtype = int )
235- combin = combin .reshape (- 1 , next_max_itemset )
236-
237- if combin .size == 0 :
238- break
239- if verbose :
240- print (
241- '\r Processing %d combinations | Sampling itemset size %d' %
242- (combin .size , next_max_itemset ), end = "" )
243-
244- # With exceptionally large datasets, the matrix operations can use a
245- # substantial amount of memory. For low memory applications or large
246- # datasets, set `low_memory=True` to use a slower but more memory-
247- # efficient implementation.
248- if low_memory :
249- support = compute_supports_low_memory (X , is_sparse , combin ) / rows_count
250- else :
251- support = compute_supports (X , is_sparse , combin ) / rows_count
252-
253- _mask = (support >= min_support )
254- if any (_mask ):
255- itemset_dict [next_max_itemset ] = np .array (combin [_mask ])
256- support_dict [next_max_itemset ] = np .array (support [_mask ])
228+ gen_itemsets = generate_supports_and_itemsets (X , is_sparse , combin , np .math .ceil (min_support * X .shape [0 ]))
229+
230+ support_valid_itemsets = np .fromiter (gen_itemsets , dtype = int )
231+ processed_itemsets = support_valid_itemsets [- 1 ]
232+
233+ support_valid_itemsets = support_valid_itemsets [:- 1 ].reshape (- 1 , 1 + next_max_itemset )
234+ if support_valid_itemsets .size > 0 :
235+ if verbose :
236+ print (
237+ '\r Processed %d combinations | Sampling itemset size %d' %
238+ (processed_itemsets , next_max_itemset ), end = "" )
239+ itemset_dict [next_max_itemset ] = support_valid_itemsets [:, 1 :]
240+ support_dict [next_max_itemset ] = support_valid_itemsets [:, 0 ] / X .shape [0 ]
257241 max_itemset = next_max_itemset
258242 else :
259243 # Exit condition
0 commit comments