@@ -61,40 +61,36 @@ def generate_new_combinations(old_combinations):
6161 # early exit from for-loop skips else clause just below
6262 break
6363 else :
64- yield from candidate
64+ yield candidate
6565 j = j + 1
6666
6767
def generate_supports_and_itemsets(X, is_sparse, combin, min_support):
    """Yield support counts and items of frequent candidate itemsets.

    For each candidate itemset whose absolute support count reaches
    ``min_support``, yield its support count followed by each of its
    items (a flat integer stream, suitable for ``np.fromiter``).  After
    all candidates are processed, yield one trailing value: the total
    number of itemsets examined, so the caller can report progress.

    Parameters
    ----------
    X : ndarray or sparse matrix, shape (n_rows, n_items)
        One-hot transaction matrix.  When ``is_sparse`` is True the code
        reads ``X.indices``/``X.indptr`` per column, i.e. it assumes
        CSC layout — TODO confirm against the caller.
    is_sparse : bool
        Selects the sparse or dense code path.
    combin : iterable of iterables of int
        Candidate itemsets, each an iterable of column indices.
    min_support : int
        Minimum absolute support count (not a fraction) an itemset must
        reach to be yielded.

    Yields
    ------
    int
        ``support, item_0, ..., item_k`` for every frequent itemset,
        then the total count of processed itemsets as the last element.
    """
    counter = 0
    if is_sparse:
        # Reusable per-row counter: count[r] == number of the itemset's
        # columns that are nonzero in row r.
        count = np.empty(X.shape[0], dtype=int)
        for itemset in combin:
            counter += 1
            count[:] = 0
            for item in itemset:
                # much faster than X[:, item].toarray() or
                # X.getcol(item).indices
                count[X.indices[X.indptr[item]:X.indptr[item + 1]]] += 1
            # A row supports the itemset iff it contains every item.
            support = np.count_nonzero(count == len(itemset))
            if support >= min_support:
                yield support
                yield from itemset
    else:
        for itemset in combin:
            counter += 1
            _bools = np.ones(X.shape[0], dtype=bool)
            for item in itemset:
                _bools[:] &= X[:, item]
            support = np.count_nonzero(_bools)
            if support >= min_support:
                yield support
                yield from itemset
    # return the total of processed itemsets as last element
    yield counter
9894
9995
10096def apriori (df , min_support = 0.5 , use_colnames = False , max_len = None , verbose = 0 ,
@@ -223,38 +219,25 @@ def _support(_x, _n_rows, _is_sparse):
223219 support_dict = {1 : support [support >= min_support ]}
224220 itemset_dict = {1 : ary_col_idx [support >= min_support ].reshape (- 1 , 1 )}
225221 max_itemset = 1
226- rows_count = float (X .shape [0 ])
227-
228- all_ones = np .ones ((int (rows_count ), 1 ))
229222
230223 while max_itemset and max_itemset < (max_len or float ('inf' )):
231224 next_max_itemset = max_itemset + 1
232225
233226 combin = generate_new_combinations (itemset_dict [max_itemset ])
234- combin = np .fromiter (combin , dtype = int )
235- combin = combin .reshape (- 1 , next_max_itemset )
227+ gen_itemsets = generate_supports_and_itemsets (X , is_sparse , combin , int (min_support * X .shape [0 ]))
228+
229+ support_valid_itemsets = np .fromiter (gen_itemsets , dtype = int )
230+ processed_itemsets = support_valid_itemsets [- 1 ]
236231
237- if combin .size == 0 :
238- break
239232 if verbose :
240233 print (
241- '\r Processing %d combinations | Sampling itemset size %d' %
242- (combin .size , next_max_itemset ), end = "" )
243-
244- # With exceptionally large datasets, the matrix operations can use a
245- # substantial amount of memory. For low memory applications or large
246- # datasets, set `low_memory=True` to use a slower but more memory-
247- # efficient implementation.
248- if low_memory :
249- support = compute_supports_low_memory (X , is_sparse , combin )
250- else :
251- support = compute_supports (X , is_sparse , combin )
252- support /= rows_count
234+ '\r Processed %d combinations | Sampling itemset size %d' %
235+ (processed_itemsets , next_max_itemset ), end = "" )
253236
254- _mask = ( support >= min_support )
255- if any ( _mask ) :
256- itemset_dict [next_max_itemset ] = np . array ( combin [ _mask ])
257- support_dict [next_max_itemset ] = np . array ( support [ _mask ])
237+ support_valid_itemsets = support_valid_itemsets [: - 1 ]. reshape ( - 1 , 1 + next_max_itemset )
238+ if support_valid_itemsets . size > 0 :
239+ itemset_dict [next_max_itemset ] = support_valid_itemsets [:, 1 :]
240+ support_dict [next_max_itemset ] = support_valid_itemsets [:, 0 ] / X . shape [ 0 ]
258241 max_itemset = next_max_itemset
259242 else :
260243 # Exit condition
0 commit comments