@@ -61,40 +61,36 @@ def generate_new_combinations(old_combinations):
                 # early exit from for-loop skips else clause just below
                 break
             else:
-                yield from candidate
+                yield candidate
             j = j + 1


-def compute_supports_low_memory(X, is_sparse, combin):
-    supports = np.zeros(combin.shape[0])
-    ncomb, nitems = combin.shape
+def generate_supports_and_itemsets(X, is_sparse, combin, min_support):
+    counter = 0
     if is_sparse:
-        _bools = X[:, 0].toarray()
-        for c in range(ncomb):
-            _bools[:] = X[:, combin[c, 0]].toarray()
-            for j in range(1, nitems):
-                np.logical_and(_bools, X[:, combin[c, j]].toarray(), out=_bools)
-            supports[c] = np.count_nonzero(_bools)
+        count = np.empty(X.shape[0], dtype=int)
+        for itemset in combin:
+            counter += 1
+            count[:] = 0
+            for item in itemset:
+                # faster than X[:, item].toarray() or X.getcol(item).indices
+                count[X.indices[X.indptr[item]:X.indptr[item + 1]]] += 1
+            support = np.count_nonzero(count == len(itemset))
+            if support >= min_support:
+                yield support
+                yield from itemset
     else:
-        _bools = np.copy(X[:, 0])
-        for c in range(ncomb):
-            _bools[:] = X[:, combin[c, 0]]
-            for j in range(1, nitems):
-                np.logical_and(_bools, X[:, combin[c, j]], out=_bools)
-            supports[c] = np.count_nonzero(_bools)
-    return supports
-
-
-def compute_supports(X, is_sparse, combin):
-    if is_sparse:
-        all_ones = np.ones((X.shape[0], 1))
-        _bools = np.array(X[:, combin[:, 0]] == all_ones)
-        for n in range(1, combin.shape[1]):
-            _bools = _bools & np.array(X[:, combin[:, n]] == all_ones)
-    else:
-        _bools = np.all(X[:, combin], axis=2)
-
-    return np.sum(_bools, axis=0)
+        for itemset in combin:
+            counter += 1
+            _bools = np.ones(X.shape[0], dtype=bool)
+            for item in itemset:
+                np.logical_and(_bools, X[:, item], out=_bools)
+            support = np.count_nonzero(_bools)
+            if support >= min_support:
+                yield support
+                yield from itemset
+    # yield the total of processed itemsets as last element
+    yield counter


 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
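The sparse branch above counts itemset occurrences directly on the CSC arrays: for a `scipy.sparse.csc_matrix`, `X.indices[X.indptr[item]:X.indptr[item + 1]]` is the array of row indices holding the stored entries of column `item`, so bumping a per-row counter once per item and then checking `count == len(itemset)` counts the transactions that contain every item. A minimal sketch of the trick, on a made-up toy matrix (the data and `itemset` below are illustrative only):

```python
import numpy as np
from scipy.sparse import csc_matrix

# Toy transaction matrix: rows are transactions, columns are items.
X = csc_matrix(np.array([[1, 1, 0],
                         [1, 1, 1],
                         [0, 1, 1],
                         [1, 0, 1]]))

itemset = (0, 1)                 # transactions containing items 0 AND 1
count = np.zeros(X.shape[0], dtype=int)
for item in itemset:
    # Row indices of the stored entries in column `item`; same result as
    # X.getcol(item).indices, but without allocating a new sparse matrix.
    rows = X.indices[X.indptr[item]:X.indptr[item + 1]]
    count[rows] += 1

# A transaction holds the whole itemset iff it was hit once per item.
support = np.count_nonzero(count == len(itemset))
print(support)                   # -> 2 (transactions 0 and 1)
```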
@@ -206,13 +202,15 @@ def _support(_x, _n_rows, _is_sparse):
             X = df.values
         else:
             X = df.to_coo().tocsc()
+            X.eliminate_zeros()
         is_sparse = True
     elif hasattr(df, "sparse"):
         # DataFrame with SparseArray (pandas >= 0.24)
         if df.size == 0:
             X = df.values
         else:
             X = df.sparse.to_coo().tocsc()
+            X.eliminate_zeros()
         is_sparse = True
     else:
         # dense DataFrame
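The added `eliminate_zeros()` calls matter because the `indptr`/`indices` counting above inspects *stored* entries, not values: a sparse matrix can carry explicitly stored zeros, and each one would be miscounted as a present item. A small illustration (the two-entry matrix is fabricated for the demo):

```python
import numpy as np
from scipy.sparse import csc_matrix

row = np.array([0, 1])
col = np.array([0, 0])
data = np.array([1, 0])          # the second entry is an explicit zero
X = csc_matrix((data, (row, col)), shape=(2, 1))

stored = X.indices[X.indptr[0]:X.indptr[1]]
print(len(stored))               # -> 2: the stored zero counts as an entry
X.eliminate_zeros()
stored = X.indices[X.indptr[0]:X.indptr[1]]
print(len(stored))               # -> 1: only true nonzeros remain
```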
@@ -223,37 +221,26 @@ def _support(_x, _n_rows, _is_sparse):
     support_dict = {1: support[support >= min_support]}
     itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
     max_itemset = 1
-    rows_count = float(X.shape[0])
-
-    all_ones = np.ones((int(rows_count), 1))

     while max_itemset and max_itemset < (max_len or float('inf')):
         next_max_itemset = max_itemset + 1

         combin = generate_new_combinations(itemset_dict[max_itemset])
-        combin = np.fromiter(combin, dtype=int)
-        combin = combin.reshape(-1, next_max_itemset)
-
-        if combin.size == 0:
-            break
-        if verbose:
-            print(
-                '\rProcessing %d combinations | Sampling itemset size %d' %
-                (combin.size, next_max_itemset), end="")
-
-        # With exceptionally large datasets, the matrix operations can use a
-        # substantial amount of memory. For low memory applications or large
-        # datasets, set `low_memory=True` to use a slower but more memory-
-        # efficient implementation.
-        if low_memory:
-            support = compute_supports_low_memory(X, is_sparse, combin) / rows_count
-        else:
-            support = compute_supports(X, is_sparse, combin) / rows_count
-
-        _mask = (support >= min_support)
-        if any(_mask):
-            itemset_dict[next_max_itemset] = np.array(combin[_mask])
-            support_dict[next_max_itemset] = np.array(support[_mask])
+        min_rows = np.math.ceil(min_support * X.shape[0])
+        gen_itemsets = generate_supports_and_itemsets(
+            X, is_sparse, combin, min_rows)
+
+        support_valid_itemsets = np.fromiter(gen_itemsets, dtype=int)
+        processed_itemsets = support_valid_itemsets[-1]
+        support_valid_itemsets = support_valid_itemsets[:-1]
+        if support_valid_itemsets.size > 0:
+            if verbose:
+                print(
+                    '\rProcessed %d combinations | Sampling itemset size %d' %
+                    (processed_itemsets, next_max_itemset), end="")
+            support_valid_itemsets.shape = (-1, 1 + next_max_itemset)
+            itemset_dict[next_max_itemset] = support_valid_itemsets[:, 1:]
+            support_dict[next_max_itemset] = support_valid_itemsets[:, 0] / X.shape[0]
         max_itemset = next_max_itemset
     else:
         # Exit condition
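The consuming side relies on the generator's flat protocol: `min_rows` converts the fractional `min_support` into an absolute row count once per level, each surviving itemset arrives as its support followed by its `next_max_itemset` items, and the very last element is the number of candidates examined. `np.fromiter` materializes the whole stream in a single pass; slicing off the trailing counter and reshaping to `(-1, 1 + next_max_itemset)` then leaves supports in column 0 and itemsets in the remaining columns. A minimal sketch with fabricated values (`fake_stream` is hypothetical, not part of the code above):

```python
import numpy as np

def fake_stream():
    # Two surviving 2-itemsets, each flattened as (support, item, item)...
    yield from (3, 0, 1)
    yield from (2, 1, 2)
    # ...then the total number of candidates that were examined.
    yield 7

flat = np.fromiter(fake_stream(), dtype=int)
processed, flat = flat[-1], flat[:-1]
flat.shape = (-1, 1 + 2)   # one row per itemset: [support, item_1, item_2]
print(processed)           # -> 7
print(flat[:, 0])          # supports -> [3 2]
print(flat[:, 1:])         # itemsets -> [[0 1] [1 2]]
```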