@@ -61,88 +61,36 @@ def generate_new_combinations(old_combinations):
                 # early exit from for-loop skips else clause just below
                 break
             else:
-                yield from candidate
+                yield candidate
             j = j + 1


-def generate_new_combinations_low_memory(old_combinations, X, min_support,
-                                         is_sparse):
-    """
-    Generator of all combinations based on the last state of Apriori algorithm
-
-    Parameters
-    -----------
-    old_combinations: np.array
-        All combinations with enough support in the last step
-        Combinations are represented by a matrix.
-        Number of columns is equal to the combination size
-        of the previous step.
-        Each row represents one combination
-        and contains item type ids in the ascending order
-        ```
-               0        1
-        0      15       20
-        1      15       22
-        2      17       19
-        ```
-
-    X: np.array or scipy sparse matrix
-        The allowed values are either 0/1 or True/False.
-        For example,
-
-        ```
-        0     True False  True  True False  True
-        1     True False  True False False  True
-        2     True False  True False False False
-        3     True  True False False False False
-        4    False False  True  True  True  True
-        5    False False  True False  True  True
-        6    False False  True False  True False
-        7     True  True False False False False
-        ```
-
-    min_support : float (default: 0.5)
-        A float between 0 and 1 for minimum support of the itemsets returned.
-        The support is computed as the fraction
-        `transactions_where_item(s)_occur / total_transactions`.
-
-    is_sparse : bool
-        True if X is sparse
-
-    Returns
-    -----------
-    Generator of all combinations from the last step x items
-    from the previous step. Every combination contains the
-    number of transactions where this item occurs, followed
-    by item type ids in the ascending order.
-    No combination other than the generated ones
-    has a chance to get enough support
-
-    Examples
-    -----------
-    For usage examples, please see
-    http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/generate_new_combinations/
-
-    """
-
-    items_types_in_previous_step = np.unique(old_combinations.flatten())
-    rows_count = X.shape[0]
-    threshold = min_support * rows_count
-    for old_combination in old_combinations:
-        max_combination = old_combination[-1]
-        mask = items_types_in_previous_step > max_combination
-        valid_items = items_types_in_previous_step[mask]
-        old_tuple = tuple(old_combination)
-        if is_sparse:
-            mask_rows = X[:, old_tuple].toarray().all(axis=1)
-            X_cols = X[:, valid_items].toarray()
-            supports = X_cols[mask_rows].sum(axis=0)
-        else:
-            mask_rows = X[:, old_tuple].all(axis=1)
-            supports = X[mask_rows][:, valid_items].sum(axis=0)
-        valid_indices = (supports >= threshold).nonzero()[0]
-        for index in valid_indices:
-            yield supports[index]
-            yield from old_tuple
-            yield valid_items[index]
+def generate_supports_and_itemsets(X, is_sparse, combin, min_support):
+    counter = 0
+    if is_sparse:
+        count = np.empty(X.shape[0], dtype=int)
+        for itemset in combin:
+            counter += 1
+            count[:] = 0
+            for item in itemset:
+                # faster than X[:, item].toarray() or X.getcol(item).indices
+                count[X.indices[X.indptr[item]:X.indptr[item + 1]]] += 1
+            support = np.count_nonzero(count == len(itemset))
+            if support >= min_support:
+                yield support
+                yield from itemset
+    else:
+        for itemset in combin:
+            counter += 1
+            _bools = np.ones(X.shape[0], dtype=bool)
+            for item in itemset:
+                np.logical_and(_bools, X[:, item], out=_bools)
+            support = np.count_nonzero(_bools)
+            if support >= min_support:
+                yield support
+                yield from itemset
+    # return the total of processed itemsets as last element
+    yield counter


 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
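The heart of the new sparse path is counting, for each candidate itemset, how many rows contain every item by reading the CSC column structure directly instead of materializing dense columns. A minimal standalone sketch of that trick; the toy matrix and variable names here are illustrative, not part of the commit:

```
import numpy as np
from scipy.sparse import csc_matrix

# Toy one-hot transaction matrix: rows = transactions, columns = items.
X = csc_matrix(np.array([[1, 1, 0],
                         [1, 1, 1],
                         [0, 1, 1],
                         [1, 0, 1]]))
X.eliminate_zeros()  # no-op here, but mirrors the commit's safeguard

itemset = (0, 1)  # count transactions containing both item 0 and item 1
count = np.zeros(X.shape[0], dtype=int)
for item in itemset:
    # Row indices where `item` is present, read straight from the CSC
    # arrays; avoids X[:, item].toarray() for every candidate column.
    count[X.indices[X.indptr[item]:X.indptr[item + 1]]] += 1

# A row contains the whole itemset iff every item incremented its slot.
support = np.count_nonzero(count == len(itemset))
print(support)  # 2 (rows 0 and 1 contain both items)
```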
@@ -254,13 +202,15 @@ def _support(_x, _n_rows, _is_sparse):
             X = df.values
         else:
             X = df.to_coo().tocsc()
+            X.eliminate_zeros()
         is_sparse = True
     elif hasattr(df, "sparse"):
         # DataFrame with SparseArray (pandas >= 0.24)
         if df.size == 0:
             X = df.values
         else:
             X = df.sparse.to_coo().tocsc()
+            X.eliminate_zeros()
         is_sparse = True
     else:
         # dense DataFrame
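The two `X.eliminate_zeros()` calls matter because the CSC fast path treats every *stored* entry as a present item: an explicitly stored zero would be counted as if it were a 1 and inflate supports. A small demonstration of the failure mode, using hypothetical data:

```
import numpy as np
from scipy.sparse import csc_matrix

# One column that explicitly stores a zero at row 1.
X = csc_matrix((np.array([1, 0, 1]),   # data (note the stored 0)
                np.array([0, 1, 2]),   # row indices
                np.array([0, 3])),     # indptr for a single column
               shape=(3, 1))

print(X.indices[X.indptr[0]:X.indptr[1]])  # [0 1 2] -- row 1 looks present

X.eliminate_zeros()
print(X.indices[X.indptr[0]:X.indptr[1]])  # [0 2] -- only true nonzeros
```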
@@ -271,63 +221,30 @@ def _support(_x, _n_rows, _is_sparse):
     support_dict = {1: support[support >= min_support]}
     itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
     max_itemset = 1
-    rows_count = float(X.shape[0])
-
-    all_ones = np.ones((int(rows_count), 1))

     while max_itemset and max_itemset < (max_len or float('inf')):
         next_max_itemset = max_itemset + 1

-        # With exceptionally large datasets, the matrix operations can use a
-        # substantial amount of memory. For low memory applications or large
-        # datasets, set `low_memory=True` to use a slower but more memory-
-        # efficient implementation.
-        if low_memory:
-            combin = generate_new_combinations_low_memory(
-                itemset_dict[max_itemset], X, min_support, is_sparse)
-            # slightly faster than creating an array from a list of tuples
-            combin = np.fromiter(combin, dtype=int)
-            combin = combin.reshape(-1, next_max_itemset + 1)
-
-            if combin.size == 0:
-                break
+        combin = generate_new_combinations(itemset_dict[max_itemset])
+        min_rows = np.math.ceil(min_support * X.shape[0])
+        gen_itemsets = generate_supports_and_itemsets(
+            X, is_sparse, combin, min_rows)
+
+        support_valid_itemsets = np.fromiter(gen_itemsets, dtype=int)
+        processed_itemsets = support_valid_itemsets[-1]
+        support_valid_itemsets = support_valid_itemsets[:-1]
+        if support_valid_itemsets.size > 0:
             if verbose:
                 print(
-                    '\rProcessing %d combinations | Sampling itemset size %d' %
-                    (combin.size, next_max_itemset), end="")
-
-            itemset_dict[next_max_itemset] = combin[:, 1:]
-            support_dict[next_max_itemset] = combin[:, 0].astype(float) \
-                / rows_count
+                    '\rProcessed %d combinations | Sampling itemset size %d' %
+                    (processed_itemsets, next_max_itemset), end="")
+            support_valid_itemsets.shape = (-1, 1 + next_max_itemset)
+            itemset_dict[next_max_itemset] = support_valid_itemsets[:, 1:]
+            support_dict[next_max_itemset] = support_valid_itemsets[:, 0] / X.shape[0]
             max_itemset = next_max_itemset
         else:
-            combin = generate_new_combinations(itemset_dict[max_itemset])
-            combin = np.fromiter(combin, dtype=int)
-            combin = combin.reshape(-1, next_max_itemset)
-
-            if combin.size == 0:
-                break
-            if verbose:
-                print(
-                    '\rProcessing %d combinations | Sampling itemset size %d' %
-                    (combin.size, next_max_itemset), end="")
-
-            if is_sparse:
-                _bools = X[:, combin[:, 0]] == all_ones
-                for n in range(1, combin.shape[1]):
-                    _bools = _bools & (X[:, combin[:, n]] == all_ones)
-            else:
-                _bools = np.all(X[:, combin], axis=2)
-
-            support = _support(np.array(_bools), rows_count, is_sparse)
-            _mask = (support >= min_support).reshape(-1)
-            if any(_mask):
-                itemset_dict[next_max_itemset] = np.array(combin[_mask])
-                support_dict[next_max_itemset] = np.array(support[_mask])
-                max_itemset = next_max_itemset
-            else:
-                # Exit condition
-                break
+            # Exit condition
+            break

     all_res = []
     for k in sorted(itemset_dict):
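Downstream of the generator, the loop now decodes the flat integer stream with a single `np.fromiter` call: each surviving itemset contributes its absolute support followed by its item ids, and the final element is the count of processed candidates, used only for the verbose message. A sketch of that decoding step with made-up values:

```
import numpy as np

next_max_itemset = 2  # itemset size in the current round

# Hypothetical generator output for two surviving 2-itemsets:
# (support, item, item) for each, then the processed-itemset counter.
flat = iter([3, 10, 15,
             2, 10, 22,
             7])  # 7 candidates were examined in total

arr = np.fromiter(flat, dtype=int)
processed = arr[-1]                              # 7
arr = arr[:-1].reshape(-1, 1 + next_max_itemset)

supports = arr[:, 0]                             # [3 2] -- absolute counts
itemsets = arr[:, 1:]                            # [[10 15] [10 22]]
print(processed, supports, itemsets, sep="\n")
```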