@@ -65,84 +65,36 @@ def generate_new_combinations(old_combinations):
6565 j = j + 1
6666
6767
68- def generate_new_combinations_low_memory (old_combinations , X , min_support ,
69- is_sparse ):
70- """
71- Generator of all combinations based on the last state of Apriori algorithm
72- Parameters
73- -----------
74- old_combinations: np.array
75- All combinations with enough support in the last step
76- Combinations are represented by a matrix.
77- Number of columns is equal to the combination size
78- of the previous step.
79- Each row represents one combination
80- and contains item type ids in the ascending order
81- ```
82- 0 1
83- 0 15 20
84- 1 15 22
85- 2 17 19
86- ```
87-
88- X: np.array or scipy sparse matrix
89- The allowed values are either 0/1 or True/False.
90- For example,
91-
92- ```
93- 0 True False True True False True
94- 1 True False True False False True
95- 2 True False True False False False
96- 3 True True False False False False
97- 4 False False True True True True
98- 5 False False True False True True
99- 6 False False True False True False
100- 7 True True False False False False
101- ```
102-
103- min_support : float (default: 0.5)
104- A float between 0 and 1 for minimum support of the itemsets returned.
105- The support is computed as the fraction
106- `transactions_where_item(s)_occur / total_transactions`.
107-
108- is_sparse : bool True if X is sparse
109-
110- Returns
111- -----------
112- Generator of all combinations from the last step x items
113- from the previous step. Every combination contains the
114- number of transactions where this item occurs, followed
115- by item type ids in the ascending order.
116- No combination other than those generated
117- has a chance to get enough support
118-
119- Examples
120- -----------
121- For usage examples, please see
122- http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/generate_new_combinations/
123-
124- """
def compute_supports_low_memory(X, is_sparse, combin):
    """
    Count, for every candidate itemset, the rows of X containing all its items.

    A single column-sized scratch buffer is reused for every combination,
    so only one column of X is materialised at a time.

    Parameters
    -----------
    X: np.array or scipy sparse matrix
      Transactions in rows, items in columns. A row counts towards a
      combination when every selected column is nonzero/True in that row.

    is_sparse : bool
      True if X is sparse; each selected column is then densified
      with `.toarray()` before being combined.

    combin: np.array
      2-D integer array. Each row is one combination and holds
      the column indices of X that make up the itemset.

    Returns
    -----------
    1-D float array with one entry per row of `combin`: the number of
    rows of X in which all items of that combination are present.
    These are absolute counts, not fractions.

    """
    ncomb, nitems = combin.shape
    supports = np.zeros(ncomb)
    if ncomb == 0:
        # Nothing to count; also avoids touching X, which may have no columns.
        return supports

    # The only difference between the sparse and the dense case is how a
    # single column is fetched, so isolate that and share the loop.
    if is_sparse:
        def column(index):
            return X[:, index].toarray()
    else:
        def column(index):
            return X[:, index]

    # Scratch buffer with the shape and dtype of one column of X.
    mask = np.array(column(combin[0, 0]))
    for c in range(ncomb):
        mask[:] = column(combin[c, 0])
        for j in range(1, nitems):
            np.logical_and(mask, column(combin[c, j]), out=mask)
        supports[c] = np.count_nonzero(mask)
    return supports
86+
87+
88+ def compute_supports (X , is_sparse , combin ):
89+ if is_sparse :
90+ all_ones = np .ones ((X .shape [0 ], 1 ))
91+ _bools = np .array (X [:, combin [:, 0 ]] == all_ones )
92+ for n in range (1 , combin .shape [1 ]):
93+ _bools = _bools & np .array (X [:, combin [:, n ]] == all_ones )
94+ else :
95+ _bools = np .all (X [:, combin ], axis = 2 )
12596
126- items_types_in_previous_step = np .unique (old_combinations .flatten ())
127- rows_count = X .shape [0 ]
128- threshold = min_support * rows_count
129- for old_combination in old_combinations :
130- max_combination = old_combination [- 1 ]
131- mask = items_types_in_previous_step > max_combination
132- valid_items = items_types_in_previous_step [mask ]
133- old_tuple = tuple (old_combination )
134- if is_sparse :
135- mask_rows = X [:, old_tuple ].toarray ().all (axis = 1 )
136- X_cols = X [:, valid_items ].toarray ()
137- supports = X_cols [mask_rows ].sum (axis = 0 )
138- else :
139- mask_rows = X [:, old_tuple ].all (axis = 1 )
140- supports = X [mask_rows ][:, valid_items ].sum (axis = 0 )
141- valid_indices = (supports >= threshold ).nonzero ()[0 ]
142- for index in valid_indices :
143- yield supports [index ]
144- yield from old_tuple
145- yield valid_items [index ]
97+ return np .sum (_bools , axis = 0 )
14698
14799
148100def apriori (df , min_support = 0.5 , use_colnames = False , max_len = None , verbose = 0 ,
@@ -278,56 +230,34 @@ def _support(_x, _n_rows, _is_sparse):
278230 while max_itemset and max_itemset < (max_len or float ('inf' )):
279231 next_max_itemset = max_itemset + 1
280232
233+ combin = generate_new_combinations (itemset_dict [max_itemset ])
234+ combin = np .fromiter (combin , dtype = int )
235+ combin = combin .reshape (- 1 , next_max_itemset )
236+
237+ if combin .size == 0 :
238+ break
239+ if verbose :
240+ print (
241+ '\r Processing %d combinations | Sampling itemset size %d' %
242+ (combin .size , next_max_itemset ), end = "" )
243+
281244 # With exceptionally large datasets, the matrix operations can use a
282245 # substantial amount of memory. For low memory applications or large
283246 # datasets, set `low_memory=True` to use a slower but more memory-
284247 # efficient implementation.
285248 if low_memory :
286- combin = generate_new_combinations_low_memory (
287- itemset_dict [max_itemset ], X , min_support , is_sparse )
288- # slightly faster than creating an array from a list of tuples
289- combin = np .fromiter (combin , dtype = int )
290- combin = combin .reshape (- 1 , next_max_itemset + 1 )
249+ support = compute_supports_low_memory (X , is_sparse , combin ) / rows_count
250+ else :
251+ support = compute_supports (X , is_sparse , combin ) / rows_count
291252
292- if combin .size == 0 :
293- break
294- if verbose :
295- print (
296- '\r Processing %d combinations | Sampling itemset size %d' %
297- (combin .size , next_max_itemset ), end = "" )
298-
299- itemset_dict [next_max_itemset ] = combin [:, 1 :]
300- support_dict [next_max_itemset ] = combin [:, 0 ].astype (float ) \
301- / rows_count
253+ _mask = (support >= min_support )
254+ if any (_mask ):
255+ itemset_dict [next_max_itemset ] = np .array (combin [_mask ])
256+ support_dict [next_max_itemset ] = np .array (support [_mask ])
302257 max_itemset = next_max_itemset
303258 else :
304- combin = generate_new_combinations (itemset_dict [max_itemset ])
305- combin = np .fromiter (combin , dtype = int )
306- combin = combin .reshape (- 1 , next_max_itemset )
307-
308- if combin .size == 0 :
309- break
310- if verbose :
311- print (
312- '\r Processing %d combinations | Sampling itemset size %d' %
313- (combin .size , next_max_itemset ), end = "" )
314-
315- if is_sparse :
316- _bools = X [:, combin [:, 0 ]] == all_ones
317- for n in range (1 , combin .shape [1 ]):
318- _bools = _bools & (X [:, combin [:, n ]] == all_ones )
319- else :
320- _bools = np .all (X [:, combin ], axis = 2 )
321-
322- support = _support (np .array (_bools ), rows_count , is_sparse )
323- _mask = (support >= min_support ).reshape (- 1 )
324- if any (_mask ):
325- itemset_dict [next_max_itemset ] = np .array (combin [_mask ])
326- support_dict [next_max_itemset ] = np .array (support [_mask ])
327- max_itemset = next_max_itemset
328- else :
329- # Exit condition
330- break
259+ # Exit condition
260+ break
331261
332262 all_res = []
333263 for k in sorted (itemset_dict ):
0 commit comments