@@ -52,18 +52,17 @@ def generate_new_combinations(old_combinations):
     Generator of all combinations based on the last state of Apriori algorithm
     Parameters
     -----------
-    old_combinations: np.array
+    old_combinations: list of tuples
         All combinations with enough support in the last step
-        Combinations are represented by a matrix.
-        Number of columns is equal to the combination size
+        Combinations are represented by a list of tuples.
+        All tuples have the same length, which is equal to the combination size
         of the previous step.
-        Each row represents one combination
+        Each tuple represents one combination
         and contains item type ids in the ascending order
         ```
-               0   1
-        0     15  20
-        1     15  22
-        2     17  19
+        15  20
+        15  22
+        17  19
         ```
 
     Returns
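Not part of the patch, but a quick illustration of the new contract (module path assumed; the input must be a lexicographically sorted list of tuples, since the join step scans runs of equal prefixes):

```python
from mlxtend.frequent_patterns.apriori import generate_new_combinations

# Frequent 2-itemsets from a previous step (toy data, sorted tuples).
old_combinations = [(15, 20), (15, 22), (20, 22)]

# (15, 20) and (15, 22) share the prefix (15,), so they join into
# (15, 20, 22); the remaining subset (20, 22) is also frequent, so the
# candidate survives pruning and is yielded as a single tuple.
print(list(generate_new_combinations(old_combinations)))
# [(15, 20, 22)]
```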
@@ -89,7 +88,7 @@ def generate_new_combinations(old_combinations):
             if head_i != head_j:
                 break
             # Prune old_combination+(item,) if any subset is not frequent
-            candidate = tuple(old_combination) + (tail_j,)
+            candidate = old_combination + (tail_j,)
             # No need to check the last two values, because test_candidate
             # is then old_combinations[i] and old_combinations[j]
             for idx in range(len(candidate) - 2):
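For readers new to Apriori pruning, here is a minimal standalone sketch of the same rule (names are mine, not the patch's): a candidate k-itemset can only be frequent if every (k-1)-subset is, and the two subsets obtained by dropping one of the last two items are exactly the two parents joined to form the candidate, so they are frequent by construction and need not be re-checked.

```python
def is_prunable(candidate, frequent):
    """Return True if some (k-1)-subset of candidate is not frequent."""
    frequent = set(frequent)
    # Skip the last two positions: dropping either of the last two items
    # yields one of the two parents that built the candidate.
    for idx in range(len(candidate) - 2):
        subset = candidate[:idx] + candidate[idx + 1:]
        if subset not in frequent:
            return True
    return False

frequent_2 = [(15, 20), (15, 22), (20, 22)]
print(is_prunable((15, 20, 22), frequent_2))  # False: (20, 22) is frequent
print(is_prunable((15, 20, 25), frequent_2))  # True: (20, 25) is not
```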
@@ -99,90 +98,10 @@ def generate_new_combinations(old_combinations):
                     # early exit from for-loop skips else clause just below
                     break
             else:
-                yield from candidate
+                yield candidate
             j = j + 1
 
 
-def generate_new_combinations_low_memory(old_combinations, X, min_support,
-                                         is_sparse):
-    """
-    Generator of all combinations based on the last state of Apriori algorithm
-    Parameters
-    -----------
-    old_combinations: np.array
-        All combinations with enough support in the last step
-        Combinations are represented by a matrix.
-        Number of columns is equal to the combination size
-        of the previous step.
-        Each row represents one combination
-        and contains item type ids in the ascending order
-        ```
-               0   1
-        0     15  20
-        1     15  22
-        2     17  19
-        ```
-
-    X: np.array or scipy sparse matrix
-        The allowed values are either 0/1 or True/False.
-        For example,
-
-        ```
-        0     True False  True  True False  True
-        1     True False  True False False  True
-        2     True False  True False False False
-        3     True  True False False False False
-        4    False False  True  True  True  True
-        5    False False  True False  True  True
-        6    False False  True False  True False
-        7     True  True False False False False
-        ```
-
-    min_support : float (default: 0.5)
-        A float between 0 and 1 for minumum support of the itemsets returned.
-        The support is computed as the fraction
-        `transactions_where_item(s)_occur / total_transactions`.
-
-    is_sparse : bool True if X is sparse
-
-    Returns
-    -----------
-    Generator of all combinations from the last step x items
-    from the previous step. Every combination contains the
-    number of transactions where this item occurs, followed
-    by item type ids in the ascending order.
-    No combination other than generated
-    do not have a chance to get enough support
-
-    Examples
-    -----------
-    For usage examples, please see
-    http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/generate_new_combinations/
-
-    """
-
-    items_types_in_previous_step = np.unique(old_combinations.flatten())
-    rows_count = X.shape[0]
-    threshold = min_support * rows_count
-    for old_combination in old_combinations:
-        max_combination = old_combination[-1]
-        mask = items_types_in_previous_step > max_combination
-        valid_items = items_types_in_previous_step[mask]
-        old_tuple = tuple(old_combination)
-        if is_sparse:
-            mask_rows = X[:, old_tuple].toarray().all(axis=1)
-            X_cols = X[:, valid_items].toarray()
-            supports = X_cols[mask_rows].sum(axis=0)
-        else:
-            mask_rows = X[:, old_tuple].all(axis=1)
-            supports = X[mask_rows][:, valid_items].sum(axis=0)
-        valid_indices = (supports >= threshold).nonzero()[0]
-        for index in valid_indices:
-            yield supports[index]
-            yield from old_tuple
-            yield valid_items[index]
-
-
 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
             low_memory=False):
     """Get frequent itemsets from a one-hot DataFrame
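Side note on the `yield` change above: the old generators flattened every candidate into a stream of ints so the caller could rebuild a matrix with `np.fromiter`, whereas the patched generator yields one tuple per candidate. A toy illustration of the difference:

```python
candidates = [(15, 20, 22), (15, 20, 25)]  # hypothetical data

def flat_stream():
    for cand in candidates:
        yield from cand   # emits each item id individually

def tuple_stream():
    for cand in candidates:
        yield cand        # emits whole candidates

print(list(flat_stream()))   # [15, 20, 22, 15, 20, 25]
print(list(tuple_stream()))  # [(15, 20, 22), (15, 20, 25)]
```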
@@ -220,16 +139,7 @@ def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
       possible itemsets lengths (under the apriori condition) are evaluated.
 
     verbose : int (default: 0)
-      Shows the number of iterations if >= 1 and `low_memory` is `True`. If
-      >=1 and `low_memory` is `False`, shows the number of combinations.
-
-    low_memory : bool (default: False)
-      If `True`, uses an iterator to search for combinations above
-      `min_support`.
-      Note that while `low_memory=True` should only be used for large dataset
-      if memory resources are limited, because this implementation is approx.
-      3-6x slower than the default.
-
+      Shows the number of combinations if >= 1.
 
     Returns
     -----------
@@ -292,80 +202,70 @@ def _support(_x, _n_rows, _is_sparse):
             X = df.values
         else:
             X = df.to_coo().tocsc()
+            # See comment below
+            X.eliminate_zeros()
         is_sparse = True
     elif hasattr(df, "sparse"):
         # DataFrame with SparseArray (pandas >= 0.24)
         if df.size == 0:
             X = df.values
         else:
             X = df.sparse.to_coo().tocsc()
+            # See comment below
+            X.eliminate_zeros()
         is_sparse = True
     else:
         # dense DataFrame
         X = df.values
         is_sparse = False
     support = _support(X, X.shape[0], is_sparse)
-    ary_col_idx = np.arange(X.shape[1])
     support_dict = {1: support[support >= min_support]}
-    itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
+    itemset_dict = {1: [(idx,) for idx in np.where(support >= min_support)[0]]}
     max_itemset = 1
-    rows_count = float(X.shape[0])
-
-    all_ones = np.ones((int(rows_count), 1))
 
     while max_itemset and max_itemset < (max_len or float('inf')):
        next_max_itemset = max_itemset + 1
 
-        # With exceptionally large datasets, the matrix operations can use a
-        # substantial amount of memory. For low memory applications or large
-        # datasets, set `low_memory=True` to use a slower but more memory-
-        # efficient implementation.
-        if low_memory:
-            combin = generate_new_combinations_low_memory(
-                itemset_dict[max_itemset], X, min_support, is_sparse)
-            # slightly faster than creating an array from a list of tuples
-            combin = np.fromiter(combin, dtype=int)
-            combin = combin.reshape(-1, next_max_itemset + 1)
-
-            if combin.size == 0:
-                break
-            if verbose:
-                print(
-                    '\rProcessing %d combinations | Sampling itemset size %d' %
-                    (combin.size, next_max_itemset), end="")
-
-            itemset_dict[next_max_itemset] = combin[:, 1:]
-            support_dict[next_max_itemset] = combin[:, 0].astype(float) \
-                / rows_count
-            max_itemset = next_max_itemset
+        combin = generate_new_combinations(itemset_dict[max_itemset])
+        # count supports
+        frequent_itemsets = []
+        frequent_supports = []
+        processed = 0
+        if is_sparse:
+            count = np.empty(X.shape[0], dtype=int)
+            for itemset in combin:
+                processed += 1
+                count[:] = 0
+                for item in itemset:
+                    # Count nonnull entries via direct access to X indices;
+                    # this requires X to be stored in CSC format, and to call
+                    # X.eliminate_zeros() to remove null entries from X.
+                    count[X.indices[X.indptr[item]:X.indptr[item + 1]]] += 1
+                support = np.count_nonzero(count == len(itemset)) / X.shape[0]
+                if support >= min_support:
+                    frequent_itemsets.append(itemset)
+                    frequent_supports.append(support)
         else:
-            combin = generate_new_combinations(itemset_dict[max_itemset])
-            combin = np.fromiter(combin, dtype=int)
-            combin = combin.reshape(-1, next_max_itemset)
-
-            if combin.size == 0:
-                break
-            if verbose:
-                print(
-                    '\rProcessing %d combinations | Sampling itemset size %d' %
-                    (combin.size, next_max_itemset), end="")
-
-            if is_sparse:
-                _bools = X[:, combin[:, 0]] == all_ones
-                for n in range(1, combin.shape[1]):
-                    _bools = _bools & (X[:, combin[:, n]] == all_ones)
-            else:
-                _bools = np.all(X[:, combin], axis=2)
-
-            support = _support(np.array(_bools), rows_count, is_sparse)
-            _mask = (support >= min_support).reshape(-1)
-            if any(_mask):
-                itemset_dict[next_max_itemset] = np.array(combin[_mask])
-                support_dict[next_max_itemset] = np.array(support[_mask])
-                max_itemset = next_max_itemset
-            else:
-                # Exit condition
-                break
+            _bools = np.empty(X.shape[0], dtype=bool)
+            for itemset in combin:
+                processed += 1
+                _bools.fill(True)
+                for item in itemset:
+                    np.logical_and(_bools, X[:, item], out=_bools)
+                support = np.count_nonzero(_bools) / X.shape[0]
+                if support >= min_support:
+                    frequent_itemsets.append(itemset)
+                    frequent_supports.append(support)
+        if not frequent_itemsets:
+            # Exit condition
+            break
+        if verbose:
+            print(
+                '\rProcessed %d combinations | Sampling itemset size %d' %
+                (processed, next_max_itemset), end="")
+        itemset_dict[next_max_itemset] = frequent_itemsets
+        support_dict[next_max_itemset] = frequent_supports
+        max_itemset = next_max_itemset
 
     all_res = []
     for k in sorted(itemset_dict):
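To make the sparse fast path above easier to follow, here is a self-contained sketch of the same counting idea on a toy CSC matrix (variable names are mine, not the patch's). `X.indices[X.indptr[item]:X.indptr[item + 1]]` lists the rows that have a stored entry in column `item`, which is why the patch converts to CSC and calls `eliminate_zeros()` first: explicitly stored zeros would otherwise be counted as present.

```python
import numpy as np
from scipy.sparse import csc_matrix

# Toy one-hot transaction matrix: 4 transactions x 3 items.
dense = np.array([[1, 1, 0],
                  [1, 1, 1],
                  [0, 1, 1],
                  [1, 0, 1]])
X = csc_matrix(dense)
X.eliminate_zeros()  # drop explicitly stored zeros, if any

itemset = (0, 1)  # transactions containing both item 0 and item 1
count = np.zeros(X.shape[0], dtype=int)
for item in itemset:
    # Rows with a nonzero entry in column `item`, read straight
    # from the CSC index arrays without materializing the column.
    rows = X.indices[X.indptr[item]:X.indptr[item + 1]]
    count[rows] += 1

# A transaction contains the whole itemset iff it was hit once per item.
support = np.count_nonzero(count == len(itemset)) / X.shape[0]
print(support)  # 0.5 (transactions 0 and 1)
```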