Commit 85ca67d
Let apriori always use low_memory processing
Thanks to previous optimizations, processing with low_memory=True is now as efficient as with low_memory=False and allows processing much larger datasets. Removing the low_memory=False code path makes the code simpler. The downside is that we no longer know the number of itemsets to process in advance, so the count is displayed afterwards: we now display the number of itemsets after the prune step. Note that commit 2f928cb introduced a bug in which the reported number of processed combinations was multiplied by the itemset length. Since vectorized operations are no longer performed on frequent itemsets, they are now stored as lists of tuples.
1 parent f8131a7 commit 85ca67d
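
For context, here is a minimal usage sketch of the changed code path (the toy DataFrame below is invented for illustration, not taken from the repo); with this commit the progress line reports how many candidate itemsets were processed, regardless of the `low_memory` flag:

```python
import pandas as pd
from mlxtend.frequent_patterns import apriori

# Toy one-hot transaction data (illustrative only).
df = pd.DataFrame({'Apple':  [1, 1, 0, 1],
                   'Banana': [1, 1, 1, 0],
                   'Cheese': [0, 1, 1, 1]}, dtype=bool)

# low_memory no longer selects a separate implementation; both values of
# the flag now go through the same candidate-by-candidate processing.
frequent = apriori(df, min_support=0.5, use_colnames=True, verbose=1)
print()
print(frequent)
```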

File tree

2 files changed: +55 −155 lines

mlxtend/frequent_patterns/apriori.py

Lines changed: 54 additions & 154 deletions
@@ -52,18 +52,17 @@ def generate_new_combinations(old_combinations):
     Generator of all combinations based on the last state of Apriori algorithm
     Parameters
     -----------
-    old_combinations: np.array
+    old_combinations: list of tuples
       All combinations with enough support in the last step
-      Combinations are represented by a matrix.
-      Number of columns is equal to the combination size
+      Combinations are represented by a list of tuples.
+      All tuples have the same length, which is equal to the combination size
       of the previous step.
-      Each row represents one combination
+      Each tuple represents one combination
       and contains item type ids in the ascending order
       ```
-           0        1
-      0   15       20
-      1   15       22
-      2   17       19
+      15       20
+      15       22
+      17       19
       ```

     Returns
@@ -89,7 +88,7 @@ def generate_new_combinations(old_combinations):
             if head_i != head_j:
                 break
             # Prune old_combination+(item,) if any subset is not frequent
-            candidate = tuple(old_combination) + (tail_j,)
+            candidate = old_combination + (tail_j,)
             # No need to check the last two values, because test_candidate
             # is then old_combinations[i] and old_combinations[j]
             for idx in range(len(candidate) - 2):
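
To make the join-and-prune step above easier to follow, here is a self-contained sketch of the same idea; `candidate_gen_sketch` is a simplified stand-in written for this note, not the library function (it checks every (k-1)-subset, whereas the real code skips the two subsets that are the joined itemsets themselves):

```python
from itertools import combinations

def candidate_gen_sketch(old_combinations):
    """Simplified join-and-prune step over a sorted list of item-id tuples."""
    frequent = set(old_combinations)
    for i, itemset in enumerate(old_combinations):
        head = itemset[:-1]
        for other in old_combinations[i + 1:]:
            if other[:-1] != head:
                break  # input is sorted, so later tuples cannot share the prefix
            candidate = itemset + (other[-1],)
            # Prune the candidate if any (k-1)-subset is not frequent
            if all(sub in frequent
                   for sub in combinations(candidate, len(candidate) - 1)):
                yield candidate

print(list(candidate_gen_sketch([(15, 20), (15, 22), (17, 19)])))
# -> []  (the join yields (15, 20, 22), but its subset (20, 22) is not
#         frequent, so the candidate is pruned)
```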
@@ -99,90 +98,10 @@ def generate_new_combinations(old_combinations):
                     # early exit from for-loop skips else clause just below
                     break
             else:
-                yield from candidate
+                yield candidate
             j = j + 1


-def generate_new_combinations_low_memory(old_combinations, X, min_support,
-                                         is_sparse):
-    """
-    Generator of all combinations based on the last state of Apriori algorithm
-    Parameters
-    -----------
-    old_combinations: np.array
-      All combinations with enough support in the last step
-      Combinations are represented by a matrix.
-      Number of columns is equal to the combination size
-      of the previous step.
-      Each row represents one combination
-      and contains item type ids in the ascending order
-      ```
-           0        1
-      0   15       20
-      1   15       22
-      2   17       19
-      ```
-
-    X: np.array or scipy sparse matrix
-      The allowed values are either 0/1 or True/False.
-      For example,
-
-      ```
-      0     True False  True  True False  True
-      1     True False  True False False  True
-      2     True False  True False False False
-      3     True  True False False False False
-      4    False False  True  True  True  True
-      5    False False  True False  True  True
-      6    False False  True False  True False
-      7     True  True False False False False
-      ```
-
-    min_support : float (default: 0.5)
-      A float between 0 and 1 for minumum support of the itemsets returned.
-      The support is computed as the fraction
-      `transactions_where_item(s)_occur / total_transactions`.
-
-    is_sparse : bool True if X is sparse
-
-    Returns
-    -----------
-    Generator of all combinations from the last step x items
-    from the previous step. Every combination contains the
-    number of transactions where this item occurs, followed
-    by item type ids in the ascending order.
-    No combination other than generated
-    do not have a chance to get enough support
-
-    Examples
-    -----------
-    For usage examples, please see
-    http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/generate_new_combinations/
-
-    """
-
-    items_types_in_previous_step = np.unique(old_combinations.flatten())
-    rows_count = X.shape[0]
-    threshold = min_support * rows_count
-    for old_combination in old_combinations:
-        max_combination = old_combination[-1]
-        mask = items_types_in_previous_step > max_combination
-        valid_items = items_types_in_previous_step[mask]
-        old_tuple = tuple(old_combination)
-        if is_sparse:
-            mask_rows = X[:, old_tuple].toarray().all(axis=1)
-            X_cols = X[:, valid_items].toarray()
-            supports = X_cols[mask_rows].sum(axis=0)
-        else:
-            mask_rows = X[:, old_tuple].all(axis=1)
-            supports = X[mask_rows][:, valid_items].sum(axis=0)
-        valid_indices = (supports >= threshold).nonzero()[0]
-        for index in valid_indices:
-            yield supports[index]
-            yield from old_tuple
-            yield valid_items[index]
-
-
 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
             low_memory=False):
     """Get frequent itemsets from a one-hot DataFrame
@@ -220,16 +139,7 @@ def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
      possible itemsets lengths (under the apriori condition) are evaluated.

    verbose : int (default: 0)
-      Shows the number of iterations if >= 1 and `low_memory` is `True`. If
-      >=1 and `low_memory` is `False`, shows the number of combinations.
-
-    low_memory : bool (default: False)
-      If `True`, uses an iterator to search for combinations above
-      `min_support`.
-      Note that while `low_memory=True` should only be used for large dataset
-      if memory resources are limited, because this implementation is approx.
-      3-6x slower than the default.
-
+      Shows the number of combinations if >= 1.

    Returns
    -----------
@@ -292,80 +202,70 @@ def _support(_x, _n_rows, _is_sparse):
             X = df.values
         else:
             X = df.to_coo().tocsc()
+            # See comment below
+            X.eliminate_zeros()
         is_sparse = True
     elif hasattr(df, "sparse"):
         # DataFrame with SparseArray (pandas >= 0.24)
         if df.size == 0:
             X = df.values
         else:
             X = df.sparse.to_coo().tocsc()
+            # See comment below
+            X.eliminate_zeros()
         is_sparse = True
     else:
         # dense DataFrame
         X = df.values
         is_sparse = False
     support = _support(X, X.shape[0], is_sparse)
-    ary_col_idx = np.arange(X.shape[1])
     support_dict = {1: support[support >= min_support]}
-    itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
+    itemset_dict = {1: [(idx,) for idx in np.where(support >= min_support)[0]]}
     max_itemset = 1
-    rows_count = float(X.shape[0])
-
-    all_ones = np.ones((int(rows_count), 1))

     while max_itemset and max_itemset < (max_len or float('inf')):
         next_max_itemset = max_itemset + 1

-        # With exceptionally large datasets, the matrix operations can use a
-        # substantial amount of memory. For low memory applications or large
-        # datasets, set `low_memory=True` to use a slower but more memory-
-        # efficient implementation.
-        if low_memory:
-            combin = generate_new_combinations_low_memory(
-                itemset_dict[max_itemset], X, min_support, is_sparse)
-            # slightly faster than creating an array from a list of tuples
-            combin = np.fromiter(combin, dtype=int)
-            combin = combin.reshape(-1, next_max_itemset + 1)
-
-            if combin.size == 0:
-                break
-            if verbose:
-                print(
-                    '\rProcessing %d combinations | Sampling itemset size %d' %
-                    (combin.size, next_max_itemset), end="")
-
-            itemset_dict[next_max_itemset] = combin[:, 1:]
-            support_dict[next_max_itemset] = combin[:, 0].astype(float) \
-                / rows_count
-            max_itemset = next_max_itemset
+        combin = generate_new_combinations(itemset_dict[max_itemset])
+        # count supports
+        frequent_itemsets = []
+        frequent_supports = []
+        processed = 0
+        if is_sparse:
+            count = np.empty(X.shape[0], dtype=int)
+            for itemset in combin:
+                processed += 1
+                count[:] = 0
+                for item in itemset:
+                    # Count nonnull entries via direct access to X indices;
+                    # this requires X to be stored in CSC format, and to call
+                    # X.eliminate_zeros() to remove null entries from X.
+                    count[X.indices[X.indptr[item]:X.indptr[item+1]]] += 1
+                support = np.count_nonzero(count == len(itemset)) / X.shape[0]
+                if support >= min_support:
+                    frequent_itemsets.append(itemset)
+                    frequent_supports.append(support)
         else:
-            combin = generate_new_combinations(itemset_dict[max_itemset])
-            combin = np.fromiter(combin, dtype=int)
-            combin = combin.reshape(-1, next_max_itemset)
-
-            if combin.size == 0:
-                break
-            if verbose:
-                print(
-                    '\rProcessing %d combinations | Sampling itemset size %d' %
-                    (combin.size, next_max_itemset), end="")
-
-            if is_sparse:
-                _bools = X[:, combin[:, 0]] == all_ones
-                for n in range(1, combin.shape[1]):
-                    _bools = _bools & (X[:, combin[:, n]] == all_ones)
-            else:
-                _bools = np.all(X[:, combin], axis=2)
-
-            support = _support(np.array(_bools), rows_count, is_sparse)
-            _mask = (support >= min_support).reshape(-1)
-            if any(_mask):
-                itemset_dict[next_max_itemset] = np.array(combin[_mask])
-                support_dict[next_max_itemset] = np.array(support[_mask])
-                max_itemset = next_max_itemset
-            else:
-                # Exit condition
-                break
+            _bools = np.empty(X.shape[0], dtype=bool)
+            for itemset in combin:
+                processed += 1
+                _bools.fill(True)
+                for item in itemset:
+                    np.logical_and(_bools, X[:, item], out=_bools)
+                support = np.count_nonzero(_bools) / X.shape[0]
+                if support >= min_support:
+                    frequent_itemsets.append(itemset)
+                    frequent_supports.append(support)
+        if not frequent_itemsets:
+            # Exit condition
+            break
+        if verbose:
+            print(
+                '\rProcessed %d combinations | Sampling itemset size %d' %
+                (processed, next_max_itemset), end="")
+        itemset_dict[next_max_itemset] = frequent_itemsets
+        support_dict[next_max_itemset] = frequent_supports
+        max_itemset = next_max_itemset

     all_res = []
     for k in sorted(itemset_dict):
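
A small self-contained sketch of the new support counting (toy matrix, invented values): for a sparse CSC matrix, the rows containing an item are read directly from `X.indices`/`X.indptr`, and a transaction supports the itemset when it is hit once per item; the dense branch reaches the same number with boolean column ANDs:

```python
import numpy as np
from scipy.sparse import csc_matrix

# Toy one-hot matrix: rows are transactions, columns are items.
dense = np.array([[1, 1, 0],
                  [1, 1, 1],
                  [0, 1, 1],
                  [1, 0, 1]], dtype=bool)
X = csc_matrix(dense)
X.eliminate_zeros()  # X.indices must contain only truly nonzero entries

itemset = (0, 1)     # support of {item 0, item 1}
count = np.zeros(X.shape[0], dtype=int)
for item in itemset:
    # rows where `item` occurs, read straight from the CSC structure
    count[X.indices[X.indptr[item]:X.indptr[item + 1]]] += 1
support_sparse = np.count_nonzero(count == len(itemset)) / X.shape[0]

# Dense equivalent: AND the item columns together and count True rows.
support_dense = np.count_nonzero(dense[:, list(itemset)].all(axis=1)) / dense.shape[0]

assert support_sparse == support_dense == 0.5
```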

mlxtend/frequent_patterns/tests/test_fpbase.py

Lines changed: 1 addition & 1 deletion
@@ -229,7 +229,7 @@ def test_low_memory_flag(self):
                _ = self.fpalgo(self.df, low_memory=True, verbose=1)

            # Only get the last value of the stream to reduce test noise
-            expect = 'Processing 4 combinations | Sampling itemset size 3\n'
+            expect = 'Processed 1 combinations | Sampling itemset size 3\n'
            out = out.getvalue().split('\r')[-1]
            assert out == expect
        else:
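
The expected count drops from 4 to 1 because the old message reported `combin.size` (itemsets × columns), while the new one counts processed candidates. The test keeps only the last `'\r'`-separated chunk since the progress line is overwritten in place with carriage returns. A toy reproduction of that capture pattern (using `redirect_stdout`, not the suite's actual helper):

```python
import io
from contextlib import redirect_stdout

buf = io.StringIO()
with redirect_stdout(buf):
    # Two in-place progress updates followed by a final newline.
    print('\rProcessed 3 combinations | Sampling itemset size 2', end="")
    print('\rProcessed 1 combinations | Sampling itemset size 3', end="")
    print()

last = buf.getvalue().split('\r')[-1]
assert last == 'Processed 1 combinations | Sampling itemset size 3\n'
```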
