
Commit 7eb928e

committed
Let apriori always use low_memory processing
Thanks to previous optimizations, processing with low_memory=True is now almost as efficient as with low_memory=False, and it allows much larger datasets to be processed. Removing the low_memory=False code path makes the code simpler and lets itemsets be produced by a generator, which saves even more memory. The downside is that the number of itemsets to process is not known in advance, so it is displayed after processing instead. Note that commit 2f928cb introduced a bug: the reported number of processed combinations was multiplied by the itemset length, which explains why the verbose output differs now.
1 parent fe783b5 commit 7eb928e
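
To make the flattened-generator pattern described in the commit message concrete, here is a minimal sketch (hypothetical names, not the mlxtend API): the generator yields each surviving itemset's support count followed by its item ids, and finally the total number of candidates it examined, so the caller can build the result array with np.fromiter and report the processed count only afterwards.

```python
import numpy as np

def supports_and_itemsets(candidates, count_of, min_count):
    """Yield support, then item ids, for each kept itemset; yield the candidate total last."""
    processed = 0
    for itemset in candidates:
        processed += 1
        support = count_of(itemset)
        if support >= min_count:
            yield support
            yield from itemset
    # sentinel: how many candidates were examined in total
    yield processed

# toy data: three 2-item candidates with made-up transaction counts
candidates = [(0, 1), (0, 2), (1, 2)]
counts = {(0, 1): 5, (0, 2): 2, (1, 2): 4}

flat = np.fromiter(supports_and_itemsets(candidates, counts.get, 3), dtype=int)
processed = flat[-1]                   # 3 candidates were examined
table = flat[:-1].reshape(-1, 1 + 2)   # rows of [support, item, item]
print(processed, table)                # 3 [[5 0 1] [4 1 2]]
```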


2 files changed: +46 additions, -129 deletions


mlxtend/frequent_patterns/apriori.py

Lines changed: 45 additions & 128 deletions
@@ -61,88 +61,36 @@ def generate_new_combinations(old_combinations):
                     # early exit from for-loop skips else clause just below
                     break
             else:
-                yield from candidate
+                yield candidate
             j = j + 1


-def generate_new_combinations_low_memory(old_combinations, X, min_support,
-                                         is_sparse):
-    """
-    Generator of all combinations based on the last state of Apriori algorithm
-    Parameters
-    -----------
-    old_combinations: np.array
-        All combinations with enough support in the last step
-        Combinations are represented by a matrix.
-        Number of columns is equal to the combination size
-        of the previous step.
-        Each row represents one combination
-        and contains item type ids in the ascending order
-        ```
-               0        1
-        0      15       20
-        1      15       22
-        2      17       19
-        ```
-
-    X: np.array or scipy sparse matrix
-        The allowed values are either 0/1 or True/False.
-        For example,
-
-        ```
-        0     True False  True  True False  True
-        1     True False  True False False  True
-        2     True False  True False False False
-        3     True  True False False False False
-        4    False False  True  True  True  True
-        5    False False  True False  True  True
-        6    False False  True False  True False
-        7     True  True False False False False
-        ```
-
-    min_support : float (default: 0.5)
-        A float between 0 and 1 for minumum support of the itemsets returned.
-        The support is computed as the fraction
-        `transactions_where_item(s)_occur / total_transactions`.
-
-    is_sparse : bool True if X is sparse
-
-    Returns
-    -----------
-    Generator of all combinations from the last step x items
-    from the previous step. Every combination contains the
-    number of transactions where this item occurs, followed
-    by item type ids in the ascending order.
-    No combination other than generated
-    do not have a chance to get enough support
-
-    Examples
-    -----------
-    For usage examples, please see
-    http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/generate_new_combinations/
-
-    """
-
-    items_types_in_previous_step = np.unique(old_combinations.flatten())
-    rows_count = X.shape[0]
-    threshold = min_support * rows_count
-    for old_combination in old_combinations:
-        max_combination = old_combination[-1]
-        mask = items_types_in_previous_step > max_combination
-        valid_items = items_types_in_previous_step[mask]
-        old_tuple = tuple(old_combination)
-        if is_sparse:
-            mask_rows = X[:, old_tuple].toarray().all(axis=1)
-            X_cols = X[:, valid_items].toarray()
-            supports = X_cols[mask_rows].sum(axis=0)
-        else:
-            mask_rows = X[:, old_tuple].all(axis=1)
-            supports = X[mask_rows][:, valid_items].sum(axis=0)
-        valid_indices = (supports >= threshold).nonzero()[0]
-        for index in valid_indices:
-            yield supports[index]
-            yield from old_tuple
-            yield valid_items[index]
+def generate_supports_and_itemsets(X, is_sparse, combin, min_support):
+    counter = 0
+    if is_sparse:
+        count = np.empty(X.shape[0], dtype=int)
+        for itemset in combin:
+            counter += 1
+            count[:] = 0
+            for item in itemset:
+                # faster than X[:, item].toarray() or X.getcol(item).indices
+                count[X.indices[X.indptr[item]:X.indptr[item+1]]] += 1
+            support = np.count_nonzero(count == len(itemset))
+            if support >= min_support:
+                yield support
+                yield from itemset
+    else:
+        for itemset in combin:
+            counter += 1
+            _bools = np.ones(X.shape[0], dtype=bool)
+            for item in itemset:
+                np.logical_and(_bools, X[:, item], out=_bools)
+            support = np.count_nonzero(_bools)
+            if support >= min_support:
+                yield support
+                yield from itemset
+    # return the total of processed itemsets as last element
+    yield counter


 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
@@ -254,13 +202,15 @@ def _support(_x, _n_rows, _is_sparse):
             X = df.values
         else:
             X = df.to_coo().tocsc()
+            X.eliminate_zeros()
         is_sparse = True
     elif hasattr(df, "sparse"):
         # DataFrame with SparseArray (pandas >= 0.24)
         if df.size == 0:
             X = df.values
         else:
             X = df.sparse.to_coo().tocsc()
+            X.eliminate_zeros()
         is_sparse = True
     else:
         # dense DataFrame
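
The two X.eliminate_zeros() calls added above are worth a note: the new support counting reads X.indptr and X.indices directly, so every stored entry of a column is treated as an occurrence of that item, and explicitly stored zeros would inflate the counts. A small illustration of the effect, on toy data only:

```python
import numpy as np
from scipy.sparse import csc_matrix

# Column 0 carries an explicitly stored zero in row 1.
data = np.array([1, 0, 1, 1, 1])
row = np.array([0, 1, 2, 1, 2])
col = np.array([0, 0, 0, 1, 1])
X = csc_matrix((data, (row, col)), shape=(3, 2))

print(X.indices[X.indptr[0]:X.indptr[1]])  # [0 1 2]: row 1 shows up despite its 0 value
X.eliminate_zeros()
print(X.indices[X.indptr[0]:X.indptr[1]])  # [0 2]: only true nonzeros remain
```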
@@ -271,63 +221,30 @@ def _support(_x, _n_rows, _is_sparse):
     support_dict = {1: support[support >= min_support]}
     itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
     max_itemset = 1
-    rows_count = float(X.shape[0])
-
-    all_ones = np.ones((int(rows_count), 1))

     while max_itemset and max_itemset < (max_len or float('inf')):
         next_max_itemset = max_itemset + 1

-        # With exceptionally large datasets, the matrix operations can use a
-        # substantial amount of memory. For low memory applications or large
-        # datasets, set `low_memory=True` to use a slower but more memory-
-        # efficient implementation.
-        if low_memory:
-            combin = generate_new_combinations_low_memory(
-                itemset_dict[max_itemset], X, min_support, is_sparse)
-            # slightly faster than creating an array from a list of tuples
-            combin = np.fromiter(combin, dtype=int)
-            combin = combin.reshape(-1, next_max_itemset + 1)
-
-            if combin.size == 0:
-                break
+        combin = generate_new_combinations(itemset_dict[max_itemset])
+        min_rows = np.math.ceil(min_support * X.shape[0])
+        gen_itemsets = generate_supports_and_itemsets(
+            X, is_sparse, combin, min_rows)
+
+        support_valid_itemsets = np.fromiter(gen_itemsets, dtype=int)
+        processed_itemsets = support_valid_itemsets[-1]
+        support_valid_itemsets = support_valid_itemsets[:-1]
+        if support_valid_itemsets.size > 0:
             if verbose:
                 print(
-                    '\rProcessing %d combinations | Sampling itemset size %d' %
-                    (combin.size, next_max_itemset), end="")
-
-            itemset_dict[next_max_itemset] = combin[:, 1:]
-            support_dict[next_max_itemset] = combin[:, 0].astype(float) \
-                / rows_count
+                    '\rProcessed %d combinations | Sampling itemset size %d' %
+                    (processed_itemsets, next_max_itemset), end="")
+            support_valid_itemsets.shape = (-1, 1 + next_max_itemset)
+            itemset_dict[next_max_itemset] = support_valid_itemsets[:, 1:]
+            support_dict[next_max_itemset] = support_valid_itemsets[:, 0] / X.shape[0]
             max_itemset = next_max_itemset
         else:
-            combin = generate_new_combinations(itemset_dict[max_itemset])
-            combin = np.fromiter(combin, dtype=int)
-            combin = combin.reshape(-1, next_max_itemset)
-
-            if combin.size == 0:
-                break
-            if verbose:
-                print(
-                    '\rProcessing %d combinations | Sampling itemset size %d' %
-                    (combin.size, next_max_itemset), end="")
-
-            if is_sparse:
-                _bools = X[:, combin[:, 0]] == all_ones
-                for n in range(1, combin.shape[1]):
-                    _bools = _bools & (X[:, combin[:, n]] == all_ones)
-            else:
-                _bools = np.all(X[:, combin], axis=2)
-
-            support = _support(np.array(_bools), rows_count, is_sparse)
-            _mask = (support >= min_support).reshape(-1)
-            if any(_mask):
-                itemset_dict[next_max_itemset] = np.array(combin[_mask])
-                support_dict[next_max_itemset] = np.array(support[_mask])
-                max_itemset = next_max_itemset
-            else:
-                # Exit condition
-                break
+            # Exit condition
+            break

     all_res = []
     for k in sorted(itemset_dict):
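
For readers less familiar with CSC internals, the counting trick used in generate_supports_and_itemsets above can be shown on a toy matrix: slicing X.indices with X.indptr returns the row indices of one column's stored entries, so incrementing a per-row counter once per item and then counting the rows that reached len(itemset) gives the support without densifying any columns. A minimal sketch with made-up data:

```python
import numpy as np
from scipy.sparse import csc_matrix

# toy one-hot transaction matrix: 4 transactions x 3 items
X = csc_matrix(np.array([[1, 1, 0],
                         [1, 1, 1],
                         [0, 1, 1],
                         [1, 0, 1]]))

itemset = (0, 1)                     # candidate itemset: items 0 and 1
count = np.zeros(X.shape[0], dtype=int)
for item in itemset:
    # row indices of the transactions containing `item`, read from the CSC index arrays
    count[X.indices[X.indptr[item]:X.indptr[item + 1]]] += 1

# a transaction supports the itemset iff it contains every item in it
support = np.count_nonzero(count == len(itemset))
print(support)  # 2: transactions 0 and 1 contain both items
```

This is also why the explicit zeros are removed first: any stored zero in a column would be counted as if the item were present in that transaction.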

mlxtend/frequent_patterns/tests/test_fpbase.py

Lines changed: 1 addition & 1 deletion
@@ -229,7 +229,7 @@ def test_low_memory_flag(self):
             _ = self.fpalgo(self.df, low_memory=True, verbose=1)

             # Only get the last value of the stream to reduce test noise
-            expect = 'Processing 4 combinations | Sampling itemset size 3\n'
+            expect = 'Processed 1 combinations | Sampling itemset size 3\n'
             out = out.getvalue().split('\r')[-1]
             assert out == expect
         else:
