Commit 71356a7
Let apriori always use low_memory processing
Thanks to previous optimizations, processing with low_memory=True is now almost as efficient as with low_memory=False, and it allows much larger datasets to be processed. Removing the low_memory=False code path makes the code simpler and lets itemsets be produced by a generator, which saves additional memory. The downside is that the number of itemsets to process is not known in advance, so it is displayed only after processing. Note that commit 2f928cb introduced a bug: the reported number of processed combinations was multiplied by the itemset length, which explains why the verbose output differs now.
1 parent 8ed16b7 commit 71356a7
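As a rough illustration of the new generator protocol (a sketch with made-up values, not code from this commit): generate_supports_and_itemsets yields, for every frequent candidate, its absolute support followed by the itemset's items, and yields the total number of processed candidates as the very last element, so a single np.fromiter plus a reshape recovers supports, itemsets, and the processed count without ever building the full candidate array.

    import numpy as np

    def toy_stream():
        # two frequent 2-itemsets with hypothetical absolute supports 5 and 3
        yield from (5, 0, 2)   # support 5, itemset (0, 2)
        yield from (3, 1, 2)   # support 3, itemset (1, 2)
        yield 7                # total number of candidates processed

    next_max_itemset = 2
    flat = np.fromiter(toy_stream(), dtype=int)
    processed = flat[-1]                                  # 7
    table = flat[:-1].reshape(-1, 1 + next_max_itemset)   # one row per itemset
    supports = table[:, 0]                                # array([5, 3])
    itemsets = table[:, 1:]                               # array([[0, 2], [1, 2]])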

File tree

2 files changed: +40 -56 lines changed

mlxtend/frequent_patterns/apriori.py

Lines changed: 39 additions & 55 deletions
@@ -61,40 +61,37 @@ def generate_new_combinations(old_combinations):
                     # early exit from for-loop skips else clause just below
                     break
             else:
-                yield from candidate
+                yield candidate
             j = j + 1


-def compute_supports_low_memory(X, is_sparse, combin):
-    supports = np.zeros(combin.shape[0])
-    ncomb, nitems = combin.shape
+def generate_supports_and_itemsets(X, is_sparse, combin, min_support):
+    counter = 0
     if is_sparse:
-        _bools = X[:, 0].toarray()
-        for c in range(ncomb):
-            _bools[:] = X[:, combin[c, 0]].toarray()
-            for j in range(1, nitems):
-                np.logical_and(_bools, X[:, combin[c, j]].toarray(), out=_bools)
-            supports[c] = np.count_nonzero(_bools)
+        count = np.empty(X.shape[0], dtype=int)
+        for itemset in combin:
+            counter += 1
+            count[:] = 0
+            for item in itemset:
+                nnz = np.nonzero(X.data[X.indptr[item]:X.indptr[item+1]])[0]
+                # much faster than X[:, item].toarray() or X.getcol(item).indices
+                count[X.indices[X.indptr[item]:X.indptr[item+1]][nnz]] += 1
+            support = np.count_nonzero(count == len(itemset))
+            if support >= min_support:
+                yield support
+                yield from itemset
     else:
-        _bools = np.copy(X[:, 0])
-        for c in range(ncomb):
-            _bools[:] = X[:, combin[c, 0]]
-            for j in range(1, nitems):
-                np.logical_and(_bools, X[:, combin[c, j]], out=_bools)
-            supports[c] = np.count_nonzero(_bools)
-    return supports
-
-
-def compute_supports(X, is_sparse, combin):
-    if is_sparse:
-        all_ones = np.ones((X.shape[0], 1))
-        _bools = np.array(X[:, combin[:, 0]] == all_ones)
-        for n in range(1, combin.shape[1]):
-            _bools = _bools & np.array(X[:, combin[:, n]] == all_ones)
-    else:
-        _bools = np.all(X[:, combin], axis=2)
-
-        return np.sum(_bools, axis=0)
+        for itemset in combin:
+            counter += 1
+            _bools = np.ones(X.shape[0], dtype=bool)
+            for item in itemset:
+                np.logical_and(_bools, X[:, item], out=_bools)
+            support = np.count_nonzero(_bools)
+            if support >= min_support:
+                yield support
+                yield from itemset
+    # return the total of processed itemsets as last element
+    yield counter


 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
@@ -223,37 +220,24 @@ def _support(_x, _n_rows, _is_sparse):
     support_dict = {1: support[support >= min_support]}
     itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
     max_itemset = 1
-    rows_count = float(X.shape[0])
-
-    all_ones = np.ones((int(rows_count), 1))

     while max_itemset and max_itemset < (max_len or float('inf')):
         next_max_itemset = max_itemset + 1

         combin = generate_new_combinations(itemset_dict[max_itemset])
-        combin = np.fromiter(combin, dtype=int)
-        combin = combin.reshape(-1, next_max_itemset)
-
-        if combin.size == 0:
-            break
-        if verbose:
-            print(
-                '\rProcessing %d combinations | Sampling itemset size %d' %
-                (combin.size, next_max_itemset), end="")
-
-        # With exceptionally large datasets, the matrix operations can use a
-        # substantial amount of memory. For low memory applications or large
-        # datasets, set `low_memory=True` to use a slower but more memory-
-        # efficient implementation.
-        if low_memory:
-            support = compute_supports_low_memory(X, is_sparse, combin) / rows_count
-        else:
-            support = compute_supports(X, is_sparse, combin) / rows_count
-
-        _mask = (support >= min_support)
-        if any(_mask):
-            itemset_dict[next_max_itemset] = np.array(combin[_mask])
-            support_dict[next_max_itemset] = np.array(support[_mask])
+        gen_itemsets = generate_supports_and_itemsets(X, is_sparse, combin, np.math.ceil(min_support * X.shape[0]))
+
+        support_valid_itemsets = np.fromiter(gen_itemsets, dtype=int)
+        processed_itemsets = support_valid_itemsets[-1]
+
+        support_valid_itemsets = support_valid_itemsets[:-1].reshape(-1, 1 + next_max_itemset)
+        if support_valid_itemsets.size > 0:
+            if verbose:
+                print(
+                    '\rProcessed %d combinations | Sampling itemset size %d' %
+                    (processed_itemsets, next_max_itemset), end="")
+            itemset_dict[next_max_itemset] = support_valid_itemsets[:, 1:]
+            support_dict[next_max_itemset] = support_valid_itemsets[:, 0] / X.shape[0]
         max_itemset = next_max_itemset
     else:
         # Exit condition
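The sparse branch above slices the CSC arrays directly instead of materializing dense columns. A minimal sketch of that access pattern, assuming X is a scipy.sparse.csc_matrix of 0/1 transactions (the small matrix below is made up for illustration):

    import numpy as np
    from scipy.sparse import csc_matrix

    X = csc_matrix(np.array([[1, 0, 1],
                             [1, 1, 0],
                             [0, 1, 1]]))
    item = 2

    # X.indptr[item]:X.indptr[item+1] delimits the stored entries of column
    # `item`; X.indices holds their row numbers and X.data their values.
    col = slice(X.indptr[item], X.indptr[item + 1])
    nnz = np.nonzero(X.data[col])[0]      # skip explicitly stored zeros
    rows_with_item = X.indices[col][nnz]  # rows whose transaction contains `item`

    # Same rows as the dense equivalent, without allocating a dense column:
    assert set(rows_with_item) == set(np.nonzero(X[:, item].toarray().ravel())[0])

Incrementing count[rows_with_item] for every item of a candidate and then counting the entries equal to len(itemset) gives that candidate's absolute support, which the generator compares against the precomputed minimum count.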

mlxtend/frequent_patterns/tests/test_fpbase.py

Lines changed: 1 addition & 1 deletion
@@ -229,7 +229,7 @@ def test_low_memory_flag(self):
             _ = self.fpalgo(self.df, low_memory=True, verbose=1)

             # Only get the last value of the stream to reduce test noise
-            expect = 'Processing 3 combinations | Sampling itemset size 3\n'
+            expect = 'Processed 1 combinations | Sampling itemset size 3\n'
             out = out.getvalue().split('\r')[-1]
             assert out == expect
         else:
