diff --git a/mlxtend/frequent_patterns/apriori.py b/mlxtend/frequent_patterns/apriori.py
index 8e41e75ae..18ed98fcd 100644
--- a/mlxtend/frequent_patterns/apriori.py
+++ b/mlxtend/frequent_patterns/apriori.py
@@ -14,24 +14,24 @@ def generate_new_combinations(old_combinations):
     Generator of all combinations based on the last state of Apriori algorithm
     Parameters
     -----------
-    old_combinations: np.array
+    old_combinations: list of tuples
        All combinations with enough support in the last step
-       Combinations are represented by a matrix.
-       Number of columns is equal to the combination size
+       Combinations are represented by a list of tuples.
+       All tuples have the same length, which is equal to the combination size
        of the previous step.
-       Each row represents one combination
+       Each tuple represents one combination
        and contains item type ids in the ascending order
        ```
-             0        1
-       0     15       20
-       1     15       22
-       2     17       19
+       15 20
+       15 22
+       17 19
        ```

     Returns
     -----------
-    Generator of all combinations from the last step x items
-    from the previous step.
+    Generator of combinations based on the last state of Apriori algorithm.
+    In order to reduce the number of candidates, this function implements the
+    apriori-gen function described in section 2.1.1 of the Apriori paper.

     Examples
     -----------
@@ -40,95 +40,28 @@ def generate_new_combinations(old_combinations):

     """

-    items_types_in_previous_step = np.unique(old_combinations.flatten())
-    for old_combination in old_combinations:
-        max_combination = old_combination[-1]
-        mask = items_types_in_previous_step > max_combination
-        valid_items = items_types_in_previous_step[mask]
-        old_tuple = tuple(old_combination)
-        for item in valid_items:
-            yield from old_tuple
-            yield item
-
-
-def generate_new_combinations_low_memory(old_combinations, X, min_support,
-                                         is_sparse):
-    """
-    Generator of all combinations based on the last state of Apriori algorithm
-    Parameters
-    -----------
-    old_combinations: np.array
-       All combinations with enough support in the last step
-       Combinations are represented by a matrix.
-       Number of columns is equal to the combination size
-       of the previous step.
-       Each row represents one combination
-       and contains item type ids in the ascending order
-       ```
-             0        1
-       0     15       20
-       1     15       22
-       2     17       19
-       ```
-
-    X: np.array or scipy sparse matrix
-      The allowed values are either 0/1 or True/False.
-      For example,
-
-    ```
-        0     True False  True  True False  True
-        1     True False  True False False  True
-        2     True False  True False False False
-        3     True  True False False False False
-        4    False False  True  True  True  True
-        5    False False  True False  True  True
-        6    False False  True False  True False
-        7     True  True False False False False
-    ```
-
-    min_support : float (default: 0.5)
-      A float between 0 and 1 for minumum support of the itemsets returned.
-      The support is computed as the fraction
-      `transactions_where_item(s)_occur / total_transactions`.
-
-    is_sparse : bool True if X is sparse
-
-    Returns
-    -----------
-    Generator of all combinations from the last step x items
-    from the previous step. Every combination contains the
-    number of transactions where this item occurs, followed
-    by item type ids in the ascending order.
-    No combination other than generated
-    do not have a chance to get enough support
-
-    Examples
-    -----------
-    For usage examples, please see
-    http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/generate_new_combinations/
-
-    """
-
-    items_types_in_previous_step = np.unique(old_combinations.flatten())
-    rows_count = X.shape[0]
-    threshold = min_support * rows_count
-    for old_combination in old_combinations:
-        max_combination = old_combination[-1]
-        mask = items_types_in_previous_step > max_combination
-        valid_items = items_types_in_previous_step[mask]
-        old_tuple = tuple(old_combination)
-        if is_sparse:
-            mask_rows = X[:, old_tuple].toarray().all(axis=1)
-            X_cols = X[:, valid_items].toarray()
-            supports = X_cols[mask_rows].sum(axis=0)
-        else:
-            mask_rows = X[:, old_tuple].all(axis=1)
-            supports = X[mask_rows][:, valid_items].sum(axis=0)
-        valid_indices = (supports >= threshold).nonzero()[0]
-        for index in valid_indices:
-            yield supports[index]
-            yield from old_tuple
-            yield valid_items[index]
+    length = len(old_combinations)
+    set_old_combinations = set(old_combinations)
+    for i, old_combination in enumerate(old_combinations):
+        head_i = list(old_combination[:-1])
+        j = i + 1
+        while j < length:
+            *head_j, tail_j = old_combinations[j]
+            if head_i != head_j:
+                break
+            # Prune old_combination+(item,) if any subset is not frequent
+            candidate = old_combination + (tail_j,)
+            # No need to check the last two values, because test_candidate
+            # is then old_combinations[i] and old_combinations[j]
+            for idx in range(len(candidate) - 2):
+                test_candidate = list(candidate)
+                del test_candidate[idx]
+                if tuple(test_candidate) not in set_old_combinations:
+                    # early exit from for-loop skips else clause just below
+                    break
+            else:
+                yield candidate
+            j = j + 1


 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
@@ -168,16 +101,7 @@ def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
       possible itemsets lengths (under the apriori condition) are evaluated.

     verbose : int (default: 0)
-      Shows the number of iterations if >= 1 and `low_memory` is `True`. If
-      >=1 and `low_memory` is `False`, shows the number of combinations.
-
-    low_memory : bool (default: False)
-      If `True`, uses an iterator to search for combinations above
-      `min_support`.
-      Note that while `low_memory=True` should only be used for large dataset
-      if memory resources are limited, because this implementation is approx.
-      3-6x slower than the default.
-
+      Shows the number of combinations if >= 1.

     Returns
     -----------
@@ -197,32 +121,6 @@ def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,

     """

-    def _support(_x, _n_rows, _is_sparse):
-        """DRY private method to calculate support as the
-        row-wise sum of values / number of rows
-
-        Parameters
-        -----------
-
-        _x : matrix of bools or binary
-
-        _n_rows : numeric, number of rows in _x
-
-        _is_sparse : bool True if _x is sparse
-
-        Returns
-        -----------
-        np.array, shape = (n_rows, )
-
-        Examples
-        -----------
-        For usage examples, please see
-        http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
-
-        """
-        out = (np.sum(_x, axis=0) / _n_rows)
-        return np.array(out).reshape(-1)
-
     if min_support <= 0.:
         raise ValueError('`min_support` must be a positive '
                          'number within the interval `(0, 1]`. '
@@ -240,6 +138,8 @@ def _support(_x, _n_rows, _is_sparse):
             X = df.values
         else:
             X = df.to_coo().tocsc()
+            # See comment below
+            X.eliminate_zeros()
         is_sparse = True
     elif hasattr(df, "sparse"):
         # DataFrame with SparseArray (pandas >= 0.24)
@@ -247,73 +147,68 @@ def _support(_x, _n_rows, _is_sparse):
             X = df.values
         else:
             X = df.sparse.to_coo().tocsc()
+            # See comment below
+            X.eliminate_zeros()
         is_sparse = True
     else:
         # dense DataFrame
         X = df.values
         is_sparse = False
-    support = _support(X, X.shape[0], is_sparse)
-    ary_col_idx = np.arange(X.shape[1])
+    if is_sparse:
+        # Count nonnull entries via direct access to X indices;
+        # this requires X to be stored in CSC format, and to call
+        # X.eliminate_zeros() to remove null entries from X.
+        support = np.array([X.indptr[idx+1] - X.indptr[idx]
+                            for idx in range(X.shape[1])], dtype=int)
+    else:
+        # Faster than np.count_nonzero(X, axis=0) or np.sum(X, axis=0), why?
+        support = np.array([np.count_nonzero(X[:, idx])
+                            for idx in range(X.shape[1])], dtype=int)
+    support = support / X.shape[0]
     support_dict = {1: support[support >= min_support]}
-    itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
+    itemset_dict = {1: [(idx,) for idx in np.where(support >= min_support)[0]]}
     max_itemset = 1
-    rows_count = float(X.shape[0])
-
-    all_ones = np.ones((int(rows_count), 1))

     while max_itemset and max_itemset < (max_len or float('inf')):
         next_max_itemset = max_itemset + 1

-        # With exceptionally large datasets, the matrix operations can use a
-        # substantial amount of memory. For low memory applications or large
-        # datasets, set `low_memory=True` to use a slower but more memory-
-        # efficient implementation.
-        if low_memory:
-            combin = generate_new_combinations_low_memory(
-                itemset_dict[max_itemset], X, min_support, is_sparse)
-            # slightly faster than creating an array from a list of tuples
-            combin = np.fromiter(combin, dtype=int)
-            combin = combin.reshape(-1, next_max_itemset + 1)
-
-            if combin.size == 0:
-                break
-            if verbose:
-                print(
-                    '\rProcessing %d combinations | Sampling itemset size %d' %
-                    (combin.size, next_max_itemset), end="")
-
-            itemset_dict[next_max_itemset] = combin[:, 1:]
-            support_dict[next_max_itemset] = combin[:, 0].astype(float) \
-                / rows_count
-            max_itemset = next_max_itemset
+        combin = generate_new_combinations(itemset_dict[max_itemset])
+        # count supports
+        frequent_itemsets = []
+        frequent_supports = []
+        processed = 0
+        if is_sparse:
+            count = np.empty(X.shape[0], dtype=int)
+            for itemset in combin:
+                processed += 1
+                count[:] = 0
+                for item in itemset:
+                    count[X.indices[X.indptr[item]:X.indptr[item+1]]] += 1
+                support = np.count_nonzero(count == len(itemset)) / X.shape[0]
+                if support >= min_support:
+                    frequent_itemsets.append(itemset)
+                    frequent_supports.append(support)
         else:
-            combin = generate_new_combinations(itemset_dict[max_itemset])
-            combin = np.fromiter(combin, dtype=int)
-            combin = combin.reshape(-1, next_max_itemset)
-
-            if combin.size == 0:
-                break
-            if verbose:
-                print(
-                    '\rProcessing %d combinations | Sampling itemset size %d' %
-                    (combin.size, next_max_itemset), end="")
-
-            if is_sparse:
-                _bools = X[:, combin[:, 0]] == all_ones
-                for n in range(1, combin.shape[1]):
-                    _bools = _bools & (X[:, combin[:, n]] == all_ones)
-            else:
-                _bools = np.all(X[:, combin], axis=2)
-
-            support = _support(np.array(_bools), rows_count, is_sparse)
-            _mask = (support >= min_support).reshape(-1)
-            if any(_mask):
-                itemset_dict[next_max_itemset] = np.array(combin[_mask])
-                support_dict[next_max_itemset] = np.array(support[_mask])
-                max_itemset = next_max_itemset
-            else:
-                # Exit condition
-                break
+            _bools = np.empty(X.shape[0], dtype=bool)
+            for itemset in combin:
+                processed += 1
+                _bools.fill(True)
+                for item in itemset:
+                    np.logical_and(_bools, X[:, item], out=_bools)
+                support = np.count_nonzero(_bools) / X.shape[0]
+                if support >= min_support:
+                    frequent_itemsets.append(itemset)
+                    frequent_supports.append(support)
+        if not frequent_itemsets:
+            # Exit condition
+            break
+        if verbose:
+            print(
+                '\rProcessed %d combinations | Sampling itemset size %d' %
+                (processed, next_max_itemset), end="")
+        itemset_dict[next_max_itemset] = frequent_itemsets
+        support_dict[next_max_itemset] = frequent_supports
+        max_itemset = next_max_itemset

     all_res = []
     for k in sorted(itemset_dict):
diff --git a/mlxtend/frequent_patterns/tests/benchmark.py b/mlxtend/frequent_patterns/tests/benchmark.py
new file mode 100644
index 000000000..7a7435770
--- /dev/null
+++ b/mlxtend/frequent_patterns/tests/benchmark.py
@@ -0,0 +1,105 @@
+# Sebastian Raschka 2014-2019
+# mlxtend Machine Learning Library Extensions
+# Author: Sebastian Raschka
+#
+# License: BSD 3 clause
+
+from mlxtend.preprocessing import TransactionEncoder
+from mlxtend.frequent_patterns import apriori
+import pandas as pd
+import numpy as np
+import gzip
+import os
+import sys
+from time import time
+import signal
+from contextlib import contextmanager
+
+
+@contextmanager
+def timeout(time):
+    # Register a function to raise a TimeoutError on the signal.
+    signal.signal(signal.SIGALRM, raise_timeout)
+    # Schedule the signal to be sent after ``time``.
+    signal.alarm(time)
+
+    try:
+        yield
+    except TimeoutError:
+        pass
+    finally:
+        # Unregister the signal so it won't be triggered
+        # if the timeout is not reached.
+        signal.signal(signal.SIGALRM, signal.SIG_IGN)
+
+
+def raise_timeout(signum, frame):
+    raise TimeoutError
+
+
+files = [
+    # "chess.dat.gz",
+    # "connect.dat.gz",
+    "mushroom.dat.gz",
+    "pumsb.dat.gz",
+    "pumsb_star.dat.gz",
+    # "T10I4D100K.dat.gz",
+    # "T40I10D100K.dat.gz",
+    # "kosarak.dat.gz",  # this file is too large in sparse format
+    # "kosarak-1k.dat.gz",
+    # "kosarak-10k.dat.gz",
+    # "kosarak-50k.dat.gz",
+    # "kosarak-100k.dat.gz",
+    # "kosarak-200k.dat.gz",
+]
+
+
+low_memory = True
+commit = "b731fd2"
+test_supports = [0.5, 0.3, 0.1, 0.05, 0.03, 0.01, 0.005, 0.003, 0.001]
+
+for sparse, col_major in [[False, True], [False, False], [True, True]]:
+    sys.stdout = open("Results/{}-sparse{}-col_major{}.out".format(
+        commit, sparse, col_major), "w")
+    for filename in files:
+        with gzip.open(os.path.join("data", filename)) if filename.endswith(
+            ".gz"
+        ) else open(os.path.join("data", filename)) as f:
+            data = f.readlines()
+
+        dataset = [list(map(int, line.split())) for line in data]
+        items = np.unique([item for itemset in dataset for item in itemset])
+        print("{} contains {} transactions and {} items".format(
+            filename, len(dataset), len(items)))
+
+        te = TransactionEncoder()
+        te_ary = te.fit(dataset).transform(dataset, sparse=sparse)
+        columns = ["c"+str(i) for i in te.columns_]
+        if sparse:
+            try:
+                df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=columns)
+            except AttributeError:
+                # pandas < 0.25
+                df = pd.SparseDataFrame(te_ary, columns=columns,
+                                        default_fill_value=False)
+        else:
+            df = pd.DataFrame(te_ary, columns=columns)
+        if col_major:
+            df = pd.DataFrame({col: df[col] for col in df.columns})
+        np.info(df.values)
+
+        kwds = {"use_colnames": False, "low_memory": low_memory}
+        for min_support in test_supports:
+            tick = time()
+            with timeout(120):
+                print(apriori(df, min_support=min_support, verbose=1, **kwds))
+            print("\nmin_support={} time: {}\n".format(
+                min_support, time() - tick))
+            if time() - tick < 10:
+                times = []
+                for _ in range(5):
+                    tick = time()
+                    apriori(df, min_support=min_support, verbose=0, **kwds)
+                    times.append(time() - tick)
+                print("Times:", times)
+    sys.stdout.close()
diff --git a/mlxtend/frequent_patterns/tests/test_fpbase.py b/mlxtend/frequent_patterns/tests/test_fpbase.py
index 4a7f79e12..30551a03c 100644
--- a/mlxtend/frequent_patterns/tests/test_fpbase.py
+++ b/mlxtend/frequent_patterns/tests/test_fpbase.py
@@ -229,7 +229,7 @@ def test_low_memory_flag(self):
             _ = self.fpalgo(self.df, low_memory=True, verbose=1)

             # Only get the last value of the stream to reduce test noise
-            expect = 'Processing 4 combinations | Sampling itemset size 3\n'
+            expect = 'Processed 1 combinations | Sampling itemset size 3\n'
             out = out.getvalue().split('\r')[-1]
             assert out == expect
         else:
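
Side note (not part of the patch): for readers skimming the diff, below is a minimal standalone sketch of the join-and-prune (apriori-gen) step that the rewritten `generate_new_combinations` implements. The helper name `candidate_gen` and the toy itemsets are illustrative assumptions, not code from this PR.

```python
from itertools import combinations


def candidate_gen(old_combinations):
    # old_combinations: frequent k-itemsets as ascending tuples, listed in
    # lexicographic order (the same representation used in the patch).
    old_set = set(old_combinations)
    k = len(old_combinations[0])
    for i, head in enumerate(old_combinations):
        for other in old_combinations[i + 1:]:
            # Join step: both itemsets must share the same (k-1)-prefix;
            # since the list is sorted, we can stop at the first mismatch.
            if head[:-1] != other[:-1]:
                break
            candidate = head + (other[-1],)
            # Prune step: keep the candidate only if every k-subset is frequent.
            if all(sub in old_set for sub in combinations(candidate, k)):
                yield candidate


frequent_2 = [(15, 20), (15, 22), (17, 19), (20, 22)]
print(list(candidate_gen(frequent_2)))   # [(15, 20, 22)]
```

The patched generator does the same thing but, as its inline comment notes, skips testing the last two k-subsets of each candidate, since those are exactly the two parent itemsets being joined.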