Commit 85ca67d
Let apriori always use low_memory processing
Thanks to previous optimizations, processing with low_memory=True is now as efficient as with low_memory=False and allows processing much larger datasets. Removing the low_memory=False code path makes the code simpler. The downside is that we no longer know the number of itemsets to process in advance, so the count is displayed afterwards: we now display the number of itemsets after the prune step. Note that commit 2f928cb introduced a bug in which the reported number of processed combinations was multiplied by the itemset length. Since vectorized operations are no longer performed on frequent itemsets, they are now stored as lists of tuples.
1 parent f8131a7 commit 85ca67d
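
For context, here is a minimal usage sketch of the changed code path (the toy DataFrame below is invented for illustration, not taken from the repo); with this commit the progress line reports how many candidate itemsets were processed, regardless of the `low_memory` flag:

```python
import pandas as pd
from mlxtend.frequent_patterns import apriori

# Toy one-hot transaction data (illustrative only).
df = pd.DataFrame({'Apple':  [1, 1, 0, 1],
                   'Banana': [1, 1, 1, 0],
                   'Cheese': [0, 1, 1, 1]}, dtype=bool)

# low_memory no longer selects a separate implementation; both values of
# the flag now go through the same candidate-by-candidate processing.
frequent = apriori(df, min_support=0.5, use_colnames=True, verbose=1)
print()
print(frequent)
```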

File tree

2 files changed: +55 −155 lines

mlxtend/frequent_patterns/apriori.py

Lines changed: 54 additions & 154 deletions
@@ -52,18 +52,17 @@ def generate_new_combinations(old_combinations):
     Generator of all combinations based on the last state of Apriori algorithm
     Parameters
     -----------
-    old_combinations: np.array
+    old_combinations: list of tuples
       All combinations with enough support in the last step
-      Combinations are represented by a matrix.
-      Number of columns is equal to the combination size
+      Combinations are represented by a list of tuples.
+      All tuples have the same length, which is equal to the combination size
       of the previous step.
-      Each row represents one combination
+      Each tuple represents one combination
       and contains item type ids in the ascending order
       ```
-           0        1
-      0   15       20
-      1   15       22
-      2   17       19
+      15       20
+      15       22
+      17       19
       ```

     Returns
@@ -89,7 +88,7 @@ def generate_new_combinations(old_combinations):
             if head_i != head_j:
                 break
             # Prune old_combination+(item,) if any subset is not frequent
-            candidate = tuple(old_combination) + (tail_j,)
+            candidate = old_combination + (tail_j,)
             # No need to check the last two values, because test_candidate
             # is then old_combinations[i] and old_combinations[j]
             for idx in range(len(candidate) - 2):
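
To make the join-and-prune step above easier to follow, here is a self-contained sketch of the same idea; `candidate_gen_sketch` is a simplified stand-in written for this note, not the library function (it checks every (k-1)-subset, whereas the real code skips the two subsets that are the joined itemsets themselves):

```python
from itertools import combinations

def candidate_gen_sketch(old_combinations):
    """Simplified join-and-prune step over a sorted list of item-id tuples."""
    frequent = set(old_combinations)
    for i, itemset in enumerate(old_combinations):
        head = itemset[:-1]
        for other in old_combinations[i + 1:]:
            if other[:-1] != head:
                break  # input is sorted, so later tuples cannot share the prefix
            candidate = itemset + (other[-1],)
            # Prune the candidate if any (k-1)-subset is not frequent
            if all(sub in frequent
                   for sub in combinations(candidate, len(candidate) - 1)):
                yield candidate

print(list(candidate_gen_sketch([(15, 20), (15, 22), (17, 19)])))
# -> []  (the join yields (15, 20, 22), but its subset (20, 22) is not
#         frequent, so the candidate is pruned)
```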
@@ -99,90 +98,10 @@ def generate_new_combinations(old_combinations):
                     # early exit from for-loop skips else clause just below
                     break
             else:
-                yield from candidate
+                yield candidate
             j = j + 1


-def generate_new_combinations_low_memory(old_combinations, X, min_support,
-                                         is_sparse):
-    """
-    Generator of all combinations based on the last state of Apriori algorithm
-    Parameters
-    -----------
-    old_combinations: np.array
-      All combinations with enough support in the last step
-      Combinations are represented by a matrix.
-      Number of columns is equal to the combination size
-      of the previous step.
-      Each row represents one combination
-      and contains item type ids in the ascending order
-      ```
-           0        1
-      0   15       20
-      1   15       22
-      2   17       19
-      ```
-
-    X: np.array or scipy sparse matrix
-      The allowed values are either 0/1 or True/False.
-      For example,
-
-      ```
-      0     True False  True  True False  True
-      1     True False  True False False  True
-      2     True False  True False False False
-      3     True  True False False False False
-      4    False False  True  True  True  True
-      5    False False  True False  True  True
-      6    False False  True False  True False
-      7     True  True False False False False
-      ```
-
-    min_support : float (default: 0.5)
-      A float between 0 and 1 for minumum support of the itemsets returned.
-      The support is computed as the fraction
-      `transactions_where_item(s)_occur / total_transactions`.
-
-    is_sparse : bool True if X is sparse
-
-    Returns
-    -----------
-    Generator of all combinations from the last step x items
-    from the previous step. Every combination contains the
-    number of transactions where this item occurs, followed
-    by item type ids in the ascending order.
-    No combination other than generated
-    do not have a chance to get enough support
-
-    Examples
-    -----------
-    For usage examples, please see
-    http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/generate_new_combinations/
-
-    """
-
-    items_types_in_previous_step = np.unique(old_combinations.flatten())
-    rows_count = X.shape[0]
-    threshold = min_support * rows_count
-    for old_combination in old_combinations:
-        max_combination = old_combination[-1]
-        mask = items_types_in_previous_step > max_combination
-        valid_items = items_types_in_previous_step[mask]
-        old_tuple = tuple(old_combination)
-        if is_sparse:
-            mask_rows = X[:, old_tuple].toarray().all(axis=1)
-            X_cols = X[:, valid_items].toarray()
-            supports = X_cols[mask_rows].sum(axis=0)
-        else:
-            mask_rows = X[:, old_tuple].all(axis=1)
-            supports = X[mask_rows][:, valid_items].sum(axis=0)
-        valid_indices = (supports >= threshold).nonzero()[0]
-        for index in valid_indices:
-            yield supports[index]
-            yield from old_tuple
-            yield valid_items[index]
-
-
 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
             low_memory=False):
     """Get frequent itemsets from a one-hot DataFrame
@@ -220,16 +139,7 @@ def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
      possible itemsets lengths (under the apriori condition) are evaluated.

    verbose : int (default: 0)
-      Shows the number of iterations if >= 1 and `low_memory` is `True`. If
-      >=1 and `low_memory` is `False`, shows the number of combinations.
-
-    low_memory : bool (default: False)
-      If `True`, uses an iterator to search for combinations above
-      `min_support`.
-      Note that while `low_memory=True` should only be used for large dataset
-      if memory resources are limited, because this implementation is approx.
-      3-6x slower than the default.
-
+      Shows the number of combinations if >= 1.

    Returns
    -----------
@@ -292,80 +202,70 @@ def _support(_x, _n_rows, _is_sparse):
             X = df.values
         else:
             X = df.to_coo().tocsc()
+            # See comment below
+            X.eliminate_zeros()
         is_sparse = True
     elif hasattr(df, "sparse"):
         # DataFrame with SparseArray (pandas >= 0.24)
         if df.size == 0:
             X = df.values
         else:
             X = df.sparse.to_coo().tocsc()
+            # See comment below
+            X.eliminate_zeros()
         is_sparse = True
     else:
         # dense DataFrame
         X = df.values
         is_sparse = False
     support = _support(X, X.shape[0], is_sparse)
-    ary_col_idx = np.arange(X.shape[1])
     support_dict = {1: support[support >= min_support]}
-    itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
+    itemset_dict = {1: [(idx,) for idx in np.where(support >= min_support)[0]]}
     max_itemset = 1
-    rows_count = float(X.shape[0])
-
-    all_ones = np.ones((int(rows_count), 1))

     while max_itemset and max_itemset < (max_len or float('inf')):
         next_max_itemset = max_itemset + 1

-        # With exceptionally large datasets, the matrix operations can use a
-        # substantial amount of memory. For low memory applications or large
-        # datasets, set `low_memory=True` to use a slower but more memory-
-        # efficient implementation.
-        if low_memory:
-            combin = generate_new_combinations_low_memory(
-                itemset_dict[max_itemset], X, min_support, is_sparse)
-            # slightly faster than creating an array from a list of tuples
-            combin = np.fromiter(combin, dtype=int)
-            combin = combin.reshape(-1, next_max_itemset + 1)
-
-            if combin.size == 0:
-                break
-            if verbose:
-                print(
-                    '\rProcessing %d combinations | Sampling itemset size %d' %
-                    (combin.size, next_max_itemset), end="")
-
-            itemset_dict[next_max_itemset] = combin[:, 1:]
-            support_dict[next_max_itemset] = combin[:, 0].astype(float) \
-                / rows_count
-            max_itemset = next_max_itemset
+        combin = generate_new_combinations(itemset_dict[max_itemset])
+        # count supports
+        frequent_itemsets = []
+        frequent_supports = []
+        processed = 0
+        if is_sparse:
+            count = np.empty(X.shape[0], dtype=int)
+            for itemset in combin:
+                processed += 1
+                count[:] = 0
+                for item in itemset:
+                    # Count nonnull entries via direct access to X indices;
+                    # this requires X to be stored in CSC format, and to call
+                    # X.eliminate_zeros() to remove null entries from X.
+                    count[X.indices[X.indptr[item]:X.indptr[item+1]]] += 1
+                support = np.count_nonzero(count == len(itemset)) / X.shape[0]
+                if support >= min_support:
+                    frequent_itemsets.append(itemset)
+                    frequent_supports.append(support)
         else:
-            combin = generate_new_combinations(itemset_dict[max_itemset])
-            combin = np.fromiter(combin, dtype=int)
-            combin = combin.reshape(-1, next_max_itemset)
-
-            if combin.size == 0:
-                break
-            if verbose:
-                print(
-                    '\rProcessing %d combinations | Sampling itemset size %d' %
-                    (combin.size, next_max_itemset), end="")
-
-            if is_sparse:
-                _bools = X[:, combin[:, 0]] == all_ones
-                for n in range(1, combin.shape[1]):
-                    _bools = _bools & (X[:, combin[:, n]] == all_ones)
-            else:
-                _bools = np.all(X[:, combin], axis=2)
-
-            support = _support(np.array(_bools), rows_count, is_sparse)
-            _mask = (support >= min_support).reshape(-1)
-            if any(_mask):
-                itemset_dict[next_max_itemset] = np.array(combin[_mask])
-                support_dict[next_max_itemset] = np.array(support[_mask])
-                max_itemset = next_max_itemset
-            else:
-                # Exit condition
-                break
+            _bools = np.empty(X.shape[0], dtype=bool)
+            for itemset in combin:
+                processed += 1
+                _bools.fill(True)
+                for item in itemset:
+                    np.logical_and(_bools, X[:, item], out=_bools)
+                support = np.count_nonzero(_bools) / X.shape[0]
+                if support >= min_support:
+                    frequent_itemsets.append(itemset)
+                    frequent_supports.append(support)
+        if not frequent_itemsets:
+            # Exit condition
+            break
+        if verbose:
+            print(
+                '\rProcessed %d combinations | Sampling itemset size %d' %
+                (processed, next_max_itemset), end="")
+        itemset_dict[next_max_itemset] = frequent_itemsets
+        support_dict[next_max_itemset] = frequent_supports
+        max_itemset = next_max_itemset

     all_res = []
     for k in sorted(itemset_dict):
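
A small self-contained sketch of the new support counting (toy matrix, invented values): for a sparse CSC matrix, the rows containing an item are read directly from `X.indices`/`X.indptr`, and a transaction supports the itemset when it is hit once per item; the dense branch reaches the same number with boolean column ANDs:

```python
import numpy as np
from scipy.sparse import csc_matrix

# Toy one-hot matrix: rows are transactions, columns are items.
dense = np.array([[1, 1, 0],
                  [1, 1, 1],
                  [0, 1, 1],
                  [1, 0, 1]], dtype=bool)
X = csc_matrix(dense)
X.eliminate_zeros()  # X.indices must contain only truly nonzero entries

itemset = (0, 1)     # support of {item 0, item 1}
count = np.zeros(X.shape[0], dtype=int)
for item in itemset:
    # rows where `item` occurs, read straight from the CSC structure
    count[X.indices[X.indptr[item]:X.indptr[item + 1]]] += 1
support_sparse = np.count_nonzero(count == len(itemset)) / X.shape[0]

# Dense equivalent: AND the item columns together and count True rows.
support_dense = np.count_nonzero(dense[:, list(itemset)].all(axis=1)) / dense.shape[0]

assert support_sparse == support_dense == 0.5
```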

mlxtend/frequent_patterns/tests/test_fpbase.py

Lines changed: 1 addition & 1 deletion
@@ -229,7 +229,7 @@ def test_low_memory_flag(self):
                _ = self.fpalgo(self.df, low_memory=True, verbose=1)

            # Only get the last value of the stream to reduce test noise
-            expect = 'Processing 4 combinations | Sampling itemset size 3\n'
+            expect = 'Processed 1 combinations | Sampling itemset size 3\n'
            out = out.getvalue().split('\r')[-1]
            assert out == expect
        else:
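
The expected count drops from 4 to 1 because the old message reported `combin.size` (itemsets × columns), while the new one counts processed candidates. The test keeps only the last `'\r'`-separated chunk since the progress line is overwritten in place with carriage returns. A toy reproduction of that capture pattern (using `redirect_stdout`, not the suite's actual helper):

```python
import io
from contextlib import redirect_stdout

buf = io.StringIO()
with redirect_stdout(buf):
    # Two in-place progress updates followed by a final newline.
    print('\rProcessed 3 combinations | Sampling itemset size 2', end="")
    print('\rProcessed 1 combinations | Sampling itemset size 3', end="")
    print()

last = buf.getvalue().split('\r')[-1]
assert last == 'Processed 1 combinations | Sampling itemset size 3\n'
```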
