feat: add thresh param for DataFrame.dropna (#1885)

shuoweil · web-flow · commit 1395a502ffa0 · 2025-07-09T17:11:48.000-05:00
* support thresh in dropna

* update docstring, and polish function

* fix mypy
diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py
@@ -522,7 +522,8 @@ def rank(
 def dropna(
     block: blocks.Block,
     column_ids: typing.Sequence[str],
-    how: typing.Literal["all", "any"] = "any",
+    how: str = "any",
+    thresh: typing.Optional[int] = None,
     subset: Optional[typing.Sequence[str]] = None,
 ):
     """
@@ -531,17 +532,38 @@ def dropna(
     if subset is None:
         subset = column_ids
 
+    # Predicates to check for non-null values in the subset of columns
     predicates = [
         ops.notnull_op.as_expr(column_id)
         for column_id in column_ids
         if column_id in subset
     ]
+
     if len(predicates) == 0:
         return block
-    if how == "any":
-        predicate = functools.reduce(ops.and_op.as_expr, predicates)
-    else:  # "all"
-        predicate = functools.reduce(ops.or_op.as_expr, predicates)
+
+    if thresh is not None:
+        # Handle single predicate case
+        if len(predicates) == 1:
+            count_expr = ops.AsTypeOp(pd.Int64Dtype()).as_expr(predicates[0])
+        else:
+            # Sum the boolean expressions to count non-null values
+            count_expr = functools.reduce(
+                lambda a, b: ops.add_op.as_expr(
+                    ops.AsTypeOp(pd.Int64Dtype()).as_expr(a),
+                    ops.AsTypeOp(pd.Int64Dtype()).as_expr(b),
+                ),
+                predicates,
+            )
+        # Filter rows where count >= thresh
+        predicate = ops.ge_op.as_expr(count_expr, ex.const(thresh))
+    else:
+        # Only handle 'how' parameter when thresh is not specified
+        if how == "any":
+            predicate = functools.reduce(ops.and_op.as_expr, predicates)
+        else:  # "all"
+            predicate = functools.reduce(ops.or_op.as_expr, predicates)
+
     return block.filter(predicate)
 
 
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -2802,6 +2802,7 @@ def dropna(
         *,
         axis: int | str = 0,
         how: str = "any",
+        thresh: typing.Optional[int] = None,
         subset: typing.Union[None, blocks.Label, Sequence[blocks.Label]] = None,
         inplace: bool = False,
         ignore_index=False,
@@ -2810,8 +2811,18 @@ def dropna(
             raise NotImplementedError(
                 f"'inplace'=True not supported. {constants.FEEDBACK_LINK}"
             )
-        if how not in ("any", "all"):
-            raise ValueError("'how' must be one of 'any', 'all'")
+
+        # Reject thresh combined with a non-default how; how="any" is the default.
+        if thresh is not None:
+            # cannot specify both thresh and how parameters
+            if how != "any":
+                raise TypeError(
+                    "You cannot set both the how and thresh arguments at the same time."
+                )
+        else:
+            # Only validate 'how' when thresh is not provided
+            if how not in ("any", "all"):
+                raise ValueError("'how' must be one of 'any', 'all'")
 
         axis_n = utils.get_axis_number(axis)
 
@@ -2833,21 +2844,38 @@ def dropna(
                     for id_ in self._block.label_to_col_id[label]
                 ]
 
-            result = block_ops.dropna(self._block, self._block.value_columns, how=how, subset=subset_ids)  # type: ignore
+            result = block_ops.dropna(
+                self._block,
+                self._block.value_columns,
+                how=how,
+                thresh=thresh,
+                subset=subset_ids,
+            )  # type: ignore
             if ignore_index:
                 result = result.reset_index()
             return DataFrame(result)
         else:
-            isnull_block = self._block.multi_apply_unary_op(ops.isnull_op)
-            if how == "any":
-                null_locations = DataFrame(isnull_block).any().to_pandas()
-            else:  # 'all'
-                null_locations = DataFrame(isnull_block).all().to_pandas()
-            keep_columns = [
-                col
-                for col, to_drop in zip(self._block.value_columns, null_locations)
-                if not to_drop
-            ]
+            if thresh is not None:
+                # Keep columns with at least 'thresh' non-null values
+                notnull_block = self._block.multi_apply_unary_op(ops.notnull_op)
+                notnull_counts = DataFrame(notnull_block).sum().to_pandas()
+
+                keep_columns = [
+                    col
+                    for col, count in zip(self._block.value_columns, notnull_counts)
+                    if count >= thresh
+                ]
+            else:
+                isnull_block = self._block.multi_apply_unary_op(ops.isnull_op)
+                if how == "any":
+                    null_locations = DataFrame(isnull_block).any().to_pandas()
+                else:  # 'all'
+                    null_locations = DataFrame(isnull_block).all().to_pandas()
+                keep_columns = [
+                    col
+                    for col, to_drop in zip(self._block.value_columns, null_locations)
+                    if not to_drop
+                ]
             return DataFrame(self._block.select_columns(keep_columns))
 
     def any(
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -1207,7 +1207,7 @@ def test_assign_callable_lambda(scalars_dfs):
         (1, "all", False, None),
     ],
 )
-def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset):
+def test_df_dropna_by_how(scalars_dfs, axis, how, ignore_index, subset):
     # TODO: supply a reason why this isn't compatible with pandas 1.x
     pytest.importorskip("pandas", minversion="2.0.0")
     scalars_df, scalars_pandas_df = scalars_dfs
@@ -1222,6 +1222,36 @@ def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset):
     pandas.testing.assert_frame_equal(bf_result, pd_result)
 
 
+@pytest.mark.parametrize(
+    ("axis", "ignore_index", "subset", "thresh"),
+    [
+        (0, False, None, 2),
+        (0, True, None, 3),
+        (1, False, None, 2),
+    ],
+)
+def test_df_dropna_by_thresh(scalars_dfs, axis, ignore_index, subset, thresh):
+    """
+    Tests that dropna correctly keeps rows/columns with a minimum number
+    of non-null values.
+    """
+    # TODO: supply a reason why this isn't compatible with pandas 1.x
+    pytest.importorskip("pandas", minversion="2.0.0")
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    df_result = scalars_df.dropna(
+        axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset
+    )
+    pd_result = scalars_pandas_df.dropna(
+        axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset
+    )
+
+    bf_result = df_result.to_pandas()
+    # Pandas uses int64 instead of Int64 (nullable) dtype.
+    pd_result.index = pd_result.index.astype(pd.Int64Dtype())
+    pd.testing.assert_frame_equal(bf_result, pd_result)
+
+
 def test_df_dropna_range_columns(scalars_dfs):
     # TODO: supply a reason why this isn't compatible with pandas 1.x
     pytest.importorskip("pandas", minversion="2.0.0")
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -1762,6 +1762,7 @@ def dropna(
         *,
         axis: int | str = 0,
         how: str = "any",
+        thresh: Optional[int] = None,
         subset=None,
         inplace: bool = False,
         ignore_index=False,
@@ -1812,6 +1813,25 @@ def dropna(
             <BLANKLINE>
             [3 rows x 3 columns]
 
+        Keep rows with at least 2 non-null values.
+
+            >>> df.dropna(thresh=2)
+                   name        toy        born
+            1    Batman  Batmobile  1940-04-25
+            2  Catwoman   Bullwhip        <NA>
+            <BLANKLINE>
+            [2 rows x 3 columns]
+
+        Keep columns with at least 2 non-null values:
+
+            >>> df.dropna(axis='columns', thresh=2)
+                   name        toy
+            0    Alfred       <NA>
+            1    Batman  Batmobile
+            2  Catwoman   Bullwhip
+            <BLANKLINE>
+            [3 rows x 2 columns]
+
         Define in which columns to look for missing values.
 
             >>> df.dropna(subset=['name', 'toy'])
@@ -1822,7 +1842,7 @@ def dropna(
             [2 rows x 3 columns]
 
         Args:
-            axis ({0 or 'index', 1 or 'columns'}, default 'columns'):
+            axis ({0 or 'index', 1 or 'columns'}, default 0):
                 Determine if rows or columns which contain missing values are
                 removed.
 
@@ -1834,6 +1854,8 @@ def dropna(
 
                 * 'any' : If any NA values are present, drop that row or column.
                 * 'all' : If all values are NA, drop that row or column.
+            thresh (int, optional):
+                Require that many non-NA values. Cannot be combined with ``how``.
             subset (column label or sequence of labels, optional):
                 Labels along other axis to consider, e.g. if you are dropping
                 rows these would be a list of columns to include.
@@ -1851,6 +1873,8 @@ def dropna(
         Raises:
             ValueError:
                 If ``how`` is not one of ``any`` or ``all``.
+            TypeError:
+                If both ``how`` and ``thresh`` are specified.
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)