Merge pull request #130 from posit-dev/fix-set-limit-extracts

rich-iannone · web-flow · commit ee5afeb64ea0 · 2025-03-28T17:10:20.000-04:00
fix: set limit on extracts no matter which scheme was used for collection
diff --git a/pointblank/data/api-docs.txt b/pointblank/data/api-docs.txt
@@ -60,7 +60,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
         [`Thresholds`](`pointblank.Thresholds`) object.
     actions
         The actions to take when validation steps meet or exceed any set threshold levels. This
-        should be provided in the form of an `Actions` object. If `None` then no default actions
+        should be provided in the form of an `Actions` object. If `None` then no global actions
         will be set.
     brief
         A global setting for briefs, which are optional brief descriptions for validation steps
@@ -104,7 +104,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 
     Examples
     --------
-    ## Creating a validation plan and interrogating
+    ### Creating a validation plan and interrogating
 
     Let's walk through a data quality analysis of an extremely small table. It's actually called
     `"small_table"` and it's accessible through the [`load_dataset()`](`pointblank.load_dataset`)
@@ -170,11 +170,72 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
     [`get_tabular_report()`](`pointblank.Validate.get_tabular_report`) method, which contains
     options for modifying the display of the table.
 
-    Furthermore, post-interrogation methods such as
-    [`get_step_report()`](`pointblank.Validate.get_step_report`),
-    [`get_data_extracts()`](`pointblank.Validate.get_data_extracts`), and
-    [`get_sundered_data()`](`pointblank.Validate.get_sundered_data`) allow you to generate
-    additional reporting or extract useful data for downstream analysis from a `Validate` object.
+    ### Adding briefs
+
+    Briefs are short descriptions of the validation steps. While they can be set for each step
+    individually, they can also be set globally. The global setting is done by using the
+    `brief=` argument in `Validate`. The global setting can be as simple as `True` to have
+    automatically-generated briefs for each step. Alternatively, we can use templating elements
+    like `"{step}"` (to insert the step number) or `"{auto}"` (to include an automatically generated
+    brief). Here's an example of a global setting for briefs:
+
+    ```python
+    validation = (
+        pb.Validate(
+            data=pb.load_dataset(),
+            tbl_name="small_table",
+            label="Validation example with briefs",
+            brief="Step {step}: {auto}",
+        )
+        .col_vals_gt(columns="d", value=100)
+        .col_vals_between(columns="c", left=3, right=10, na_pass=True)
+        .col_vals_regex(
+            columns="b",
+            pattern=r"[0-9]-[a-z]{3}-[0-9]{3}",
+            brief="Regex check for column {col}"
+        )
+        .interrogate()
+    )
+
+    validation
+    ```
+
+    We see the text of the briefs appear in the `STEP` column of the reporting table. Furthermore,
+    the global brief's template (`"Step {step}: {auto}"`) is applied to all steps except for the
+    final step, where the step-level `brief=` argument provided an override.
+
+    If you should want to cancel the globally-defined brief for one or more validation steps, you
+    can set `brief=False` in those particular steps.
+
+    ### Post-interrogation methods
+
+    The `Validate` class has a number of post-interrogation methods that can be used to extract
+    useful information from the validation results. For example, the
+    [`get_data_extracts()`](`pointblank.Validate.get_data_extracts`) method can be used to get
+    the data extracts for each validation step.
+
+    ```python
+    validation.get_data_extracts()
+    ```
+
+    We can also view step reports for each validation step using the
+    [`get_step_report()`](`pointblank.Validate.get_step_report`) method. This method adapts to the
+    type of validation step and shows the relevant information for a step's validation.
+
+    ```python
+    validation.get_step_report(i=2)
+    ```
+
+    The `Validate` class also has a method for getting the sundered data, which is the data that
+    passed or failed the validation steps. This can be done using the
+    [`get_sundered_data()`](`pointblank.Validate.get_sundered_data`) method.
+
+    ```python
+    pb.preview(validation.get_sundered_data())
+    ```
+
+    The sundered data is a DataFrame that contains the rows that passed or failed the validation.
+    The default behavior is to return the rows that passed the validation, as shown above.
 
 
 Thresholds(warning: 'int | float | bool | None' = None, error: 'int | float | bool | None' = None, critical: 'int | float | bool | None' = None) -> None
@@ -4169,7 +4230,7 @@ validation steps, (3) `interrogate()`. After interrogation of the data, we can v
 report table (by printing the object or using `get_tabular_report()`), extract key metrics, or we
 can split the data based on the validation results (with `get_sundered_data()`).
 
-interrogate(self, collect_extracts: 'bool' = True, collect_tbl_checked: 'bool' = True, get_first_n: 'int | None' = None, sample_n: 'int | None' = None, sample_frac: 'int | float | None' = None, sample_limit: 'int' = 5000) -> 'Validate'
+interrogate(self, collect_extracts: 'bool' = True, collect_tbl_checked: 'bool' = True, get_first_n: 'int | None' = None, sample_n: 'int | None' = None, sample_frac: 'int | float | None' = None, extract_limit: 'int' = 500) -> 'Validate'
 
         Execute each validation step against the table and store the results.
 
@@ -4179,8 +4240,8 @@ interrogate(self, collect_extracts: 'bool' = True, collect_tbl_checked: 'bool' =
 
         The interrogation process will collect extracts of failing rows if the `collect_extracts=`
         option is set to `True` (the default). We can control the number of rows collected using the
-        `get_first_n=`, `sample_n=`, and `sample_frac=` options. The `sample_limit=` option will
-        enforce a hard limit on the number of rows collected when using the `sample_frac=` option.
+        `get_first_n=`, `sample_n=`, and `sample_frac=` options. The `extract_limit=` option will
+        enforce a hard limit on the number of rows collected when `collect_extracts=True`.
 
         After interrogation is complete, the `Validate` object will have gathered information, and
         we can use methods like [`n_passed()`](`pointblank.Validate.n_passed`),
@@ -4199,9 +4260,9 @@ interrogate(self, collect_extracts: 'bool' = True, collect_tbl_checked: 'bool' =
             The processed data frames produced by executing the validation steps are collected and
             stored in the `Validate` object if `collect_tbl_checked=True`. This information is
             necessary for some methods (e.g.,
-            [`get_sundered_data()`](`pointblank.Validate.get_sundered_data`)), but it potentially
-            makes the object grow to a large size. To opt out of attaching this data, set this
-            argument to `False`.
+            [`get_sundered_data()`](`pointblank.Validate.get_sundered_data`)), but it can
+            potentially make the object grow to a large size. To opt out of attaching this data, set
+            this to `False`.
         get_first_n
             If the option to collect non-passing rows is chosen, there is the option here to
             collect the first `n` rows. Supply an integer number of rows to extract from the top of
@@ -4215,11 +4276,15 @@ interrogate(self, collect_extracts: 'bool' = True, collect_tbl_checked: 'bool' =
         sample_frac
             If the option to collect non-passing rows is chosen, this option allows for the sampling
             of a fraction of those rows. Provide a number in the range of `0` to `1`. The number of
-            rows to return could be very large, however, the `sample_limit=` option will apply a
+            rows to return could be very large, however, the `extract_limit=` option will apply a
             hard limit to the returned rows.
-        sample_limit
-            A value that limits the possible number of rows returned when sampling non-passing rows
-            using the `sample_frac=` option.
+        extract_limit
+            A value that limits the possible number of rows returned when extracting non-passing
+            rows. The default is `500` rows. This limit is applied after any sampling or limiting
+            options are applied. If the number of rows to be returned is greater than this limit,
+            then the number of rows returned will be limited to this value. This is useful for
+            preventing the collection of too many rows when the number of non-passing rows is very
+            large.
 
         Returns
         -------
diff --git a/pointblank/validate.py b/pointblank/validate.py
@@ -5147,7 +5147,7 @@ def interrogate(
         get_first_n: int | None = None,
         sample_n: int | None = None,
         sample_frac: int | float | None = None,
-        sample_limit: int = 5000,
+        extract_limit: int = 500,
     ) -> Validate:
         """
         Execute each validation step against the table and store the results.
@@ -5158,8 +5158,8 @@ def interrogate(
 
         The interrogation process will collect extracts of failing rows if the `collect_extracts=`
         option is set to `True` (the default). We can control the number of rows collected using the
-        `get_first_n=`, `sample_n=`, and `sample_frac=` options. The `sample_limit=` option will
-        enforce a hard limit on the number of rows collected when using the `sample_frac=` option.
+        `get_first_n=`, `sample_n=`, and `sample_frac=` options. The `extract_limit=` option will
+        enforce a hard limit on the number of rows collected when `collect_extracts=True`.
 
         After interrogation is complete, the `Validate` object will have gathered information, and
         we can use methods like [`n_passed()`](`pointblank.Validate.n_passed`),
@@ -5178,9 +5178,9 @@ def interrogate(
             The processed data frames produced by executing the validation steps are collected and
             stored in the `Validate` object if `collect_tbl_checked=True`. This information is
             necessary for some methods (e.g.,
-            [`get_sundered_data()`](`pointblank.Validate.get_sundered_data`)), but it potentially
-            makes the object grow to a large size. To opt out of attaching this data, set this
-            argument to `False`.
+            [`get_sundered_data()`](`pointblank.Validate.get_sundered_data`)), but it can
+            potentially make the object grow to a large size. To opt out of attaching this data, set
+            this to `False`.
         get_first_n
             If the option to collect non-passing rows is chosen, there is the option here to
             collect the first `n` rows. Supply an integer number of rows to extract from the top of
@@ -5194,11 +5194,15 @@ def interrogate(
         sample_frac
             If the option to collect non-passing rows is chosen, this option allows for the sampling
             of a fraction of those rows. Provide a number in the range of `0` to `1`. The number of
-            rows to return could be very large, however, the `sample_limit=` option will apply a
+            rows to return could be very large, however, the `extract_limit=` option will apply a
             hard limit to the returned rows.
-        sample_limit
-            A value that limits the possible number of rows returned when sampling non-passing rows
-            using the `sample_frac=` option.
+        extract_limit
+            A value that limits the possible number of rows returned when extracting non-passing
+            rows. The default is `500` rows. This limit is applied after any sampling or limiting
+            options are applied. If the number of rows to be returned is greater than this limit,
+            then the number of rows returned will be limited to this value. This is useful for
+            preventing the collection of too many rows when the number of non-passing rows is very
+            large.
 
         Returns
         -------
@@ -5708,9 +5712,9 @@ def interrogate(
                 elif sample_frac is not None:
                     validation_extract_nw = validation_extract_nw.sample(fraction=sample_frac)
 
-                    # Ensure a limit is set on the number of rows to extract
-                    if len(validation_extract_nw) > sample_limit:
-                        validation_extract_nw = validation_extract_nw.head(sample_limit)
+                # Ensure a limit is set on the number of rows to extract
+                if len(validation_extract_nw) > extract_limit:
+                    validation_extract_nw = validation_extract_nw.head(extract_limit)
 
                 validation.extract = nw.to_native(validation_extract_nw)
 
diff --git a/tests/test_validate.py b/tests/test_validate.py
@@ -5057,6 +5057,55 @@ def test_interrogate_sample_n(request, tbl_fixture):
         assert len(nw.from_native(validation.get_data_extracts(i=1, frame=True)).columns) == 4
 
 
+def test_interrogate_sample_n_limit():
+    game_revenue = load_dataset(dataset="game_revenue", tbl_type="polars")
+
+    validation_default_limit = (
+        Validate(game_revenue).col_vals_gt(columns="item_revenue", value=10000).interrogate()
+    )
+
+    assert (
+        len(nw.from_native(validation_default_limit.get_data_extracts(i=1, frame=True)).rows())
+        == 500
+    )
+
+    validation_set_n_limit = (
+        Validate(game_revenue)
+        .col_vals_gt(columns="item_revenue", value=10000)
+        .interrogate(get_first_n=10)
+    )
+
+    assert (
+        len(nw.from_native(validation_set_n_limit.get_data_extracts(i=1, frame=True)).rows()) == 10
+    )
+
+    validation_set_n_no_limit_break = (
+        Validate(game_revenue)
+        .col_vals_gt(columns="item_revenue", value=10000)
+        .interrogate(get_first_n=750)
+    )
+
+    assert (
+        len(
+            nw.from_native(
+                validation_set_n_no_limit_break.get_data_extracts(i=1, frame=True)
+            ).rows()
+        )
+        == 500
+    )
+
+    validation_set_n_adj_limit = (
+        Validate(game_revenue)
+        .col_vals_gt(columns="item_revenue", value=10000)
+        .interrogate(get_first_n=750, extract_limit=1000)
+    )
+
+    assert (
+        len(nw.from_native(validation_set_n_adj_limit.get_data_extracts(i=1, frame=True)).rows())
+        == 750
+    )
+
+
 @pytest.mark.parametrize(
     "tbl_fixture, sample_frac, expected",
     [
@@ -5096,7 +5145,7 @@ def test_interrogate_sample_frac_with_sample_limit(request, tbl_fixture):
     validation = (
         Validate(tbl)
         .col_vals_regex(columns="text", pattern=r"^[a-z]{3}")
-        .interrogate(sample_frac=0.8, sample_limit=1)
+        .interrogate(sample_frac=0.8, extract_limit=1)
     )
 
     # Expect that the extracts table has 2 entries out of 3 failures