Merge pull request #25 from rich-iannone/feat-col-count-match

rich-iannone · web-flow · commit 013e961a85ef · 2025-01-10T23:02:02.000-05:00
feat: add the `col_count_match()` validation method
diff --git a/docs/_quarto.yml b/docs/_quarto.yml
@@ -74,6 +74,7 @@ quartodoc:
         - name: Validate.rows_distinct
         - name: Validate.col_schema_match
         - name: Validate.row_count_match
+        - name: Validate.col_count_match
     - title: Column Selection
       desc: >
         A flexible way to select columns for validation is to use the `col()` function along with
diff --git a/pointblank/_constants.py b/pointblank/_constants.py
@@ -315,6 +315,21 @@
             <path d="M27.0931863,-2.98533065 C26.8836625,-2.98533065 25.712234,-2.93533065 25.712234,-1.65199731 L25.712234,37.681336 C25.712234,38.9646694 26.8836625,39.0146694 27.0931863,39.0146694 L29.6646149,39.0146694 L29.6646149,-2.98533065 L27.0931863,-2.98533065 Z M35.6721771,-2.98533065 L35.6721771,39.0146694 L31.7522908,39.0146694 L31.7522908,39.0146694 L31.7522908,-2.98533065 L31.7522908,-2.98533065 L35.6721771,-2.98533065 Z M40.3656149,-2.9849096 C40.6448786,-2.97823698 41.712234,-2.87699731 41.712234,-1.65199731 L41.712234,-1.65199731 L41.712234,37.681336 C41.712234,38.9646694 40.5408054,39.0146694 40.3312816,39.0146694 L40.3312816,39.0146694 L37.759853,39.0146694 L37.759853,-2.98533065 Z" id="rows_one" fill="#000000" fill-rule="nonzero" transform="translate(33.712234, 18.014669) rotate(-90.000000) translate(-33.712234, -18.014669) "></path>
         </g>
     </g>
+</svg>""",
+    "col_count_match": """<?xml version="1.0" encoding="UTF-8"?>
+<svg width="67px" height="67px" viewBox="0 0 67 67" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+    <title>col_count_match</title>
+    <g id="All-Icons" stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+        <g id="col_count_match" transform="translate(0.000000, 0.275862)">
+            <path d="M56.712234,1.01466935 C59.1975153,1.01466935 61.4475153,2.02202867 63.076195,3.65070832 C64.7048747,5.27938798 65.712234,7.52938798 65.712234,10.0146694 L65.712234,10.0146694 L65.712234,65.0146694 L10.712234,65.0146694 C8.22695259,65.0146694 5.97695259,64.00731 4.34827294,62.3786304 C2.71959328,60.7499507 1.71223397,58.4999507 1.71223397,56.0146694 L1.71223397,56.0146694 L1.71223397,10.0146694 C1.71223397,7.52938798 2.71959328,5.27938798 4.34827294,3.65070832 C5.97695259,2.02202867 8.22695259,1.01466935 10.712234,1.01466935 L10.712234,1.01466935 Z" id="rectangle" stroke="#000000" stroke-width="2" fill="#FFFFFF"></path>
+            <path d="M44.6352762,13.0146694 L44.6352762,54.0145886 L42.5357154,54.0142493 C42.4170822,54.0112174 42.1008219,53.9958922 41.8818489,53.7791481 C41.7566015,53.6551756 41.6828953,53.460322 41.6828953,53.181336 L41.6828953,53.181336 L41.6828953,13.8480027 C41.6828953,13.5651731 41.7589094,13.3688282 41.8870321,13.2451235 C42.126812,13.013612 42.4766654,13.0146694 42.5638476,13.0146694 L44.6352762,13.0146694 Z M50.6306006,13.0146694 L50.6306006,54.0146694 L47.7351899,54.0146694 L47.7351899,13.0146694 L50.6306006,13.0146694 Z M53.7305143,13.0147502 L55.8300088,13.0150894 C55.9440566,13.0180667 56.2374813,13.0341534 56.4560165,13.2242838 C56.5714828,13.3247421 56.6526612,13.4807072 56.6758389,13.7041707 L56.6758389,13.7041707 L56.6828953,53.181336 C56.6828953,53.460322 56.609189,53.6551756 56.4839416,53.7791481 C56.2649687,53.9958922 55.9487083,54.0112174 55.8300752,54.0142493 L55.8300752,54.0142493 L53.7305143,54.0145886 L53.7305143,13.0147502 Z" id="rows_two" stroke="#000000" fill-rule="nonzero" transform="translate(49.182895, 33.514669) rotate(-180.000000) translate(-49.182895, -33.514669) "></path>
+            <g id="vertical_equal" transform="translate(34.135195, 33.726903) rotate(-90.000000) translate(-34.135195, -33.726903) translate(30.635195, 29.226903)" stroke="#000000" stroke-linecap="square">
+                <line x1="2.21223397" y1="0.514669353" x2="2.21223397" y2="7.58573716" id="Line"></line>
+                <line x1="5.21223397" y1="0.514669353" x2="5.21223397" y2="7.58573716" id="Line"></line>
+            </g>
+            <path d="M11.5931863,12.5146694 C11.3836625,12.5146694 10.212234,12.5646694 10.212234,13.8480027 L10.212234,53.181336 C10.212234,54.4646694 11.3836625,54.5146694 11.5931863,54.5146694 L14.1646149,54.5146694 L14.1646149,12.5146694 L11.5931863,12.5146694 Z M20.1721771,12.5146694 L20.1721771,54.5146694 L16.2522908,54.5146694 L16.2522908,54.5146694 L16.2522908,12.5146694 L16.2522908,12.5146694 L20.1721771,12.5146694 Z M24.8656149,12.5150904 C25.1448786,12.521763 26.212234,12.6230027 26.212234,13.8480027 L26.212234,13.8480027 L26.212234,53.181336 C26.212234,54.4646694 25.0408054,54.5146694 24.8312816,54.5146694 L24.8312816,54.5146694 L22.259853,54.5146694 L22.259853,12.5146694 Z" id="rows_one" fill="#000000" fill-rule="nonzero" transform="translate(18.212234, 33.514669) rotate(-180.000000) translate(-18.212234, -33.514669) "></path>
+        </g>
+    </g>
 </svg>""",
 }
 
diff --git a/pointblank/_interrogation.py b/pointblank/_interrogation.py
@@ -1749,6 +1749,55 @@ def get_test_results(self):
         return self.test_unit_res
 
 
+@dataclass
+class ColCountMatch:
+    """
+    Check whether the column count of a table matches (or does not match) a fixed value.
+
+    Parameters
+    ----------
+    data_tbl
+        A data table.
+    count
+        The fixed column count to check against.
+    inverse
+        `True` to check if the column count does not match the fixed value, `False` otherwise.
+    threshold
+        The maximum number of failing test units to allow (stored; not used by the comparison).
+    tbl_type
+        The type of table to use for the assertion (stored; not used by the comparison).
+
+    Returns
+    -------
+    bool
+        From `get_test_results()`: `True` when the column count equals `count` (or differs from
+        it when `inverse=True`), `False` otherwise.
+    """
+
+    data_tbl: FrameT
+    count: int
+    inverse: bool
+    threshold: int
+    tbl_type: str = "local"
+
+    def __post_init__(self):
+
+        from pointblank.preview import get_column_count
+
+        # Count the columns once, then apply the comparison selected by `inverse`. The outcome
+        # is only stored on the instance; nothing is written to stdout.
+        n_columns = get_column_count(data=self.data_tbl)
+        if self.inverse:
+            res = n_columns != self.count
+        else:
+            res = n_columns == self.count
+
+        self.test_unit_res = res
+
+    def get_test_results(self):
+        return self.test_unit_res
+
+
 @dataclass
 class NumberOfTestUnits:
     """
diff --git a/pointblank/validate.py b/pointblank/validate.py
@@ -29,7 +29,7 @@
     SVG_ICONS_FOR_TBL_STATUS,
 )
 from pointblank.column import Column, col, ColumnSelector
-from pointblank.preview import get_row_count
+from pointblank.preview import get_column_count, get_row_count
 from pointblank.schema import Schema
 from pointblank._interrogation import (
     ColValsCompareOne,
@@ -39,6 +39,7 @@
     ColExistsHasType,
     ColSchemaMatch,
     RowCountMatch,
+    ColCountMatch,
     NumberOfTestUnits,
     RowsDistinct,
 )
@@ -2883,9 +2884,9 @@ def row_count_match(
         Parameters
         ----------
         count
-            The expected row count of the table. This can be an integer value, a Polars DataFrame
-            object, or an Ibis backend table. If a DataFrame/table is provided, the row count of the
-            DataFrame will be used as the expected count.
+            The expected row count of the table. This can be an integer value, a Polars or Pandas
+            DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the row
+            count of that object will be used as the expected count.
         inverse
             Should the validation step be inverted? If `True`, then the expectation is that the row
             count of the target table should not match the specified `count=` value.
@@ -2976,6 +2977,121 @@ def row_count_match(
 
         return self
 
+    def col_count_match(
+        self,
+        count: int | FrameT | Any,
+        inverse: bool = False,
+        pre: Callable | None = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        active: bool = True,
+    ) -> Validate:
+        """
+        Validate whether the column count of the table matches a specified count.
+
+        The `col_count_match()` method checks whether the column count of the target table matches a
+        specified count. This validation will operate over a single test unit, which is whether the
+        column count matches the specified count.
+
+        We also have the option to invert the validation step by setting `inverse=True`. This will
+        make the expectation that the column count of the target table *does not* match the
+        specified count.
+
+        Parameters
+        ----------
+        count
+            The expected column count of the table. This can be an integer value, a Polars or Pandas
+            DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the column
+            count of that object will be used as the expected count.
+        inverse
+            Should the validation step be inverted? If `True`, then the expectation is that the
+            column count of the target table should not match the specified `count=` value.
+        pre
+            A pre-processing function or lambda to apply to the data table for the validation step.
+        thresholds
+            Failure threshold levels so that the validation step can react accordingly when
+            exceeding the set levels for different states (`warn`, `stop`, and `notify`). This can
+            be created simply as an integer or float denoting the absolute number or fraction of
+            failing test units for the 'warn' level. Otherwise, you can use a tuple of 1-3 values,
+            a dictionary of 1-3 entries, or a Thresholds object.
+        active
+            A boolean value indicating whether the validation step should be active. Using `False`
+            will make the validation step inactive (still reporting its presence and keeping indexes
+            for the steps unchanged).
+
+        Returns
+        -------
+        Validate
+            The `Validate` object with the added validation step.
+
+        Examples
+        --------
+        ```{python}
+        #| echo: false
+        #| output: false
+        import pointblank as pb
+        pb.config(report_incl_header=False, report_incl_footer=False)
+        ```
+
+        For the examples here, we'll use the built-in dataset `"game_revenue"`. The table can be
+        obtained by calling `load_dataset("game_revenue")`.
+
+        ```{python}
+        import pointblank as pb
+
+        game_revenue = pb.load_dataset("game_revenue")
+
+        game_revenue
+        ```
+
+        Let's validate that the number of columns in the table matches a fixed value. In this case,
+        we will use the value `11` as the expected column count.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=game_revenue)
+            .col_count_match(count=11)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        The validation table shows that the expectation value of `11` matches the actual count of
+        columns in the target table. So, the single test unit passed.
+        """
+
+        assertion_type = _get_fn_name()
+
+        _check_pre(pre=pre)
+        _check_thresholds(thresholds=thresholds)
+        _check_boolean_input(param=active, param_name="active")
+        _check_boolean_input(param=inverse, param_name="inverse")
+
+        # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+        thresholds = (
+            self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+        )
+
+        # If `count` is a DataFrame or an Ibis table (detected by its type name), use the column
+        # count of that object as the expected count
+        if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str(type(count)):
+            count = get_column_count(count)
+
+        # Package up the `count=` and boolean params into a dictionary for later interrogation
+        values = {"count": count, "inverse": inverse}
+
+        val_info = _ValidationInfo(
+            assertion_type=assertion_type,
+            values=values,
+            pre=pre,
+            thresholds=thresholds,
+            active=active,
+        )
+
+        self._add_validation(validation_info=val_info)
+
+        return self
+
     def interrogate(
         self,
         collect_extracts: bool = True,
@@ -3300,10 +3416,28 @@ def interrogate(
 
                 results_tbl = None
 
+            if assertion_category == "COL_COUNT_MATCH":
+
+                result_bool = ColCountMatch(
+                    data_tbl=data_tbl_step,
+                    count=value["count"],
+                    inverse=value["inverse"],
+                    threshold=threshold,
+                    tbl_type=tbl_type,
+                ).get_test_results()
+
+                validation.all_passed = result_bool
+                validation.n = 1
+                validation.n_passed = int(result_bool)
+                validation.n_failed = 1 - result_bool
+
+                results_tbl = None
+
             if assertion_category not in [
                 "COL_EXISTS_HAS_TYPE",
                 "COL_SCHEMA_MATCH",
                 "ROW_COUNT_MATCH",
+                "COL_COUNT_MATCH",
             ]:
 
                 # Extract the `pb_is_good_` column from the table as a results list
@@ -4793,7 +4927,7 @@ def get_tabular_report(
         # Iterate over the values in the `column` entry
         for i, column in enumerate(columns):
 
-            if assertion_type[i] in ["col_schema_match", "row_count_match"]:
+            if assertion_type[i] in ["col_schema_match", "row_count_match", "col_count_match"]:
                 columns_upd.append("&mdash;")
             else:
                 columns_upd.append(column)
@@ -4850,7 +4984,7 @@ def get_tabular_report(
             elif assertion_type[i] in ["col_schema_match"]:
                 values_upd.append("SCHEMA")
 
-            elif assertion_type[i] in ["row_count_match"]:
+            elif assertion_type[i] in ["row_count_match", "col_count_match"]:
 
                 count = values[i]["count"]
                 inverse = values[i]["inverse"]
diff --git a/tests/snapshots/test_validate/test_comprehensive_validation_report_html_snap/comprehensive_validation_report.html b/tests/snapshots/test_validate/test_comprehensive_validation_report_html_snap/comprehensive_validation_report.html
diff --git a/tests/test_validate.py b/tests/test_validate.py