Skip to content

Commit 013e961

Browse files
authored
Merge pull request #25 from rich-iannone/feat-col-count-match
feat: add the `col_count_match()` validation method
2 parents 0654258 + fb10431 commit 013e961

File tree

6 files changed

+311
-6
lines changed

6 files changed

+311
-6
lines changed

docs/_quarto.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ quartodoc:
7474
- name: Validate.rows_distinct
7575
- name: Validate.col_schema_match
7676
- name: Validate.row_count_match
77+
- name: Validate.col_count_match
7778
- title: Column Selection
7879
desc: >
7980
A flexible way to select columns for validation is to use the `col()` function along with

pointblank/_constants.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,21 @@
315315
<path d="M27.0931863,-2.98533065 C26.8836625,-2.98533065 25.712234,-2.93533065 25.712234,-1.65199731 L25.712234,37.681336 C25.712234,38.9646694 26.8836625,39.0146694 27.0931863,39.0146694 L29.6646149,39.0146694 L29.6646149,-2.98533065 L27.0931863,-2.98533065 Z M35.6721771,-2.98533065 L35.6721771,39.0146694 L31.7522908,39.0146694 L31.7522908,39.0146694 L31.7522908,-2.98533065 L31.7522908,-2.98533065 L35.6721771,-2.98533065 Z M40.3656149,-2.9849096 C40.6448786,-2.97823698 41.712234,-2.87699731 41.712234,-1.65199731 L41.712234,-1.65199731 L41.712234,37.681336 C41.712234,38.9646694 40.5408054,39.0146694 40.3312816,39.0146694 L40.3312816,39.0146694 L37.759853,39.0146694 L37.759853,-2.98533065 Z" id="rows_one" fill="#000000" fill-rule="nonzero" transform="translate(33.712234, 18.014669) rotate(-90.000000) translate(-33.712234, -18.014669) "></path>
316316
</g>
317317
</g>
318+
</svg>""",
319+
"col_count_match": """<?xml version="1.0" encoding="UTF-8"?>
320+
<svg width="67px" height="67px" viewBox="0 0 67 67" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
321+
<title>col_count_match</title>
322+
<g id="All-Icons" stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
323+
<g id="col_count_match" transform="translate(0.000000, 0.275862)">
324+
<path d="M56.712234,1.01466935 C59.1975153,1.01466935 61.4475153,2.02202867 63.076195,3.65070832 C64.7048747,5.27938798 65.712234,7.52938798 65.712234,10.0146694 L65.712234,10.0146694 L65.712234,65.0146694 L10.712234,65.0146694 C8.22695259,65.0146694 5.97695259,64.00731 4.34827294,62.3786304 C2.71959328,60.7499507 1.71223397,58.4999507 1.71223397,56.0146694 L1.71223397,56.0146694 L1.71223397,10.0146694 C1.71223397,7.52938798 2.71959328,5.27938798 4.34827294,3.65070832 C5.97695259,2.02202867 8.22695259,1.01466935 10.712234,1.01466935 L10.712234,1.01466935 Z" id="rectangle" stroke="#000000" stroke-width="2" fill="#FFFFFF"></path>
325+
<path d="M44.6352762,13.0146694 L44.6352762,54.0145886 L42.5357154,54.0142493 C42.4170822,54.0112174 42.1008219,53.9958922 41.8818489,53.7791481 C41.7566015,53.6551756 41.6828953,53.460322 41.6828953,53.181336 L41.6828953,53.181336 L41.6828953,13.8480027 C41.6828953,13.5651731 41.7589094,13.3688282 41.8870321,13.2451235 C42.126812,13.013612 42.4766654,13.0146694 42.5638476,13.0146694 L44.6352762,13.0146694 Z M50.6306006,13.0146694 L50.6306006,54.0146694 L47.7351899,54.0146694 L47.7351899,13.0146694 L50.6306006,13.0146694 Z M53.7305143,13.0147502 L55.8300088,13.0150894 C55.9440566,13.0180667 56.2374813,13.0341534 56.4560165,13.2242838 C56.5714828,13.3247421 56.6526612,13.4807072 56.6758389,13.7041707 L56.6758389,13.7041707 L56.6828953,53.181336 C56.6828953,53.460322 56.609189,53.6551756 56.4839416,53.7791481 C56.2649687,53.9958922 55.9487083,54.0112174 55.8300752,54.0142493 L55.8300752,54.0142493 L53.7305143,54.0145886 L53.7305143,13.0147502 Z" id="rows_two" stroke="#000000" fill-rule="nonzero" transform="translate(49.182895, 33.514669) rotate(-180.000000) translate(-49.182895, -33.514669) "></path>
326+
<g id="vertical_equal" transform="translate(34.135195, 33.726903) rotate(-90.000000) translate(-34.135195, -33.726903) translate(30.635195, 29.226903)" stroke="#000000" stroke-linecap="square">
327+
<line x1="2.21223397" y1="0.514669353" x2="2.21223397" y2="7.58573716" id="Line"></line>
328+
<line x1="5.21223397" y1="0.514669353" x2="5.21223397" y2="7.58573716" id="Line"></line>
329+
</g>
330+
<path d="M11.5931863,12.5146694 C11.3836625,12.5146694 10.212234,12.5646694 10.212234,13.8480027 L10.212234,53.181336 C10.212234,54.4646694 11.3836625,54.5146694 11.5931863,54.5146694 L14.1646149,54.5146694 L14.1646149,12.5146694 L11.5931863,12.5146694 Z M20.1721771,12.5146694 L20.1721771,54.5146694 L16.2522908,54.5146694 L16.2522908,54.5146694 L16.2522908,12.5146694 L16.2522908,12.5146694 L20.1721771,12.5146694 Z M24.8656149,12.5150904 C25.1448786,12.521763 26.212234,12.6230027 26.212234,13.8480027 L26.212234,13.8480027 L26.212234,53.181336 C26.212234,54.4646694 25.0408054,54.5146694 24.8312816,54.5146694 L24.8312816,54.5146694 L22.259853,54.5146694 L22.259853,12.5146694 Z" id="rows_one" fill="#000000" fill-rule="nonzero" transform="translate(18.212234, 33.514669) rotate(-180.000000) translate(-18.212234, -33.514669) "></path>
331+
</g>
332+
</g>
318333
</svg>""",
319334
}
320335

pointblank/_interrogation.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1749,6 +1749,55 @@ def get_test_results(self):
17491749
return self.test_unit_res
17501750

17511751

1752+
@dataclass
class ColCountMatch:
    """
    Check whether the column count of a table matches (or does not match) a fixed value.

    The comparison is performed once, when the object is created, and its outcome is stored
    for later retrieval with `get_test_results()`.

    Parameters
    ----------
    data_tbl
        A data table.
    count
        The fixed column count to check against.
    inverse
        `True` to check if the column count does not match the fixed value, `False` otherwise.
    threshold
        The maximum number of failing test units to allow. Stored on the instance; it is not
        consulted by the comparison performed in this class.
    tbl_type
        The type of table to use for the assertion. Stored on the instance; it is not
        consulted by the comparison performed in this class.

    Returns
    -------
    bool
        Via `get_test_results()`: with `inverse=False`, `True` when the table's column count
        equals `count`; with `inverse=True`, `True` when the column count differs from `count`.
    """

    data_tbl: FrameT
    count: int
    inverse: bool
    threshold: int
    tbl_type: str = "local"

    def __post_init__(self):

        # Imported at call time rather than at module level
        # NOTE(review): presumably this avoids a circular import with `pointblank.preview`
        # -- confirm before hoisting it to the top of the file
        from pointblank.preview import get_column_count

        # Obtain the column count of the target table a single time; both branches need it
        n_columns = get_column_count(data=self.data_tbl)

        if not self.inverse:
            # Regular check: the single test unit passes when the counts are equal
            res = n_columns == self.count
        else:
            # Inverted check: the single test unit passes when the counts differ
            res = n_columns != self.count

        # No debugging output is emitted here; the result is only stored on the instance
        self.test_unit_res = res

    def get_test_results(self):
        """Return the boolean outcome of the column count comparison."""
        return self.test_unit_res
1799+
1800+
17521801
@dataclass
17531802
class NumberOfTestUnits:
17541803
"""

pointblank/validate.py

Lines changed: 140 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
SVG_ICONS_FOR_TBL_STATUS,
3030
)
3131
from pointblank.column import Column, col, ColumnSelector
32-
from pointblank.preview import get_row_count
32+
from pointblank.preview import get_column_count, get_row_count
3333
from pointblank.schema import Schema
3434
from pointblank._interrogation import (
3535
ColValsCompareOne,
@@ -39,6 +39,7 @@
3939
ColExistsHasType,
4040
ColSchemaMatch,
4141
RowCountMatch,
42+
ColCountMatch,
4243
NumberOfTestUnits,
4344
RowsDistinct,
4445
)
@@ -2883,9 +2884,9 @@ def row_count_match(
28832884
Parameters
28842885
----------
28852886
count
2886-
The expected row count of the table. This can be an integer value, a Polars DataFrame
2887-
object, or an Ibis backend table. If a DataFrame/table is provided, the row count of the
2888-
DataFrame will be used as the expected count.
2887+
The expected row count of the table. This can be an integer value, a Polars or Pandas
2888+
DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the row
2889+
count of that object will be used as the expected count.
28892890
inverse
28902891
Should the validation step be inverted? If `True`, then the expectation is that the row
28912892
count of the target table should not match the specified `count=` value.
@@ -2976,6 +2977,121 @@ def row_count_match(
29762977

29772978
return self
29782979

2980+
def col_count_match(
    self,
    count: int | FrameT | Any,
    inverse: bool = False,
    pre: Callable | None = None,
    thresholds: int | float | bool | tuple | dict | Thresholds = None,
    active: bool = True,
) -> Validate:
    """
    Validate whether the column count of the table matches a specified count.

    The `col_count_match()` method checks whether the column count of the target table matches a
    specified count. This validation will operate over a single test unit, which is whether the
    column count matches the specified count.

    We also have the option to invert the validation step by setting `inverse=True`. This will
    make the expectation that the column count of the target table *does not* match the
    specified count.

    Parameters
    ----------
    count
        The expected column count of the table. This can be an integer value, a Polars or Pandas
        DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the column
        count of that object will be used as the expected count.
    inverse
        Should the validation step be inverted? If `True`, then the expectation is that the
        column count of the target table should not match the specified `count=` value.
    pre
        A pre-processing function or lambda to apply to the data table for the validation step.
    thresholds
        Failure threshold levels so that the validation step can react accordingly when
        exceeding the set levels for different states (`warn`, `stop`, and `notify`). This can
        be created simply as an integer or float denoting the absolute number or fraction of
        failing test units for the 'warn' level. Otherwise, you can use a tuple of 1-3 values,
        a dictionary of 1-3 entries, or a Thresholds object.
    active
        A boolean value indicating whether the validation step should be active. Using `False`
        will make the validation step inactive (still reporting its presence and keeping indexes
        for the steps unchanged).

    Returns
    -------
    Validate
        The `Validate` object with the added validation step.

    Examples
    --------
    ```{python}
    #| echo: false
    #| output: false
    import pointblank as pb
    pb.config(report_incl_header=False, report_incl_footer=False)
    ```

    For the examples here, we'll use the built-in dataset `"game_revenue"`. The table can be
    obtained by calling `load_dataset("game_revenue")`.

    ```{python}
    import pointblank as pb

    game_revenue = pb.load_dataset("game_revenue")

    game_revenue
    ```

    Let's validate that the number of columns in the table matches a fixed value. In this case,
    we will use the value `11` as the expected column count.

    ```{python}
    validation = (
        pb.Validate(data=game_revenue)
        .col_count_match(count=11)
        .interrogate()
    )

    validation
    ```

    The validation table shows that the expectation value of `11` matches the actual count of
    columns in the target table. So, the single test unit passed.
    """

    # NOTE(review): presumably resolves to the name of this method ("col_count_match"), which
    # is the assertion type the reporting code checks for -- confirm against `_get_fn_name()`
    assertion_type = _get_fn_name()

    # Validate the user-supplied arguments before recording the step
    _check_pre(pre=pre)
    _check_thresholds(thresholds=thresholds)
    _check_boolean_input(param=active, param_name="active")
    _check_boolean_input(param=inverse, param_name="inverse")

    # Determine threshold to use (global or local) and normalize a local `thresholds=` value
    thresholds = (
        self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
    )

    # If `count` is a DataFrame or table then use the column count of the DataFrame as
    # the expected count; the Ibis table is recognized through its type name string
    if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str(type(count)):
        count = get_column_count(count)

    # Package up the `count=` and boolean params into a dictionary for later interrogation
    values = {"count": count, "inverse": inverse}

    val_info = _ValidationInfo(
        assertion_type=assertion_type,
        values=values,
        pre=pre,
        thresholds=thresholds,
        active=active,
    )

    self._add_validation(validation_info=val_info)

    # Return the `Validate` object itself so that further validation steps can be chained
    return self
3094+
29793095
def interrogate(
29803096
self,
29813097
collect_extracts: bool = True,
@@ -3300,10 +3416,28 @@ def interrogate(
33003416

33013417
results_tbl = None
33023418

3419+
if assertion_category == "COL_COUNT_MATCH":
3420+
3421+
result_bool = ColCountMatch(
3422+
data_tbl=data_tbl_step,
3423+
count=value["count"],
3424+
inverse=value["inverse"],
3425+
threshold=threshold,
3426+
tbl_type=tbl_type,
3427+
).get_test_results()
3428+
3429+
validation.all_passed = result_bool
3430+
validation.n = 1
3431+
validation.n_passed = int(result_bool)
3432+
validation.n_failed = 1 - result_bool
3433+
3434+
results_tbl = None
3435+
33033436
if assertion_category not in [
33043437
"COL_EXISTS_HAS_TYPE",
33053438
"COL_SCHEMA_MATCH",
33063439
"ROW_COUNT_MATCH",
3440+
"COL_COUNT_MATCH",
33073441
]:
33083442

33093443
# Extract the `pb_is_good_` column from the table as a results list
@@ -4793,7 +4927,7 @@ def get_tabular_report(
47934927
# Iterate over the values in the `column` entry
47944928
for i, column in enumerate(columns):
47954929

4796-
if assertion_type[i] in ["col_schema_match", "row_count_match"]:
4930+
if assertion_type[i] in ["col_schema_match", "row_count_match", "col_count_match"]:
47974931
columns_upd.append("&mdash;")
47984932
else:
47994933
columns_upd.append(column)
@@ -4850,7 +4984,7 @@ def get_tabular_report(
48504984
elif assertion_type[i] in ["col_schema_match"]:
48514985
values_upd.append("SCHEMA")
48524986

4853-
elif assertion_type[i] in ["row_count_match"]:
4987+
elif assertion_type[i] in ["row_count_match", "col_count_match"]:
48544988

48554989
count = values[i]["count"]
48564990
inverse = values[i]["inverse"]

0 commit comments

Comments
 (0)