|
29 | 29 | SVG_ICONS_FOR_TBL_STATUS, |
30 | 30 | ) |
31 | 31 | from pointblank.column import Column, col, ColumnSelector |
32 | | -from pointblank.preview import get_row_count |
| 32 | +from pointblank.preview import get_column_count, get_row_count |
33 | 33 | from pointblank.schema import Schema |
34 | 34 | from pointblank._interrogation import ( |
35 | 35 | ColValsCompareOne, |
|
39 | 39 | ColExistsHasType, |
40 | 40 | ColSchemaMatch, |
41 | 41 | RowCountMatch, |
| 42 | + ColCountMatch, |
42 | 43 | NumberOfTestUnits, |
43 | 44 | RowsDistinct, |
44 | 45 | ) |
@@ -2883,9 +2884,9 @@ def row_count_match( |
2883 | 2884 | Parameters |
2884 | 2885 | ---------- |
2885 | 2886 | count |
2886 | | - The expected row count of the table. This can be an integer value, a Polars DataFrame |
2887 | | - object, or an Ibis backend table. If a DataFrame/table is provided, the row count of the |
2888 | | - DataFrame will be used as the expected count. |
| 2887 | + The expected row count of the table. This can be an integer value, a Polars or Pandas |
| 2888 | + DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the row |
| 2889 | + count of that object will be used as the expected count. |
2889 | 2890 | inverse |
2890 | 2891 | Should the validation step be inverted? If `True`, then the expectation is that the row |
2891 | 2892 | count of the target table should not match the specified `count=` value. |
@@ -2976,6 +2977,121 @@ def row_count_match( |
2976 | 2977 |
|
2977 | 2978 | return self |
2978 | 2979 |
|
| 2980 | + def col_count_match( |
| 2981 | + self, |
| 2982 | + count: int | FrameT | Any, |
| 2983 | + inverse: bool = False, |
| 2984 | + pre: Callable | None = None, |
| 2985 | + thresholds: int | float | bool | tuple | dict | Thresholds = None, |
| 2986 | + active: bool = True, |
| 2987 | + ) -> Validate: |
| 2988 | + """ |
| 2989 | + Validate whether the column count of the table matches a specified count. |
| 2990 | +
|
| 2991 | + The `col_count_match()` method checks whether the column count of the target table matches a |
| 2992 | + specified count. This validation will operate over a single test unit, which is whether the |
| 2993 | + column count matches the specified count. |
| 2994 | +
|
| 2995 | + We also have the option to invert the validation step by setting `inverse=True`. This will |
| 2996 | + make the expectation that the column count of the target table *does not* match the |
| 2997 | + specified count. |
| 2998 | +
|
| 2999 | + Parameters |
| 3000 | + ---------- |
| 3001 | + count |
| 3002 | + The expected column count of the table. This can be an integer value, a Polars or Pandas |
| 3003 | + DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the column |
| 3004 | + count of that object will be used as the expected count. |
| 3005 | + inverse |
| 3006 | + Should the validation step be inverted? If `True`, then the expectation is that the |
| 3007 | + column count of the target table should not match the specified `count=` value. |
| 3008 | + pre |
| 3009 | + A pre-processing function or lambda to apply to the data table for the validation step. |
| 3010 | + thresholds |
| 3011 | + Failure threshold levels so that the validation step can react accordingly when |
| 3012 | + exceeding the set levels for different states (`warn`, `stop`, and `notify`). This can |
| 3013 | + be created simply as an integer or float denoting the absolute number or fraction of |
| 3014 | + failing test units for the 'warn' level. Otherwise, you can use a tuple of 1-3 values, |
| 3015 | + a dictionary of 1-3 entries, or a Thresholds object. |
| 3016 | + active |
| 3017 | + A boolean value indicating whether the validation step should be active. Using `False` |
| 3018 | + will make the validation step inactive (still reporting its presence and keeping indexes |
| 3019 | + for the steps unchanged). |
| 3020 | +
|
| 3021 | + Returns |
| 3022 | + ------- |
| 3023 | + Validate |
| 3024 | + The `Validate` object with the added validation step. |
| 3025 | +
|
| 3026 | + Examples |
| 3027 | + -------- |
| 3028 | + ```{python} |
| 3029 | + #| echo: false |
| 3030 | + #| output: false |
| 3031 | + import pointblank as pb |
| 3032 | + pb.config(report_incl_header=False, report_incl_footer=False) |
| 3033 | + ``` |
| 3034 | +
|
| 3035 | + For the examples here, we'll use the built-in dataset `"game_revenue"`. The table can be |
| 3036 | + obtained by calling `load_dataset("game_revenue")`. |
| 3037 | +
|
| 3038 | + ```{python} |
| 3039 | + import pointblank as pb |
| 3040 | +
|
| 3041 | + game_revenue = pb.load_dataset("game_revenue") |
| 3042 | +
|
| 3043 | + game_revenue |
| 3044 | + ``` |
| 3045 | +
|
| 3046 | + Let's validate that the number of columns in the table matches a fixed value. In this case, |
| 3047 | + we will use the value `11` as the expected column count. |
| 3048 | +
|
| 3049 | + ```{python} |
| 3050 | + validation = ( |
| 3051 | + pb.Validate(data=game_revenue) |
| 3052 | + .col_count_match(count=11) |
| 3053 | + .interrogate() |
| 3054 | + ) |
| 3055 | +
|
| 3056 | + validation |
| 3057 | + ``` |
| 3058 | +
|
| 3059 | + The validation table shows that the expectation value of `11` matches the actual count of |
| 3060 | + columns in the target table. So, the single test unit passed. |
| 3061 | + """ |
| 3062 | + |
| 3063 | + assertion_type = _get_fn_name() |
| 3064 | + |
| 3065 | + _check_pre(pre=pre) |
| 3066 | + _check_thresholds(thresholds=thresholds) |
| 3067 | + _check_boolean_input(param=active, param_name="active") |
| 3068 | + _check_boolean_input(param=inverse, param_name="inverse") |
| 3069 | + |
| 3070 | + # Determine threshold to use (global or local) and normalize a local `thresholds=` value |
| 3071 | + thresholds = ( |
| 3072 | + self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) |
| 3073 | + ) |
| 3074 | + |
| 3075 | + # If `count` is a DataFrame or table then use the column count of the DataFrame as |
| 3076 | + # the expected count |
| 3077 | + if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str(type(count)): |
| 3078 | + count = get_column_count(count) |
| 3079 | + |
| 3080 | + # Package up the `count=` and boolean params into a dictionary for later interrogation |
| 3081 | + values = {"count": count, "inverse": inverse} |
| 3082 | + |
| 3083 | + val_info = _ValidationInfo( |
| 3084 | + assertion_type=assertion_type, |
| 3085 | + values=values, |
| 3086 | + pre=pre, |
| 3087 | + thresholds=thresholds, |
| 3088 | + active=active, |
| 3089 | + ) |
| 3090 | + |
| 3091 | + self._add_validation(validation_info=val_info) |
| 3092 | + |
| 3093 | + return self |
| 3094 | + |
2979 | 3095 | def interrogate( |
2980 | 3096 | self, |
2981 | 3097 | collect_extracts: bool = True, |
@@ -3300,10 +3416,28 @@ def interrogate( |
3300 | 3416 |
|
3301 | 3417 | results_tbl = None |
3302 | 3418 |
|
| 3419 | + if assertion_category == "COL_COUNT_MATCH": |
| 3420 | + |
| 3421 | + result_bool = ColCountMatch( |
| 3422 | + data_tbl=data_tbl_step, |
| 3423 | + count=value["count"], |
| 3424 | + inverse=value["inverse"], |
| 3425 | + threshold=threshold, |
| 3426 | + tbl_type=tbl_type, |
| 3427 | + ).get_test_results() |
| 3428 | + |
| 3429 | + validation.all_passed = result_bool |
| 3430 | + validation.n = 1 |
| 3431 | + validation.n_passed = int(result_bool) |
| 3432 | + validation.n_failed = 1 - result_bool |
| 3433 | + |
| 3434 | + results_tbl = None |
| 3435 | + |
3303 | 3436 | if assertion_category not in [ |
3304 | 3437 | "COL_EXISTS_HAS_TYPE", |
3305 | 3438 | "COL_SCHEMA_MATCH", |
3306 | 3439 | "ROW_COUNT_MATCH", |
| 3440 | + "COL_COUNT_MATCH", |
3307 | 3441 | ]: |
3308 | 3442 |
|
3309 | 3443 | # Extract the `pb_is_good_` column from the table as a results list |
@@ -4793,7 +4927,7 @@ def get_tabular_report( |
4793 | 4927 | # Iterate over the values in the `column` entry |
4794 | 4928 | for i, column in enumerate(columns): |
4795 | 4929 |
|
4796 | | - if assertion_type[i] in ["col_schema_match", "row_count_match"]: |
| 4930 | + if assertion_type[i] in ["col_schema_match", "row_count_match", "col_count_match"]: |
4797 | 4931 | columns_upd.append("—") |
4798 | 4932 | else: |
4799 | 4933 | columns_upd.append(column) |
@@ -4850,7 +4984,7 @@ def get_tabular_report( |
4850 | 4984 | elif assertion_type[i] in ["col_schema_match"]: |
4851 | 4985 | values_upd.append("SCHEMA") |
4852 | 4986 |
|
4853 | | - elif assertion_type[i] in ["row_count_match"]: |
| 4987 | + elif assertion_type[i] in ["row_count_match", "col_count_match"]: |
4854 | 4988 |
|
4855 | 4989 | count = values[i]["count"] |
4856 | 4990 | inverse = values[i]["inverse"] |
|
0 commit comments