diff --git a/docs/_quarto.yml b/docs/_quarto.yml index ee512938f..66aac42d9 100644 --- a/docs/_quarto.yml +++ b/docs/_quarto.yml @@ -157,6 +157,8 @@ quartodoc: - name: Validate.col_vals_outside - name: Validate.col_vals_in_set - name: Validate.col_vals_not_in_set + - name: Validate.col_vals_increasing + - name: Validate.col_vals_decreasing - name: Validate.col_vals_null - name: Validate.col_vals_not_null - name: Validate.col_vals_regex diff --git a/pointblank/_constants.py b/pointblank/_constants.py index e45d5c191..a8e20bf47 100644 --- a/pointblank/_constants.py +++ b/pointblank/_constants.py @@ -36,6 +36,8 @@ "col_vals_not_in_set": "not_in_set", "col_vals_regex": "regex", "col_vals_within_spec": "within_spec", + "col_vals_increasing": "increasing", + "col_vals_decreasing": "decreasing", "col_vals_null": "null", "col_vals_not_null": "not_null", "col_vals_expr": "expr", @@ -81,6 +83,8 @@ "col_vals_not_in_set", "col_vals_regex", "col_vals_within_spec", + "col_vals_increasing", + "col_vals_decreasing", "col_vals_null", "col_vals_not_null", "col_vals_expr", @@ -310,6 +314,30 @@ +""", + "col_vals_increasing": """ + + col_vals_increasing + + + + + + + + +""", + "col_vals_decreasing": """ + + col_vals_decreasing + + + + + + + + """, "col_vals_null": """ diff --git a/pointblank/_interrogation.py b/pointblank/_interrogation.py index ee35b5882..47d796e8f 100644 --- a/pointblank/_interrogation.py +++ b/pointblank/_interrogation.py @@ -2197,6 +2197,122 @@ def interrogate_not_null(tbl: FrameT, column: str) -> FrameT: return result_tbl.to_native() +def interrogate_increasing( + tbl: FrameT, column: str, allow_stationary: bool, decreasing_tol: float, na_pass: bool +) -> FrameT: + """ + Increasing interrogation. + + Checks whether column values are increasing row by row. + + Parameters + ---------- + tbl + The table to interrogate. + column + The column to check. + allow_stationary + Whether to allow consecutive equal values (stationary phases). 
+ decreasing_tol + Optional tolerance for negative movement (decreasing values). + na_pass + Whether NA/null values should be considered as passing. + + Returns + ------- + FrameT + The table with a `pb_is_good_` column indicating pass/fail for each row. + """ + nw_tbl = nw.from_native(tbl) + + # Create a lagged difference column + result_tbl = nw_tbl.with_columns(pb_lagged_difference_=nw.col(column) - nw.col(column).shift(1)) + + # Build the condition based on allow_stationary and decreasing_tol + if allow_stationary or decreasing_tol != 0: + # Allow stationary (diff >= 0) or within tolerance + threshold = -abs(decreasing_tol) if decreasing_tol != 0 else 0 + good_condition = nw.col("pb_lagged_difference_") >= threshold + else: + # Strictly increasing (diff > 0) + good_condition = nw.col("pb_lagged_difference_") > 0 + + # Apply the validation logic + # The logic is: + # 1. If lagged_diff is null AND current value is NOT null -> pass (first row or after NA) + # 2. If current value is null -> apply na_pass + # 3. Otherwise -> apply the good_condition + result_tbl = result_tbl.with_columns( + pb_is_good_=nw.when(nw.col("pb_lagged_difference_").is_null() & ~nw.col(column).is_null()) + .then(nw.lit(True)) # First row or row after NA (can't validate) + .otherwise( + nw.when(nw.col(column).is_null()) + .then(nw.lit(na_pass)) # Handle NA values in current row + .otherwise(good_condition) + ) + ) + + return result_tbl.drop("pb_lagged_difference_").to_native() + + +def interrogate_decreasing( + tbl: FrameT, column: str, allow_stationary: bool, increasing_tol: float, na_pass: bool +) -> FrameT: + """ + Decreasing interrogation. + + Checks whether column values are decreasing row by row. + + Parameters + ---------- + tbl + The table to interrogate. + column + The column to check. + allow_stationary + Whether to allow consecutive equal values (stationary phases). + increasing_tol + Optional tolerance for positive movement (increasing values). 
+ na_pass + Whether NA/null values should be considered as passing. + + Returns + ------- + FrameT + The table with a `pb_is_good_` column indicating pass/fail for each row. + """ + nw_tbl = nw.from_native(tbl) + + # Create a lagged difference column + result_tbl = nw_tbl.with_columns(pb_lagged_difference_=nw.col(column) - nw.col(column).shift(1)) + + # Build the condition based on allow_stationary and increasing_tol + if allow_stationary or increasing_tol != 0: + # Allow stationary (diff <= 0) or within tolerance + threshold = abs(increasing_tol) if increasing_tol != 0 else 0 + good_condition = nw.col("pb_lagged_difference_") <= threshold + else: + # Strictly decreasing (diff < 0) + good_condition = nw.col("pb_lagged_difference_") < 0 + + # Apply the validation logic + # The logic is: + # 1. If lagged_diff is null AND current value is NOT null -> pass (first row or after NA) + # 2. If current value is null -> apply na_pass + # 3. Otherwise -> apply the good_condition + result_tbl = result_tbl.with_columns( + pb_is_good_=nw.when(nw.col("pb_lagged_difference_").is_null() & ~nw.col(column).is_null()) + .then(nw.lit(True)) # First row or row after NA (can't validate) + .otherwise( + nw.when(nw.col(column).is_null()) + .then(nw.lit(na_pass)) # Handle NA values in current row + .otherwise(good_condition) + ) + ) + + return result_tbl.drop("pb_lagged_difference_").to_native() + + def _interrogate_comparison_base( tbl: FrameT, column: str, compare: any, na_pass: bool, operator: str ) -> FrameT: diff --git a/pointblank/_utils.py b/pointblank/_utils.py index 0622cc55d..6d5712470 100644 --- a/pointblank/_utils.py +++ b/pointblank/_utils.py @@ -670,9 +670,12 @@ def _get_api_text() -> str: "Validate.col_vals_outside", "Validate.col_vals_in_set", "Validate.col_vals_not_in_set", + "Validate.col_vals_increasing", + "Validate.col_vals_decreasing", "Validate.col_vals_null", "Validate.col_vals_not_null", "Validate.col_vals_regex", + "Validate.col_vals_within_spec", 
"Validate.col_vals_expr", "Validate.rows_distinct", "Validate.rows_complete", diff --git a/pointblank/data/api-docs.txt b/pointblank/data/api-docs.txt index fb57594e7..2f293f372 100644 --- a/pointblank/data/api-docs.txt +++ b/pointblank/data/api-docs.txt @@ -3677,6 +3677,262 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | statuses in the `InvalidStatus` enum. +col_vals_increasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, decreasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate' + + Are column data increasing by row? + + The `col_vals_increasing()` validation method checks whether column values in a table are + increasing when moving down a table. There are options for allowing missing values in the + target column, allowing stationary phases (where consecutive values don't change), and even + one for allowing decreasing movements up to a certain threshold. This validation will + operate over the number of test units that is equal to the number of rows in the table + (determined after any `pre=` mutation has been applied). + + Parameters + ---------- + columns + A single column or a list of columns to validate. Can also use + [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If + multiple columns are supplied or resolved, there will be a separate validation step + generated for each column. + allow_stationary + An option to allow pauses in increasing values. For example, if the values for the test + units are `[80, 82, 82, 85, 88]` then the third unit (`82`, appearing a second time) + would be marked as failing when `allow_stationary` is `False`. 
Using + `allow_stationary=True` will result in all the test units in `[80, 82, 82, 85, 88]` to + be marked as passing. + decreasing_tol + An optional threshold value that allows for movement of numerical values in the negative + direction. By default this is `None` but using a numerical value will set the absolute + threshold of negative travel allowed across numerical test units. Note that setting a + value here also has the effect of setting `allow_stationary` to `True`. + na_pass + Should any encountered None, NA, or Null values be considered as passing test units? By + default, this is `False`. Set to `True` to pass test units with missing values. + pre + An optional preprocessing function or lambda to apply to the data table during + interrogation. This function should take a table as input and return a modified table. + Have a look at the *Preprocessing* section for more information on how to use this + argument. + segments + An optional directive on segmentation, which serves to split a validation step into + multiple (one step per segment). Can be a single column name, a tuple that specifies a + column name and its corresponding values to segment on, or a combination of both + (provided as a list). Read the *Segmentation* section for usage information. + thresholds + Set threshold failure levels for reporting and reacting to exceedences of the levels. + The thresholds are set at the step level and will override any global thresholds set in + `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will + be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* + section for information on how to set threshold levels. + actions + Optional actions to take when the validation step(s) meets or exceeds any set threshold + levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to + define the actions. 
+ brief + An optional brief description of the validation step that will be displayed in the + reporting table. You can use the templating elements like `"{step}"` to insert + the step number, or `"{auto}"` to include an automatically generated brief. If `True` + the entire brief will be automatically generated. If `None` (the default) then there + won't be a brief. + active + A boolean value indicating whether the validation step should be active. Using `False` + will make the validation step inactive (still reporting its presence and keeping indexes + for the steps unchanged). + + Returns + ------- + Validate + The `Validate` object with the added validation step. + + Examples + -------- + For the examples here, we'll use a simple Polars DataFrame with three numeric columns + (`a`, `b`, and `c`). The table is shown below: + + ```python + import pointblank as pb + import polars as pl + + tbl = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": [1, 2, 2, 3, 4, 5], + "c": [1, 2, 1, 3, 4, 5], + } + ) + + pb.preview(tbl) + ``` + + Let's validate that values in column `a` are increasing. We'll determine if this validation + had any failing test units (there are six test units, one for each row). + + ```python + validation = ( + pb.Validate(data=tbl) + .col_vals_increasing(columns="a") + .interrogate() + ) + + validation + ``` + + The validation passed as all values in column `a` are increasing. Now let's check column + `b` which has a stationary value: + + ```python + validation = ( + pb.Validate(data=tbl) + .col_vals_increasing(columns="b") + .interrogate() + ) + + validation + ``` + + This validation fails at the third row because the value `2` is repeated. 
If we want to + allow stationary values, we can use `allow_stationary=True`: + + ```python + validation = ( + pb.Validate(data=tbl) + .col_vals_increasing(columns="b", allow_stationary=True) + .interrogate() + ) + + validation + ``` + + +col_vals_decreasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, increasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate' + + Are column data decreasing by row? + + The `col_vals_decreasing()` validation method checks whether column values in a table are + decreasing when moving down a table. There are options for allowing missing values in the + target column, allowing stationary phases (where consecutive values don't change), and even + one for allowing increasing movements up to a certain threshold. This validation will + operate over the number of test units that is equal to the number of rows in the table + (determined after any `pre=` mutation has been applied). + + Parameters + ---------- + columns + A single column or a list of columns to validate. Can also use + [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If + multiple columns are supplied or resolved, there will be a separate validation step + generated for each column. + allow_stationary + An option to allow pauses in decreasing values. For example, if the values for the test + units are `[88, 85, 85, 82, 80]` then the third unit (`85`, appearing a second time) + would be marked as failing when `allow_stationary` is `False`. Using + `allow_stationary=True` will result in all the test units in `[88, 85, 85, 82, 80]` to + be marked as passing. 
+ increasing_tol + An optional threshold value that allows for movement of numerical values in the positive + direction. By default this is `None` but using a numerical value will set the absolute + threshold of positive travel allowed across numerical test units. Note that setting a + value here also has the effect of setting `allow_stationary` to `True`. + na_pass + Should any encountered None, NA, or Null values be considered as passing test units? By + default, this is `False`. Set to `True` to pass test units with missing values. + pre + An optional preprocessing function or lambda to apply to the data table during + interrogation. This function should take a table as input and return a modified table. + Have a look at the *Preprocessing* section for more information on how to use this + argument. + segments + An optional directive on segmentation, which serves to split a validation step into + multiple (one step per segment). Can be a single column name, a tuple that specifies a + column name and its corresponding values to segment on, or a combination of both + (provided as a list). Read the *Segmentation* section for usage information. + thresholds + Set threshold failure levels for reporting and reacting to exceedences of the levels. + The thresholds are set at the step level and will override any global thresholds set in + `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will + be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* + section for information on how to set threshold levels. + actions + Optional actions to take when the validation step(s) meets or exceeds any set threshold + levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to + define the actions. + brief + An optional brief description of the validation step that will be displayed in the + reporting table. 
You can use the templating elements like `"{step}"` to insert + the step number, or `"{auto}"` to include an automatically generated brief. If `True` + the entire brief will be automatically generated. If `None` (the default) then there + won't be a brief. + active + A boolean value indicating whether the validation step should be active. Using `False` + will make the validation step inactive (still reporting its presence and keeping indexes + for the steps unchanged). + + Returns + ------- + Validate + The `Validate` object with the added validation step. + + Examples + -------- + For the examples here, we'll use a simple Polars DataFrame with three numeric columns + (`a`, `b`, and `c`). The table is shown below: + + ```python + import pointblank as pb + import polars as pl + + tbl = pl.DataFrame( + { + "a": [6, 5, 4, 3, 2, 1], + "b": [5, 4, 4, 3, 2, 1], + "c": [5, 4, 5, 3, 2, 1], + } + ) + + pb.preview(tbl) + ``` + + Let's validate that values in column `a` are decreasing. We'll determine if this validation + had any failing test units (there are six test units, one for each row). + + ```python + validation = ( + pb.Validate(data=tbl) + .col_vals_decreasing(columns="a") + .interrogate() + ) + + validation + ``` + + The validation passed as all values in column `a` are decreasing. Now let's check column + `b` which has a stationary value: + + ```python + validation = ( + pb.Validate(data=tbl) + .col_vals_decreasing(columns="b") + .interrogate() + ) + + validation + ``` + + This validation fails at the third row because the value `4` is repeated. 
If we want to + allow stationary values, we can use `allow_stationary=True`: + + ```python + validation = ( + pb.Validate(data=tbl) + .col_vals_decreasing(columns="b", allow_stationary=True) + .interrogate() + ) + + validation + ``` + + col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate' Validate whether values in a column are Null. @@ -4245,6 +4501,224 @@ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | Colum string values of rows 1 and 2 in column `b`. +col_vals_within_spec(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', spec: 'str', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate' + + Validate whether column values fit within a specification. + + The `col_vals_within_spec()` validation method checks whether column values in a table + correspond to a specification (`spec=`) type (details of which are available in the + *Specifications* section). Specifications include common data types like email addresses, + URLs, postal codes, vehicle identification numbers (VINs), International Bank Account + Numbers (IBANs), and more. This validation will operate over the number of test units that + is equal to the number of rows in the table. + + Parameters + ---------- + columns + A single column or a list of columns to validate. Can also use + [`col()`](`pointblank.col`) with column selectors to specify one or more columns. 
If + multiple columns are supplied or resolved, there will be a separate validation step + generated for each column. + spec + A specification string for defining the specification type. Examples are `"email"`, + `"url"`, and `"postal_code[USA]"`. See the *Specifications* section for all available + options. + na_pass + Should any encountered None, NA, or Null values be considered as passing test units? By + default, this is `False`. Set to `True` to pass test units with missing values. + pre + An optional preprocessing function or lambda to apply to the data table during + interrogation. This function should take a table as input and return a modified table. + Have a look at the *Preprocessing* section for more information on how to use this + argument. + segments + An optional directive on segmentation, which serves to split a validation step into + multiple (one step per segment). Can be a single column name, a tuple that specifies a + column name and its corresponding values to segment on, or a combination of both + (provided as a list). Read the *Segmentation* section for usage information. + thresholds + Set threshold failure levels for reporting and reacting to exceedences of the levels. + The thresholds are set at the step level and will override any global thresholds set in + `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will + be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* + section for information on how to set threshold levels. + actions + Optional actions to take when the validation step(s) meets or exceeds any set threshold + levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to + define the actions. + brief + An optional brief description of the validation step that will be displayed in the + reporting table. 
You can use the templating elements like `"{step}"` to insert + the step number, or `"{auto}"` to include an automatically generated brief. If `True` + the entire brief will be automatically generated. If `None` (the default) then there + won't be a brief. + active + A boolean value indicating whether the validation step should be active. Using `False` + will make the validation step inactive (still reporting its presence and keeping indexes + for the steps unchanged). + + Returns + ------- + Validate + The `Validate` object with the added validation step. + + Specifications + -------------- + A specification type must be used with the `spec=` argument. This is a string-based keyword + that corresponds to the type of data in the specified columns. The following keywords can + be used: + + - `"isbn"`: The International Standard Book Number (ISBN) is a unique numerical identifier + for books. This keyword validates both 10-digit and 13-digit ISBNs. + + - `"vin"`: A vehicle identification number (VIN) is a unique code used by the automotive + industry to identify individual motor vehicles. + + - `"postal_code[<country_code>]"`: A postal code (also known as postcodes, PIN, or ZIP + codes) is a series of letters, digits, or both included in a postal address. Because the + coding varies by country, a country code in either the 2-letter (ISO 3166-1 alpha-2) or + 3-letter (ISO 3166-1 alpha-3) format needs to be supplied (e.g., `"postal_code[US]"` or + `"postal_code[USA]"`). The keyword alias `"zip"` can be used for US ZIP codes. + + - `"credit_card"`: A credit card number can be validated across a variety of issuers. The + validation uses the Luhn algorithm. + + - `"iban[<country_code>]"`: The International Bank Account Number (IBAN) is a system of + identifying bank accounts across countries. Because the length and coding vary by + country, a country code needs to be supplied (e.g., `"iban[DE]"` or `"iban[DEU]"`). 
+ + - `"swift"`: Business Identifier Codes (also known as SWIFT-BIC, BIC, or SWIFT code) are + unique identifiers for financial and non-financial institutions. + + - `"phone"`, `"email"`, `"url"`, `"ipv4"`, `"ipv6"`, `"mac"`: Phone numbers, email + addresses, Internet URLs, IPv4 or IPv6 addresses, and MAC addresses can be validated with + their respective keywords. + + Only a single `spec=` value should be provided per function call. + + Preprocessing + ------------- + The `pre=` argument allows for a preprocessing function or lambda to be applied to the data + table during interrogation. This function should take a table as input and return a modified + table. This is useful for performing any necessary transformations or filtering on the data + before the validation step is applied. + + The preprocessing function can be any callable that takes a table as input and returns a + modified table. For example, you could use a lambda function to filter the table based on + certain criteria or to apply a transformation to the data. Note that you can refer to + a column via `columns=` that is expected to be present in the transformed table, but may not + exist in the table before preprocessing. Regarding the lifetime of the transformed table, it + only exists during the validation step and is not stored in the `Validate` object or used in + subsequent validation steps. + + Segmentation + ------------ + The `segments=` argument allows for the segmentation of a validation step into multiple + segments. This is useful for applying the same validation step to different subsets of the + data. The segmentation can be done based on a single column or specific fields within a + column. + + Providing a single column name will result in a separate validation step for each unique + value in that column. For example, if you have a column called `"region"` with values + `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each + region. 
+ + Alternatively, you can provide a tuple that specifies a column name and its corresponding + values to segment on. For example, if you have a column called `"date"` and you want to + segment on only specific dates, you can provide a tuple like + `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded + (i.e., no validation steps will be created for them). + + A list with a combination of column names and tuples can be provided as well. This allows + for more complex segmentation scenarios. The following inputs are both valid: + + ``` + # Segments from all unique values in the `region` column + # and specific dates in the `date` column + segments=["region", ("date", ["2023-01-01", "2023-01-02"])] + + # Segments from all unique values in the `region` and `date` columns + segments=["region", "date"] + ``` + + The segmentation is performed during interrogation, and the resulting validation steps will + be numbered sequentially. Each segment will have its own validation step, and the results + will be reported separately. This allows for a more granular analysis of the data and helps + identify issues within specific segments. + + Importantly, the segmentation process will be performed after any preprocessing of the data + table. Because of this, one can conceivably use the `pre=` argument to generate a column + that can be used for segmentation. For example, you could create a new column called + `"segment"` through use of `pre=` and then use that column for segmentation. + + Thresholds + ---------- + The `thresholds=` parameter is used to set the failure-condition levels for the validation + step. If they are set here at the step level, these thresholds will override any thresholds + set at the global level in `Validate(thresholds=...)`. + + There are three threshold levels: 'warning', 'error', and 'critical'. 
The threshold values + can either be set as the proportion of all test units that fail (a value between `0` and `1`), + or the absolute number of failing test units (as an integer that's `1` or greater). + + Thresholds can be defined using one of these input schemes: + + 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create + thresholds) + 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is + the 'error' level, and position `2` is the 'critical' level + 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and + 'critical' + 4. a single integer/float value denoting absolute number or fraction of failing test units + for the 'warning' level only + + If the number of failing test units exceeds set thresholds, the validation step will be + marked as 'warning', 'error', or 'critical'. The threshold levels don't all need to be + set; you're free to set any combination of them. + + Aside from reporting failure conditions, thresholds can be used to determine the actions to + take for each level of failure (using the `actions=` parameter). + + Examples + -------- + For the examples here, we'll use a simple Polars DataFrame with an email column. The table + is shown below: + + ```python + import pointblank as pb + import polars as pl + + tbl = pl.DataFrame( + { + "email": [ + "user@example.com", + "admin@test.org", + "invalid-email", + "contact@company.co.uk", + ], + } + ) + + pb.preview(tbl) + ``` + + Let's validate that all of the values in the `email` column are valid email addresses. + We'll determine if this validation had any failing test units (there are four test units, + one for each row). + + ```python + validation = ( + pb.Validate(data=tbl) + .col_vals_within_spec(columns="email", spec="email") + .interrogate() + ) + + validation + ``` + + The validation table shows that one test unit failed (the invalid email address in row 3). 
+ + col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate' Validate column values using a custom expression. diff --git a/pointblank/validate.py b/pointblank/validate.py index f4d502b4e..fa7d0d9ed 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -7895,6 +7895,382 @@ class InvalidStatus(Enum): return self + def col_vals_increasing( + self, + columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, + allow_stationary: bool = False, + decreasing_tol: float | None = None, + na_pass: bool = False, + pre: Callable | None = None, + segments: SegmentSpec | None = None, + thresholds: int | float | bool | tuple | dict | Thresholds = None, + actions: Actions | None = None, + brief: str | bool | None = None, + active: bool = True, + ) -> Validate: + """ + Are column data increasing by row? + + The `col_vals_increasing()` validation method checks whether column values in a table are + increasing when moving down a table. There are options for allowing missing values in the + target column, allowing stationary phases (where consecutive values don't change), and even + one for allowing decreasing movements up to a certain threshold. This validation will + operate over the number of test units that is equal to the number of rows in the table + (determined after any `pre=` mutation has been applied). + + Parameters + ---------- + columns + A single column or a list of columns to validate. Can also use + [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If + multiple columns are supplied or resolved, there will be a separate validation step + generated for each column. + allow_stationary + An option to allow pauses in increasing values. 
For example, if the values for the test + units are `[80, 82, 82, 85, 88]` then the third unit (`82`, appearing a second time) + would be marked as failing when `allow_stationary` is `False`. Using + `allow_stationary=True` will result in all the test units in `[80, 82, 82, 85, 88]` to + be marked as passing. + decreasing_tol + An optional threshold value that allows for movement of numerical values in the negative + direction. By default this is `None` but using a numerical value will set the absolute + threshold of negative travel allowed across numerical test units. Note that setting a + value here also has the effect of setting `allow_stationary` to `True`. + na_pass + Should any encountered None, NA, or Null values be considered as passing test units? By + default, this is `False`. Set to `True` to pass test units with missing values. + pre + An optional preprocessing function or lambda to apply to the data table during + interrogation. This function should take a table as input and return a modified table. + Have a look at the *Preprocessing* section for more information on how to use this + argument. + segments + An optional directive on segmentation, which serves to split a validation step into + multiple (one step per segment). Can be a single column name, a tuple that specifies a + column name and its corresponding values to segment on, or a combination of both + (provided as a list). Read the *Segmentation* section for usage information. + thresholds + Set threshold failure levels for reporting and reacting to exceedences of the levels. + The thresholds are set at the step level and will override any global thresholds set in + `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will + be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* + section for information on how to set threshold levels. 
+ actions + Optional actions to take when the validation step(s) meets or exceeds any set threshold + levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to + define the actions. + brief + An optional brief description of the validation step that will be displayed in the + reporting table. You can use the templating elements like `"{step}"` to insert + the step number, or `"{auto}"` to include an automatically generated brief. If `True` + the entire brief will be automatically generated. If `None` (the default) then there + won't be a brief. + active + A boolean value indicating whether the validation step should be active. Using `False` + will make the validation step inactive (still reporting its presence and keeping indexes + for the steps unchanged). + + Returns + ------- + Validate + The `Validate` object with the added validation step. + + Examples + -------- + ```{python} + #| echo: false + #| output: false + import pointblank as pb + pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False) + ``` + + For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`, + `b`, and `c`). The table is shown below: + + ```{python} + import pointblank as pb + import polars as pl + + tbl = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": [1, 2, 2, 3, 4, 5], + "c": [1, 2, 1, 3, 4, 5], + } + ) + + pb.preview(tbl) + ``` + + Let's validate that values in column `a` are increasing. We'll determine if this validation + had any failing test units (there are six test units, one for each row). + + ```{python} + validation = ( + pb.Validate(data=tbl) + .col_vals_increasing(columns="a") + .interrogate() + ) + + validation + ``` + + The validation passed as all values in column `a` are increasing.
Now let's check column + `b` which has a stationary value: + + ```{python} + validation = ( + pb.Validate(data=tbl) + .col_vals_increasing(columns="b") + .interrogate() + ) + + validation + ``` + + This validation fails at the third row because the value `2` is repeated. If we want to + allow stationary values, we can use `allow_stationary=True`: + + ```{python} + validation = ( + pb.Validate(data=tbl) + .col_vals_increasing(columns="b", allow_stationary=True) + .interrogate() + ) + + validation + ``` + """ + assertion_type = "col_vals_increasing" + + # Determine threshold to use (global or local) and normalize a local `thresholds=` value + thresholds = ( + self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + ) + + # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later + # resolve the columns + if isinstance(columns, (ColumnSelector, nw.selectors.Selector)): + columns = col(columns) + + # If `columns` is Column value or a string, place it in a list for iteration + if isinstance(columns, (Column, str)): + columns = [columns] + + # Determine brief to use (global or local) and transform any shorthands of `brief=` + brief = self.brief if brief is None else _transform_auto_brief(brief=brief) + + # Iterate over the columns and create a validation step for each + for column in columns: + val_info = _ValidationInfo( + assertion_type=assertion_type, + column=column, + values="", + na_pass=na_pass, + pre=pre, + segments=segments, + thresholds=thresholds, + actions=actions, + brief=brief, + active=active, + val_info={ + "allow_stationary": allow_stationary, + "decreasing_tol": decreasing_tol if decreasing_tol else 0.0, + }, + ) + + self._add_validation(validation_info=val_info) + + return self + + def col_vals_decreasing( + self, + columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, + allow_stationary: bool = False, + increasing_tol: float | None = None, + na_pass: bool = False, + pre: 
Callable | None = None, + segments: SegmentSpec | None = None, + thresholds: int | float | bool | tuple | dict | Thresholds = None, + actions: Actions | None = None, + brief: str | bool | None = None, + active: bool = True, + ) -> Validate: + """ + Are column data decreasing by row? + + The `col_vals_decreasing()` validation method checks whether column values in a table are + decreasing when moving down a table. There are options for allowing missing values in the + target column, allowing stationary phases (where consecutive values don't change), and even + one for allowing increasing movements up to a certain threshold. This validation will + operate over the number of test units that is equal to the number of rows in the table + (determined after any `pre=` mutation has been applied). + + Parameters + ---------- + columns + A single column or a list of columns to validate. Can also use + [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If + multiple columns are supplied or resolved, there will be a separate validation step + generated for each column. + allow_stationary + An option to allow pauses in decreasing values. For example, if the values for the test + units are `[88, 85, 85, 82, 80]` then the third unit (`85`, appearing a second time) + would be marked as failing when `allow_stationary` is `False`. Using + `allow_stationary=True` will result in all the test units in `[88, 85, 85, 82, 80]` + being marked as passing. + increasing_tol + An optional threshold value that allows for movement of numerical values in the positive + direction. By default this is `None` but using a numerical value will set the absolute + threshold of positive travel allowed across numerical test units. Note that setting a + value here also has the effect of setting `allow_stationary` to `True`. + na_pass + Should any encountered None, NA, or Null values be considered as passing test units? By + default, this is `False`.
Set to `True` to pass test units with missing values. + pre + An optional preprocessing function or lambda to apply to the data table during + interrogation. This function should take a table as input and return a modified table. + Have a look at the *Preprocessing* section for more information on how to use this + argument. + segments + An optional directive on segmentation, which serves to split a validation step into + multiple (one step per segment). Can be a single column name, a tuple that specifies a + column name and its corresponding values to segment on, or a combination of both + (provided as a list). Read the *Segmentation* section for usage information. + thresholds + Set threshold failure levels for reporting and reacting to exceedances of the levels. + The thresholds are set at the step level and will override any global thresholds set in + `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will + be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* + section for information on how to set threshold levels. + actions + Optional actions to take when the validation step(s) meets or exceeds any set threshold + levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to + define the actions. + brief + An optional brief description of the validation step that will be displayed in the + reporting table. You can use the templating elements like `"{step}"` to insert + the step number, or `"{auto}"` to include an automatically generated brief. If `True` + the entire brief will be automatically generated. If `None` (the default) then there + won't be a brief. + active + A boolean value indicating whether the validation step should be active. Using `False` + will make the validation step inactive (still reporting its presence and keeping indexes + for the steps unchanged). + + Returns + ------- + Validate + The `Validate` object with the added validation step.
+ + Examples + -------- + ```{python} + #| echo: false + #| output: false + import pointblank as pb + pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False) + ``` + + For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`, + `b`, and `c`). The table is shown below: + + ```{python} + import pointblank as pb + import polars as pl + + tbl = pl.DataFrame( + { + "a": [6, 5, 4, 3, 2, 1], + "b": [5, 4, 4, 3, 2, 1], + "c": [5, 4, 5, 3, 2, 1], + } + ) + + pb.preview(tbl) + ``` + + Let's validate that values in column `a` are decreasing. We'll determine if this validation + had any failing test units (there are six test units, one for each row). + + ```{python} + validation = ( + pb.Validate(data=tbl) + .col_vals_decreasing(columns="a") + .interrogate() + ) + + validation + ``` + + The validation passed as all values in column `a` are decreasing. Now let's check column + `b` which has a stationary value: + + ```{python} + validation = ( + pb.Validate(data=tbl) + .col_vals_decreasing(columns="b") + .interrogate() + ) + + validation + ``` + + This validation fails at the third row because the value `4` is repeated.
If we want to + allow stationary values, we can use `allow_stationary=True`: + + ```{python} + validation = ( + pb.Validate(data=tbl) + .col_vals_decreasing(columns="b", allow_stationary=True) + .interrogate() + ) + + validation + ``` + """ + assertion_type = "col_vals_decreasing" + + # Determine threshold to use (global or local) and normalize a local `thresholds=` value + thresholds = ( + self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + ) + + # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later + # resolve the columns + if isinstance(columns, (ColumnSelector, nw.selectors.Selector)): + columns = col(columns) + + # If `columns` is Column value or a string, place it in a list for iteration + if isinstance(columns, (Column, str)): + columns = [columns] + + # Determine brief to use (global or local) and transform any shorthands of `brief=` + brief = self.brief if brief is None else _transform_auto_brief(brief=brief) + + # Iterate over the columns and create a validation step for each + for column in columns: + val_info = _ValidationInfo( + assertion_type=assertion_type, + column=column, + values="", + na_pass=na_pass, + pre=pre, + segments=segments, + thresholds=thresholds, + actions=actions, + brief=brief, + active=active, + val_info={ + "allow_stationary": allow_stationary, + "increasing_tol": increasing_tol if increasing_tol else 0.0, + }, + ) + + self._add_validation(validation_info=val_info) + + return self + def col_vals_null( self, columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, @@ -11730,6 +12106,8 @@ def interrogate( "col_vals_le", "col_vals_null", "col_vals_not_null", + "col_vals_increasing", + "col_vals_decreasing", "col_vals_between", "col_vals_outside", "col_vals_in_set", @@ -11771,6 +12149,36 @@ def interrogate( elif assertion_method == "not_null": results_tbl = interrogate_not_null(tbl=tbl, column=column) + elif assertion_type == "col_vals_increasing": + 
from pointblank._interrogation import interrogate_increasing + + # Extract direction options from val_info + allow_stationary = validation.val_info.get("allow_stationary", False) + decreasing_tol = validation.val_info.get("decreasing_tol", 0.0) + + results_tbl = interrogate_increasing( + tbl=tbl, + column=column, + allow_stationary=allow_stationary, + decreasing_tol=decreasing_tol, + na_pass=na_pass, + ) + + elif assertion_type == "col_vals_decreasing": + from pointblank._interrogation import interrogate_decreasing + + # Extract direction options from val_info + allow_stationary = validation.val_info.get("allow_stationary", False) + increasing_tol = validation.val_info.get("increasing_tol", 0.0) + + results_tbl = interrogate_decreasing( + tbl=tbl, + column=column, + allow_stationary=allow_stationary, + increasing_tol=increasing_tol, + na_pass=na_pass, + ) + elif assertion_type == "col_vals_between": results_tbl = interrogate_between( tbl=tbl, @@ -14515,6 +14923,9 @@ def get_tabular_report( elif assertion_type[i] in ["col_vals_expr", "conjointly"]: values_upd.append("COLUMN EXPR") + elif assertion_type[i] in ["col_vals_increasing", "col_vals_decreasing"]: + values_upd.append("") + elif assertion_type[i] in ["row_count_match", "col_count_match"]: count = values[i]["count"] inverse = values[i]["inverse"] diff --git a/tests/test_col_vals_increasing_decreasing.py b/tests/test_col_vals_increasing_decreasing.py new file mode 100644 index 000000000..27da5123b --- /dev/null +++ b/tests/test_col_vals_increasing_decreasing.py @@ -0,0 +1,260 @@ +import datetime + +import polars as pl + +import pointblank as pb + + +def test_strictly_increasing_passes(): + """Test that strictly increasing values pass validation.""" + tbl = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + + validation = pb.Validate(data=tbl).col_vals_increasing(columns="a").interrogate() + + assert validation.n_passed(i=1, scalar=True) == 5 + assert validation.n_failed(i=1, scalar=True) == 0 + assert 
validation.f_passed(i=1, scalar=True) == 1.0 + + +def test_strictly_increasing_with_stationary_fails(): + """Test that stationary values fail when allow_stationary=False.""" + tbl = pl.DataFrame({"a": [1, 2, 2, 3, 4]}) + + validation = pb.Validate(data=tbl).col_vals_increasing(columns="a").interrogate() + + assert validation.n_passed(i=1, scalar=True) == 4 # First value and values after stationary + assert validation.n_failed(i=1, scalar=True) == 1 # The repeated value + assert validation.f_failed(i=1, scalar=True) == 0.2 + + +def test_allow_stationary_passes(): + """Test that stationary values pass when allow_stationary=True.""" + tbl = pl.DataFrame({"a": [1, 2, 2, 3, 4]}) + + validation = ( + pb.Validate(data=tbl).col_vals_increasing(columns="a", allow_stationary=True).interrogate() + ) + + assert validation.n_passed(i=1, scalar=True) == 5 + assert validation.n_failed(i=1, scalar=True) == 0 + assert validation.f_passed(i=1, scalar=True) == 1.0 + + +def test_increasing_decreasing_values_fail(): + """Test that decreasing values fail validation.""" + tbl = pl.DataFrame({"a": [1, 2, 3, 2, 4]}) + + validation = pb.Validate(data=tbl).col_vals_increasing(columns="a").interrogate() + + assert validation.n_failed(i=1, scalar=True) == 1 # The value that decreased + + +def test_increasing_decreasing_tol_allows_small_decreases(): + """Test that decreasing_tol allows small decreases.""" + tbl = pl.DataFrame({"a": [10, 12, 11, 13, 15]}) # 12 to 11 is -1 + + validation = ( + pb.Validate(data=tbl).col_vals_increasing(columns="a", decreasing_tol=1.0).interrogate() + ) + + assert validation.n_passed(i=1, scalar=True) == 5 + assert validation.n_failed(i=1, scalar=True) == 0 + + +def test_increasing_decreasing_tol_fails_large_decreases(): + """Test that decreasing_tol still fails large decreases.""" + tbl = pl.DataFrame({"a": [10, 12, 8, 13, 15]}) # 12 to 8 is -4 + + validation = ( + pb.Validate(data=tbl).col_vals_increasing(columns="a", decreasing_tol=1.0).interrogate() + ) + + 
assert validation.n_failed(i=1, scalar=True) == 1 # The large decrease + + +def test_increasing_na_pass_false_fails_na_values(): + """Test that NA values fail when na_pass=False.""" + tbl = pl.DataFrame({"a": [1, 2, None, 4, 5]}) + + validation = pb.Validate(data=tbl).col_vals_increasing(columns="a", na_pass=False).interrogate() + + assert validation.n_failed(i=1, scalar=True) == 1 # The None value + + +def test_increasing_na_pass_true_passes_na_values(): + """Test that NA values pass when na_pass=True.""" + tbl = pl.DataFrame({"a": [1, 2, None, 4, 5]}) + + validation = pb.Validate(data=tbl).col_vals_increasing(columns="a", na_pass=True).interrogate() + + assert validation.n_passed(i=1, scalar=True) == 5 + assert validation.n_failed(i=1, scalar=True) == 0 + + +def test_increasing_multiple_columns(): + """Test validation on multiple columns.""" + tbl = pl.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30], "c": [5, 5, 6]}) + + validation = pb.Validate(data=tbl).col_vals_increasing(columns=["a", "b"]).interrogate() + + # Both columns should pass + assert validation.n_passed(i=1, scalar=True) == 3 # Column a + assert validation.n_passed(i=2, scalar=True) == 3 # Column b + + +def test_strictly_decreasing_passes(): + """Test that strictly decreasing values pass validation.""" + tbl = pl.DataFrame({"a": [5, 4, 3, 2, 1]}) + + validation = pb.Validate(data=tbl).col_vals_decreasing(columns="a").interrogate() + + assert validation.n_passed(i=1, scalar=True) == 5 + assert validation.n_failed(i=1, scalar=True) == 0 + assert validation.f_passed(i=1, scalar=True) == 1.0 + + +def test_strictly_decreasing_with_stationary_fails(): + """Test that stationary values fail when allow_stationary=False.""" + tbl = pl.DataFrame({"a": [4, 3, 3, 2, 1]}) + + validation = pb.Validate(data=tbl).col_vals_decreasing(columns="a").interrogate() + + assert validation.n_passed(i=1, scalar=True) == 4 # First value and values after stationary + assert validation.n_failed(i=1, scalar=True) == 1 # The repeated 
value + + +def test_decreasing_allow_stationary_passes(): + """Test that stationary values pass when allow_stationary=True.""" + tbl = pl.DataFrame({"a": [4, 3, 3, 2, 1]}) + + validation = ( + pb.Validate(data=tbl).col_vals_decreasing(columns="a", allow_stationary=True).interrogate() + ) + + assert validation.n_passed(i=1, scalar=True) == 5 + assert validation.n_failed(i=1, scalar=True) == 0 + assert validation.f_passed(i=1, scalar=True) == 1.0 + + +def test_decreasing_increasing_values_fail(): + """Test that increasing values fail validation.""" + tbl = pl.DataFrame({"a": [4, 3, 2, 3, 1]}) + + validation = pb.Validate(data=tbl).col_vals_decreasing(columns="a").interrogate() + + assert validation.n_failed(i=1, scalar=True) == 1 # The value that increased + + +def test_decreasing_increasing_tol_allows_small_increases(): + """Test that increasing_tol allows small increases.""" + tbl = pl.DataFrame({"a": [15, 13, 14, 12, 10]}) # 13 to 14 is +1 + + validation = ( + pb.Validate(data=tbl).col_vals_decreasing(columns="a", increasing_tol=1.0).interrogate() + ) + + assert validation.n_passed(i=1, scalar=True) == 5 + assert validation.n_failed(i=1, scalar=True) == 0 + + +def test_decreasing_increasing_tol_fails_large_increases(): + """Test that increasing_tol still fails large increases.""" + tbl = pl.DataFrame({"a": [15, 13, 18, 12, 10]}) # 13 to 18 is +5 + + validation = ( + pb.Validate(data=tbl).col_vals_decreasing(columns="a", increasing_tol=1.0).interrogate() + ) + + assert validation.n_failed(i=1, scalar=True) == 1 # The large increase + + +def test_decreasing_na_pass_false_fails_na_values(): + """Test that NA values fail when na_pass=False.""" + tbl = pl.DataFrame({"a": [5, 4, None, 2, 1]}) + + validation = pb.Validate(data=tbl).col_vals_decreasing(columns="a", na_pass=False).interrogate() + + assert validation.n_failed(i=1, scalar=True) == 1 # The None value + + +def test_decreasing_na_pass_true_passes_na_values(): + """Test that NA values pass when na_pass=True.""" 
+ tbl = pl.DataFrame({"a": [5, 4, None, 2, 1]}) + + validation = pb.Validate(data=tbl).col_vals_decreasing(columns="a", na_pass=True).interrogate() + + assert validation.n_passed(i=1, scalar=True) == 5 + assert validation.n_failed(i=1, scalar=True) == 0 + + +def test_decreasing_multiple_columns(): + """Test validation on multiple columns.""" + tbl = pl.DataFrame({"a": [3, 2, 1], "b": [30, 20, 10], "c": [6, 5, 5]}) + + validation = pb.Validate(data=tbl).col_vals_decreasing(columns=["a", "b"]).interrogate() + + # Both columns should pass + assert validation.n_passed(i=1, scalar=True) == 3 # Column a + assert validation.n_passed(i=2, scalar=True) == 3 # Column b + + +# Edge case tests + + +def test_single_value_always_passes(): + """Test that a single value always passes (no previous value to compare).""" + tbl = pl.DataFrame({"a": [5]}) + + validation_inc = pb.Validate(data=tbl).col_vals_increasing(columns="a").interrogate() + validation_dec = pb.Validate(data=tbl).col_vals_decreasing(columns="a").interrogate() + + assert validation_inc.n_passed(i=1, scalar=True) == 1 + assert validation_dec.n_passed(i=1, scalar=True) == 1 + + +def test_two_values_increasing(): + """Test with exactly two values for increasing.""" + tbl = pl.DataFrame({"a": [1, 2]}) + + validation = pb.Validate(data=tbl).col_vals_increasing(columns="a").interrogate() + + assert validation.n_passed(i=1, scalar=True) == 2 + assert validation.n_failed(i=1, scalar=True) == 0 + + +def test_two_values_decreasing(): + """Test with exactly two values for decreasing.""" + tbl = pl.DataFrame({"a": [2, 1]}) + + validation = pb.Validate(data=tbl).col_vals_decreasing(columns="a").interrogate() + + assert validation.n_passed(i=1, scalar=True) == 2 + assert validation.n_failed(i=1, scalar=True) == 0 + + +def test_datetime_values_increasing(): + """Test with datetime values for increasing.""" + tbl = pl.DataFrame( + { + "date": [ + datetime.datetime(2023, 1, 1), + datetime.datetime(2023, 1, 2), + 
datetime.datetime(2023, 1, 3), + ] + } + ) + + validation = pb.Validate(data=tbl).col_vals_increasing(columns="date").interrogate() + + assert validation.n_passed(i=1, scalar=True) == 3 + assert validation.n_failed(i=1, scalar=True) == 0 + + +def test_float_values(): + """Test with float values.""" + tbl = pl.DataFrame({"a": [1.1, 2.2, 3.3, 4.4]}) + + validation = pb.Validate(data=tbl).col_vals_increasing(columns="a").interrogate() + + assert validation.n_passed(i=1, scalar=True) == 4 + assert validation.n_failed(i=1, scalar=True) == 0