diff --git a/.gitignore b/.gitignore index 7b9ba3edb..affec794f 100644 --- a/.gitignore +++ b/.gitignore @@ -133,3 +133,6 @@ benchmark/data/*.json .swp uv.lock + +# While typing is experimental, don't mark the entire package as typed +pointblank/py.typed diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b1e9083fc..b18ce60e0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -25,11 +25,12 @@ Once there is consensus that a PR based on the issue would be helpful, adhering ### Setting Up Your Development Environment -To set up your development environment, you can follow these steps: +To set up your development environment, first clone the posit-dev/pointblank repository. -- Clone the posit-dev/pointblank repository -- Create a virtual environment for the folder -- Install the package in editable mode with `pip install -e .` from the root of the project folder +If you're using `uv`, you can run `uv sync` and your environment is set up! If you're using pip or another package manager, continue with these steps: + +- Create a virtual environment for the folder. +- Install the package in editable mode with `pip install -e .` from the root of the project folder. - Install the development dependencies with `pip install '.[dev]'` (have a look at the `pyproject.toml` file for the list of development dependencies) Our documentation uses `quartodoc` which in turn requires a local install of the Quarto CLI. To install Quarto, go to to get the latest build for your platform. @@ -43,3 +44,7 @@ Building the documentation can be done with `make docs-build` from the root of t The tests are located in the `tests` folder and we use `pytest` for running them. To run all of the tests, use `make test`. If you want to run a specific test file, you can use `pytest tests/test_file.py`. If you create new tests involving snapshots, please ensure that the resulting snapshots are relatively small. After adding snapshots, use `make test-update` (this runs `pytest --snapshot-update`). 
A subsequent use of `make test` should pass without any issues. + +### Linting and Type Checking + +We use `ruff` for linting; the settings used are fairly loose and objective. Linting is run by pre-commit in CI. You can run it locally with `make lint`. Type checking is currently not enforced, but we intend to gradually type the codebase. You can run `make type` to run Astral's new experimental type checker, `ty`. Feel free to use type hints and to run the type checker occasionally, but neither is obligatory at this time. diff --git a/Makefile b/Makefile index 20c27d7fd..2007aa201 100644 --- a/Makefile +++ b/Makefile @@ -18,6 +18,11 @@ lint: ## Run ruff formatter and linter @uv run ruff format @uv run ruff check --fix + +type: ## Run experimental(!) type checking + @uvx ty check pointblank + + check: pyright --pythonversion 3.8 pointblank pyright --pythonversion 3.9 pointblank diff --git a/pointblank/_interrogation.py b/pointblank/_interrogation.py index 637939486..be2e3c900 100644 --- a/pointblank/_interrogation.py +++ b/pointblank/_interrogation.py @@ -1911,7 +1911,7 @@ class ColSchemaMatch: """ data_tbl: FrameT | Any - schema: any + schema: Any complete: bool in_order: bool case_sensitive_colnames: bool @@ -2425,7 +2425,7 @@ def _check_nulls_across_columns_nw(table, columns_subset): return result -def _modify_datetime_compare_val(tgt_column: any, compare_val: any) -> any: +def _modify_datetime_compare_val(tgt_column: Any, compare_val: Any) -> Any: tgt_col_dtype_str = str(tgt_column.dtype).lower() if compare_val is isinstance(compare_val, Column): # pragma: no cover diff --git a/pointblank/_typing.py b/pointblank/_typing.py index c8cbd56c2..18ba19c7e 100644 --- a/pointblank/_typing.py +++ b/pointblank/_typing.py @@ -1,6 +1,8 @@ from __future__ import annotations +import datetime import sys +from collections.abc import Container from typing import List, Tuple, Union # Check Python version for TypeAlias support @@ -15,6 +17,12 @@ SegmentTuple: TypeAlias = Tuple[str, 
SegmentValue] SegmentItem: TypeAlias = Union[str, SegmentTuple] SegmentSpec: TypeAlias = Union[str, SegmentTuple, List[SegmentItem]] + + _CompliantValue: TypeAlias = Union[str, int, float, datetime.datetime, datetime.date] + """A compliant value that pointblank can use in a validation step""" + _CompliantValues: TypeAlias = Container[_CompliantValue] + """A collection of compliant values that pointblank can use in a validation step""" + else: # Python 3.8 and 3.9 compatible type aliases AbsoluteBounds = Tuple[int, int] @@ -24,6 +32,10 @@ SegmentTuple = Tuple[str, SegmentValue] SegmentItem = Union[str, SegmentTuple] SegmentSpec = Union[str, SegmentTuple, List[SegmentItem]] + _CompliantValue = Union[str, int, float, datetime.datetime, datetime.date] + """A compliant value that pointblank can use in a validation step""" + _CompliantValues = Container[_CompliantValue] + """A collection of compliant values that pointblank can use in a validation step""" # Add docstrings for better IDE support AbsoluteBounds.__doc__ = "Absolute bounds (i.e., plus or minus)" diff --git a/pointblank/schema.py b/pointblank/schema.py index 5db200732..3911f27e2 100644 --- a/pointblank/schema.py +++ b/pointblank/schema.py @@ -2,10 +2,14 @@ import copy from dataclasses import dataclass +from typing import TYPE_CHECKING from pointblank._constants import IBIS_BACKENDS from pointblank._utils import _get_tbl_type, _is_lazy_frame, _is_lib_present, _is_narwhals_table +if TYPE_CHECKING: + from typing import Any + __all__ = ["Schema"] @@ -265,14 +269,14 @@ class Schema: columns: str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None = ( None ) - tbl: any | None = None + tbl: Any | None = None def __init__( self, columns: ( str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None ) = None, - tbl: any | None = None, + tbl: Any | None = None, **kwargs, ): if tbl is None and columns is None and not kwargs: @@ -872,7 +876,7 @@ def 
_schema_info_generate_params_dict( def _get_schema_validation_info( - data_tbl: any, + data_tbl: Any, schema: Schema, passed: bool, complete: bool, diff --git a/pointblank/validate.py b/pointblank/validate.py index f4b930681..3dbe94476 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -97,8 +97,9 @@ if TYPE_CHECKING: from collections.abc import Collection + from typing import Any - from pointblank._typing import AbsoluteBounds, Tolerance + from pointblank._typing import AbsoluteBounds, Tolerance, _CompliantValue, _CompliantValues __all__ = [ "Validate", @@ -2417,12 +2418,12 @@ class _ValidationInfo: step_id: str | None = None sha1: str | None = None assertion_type: str | None = None - column: any | None = None - values: any | list[any] | tuple | None = None + column: Any | None = None + values: Any | list[any] | tuple | None = None inclusive: tuple[bool, bool] | None = None na_pass: bool | None = None pre: Callable | None = None - segments: any | None = None + segments: Any | None = None thresholds: Thresholds | None = None actions: Actions | None = None label: str | None = None @@ -6937,7 +6938,7 @@ def col_vals_regex( def col_vals_expr( self, - expr: any, + expr: Any, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds = None, @@ -12992,7 +12993,7 @@ def _convert_string_to_datetime(value: str) -> datetime.datetime: return datetime.datetime.strptime(value, "%Y-%m-%d %H:%M:%S") -def _string_date_dttm_conversion(value: any) -> any: +def _string_date_dttm_conversion(value: Any) -> Any: """ Convert a string to a date or datetime object if it is in the correct format. If the value is not a string, it is returned as is. 
@@ -13030,9 +13031,9 @@ def _process_brief( brief: str | None, step: int, col: str | list[str] | None, - values: any | None, - thresholds: any | None, - segment: any | None, + values: Any | None, + thresholds: Any | None, + segment: Any | None, ) -> str: # If there is no brief, return `None` if brief is None: @@ -13098,7 +13099,7 @@ def _process_action_str( action_str: str, step: int, col: str | None, - value: any, + value: Any, type: str, level: str, time: str, @@ -13148,7 +13149,7 @@ def _process_action_str( def _create_autobrief_or_failure_text( - assertion_type: str, lang: str, column: str | None, values: str | None, for_failure: bool + assertion_type: str, lang: str, column: str, values: str | None, for_failure: bool ) -> str: if assertion_type in [ "col_vals_gt", @@ -13278,7 +13279,7 @@ def _create_autobrief_or_failure_text( if assertion_type == "specially": return _create_text_specially(lang=lang, for_failure=for_failure) - return None # pragma: no cover + raise NotImplementedError # pragma: no cover def _expect_failure_type(for_failure: bool) -> str: @@ -13288,7 +13289,7 @@ def _expect_failure_type(for_failure: bool) -> str: def _create_text_comparison( assertion_type: str, lang: str, - column: str | list[str] | None, + column: str | list[str], values: str | None, for_failure: bool = False, ) -> str: @@ -13314,7 +13315,7 @@ def _create_text_comparison( def _create_text_between( lang: str, - column: str | None, + column: str, value_1: str, value_2: str, not_: bool = False, @@ -13344,7 +13345,7 @@ def _create_text_between( def _create_text_set( - lang: str, column: str | None, values: list[any], not_: bool = False, for_failure: bool = False + lang: str, column: str, values: list[Any], not_: bool = False, for_failure: bool = False ) -> str: type_ = _expect_failure_type(for_failure=for_failure) @@ -13366,9 +13367,7 @@ def _create_text_set( return text -def _create_text_null( - lang: str, column: str | None, not_: bool = False, for_failure: bool = False -) -> 
str: +def _create_text_null(lang: str, column: str, not_: bool = False, for_failure: bool = False) -> str: type_ = _expect_failure_type(for_failure=for_failure) column_text = _prep_column_text(column=column) @@ -13385,9 +13384,7 @@ def _create_text_null( return text -def _create_text_regex( - lang: str, column: str | None, pattern: str, for_failure: bool = False -) -> str: +def _create_text_regex(lang: str, column: str, pattern: str, for_failure: bool = False) -> str: type_ = _expect_failure_type(for_failure=for_failure) column_text = _prep_column_text(column=column) @@ -13404,7 +13401,7 @@ def _create_text_expr(lang: str, for_failure: bool) -> str: return EXPECT_FAIL_TEXT[f"col_vals_expr_{type_}_text"][lang] -def _create_text_col_exists(lang: str, column: str | None, for_failure: bool = False) -> str: +def _create_text_col_exists(lang: str, column: str, for_failure: bool = False) -> str: type_ = _expect_failure_type(for_failure=for_failure) column_text = _prep_column_text(column=column) @@ -13454,7 +13451,7 @@ def _create_text_rows_complete( return text -def _create_text_row_count_match(lang: str, value: int, for_failure: bool = False) -> str: +def _create_text_row_count_match(lang: str, value: dict, for_failure: bool = False) -> str: type_ = _expect_failure_type(for_failure=for_failure) values_text = _prep_values_text(value["count"], lang=lang) @@ -13462,7 +13459,7 @@ def _create_text_row_count_match(lang: str, value: int, for_failure: bool = Fals return EXPECT_FAIL_TEXT[f"row_count_match_n_{type_}_text"][lang].format(values_text=values_text) -def _create_text_col_count_match(lang: str, value: int, for_failure: bool = False) -> str: +def _create_text_col_count_match(lang: str, value: dict, for_failure: bool = False) -> str: type_ = _expect_failure_type(for_failure=for_failure) values_text = _prep_values_text(value["count"], lang=lang) @@ -13485,19 +13482,13 @@ def _create_text_specially(lang: str, for_failure: bool = False) -> str: def _prep_column_text(column: 
str | list[str]) -> str: if isinstance(column, list): return "`" + str(column[0]) + "`" - elif isinstance(column, str): + if isinstance(column, str): return "`" + column + "`" - else: - return "" + raise AssertionError def _prep_values_text( - values: str - | int - | float - | datetime.datetime - | datetime.date - | list[str | int | float | datetime.datetime | datetime.date], + values: _CompliantValue | _CompliantValues, lang: str, limit: int = 3, ) -> str: @@ -13545,7 +13536,7 @@ def _prep_values_text( return values_str -def _seg_expr_from_string(data_tbl: any, segments_expr: str) -> tuple[str, str]: +def _seg_expr_from_string(data_tbl: Any, segments_expr: str) -> tuple[str, str]: """ Obtain the segmentation categories from a table column. @@ -13637,7 +13628,7 @@ def _seg_expr_from_tuple(segments_expr: tuple) -> list[tuple[str, str]]: return seg_tuples -def _apply_segments(data_tbl: any, segments_expr: tuple[str, str]) -> any: +def _apply_segments(data_tbl: Any, segments_expr: tuple[str, str]) -> Any: """ Apply the segments expression to the data table. 
@@ -13753,11 +13744,9 @@ def _get_assertion_icon(icon: list[str], length_val: int = 30) -> list[str]: return icon_svg -def _replace_svg_dimensions(svg: list[str], height_width: int | float) -> list[str]: +def _replace_svg_dimensions(svg: str, height_width: int | float) -> str: svg = re.sub(r'width="[0-9]*?px', f'width="{height_width}px', svg) - svg = re.sub(r'height="[0-9]*?px', f'height="{height_width}px', svg) - - return svg + return re.sub(r'height="[0-9]*?px', f'height="{height_width}px', svg) def _get_title_text( @@ -13821,7 +13810,7 @@ def _process_title_text(title: str | None, tbl_name: str | None, lang: str) -> s return title_text -def _transform_tbl_preprocessed(pre: any, seg: any, interrogation_performed: bool) -> list[str]: +def _transform_tbl_preprocessed(pre: Any, seg: Any, interrogation_performed: bool) -> list[str]: # If no interrogation was performed, return a list of empty strings if not interrogation_performed: return ["" for _ in range(len(pre))] @@ -14135,22 +14124,21 @@ def _transform_assertion_str( return type_upd -def _pre_processing_funcs_to_str(pre: Callable) -> str | list[str]: +def _pre_processing_funcs_to_str(pre: Callable) -> str | list[str] | None: if isinstance(pre, Callable): return _get_callable_source(fn=pre) + return None def _get_callable_source(fn: Callable) -> str: - if isinstance(fn, Callable): - try: - source_lines, _ = inspect.getsourcelines(fn) - source = "".join(source_lines).strip() - # Extract the `pre` argument from the source code - pre_arg = _extract_pre_argument(source) - return pre_arg - except (OSError, TypeError): # pragma: no cover - return fn.__name__ - return fn + try: + source_lines, _ = inspect.getsourcelines(fn) + source = "".join(source_lines).strip() + # Extract the `pre` argument from the source code + pre_arg = _extract_pre_argument(source) + return pre_arg + except (OSError, TypeError): # pragma: no cover + return fn.__name__ def _extract_pre_argument(source: str) -> str: @@ -14176,6 +14164,7 @@ def 
_create_table_time_html( if time_start is None: return "" + assert time_end is not None # typing # Get the time duration (difference between `time_end` and `time_start`) in seconds time_duration = (time_end - time_start).total_seconds() @@ -14393,12 +14382,12 @@ def _step_report_row_based( column: str, column_position: int, columns_subset: list[str] | None, - values: any, + values: Any, inclusive: tuple[bool, bool] | None, n: int, n_failed: int, all_passed: bool, - extract: any, + extract: Any, tbl_preview: GT, header: str, limit: int | None, @@ -14425,10 +14414,12 @@ def _step_report_row_based( elif assertion_type == "col_vals_le": text = f"{column} ≤ {values}" elif assertion_type == "col_vals_between": + assert inclusive is not None symbol_left = "≤" if inclusive[0] else "<" symbol_right = "≤" if inclusive[1] else "<" text = f"{values[0]} {symbol_left} {column} {symbol_right} {values[1]}" elif assertion_type == "col_vals_outside": + assert inclusive is not None symbol_left = "<" if inclusive[0] else "≤" symbol_right = ">" if inclusive[1] else "≥" text = f"{column} {symbol_left} {values[0]}, {column} {symbol_right} {values[1]}" @@ -14633,7 +14624,7 @@ def _step_report_rows_distinct( n: int, n_failed: int, all_passed: bool, - extract: any, + extract: Any, tbl_preview: GT, header: str, limit: int | None, @@ -14761,7 +14752,7 @@ def _step_report_rows_distinct( def _step_report_schema_in_order( step: int, schema_info: dict, header: str, lang: str, debug_return_df: bool = False -) -> GT | any: +) -> GT | Any: """ This is the case for schema validation where the schema is supposed to have the same column order as the target table. @@ -15100,7 +15091,7 @@ def _step_report_schema_in_order( def _step_report_schema_any_order( step: int, schema_info: dict, header: str, lang: str, debug_return_df: bool = False -) -> GT | any: +) -> GT | Any: """ This is the case for schema validation where the schema is permitted to not have to be in the same column order as the target table. 
diff --git a/pyproject.toml b/pyproject.toml index 7ea91b0e8..ceead60f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,7 +92,6 @@ dev = [ "pytest-rerunfailures>=15.0", "pytest-snapshot", "pytest-xdist>=3.6.1", - "pytest-xdist>=3.6.1", "quartodoc>=0.8.1; python_version >= '3.9'", "ruff>=0.9.9", "shiny>=1.4.0", diff --git a/tests/test_validate.py b/tests/test_validate.py index f5e323317..74b7bcbc6 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -12650,7 +12650,8 @@ def test_above_threshold_no_interrogation(): def test_prep_column_text(): assert _prep_column_text(column="column") == "`column`" assert _prep_column_text(column=["column_a", "column_b"]) == "`column_a`" - assert _prep_column_text(column=3) == "" + with pytest.raises(AssertionError): + _prep_column_text(column=3) def test_validate_csv_string_path_input():