From 340396886d547486840e29fb1c9669c7c0944e56 Mon Sep 17 00:00:00 2001
From: Tyler Riccio
Date: Sat, 31 May 2025 11:49:18 -0400
Subject: [PATCH 1/3] build out compare testing

---
 tests/test_compare.py | 36 +++++++++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 7 deletions(-)

diff --git a/tests/test_compare.py b/tests/test_compare.py
index d8bf0ca07..8e082f392 100644
--- a/tests/test_compare.py
+++ b/tests/test_compare.py
@@ -1,15 +1,37 @@
 from __future__ import annotations
 
 import pytest
+import polars as pl
 
-from pointblank.compare import Compare
-import polars.testing.parametric as pt
-from hypothesis import given
+from pointblank.compare import Compare, MetaSummary
 
 
-@pytest.mark.xfail
-def test_compare_basic(dfa, dfb) -> None:
-    comp = Compare(dfa, dfb)
+def test_compare_basic() -> None:
+    df1 = {
+        "a": [1, 2, 3],
+        "b": [4, 5, 6],
+    }
+    df2 = {
+        "a": [1, 2, 3],
+        "b": ["4", "5", "7"],
+        "c": [8, 9, 10],
+    }
+    data1 = pl.DataFrame(df1)
+    data2 = pl.DataFrame(df2)
+    comp = Compare(data1, data2)
 
     comp.compare()
-    raise NotImplementedError
+    ## Pull out the summary data
+    summary: MetaSummary = comp.meta_summary
+
+    assert summary.name == ["a", "b"]
+    assert summary.n_observations == (3, 3)
+    assert summary.n_variables == (2, 3)
+    assert summary.in_a_only == set()
+    assert summary.in_b_only == {"c"}
+    assert summary.in_both == {"a", "b"}
+    assert summary.conflicting_types == ["b"]
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])

From 4bf99cbacb8d6ecb5fab1a5a30983bbf8eacdc99 Mon Sep 17 00:00:00 2001
From: Tyler Riccio
Date: Sat, 31 May 2025 11:49:38 -0400
Subject: [PATCH 2/3] __getitem__ to data profile

---
 pointblank/scan_profile.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pointblank/scan_profile.py b/pointblank/scan_profile.py
index efc4f1d3f..55ca0e8f7 100644
--- a/pointblank/scan_profile.py
+++ b/pointblank/scan_profile.py
@@ -257,6 +257,13 @@ def __init__(
         self.implementation = implementation
         self.column_profiles: list[ColumnProfile] = []
 
+    def __getitem__(self, colname: str) -> ColumnProfile:
+        """Get a column profile by its name."""
+        for prof in self.column_profiles:
+            if prof.colname == colname:
+                return prof
+        raise KeyError(f"Column profile for '{colname}' not found.")
+
     def set_row_count(self, data: Frame) -> None:
         assert self.columns  # internal: cols should already be set
 

From 534e8c3105102c263e361f2323453188ca2c2cad Mon Sep 17 00:00:00 2001
From: Tyler Riccio
Date: Sat, 31 May 2025 11:49:54 -0400
Subject: [PATCH 3/3] add meta summary calculation

---
Note: the conflicting-types loop iterates `sorted(bothcols)` rather than the
raw set so that `MetaSummary.conflicting_types` has a stable order across
runs. The post-image blob hash on the `index` line predates this one-line
change.

 pointblank/compare.py | 71 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 60 insertions(+), 11 deletions(-)

diff --git a/pointblank/compare.py b/pointblank/compare.py
index 04dd6ca95..f61068741 100644
--- a/pointblank/compare.py
+++ b/pointblank/compare.py
@@ -1,27 +1,76 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, NamedTuple
+
+import narwhals as nw
 
 from pointblank import DataScan
 
 if TYPE_CHECKING:
-    from narwhals.typing import IntoFrame
+    from typing import Any
+
+    from narwhals.typing import IntoFrameT
 
 
 class Compare:
-    def __init__(self, a: IntoFrame, b: IntoFrame) -> None:
-        self.a: IntoFrame = a
-        self.b: IntoFrame = b
+    def __init__(self, a: IntoFrameT, b: IntoFrameT, backend: Any = None) -> None:
+        self.a: IntoFrameT = a
+        self.b: IntoFrameT = b
 
     def compare(self) -> None:
-        ## Scan both frames
         self._scana = DataScan(self.a)
+        self._asummary: nw.DataFrame = nw.from_native(self._scana.summary_data)
         self._scanb = DataScan(self.b)
+        self._bsummary: nw.DataFrame = nw.from_native(self._scanb.summary_data)
+
+    @property
+    def meta_summary(self) -> MetaSummary:
+        """Return metadata summary."""
+        # TODO: elegant error if compare is not called first
+
+        ## Number of rows:
+        arows: int = self._scana.profile.row_count
+        brows: int = self._scanb.profile.row_count
+
+        ## Number of variables:
+        avars: int = len(self._scana.profile.columns)
+        bvars: int = len(self._scanb.profile.columns)
+
+        ## Cols only in `a`:
+        acols: set[str] = set(self._scana.profile.columns)
+        bcols: set[str] = set(self._scanb.profile.columns)
+        aonly: set[str] = acols - bcols
+        bonly: set[str] = bcols - acols
+        bothcols: set[str] = acols & bcols
+
+        ## Conflicting types (sorted so the result order is deterministic):
+        conflicting: list[str] = []
+        for col in sorted(bothcols):
+            atype = self._scana.profile[col].coltype
+            btype = self._scanb.profile[col].coltype
+            if atype != btype:
+                conflicting.append(col)
+
+        ## Create the Summary Frame:
+        aname: str = self._scana.profile.table_name or "a"
+        bname: str = self._scanb.profile.table_name or "b"
 
-        ## Get summary outs
-        summarya = self._scana.summary_data
-        summaryb = self._scana.summary_data
+        return MetaSummary(
+            name=[aname, bname],
+            n_observations=(arows, brows),
+            n_variables=(avars, bvars),
+            in_a_only=aonly,
+            in_b_only=bonly,
+            in_both=bothcols,
+            conflicting_types=conflicting,
+        )
 
-        summarya.columns
-        self._scana.profile
 
+class MetaSummary(NamedTuple):
+    name: list[str]
+    n_observations: tuple[int, int]
+    n_variables: tuple[int, int]
+    in_a_only: set[str]
+    in_b_only: set[str]
+    in_both: set[str]
+    conflicting_types: list[str]