Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 60 additions & 11 deletions pointblank/compare.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,76 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, NamedTuple

import narwhals as nw

from pointblank import DataScan

if TYPE_CHECKING:
from narwhals.typing import IntoFrame
from typing import Any

from narwhals.typing import IntoFrameT


class Compare:
    """Compare two data frames through their `DataScan` profiles.

    Call `compare()` to scan both frames, then read `meta_summary` for the
    table-level differences: row and column counts, column membership, and
    the shared columns whose types differ.
    """

    def __init__(self, a: IntoFrameT, b: IntoFrameT, backend: Any = None) -> None:
        """Store the two frames to compare.

        Args:
            a: First frame.
            b: Second frame.
            backend: Accepted by the signature but not used by this class yet.
        """
        self.a: IntoFrameT = a
        self.b: IntoFrameT = b

    def compare(self) -> None:
        """Scan both frames and keep the scans and their summary tables."""
        self._scana = DataScan(self.a)
        self._asummary: nw.DataFrame = nw.from_native(self._scana.summary_data)
        self._scanb = DataScan(self.b)
        self._bsummary: nw.DataFrame = nw.from_native(self._scanb.summary_data)

    @property
    def meta_summary(self) -> MetaSummary:
        """Return the table-level summary of how `a` and `b` differ.

        Returns:
            A `MetaSummary` with table names (falling back to "a" / "b"),
            row counts, column counts, the columns unique to each side, the
            shared columns, and the shared columns whose `coltype` differs
            (sorted by column name).

        Raises:
            AttributeError: If `compare()` has not been called yet.
        """
        # Same exception type as touching the missing attributes directly,
        # but with a message that says how to fix it.
        if not (hasattr(self, "_scana") and hasattr(self, "_scanb")):
            raise AttributeError(
                "No scan results available: call `compare()` before "
                "accessing `meta_summary`."
            )

        profile_a = self._scana.profile
        profile_b = self._scanb.profile

        cols_a: set[str] = set(profile_a.columns)
        cols_b: set[str] = set(profile_b.columns)
        shared: set[str] = cols_a & cols_b

        # Iterate in sorted order so the result does not depend on set ordering.
        conflicting: list[str] = [
            col
            for col in sorted(shared)
            if profile_a[col].coltype != profile_b[col].coltype
        ]

        return MetaSummary(
            name=[profile_a.table_name or "a", profile_b.table_name or "b"],
            n_observations=(profile_a.row_count, profile_b.row_count),
            n_variables=(len(profile_a.columns), len(profile_b.columns)),
            in_a_only=cols_a - cols_b,
            in_b_only=cols_b - cols_a,
            in_both=shared,
            conflicting_types=conflicting,
        )
class MetaSummary(NamedTuple):
    """Table-level comparison of two frames, referred to as `a` and `b`."""

    # Table names, as a two-item list in (a, b) order.
    name: list[str]
    # Row counts, in (a, b) order.
    n_observations: tuple[int, int]
    # Column counts, in (a, b) order.
    n_variables: tuple[int, int]
    # Column names present only in `a`.
    in_a_only: set[str]
    # Column names present only in `b`.
    in_b_only: set[str]
    # Column names present in both frames.
    in_both: set[str]
    # Shared column names whose types differ between the frames.
    conflicting_types: list[str]
7 changes: 7 additions & 0 deletions pointblank/scan_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,13 @@ def __init__(
self.implementation = implementation
self.column_profiles: list[ColumnProfile] = []

def __getitem__(self, colname: str) -> ColumnProfile:
    """Look up the column profile whose `colname` equals `colname`.

    Scans `self.column_profiles` in order and returns the first match.

    Raises:
        KeyError: If no stored profile has that column name.
    """
    found = next(
        (candidate for candidate in self.column_profiles if candidate.colname == colname),
        None,
    )
    if found is None:
        raise KeyError(f"Column profile for '{colname}' not found.")
    return found

def set_row_count(self, data: Frame) -> None:
assert self.columns # internal: cols should already be set

Expand Down
36 changes: 29 additions & 7 deletions tests/test_compare.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,37 @@
from __future__ import annotations
import pytest
import polars as pl

from pointblank.compare import Compare
import polars.testing.parametric as pt
from hypothesis import given
from pointblank.compare import Compare, MetaSummary


# NOTE(review): this span appears to interleave the removed and added sides of
# a diff (two `def test_compare_basic` lines, and a `raise NotImplementedError`
# ahead of the assertions) — confirm which lines belong to the final test, and
# whether the `xfail` marker is still intended.
@pytest.mark.xfail
def test_compare_basic(dfa, dfb) -> None:
comp = Compare(dfa, dfb)
def test_compare_basic() -> None:
# First frame: columns "a" and "b", both holding integers.
df1 = {
"a": [1, 2, 3],
"b": [4, 5, 6],
}
# Second frame: same "a", "b" holds strings instead, plus an extra column "c".
df2 = {
"a": [1, 2, 3],
"b": ["4", "5", "7"],
"c": [8, 9, 10],
}
data1 = pl.DataFrame(df1)
data2 = pl.DataFrame(df2)
comp = Compare(data1, data2)

comp.compare()

raise NotImplementedError
## Pull out the summary data
summary: MetaSummary = comp.meta_summary

# Expected: names "a"/"b", 3 rows each, 2 vs 3 columns, "c" only in the
# second frame, and "b" reported as the one column with conflicting types.
assert summary.name == ["a", "b"]
assert summary.n_observations == (3, 3)
assert summary.n_variables == (2, 3)
assert summary.in_a_only == set()
assert summary.in_b_only == {"c"}
assert summary.in_both == {"a", "b"}
assert summary.conflicting_types == ["b"]


# When this module is executed directly, hand this file to pytest with the
# verbose flag.
if __name__ == "__main__":
pytest.main([__file__, "-v"])
Loading