Skip to content

Commit cf58e52

Browse files
Feat: Add new comparators for Feather, Parquet and Stata files (#56)
* Feat: Add new comparators for Feather, Parquet and Stata files
* Fix: Fix coverage
1 parent 76d8569 commit cf58e52

File tree

5 files changed

+264
-27
lines changed

5 files changed

+264
-27
lines changed

.coveragerc_py38

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
[paths]
2+
source_paths =
3+
dir_content_diff
4+
*/site-packages/dir_content_diff
5+
*/dir_content_diff/dir_content_diff
6+
*/dir-content-diff/dir_content_diff
7+
8+
[report]
9+
exclude_also =
10+
return pd\.read_feather\(path, \*\*kwargs\)
11+
data\.to_feather\(path, \*\*kwargs\)

dir_content_diff/pandas.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,42 @@ def save(self, data, path, **kwargs):
147147
data.to_hdf(path, index=index, key=key, **kwargs)
148148

149149

150+
class FeatherComparator(DataframeComparator):
    """Comparator for Feather files.

    Loading and saving are delegated to :func:`pandas.read_feather` and
    :meth:`pandas.DataFrame.to_feather`; any keyword argument is forwarded as is.
    """

    def load(self, path, **kwargs):
        """Load a Feather file into a :class:`pandas.DataFrame` object."""
        # NOTE: `.coveragerc_py38` excludes this statement from coverage with a regex that
        # matches it literally, so keep it written exactly like this.
        return pd.read_feather(path, **kwargs)

    def save(self, data, path, **kwargs):
        """Save data to a Feather file."""
        # NOTE: also matched literally by an `exclude_also` pattern of `.coveragerc_py38`.
        data.to_feather(path, **kwargs)
160+
161+
162+
class ParquetComparator(DataframeComparator):
    """Comparator dedicated to Parquet files."""

    def load(self, path, **kwargs):
        """Read the Parquet file at ``path`` and return it as a :class:`pandas.DataFrame`."""
        dataframe = pd.read_parquet(path, **kwargs)
        return dataframe

    def save(self, data, path, **kwargs):
        """Write ``data`` to ``path`` as a Parquet file, forwarding ``kwargs`` to pandas."""
        data.to_parquet(path, **kwargs)
172+
173+
174+
class StataComparator(DataframeComparator):
    """Comparator dedicated to Stata (``.dta``) files."""

    def load(self, path, **kwargs):
        """Read the Stata file at ``path`` and return it as a :class:`pandas.DataFrame`."""
        dataframe = pd.read_stata(path, **kwargs)
        return dataframe

    def save(self, data, path, **kwargs):
        """Write ``data`` to ``path`` as a Stata file, forwarding ``kwargs`` to pandas."""
        data.to_stata(path, **kwargs)
184+
185+
150186
def register():
151187
"""Register Pandas extensions."""
152188
register_comparator(".csv", CsvComparator())
@@ -156,3 +192,6 @@ def register():
156192
register_comparator(".hdf", HdfComparator())
157193
register_comparator(".hdf4", HdfComparator())
158194
register_comparator(".hdf5", HdfComparator())
195+
register_comparator(".feather", FeatherComparator())
196+
register_comparator(".parquet", ParquetComparator())
197+
register_comparator(".dta", StataComparator())

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,12 @@
2020

2121
pandas_reqs = [
2222
"pandas>=1.4",
23+
"pyarrow>=11",
2324
"tables>=3.7",
2425
]
2526

2627
test_reqs = [
27-
"coverage>=6",
28+
"coverage>=7.2",
2829
"dicttoxml>=1.7.16",
2930
"matplotlib>=3",
3031
"rst2pdf>=0.99",

tests/test_pandas.py

Lines changed: 211 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# pylint: disable=unused-argument
55
# pylint: disable=use-implicit-booleaness-not-comparison
66
import re
7+
import sys
78

89
import pandas as pd
910
import pytest
@@ -49,6 +50,9 @@ def test_pandas_register(self, registry_reseter):
4950
".hdf": dir_content_diff.pandas.HdfComparator(),
5051
".hdf4": dir_content_diff.pandas.HdfComparator(),
5152
".hdf5": dir_content_diff.pandas.HdfComparator(),
53+
".feather": dir_content_diff.pandas.FeatherComparator(),
54+
".parquet": dir_content_diff.pandas.ParquetComparator(),
55+
".dta": dir_content_diff.pandas.StataComparator(),
5256
}
5357

5458

@@ -59,19 +63,71 @@ def pandas_registry_reseter(registry_reseter):
5963

6064

6165
@pytest.fixture
def ref_data(empty_ref_tree):
    """The reference DataFrame shared by the file-format fixtures."""
    columns = {
        "col_a": [1, 2, 3],
        "col_b": ["a", "b", "c"],
        "col_c": [4, 5, 6],
    }
    row_labels = ["idx1", "idx2", "idx3"]
    return pd.DataFrame(columns, index=row_labels)
74+
75+
76+
@pytest.fixture
def ref_hdf5(ref_data, empty_ref_tree):
    """Write the reference data to ``file.h5`` in the reference tree and return its path."""
    hdf5_path = empty_ref_tree / "file.h5"
    ref_data.to_hdf(hdf5_path, key="data", index=True)
    return hdf5_path
82+
83+
84+
@pytest.fixture
def ref_feather(ref_data, empty_ref_tree):
    """Write the reference data to ``file.feather`` in the reference tree and return its path."""
    feather_path = empty_ref_tree / "file.feather"
    ref_data.to_feather(feather_path)
    return feather_path
7390

7491

92+
@pytest.fixture
def ref_parquet(ref_data, empty_ref_tree):
    """Write the reference data to ``file.parquet`` in the reference tree and return its path."""
    parquet_path = empty_ref_tree / "file.parquet"
    ref_data.to_parquet(parquet_path)
    return parquet_path
98+
99+
100+
@pytest.fixture
def ref_stata(ref_data, empty_ref_tree):
    """Write the reference data to ``file.dta`` in the reference tree and return its path."""
    stata_path = empty_ref_tree / "file.dta"
    ref_data.to_stata(stata_path)
    return stata_path
106+
107+
108+
def _update_df(df):
109+
df.loc["idx1", "col_a"] *= 10
110+
df.loc["idx2", "col_b"] += "_new"
111+
112+
113+
@pytest.fixture
def res_diff_checker():
    """The regular expression that the reported difference is expected to match."""
    header = r"The files '\S*/ref/file\.\S*' and '\S*/res/file\.\S*' are different:\n\n"
    # The "At positional index" lines are optional in both column reports
    col_a_report = (
        r"Column 'col_a': Series are different\n\n"
        r"Series values are different \(33\.33333 %\)\n"
        r"\[index\]: \[idx1, idx2, idx3\]\n"
        r"\[left\]: \[1, 2, 3\]\n\[right\]: \[10, 2, 3\]\n"
        r"(At positional index 0, first diff: 1 != 10\n)?\n"
    )
    col_b_report = (
        r"Column 'col_b': Series are different\n\n"
        r"Series values are different \(33\.33333 %\)\n"
        r"\[index\]: \[idx1, idx2, idx3\]\n"
        r"\[left\]: \[a, b, c\]\n\[right\]: \[a, b_new, c\]"
        r"(\nAt positional index 1, first diff: b != b_new)?"
    )
    return header + col_a_report + col_b_report
129+
130+
75131
@pytest.fixture
76132
def res_hdf5_equal(ref_hdf5, empty_res_tree):
77133
"""The result hdf5 file equal to the reference."""
@@ -85,13 +141,69 @@ def res_hdf5_equal(ref_hdf5, empty_res_tree):
85141
def res_hdf5_diff(ref_hdf5, empty_res_tree):
86142
"""The result hdf5 file different from the reference."""
87143
df = pd.read_hdf(ref_hdf5, index_col="index")
88-
df.loc["idx1", "col_a"] *= 10
89-
df.loc["idx2", "col_b"] += "_new"
144+
_update_df(df)
90145
filename = empty_res_tree / "file.h5"
91146
df.to_hdf(filename, key="data", index=True)
92147
return filename
93148

94149

150+
@pytest.fixture
def res_feather_equal(ref_feather, empty_res_tree):
    """Copy the reference Feather data unchanged into the result tree and return the new path."""
    target = empty_res_tree / "file.feather"
    pd.read_feather(ref_feather).to_feather(target)
    return target
157+
158+
159+
@pytest.fixture
def res_feather_diff(ref_feather, empty_res_tree):
    """Write a modified copy of the reference Feather data into the result tree."""
    target = empty_res_tree / "file.feather"
    modified = pd.read_feather(ref_feather)
    _update_df(modified)
    modified.to_feather(target)
    return target
167+
168+
169+
@pytest.fixture
def res_parquet_equal(ref_parquet, empty_res_tree):
    """Copy the reference Parquet data unchanged into the result tree and return the new path."""
    target = empty_res_tree / "file.parquet"
    pd.read_parquet(ref_parquet).to_parquet(target)
    return target
176+
177+
178+
@pytest.fixture
def res_parquet_diff(ref_parquet, empty_res_tree):
    """Write a modified copy of the reference Parquet data into the result tree."""
    target = empty_res_tree / "file.parquet"
    modified = pd.read_parquet(ref_parquet)
    _update_df(modified)
    modified.to_parquet(target)
    return target
186+
187+
188+
@pytest.fixture
def res_stata_equal(ref_stata, empty_res_tree):
    """Copy the reference Stata data unchanged into the result tree and return the new path."""
    target = empty_res_tree / "file.dta"
    # The "index" column is read back as the index before the data is written again
    pd.read_stata(ref_stata, index_col="index").to_stata(target)
    return target
195+
196+
197+
@pytest.fixture
def res_stata_diff(ref_stata, empty_res_tree):
    """Write a modified copy of the reference Stata data into the result tree."""
    target = empty_res_tree / "file.dta"
    # The "index" column is read back as the index so that rows can be addressed by label
    modified = pd.read_stata(ref_stata, index_col="index")
    _update_df(modified)
    modified.to_stata(target)
    return target
205+
206+
95207
class TestEqualTrees:
96208
"""Tests that should return no difference."""
97209

@@ -222,17 +334,46 @@ def test_replace_pattern(
222334
)
223335
assert match_res is not None
224336

225-
def test_hdf5_comparator(
226-
self, empty_ref_tree, empty_res_tree, res_hdf5_equal, pandas_registry_reseter
227-
):
228-
"""Test the comparator for HDF5 files."""
337+
def _check_equal(self, empty_ref_tree, empty_res_tree):
    """Assert that the two trees are equal and return their sorted content listings.

    Args:
        empty_ref_tree: Root directory of the reference tree.
        empty_res_tree: Root directory of the result tree.

    Returns:
        A ``(ref_files, res_files)`` tuple of lists holding every path found recursively
        under each root, both sorted so that they can be compared pairwise.
    """
    assert_equal_trees(empty_ref_tree, empty_res_tree, export_formatted_files=True)
    # The listings are zipped by the callers, so they must not depend on the order in
    # which the file system enumerates directory entries.
    ref_files = sorted(empty_ref_tree.rglob("*"))
    res_files = sorted(empty_res_tree.rglob("*"))
    assert len(ref_files) == len(res_files)
    return ref_files, res_files
343+
344+
def test_hdf5_comparator(
    self, empty_ref_tree, empty_res_tree, res_hdf5_equal, pandas_registry_reseter
):
    """Equal HDF5 files must pass the tree comparison and load as equal DataFrames."""
    ref_files, res_files = self._check_equal(empty_ref_tree, empty_res_tree)
    for ref_file, res_file in zip(ref_files, res_files):
        assert pd.read_hdf(ref_file).equals(pd.read_hdf(res_file))
235351

352+
@pytest.mark.skipif(sys.version_info < (3, 9), reason="requires python3.9 or higher")
def test_feather_comparator(
    self, empty_ref_tree, empty_res_tree, res_feather_equal, pandas_registry_reseter
):
    """Equal Feather files must pass the tree comparison and load as equal DataFrames."""
    ref_files, res_files = self._check_equal(empty_ref_tree, empty_res_tree)
    for ref_file, res_file in zip(ref_files, res_files):
        assert pd.read_feather(ref_file).equals(pd.read_feather(res_file))
360+
361+
def test_parquet_comparator(
    self, empty_ref_tree, empty_res_tree, res_parquet_equal, pandas_registry_reseter
):
    """Equal Parquet files must pass the tree comparison and load as equal DataFrames."""
    ref_files, res_files = self._check_equal(empty_ref_tree, empty_res_tree)
    for ref_file, res_file in zip(ref_files, res_files):
        assert pd.read_parquet(ref_file).equals(pd.read_parquet(res_file))
368+
369+
def test_stata_comparator(
    self, empty_ref_tree, empty_res_tree, res_stata_equal, pandas_registry_reseter
):
    """Equal Stata files must pass the tree comparison and load as equal DataFrames."""
    ref_files, res_files = self._check_equal(empty_ref_tree, empty_res_tree)
    for ref_file, res_file in zip(ref_files, res_files):
        assert pd.read_stata(ref_file).equals(pd.read_stata(res_file))
376+
236377

237378
class TestDiffTrees:
238379
"""Tests that should return differences."""
@@ -287,26 +428,70 @@ def test_missing_column(
287428
)
288429
assert match_res is not None
289430

290-
def test_hdf5_comparator(
291-
self, empty_ref_tree, empty_res_tree, res_hdf5_diff, pandas_registry_reseter
431+
def _check_diff_comparator(
    self, empty_ref_tree, empty_res_tree, res_diff_checker, ext, specific_args=None
):
    """Compare the two trees and check the single difference that is reported.

    Args:
        empty_ref_tree: Root directory of the reference tree.
        empty_res_tree: Root directory of the result tree.
        res_diff_checker: Regular expression that the reported difference must match.
        ext: Extension, without the leading dot, of the compared ``file.<ext>``.
        specific_args: Optional per-file arguments passed to ``compare_trees``. When it has
            an entry for ``file.<ext>``, that entry must contain a ``"load_kwargs"`` key.
    """
    res = compare_trees(empty_ref_tree, empty_res_tree, specific_args=specific_args)

    assert len(res) == 1
    filename = f"file.{ext}"
    res_ext = res[filename]
    if specific_args is not None and filename in specific_args:
        # The expected report then includes the load kwargs. Their repr is spliced into a
        # regex, so escape it: braces, brackets, dots, etc. must be matched literally.
        load_kwargs = re.escape(str(specific_args[filename]["load_kwargs"]))
        res_diff_checker = res_diff_checker.replace(
            "' are different:",
            "' are different:\nKwargs used for loading data: " + load_kwargs,
        )
    match_res = re.match(res_diff_checker, res_ext)
    assert match_res is not None
450+
451+
def test_hdf5_comparator(
    self,
    empty_ref_tree,
    empty_res_tree,
    res_hdf5_diff,
    res_diff_checker,
    pandas_registry_reseter,
):
    """Different HDF5 files must produce the expected difference report."""
    self._check_diff_comparator(
        empty_ref_tree, empty_res_tree, res_diff_checker, ext="h5"
    )
461+
462+
@pytest.mark.skipif(sys.version_info < (3, 9), reason="requires python3.9 or higher")
def test_feather_comparator(
    self,
    empty_ref_tree,
    empty_res_tree,
    res_feather_diff,
    res_diff_checker,
    pandas_registry_reseter,
):
    """Different Feather files must produce the expected difference report."""
    self._check_diff_comparator(
        empty_ref_tree, empty_res_tree, res_diff_checker, ext="feather"
    )
473+
474+
def test_parquet_comparator(
    self,
    empty_ref_tree,
    empty_res_tree,
    res_parquet_diff,
    res_diff_checker,
    pandas_registry_reseter,
):
    """Different Parquet files must produce the expected difference report."""
    self._check_diff_comparator(
        empty_ref_tree, empty_res_tree, res_diff_checker, ext="parquet"
    )
484+
485+
def test_stata_comparator(
    self,
    empty_ref_tree,
    empty_res_tree,
    res_stata_diff,
    res_diff_checker,
    pandas_registry_reseter,
):
    """Different Stata files must produce the expected difference report."""
    # The Stata file is loaded with its "index" column used as the index
    load_kwargs = {"index_col": "index"}
    per_file_args = {"file.dta": {"load_kwargs": load_kwargs}}
    self._check_diff_comparator(
        empty_ref_tree,
        empty_res_tree,
        res_diff_checker,
        "dta",
        specific_args=per_file_args,
    )

tox.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ extras =
1919
test
2020
setenv =
2121
COVERAGE_FILE = {env:COVERAGE_FILE:.coverage-{envname}}
22+
py38,min_versions: COVERAGE_RCFILE=.coveragerc_py38
2223
deps =
2324
min_versions: Requirements-Builder
2425
commands =

0 commit comments

Comments
 (0)