44# pylint: disable=unused-argument
55# pylint: disable=use-implicit-booleaness-not-comparison
66import re
7+ import sys
78
89import pandas as pd
910import pytest
@@ -49,6 +50,9 @@ def test_pandas_register(self, registry_reseter):
4950 ".hdf" : dir_content_diff .pandas .HdfComparator (),
5051 ".hdf4" : dir_content_diff .pandas .HdfComparator (),
5152 ".hdf5" : dir_content_diff .pandas .HdfComparator (),
53+ ".feather" : dir_content_diff .pandas .FeatherComparator (),
54+ ".parquet" : dir_content_diff .pandas .ParquetComparator (),
55+ ".dta" : dir_content_diff .pandas .StataComparator (),
5256 }
5357
5458
@@ -59,19 +63,71 @@ def pandas_registry_reseter(registry_reseter):
5963
6064
6165@pytest .fixture
62- def ref_hdf5 (empty_ref_tree ):
63- """The reference HDF5 file."""
66+ def ref_data (empty_ref_tree ):
6467 ref_data = {
6568 "col_a" : [1 , 2 , 3 ],
6669 "col_b" : ["a" , "b" , "c" ],
6770 "col_c" : [4 , 5 , 6 ],
6871 }
6972 df = pd .DataFrame (ref_data , index = ["idx1" , "idx2" , "idx3" ])
73+ return df
74+
75+
76+ @pytest .fixture
77+ def ref_hdf5 (ref_data , empty_ref_tree ):
78+ """The reference HDF5 file."""
7079 filename = empty_ref_tree / "file.h5"
71- df .to_hdf (filename , key = "data" , index = True )
80+ ref_data .to_hdf (filename , key = "data" , index = True )
81+ return filename
82+
83+
84+ @pytest .fixture
85+ def ref_feather (ref_data , empty_ref_tree ):
86+ """The reference Feather file."""
87+ filename = empty_ref_tree / "file.feather"
88+ ref_data .to_feather (filename )
7289 return filename
7390
7491
92+ @pytest .fixture
93+ def ref_parquet (ref_data , empty_ref_tree ):
94+ """The reference Parquet file."""
95+ filename = empty_ref_tree / "file.parquet"
96+ ref_data .to_parquet (filename )
97+ return filename
98+
99+
100+ @pytest .fixture
101+ def ref_stata (ref_data , empty_ref_tree ):
102+ """The reference Stata file."""
103+ filename = empty_ref_tree / "file.dta"
104+ ref_data .to_stata (filename )
105+ return filename
106+
107+
108+ def _update_df (df ):
109+ df .loc ["idx1" , "col_a" ] *= 10
110+ df .loc ["idx2" , "col_b" ] += "_new"
111+
112+
113+ @pytest .fixture
114+ def res_diff_checker ():
115+ """The regex used to check the diff result."""
116+ return (
117+ r"The files '\S*/ref/file\.\S*' and '\S*/res/file\.\S*' are different:\n\n"
118+ r"Column 'col_a': Series are different\n\n"
119+ r"Series values are different \(33\.33333 %\)\n"
120+ r"\[index\]: \[idx1, idx2, idx3\]\n"
121+ r"\[left\]: \[1, 2, 3\]\n\[right\]: \[10, 2, 3\]\n"
122+ r"""(At positional index 0, first diff: 1 != 10\n)?\n"""
123+ r"Column 'col_b': Series are different\n\n"
124+ r"Series values are different \(33\.33333 %\)\n"
125+ r"\[index\]: \[idx1, idx2, idx3\]\n"
126+ r"\[left\]: \[a, b, c\]\n\[right\]: \[a, b_new, c\]"
127+ r"""(\nAt positional index 1, first diff: b != b_new)?"""
128+ )
129+
130+
75131@pytest .fixture
76132def res_hdf5_equal (ref_hdf5 , empty_res_tree ):
77133 """The result hdf5 file equal to the reference."""
@@ -85,13 +141,69 @@ def res_hdf5_equal(ref_hdf5, empty_res_tree):
85141def res_hdf5_diff (ref_hdf5 , empty_res_tree ):
86142 """The result hdf5 file different from the reference."""
87143 df = pd .read_hdf (ref_hdf5 , index_col = "index" )
88- df .loc ["idx1" , "col_a" ] *= 10
89- df .loc ["idx2" , "col_b" ] += "_new"
144+ _update_df (df )
90145 filename = empty_res_tree / "file.h5"
91146 df .to_hdf (filename , key = "data" , index = True )
92147 return filename
93148
94149
150+ @pytest .fixture
151+ def res_feather_equal (ref_feather , empty_res_tree ):
152+ """The result feather file equal to the reference."""
153+ df = pd .read_feather (ref_feather )
154+ filename = empty_res_tree / "file.feather"
155+ df .to_feather (filename )
156+ return filename
157+
158+
159+ @pytest .fixture
160+ def res_feather_diff (ref_feather , empty_res_tree ):
161+ """The result feather file different from the reference."""
162+ df = pd .read_feather (ref_feather )
163+ _update_df (df )
164+ filename = empty_res_tree / "file.feather"
165+ df .to_feather (filename )
166+ return filename
167+
168+
169+ @pytest .fixture
170+ def res_parquet_equal (ref_parquet , empty_res_tree ):
171+ """The result parquet file equal to the reference."""
172+ df = pd .read_parquet (ref_parquet )
173+ filename = empty_res_tree / "file.parquet"
174+ df .to_parquet (filename )
175+ return filename
176+
177+
178+ @pytest .fixture
179+ def res_parquet_diff (ref_parquet , empty_res_tree ):
180+ """The result parquet file different from the reference."""
181+ df = pd .read_parquet (ref_parquet )
182+ _update_df (df )
183+ filename = empty_res_tree / "file.parquet"
184+ df .to_parquet (filename )
185+ return filename
186+
187+
188+ @pytest .fixture
189+ def res_stata_equal (ref_stata , empty_res_tree ):
190+ """The result stata file equal to the reference."""
191+ df = pd .read_stata (ref_stata , index_col = "index" )
192+ filename = empty_res_tree / "file.dta"
193+ df .to_stata (filename )
194+ return filename
195+
196+
197+ @pytest .fixture
198+ def res_stata_diff (ref_stata , empty_res_tree ):
199+ """The result stata file different from the reference."""
200+ df = pd .read_stata (ref_stata , index_col = "index" )
201+ _update_df (df )
202+ filename = empty_res_tree / "file.dta"
203+ df .to_stata (filename )
204+ return filename
205+
206+
95207class TestEqualTrees :
96208 """Tests that should return no difference."""
97209
@@ -222,17 +334,46 @@ def test_replace_pattern(
222334 )
223335 assert match_res is not None
224336
225- def test_hdf5_comparator (
226- self , empty_ref_tree , empty_res_tree , res_hdf5_equal , pandas_registry_reseter
227- ):
228- """Test the comparator for HDF5 files."""
337+ def _check_equal (self , empty_ref_tree , empty_res_tree ):
229338 assert_equal_trees (empty_ref_tree , empty_res_tree , export_formatted_files = True )
230339 ref_files = list (empty_ref_tree .rglob ("*" ))
231340 res_files = list (empty_res_tree .rglob ("*" ))
232341 assert len (ref_files ) == len (res_files )
342+ return ref_files , res_files
343+
344+ def test_hdf5_comparator (
345+ self , empty_ref_tree , empty_res_tree , res_hdf5_equal , pandas_registry_reseter
346+ ):
347+ """Test the comparator for HDF5 files."""
348+ ref_files , res_files = self ._check_equal (empty_ref_tree , empty_res_tree )
233349 for i , j in zip (ref_files , res_files ):
234350 assert pd .read_hdf (i ).equals (pd .read_hdf (j ))
235351
352+ @pytest .mark .skipif (sys .version_info < (3 , 9 ), reason = "requires python3.9 or higher" )
353+ def test_feather_comparator (
354+ self , empty_ref_tree , empty_res_tree , res_feather_equal , pandas_registry_reseter
355+ ):
356+ """Test the comparator for Feather files."""
357+ ref_files , res_files = self ._check_equal (empty_ref_tree , empty_res_tree )
358+ for i , j in zip (ref_files , res_files ):
359+ assert pd .read_feather (i ).equals (pd .read_feather (j ))
360+
361+ def test_parquet_comparator (
362+ self , empty_ref_tree , empty_res_tree , res_parquet_equal , pandas_registry_reseter
363+ ):
364+ """Test the comparator for Parquet files."""
365+ ref_files , res_files = self ._check_equal (empty_ref_tree , empty_res_tree )
366+ for i , j in zip (ref_files , res_files ):
367+ assert pd .read_parquet (i ).equals (pd .read_parquet (j ))
368+
369+ def test_stata_comparator (
370+ self , empty_ref_tree , empty_res_tree , res_stata_equal , pandas_registry_reseter
371+ ):
372+ """Test the comparator for Stata files."""
373+ ref_files , res_files = self ._check_equal (empty_ref_tree , empty_res_tree )
374+ for i , j in zip (ref_files , res_files ):
375+ assert pd .read_stata (i ).equals (pd .read_stata (j ))
376+
236377
237378class TestDiffTrees :
238379 """Tests that should return differences."""
@@ -287,26 +428,70 @@ def test_missing_column(
287428 )
288429 assert match_res is not None
289430
290- def test_hdf5_comparator (
291- self , empty_ref_tree , empty_res_tree , res_hdf5_diff , pandas_registry_reseter
431+ def _check_diff_comparator (
432+ self , empty_ref_tree , empty_res_tree , res_diff_checker , ext , specific_args = None
292433 ):
293- """Test the comparator for HDF5 files."""
294- res = compare_trees (empty_ref_tree , empty_res_tree )
434+ res = compare_trees (empty_ref_tree , empty_res_tree , specific_args = specific_args )
295435
296436 assert len (res ) == 1
297- res_hdf = res ["file.h5" ]
437+ filename = f"file.{ ext } "
438+ res_ext = res [filename ]
439+ if specific_args is not None and filename in specific_args :
440+ res_diff_checker = res_diff_checker .replace (
441+ "' are different:" ,
442+ "' are different:\n Kwargs used for loading data: "
443+ f"{ specific_args [filename ]['load_kwargs' ]} " ,
444+ )
298445 match_res = re .match (
299- r"The files '\S*/ref/file\.h5' and '\S*/res/file\.h5' are different:\n\n"
300- r"Column 'col_a': Series are different\n\n"
301- r"Series values are different \(33\.33333 %\)\n"
302- r"\[index\]: \[idx1, idx2, idx3\]\n"
303- r"\[left\]: \[1, 2, 3\]\n\[right\]: \[10, 2, 3\]\n"
304- r"""(At positional index 0, first diff: 1 != 10\n)?\n"""
305- r"Column 'col_b': Series are different\n\n"
306- r"Series values are different \(33\.33333 %\)\n"
307- r"\[index\]: \[idx1, idx2, idx3\]\n"
308- r"\[left\]: \[a, b, c\]\n\[right\]: \[a, b_new, c\]"
309- r"""(\nAt positional index 1, first diff: b != b_new)?""" ,
310- res_hdf ,
446+ res_diff_checker ,
447+ res_ext ,
311448 )
312449 assert match_res is not None
450+
451+ def test_hdf5_comparator (
452+ self ,
453+ empty_ref_tree ,
454+ empty_res_tree ,
455+ res_hdf5_diff ,
456+ res_diff_checker ,
457+ pandas_registry_reseter ,
458+ ):
459+ """Test the comparator for HDF5 files."""
460+ self ._check_diff_comparator (empty_ref_tree , empty_res_tree , res_diff_checker , "h5" )
461+
462+ @pytest .mark .skipif (sys .version_info < (3 , 9 ), reason = "requires python3.9 or higher" )
463+ def test_feather_comparator (
464+ self ,
465+ empty_ref_tree ,
466+ empty_res_tree ,
467+ res_feather_diff ,
468+ res_diff_checker ,
469+ pandas_registry_reseter ,
470+ ):
471+ """Test the comparator for feather files."""
472+ self ._check_diff_comparator (empty_ref_tree , empty_res_tree , res_diff_checker , "feather" )
473+
474+ def test_parquet_comparator (
475+ self ,
476+ empty_ref_tree ,
477+ empty_res_tree ,
478+ res_parquet_diff ,
479+ res_diff_checker ,
480+ pandas_registry_reseter ,
481+ ):
482+ """Test the comparator for parquet files."""
483+ self ._check_diff_comparator (empty_ref_tree , empty_res_tree , res_diff_checker , "parquet" )
484+
485+ def test_stata_comparator (
486+ self ,
487+ empty_ref_tree ,
488+ empty_res_tree ,
489+ res_stata_diff ,
490+ res_diff_checker ,
491+ pandas_registry_reseter ,
492+ ):
493+ """Test the comparator for stata files."""
494+ specific_args = {"file.dta" : {"load_kwargs" : {"index_col" : "index" }}}
495+ self ._check_diff_comparator (
496+ empty_ref_tree , empty_res_tree , res_diff_checker , "dta" , specific_args = specific_args
497+ )
0 commit comments