@@ -744,6 +744,96 @@ def test_hash_numpy_array2_multi_dimensional_can_not_retrieve_individual_array_i
744744 except Exception as e :
745745 assert str (e ).strip ("'" ) == HASH_LOOKUP_ERR_MSG .format (t1 [0 ])
746746
def test_pandas(self):
    """Check that DeepHashPrep hashes pandas DataFrames by their content.

    Equal frames must produce equal hashes, while frames that differ in
    column name, number of rows, cell values, column dtype, or index
    must produce different hashes.
    """
    # pandas is an optional third-party package: skip this test instead
    # of failing with ImportError when it is not installed.
    pd = pytest.importorskip("pandas")

    def prep_hash(frame):
        # The DeepHashPrep result is looked up with the very object that
        # was hashed.
        return DeepHashPrep(frame)[frame]

    # Same content vs. different row count vs. different column name.
    df = pd.DataFrame({"a": [1]})
    equal_df = pd.DataFrame({"a": [1]})
    df_same_column_names = pd.DataFrame({"a": [1, 2]})
    other_df = pd.DataFrame({"b": [1]})
    df_hash = prep_hash(df)
    assert df_hash == prep_hash(equal_df)
    assert df_hash != prep_hash(df_same_column_names)
    assert df_hash != prep_hash(other_df)

    # Mixed column types: a change in a string cell or inside a tuple
    # cell must change the hash.
    df_mixed = pd.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
    df_mixed_2 = pd.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
    df_mixed_3 = pd.DataFrame({'a': [1], 'b': ['one'], 'c': [(1, 2)]})
    df_mixed_4 = pd.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 3)]})
    df_mixed_hash = prep_hash(df_mixed)
    assert df_mixed_hash == prep_hash(df_mixed_2)
    assert df_mixed_hash != prep_hash(df_mixed_3)
    assert df_mixed_hash != prep_hash(df_mixed_4)

    # Same numeric value stored with different dtypes.
    df_u8 = pd.DataFrame({'a': np.array([1], dtype=np.uint8)})
    df_u16 = pd.DataFrame({'a': np.array([1], dtype=np.uint16)})
    df_float = pd.DataFrame({'a': np.array([1], dtype=np.float32)})
    df_u8_hash = prep_hash(df_u8)
    assert df_u8_hash != prep_hash(df_float)
    assert df_u8_hash != prep_hash(df_u16)

    # Same data, different index labels.
    df_index = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 3])
    df_index_diff = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 4])
    assert prep_hash(df_index) != prep_hash(df_index_diff)
787+
def test_polars(self):
    """Check that DeepHashPrep hashes polars DataFrames by their content.

    Equal frames must produce equal hashes, while frames that differ in
    column name, number of rows, cell values, column dtype, or set of
    columns must produce different hashes. Hashing a LazyFrame is
    expected to raise TypeError; collected frames hash normally.
    """
    # polars is an optional third-party package: skip this test instead
    # of failing with ImportError when it is not installed.
    pl = pytest.importorskip("polars")

    def prep_hash(frame):
        # The DeepHashPrep result is looked up with the very object that
        # was hashed.
        return DeepHashPrep(frame)[frame]

    # Same content vs. different row count vs. different column name.
    df = pl.DataFrame({"a": [1]})
    equal_df = pl.DataFrame({"a": [1]})
    df_same_column_names = pl.DataFrame({"a": [1, 2]})
    other_df = pl.DataFrame({"b": [1]})
    df_hash = prep_hash(df)
    assert df_hash == prep_hash(equal_df)
    assert df_hash != prep_hash(df_same_column_names)
    assert df_hash != prep_hash(other_df)

    # Mixed column types: a change in a string cell or inside a tuple
    # cell must change the hash.
    df_mixed = pl.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
    df_mixed_2 = pl.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
    df_mixed_3 = pl.DataFrame({'a': [1], 'b': ['one'], 'c': [(1, 2)]})
    df_mixed_4 = pl.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 3)]})
    df_mixed_hash = prep_hash(df_mixed)
    assert df_mixed_hash == prep_hash(df_mixed_2)
    assert df_mixed_hash != prep_hash(df_mixed_3)
    assert df_mixed_hash != prep_hash(df_mixed_4)

    # Same numeric value stored with different dtypes.
    df_u8 = pl.DataFrame({'a': np.array([1], dtype=np.uint8)})
    df_u16 = pl.DataFrame({'a': np.array([1], dtype=np.uint16)})
    df_float = pl.DataFrame({'a': np.array([1], dtype=np.float32)})
    df_u8_hash = prep_hash(df_u8)
    assert df_u8_hash != prep_hash(df_float)
    assert df_u8_hash != prep_hash(df_u16)

    lazy_1 = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2]}).lazy()
    lazy_2 = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2]}).lazy()
    lazy_3 = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2], "foobar": 5}).lazy()
    with pytest.raises(TypeError):
        prep_hash(lazy_1)  # lazy dfs can not be compared

    # Once collected, the frames hash like any other DataFrame.
    df_1_hash = prep_hash(lazy_1.collect())
    df_2_hash = prep_hash(lazy_2.collect())
    df_3_hash = prep_hash(lazy_3.collect())
    assert df_1_hash == df_2_hash
    assert df_1_hash != df_3_hash
836+
747837
748838class TestDeepHashSHA :
749839 """DeepHash with SHA Tests."""
0 commit comments