@@ -732,3 +732,98 @@ def test_add_files_subset_of_schema(spark: SparkSession, session_catalog: Catalo
     for column in written_arrow_table.column_names:
         for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
             assert left == right
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_add_files_with_duplicate_files_in_file_paths(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None:
+    identifier = f"default.test_table_duplicate_add_files_v{format_version}"
+    tbl = _create_table(session_catalog, identifier, format_version)
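+    # the paths below are never written to storage; this test only exercises the
+    # duplicate-path validation in add_files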
+    file_path = f"s3://warehouse/default/unpartitioned/v{format_version}/test-1.parquet"
+    file_paths = [file_path, file_path]
+
+    # passing the same path twice must be rejected
+    with pytest.raises(ValueError) as exc_info:
+        tbl.add_files(file_paths=file_paths)
+    assert "File paths must be unique" in str(exc_info.value)
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_add_files_that_referenced_by_current_snapshot(
+    spark: SparkSession, session_catalog: Catalog, format_version: int
+) -> None:
+    identifier = f"default.test_table_add_referenced_file_v{format_version}"
+    tbl = _create_table(session_catalog, identifier, format_version)
+
+    file_paths = [f"s3://warehouse/default/unpartitioned/v{format_version}/test-{i}.parquet" for i in range(5)]
+
+    # write parquet files
+    for file_path in file_paths:
+        fo = tbl.io.new_output(file_path)
+        with fo.create(overwrite=True) as fos:
+            with pq.ParquetWriter(fos, schema=ARROW_SCHEMA) as writer:
+                writer.write_table(ARROW_TABLE)
+
+    # add the parquet files as data files
+    tbl.add_files(file_paths=file_paths)
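+    # inspect.files() lists the data files referenced by the current snapshot;
+    # take one of the just-added paths so it can be re-added below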
+    existing_files_in_table = tbl.inspect.files().to_pylist().pop()["file_path"]
+
+    with pytest.raises(ValueError) as exc_info:
+        tbl.add_files(file_paths=[existing_files_in_table])
+    assert f"Cannot add files that are already referenced by table, files: {existing_files_in_table}" in str(exc_info.value)
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_add_files_that_referenced_by_current_snapshot_with_check_duplicate_files_false(
+    spark: SparkSession, session_catalog: Catalog, format_version: int
+) -> None:
+    identifier = f"default.test_table_add_referenced_file_v{format_version}"
+    tbl = _create_table(session_catalog, identifier, format_version)
+
+    file_paths = [f"s3://warehouse/default/unpartitioned/v{format_version}/test-{i}.parquet" for i in range(5)]
+    # write parquet files
+    for file_path in file_paths:
+        fo = tbl.io.new_output(file_path)
+        with fo.create(overwrite=True) as fos:
+            with pq.ParquetWriter(fos, schema=ARROW_SCHEMA) as writer:
+                writer.write_table(ARROW_TABLE)
+
+    # add the parquet files as data files
+    tbl.add_files(file_paths=file_paths)
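+    # with check_duplicate_files=False, re-adding an already-referenced file is
+    # accepted and simply appended again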
+    existing_files_in_table = tbl.inspect.files().to_pylist().pop()["file_path"]
+    tbl.add_files(file_paths=[existing_files_in_table], check_duplicate_files=False)
+    rows = spark.sql(
+        f"""
+        SELECT added_data_files_count, existing_data_files_count, deleted_data_files_count
+        FROM {identifier}.all_manifests
+        """
+    ).collect()
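+    # three manifest entries are expected: the original 5-file append, the 1-file
+    # re-add, and the original manifest listed again under the new snapshot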
+    assert [row.added_data_files_count for row in rows] == [5, 1, 5]
+    assert [row.existing_data_files_count for row in rows] == [0, 0, 0]
+    assert [row.deleted_data_files_count for row in rows] == [0, 0, 0]
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_add_files_that_referenced_by_current_snapshot_with_check_duplicate_files_true(
+    spark: SparkSession, session_catalog: Catalog, format_version: int
+) -> None:
+    identifier = f"default.test_table_add_referenced_file_v{format_version}"
+    tbl = _create_table(session_catalog, identifier, format_version)
+
+    file_paths = [f"s3://warehouse/default/unpartitioned/v{format_version}/test-{i}.parquet" for i in range(5)]
+    # write parquet files
+    for file_path in file_paths:
+        fo = tbl.io.new_output(file_path)
+        with fo.create(overwrite=True) as fos:
+            with pq.ParquetWriter(fos, schema=ARROW_SCHEMA) as writer:
+                writer.write_table(ARROW_TABLE)
+
+    # add the parquet files as data files
+    tbl.add_files(file_paths=file_paths)
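+    # passing check_duplicate_files=True explicitly behaves like the default:
+    # the re-add is rejected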
+    existing_files_in_table = tbl.inspect.files().to_pylist().pop()["file_path"]
+    with pytest.raises(ValueError) as exc_info:
+        tbl.add_files(file_paths=[existing_files_in_table], check_duplicate_files=True)
+    assert f"Cannot add files that are already referenced by table, files: {existing_files_in_table}" in str(exc_info.value)