@@ -700,6 +700,112 @@ def _test_value_stats_cols_case(self, manifest_manager, table, value_stats_cols,
700700
701701 self .assertEqual (read_entry .file .value_stats .null_counts , null_counts )
702702
def test_primary_key_value_stats(self):
    """Check the key/value statistics stored in the manifest of a primary-key table.

    Creates ``default.test_pk_value_stats`` (primary key ``id``, options
    ``metadata.stats-mode=full`` and ``bucket=2``), writes five rows and then
    inspects the first entry of the first manifest file of the latest snapshot:

    * ``key_stats`` must describe exactly one field (the primary key ``id``).
    * ``value_stats`` must exist. When ``value_stats_cols`` is not recorded it
      must cover at least the three non-key columns; when it is recorded it
      must exclude ``id`` and every system column, include all non-key
      columns, and its length must match the arity of the min/max rows and the
      length of ``null_counts``.

    Finally the table is read back to confirm that all five rows are visible.
    """
    pa_schema = pa.schema([
        ('id', pa.int64()),
        ('name', pa.string()),
        ('price', pa.float64()),
        ('category', pa.string()),
    ])
    schema = Schema.from_pyarrow_schema(
        pa_schema,
        primary_keys=['id'],
        options={'metadata.stats-mode': 'full', 'bucket': '2'},
    )
    self.catalog.create_table('default.test_pk_value_stats', schema, False)
    table = self.catalog.get_table('default.test_pk_value_stats')

    test_data = pa.Table.from_pydict({
        'id': [1, 2, 3, 4, 5],
        'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        'price': [10.5, 20.3, 30.7, 40.1, 50.9],
        'category': ['A', 'B', 'C', 'D', 'E'],
    }, schema=pa_schema)

    write_builder = table.new_batch_write_builder()
    writer = write_builder.new_write()
    commit = write_builder.new_commit()
    try:
        writer.write_arrow(test_data)
        commit.commit(writer.prepare_commit())
    finally:
        # Release writer and committer resources even if the write or the
        # commit fails, so a failing run does not leak handles into later tests.
        writer.close()
        commit.close()

    read_builder = table.new_read_builder()
    table_scan = read_builder.new_scan()
    scanner = table_scan.starting_scanner
    latest_snapshot = SnapshotManager(table).get_latest_snapshot()
    manifest_files = scanner.manifest_list_manager.read_all(latest_snapshot)
    # Fail with a readable message instead of an IndexError on manifest_files[0].
    self.assertGreater(len(manifest_files), 0, "Should have at least one manifest file")
    manifest_entries = scanner.manifest_file_manager.read(
        manifest_files[0].file_name,
        scanner._filter_manifest_entry,
        False,
    )

    self.assertGreater(len(manifest_entries), 0, "Should have at least one manifest entry")
    file_meta = manifest_entries[0].file

    key_stats = file_meta.key_stats
    self.assertIsNotNone(key_stats, "key_stats should not be None")
    self.assertEqual(key_stats.min_values.arity, 1, "key_stats should contain exactly 1 key field (id)")

    value_stats = file_meta.value_stats
    self.assertIsNotNone(value_stats, "value_stats should not be None")

    # Every column of the schema except the primary key.
    expected_value_fields = ['name', 'price', 'category']
    value_stats_cols = file_meta.value_stats_cols

    if value_stats_cols is None:
        # No explicit column list was recorded for the stats row, so only a
        # lower bound on its arity is checked.
        self.assertGreaterEqual(value_stats.min_values.arity, len(expected_value_fields),
                                f"value_stats should contain at least {len(expected_value_fields)} value fields")
    else:
        self.assertNotIn('id', value_stats_cols,
                         "Key field 'id' should NOT be in value_stats_cols")
        self.assertTrue(set(expected_value_fields).issubset(set(value_stats_cols)),
                        f"value_stats_cols should contain value fields: {expected_value_fields}, "
                        f"but got: {value_stats_cols}")

        expected_arity = len(value_stats_cols)
        self.assertEqual(value_stats.min_values.arity, expected_arity,
                         f"value_stats should contain {expected_arity} fields (matching value_stats_cols), "
                         f"but got {value_stats.min_values.arity}")
        self.assertEqual(value_stats.max_values.arity, expected_arity,
                         f"value_stats should contain {expected_arity} fields (matching value_stats_cols), "
                         f"but got {value_stats.max_values.arity}")
        self.assertEqual(len(value_stats.null_counts), expected_arity,
                         f"value_stats null_counts should have {expected_arity} elements, "
                         f"but got {len(value_stats.null_counts)}")

        system_fields = ('_SEQUENCE_NUMBER', '_VALUE_KIND', '_ROW_ID')
        for field_name in value_stats_cols:
            is_system_field = field_name.startswith('_KEY_') or field_name in system_fields
            self.assertFalse(is_system_field,
                             f"value_stats_cols should not contain system field: {field_name}")

        # Decode the serialized min/max rows with the fields named by
        # value_stats_cols and make sure one value per non-key column comes back.
        value_stats_fields = scanner.manifest_file_manager._get_value_stats_fields(
            {'_VALUE_STATS_COLS': value_stats_cols},
            table.fields,
        )
        min_value_stats = GenericRowDeserializer.from_bytes(
            value_stats.min_values.data,
            value_stats_fields,
        ).values
        max_value_stats = GenericRowDeserializer.from_bytes(
            value_stats.max_values.data,
            value_stats_fields,
        ).values

        self.assertEqual(len(min_value_stats), len(expected_value_fields),
                         "min_value_stats should have 3 values")
        self.assertEqual(len(max_value_stats), len(expected_value_fields),
                         "max_value_stats should have 3 values")

    # The stats checks must not hide a broken read path: all rows must be readable.
    actual_data = read_builder.new_read().to_arrow(table_scan.plan().splits())
    self.assertEqual(actual_data.num_rows, 5, "Should have 5 rows")
    actual_ids = sorted(actual_data.column('id').to_pylist())
    self.assertEqual(actual_ids, [1, 2, 3, 4, 5], "All IDs should be present")
808+
703809 def test_split_target_size (self ):
704810 """Test source.split.target-size configuration effect on split generation."""
705811 from pypaimon .common .options .core_options import CoreOptions
0 commit comments