@@ -44,7 +44,10 @@ def __init__(self, table, commit_user: str):
         self.commit_user = commit_user
 
         # Load existing first_row_ids and build partition map
-        self.first_row_ids, self.first_row_id_to_partition_map = self._load_existing_files_info()
+        (self.first_row_ids,
+         self.first_row_id_to_partition_map,
+         self.first_row_id_to_row_count_map,
+         self.total_row_count) = self._load_existing_files_info()
 
         # Collect commit messages
         self.commit_messages = []
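For reference, a minimal sketch (with made-up values, not taken from this change) of the four pieces of state that __init__ now unpacks, for a table holding two 100-row data files:

    first_row_ids = [0, 100]                                  # sorted, de-duplicated
    first_row_id_to_partition_map = {0: "pt=a", 100: "pt=b"}  # GenericRow values in the real code
    first_row_id_to_row_count_map = {0: 100, 100: 100}
    total_row_count = sum(first_row_id_to_row_count_map.values())
    assert total_row_count == 200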
@@ -53,18 +56,24 @@ def _load_existing_files_info(self):
         """Load existing first_row_ids and build partition map for efficient lookup."""
         first_row_ids = []
         first_row_id_to_partition_map: Dict[int, GenericRow] = {}
+        first_row_id_to_row_count_map: Dict[int, int] = {}
 
         read_builder = self.table.new_read_builder()
         scan = read_builder.new_scan()
         splits = scan.plan().splits()
 
         for split in splits:
             for file in split.files:
-                if file.first_row_id is not None:
-                    first_row_ids.append(file.first_row_id)
-                    first_row_id_to_partition_map[file.first_row_id] = split.partition
+                if file.first_row_id is not None and not file.file_name.endswith('.blob'):
+                    first_row_id = file.first_row_id
+                    first_row_ids.append(first_row_id)
+                    first_row_id_to_partition_map[first_row_id] = split.partition
+                    first_row_id_to_row_count_map[first_row_id] = file.row_count
 
-        return sorted(list(set(first_row_ids))), first_row_id_to_partition_map
+        total_row_count = sum(first_row_id_to_row_count_map.values())
+
+        return (sorted(list(set(first_row_ids))), first_row_id_to_partition_map,
+                first_row_id_to_row_count_map, total_row_count)
 
     def update_columns(self, data: pa.Table, column_names: List[str]) -> List:
         """
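A standalone sketch of the new filtering rule, using SimpleNamespace stand-ins for the file metadata objects (only the attributes the loop reads are modeled): files without a first_row_id are skipped as before, and `.blob` files are now skipped too, so a blob file sharing a first_row_id with its data file does not overwrite or double-count that group's row count.

    from types import SimpleNamespace

    # Hypothetical stand-ins for split.files entries
    files = [
        SimpleNamespace(file_name="f0.parquet", first_row_id=0, row_count=100),
        SimpleNamespace(file_name="f0.blob", first_row_id=0, row_count=100),       # skipped: blob
        SimpleNamespace(file_name="f1.parquet", first_row_id=None, row_count=50),  # skipped: no id
    ]
    row_counts = {
        f.first_row_id: f.row_count
        for f in files
        if f.first_row_id is not None and not f.file_name.endswith('.blob')
    }
    assert row_counts == {0: 100}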
@@ -91,6 +100,11 @@ def update_columns(self, data: pa.Table, column_names: List[str]) -> List:
             if col_name not in self.table.field_names:
                 raise ValueError(f"Column {col_name} not found in table schema")
 
+        # Validate data row count matches total row count
+        if data.num_rows != self.total_row_count:
+            raise ValueError(
+                f"Input data row count ({data.num_rows}) does not match table total row count ({self.total_row_count})")
+
         # Sort data by _ROW_ID column
         sorted_data = data.sort_by([(SpecialFields.ROW_ID.name, "ascending")])
 
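A quick illustration of this guard (the 200-row total is hypothetical): an input batch whose row count disagrees with the table total is rejected before any sorting or writing happens.

    import pyarrow as pa

    total_row_count = 200                           # hypothetical, from the loaded file metadata
    data = pa.table({"_ROW_ID": list(range(150))})  # caller supplies only 150 rows
    if data.num_rows != total_row_count:            # same condition as the new guard
        print(f"rejected: {data.num_rows} rows supplied, {total_row_count} expected")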
@@ -106,6 +120,12 @@ def _calculate_first_row_id(self, data: pa.Table) -> pa.Table:
         """Calculate _first_row_id for each row based on _ROW_ID."""
         row_ids = data[SpecialFields.ROW_ID.name].to_pylist()
 
+        # Validate that row_ids are exactly the consecutive integers 0..n-1
+        expected_row_ids = list(range(len(row_ids)))
+        if row_ids != expected_row_ids:
+            raise ValueError(f"Row IDs are not consecutive integers starting from 0. "
+                             f"Expected: {expected_row_ids}")
+
         # Calculate first_row_id for each row_id
         first_row_id_values = []
         for row_id in row_ids:
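Note that the check is stricter than plain monotonicity: after the sort in update_columns, the _ROW_ID column must be exactly 0..n-1, so a gap fails even though the sequence is still increasing. For example:

    row_ids = [0, 1, 2, 4]                        # sorted, but row id 3 is missing
    expected_row_ids = list(range(len(row_ids)))  # [0, 1, 2, 3]
    assert row_ids != expected_row_ids            # the new check would raise ValueError here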
@@ -155,6 +175,12 @@ def _write_group(self, partition: GenericRow, first_row_id: int,
                      data: pa.Table, column_names: List[str]):
         """Write a group of data with the same first_row_id."""
 
+        # Validate data row count matches the first_row_id's recorded row count
+        expected_row_count = self.first_row_id_to_row_count_map.get(first_row_id, 0)
+        if data.num_rows != expected_row_count:
+            raise ValueError(
+                f"Data row count ({data.num_rows}) does not match expected row count ({expected_row_count}) for first_row_id {first_row_id}")
+
         # Create a file store write for this partition
         file_store_write = FileStoreWrite(self.table, self.commit_user)
 
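And a last sketch of the per-group guard (numbers hypothetical): each group handed to _write_group must carry exactly as many rows as the metadata recorded for that first_row_id, with 0 as the fallback for an unknown id.

    first_row_id_to_row_count_map = {0: 100, 100: 100}   # hypothetical metadata
    first_row_id, num_rows = 100, 90                     # a 90-row group arrives
    expected = first_row_id_to_row_count_map.get(first_row_id, 0)
    if num_rows != expected:
        print(f"rejected group {first_row_id}: {num_rows} rows, {expected} expected")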