Commit bc942cd

[python] Refactor update by row id to TableUpdate

1 parent 6fe570b

6 files changed: +110 -202 lines changed

docs/content/program-api/python-api.md

Lines changed: 13 additions & 115 deletions
````diff
@@ -213,9 +213,12 @@ write_builder = table.new_batch_write_builder().overwrite()
 write_builder = table.new_batch_write_builder().overwrite({'dt': '2024-01-01'})
 ```

-### Write partial columns
+### Update columns
+
+You can use `TableUpdate.update_by_arrow_with_row_id` to update columns of a data evolution table.

-when enable data-evolution, you can write partial columns to table:
+The input data must include the `_ROW_ID` column. The update operation automatically sorts the input, matches each `_ROW_ID` to
+its corresponding `first_row_id`, then groups rows with the same `first_row_id` and writes each group to a separate file.

 ```python
 simple_pa_schema = pa.schema([
@@ -240,129 +243,24 @@ table_commit.commit(table_write.prepare_commit())
 table_write.close()
 table_commit.close()

-# write partial columns
-table_write = write_builder.new_write().with_write_type(['f0'])
-table_commit = write_builder.new_commit()
-data2 = pa.Table.from_pydict({
-    'f0': [3, 4],
-}, schema=pa.schema([
-    ('f0', pa.int8()),
-]))
-table_write.write_arrow(data2)
-cmts = table_write.prepare_commit()
-
-# assign first row id
-cmts[0].new_files[0].first_row_id = 0
-table_commit.commit(cmts)
-table_write.close()
-table_commit.close()
-```
-
-Paimon data-evolution table use `first_row_id` to split files, when write partial columns,
-you should split data into multiple parts by rows, and assign `first_row_id` for each file before commit
-, or it may cause some fatal error during table reads.
-
-For example, in the following code, `write-1` will generate a file with `first_row_id=0` which contains 2 rows,
-and `write-2` will generate a file with `first_row_id=2` which also contains 2 rows. Then, if we update column `f0` and
-do not split data into multiple parts by rows, the generated file would have `first_row_id=0` and contains 4 rows, when reading
-this table, it will cause a fatal error.
-
-```python
-table = catalog.get_table('default.test_row_tracking')
-
-# write-1
-write_builder = table.new_batch_write_builder()
-table_write = write_builder.new_write()
-table_commit = write_builder.new_commit()
-expect_data = pa.Table.from_pydict({
-    'f0': [-1, 2],
-    'f1': [-1001, 1002]
-}, schema=simple_pa_schema)
-table_write.write_arrow(expect_data)
-table_commit.commit(table_write.prepare_commit())
-table_write.close()
-table_commit.close()
-
-# write-2
-table_write = write_builder.new_write()
-table_commit = write_builder.new_commit()
-expect_data = pa.Table.from_pydict({
-    'f0': [3, 4],
-    'f1': [1003, 1004]
-}, schema=simple_pa_schema)
-table_write.write_arrow(expect_data)
-table_commit.commit(table_write.prepare_commit())
-table_write.close()
-table_commit.close()
-
-# write partial columns
-table_write = write_builder.new_write().with_write_type(['f0'])
-table_commit = write_builder.new_commit()
-data2 = pa.Table.from_pydict({
-    'f0': [5, 6, 7, 8],
-}, schema=pa.schema([
-    ('f0', pa.int8()),
-]))
-table_write.write_arrow(data2)
-cmts = table_write.prepare_commit()
-cmts[0].new_files[0].first_row_id = 0
-table_commit.commit(cmts)
-table_write.close()
-table_commit.close()
-
-read_builder = table.new_read_builder()
-table_scan = read_builder.new_scan()
-table_read = read_builder.new_read()
-
-# a fatal error will be thrown
-actual_data = table_read.to_arrow(table_scan.plan().splits())
-```
-
-### Update columns
-
-Handle file `first_row_id` manually is inconvenient and error-prone. If you don't want to do this, you can enable `update_columns_by_row_id`
-when create `WriteBuilder` and set write type for `TableWrite`, then you can write partial columns without handling file `first_row_id`.
-The input data should include the `_ROW_ID` column, writing operation will automatically sort and match each `_ROW_ID` to
-its corresponding `first_row_id`, then groups rows with the same `first_row_id` and writes them to a separate file.
-
-```python
-table = catalog.get_table('default.test_row_tracking')
-
-# write-1
-# same as above
-
-# write-2
-# same as above
-
 # update partial columns
-write_builder = table.new_batch_write_builder().update_columns_by_row_id()
-table_write = write_builder.new_write().with_write_type(['f0'])
+write_builder = table.new_batch_write_builder()
+table_update = write_builder.new_update().with_update_type(['f0'])
 table_commit = write_builder.new_commit()
 data2 = pa.Table.from_pydict({
-    '_ROW_ID': [0, 1, 2, 3],
-    'f0': [5, 6, 7, 8],
+    '_ROW_ID': [0, 1],
+    'f0': [5, 6],
 }, schema=pa.schema([
     ('_ROW_ID', pa.int64()),
     ('f0', pa.int8()),
 ]))
-table_write.write_arrow(data2)
-cmts = table_write.prepare_commit()
+cmts = table_update.update_by_arrow_with_row_id(data2)
 table_commit.commit(cmts)
-table_write.close()
 table_commit.close()

-read_builder = table.new_read_builder()
-table_scan = read_builder.new_scan()
-table_read = read_builder.new_read()
-actual_data = table_read.to_arrow(table_scan.plan().splits())
-expect_data = pa.Table.from_pydict({
-    'f0': [5, 6, 7, 8],
-    'f1': [-1001, 1002, 1003, 1004]
-}, schema=pa.schema([
-    ('f0', pa.int8()),
-    ('f1', pa.int16()),
-]))
-self.assertEqual(actual_data, expect_data)
+# content should be:
+# 'f0': [5, 6],
+# 'f1': [-1001, 1002]
 ```

 ## Batch Read
````
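As a side note, the `_ROW_ID` to `first_row_id` grouping the new docs describe can be pictured with a small standalone sketch. This is illustrative only, not pypaimon's internal implementation; `group_rows_by_first_row_id` and its arguments are hypothetical names, with each data file identified by the `first_row_id` it starts at:

```python
import bisect

def group_rows_by_first_row_id(row_ids, first_row_ids):
    """Group sorted _ROW_ID values by the file range each one falls into.

    first_row_ids holds the starting row id of every existing file, sorted
    ascending; returns {first_row_id: [_ROW_IDs updated in that file]}.
    """
    groups = {}
    for row_id in sorted(row_ids):  # the update sorts the input rows first
        # The owning file is the one with the greatest first_row_id <= row_id.
        idx = bisect.bisect_right(first_row_ids, row_id) - 1
        if idx < 0:
            raise ValueError(f"_ROW_ID {row_id} precedes all file ranges")
        groups.setdefault(first_row_ids[idx], []).append(row_id)
    return groups

# Two existing files holding rows 0-1 and 2-3: each group becomes a separate
# output file, so an update never straddles a first_row_id boundary.
print(group_rows_by_first_row_id([3, 0, 2, 1], [0, 2]))
# -> {0: [0, 1], 2: [2, 3]}
```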

paimon-python/pypaimon/tests/partial_columns_write_test.py renamed to paimon-python/pypaimon/tests/table_update_test.py

Lines changed: 33 additions & 50 deletions
````diff
@@ -25,7 +25,7 @@
 from pypaimon import CatalogFactory, Schema


-class PartialColumnsWriteTest(unittest.TestCase):
+class TableUpdateTest(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.tempdir = tempfile.mkdtemp()
@@ -101,7 +101,7 @@ def test_update_existing_column(self):
         # Create table with initial data
         table = self._create_table()

-        # Create data evolution writer using BatchTableWrite
+        # Create data evolution table update
         write_builder = table.new_batch_write_builder()
         batch_write = write_builder.new_write()

@@ -112,10 +112,9 @@ def test_update_existing_column(self):
         })

         # Update the age column
-        write_builder = table.new_batch_write_builder().update_columns_by_row_id()
-        batch_write = write_builder.new_write().with_write_type(['age'])
-        batch_write.write_arrow(update_data)
-        commit_messages = batch_write.prepare_commit()
+        write_builder = table.new_batch_write_builder()
+        table_update = write_builder.new_update().with_update_type(['age'])
+        commit_messages = table_update.update_by_arrow_with_row_id(update_data)

         # Commit the changes
         table_commit = write_builder.new_commit()
@@ -139,7 +138,7 @@ def test_update_multiple_columns(self):
         # Create table with initial data
         table = self._create_table()

-        # Create data evolution writer using BatchTableWrite
+        # Create data evolution table update
         write_builder = table.new_batch_write_builder()
         batch_write = write_builder.new_write()

@@ -151,10 +150,9 @@ def test_update_multiple_columns(self):
         })

         # Update multiple columns
-        write_builder = table.new_batch_write_builder().update_columns_by_row_id()
-        batch_write = write_builder.new_write().with_write_type(['age', 'city'])
-        batch_write.write_arrow(update_data)
-        commit_messages = batch_write.prepare_commit()
+        write_builder = table.new_batch_write_builder()
+        table_update = write_builder.new_update().with_update_type(['age', 'city'])
+        commit_messages = table_update.update_by_arrow_with_row_id(update_data)

         # Commit the changes
         table_commit = write_builder.new_commit()
@@ -182,10 +180,6 @@ def test_nonexistent_column(self):
         """Test that updating a non-existent column raises an error."""
         table = self._create_table()

-        # Create data evolution writer using BatchTableWrite
-        write_builder = table.new_batch_write_builder()
-        batch_write = write_builder.new_write()
-
         # Try to update a non-existent column
         update_data = pa.Table.from_pydict({
             '_ROW_ID': [0, 1, 2, 3, 4],
@@ -194,18 +188,17 @@ def test_nonexistent_column(self):

         # Should raise ValueError
         with self.assertRaises(ValueError) as context:
-            write_builder = table.new_batch_write_builder().update_columns_by_row_id()
-            batch_write = write_builder.new_write().with_write_type(['nonexistent_column'])
-            batch_write.write_arrow(update_data)
+            write_builder = table.new_batch_write_builder()
+            table_update = write_builder.new_update().with_update_type(['nonexistent_column'])
+            table_update.update_by_arrow_with_row_id(update_data)

         self.assertIn('not in table schema', str(context.exception))
-        batch_write.close()

     def test_missing_row_id_column(self):
         """Test that missing row_id column raises an error."""
         table = self._create_table()

-        # Create data evolution writer using BatchTableWrite
+        # Create data evolution table update
         write_builder = table.new_batch_write_builder()
         batch_write = write_builder.new_write()

@@ -216,9 +209,9 @@ def test_missing_row_id_column(self):

         # Should raise ValueError
         with self.assertRaises(ValueError) as context:
-            write_builder = table.new_batch_write_builder().update_columns_by_row_id()
-            batch_write = write_builder.new_write().with_write_type(['age'])
-            batch_write.write_arrow(update_data)
+            write_builder = table.new_batch_write_builder()
+            table_update = write_builder.new_update().with_update_type(['age'])
+            table_update.update_by_arrow_with_row_id(update_data)

         self.assertIn("Input data must contain _ROW_ID column", str(context.exception))
         batch_write.close()
@@ -247,24 +240,22 @@ def test_partitioned_table_update(self):
         table_write.close()
         table_commit.close()

-        # Create data evolution writer using BatchTableWrite
-        write_builder = table.new_batch_write_builder().update_columns_by_row_id()
-        batch_write = write_builder.new_write().with_write_type(['age'])
+        # Create data evolution table update
+        write_builder = table.new_batch_write_builder()
+        table_update = write_builder.new_update().with_update_type(['age'])

         # Update ages
         update_data = pa.Table.from_pydict({
             '_ROW_ID': [1, 0, 2, 3, 4],
             'age': [31, 26, 36, 41, 46]
         })

-        batch_write.write_arrow(update_data)
-        commit_messages = batch_write.prepare_commit()
+        commit_messages = table_update.update_by_arrow_with_row_id(update_data)

         # Commit the changes
         table_commit = write_builder.new_commit()
         table_commit.commit(commit_messages)
         table_commit.close()
-        batch_write.close()

         # Verify the updated data
         read_builder = table.new_read_builder()
@@ -283,16 +274,15 @@ def test_multiple_calls(self):
         table = self._create_table()

         # First update: Update age column
-        write_builder = table.new_batch_write_builder().update_columns_by_row_id()
-        batch_write = write_builder.new_write().with_write_type(['age'])
+        write_builder = table.new_batch_write_builder()
+        table_update = write_builder.new_update().with_update_type(['age'])

         update_age_data = pa.Table.from_pydict({
             '_ROW_ID': [1, 0, 2, 3, 4],
             'age': [31, 26, 36, 41, 46]
         })

-        batch_write.write_arrow(update_age_data)
-        commit_messages = batch_write.prepare_commit()
+        commit_messages = table_update.update_by_arrow_with_row_id(update_age_data)
         table_commit = write_builder.new_commit()
         table_commit.commit(commit_messages)
         table_commit.close()
@@ -302,17 +292,12 @@ def test_multiple_calls(self):
             '_ROW_ID': [1, 0, 2, 3, 4],
             'city': ['Los Angeles', 'New York', 'Chicago', 'Phoenix', 'Houston']
         })
-        write_builder = table.new_batch_write_builder().update_columns_by_row_id()
-        batch_write = write_builder.new_write().with_write_type(['city'])
-        batch_write.write_arrow(update_city_data)
-        commit_messages = batch_write.prepare_commit()
+        table_update.with_update_type(['city'])
+        commit_messages = table_update.update_by_arrow_with_row_id(update_city_data)
         table_commit = write_builder.new_commit()
         table_commit.commit(commit_messages)
         table_commit.close()

-        # Close the batch write
-        batch_write.close()
-
         # Verify both columns were updated correctly
         read_builder = table.new_read_builder()
         table_read = read_builder.new_read()
@@ -333,9 +318,9 @@ def test_wrong_total_row_count(self):
         # Create table with initial data
         table = self._create_table()

-        # Create data evolution writer using BatchTableWrite
-        write_builder = table.new_batch_write_builder().update_columns_by_row_id()
-        batch_write = write_builder.new_write().with_write_type(['age'])
+        # Create data evolution table update
+        write_builder = table.new_batch_write_builder()
+        table_update = write_builder.new_update().with_update_type(['age'])

         # Prepare update data with wrong row count (only 3 rows instead of 5)
         update_data = pa.Table.from_pydict({
@@ -345,19 +330,18 @@ def test_wrong_total_row_count(self):

         # Should raise ValueError for total row count mismatch
         with self.assertRaises(ValueError) as context:
-            batch_write.write_arrow(update_data)
+            table_update.update_by_arrow_with_row_id(update_data)

         self.assertIn("does not match table total row count", str(context.exception))
-        batch_write.close()

     def test_wrong_first_row_id_row_count(self):
         """Test that wrong row count for a first_row_id raises an error."""
         # Create table with initial data
         table = self._create_table()

-        # Create data evolution writer using BatchTableWrite
-        write_builder = table.new_batch_write_builder().update_columns_by_row_id()
-        batch_write = write_builder.new_write().with_write_type(['age'])
+        # Create data evolution table update
+        write_builder = table.new_batch_write_builder()
+        table_update = write_builder.new_update().with_update_type(['age'])

         # Prepare update data with duplicate row_id (violates monotonically increasing)
         update_data = pa.Table.from_pydict({
@@ -367,10 +351,9 @@ def test_wrong_first_row_id_row_count(self):

         # Should raise ValueError for row ID validation
         with self.assertRaises(ValueError) as context:
-            batch_write.write_arrow(update_data)
+            table_update.update_by_arrow_with_row_id(update_data)

         self.assertIn("Row IDs are not monotonically increasing", str(context.exception))
-        batch_write.close()

 if __name__ == '__main__':
     unittest.main()
````
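Taken together, the tests reduce to the condensed usage sketch below. It assumes the filesystem catalog setup from pypaimon's other examples (`CatalogFactory.create` with a `warehouse` option, here a placeholder path) and a pre-existing data evolution table; the tests' `_create_table` helper is not part of this diff.

```python
import pyarrow as pa

from pypaimon import CatalogFactory

# Placeholder warehouse path; 'default.test_row_tracking' is assumed to
# already exist as a data evolution table with row tracking enabled.
catalog = CatalogFactory.create({'warehouse': '/tmp/warehouse'})
table = catalog.get_table('default.test_row_tracking')

write_builder = table.new_batch_write_builder()
table_update = write_builder.new_update().with_update_type(['f0'])

# The input must carry _ROW_ID alongside the columns being updated.
data = pa.Table.from_pydict({
    '_ROW_ID': [0, 1],
    'f0': [5, 6],
}, schema=pa.schema([
    ('_ROW_ID', pa.int64()),
    ('f0', pa.int8()),
]))

# Sorts by _ROW_ID, groups rows by first_row_id, returns commit messages.
commit_messages = table_update.update_by_arrow_with_row_id(data)
table_commit = write_builder.new_commit()
table_commit.commit(commit_messages)
table_commit.close()
```

Note that validation happens inside `update_by_arrow_with_row_id` itself, before any commit message is produced: a missing `_ROW_ID` column, an unknown update column, a total row count that does not match the table, or non-monotonic row ids each raise `ValueError`, as the tests above assert.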
