Skip to content

Commit f72e363

Browse files
authored
Refactor GlueCatalog _commit_table (#653)
* refactor _commit_table
* small refactor
* extract common logic of _commit_table
* reformat
1 parent f2acf1d commit f72e363

File tree

4 files changed

+82
-50
lines changed

4 files changed

+82
-50
lines changed

pyiceberg/catalog/__init__.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
CreateTableTransaction,
4949
StagedTable,
5050
Table,
51+
update_table_metadata,
5152
)
5253
from pyiceberg.table.metadata import TableMetadata, TableMetadataV1, new_table_metadata
5354
from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
@@ -728,6 +729,27 @@ def _create_staged_table(
728729
catalog=self,
729730
)
730731

732+
def _update_and_stage_table(self, current_table: Optional[Table], table_request: CommitTableRequest) -> StagedTable:
    """Validate a commit request, apply its updates, and stage the resulting table.

    Args:
        current_table: The table as it currently exists in the catalog, or
            None when the commit is creating a new table.
        table_request: The commit request carrying the identifier, the
            requirements to validate, and the metadata updates to apply.

    Returns:
        A StagedTable holding the updated metadata and the location the next
        metadata file is to be written to.
    """
    # Requirements are checked against the existing metadata; when there is no
    # existing table they are validated against None.
    existing_metadata = current_table.metadata if current_table else None
    for requirement in table_request.requirements:
        requirement.validate(existing_metadata)

    # A new table starts from empty metadata.
    if current_table:
        base_metadata = current_table.metadata
    else:
        base_metadata = self._empty_table_metadata()

    # Validation is only enforced when no current table was supplied.
    staged_metadata = update_table_metadata(
        base_metadata=base_metadata,
        updates=table_request.updates,
        enforce_validation=current_table is None,
    )

    # An existing table advances the version parsed from its current metadata
    # location by one; a new table starts at version 0.
    if current_table:
        next_version = self._parse_metadata_version(current_table.metadata_location) + 1
    else:
        next_version = 0
    staged_location = self._get_metadata_location(staged_metadata.location, next_version)

    request_identifier = table_request.identifier
    return StagedTable(
        identifier=tuple(request_identifier.namespace.root + [request_identifier.name]),
        metadata=staged_metadata,
        metadata_location=staged_location,
        io=self._load_file_io(properties=staged_metadata.properties, location=staged_location),
        catalog=self,
    )
752+
731753
def _get_updated_props_and_update_summary(
732754
self, current_properties: Properties, removals: Optional[Set[str]], updates: Properties
733755
) -> Tuple[PropertiesUpdateSummary, Properties]:

pyiceberg/catalog/glue.py

Lines changed: 35 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@
5858
NoSuchTableError,
5959
TableAlreadyExistsError,
6060
)
61-
from pyiceberg.io import load_file_io
6261
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
6362
from pyiceberg.schema import Schema, SchemaVisitor, visit
6463
from pyiceberg.serializers import FromInputFile
@@ -67,7 +66,6 @@
6766
CommitTableResponse,
6867
PropertyUtil,
6968
Table,
70-
update_table_metadata,
7169
)
7270
from pyiceberg.table.metadata import TableMetadata
7371
from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
@@ -321,7 +319,7 @@ def _convert_glue_to_iceberg(self, glue_table: TableTypeDef) -> Table:
321319
)
322320
metadata_location = properties[METADATA_LOCATION]
323321

324-
io = load_file_io(properties=self.properties, location=metadata_location)
322+
io = self._load_file_io(location=metadata_location)
325323
file = io.new_input(metadata_location)
326324
metadata = FromInputFile.table_metadata(file)
327325
return Table(
@@ -439,71 +437,64 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons
439437
)
440438
database_name, table_name = self.identifier_to_database_and_table(identifier_tuple)
441439

440+
current_glue_table: Optional[TableTypeDef]
441+
glue_table_version_id: Optional[str]
442+
current_table: Optional[Table]
442443
try:
443444
current_glue_table = self._get_glue_table(database_name=database_name, table_name=table_name)
444-
# Update the table
445445
glue_table_version_id = current_glue_table.get("VersionId")
446+
current_table = self._convert_glue_to_iceberg(glue_table=current_glue_table)
447+
except NoSuchTableError:
448+
current_glue_table = None
449+
glue_table_version_id = None
450+
current_table = None
451+
452+
updated_staged_table = self._update_and_stage_table(current_table, table_request)
453+
if current_table and updated_staged_table.metadata == current_table.metadata:
454+
# no changes, do nothing
455+
return CommitTableResponse(metadata=current_table.metadata, metadata_location=current_table.metadata_location)
456+
self._write_metadata(
457+
metadata=updated_staged_table.metadata,
458+
io=updated_staged_table.io,
459+
metadata_path=updated_staged_table.metadata_location,
460+
)
461+
462+
if current_table:
463+
# table exists, update the table
446464
if not glue_table_version_id:
447465
raise CommitFailedException(
448466
f"Cannot commit {database_name}.{table_name} because Glue table version id is missing"
449467
)
450-
current_table = self._convert_glue_to_iceberg(glue_table=current_glue_table)
451-
base_metadata = current_table.metadata
452-
453-
# Validate the update requirements
454-
for requirement in table_request.requirements:
455-
requirement.validate(base_metadata)
456-
457-
updated_metadata = update_table_metadata(base_metadata=base_metadata, updates=table_request.updates)
458-
if updated_metadata == base_metadata:
459-
# no changes, do nothing
460-
return CommitTableResponse(metadata=base_metadata, metadata_location=current_table.metadata_location)
461-
462-
# write new metadata
463-
new_metadata_version = self._parse_metadata_version(current_table.metadata_location) + 1
464-
new_metadata_location = self._get_metadata_location(current_table.metadata.location, new_metadata_version)
465-
self._write_metadata(updated_metadata, current_table.io, new_metadata_location)
466468

469+
# Pass `version_id` to implement optimistic locking: it ensures updates are rejected if concurrent
470+
# modifications occur. See more details at https://iceberg.apache.org/docs/latest/aws/#optimistic-locking
467471
update_table_input = _construct_table_input(
468472
table_name=table_name,
469-
metadata_location=new_metadata_location,
470-
properties=current_table.properties,
471-
metadata=updated_metadata,
473+
metadata_location=updated_staged_table.metadata_location,
474+
properties=updated_staged_table.properties,
475+
metadata=updated_staged_table.metadata,
472476
glue_table=current_glue_table,
473477
prev_metadata_location=current_table.metadata_location,
474478
)
475-
476-
# Pass `version_id` to implement optimistic locking: it ensures updates are rejected if concurrent
477-
# modifications occur. See more details at https://iceberg.apache.org/docs/latest/aws/#optimistic-locking
478479
self._update_glue_table(
479480
database_name=database_name,
480481
table_name=table_name,
481482
table_input=update_table_input,
482483
version_id=glue_table_version_id,
483484
)
484-
485-
return CommitTableResponse(metadata=updated_metadata, metadata_location=new_metadata_location)
486-
except NoSuchTableError:
487-
# Create the table
488-
updated_metadata = update_table_metadata(
489-
base_metadata=self._empty_table_metadata(), updates=table_request.updates, enforce_validation=True
490-
)
491-
new_metadata_version = 0
492-
new_metadata_location = self._get_metadata_location(updated_metadata.location, new_metadata_version)
493-
self._write_metadata(
494-
updated_metadata, self._load_file_io(updated_metadata.properties, new_metadata_location), new_metadata_location
495-
)
496-
485+
else:
486+
# table does not exist, create the table
497487
create_table_input = _construct_table_input(
498488
table_name=table_name,
499-
metadata_location=new_metadata_location,
500-
properties=updated_metadata.properties,
501-
metadata=updated_metadata,
489+
metadata_location=updated_staged_table.metadata_location,
490+
properties=updated_staged_table.properties,
491+
metadata=updated_staged_table.metadata,
502492
)
503-
504493
self._create_glue_table(database_name=database_name, table_name=table_name, table_input=create_table_input)
505494

506-
return CommitTableResponse(metadata=updated_metadata, metadata_location=new_metadata_location)
495+
return CommitTableResponse(
496+
metadata=updated_staged_table.metadata, metadata_location=updated_staged_table.metadata_location
497+
)
507498

508499
def load_table(self, identifier: Union[str, Identifier]) -> Table:
509500
"""Load the table's metadata and returns the table instance.

tests/catalog/integration_test_glue.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -462,21 +462,29 @@ def test_commit_table_update_schema(
462462
]
463463

464464

465-
def test_commit_table_properties(test_catalog: Catalog, table_schema_nested: Schema, database_name: str, table_name: str) -> None:
465+
def test_commit_table_properties(
466+
test_catalog: Catalog, glue: boto3.client, table_schema_nested: Schema, database_name: str, table_name: str
467+
) -> None:
466468
identifier = (database_name, table_name)
467469
test_catalog.create_namespace(namespace=database_name)
468470
table = test_catalog.create_table(identifier=identifier, schema=table_schema_nested, properties={"test_a": "test_a"})
469471

470472
assert MetastoreCatalog._parse_metadata_version(table.metadata_location) == 0
471473

472474
transaction = table.transaction()
473-
transaction.set_properties(test_a="test_aa", test_b="test_b", test_c="test_c")
475+
transaction.set_properties(test_a="test_aa", test_b="test_b", test_c="test_c", Description="test_description")
474476
transaction.remove_properties("test_b")
475477
transaction.commit_transaction()
476478

477479
updated_table_metadata = table.metadata
478480
assert MetastoreCatalog._parse_metadata_version(table.metadata_location) == 1
479-
assert updated_table_metadata.properties == {"test_a": "test_aa", "test_c": "test_c"}
481+
assert updated_table_metadata.properties == {'Description': 'test_description', "test_a": "test_aa", "test_c": "test_c"}
482+
483+
table_info = glue.get_table(
484+
DatabaseName=database_name,
485+
Name=table_name,
486+
)
487+
assert table_info["Table"]["Description"] == "test_description"
480488

481489

482490
@pytest.mark.parametrize("format_version", [1, 2])

tests/catalog/test_glue.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -677,7 +677,12 @@ def test_commit_table_update_schema(
677677

678678
@mock_aws
679679
def test_commit_table_properties(
680-
_bucket_initialize: None, moto_endpoint_url: str, table_schema_nested: Schema, database_name: str, table_name: str
680+
_glue: boto3.client,
681+
_bucket_initialize: None,
682+
moto_endpoint_url: str,
683+
table_schema_nested: Schema,
684+
database_name: str,
685+
table_name: str,
681686
) -> None:
682687
catalog_name = "glue"
683688
identifier = (database_name, table_name)
@@ -688,13 +693,19 @@ def test_commit_table_properties(
688693
assert test_catalog._parse_metadata_version(table.metadata_location) == 0
689694

690695
transaction = table.transaction()
691-
transaction.set_properties(test_a="test_aa", test_b="test_b", test_c="test_c")
696+
transaction.set_properties(test_a="test_aa", test_b="test_b", test_c="test_c", Description="test_description")
692697
transaction.remove_properties("test_b")
693698
transaction.commit_transaction()
694699

695700
updated_table_metadata = table.metadata
696701
assert test_catalog._parse_metadata_version(table.metadata_location) == 1
697-
assert updated_table_metadata.properties == {"test_a": "test_aa", "test_c": "test_c"}
702+
assert updated_table_metadata.properties == {'Description': 'test_description', "test_a": "test_aa", "test_c": "test_c"}
703+
704+
table_info = _glue.get_table(
705+
DatabaseName=database_name,
706+
Name=table_name,
707+
)
708+
assert table_info["Table"]["Description"] == "test_description"
698709

699710

700711
@mock_aws

0 commit comments

Comments
 (0)