Skip to content

Commit 41ff734

Browse files
authored
change catalog default warehouse location to not use hive-style warehouse location (#2059)
<!-- Thanks for opening a pull request! --> <!-- In the case this PR will resolve an issue, please replace ${GITHUB_ISSUE_ID} below with the actual Github issue id. --> <!-- Closes #${GITHUB_ISSUE_ID} --> Closes #2052 # Rationale for this change Aligns catalog behavior with the java reference implementation. [HiveCatalog](https://github.com/apache/iceberg/blob/7537c3c3a2a6491abcf0c3ef58cc4d5dc6ac4bae/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveCatalog.java#L697-L736), [DynamoDbCatalog](https://github.com/apache/iceberg/blob/7537c3c3a2a6491abcf0c3ef58cc4d5dc6ac4bae/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbCatalog.java#L185), and [GlueCatalog](https://github.com/apache/iceberg/blob/7537c3c3a2a6491abcf0c3ef58cc4d5dc6ac4bae/aws/src/main/java/org/apache/iceberg/aws/glue/GlueCatalog.java#L291) all use `.db` suffix in warehouse location [JdbcCatalog](https://github.com/apache/iceberg/blob/7537c3c3a2a6491abcf0c3ef58cc4d5dc6ac4bae/core/src/main/java/org/apache/iceberg/jdbc/JdbcCatalog.java#L268), [HadoopCatalog](https://github.com/apache/iceberg/blob/7537c3c3a2a6491abcf0c3ef58cc4d5dc6ac4bae/core/src/main/java/org/apache/iceberg/hadoop/HadoopCatalog.java#L233-L245), and [InMemoryCatalog](https://github.com/apache/iceberg/blob/7537c3c3a2a6491abcf0c3ef58cc4d5dc6ac4bae/core/src/main/java/org/apache/iceberg/inmemory/InMemoryCatalog.java#L106-L117) do not use `.db` suffix in warehouse location # Are these changes tested? Yes tests for sql catalog are modified to remove `.db` # Are there any user-facing changes? <!-- In the case of user-facing changes, please add the changelog label. -->
1 parent 4f0d7ef commit 41ff734

File tree

6 files changed

+43
-17
lines changed

6 files changed

+43
-17
lines changed

pyiceberg/catalog/__init__.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -927,6 +927,20 @@ def _resolve_table_location(self, location: Optional[str], database_name: str, t
927927
return location.rstrip("/")
928928

929929
def _get_default_warehouse_location(self, database_name: str, table_name: str) -> str:
930+
"""Return the default warehouse location using the convention of `warehousePath/databaseName/tableName`."""
931+
database_properties = self.load_namespace_properties(database_name)
932+
if database_location := database_properties.get(LOCATION):
933+
database_location = database_location.rstrip("/")
934+
return f"{database_location}/{table_name}"
935+
936+
if warehouse_path := self.properties.get(WAREHOUSE_LOCATION):
937+
warehouse_path = warehouse_path.rstrip("/")
938+
return f"{warehouse_path}/{database_name}/{table_name}"
939+
940+
raise ValueError("No default path is set, please specify a location when creating a table")
941+
942+
def _get_hive_style_warehouse_location(self, database_name: str, table_name: str) -> str:
943+
"""Return the default warehouse location following the Hive convention of `warehousePath/databaseName.db/tableName`."""
930944
database_properties = self.load_namespace_properties(database_name)
931945
if database_location := database_properties.get(LOCATION):
932946
database_location = database_location.rstrip("/")

pyiceberg/catalog/dynamodb.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -664,6 +664,10 @@ def _convert_dynamo_table_item_to_iceberg_table(self, dynamo_table_item: Dict[st
664664
catalog=self,
665665
)
666666

667+
def _get_default_warehouse_location(self, database_name: str, table_name: str) -> str:
668+
"""Override the default warehouse location to follow Hive-style conventions."""
669+
return self._get_hive_style_warehouse_location(database_name, table_name)
670+
667671

668672
def _get_create_table_item(database_name: str, table_name: str, properties: Properties, metadata_location: str) -> Dict[str, Any]:
669673
current_timestamp_ms = str(round(time() * 1000))

pyiceberg/catalog/glue.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -821,3 +821,7 @@ def view_exists(self, identifier: Union[str, Identifier]) -> bool:
821821
@staticmethod
822822
def __is_iceberg_table(table: "TableTypeDef") -> bool:
823823
return table.get("Parameters", {}).get(TABLE_TYPE, "").lower() == ICEBERG
824+
825+
def _get_default_warehouse_location(self, database_name: str, table_name: str) -> str:
826+
"""Override the default warehouse location to follow Hive-style conventions."""
827+
return self._get_hive_style_warehouse_location(database_name, table_name)

pyiceberg/catalog/hive.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -798,3 +798,7 @@ def update_namespace_properties(
798798

799799
def drop_view(self, identifier: Union[str, Identifier]) -> None:
800800
raise NotImplementedError
801+
802+
def _get_default_warehouse_location(self, database_name: str, table_name: str) -> str:
803+
"""Override the default warehouse location to follow Hive-style conventions."""
804+
return self._get_hive_style_warehouse_location(database_name, table_name)

tests/catalog/test_sql.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -72,21 +72,21 @@ def catalog_name() -> str:
7272

7373
@pytest.fixture(name="random_table_identifier")
7474
def fixture_random_table_identifier(warehouse: Path, database_name: str, table_name: str) -> Identifier:
75-
os.makedirs(f"{warehouse}/{database_name}.db/{table_name}/metadata/", exist_ok=True)
75+
os.makedirs(f"{warehouse}/{database_name}/{table_name}/metadata/", exist_ok=True)
7676
return database_name, table_name
7777

7878

7979
@pytest.fixture(name="another_random_table_identifier")
8080
def fixture_another_random_table_identifier(warehouse: Path, database_name: str, table_name: str) -> Identifier:
8181
database_name = database_name + "_new"
8282
table_name = table_name + "_new"
83-
os.makedirs(f"{warehouse}/{database_name}.db/{table_name}/metadata/", exist_ok=True)
83+
os.makedirs(f"{warehouse}/{database_name}/{table_name}/metadata/", exist_ok=True)
8484
return database_name, table_name
8585

8686

8787
@pytest.fixture(name="random_hierarchical_identifier")
8888
def fixture_random_hierarchical_identifier(warehouse: Path, hierarchical_namespace_name: str, table_name: str) -> Identifier:
89-
os.makedirs(f"{warehouse}/{hierarchical_namespace_name}.db/{table_name}/metadata/", exist_ok=True)
89+
os.makedirs(f"{warehouse}/{hierarchical_namespace_name}/{table_name}/metadata/", exist_ok=True)
9090
return Catalog.identifier_to_tuple(".".join((hierarchical_namespace_name, table_name)))
9191

9292

@@ -96,7 +96,7 @@ def fixture_another_random_hierarchical_identifier(
9696
) -> Identifier:
9797
hierarchical_namespace_name = hierarchical_namespace_name + "_new"
9898
table_name = table_name + "_new"
99-
os.makedirs(f"{warehouse}/{hierarchical_namespace_name}.db/{table_name}/metadata/", exist_ok=True)
99+
os.makedirs(f"{warehouse}/{hierarchical_namespace_name}/{table_name}/metadata/", exist_ok=True)
100100
return Catalog.identifier_to_tuple(".".join((hierarchical_namespace_name, table_name)))
101101

102102

@@ -115,7 +115,7 @@ def catalog_memory(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog,
115115
@pytest.fixture(scope="module")
116116
def catalog_sqlite(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog, None, None]:
117117
props = {
118-
"uri": f"sqlite:////{warehouse}/sql-catalog.db",
118+
"uri": f"sqlite:////{warehouse}/sql-catalog",
119119
"warehouse": f"file://{warehouse}",
120120
}
121121
catalog = SqlCatalog(catalog_name, **props)
@@ -126,7 +126,7 @@ def catalog_sqlite(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog,
126126

127127
@pytest.fixture(scope="module")
128128
def catalog_uri(warehouse: Path) -> str:
129-
return f"sqlite:////{warehouse}/sql-catalog.db"
129+
return f"sqlite:////{warehouse}/sql-catalog"
130130

131131

132132
@pytest.fixture(scope="module")
@@ -137,7 +137,7 @@ def alchemy_engine(catalog_uri: str) -> Engine:
137137
@pytest.fixture(scope="module")
138138
def catalog_sqlite_without_rowcount(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog, None, None]:
139139
props = {
140-
"uri": f"sqlite:////{warehouse}/sql-catalog.db",
140+
"uri": f"sqlite:////{warehouse}/sql-catalog",
141141
"warehouse": f"file://{warehouse}",
142142
}
143143
catalog = SqlCatalog(catalog_name, **props)
@@ -150,7 +150,7 @@ def catalog_sqlite_without_rowcount(catalog_name: str, warehouse: Path) -> Gener
150150
@pytest.fixture(scope="module")
151151
def catalog_sqlite_fsspec(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog, None, None]:
152152
props = {
153-
"uri": f"sqlite:////{warehouse}/sql-catalog.db",
153+
"uri": f"sqlite:////{warehouse}/sql-catalog",
154154
"warehouse": f"file://{warehouse}",
155155
PY_IO_IMPL: FSSPEC_FILE_IO,
156156
}
@@ -176,7 +176,7 @@ def test_creation_with_echo_parameter(catalog_name: str, warehouse: Path) -> Non
176176

177177
for echo_param, expected_echo_value in test_cases:
178178
props = {
179-
"uri": f"sqlite:////{warehouse}/sql-catalog.db",
179+
"uri": f"sqlite:////{warehouse}/sql-catalog",
180180
"warehouse": f"file://{warehouse}",
181181
}
182182
# None is for default value
@@ -199,7 +199,7 @@ def test_creation_with_pool_pre_ping_parameter(catalog_name: str, warehouse: Pat
199199

200200
for pool_pre_ping_param, expected_pool_pre_ping_value in test_cases:
201201
props = {
202-
"uri": f"sqlite:////{warehouse}/sql-catalog.db",
202+
"uri": f"sqlite:////{warehouse}/sql-catalog",
203203
"warehouse": f"file://{warehouse}",
204204
}
205205
# None is for default value
@@ -219,7 +219,7 @@ def test_creation_from_impl(catalog_name: str, warehouse: Path) -> None:
219219
catalog_name,
220220
**{
221221
"py-catalog-impl": "pyiceberg.catalog.sql.SqlCatalog",
222-
"uri": f"sqlite:////{warehouse}/sql-catalog.db",
222+
"uri": f"sqlite:////{warehouse}/sql-catalog",
223223
"warehouse": f"file://{warehouse}",
224224
},
225225
),
@@ -493,7 +493,7 @@ def test_create_table_with_given_location_removes_trailing_slash(
493493
identifier_tuple = Catalog.identifier_to_tuple(table_identifier)
494494
namespace = Catalog.namespace_from(table_identifier)
495495
table_name = Catalog.table_name_from(identifier_tuple)
496-
location = f"file://{warehouse}/{catalog.name}.db/{table_name}-given"
496+
location = f"file://{warehouse}/{catalog.name}/{table_name}-given"
497497
catalog.create_namespace(namespace)
498498
catalog.create_table(table_identifier, table_schema_nested, location=f"{location}/")
499499
table = catalog.load_table(table_identifier)
@@ -1235,7 +1235,7 @@ def test_load_namespace_properties(catalog: SqlCatalog, namespace: str) -> None:
12351235
warehouse_location = "/test/location"
12361236
test_properties = {
12371237
"comment": "this is a test description",
1238-
"location": f"{warehouse_location}/{namespace}.db",
1238+
"location": f"{warehouse_location}/{namespace}",
12391239
"test_property1": "1",
12401240
"test_property2": "2",
12411241
"test_property3": "3",
@@ -1286,7 +1286,7 @@ def test_update_namespace_properties(catalog: SqlCatalog, namespace: str) -> Non
12861286
warehouse_location = "/test/location"
12871287
test_properties = {
12881288
"comment": "this is a test description",
1289-
"location": f"{warehouse_location}/{namespace}.db",
1289+
"location": f"{warehouse_location}/{namespace}",
12901290
"test_property1": "1",
12911291
"test_property2": "2",
12921292
"test_property3": "3",
@@ -1306,7 +1306,7 @@ def test_update_namespace_properties(catalog: SqlCatalog, namespace: str) -> Non
13061306
"comment": "updated test description",
13071307
"test_property4": "4",
13081308
"test_property5": "5",
1309-
"location": f"{warehouse_location}/{namespace}.db",
1309+
"location": f"{warehouse_location}/{namespace}",
13101310
}
13111311

13121312

tests/cli/test_console.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,7 @@ def test_location(catalog: InMemoryCatalog) -> None:
271271
runner = CliRunner()
272272
result = runner.invoke(run, ["location", "default.my_table"])
273273
assert result.exit_code == 0
274-
assert result.output == f"""{catalog._warehouse_location}/default.db/my_table\n"""
274+
assert result.output == f"""{catalog._warehouse_location}/default/my_table\n"""
275275

276276

277277
def test_location_does_not_exists(catalog: InMemoryCatalog) -> None:
@@ -700,7 +700,7 @@ def test_json_location(catalog: InMemoryCatalog) -> None:
700700
runner = CliRunner()
701701
result = runner.invoke(run, ["--output=json", "location", "default.my_table"])
702702
assert result.exit_code == 0
703-
assert result.output == f'"{catalog._warehouse_location}/default.db/my_table"\n'
703+
assert result.output == f'"{catalog._warehouse_location}/default/my_table"\n'
704704

705705

706706
def test_json_location_does_not_exists(catalog: InMemoryCatalog) -> None:

0 commit comments

Comments
 (0)