
Commit 382e0ea

Merge branch 'main' into main

2 parents: 352b48f + 831170d

File tree

20 files changed: +474 −145 lines

dev/Dockerfile

Lines changed: 2 additions & 2 deletions

```diff
@@ -47,12 +47,12 @@ RUN curl --retry 5 -s -C - https://archive.apache.org/dist/spark/spark-${SPARK_V
     && rm -rf spark-${SPARK_VERSION}-bin-hadoop3.tgz

 # Download iceberg spark runtime
-RUN curl --retry 5 -s https://repository.apache.org/content/groups/snapshots/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.9.0-SNAPSHOT/iceberg-spark-runtime-3.5_2.12-1.9.0-20250408.001846-43.jar \
+RUN curl --retry 5 -s https://repository.apache.org/content/groups/snapshots/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.9.0-SNAPSHOT/iceberg-spark-runtime-3.5_2.12-1.9.0-20250409.001855-44.jar \
  -Lo /opt/spark/jars/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar


 # Download AWS bundle
-RUN curl --retry 5 -s https://repository.apache.org/content/groups/snapshots/org/apache/iceberg/iceberg-aws-bundle/1.9.0-SNAPSHOT/iceberg-aws-bundle-1.9.0-20250408.002722-86.jar \
+RUN curl --retry 5 -s https://repository.apache.org/content/groups/snapshots/org/apache/iceberg/iceberg-aws-bundle/1.9.0-SNAPSHOT/iceberg-aws-bundle-1.9.0-20250409.002731-88.jar \
  -Lo /opt/spark/jars/iceberg-aws-bundle-${ICEBERG_VERSION}.jar

 COPY spark-defaults.conf /opt/spark/conf
```

mkdocs/docs/api.md

Lines changed: 11 additions & 0 deletions

````diff
@@ -215,6 +215,17 @@ static_table = StaticTable.from_metadata(

 The static-table is considered read-only.

+Alternatively, if your table metadata directory contains a `version-hint.text` file, you can just specify
+the table root path, and the latest metadata file will be picked up automatically.
+
+```python
+from pyiceberg.table import StaticTable
+
+static_table = StaticTable.from_metadata(
+    "s3://warehouse/wh/nyc.db/taxis"
+)
+```
+
 ## Check if a table exists

 To check whether the `bids` table exists:
````

mkdocs/docs/configuration.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -189,7 +189,7 @@ PyIceberg uses [S3FileSystem](https://arrow.apache.org/docs/python/generated/pya
 | s3.access-key-id | admin | Configure the static access key id used to access the FileIO. |
 | s3.secret-access-key | password | Configure the static secret access key used to access the FileIO. |
 | s3.session-token | AQoDYXdzEJr... | Configure the static session token used to access the FileIO. |
-| s3.force-virtual-addressing | True | Whether to use virtual addressing of buckets. This must be set to True as OSS can only be accessed with virtual hosted style address. |
+| s3.force-virtual-addressing | True | Whether to use virtual addressing of buckets. This is set to `True` by default, as OSS can only be accessed with a virtual hosted-style address. |

 <!-- markdown-link-check-enable-->
```
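For context, a minimal sketch of how this property can be passed when loading a catalog; the catalog name, OSS endpoint, and credentials below are hypothetical:

```python
from pyiceberg.catalog import load_catalog

# Hypothetical catalog name, endpoint, and credentials. With this change,
# s3.force-virtual-addressing defaults to True for the OSS FileIO, so it
# only needs to be set explicitly to override that default.
catalog = load_catalog(
    "default",
    **{
        "s3.endpoint": "https://oss-cn-hangzhou.aliyuncs.com",
        "s3.access-key-id": "admin",
        "s3.secret-access-key": "password",
        "s3.force-virtual-addressing": "true",
    },
)
```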

poetry.lock

Lines changed: 78 additions & 78 deletions

Generated file; diff not rendered by default.

pyiceberg/catalog/hive.py

Lines changed: 22 additions & 14 deletions

```diff
@@ -18,6 +18,7 @@
 import logging
 import socket
 import time
+from functools import cached_property
 from types import TracebackType
 from typing import (
     TYPE_CHECKING,
@@ -143,40 +144,47 @@ class _HiveClient:
     """Helper class to nicely open and close the transport."""

     _transport: TTransport
-    _client: Client
     _ugi: Optional[List[str]]

     def __init__(self, uri: str, ugi: Optional[str] = None, kerberos_auth: Optional[bool] = HIVE_KERBEROS_AUTH_DEFAULT):
         self._uri = uri
         self._kerberos_auth = kerberos_auth
         self._ugi = ugi.split(":") if ugi else None
+        self._transport = self._init_thrift_transport()

-        self._init_thrift_client()
-
-    def _init_thrift_client(self) -> None:
+    def _init_thrift_transport(self) -> TTransport:
         url_parts = urlparse(self._uri)
-
         socket = TSocket.TSocket(url_parts.hostname, url_parts.port)
-
         if not self._kerberos_auth:
-            self._transport = TTransport.TBufferedTransport(socket)
+            return TTransport.TBufferedTransport(socket)
         else:
-            self._transport = TTransport.TSaslClientTransport(socket, host=url_parts.hostname, service="hive")
+            return TTransport.TSaslClientTransport(socket, host=url_parts.hostname, service="hive")

+    @cached_property
+    def _client(self) -> Client:
         protocol = TBinaryProtocol.TBinaryProtocol(self._transport)
-
-        self._client = Client(protocol)
+        client = Client(protocol)
+        if self._ugi:
+            client.set_ugi(*self._ugi)
+        return client

     def __enter__(self) -> Client:
-        self._transport.open()
-        if self._ugi:
-            self._client.set_ugi(*self._ugi)
+        """Make sure the transport is initialized and open."""
+        if not self._transport.isOpen():
+            try:
+                self._transport.open()
+            except TTransport.TTransportException:
+                # reinitialize _transport
+                self._transport = self._init_thrift_transport()
+                self._transport.open()
         return self._client

     def __exit__(
         self, exctype: Optional[Type[BaseException]], excinst: Optional[BaseException], exctb: Optional[TracebackType]
     ) -> None:
-        self._transport.close()
+        """Close transport if it was opened."""
+        if self._transport.isOpen():
+            self._transport.close()


 def _construct_hive_storage_descriptor(
```
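To illustrate the new behavior, a minimal usage sketch (the metastore URI is hypothetical): `_client` is now a `cached_property` built lazily on first access, and `__enter__` reopens or rebuilds a stale transport instead of failing on a closed socket:

```python
from pyiceberg.catalog.hive import _HiveClient

client = _HiveClient(uri="thrift://localhost:9083")  # hypothetical URI

# First use: the transport is opened and the Thrift client is created lazily.
with client as hive:
    print(hive.get_all_databases())

# A later use after the connection has gone stale no longer raises:
# __enter__ rebuilds the transport and reopens it before returning the client.
with client as hive:
    print(hive.get_all_tables("default"))
```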
File renamed without changes.

pyiceberg/catalog/rest/auth.py

Lines changed: 82 additions & 0 deletions (new file)

```python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import base64
from abc import ABC, abstractmethod
from typing import Optional

from requests import PreparedRequest
from requests.auth import AuthBase


class AuthManager(ABC):
    """Abstract base class for Authentication Managers used to supply authorization headers to HTTP clients (e.g. requests.Session).

    Subclasses must implement the `auth_header` method to return an Authorization header value.
    """

    @abstractmethod
    def auth_header(self) -> Optional[str]:
        """Return the Authorization header value, or None if not applicable."""


class NoopAuthManager(AuthManager):
    def auth_header(self) -> Optional[str]:
        return None


class BasicAuthManager(AuthManager):
    def __init__(self, username: str, password: str):
        credentials = f"{username}:{password}"
        self._token = base64.b64encode(credentials.encode()).decode()

    def auth_header(self) -> str:
        return f"Basic {self._token}"


class AuthManagerAdapter(AuthBase):
    """A `requests.auth.AuthBase` adapter that integrates an `AuthManager` into a `requests.Session`, automatically attaching the appropriate Authorization header to every request.

    This adapter is useful when working with `requests.Session.auth`
    and allows reuse of authentication strategies defined by `AuthManager`.
    It is only intended to be used against a REST Catalog
    server that expects the Authorization header.
    """

    def __init__(self, auth_manager: AuthManager):
        """Initialize AuthManagerAdapter.

        Args:
            auth_manager (AuthManager): An instance of an AuthManager subclass.
        """
        self.auth_manager = auth_manager

    def __call__(self, request: PreparedRequest) -> PreparedRequest:
        """Modify the outgoing request to include the Authorization header.

        Args:
            request (requests.PreparedRequest): The HTTP request being prepared.

        Returns:
            requests.PreparedRequest: The modified request with the Authorization header.
        """
        if auth_header := self.auth_manager.auth_header():
            request.headers["Authorization"] = auth_header
        return request
```
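A short usage sketch of the new module, assuming a REST catalog at a hypothetical localhost endpoint:

```python
import requests

from pyiceberg.catalog.rest.auth import AuthManagerAdapter, BasicAuthManager

session = requests.Session()
# Every request sent through this session now carries
# "Authorization: Basic <base64(username:password)>".
session.auth = AuthManagerAdapter(BasicAuthManager(username="admin", password="password"))

response = session.get("http://localhost:8181/v1/config")  # hypothetical endpoint
```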

pyiceberg/io/pyarrow.py

Lines changed: 22 additions & 17 deletions

```diff
@@ -409,6 +409,7 @@ def _initialize_oss_fs(self) -> FileSystem:
             "secret_key": get_first_property_value(self.properties, S3_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY),
             "session_token": get_first_property_value(self.properties, S3_SESSION_TOKEN, AWS_SESSION_TOKEN),
             "region": get_first_property_value(self.properties, S3_REGION, AWS_REGION),
+            "force_virtual_addressing": property_as_bool(self.properties, S3_FORCE_VIRTUAL_ADDRESSING, True),
         }

         if proxy_uri := self.properties.get(S3_PROXY_URI):
@@ -426,9 +427,6 @@ def _initialize_oss_fs(self) -> FileSystem:
         if session_name := get_first_property_value(self.properties, S3_ROLE_SESSION_NAME, AWS_ROLE_SESSION_NAME):
             client_kwargs["session_name"] = session_name

-        if force_virtual_addressing := self.properties.get(S3_FORCE_VIRTUAL_ADDRESSING):
-            client_kwargs["force_virtual_addressing"] = property_as_bool(self.properties, force_virtual_addressing, False)
-
         return S3FileSystem(**client_kwargs)

     def _initialize_s3_fs(self, netloc: Optional[str]) -> FileSystem:
@@ -472,8 +470,8 @@ def _initialize_s3_fs(self, netloc: Optional[str]) -> FileSystem:
         if session_name := get_first_property_value(self.properties, S3_ROLE_SESSION_NAME, AWS_ROLE_SESSION_NAME):
             client_kwargs["session_name"] = session_name

-        if force_virtual_addressing := self.properties.get(S3_FORCE_VIRTUAL_ADDRESSING):
-            client_kwargs["force_virtual_addressing"] = property_as_bool(self.properties, force_virtual_addressing, False)
+        if self.properties.get(S3_FORCE_VIRTUAL_ADDRESSING) is not None:
+            client_kwargs["force_virtual_addressing"] = property_as_bool(self.properties, S3_FORCE_VIRTUAL_ADDRESSING, False)

         return S3FileSystem(**client_kwargs)

@@ -2241,29 +2239,36 @@ def _partition_value(self, partition_field: PartitionField, schema: Schema) -> A
         if partition_field.source_id not in self.column_aggregates:
             return None

-        if not partition_field.transform.preserves_order:
+        source_field = schema.find_field(partition_field.source_id)
+        iceberg_transform = partition_field.transform
+
+        if not iceberg_transform.preserves_order:
             raise ValueError(
                 f"Cannot infer partition value from parquet metadata for a non-linear Partition Field: {partition_field.name} with transform {partition_field.transform}"
             )

-        lower_value = partition_record_value(
-            partition_field=partition_field,
-            value=self.column_aggregates[partition_field.source_id].current_min,
-            schema=schema,
+        transform_func = iceberg_transform.transform(source_field.field_type)
+
+        lower_value = transform_func(
+            partition_record_value(
+                partition_field=partition_field,
+                value=self.column_aggregates[partition_field.source_id].current_min,
+                schema=schema,
+            )
         )
-        upper_value = partition_record_value(
-            partition_field=partition_field,
-            value=self.column_aggregates[partition_field.source_id].current_max,
-            schema=schema,
+        upper_value = transform_func(
+            partition_record_value(
+                partition_field=partition_field,
+                value=self.column_aggregates[partition_field.source_id].current_max,
+                schema=schema,
+            )
         )
         if lower_value != upper_value:
             raise ValueError(
                 f"Cannot infer partition value from parquet metadata as there are more than one partition values for Partition Field: {partition_field.name}. {lower_value=}, {upper_value=}"
             )

-        source_field = schema.find_field(partition_field.source_id)
-        transform = partition_field.transform.transform(source_field.field_type)
-        return transform(lower_value)
+        return lower_value

     def partition(self, partition_spec: PartitionSpec, schema: Schema) -> Record:
         return Record(**{field.name: self._partition_value(field, schema) for field in partition_spec.fields})
```
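The reordering matters because a transform can map distinct source values onto the same partition value; comparing the raw min/max, as the old code did, would wrongly reject such files. A minimal sketch with a day transform (the timestamps are illustrative):

```python
from pyiceberg.transforms import DayTransform
from pyiceberg.types import TimestampType

transform_func = DayTransform().transform(TimestampType())

# Two different timestamps (microseconds since epoch) on the same day:
lower = transform_func(1_672_531_200_000_000)  # 2023-01-01T00:00:00
upper = transform_func(1_672_574_400_000_000)  # 2023-01-01T12:00:00

# The raw min/max differ, but both map to the same day ordinal,
# so the file holds a single partition value after all.
assert lower == upper
```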

pyiceberg/table/__init__.py

Lines changed: 20 additions & 0 deletions

```diff
@@ -17,6 +17,7 @@
 from __future__ import annotations

 import itertools
+import os
 import uuid
 import warnings
 from abc import ABC, abstractmethod
@@ -1378,8 +1379,27 @@ def refresh(self) -> Table:
         """Refresh the current table metadata."""
         raise NotImplementedError("To be implemented")

+    @classmethod
+    def _metadata_location_from_version_hint(cls, metadata_location: str, properties: Properties = EMPTY_DICT) -> str:
+        version_hint_location = os.path.join(metadata_location, "metadata", "version-hint.text")
+        io = load_file_io(properties=properties, location=version_hint_location)
+        file = io.new_input(version_hint_location)
+
+        with file.open() as stream:
+            content = stream.read().decode("utf-8")
+
+        if content.endswith(".metadata.json"):
+            return os.path.join(metadata_location, "metadata", content)
+        elif content.isnumeric():
+            return os.path.join(metadata_location, "metadata", f"v{content}.metadata.json")
+        else:
+            return os.path.join(metadata_location, "metadata", f"{content}.metadata.json")
+
     @classmethod
     def from_metadata(cls, metadata_location: str, properties: Properties = EMPTY_DICT) -> StaticTable:
+        if not metadata_location.endswith(".metadata.json"):
+            metadata_location = StaticTable._metadata_location_from_version_hint(metadata_location, properties)
+
         io = load_file_io(properties=properties, location=metadata_location)
         file = io.new_input(metadata_location)
```

pyiceberg/table/inspect.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -205,9 +205,9 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
                 "record_count": entry.data_file.record_count,
                 "file_size_in_bytes": entry.data_file.file_size_in_bytes,
                 "column_sizes": dict(entry.data_file.column_sizes),
-                "value_counts": dict(entry.data_file.value_counts),
-                "null_value_counts": dict(entry.data_file.null_value_counts),
-                "nan_value_counts": dict(entry.data_file.nan_value_counts),
+                "value_counts": dict(entry.data_file.value_counts or {}),
+                "null_value_counts": dict(entry.data_file.null_value_counts or {}),
+                "nan_value_counts": dict(entry.data_file.nan_value_counts or {}),
                 "lower_bounds": entry.data_file.lower_bounds,
                 "upper_bounds": entry.data_file.upper_bounds,
                 "key_metadata": entry.data_file.key_metadata,
```
