Skip to content

Commit 1bad384

Browse files
committed
refactor resolve region
1 parent e39e2c8 commit 1bad384

File tree

1 file changed

+48
-23
lines changed

1 file changed

+48
-23
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 48 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -471,10 +471,50 @@ def _get_first_property_value_with_tracking(self, props: Properties, used_keys:
471471
return props[key]
472472
return None
473473

474+
def _convert_str_to_bool(self, value: Any) -> bool:
475+
"""Convert string or other value to boolean, handling string representations properly."""
476+
if isinstance(value, str):
477+
return strtobool(value)
478+
return bool(value)
479+
480+
def _resolve_s3_region(
481+
self, provided_region: Optional[str], resolve_region_override: Any, bucket: Optional[str]
482+
) -> Optional[str]:
483+
"""
484+
Resolve S3 region based on configuration and optional bucket-based resolution.
485+
486+
Args:
487+
provided_region: Region explicitly provided in configuration
488+
resolve_region_override: Whether to resolve region from bucket (can be string or bool)
489+
bucket: Bucket name for region resolution
490+
491+
Returns:
492+
The resolved region string, or None if no region could be determined
493+
"""
494+
# Handle resolve_region_override conversion
495+
should_resolve_region = False
496+
if resolve_region_override is not None:
497+
should_resolve_region = self._convert_str_to_bool(resolve_region_override)
498+
499+
# If no region provided or explicit resolve requested, try to resolve from bucket
500+
if provided_region is None or should_resolve_region:
501+
resolved_region = _cached_resolve_s3_region(bucket=bucket)
502+
503+
# Warn if resolved region differs from provided region
504+
if provided_region is not None and resolved_region and resolved_region != provided_region:
505+
logger.warning(
506+
f"PyArrow FileIO overriding S3 bucket region for bucket {bucket}: "
507+
f"provided region {provided_region}, actual region {resolved_region}"
508+
)
509+
510+
return resolved_region or provided_region
511+
512+
return provided_region
513+
474514
def _initialize_oss_fs(self) -> FileSystem:
475515
from pyarrow.fs import S3FileSystem
476516

477-
properties = filter_properties(self.properties, key_predicate=lambda k: k.startswith(("oss.", "client.")))
517+
properties = filter_properties(self.properties, key_predicate=lambda k: k.startswith(("s3.", "client.", "oss.")))
478518
used_keys: set[str] = set()
479519
client_kwargs = {}
480520

@@ -490,12 +530,10 @@ def _initialize_oss_fs(self) -> FileSystem:
490530
client_kwargs["session_token"] = session_token
491531
if region := get(S3_REGION, AWS_REGION, "oss.region"):
492532
client_kwargs["region"] = region
493-
# Check for force_virtual_addressing in order of preference. For oss FS, defaulting to True if not found
494533
if force_virtual_addressing := get(S3_FORCE_VIRTUAL_ADDRESSING, "oss.force_virtual_addressing"):
495-
if isinstance(force_virtual_addressing, str): # S3_FORCE_VIRTUAL_ADDRESSING's value can be a string
496-
force_virtual_addressing = strtobool(force_virtual_addressing)
497-
client_kwargs["force_virtual_addressing"] = force_virtual_addressing
534+
client_kwargs["force_virtual_addressing"] = self._convert_str_to_bool(force_virtual_addressing)
498535
else:
536+
# For OSS FS, default to True
499537
client_kwargs["force_virtual_addressing"] = True
500538
if proxy_uri := get(S3_PROXY_URI, "oss.proxy_options"):
501539
client_kwargs["proxy_options"] = proxy_uri
@@ -532,26 +570,13 @@ def _initialize_s3_fs(self, netloc: Optional[str]) -> FileSystem:
532570
if session_token := get(S3_SESSION_TOKEN, AWS_SESSION_TOKEN, "s3.session_token"):
533571
client_kwargs["session_token"] = session_token
534572

535-
provided_region = get(S3_REGION, AWS_REGION)
536-
# Do this when we don't provide the region at all, or when we explicitly enable it
537-
if provided_region is None or property_as_bool(self.properties, S3_RESOLVE_REGION, False) is True:
538-
# Resolve region from netloc(bucket), fallback to user-provided region
539-
# Only supported by buckets hosted by S3
540-
bucket_region = _cached_resolve_s3_region(bucket=netloc) or provided_region
541-
if provided_region is not None and bucket_region != provided_region:
542-
logger.warning(
543-
f"PyArrow FileIO overriding S3 bucket region for bucket {netloc}: "
544-
f"provided region {provided_region}, actual region {bucket_region}"
545-
)
546-
else:
547-
bucket_region = provided_region
548-
client_kwargs["region"] = bucket_region
549-
used_keys.add(S3_RESOLVE_REGION)
573+
# Handle S3 region configuration with optional auto-resolution
574+
client_kwargs["region"] = self._resolve_s3_region(
575+
provided_region=get(S3_REGION, AWS_REGION), resolve_region_override=get(S3_RESOLVE_REGION), bucket=netloc
576+
)
550577

551578
if force_virtual_addressing := get(S3_FORCE_VIRTUAL_ADDRESSING, "s3.force_virtual_addressing"):
552-
if isinstance(force_virtual_addressing, str): # S3_FORCE_VIRTUAL_ADDRESSING's value can be a string
553-
force_virtual_addressing = strtobool(force_virtual_addressing)
554-
client_kwargs["force_virtual_addressing"] = force_virtual_addressing
579+
client_kwargs["force_virtual_addressing"] = self._convert_str_to_bool(force_virtual_addressing)
555580

556581
if proxy_uri := get(S3_PROXY_URI, "s3.proxy_options"):
557582
client_kwargs["proxy_options"] = proxy_uri

0 commit comments

Comments
 (0)