
Commit 5918924

Merge branch 'apache:main' into add-between-keyword
2 parents: 763db24 + 8013545

File tree: 11 files changed (+331 / -18 lines)

.github/workflows/check-md-link.yml

Lines changed: 1 addition & 1 deletion
@@ -36,4 +36,4 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@master
-      - uses: gaurav-nelson/github-action-markdown-link-check@v1
+      - uses: tcort/github-action-markdown-link-check@v1

mkdocs/docs/api.md

Lines changed: 51 additions & 8 deletions
@@ -1004,6 +1004,33 @@ To show only data files or delete files in the current snapshot, use `table.insp

 Expert Iceberg users may choose to commit existing parquet files to the Iceberg table as data files, without rewriting them.

+<!-- prettier-ignore-start -->
+
+!!! note "Name Mapping"
+    Because `add_files` uses existing files without writing new parquet files that are aware of the Iceberg's schema, it requires the Iceberg's table to have a [Name Mapping](https://iceberg.apache.org/spec/?h=name+mapping#name-mapping-serialization) (The Name mapping maps the field names within the parquet files to the Iceberg field IDs). Hence, `add_files` requires that there are no field IDs in the parquet file's metadata, and creates a new Name Mapping based on the table's current schema if the table doesn't already have one.
+
+!!! note "Partitions"
+    `add_files` only requires the client to read the existing parquet files' metadata footer to infer the partition value of each file. This implementation also supports adding files to Iceberg tables with partition transforms like `MonthTransform`, and `TruncateTransform` which preserve the order of the values after the transformation (Any Transform that has the `preserves_order` property set to True is supported). Please note that if the column statistics of the `PartitionField`'s source column are not present in the parquet metadata, the partition value is inferred as `None`.
+
+!!! warning "Maintenance Operations"
+    Because `add_files` commits the existing parquet files to the Iceberg Table as any other data file, destructive maintenance operations like expiring snapshots will remove them.
+
+!!! warning "Check Duplicate Files"
+    The `check_duplicate_files` parameter determines whether the method validates that the specified `file_paths` do not already exist in the Iceberg table. When set to True (the default), the method performs a validation against the table's current data files to prevent accidental duplication, helping to maintain data consistency by ensuring the same file is not added multiple times. While this check is important for data integrity, it can introduce performance overhead for tables with a large number of files. Setting `check_duplicate_files=False` can improve performance but increases the risk of duplicate files, which may lead to data inconsistencies or table corruption. It is strongly recommended to keep this parameter enabled unless duplicate file handling is strictly enforced elsewhere.
+
+<!-- prettier-ignore-end -->
+
+### Usage
+
+| Parameter | Required? | Type | Description |
+| ------------------------- | --------- | -------------- | ----------------------------------------------------------------------- |
+| `file_paths` | ✔️ | List[str] | The list of full file paths to be added as data files to the table |
+| `snapshot_properties` | | Dict[str, str] | Properties to set for the new snapshot. Defaults to an empty dictionary |
+| `check_duplicate_files` | | bool | Whether to check for duplicate files. Defaults to `True` |
+
+### Example
+
+Add files to Iceberg table:
 ```python
 # Given that these parquet files have schema consistent with the Iceberg table

@@ -1019,18 +1046,34 @@ tbl.add_files(file_paths=file_paths)
 # A new snapshot is committed to the table with manifests pointing to the existing parquet files
 ```

-<!-- prettier-ignore-start -->
+Add files to Iceberg table with custom snapshot properties:
+```python
+# Assume an existing Iceberg table object `tbl`

-!!! note "Name Mapping"
-    Because `add_files` uses existing files without writing new parquet files that are aware of the Iceberg's schema, it requires the Iceberg's table to have a [Name Mapping](https://iceberg.apache.org/spec/?h=name+mapping#name-mapping-serialization) (The Name mapping maps the field names within the parquet files to the Iceberg field IDs). Hence, `add_files` requires that there are no field IDs in the parquet file's metadata, and creates a new Name Mapping based on the table's current schema if the table doesn't already have one.
+file_paths = [
+    "s3a://warehouse/default/existing-1.parquet",
+    "s3a://warehouse/default/existing-2.parquet",
+]

-!!! note "Partitions"
-    `add_files` only requires the client to read the existing parquet files' metadata footer to infer the partition value of each file. This implementation also supports adding files to Iceberg tables with partition transforms like `MonthTransform`, and `TruncateTransform` which preserve the order of the values after the transformation (Any Transform that has the `preserves_order` property set to True is supported). Please note that if the column statistics of the `PartitionField`'s source column are not present in the parquet metadata, the partition value is inferred as `None`.
+# Custom snapshot properties
+snapshot_properties = {"abc": "def"}

-!!! warning "Maintenance Operations"
-    Because `add_files` commits the existing parquet files to the Iceberg Table as any other data file, destructive maintenance operations like expiring snapshots will remove them.
+# Enable duplicate file checking
+check_duplicate_files = True

-<!-- prettier-ignore-end -->
+# Add the Parquet files to the Iceberg table without rewriting
+tbl.add_files(
+    file_paths=file_paths,
+    snapshot_properties=snapshot_properties,
+    check_duplicate_files=check_duplicate_files
+)
+
+# NameMapping must have been set to enable reads
+assert tbl.name_mapping() is not None
+
+# Verify that the snapshot property was set correctly
+assert tbl.metadata.snapshots[-1].summary["abc"] == "def"
+```

 ## Schema evolution
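To make the trade-off described in the new "Check Duplicate Files" warning concrete, a rough sketch (not part of this diff; `tbl` and the file paths are placeholders) of a bulk import that opts out of the check:

```python
# Hypothetical bulk backfill where the caller already guarantees path uniqueness,
# so the duplicate-file validation is skipped for speed. Assumes `tbl` is an
# existing pyiceberg Table and the Parquet files match its schema.
file_paths = [f"s3a://warehouse/default/backfill-{i}.parquet" for i in range(1_000)]

tbl.add_files(
    file_paths=file_paths,
    check_duplicate_files=False,  # faster, but duplicate paths are no longer detected
)
```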

mkdocs/docs/configuration.md

Lines changed: 20 additions & 2 deletions
@@ -127,6 +127,7 @@ For the FileIO there are several configuration options available:
 | s3.request-timeout | 60.0 | Configure socket read timeouts on Windows and macOS, in seconds. |
 | s3.force-virtual-addressing | False | Whether to use virtual addressing of buckets. If true, then virtual addressing is always enabled. If false, then virtual addressing is only enabled if endpoint_override is empty. This can be used for non-AWS backends that only support virtual hosted-style access. |
 | s3.retry-strategy-impl | None | Ability to set a custom S3 retry strategy. A full path to a class needs to be given that extends the [S3RetryStrategy](https://github.com/apache/arrow/blob/639201bfa412db26ce45e73851432018af6c945e/python/pyarrow/_s3fs.pyx#L110) base class. |
+| s3.anonymous | True | Configure whether to use anonymous connection. If False (default), uses key/secret if configured or boto's credential resolver. |

 <!-- markdown-link-check-enable-->

@@ -197,6 +198,7 @@ PyIceberg uses [S3FileSystem](https://arrow.apache.org/docs/python/generated/pya
 | s3.secret-access-key | password | Configure the static secret access key used to access the FileIO. |
 | s3.session-token | AQoDYXdzEJr... | Configure the static session token used to access the FileIO. |
 | s3.force-virtual-addressing | True | Whether to use virtual addressing of buckets. This is set to `True` by default as OSS can only be accessed with virtual hosted style address. |
+| s3.anonymous | True | Configure whether to use anonymous connection. If False (default), uses key/secret if configured or standard AWS configuration methods. |

 <!-- markdown-link-check-enable-->
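A minimal sketch of the new `s3.anonymous` property in use (illustrative only, not part of this diff; the catalog URI and table identifier are placeholders):

```python
# Hypothetical read from a public bucket using unsigned S3 requests, enabled by the
# new `s3.anonymous` FileIO property; the URI and table name below are placeholders.
from pyiceberg.catalog import load_catalog

catalog = load_catalog(
    "default",
    **{
        "uri": "http://localhost:8181",
        "s3.anonymous": "true",  # skip key/secret and credential-resolver lookup
    },
)

table = catalog.load_table("examples.nyc_taxi")
arrow_table = table.scan().to_arrow()
```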

@@ -388,6 +390,7 @@ The RESTCatalog supports pluggable authentication via the `auth` configuration b

 - `noop`: No authentication (no Authorization header sent).
 - `basic`: HTTP Basic authentication.
+- `oauth2`: OAuth2 client credentials flow.
 - `custom`: Custom authentication manager (requires `auth.impl`).
 - `google`: Google Authentication support

@@ -411,9 +414,10 @@ catalog:

 | Property | Required | Description |
 |------------------|----------|-------------------------------------------------------------------------------------------------|
-| `auth.type` | Yes | The authentication type to use (`noop`, `basic`, or `custom`). |
+| `auth.type` | Yes | The authentication type to use (`noop`, `basic`, `oauth2`, or `custom`). |
 | `auth.impl` | Conditionally | The fully qualified class path for a custom AuthManager. Required if `auth.type` is `custom`. |
 | `auth.basic` | If type is `basic` | Block containing `username` and `password` for HTTP Basic authentication. |
+| `auth.oauth2` | If type is `oauth2` | Block containing OAuth2 configuration (see below). |
 | `auth.custom` | If type is `custom` | Block containing configuration for the custom AuthManager. |
 | `auth.google` | If type is `google` | Block containing `credentials_path` to a service account file (if using). Will default to using Application Default Credentials. |

@@ -436,6 +440,20 @@ auth:
     password: mypass
 ```

+OAuth2 Authentication:
+
+```yaml
+auth:
+  type: oauth2
+  oauth2:
+    client_id: my-client-id
+    client_secret: my-client-secret
+    token_url: https://auth.example.com/oauth/token
+    scope: read
+    refresh_margin: 60 # (optional) seconds before expiry to refresh
+    expires_in: 3600 # (optional) fallback if server does not provide
+```
+
 Custom Authentication:

 ```yaml
@@ -451,7 +469,7 @@ auth:

 - If `auth.type` is `custom`, you **must** specify `auth.impl` with the full class path to your custom AuthManager.
 - If `auth.type` is not `custom`, specifying `auth.impl` is not allowed.
-- The configuration block under each type (e.g., `basic`, `custom`) is passed as keyword arguments to the corresponding AuthManager.
+- The configuration block under each type (e.g., `basic`, `oauth2`, `custom`) is passed as keyword arguments to the corresponding AuthManager.

 <!-- markdown-link-check-enable-->
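Since the configuration block is passed through as keyword arguments, the `oauth2` example above maps onto the new `OAuth2AuthManager` roughly as sketched below (not part of this diff; the endpoint and credentials are placeholders):

```python
# Illustrative sketch: build the new OAuth2AuthManager via the factory, much as the
# REST catalog would from the `auth.oauth2` block; the values below are placeholders.
from pyiceberg.catalog.rest.auth import AuthManagerFactory

auth_manager = AuthManagerFactory.create(
    "oauth2",
    {
        "client_id": "my-client-id",
        "client_secret": "my-client-secret",
        "token_url": "https://auth.example.com/oauth/token",
        "scope": "read",
    },
)

# Returns "Bearer <access_token>", fetching or refreshing the token when needed.
print(auth_manager.auth_header())
```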

pyiceberg/catalog/rest/auth.py

Lines changed: 97 additions & 0 deletions
@@ -18,9 +18,13 @@
 import base64
 import importlib
 import logging
+import threading
+import time
 from abc import ABC, abstractmethod
+from functools import cached_property
 from typing import Any, Dict, List, Optional, Type

+import requests
 from requests import HTTPError, PreparedRequest, Session
 from requests.auth import AuthBase

@@ -121,6 +125,98 @@ def auth_header(self) -> str:
         return f"Bearer {self._token}"


+class OAuth2TokenProvider:
+    """Thread-safe OAuth2 token provider with token refresh support."""
+
+    client_id: str
+    client_secret: str
+    token_url: str
+    scope: Optional[str]
+    refresh_margin: int
+    expires_in: Optional[int]
+
+    _token: Optional[str]
+    _expires_at: int
+    _lock: threading.Lock
+
+    def __init__(
+        self,
+        client_id: str,
+        client_secret: str,
+        token_url: str,
+        scope: Optional[str] = None,
+        refresh_margin: int = 60,
+        expires_in: Optional[int] = None,
+    ):
+        self.client_id = client_id
+        self.client_secret = client_secret
+        self.token_url = token_url
+        self.scope = scope
+        self.refresh_margin = refresh_margin
+        self.expires_in = expires_in
+
+        self._token = None
+        self._expires_at = 0
+        self._lock = threading.Lock()
+
+    @cached_property
+    def _client_secret_header(self) -> str:
+        creds = f"{self.client_id}:{self.client_secret}"
+        creds_bytes = creds.encode("utf-8")
+        b64_creds = base64.b64encode(creds_bytes).decode("utf-8")
+        return f"Basic {b64_creds}"
+
+    def _refresh_token(self) -> None:
+        data = {"grant_type": "client_credentials"}
+        if self.scope:
+            data["scope"] = self.scope
+
+        response = requests.post(self.token_url, data=data, headers={"Authorization": self._client_secret_header})
+        response.raise_for_status()
+        result = response.json()
+
+        self._token = result["access_token"]
+        expires_in = result.get("expires_in", self.expires_in)
+        if expires_in is None:
+            raise ValueError(
+                "The expiration time of the Token must be provided by the Server in the Access Token Response in `expires_in` field, or by the PyIceberg Client."
+            )
+        self._expires_at = time.monotonic() + expires_in - self.refresh_margin
+
+    def get_token(self) -> str:
+        with self._lock:
+            if not self._token or time.monotonic() >= self._expires_at:
+                self._refresh_token()
+            if self._token is None:
+                raise ValueError("Authorization token is None after refresh")
+            return self._token
+
+
+class OAuth2AuthManager(AuthManager):
+    """Auth Manager implementation that supports OAuth2 as defined in IETF RFC6749."""
+
+    def __init__(
+        self,
+        client_id: str,
+        client_secret: str,
+        token_url: str,
+        scope: Optional[str] = None,
+        refresh_margin: int = 60,
+        expires_in: Optional[int] = None,
+    ):
+        self.token_provider = OAuth2TokenProvider(
+            client_id,
+            client_secret,
+            token_url,
+            scope,
+            refresh_margin,
+            expires_in,
+        )
+
+    def auth_header(self) -> str:
+        return f"Bearer {self.token_provider.get_token()}"
+
+
 class GoogleAuthManager(AuthManager):
     """An auth manager that is responsible for handling Google credentials."""

@@ -228,4 +324,5 @@ def create(cls, class_or_name: str, config: Dict[str, Any]) -> AuthManager:
 AuthManagerFactory.register("noop", NoopAuthManager)
 AuthManagerFactory.register("basic", BasicAuthManager)
 AuthManagerFactory.register("legacyoauth2", LegacyOAuth2AuthManager)
+AuthManagerFactory.register("oauth2", OAuth2AuthManager)
 AuthManagerFactory.register("google", GoogleAuthManager)

pyiceberg/io/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@
 AWS_SESSION_TOKEN = "client.session-token"
 AWS_ROLE_ARN = "client.role-arn"
 AWS_ROLE_SESSION_NAME = "client.role-session-name"
+S3_ANONYMOUS = "s3.anonymous"
 S3_ENDPOINT = "s3.endpoint"
 S3_ACCESS_KEY_ID = "s3.access-key-id"
 S3_SECRET_ACCESS_KEY = "s3.secret-access-key"

pyiceberg/io/fsspec.py

Lines changed: 5 additions & 0 deletions
@@ -65,6 +65,7 @@
     HF_ENDPOINT,
     HF_TOKEN,
     S3_ACCESS_KEY_ID,
+    S3_ANONYMOUS,
     S3_CONNECT_TIMEOUT,
     S3_ENDPOINT,
     S3_PROXY_URI,
@@ -83,6 +84,7 @@
     OutputStream,
 )
 from pyiceberg.typedef import Properties
+from pyiceberg.types import strtobool
 from pyiceberg.utils.properties import get_first_property_value, get_header_properties, property_as_bool

 logger = logging.getLogger(__name__)
@@ -164,6 +166,9 @@ def _s3(properties: Properties) -> AbstractFileSystem:
     if request_timeout := properties.get(S3_REQUEST_TIMEOUT):
         config_kwargs["read_timeout"] = float(request_timeout)

+    if s3_anonymous := properties.get(S3_ANONYMOUS):
+        config_kwargs["anon"] = strtobool(s3_anonymous)
+
     fs = S3FileSystem(client_kwargs=client_kwargs, config_kwargs=config_kwargs)

     for event_name, event_function in register_events.items():

pyiceberg/io/pyarrow.py

Lines changed: 19 additions & 5 deletions
@@ -109,6 +109,7 @@
     HDFS_USER,
     PYARROW_USE_LARGE_TYPES_ON_READ,
     S3_ACCESS_KEY_ID,
+    S3_ANONYMOUS,
     S3_CONNECT_TIMEOUT,
     S3_ENDPOINT,
     S3_FORCE_VIRTUAL_ADDRESSING,
@@ -179,6 +180,7 @@
     TimeType,
     UnknownType,
     UUIDType,
+    strtobool,
 )
 from pyiceberg.utils.concurrent import ExecutorFactory
 from pyiceberg.utils.config import Config
@@ -450,6 +452,9 @@ def _initialize_oss_fs(self) -> FileSystem:
         if session_name := get_first_property_value(self.properties, S3_ROLE_SESSION_NAME, AWS_ROLE_SESSION_NAME):
             client_kwargs["session_name"] = session_name

+        if s3_anonymous := self.properties.get(S3_ANONYMOUS):
+            client_kwargs["anonymous"] = strtobool(s3_anonymous)
+
         return S3FileSystem(**client_kwargs)

     def _initialize_s3_fs(self, netloc: Optional[str]) -> FileSystem:
@@ -501,6 +506,9 @@ def _initialize_s3_fs(self, netloc: Optional[str]) -> FileSystem:
         ):
             client_kwargs["retry_strategy"] = retry_instance

+        if s3_anonymous := self.properties.get(S3_ANONYMOUS):
+            client_kwargs["anonymous"] = strtobool(s3_anonymous)
+
         return S3FileSystem(**client_kwargs)

     def _initialize_azure_fs(self) -> FileSystem:
@@ -2445,8 +2453,12 @@ def data_file_statistics_from_parquet_metadata(

                 if isinstance(stats_col.iceberg_type, DecimalType) and statistics.physical_type != "FIXED_LEN_BYTE_ARRAY":
                     scale = stats_col.iceberg_type.scale
-                    col_aggs[field_id].update_min(unscaled_to_decimal(statistics.min_raw, scale))
-                    col_aggs[field_id].update_max(unscaled_to_decimal(statistics.max_raw, scale))
+                    col_aggs[field_id].update_min(
+                        unscaled_to_decimal(statistics.min_raw, scale)
+                    ) if statistics.min_raw is not None else None
+                    col_aggs[field_id].update_max(
+                        unscaled_to_decimal(statistics.max_raw, scale)
+                    ) if statistics.max_raw is not None else None
                 else:
                     col_aggs[field_id].update_min(statistics.min)
                     col_aggs[field_id].update_max(statistics.max)
@@ -2793,9 +2805,11 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T
             functools.reduce(
                 operator.and_,
                 [
-                    pc.field(partition_field_name) == unique_partition[partition_field_name]
-                    if unique_partition[partition_field_name] is not None
-                    else pc.field(partition_field_name).is_null()
+                    (
+                        pc.field(partition_field_name) == unique_partition[partition_field_name]
+                        if unique_partition[partition_field_name] is not None
+                        else pc.field(partition_field_name).is_null()
+                    )
                     for field, partition_field_name in zip(spec.fields, partition_fields)
                 ],
             )
