Changes from all commits (67 commits)
4234879
Build: Migrate markdown-link-check GH action (#2334)
manuzhang Aug 15, 2025
bd5b8ba
New OAuth2AuthManager (#2244)
sungwy Aug 15, 2025
6c1a1b2
Handle stat collection for empty decimal columns (#2306)
berg2043 Aug 15, 2025
a7f6c08
add support to configure anonymous in s3 clients (#2323)
gmweaver Aug 15, 2025
8013545
docs: clarify Parameters for the add_files API (#2249)
Xiezhibin Aug 16, 2025
2cba3b7
fsspec: Support token in ADLS (#2331)
kevinjqliu Aug 18, 2025
cc642b7
bump adlfs minimum to 2024.7.0 (#2330)
kevinjqliu Aug 18, 2025
f8ccd82
Hive: Fix renaming to a table that already exists (#2336)
gabeiglio Aug 18, 2025
8b43eb8
Add `between` operator (#2335)
jtuglu1 Aug 18, 2025
cf987c6
Fix: use new snapshot id in deleted manifest entry unless is existing…
lliangyu-lin Aug 19, 2025
5a920cd
Arrow: Remove check for supported Arrow transforms (#2340)
Fokko Aug 19, 2025
36d3041
Build: Bump coverage from 7.10.3 to 7.10.4 (#2347)
dependabot[bot] Aug 19, 2025
5100307
Build: Bump mkdocs-material from 9.6.16 to 9.6.17 (#2348)
dependabot[bot] Aug 19, 2025
580f2f4
Build: Bump requests from 2.32.4 to 2.32.5 (#2350)
dependabot[bot] Aug 19, 2025
c72e286
Build: Bump mypy-boto3-dynamodb from 1.40.0 to 1.40.10 (#2354)
dependabot[bot] Aug 19, 2025
5d8c59c
Build: Bump zstandard from 0.23.0 to 0.24.0 (#2353)
dependabot[bot] Aug 19, 2025
d430d11
Build: Bump griffe from 1.11.1 to 1.12.1 (#2349)
dependabot[bot] Aug 19, 2025
1f27c9c
Build: Bump daft from 0.5.18 to 0.5.19 (#2352)
dependabot[bot] Aug 19, 2025
9886433
Build: Bump cython from 3.1.2 to 3.1.3 (#2351)
dependabot[bot] Aug 19, 2025
75006f2
Build: Bump polars from 1.32.2 to 1.32.3 (#2355)
dependabot[bot] Aug 19, 2025
425c3de
Build: Bump mypy-boto3-glue from 1.40.5 to 1.40.11 (#2356)
dependabot[bot] Aug 19, 2025
610a154
Add the rest of the _convert_schema_if_needed calls (#2300)
rambleraptor Aug 19, 2025
5acca48
Remove deprecated code for 0.10 (#2345)
kevinjqliu Aug 19, 2025
bdf19ab
fix: allow reading pyarrow timestamp as iceberg timestamptz (#2333)
kevinjqliu Aug 19, 2025
f805488
REST: Remove spec-version version HTTP header (#2358)
Fokko Aug 19, 2025
fa9094b
Allow snapshot-id in assert-ref-snapshot-id requirement to serialize …
ox Aug 19, 2025
722b8bd
Bump `pre-commit-hooks` in `.pre-commit-config.yaml` file (#2360)
ayushjariyal Aug 20, 2025
b647f72
Bump `ruff-pre-commit` in `.pre-commit-config.yaml` file (#2362)
ayushjariyal Aug 20, 2025
183333d
Bump `mirrors-mypy` in `.pre-commit-config.yaml` file (#2364)
ayushjariyal Aug 20, 2025
5a781df
Validate partition against schema (#2305)
rutb327 Aug 20, 2025
950fc71
Document null field handling for PyArrow (#2365)
kris-gaudel Aug 20, 2025
8db086d
perf: optimize `inspect.partitions` (#2359)
emilie-wang Aug 20, 2025
5d6e1e2
Bump `markdownlint-cli` in `.pre-commit-config.yaml` file (#2366)
ayushjariyal Aug 21, 2025
19efd2d
Bump `flynt` in `.pre-commit-config.yaml` file (#2370)
ayushjariyal Aug 22, 2025
a33cd38
Bump `codespell` in `.pre-commit-config.yaml` file (#2377)
ayushjariyal Aug 23, 2025
07f3453
infra: remove precommit ci (#2379)
kevinjqliu Aug 25, 2025
4f02298
Add serializer for AssertRefSnapshotId allowing null json value (#2375)
ox Aug 25, 2025
835dbe1
Add BigQuery Metastore Catalog (#2068)
rambleraptor Aug 26, 2025
91157dc
Build: Bump pypa/cibuildwheel from 3.1.3 to 3.1.4 (#2381)
dependabot[bot] Aug 26, 2025
c532075
Build: Bump mkdocstrings-python from 1.16.12 to 1.17.0 (#2380)
dependabot[bot] Aug 26, 2025
61b7a7b
Build: Bump moto from 5.1.10 to 5.1.11 (#2386)
dependabot[bot] Aug 26, 2025
fec38e7
Build: Bump typing-extensions from 4.14.1 to 4.15.0 (#2388)
dependabot[bot] Aug 26, 2025
d0d02a8
Build: Bump mypy-boto3-glue from 1.40.11 to 1.40.15 (#2387)
dependabot[bot] Aug 26, 2025
b94108a
Build: Bump mkdocs-material from 9.6.17 to 9.6.18 (#2382)
dependabot[bot] Aug 26, 2025
c48d3ee
Build: Bump pandas from 2.3.1 to 2.3.2 (#2383)
dependabot[bot] Aug 26, 2025
76455ac
Build: Bump coverage from 7.10.4 to 7.10.5 (#2389)
dependabot[bot] Aug 26, 2025
9ef5a47
Build: Bump mypy-boto3-dynamodb from 1.40.10 to 1.40.14 (#2385)
dependabot[bot] Aug 26, 2025
06b9467
Build: Bump daft from 0.5.19 to 0.5.21 (#2384)
dependabot[bot] Aug 26, 2025
370e770
Fix heading levels of subsections of Code standards (#2393)
manuzhang Aug 27, 2025
3c7c279
docs: improve release docs (#2374)
kevinjqliu Aug 28, 2025
3eecdad
Add close option to Catalog (#2390)
kris-gaudel Aug 28, 2025
3457bc2
use PyArrowFileIO as default for abfs and wasb schemes (#2395)
alessandro-nori Aug 28, 2025
7ad7056
disable bodo test (#2401)
kevinjqliu Aug 28, 2025
159b2f3
use anon argument when configuring s3fs.S3FileSystem (#2392)
gmweaver Aug 28, 2025
52ff684
V3: Fix invalid downcasting for nanos (#2397)
Fokko Aug 28, 2025
c3c314d
Fix rest test with ssl_ca_bundle due to conflicts with OS environment…
manuzhang Aug 28, 2025
e8c3a43
fix file system with env variables to set scheme and net loc if not s…
Aug 6, 2025
ae6cc0d
add test
Aug 6, 2025
cd939c7
fix linting
Aug 6, 2025
731075a
use catalog env configs and update to use default scheme and netloc f…
Aug 25, 2025
9bfb7e8
fix linter
Aug 25, 2025
1a75ab6
address comments
Aug 26, 2025
9387d6e
pr comments
Aug 27, 2025
80176e8
pr comments update comment
Aug 27, 2025
13001c8
simplify code, return to previous structure
Aug 28, 2025
35f4cdc
move variable usage closer to usage
Aug 28, 2025
7d5f816
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 29, 2025
2 changes: 1 addition & 1 deletion .asf.yaml
@@ -21,7 +21,7 @@
# https://cwiki.apache.org/confluence/display/INFRA/Git+-+.asf.yaml+features

github:
description: "Apache PyIceberg"
description: "PyIceberg"
homepage: https://py.iceberg.apache.org/
labels:
- iceberg
4 changes: 2 additions & 2 deletions .github/pull_request_template.md
@@ -7,8 +7,8 @@ Thanks for opening a pull request!

# Rationale for this change

# Are these changes tested?
## Are these changes tested?

# Are there any user-facing changes?
## Are there any user-facing changes?

<!-- In the case of user-facing changes, please add the changelog label. -->
2 changes: 1 addition & 1 deletion .github/workflows/check-md-link.yml
@@ -36,4 +36,4 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@master
- uses: gaurav-nelson/github-action-markdown-link-check@v1
- uses: tcort/github-action-markdown-link-check@v1
2 changes: 1 addition & 1 deletion .github/workflows/pypi-build-artifacts.yml
@@ -62,7 +62,7 @@ jobs:
if: startsWith(matrix.os, 'ubuntu')

- name: Build wheels
uses: pypa/cibuildwheel@v3.1.3
uses: pypa/cibuildwheel@v3.1.4
with:
output-dir: wheelhouse
config-file: "pyproject.toml"
2 changes: 1 addition & 1 deletion .github/workflows/svn-build-artifacts.yml
@@ -57,7 +57,7 @@ jobs:
if: startsWith(matrix.os, 'ubuntu')

- name: Build wheels
uses: pypa/cibuildwheel@v3.1.3
uses: pypa/cibuildwheel@v3.1.4
with:
output-dir: wheelhouse
config-file: "pyproject.toml"
23 changes: 6 additions & 17 deletions .pre-commit-config.yaml
@@ -19,27 +19,27 @@ exclude: ^vendor/

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
rev: v6.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: debug-statements
- id: check-yaml
- id: check-ast
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.13
rev: v0.12.9
hooks:
- id: ruff
args: [ --fix, --exit-non-zero-on-fix ]
- id: ruff-format
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.16.0
rev: v1.17.1
hooks:
- id: mypy
args:
[--install-types, --non-interactive, --config=pyproject.toml]
- repo: https://github.com/igorshubovych/markdownlint-cli
rev: v0.43.0
rev: v0.45.0
hooks:
- id: markdownlint
args: ["--fix"]
@@ -54,25 +54,14 @@ repos:
additional_dependencies:
- tomli==2.0.1
- repo: https://github.com/ikamensh/flynt
rev: 1.0.1
rev: 1.0.6
hooks:
- id: flynt
args:
# --line-length is set to a high value to deal with very long lines
- --line-length
- '99999'
- repo: https://github.com/codespell-project/codespell
rev: v2.3.0
rev: v2.4.1
hooks:
- id: codespell
ci:
autofix_commit_msg: |
[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
autofix_prs: true
autoupdate_branch: ''
autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
autoupdate_schedule: weekly
skip: []
submodules: false
2 changes: 1 addition & 1 deletion README.md
@@ -21,6 +21,6 @@ PyIceberg is a Python library for programmatic access to Iceberg table metadata

The documentation is available at [https://py.iceberg.apache.org/](https://py.iceberg.apache.org/).

# Get in Touch
## Get in Touch

- [Iceberg community](https://iceberg.apache.org/community/)
61 changes: 53 additions & 8 deletions mkdocs/docs/api.md
@@ -1004,6 +1004,34 @@ To show only data files or delete files in the current snapshot, use `table.insp

Expert Iceberg users may choose to commit existing parquet files to the Iceberg table as data files, without rewriting them.

<!-- prettier-ignore-start -->

!!! note "Name Mapping"
Because `add_files` uses existing files without writing new parquet files that are aware of the Iceberg schema, it requires the Iceberg table to have a [Name Mapping](https://iceberg.apache.org/spec/?h=name+mapping#name-mapping-serialization) (the Name Mapping maps the field names within the parquet files to the Iceberg field IDs). Hence, `add_files` requires that there are no field IDs in the parquet file's metadata, and it creates a new Name Mapping based on the table's current schema if the table doesn't already have one.

!!! note "Partitions"
`add_files` only requires the client to read the existing parquet files' metadata footer to infer the partition value of each file. It also supports adding files to Iceberg tables with partition transforms like `MonthTransform` and `TruncateTransform`, which preserve the order of the values after the transformation (any transform whose `preserves_order` property is `True` is supported). Note that if the column statistics of the `PartitionField`'s source column are not present in the parquet metadata, the partition value is inferred as `None`.

!!! warning "Maintenance Operations"
Because `add_files` commits the existing parquet files to the Iceberg Table as any other data file, destructive maintenance operations like expiring snapshots will remove them.

!!! warning "Check Duplicate Files"
The `check_duplicate_files` parameter determines whether the method validates that the specified `file_paths` do not already exist in the Iceberg table. When set to `True` (the default), the method checks against the table's current data files to prevent accidental duplication, ensuring the same file is not added twice. While this check is important for data integrity, it can introduce performance overhead for tables with a large number of files. Setting `check_duplicate_files=False` can improve performance but increases the risk of duplicate files, which may lead to data inconsistencies or table corruption. It is strongly recommended to keep this parameter enabled unless duplicate file handling is strictly enforced elsewhere.

<!-- prettier-ignore-end -->
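Before calling `add_files`, it can be worth verifying the field-ID precondition from the note above. Below is a minimal pre-flight sketch, assuming locally readable files and pyarrow's convention of exposing parquet field IDs under the `PARQUET:field_id` field-metadata key:

```python
import pyarrow.parquet as pq

# Sketch: confirm candidate parquet files carry no field IDs, since
# add_files expects files without field IDs and relies on the table's
# Name Mapping instead. Paths below are placeholders.
for path in ["existing-1.parquet", "existing-2.parquet"]:
    schema = pq.read_schema(path)
    has_field_ids = any(
        field.metadata is not None and b"PARQUET:field_id" in field.metadata
        for field in schema
    )
    assert not has_field_ids, f"{path} contains field IDs"
```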

### Usage

| Parameter | Required? | Type | Description |
| ------------------------- | --------- | -------------- | ----------------------------------------------------------------------- |
| `file_paths` | ✔️ | List[str] | The list of full file paths to be added as data files to the table |
| `snapshot_properties` | | Dict[str, str] | Properties to set for the new snapshot. Defaults to an empty dictionary |
| `check_duplicate_files` | | bool | Whether to check for duplicate files. Defaults to `True` |

### Example

Add files to Iceberg table:

```python
# Given that these parquet files have schema consistent with the Iceberg table

@@ -1019,18 +1047,35 @@ tbl.add_files(file_paths=file_paths)
# A new snapshot is committed to the table with manifests pointing to the existing parquet files
```

<!-- prettier-ignore-start -->
Add files to Iceberg table with custom snapshot properties:

!!! note "Name Mapping"
Because `add_files` uses existing files without writing new parquet files that are aware of the Iceberg's schema, it requires the Iceberg's table to have a [Name Mapping](https://iceberg.apache.org/spec/?h=name+mapping#name-mapping-serialization) (The Name mapping maps the field names within the parquet files to the Iceberg field IDs). Hence, `add_files` requires that there are no field IDs in the parquet file's metadata, and creates a new Name Mapping based on the table's current schema if the table doesn't already have one.
```python
# Assume an existing Iceberg table object `tbl`

!!! note "Partitions"
`add_files` only requires the client to read the existing parquet files' metadata footer to infer the partition value of each file. This implementation also supports adding files to Iceberg tables with partition transforms like `MonthTransform`, and `TruncateTransform` which preserve the order of the values after the transformation (Any Transform that has the `preserves_order` property set to True is supported). Please note that if the column statistics of the `PartitionField`'s source column are not present in the parquet metadata, the partition value is inferred as `None`.
file_paths = [
"s3a://warehouse/default/existing-1.parquet",
"s3a://warehouse/default/existing-2.parquet",
]

!!! warning "Maintenance Operations"
Because `add_files` commits the existing parquet files to the Iceberg Table as any other data file, destructive maintenance operations like expiring snapshots will remove them.
# Custom snapshot properties
snapshot_properties = {"abc": "def"}

<!-- prettier-ignore-end -->
# Enable duplicate file checking
check_duplicate_files = True

# Add the Parquet files to the Iceberg table without rewriting
tbl.add_files(
file_paths=file_paths,
snapshot_properties=snapshot_properties,
check_duplicate_files=check_duplicate_files
)

# NameMapping must have been set to enable reads
assert tbl.name_mapping() is not None

# Verify that the snapshot property was set correctly
assert tbl.metadata.snapshots[-1].summary["abc"] == "def"
```

## Schema evolution

23 changes: 21 additions & 2 deletions mkdocs/docs/configuration.md
@@ -127,6 +127,7 @@ For the FileIO there are several configuration options available:
| s3.request-timeout | 60.0 | Configure socket read timeouts on Windows and macOS, in seconds. |
| s3.force-virtual-addressing | False | Whether to use virtual addressing of buckets. If true, then virtual addressing is always enabled. If false, then virtual addressing is only enabled if endpoint_override is empty. This can be used for non-AWS backends that only support virtual hosted-style access. |
| s3.retry-strategy-impl | None | Ability to set a custom S3 retry strategy. A full path to a class needs to be given that extends the [S3RetryStrategy](https://github.com/apache/arrow/blob/639201bfa412db26ce45e73851432018af6c945e/python/pyarrow/_s3fs.pyx#L110) base class. |
| s3.anonymous | True | Configure whether to use an anonymous connection. If `False` (the default), uses the configured key/secret or boto's credential resolver. |

<!-- markdown-link-check-enable-->
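As a quick illustration of the new `s3.anonymous` option, it can be passed like any other FileIO property when loading a catalog. A sketch with placeholder catalog settings:

```python
from pyiceberg.catalog import load_catalog

# Sketch: read public S3 data without credentials (URI is a placeholder)
catalog = load_catalog(
    "default",
    **{
        "uri": "https://rest-catalog.example.com",
        "s3.anonymous": "true",  # skip key/secret and credential resolution
    },
)
```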

@@ -161,6 +162,7 @@ For the FileIO there are several configuration options available:
| adls.dfs-storage-authority | .dfs.core.windows.net | The hostname[:port] of the Data Lake Gen 2 Service. Defaults to `.dfs.core.windows.net`. Useful for connecting to a local emulator, like [azurite](https://github.com/azure/azurite). See [AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system) for reference |
| adls.blob-storage-scheme | https | Either `http` or `https`. Defaults to `https`. Useful for connecting to a local emulator, like [azurite](https://github.com/azure/azurite). See [AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system) for reference |
| adls.dfs-storage-scheme | https | Either `http` or `https`. Defaults to `https`. Useful for connecting to a local emulator, like [azurite](https://github.com/azure/azurite). See [AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system) for reference |
| adls.token | eyJ0eXAiOiJKV1QiLCJhbGci... | Static access token for authenticating with ADLS. Used for OAuth2 flows. |

<!-- markdown-link-check-enable-->
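The new `adls.token` property can likewise be supplied through the FileIO properties. A sketch with placeholder values:

```python
from pyiceberg.catalog import load_catalog

# Sketch: authenticate ADLS access with a static OAuth2 token
# (account name and token value are placeholders)
catalog = load_catalog(
    "default",
    **{
        "adls.account-name": "mystorageaccount",
        "adls.token": "<static-access-token>",
    },
)
```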

@@ -197,6 +199,7 @@ PyIceberg uses [S3FileSystem](https://arrow.apache.org/docs/python/generated/pya
| s3.secret-access-key | password | Configure the static secret access key used to access the FileIO. |
| s3.session-token | AQoDYXdzEJr... | Configure the static session token used to access the FileIO. |
| s3.force-virtual-addressing | True | Whether to use virtual addressing of buckets. This is set to `True` by default as OSS can only be accessed with virtual hosted style address. |
| s3.anonymous | True | Configure whether to use an anonymous connection. If `False` (the default), uses the configured key/secret or standard AWS configuration methods. |

<!-- markdown-link-check-enable-->

@@ -388,6 +391,7 @@ The RESTCatalog supports pluggable authentication via the `auth` configuration b

- `noop`: No authentication (no Authorization header sent).
- `basic`: HTTP Basic authentication.
- `oauth2`: OAuth2 client credentials flow.
- `custom`: Custom authentication manager (requires `auth.impl`).
- `google`: Google Authentication support.

@@ -411,9 +415,10 @@ catalog:

| Property | Required | Description |
|------------------|----------|-------------------------------------------------------------------------------------------------|
| `auth.type` | Yes | The authentication type to use (`noop`, `basic`, or `custom`). |
| `auth.type` | Yes | The authentication type to use (`noop`, `basic`, `oauth2`, or `custom`). |
| `auth.impl` | Conditionally | The fully qualified class path for a custom AuthManager. Required if `auth.type` is `custom`. |
| `auth.basic` | If type is `basic` | Block containing `username` and `password` for HTTP Basic authentication. |
| `auth.oauth2` | If type is `oauth2` | Block containing OAuth2 configuration (see below). |
| `auth.custom` | If type is `custom` | Block containing configuration for the custom AuthManager. |
| `auth.google` | If type is `google` | Block containing `credentials_path` to a service account file (if using). Will default to using Application Default Credentials. |

@@ -436,6 +441,20 @@ auth:
password: mypass
```

OAuth2 Authentication:

```yaml
auth:
type: oauth2
oauth2:
client_id: my-client-id
client_secret: my-client-secret
token_url: https://auth.example.com/oauth/token
scope: read
refresh_margin: 60 # (optional) seconds before expiry to refresh
expires_in: 3600 # (optional) fallback if server does not provide
```
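The same settings can be provided programmatically. A sketch, under the assumption that the nested YAML keys flatten to dotted property names:

```python
from pyiceberg.catalog import load_catalog

# Sketch: OAuth2 client-credentials auth via flat properties (key names are
# assumed to mirror the YAML block above; URLs and credentials are placeholders)
catalog = load_catalog(
    "rest",
    **{
        "uri": "https://rest-catalog.example.com",
        "auth.type": "oauth2",
        "auth.oauth2.client_id": "my-client-id",
        "auth.oauth2.client_secret": "my-client-secret",
        "auth.oauth2.token_url": "https://auth.example.com/oauth/token",
        "auth.oauth2.scope": "read",
    },
)
```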

Custom Authentication:

```yaml
@@ -451,7 +470,7 @@

- If `auth.type` is `custom`, you **must** specify `auth.impl` with the full class path to your custom AuthManager.
- If `auth.type` is not `custom`, specifying `auth.impl` is not allowed.
- The configuration block under each type (e.g., `basic`, `custom`) is passed as keyword arguments to the corresponding AuthManager.
- The configuration block under each type (e.g., `basic`, `oauth2`, `custom`) is passed as keyword arguments to the corresponding AuthManager.
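For orientation, a custom AuthManager might look like the sketch below. The base-class import path and the `auth_header` hook are assumptions here; check the PyIceberg source for the exact interface before relying on it:

```python
from typing import Optional

# Assumed import path; verify against the PyIceberg release you target.
from pyiceberg.catalog.rest.auth import AuthManager


class StaticTokenAuthManager(AuthManager):
    """Sketch: attach a fixed bearer token to every REST catalog request."""

    def __init__(self, token: str) -> None:
        self.token = token

    def auth_header(self) -> Optional[str]:
        # The returned string becomes the Authorization header value.
        return f"Bearer {self.token}"
```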

<!-- markdown-link-check-enable-->

6 changes: 3 additions & 3 deletions mkdocs/docs/contributing.md
@@ -159,7 +159,7 @@ To rebuild the containers from scratch.

Below are the formalized conventions that we adhere to in the PyIceberg project. The goal is to have a common agreement on how to evolve the codebase, and to serve as guidelines for newcomers to the project.

## API Compatibility
### API Compatibility

It is important to keep the Python public API compatible across versions. The official Python [PEP-8](https://peps.python.org/pep-0008/) defines public methods as: _Public attributes should have no leading underscores_. This means not removing any methods without notice, and not removing or renaming any existing parameters. Adding new optional parameters is okay.

@@ -202,12 +202,12 @@ Which will warn:
Deprecated in 0.1.0, will be removed in 0.2.0. The old_property is deprecated. Please use the something_else property instead.
```

## Type annotations
### Type annotations

For type annotations, the types from the `typing` package are used.

PyIceberg offers support from Python 3.9 onwards, so we can't use the [type hints from the standard collections](https://peps.python.org/pep-0585/).
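For illustration (not project code), the convention looks like this:

```python
# Import generics from typing rather than subscripting builtin collections,
# e.g. List[int] instead of list[int].
from typing import List, Optional


def newest_snapshot(ids: List[int]) -> Optional[int]:
    """Return the highest snapshot id, or None for an empty list."""
    return max(ids) if ids else None
```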

## Third party libraries
### Third party libraries

PyIceberg naturally integrates into the rich Python ecosystem; however, it is important to be hesitant about adding third-party packages. Adding a lot of packages makes the library heavyweight and causes incompatibilities with other projects if they use a different version of the library. Also, big libraries such as `s3fs`, `adlfs`, `pyarrow`, and `thrift` should be optional, to avoid downloading everything while not being sure it is actually being used.
2 changes: 2 additions & 0 deletions mkdocs/docs/expression-dsl.md
@@ -60,6 +60,8 @@ age_greater_than_18 = GreaterThan("age", 18)

# Greater than or equal to
age_greater_than_or_equal_18 = GreaterThanOrEqual("age", 18)


```
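These comparison predicates compose. As a sketch, a bounded range can be built by conjoining two of them with `And` from the same module:

```python
from pyiceberg.expressions import And, GreaterThanOrEqual, LessThan

# Sketch: rows where 18 <= age < 65, as a conjunction of two predicates
working_age = And(GreaterThanOrEqual("age", 18), LessThan("age", 65))
```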

#### Set Predicates
8 changes: 4 additions & 4 deletions mkdocs/docs/how-to-release.md
@@ -232,7 +232,7 @@ export LAST_COMMIT_ID=$(git rev-list ${GIT_TAG} 2> /dev/null | head -n 1)

cat << EOF > release-announcement-email.txt
To: dev@iceberg.apache.org
Subject: [VOTE] Release Apache PyIceberg $VERSION_WITH_RC
Subject: [VOTE] PyIceberg $VERSION_WITH_RC
Hi Everyone,

I propose that we release the following RC as the official PyIceberg $VERSION release.
@@ -351,9 +351,9 @@ Send out an announcement on the dev mail list:

```text
To: dev@iceberg.apache.org
Subject: [ANNOUNCE] Apache PyIceberg release <VERSION>
Subject: [ANNOUNCE] PyIceberg <VERSION>

I'm pleased to announce the release of Apache PyIceberg <VERSION>!
I'm pleased to announce the release of PyIceberg <VERSION>!

Apache Iceberg is an open table format for huge analytic datasets. Iceberg
delivers high query performance for tables with tens of petabytes of data,
@@ -397,7 +397,7 @@ Ensure to update the `PYICEBERG_VERSION` in the [Dockerfile](https://github.com/

### Set up GPG key and Upload to Apache Iceberg KEYS file

To set up GPG key locally, see the instructions [here](http://www.apache.org/dev/openpgp.html#key-gen-generate-key).
To set up GPG key locally, see the [instructions](http://www.apache.org/dev/openpgp.html#key-gen-generate-key).

To install gpg on an M1-based Mac, a couple of additional steps are required: <https://gist.github.com/phortuin/cf24b1cca3258720c71ad42977e1ba57>.

1 change: 1 addition & 0 deletions mkdocs/docs/index.md
@@ -46,6 +46,7 @@ You can mix and match optional dependencies depending on your needs:
| hive-kerberos | Support for Hive metastore in Kerberos environment |
| glue | Support for AWS Glue |
| dynamodb | Support for AWS DynamoDB |
| bigquery | Support for Google Cloud BigQuery |
| sql-postgres | Support for SQL Catalog backed by Postgresql |
| sql-sqlite | Support for SQL Catalog backed by SQLite |
| pyarrow | PyArrow as a FileIO implementation to interact with the object store |