From 78cd838c07ba925f1c797a603fca1542e60b7583 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Wed, 3 Sep 2025 11:24:48 -0700 Subject: [PATCH 01/11] 1.10.0rc4 --- dev/Dockerfile | 8 ++++---- tests/conftest.py | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/dev/Dockerfile b/dev/Dockerfile index 5292e26421..876bfa42e3 100644 --- a/dev/Dockerfile +++ b/dev/Dockerfile @@ -39,20 +39,20 @@ WORKDIR ${SPARK_HOME} # Remember to also update `tests/conftest`'s spark setting ENV SPARK_VERSION=3.5.6 ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12 -ENV ICEBERG_VERSION=1.9.1 +ENV ICEBERG_VERSION=1.10.0 ENV PYICEBERG_VERSION=0.9.1 +ENV BASE_ARTIFACT_URL=https://repository.apache.org/content/repositories/orgapacheiceberg-1268 RUN curl --retry 5 -s -C - https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \ && tar xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \ && rm -rf spark-${SPARK_VERSION}-bin-hadoop3.tgz # Download iceberg spark runtime -RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \ +RUN curl --retry 5 -s ${BASE_ARTIFACT_URL}/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \ -Lo /opt/spark/jars/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar - # Download AWS bundle -RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \ +RUN curl --retry 5 -s ${BASE_ARTIFACT_URL}/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \ -Lo /opt/spark/jars/iceberg-aws-bundle-${ICEBERG_VERSION}.jar COPY spark-defaults.conf /opt/spark/conf diff --git a/tests/conftest.py b/tests/conftest.py index 5aff45c1ed..2bdd36f8fa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2534,7 +2534,7 @@ def spark() -> "SparkSession": # Remember to also update `dev/Dockerfile` spark_version = ".".join(importlib.metadata.version("pyspark").split(".")[:2]) scala_version = "2.12" - iceberg_version = "1.9.2" + iceberg_version = "1.10.0" hadoop_version = "3.3.4" aws_sdk_version = "1.12.753" @@ -2551,6 +2551,7 @@ def spark() -> "SparkSession": spark = ( SparkSession.builder.appName("PyIceberg integration test") + .config("spark.jars.repositories", "https://repository.apache.org/content/repositories/orgapacheiceberg-1268/") .config("spark.sql.session.timeZone", "UTC") .config("spark.sql.shuffle.partitions", "1") .config("spark.default.parallelism", "1") From 8a177a2d6380a0b9506e24c590e36ec8654ab6ae Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 8 Sep 2025 09:21:21 -0700 Subject: [PATCH 02/11] RC5 --- dev/Dockerfile | 2 +- tests/conftest.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/Dockerfile b/dev/Dockerfile index 876bfa42e3..fe1bef289f 100644 --- a/dev/Dockerfile +++ b/dev/Dockerfile @@ -42,7 +42,7 @@ ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12 ENV ICEBERG_VERSION=1.10.0 ENV PYICEBERG_VERSION=0.9.1 -ENV BASE_ARTIFACT_URL=https://repository.apache.org/content/repositories/orgapacheiceberg-1268 +ENV BASE_ARTIFACT_URL=https://repository.apache.org/content/repositories/orgapacheiceberg-1269 RUN curl 
--retry 5 -s -C - https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \ && tar xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \ && rm -rf spark-${SPARK_VERSION}-bin-hadoop3.tgz diff --git a/tests/conftest.py b/tests/conftest.py index 2bdd36f8fa..49320c32fd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2551,7 +2551,7 @@ def spark() -> "SparkSession": spark = ( SparkSession.builder.appName("PyIceberg integration test") - .config("spark.jars.repositories", "https://repository.apache.org/content/repositories/orgapacheiceberg-1268/") + .config("spark.jars.repositories", "https://repository.apache.org/content/repositories/orgapacheiceberg-1269/") .config("spark.sql.session.timeZone", "UTC") .config("spark.sql.shuffle.partitions", "1") .config("spark.default.parallelism", "1") From 9b4492a8370f04efa851678487ab93c29ea645e1 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 8 Sep 2025 11:55:10 -0700 Subject: [PATCH 03/11] use spark 4.0.1 --- dev/Dockerfile | 4 +- poetry.lock | 106 +++++++++++++++++++++++----------------------- pyproject.toml | 3 +- tests/conftest.py | 2 +- 4 files changed, 58 insertions(+), 57 deletions(-) diff --git a/dev/Dockerfile b/dev/Dockerfile index fe1bef289f..515cc6f0c4 100644 --- a/dev/Dockerfile +++ b/dev/Dockerfile @@ -37,8 +37,8 @@ RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/ WORKDIR ${SPARK_HOME} # Remember to also update `tests/conftest`'s spark setting -ENV SPARK_VERSION=3.5.6 -ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12 +ENV SPARK_VERSION=4.0.1 +ENV ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13 ENV ICEBERG_VERSION=1.10.0 ENV PYICEBERG_VERSION=0.9.1 diff --git a/poetry.lock b/poetry.lock index 5ccc3d0682..868dabc90d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.4 and should not be changed by hand. 
[[package]] name = "adlfs" @@ -58,7 +58,7 @@ description = "Happy Eyeballs for asyncio" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\"" +markers = "extra == \"s3fs\" or extra == \"adlfs\" or extra == \"gcsfs\"" files = [ {file = "aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8"}, {file = "aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558"}, @@ -71,7 +71,7 @@ description = "Async http client/server framework (asyncio)" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\"" +markers = "extra == \"s3fs\" or extra == \"adlfs\" or extra == \"gcsfs\"" files = [ {file = "aiohttp-3.12.12-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6f25e9d274d6abbb15254f76f100c3984d6b9ad6e66263cc60a465dd5c7e48f5"}, {file = "aiohttp-3.12.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b8ec3c1a1c13d24941b5b913607e57b9364e4c0ea69d5363181467492c4b2ba6"}, @@ -201,7 +201,7 @@ description = "aiosignal: a list of registered asynchronous callbacks" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "(extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\") and (extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\" or extra == \"ray\")" +markers = "extra == \"ray\" or extra == \"s3fs\" or extra == \"adlfs\" or extra == \"gcsfs\"" files = [ {file = "aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5"}, {file = "aiosignal-1.3.2.tar.gz", hash = "sha256:a8c255c66fafb1e499c9351d0bf32ff2d8a0321595ebac3b93713656d2436f54"}, @@ -267,7 +267,7 @@ description = "Timeout context manager for asyncio programs" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "(extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\") and python_version <= \"3.10\"" +markers = "(extra == \"s3fs\" or extra == \"adlfs\" or extra == \"gcsfs\") and python_version <= \"3.10\"" files = [ {file = "async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c"}, {file = "async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3"}, @@ -284,7 +284,7 @@ files = [ {file = "attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3"}, {file = "attrs-25.3.0.tar.gz", hash = "sha256:75d7cefc7fb576747b2c81b4442d4d4a1ce0900973527c011d1030fd3bf4af1b"}, ] -markers = {main = "(extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\") and (extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\" or extra == \"ray\")"} +markers = {main = "extra == \"ray\" or extra == \"s3fs\" or extra == \"adlfs\" or extra == \"gcsfs\""} [package.extras] benchmark = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"] @@ -543,7 +543,7 @@ files = [ {file = "boto3-1.37.3-py3-none-any.whl", hash = "sha256:2063b40af99fd02f6228ff52397b552ff3353831edaf8d25cc04801827ab9794"}, {file = 
"boto3-1.37.3.tar.gz", hash = "sha256:21f3ce0ef111297e63a6eb998a25197b8c10982970c320d4c6e8db08be2157be"}, ] -markers = {main = "extra == \"dynamodb\" or extra == \"glue\" or extra == \"rest-sigv4\""} +markers = {main = "extra == \"glue\" or extra == \"dynamodb\" or extra == \"rest-sigv4\""} [package.dependencies] botocore = ">=1.37.3,<1.38.0" @@ -564,14 +564,14 @@ files = [ {file = "botocore-1.37.3-py3-none-any.whl", hash = "sha256:d01bd3bf4c80e61fa88d636ad9f5c9f60a551d71549b481386c6b4efe0bb2b2e"}, {file = "botocore-1.37.3.tar.gz", hash = "sha256:fe8403eb55a88faf9b0f9da6615e5bee7be056d75e17af66c3c8f0a3b0648da4"}, ] -markers = {main = "extra == \"dynamodb\" or extra == \"glue\" or extra == \"rest-sigv4\" or extra == \"s3fs\""} +markers = {main = "extra == \"glue\" or extra == \"dynamodb\" or extra == \"rest-sigv4\" or extra == \"s3fs\""} [package.dependencies] jmespath = ">=0.7.1,<2.0.0" python-dateutil = ">=2.1,<3.0.0" urllib3 = [ - {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""}, {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""}, + {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""}, ] [package.extras] @@ -1610,7 +1610,7 @@ description = "A list-like structure which implements collections.abc.MutableSeq optional = true python-versions = ">=3.9" groups = ["main"] -markers = "(extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\") and (extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\" or extra == \"ray\")" +markers = "extra == \"ray\" or extra == \"s3fs\" or extra == \"adlfs\" or extra == \"gcsfs\"" files = [ {file = "frozenlist-1.7.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cc4df77d638aa2ed703b878dd093725b72a824c3c546c076e8fdf276f78ee84a"}, {file = "frozenlist-1.7.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:716a9973a2cc963160394f701964fe25012600f3d311f60c790400b00e568b61"}, @@ -1809,7 +1809,7 @@ description = "Google API client core library" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"gcsfs\" or extra == \"bigquery\"" +markers = "extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "google_api_core-2.25.0-py3-none-any.whl", hash = "sha256:1db79d1281dcf9f3d10023283299ba38f3dc9f639ec41085968fd23e5bcf512e"}, {file = "google_api_core-2.25.0.tar.gz", hash = "sha256:9b548e688702f82a34ed8409fb8a6961166f0b7795032f0be8f48308dff4333a"}, @@ -1819,16 +1819,16 @@ files = [ google-auth = ">=2.14.1,<3.0.0" googleapis-common-protos = ">=1.56.2,<2.0.0" grpcio = [ - {version = ">=1.33.2,<2.0.0", optional = true, markers = "extra == \"grpc\""}, {version = ">=1.49.1,<2.0.0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, + {version = ">=1.33.2,<2.0.0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, ] grpcio-status = [ - {version = ">=1.33.2,<2.0.0", optional = true, markers = "extra == \"grpc\""}, {version = ">=1.49.1,<2.0.0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, + {version = ">=1.33.2,<2.0.0", optional = true, markers = "extra == \"grpc\""}, ] proto-plus = [ - {version = ">=1.22.3,<2.0.0"}, {version = ">=1.25.0,<2.0.0", markers = "python_version >= \"3.13\""}, + {version = ">=1.22.3,<2.0.0"}, ] protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<7.0.0" requests = ">=2.18.0,<3.0.0" @@ -1846,7 +1846,7 @@ description = 
"Google Authentication Library" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"gcsfs\" or extra == \"bigquery\" or extra == \"gcp-auth\"" +markers = "extra == \"gcp-auth\" or extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "google_auth-2.40.3-py2.py3-none-any.whl", hash = "sha256:1370d4593e86213563547f97a92752fc658456fe4514c809544f330fed45a7ca"}, {file = "google_auth-2.40.3.tar.gz", hash = "sha256:500c3a29adedeb36ea9cf24b8d10858e152f2412e3ca37829b3fa18e33d63b77"}, @@ -1928,7 +1928,7 @@ description = "Google Cloud API client core library" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"gcsfs\" or extra == \"bigquery\"" +markers = "extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "google_cloud_core-2.4.3-py2.py3-none-any.whl", hash = "sha256:5130f9f4c14b4fafdff75c79448f9495cfade0d8775facf1b09c3bf67e027f6e"}, {file = "google_cloud_core-2.4.3.tar.gz", hash = "sha256:1fab62d7102844b278fe6dead3af32408b1df3eb06f5c7e8634cbd40edc4da53"}, @@ -1973,7 +1973,7 @@ description = "A python wrapper of the C library 'Google CRC32C'" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"gcsfs\" or extra == \"bigquery\"" +markers = "extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "google_crc32c-1.7.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:b07d48faf8292b4db7c3d64ab86f950c2e94e93a11fd47271c28ba458e4a0d76"}, {file = "google_crc32c-1.7.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:7cc81b3a2fbd932a4313eb53cc7d9dde424088ca3a0337160f35d91826880c1d"}, @@ -2021,7 +2021,7 @@ description = "Utilities for Google Media Downloads and Resumable Uploads" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"gcsfs\" or extra == \"bigquery\"" +markers = "extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "google_resumable_media-2.7.2-py2.py3-none-any.whl", hash = "sha256:3ce7551e9fe6d99e9a126101d2536612bb73486721951e9562fee0f90c6ababa"}, {file = "google_resumable_media-2.7.2.tar.gz", hash = "sha256:5280aed4629f2b60b847b0d42f9857fd4935c11af266744df33d8074cae92fe0"}, @@ -2041,7 +2041,7 @@ description = "Common protobufs used in Google APIs" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"bigquery\" or extra == \"gcsfs\" or python_version <= \"3.10\" and (extra == \"gcsfs\" or extra == \"bigquery\")" +markers = "extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "googleapis_common_protos-1.70.0-py3-none-any.whl", hash = "sha256:b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8"}, {file = "googleapis_common_protos-1.70.0.tar.gz", hash = "sha256:0e1b44e0ea153e6594f9f394fef15193a68aaaea2d843f83e2742717ca753257"}, @@ -2075,7 +2075,7 @@ description = "Lightweight in-process concurrent programming" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "python_version < \"3.14\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\") and (extra == \"sql-postgres\" or extra == \"sql-sqlite\")" +markers = "(platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\") and (extra == \"sql-postgres\" or extra == 
\"sql-sqlite\") and python_version <= \"3.13\"" files = [ {file = "greenlet-3.2.3-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:1afd685acd5597349ee6d7a88a8bec83ce13c106ac78c196ee9dde7c04fe87be"}, {file = "greenlet-3.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:761917cac215c61e9dc7324b2606107b3b292a8349bdebb31503ab4de3f559ac"}, @@ -2484,7 +2484,7 @@ files = [ {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] -markers = {main = "extra == \"dynamodb\" or extra == \"glue\" or extra == \"rest-sigv4\" or extra == \"s3fs\""} +markers = {main = "extra == \"glue\" or extra == \"dynamodb\" or extra == \"rest-sigv4\" or extra == \"s3fs\""} [[package]] name = "joserfc" @@ -3344,7 +3344,7 @@ description = "multidict implementation" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\"" +markers = "extra == \"s3fs\" or extra == \"adlfs\" or extra == \"gcsfs\"" files = [ {file = "multidict-6.4.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8adee3ac041145ffe4488ea73fa0a622b464cc25340d98be76924d0cda8545ff"}, {file = "multidict-6.4.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b61e98c3e2a861035aaccd207da585bdcacef65fe01d7a0d07478efac005e028"}, @@ -3655,7 +3655,7 @@ description = "Fundamental package for array computing in Python" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "python_version < \"3.10\" and (extra == \"bodo\" or extra == \"pandas\" or extra == \"ray\")" +markers = "python_version < \"3.10\" and (extra == \"pandas\" or extra == \"ray\" or extra == \"bodo\")" files = [ {file = "numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece"}, {file = "numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04"}, @@ -3711,7 +3711,7 @@ description = "Fundamental package for array computing in Python" optional = true python-versions = ">=3.10" groups = ["main"] -markers = "python_version >= \"3.10\" and (extra == \"bodo\" or extra == \"pandas\" or extra == \"ray\")" +markers = "python_version >= \"3.10\" and (extra == \"pandas\" or extra == \"ray\" or extra == \"bodo\")" files = [ {file = "numpy-2.2.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b412caa66f72040e6d268491a59f2c43bf03eb6c96dd8f0307829feb7fa2b6fb"}, {file = "numpy-2.2.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e41fd67c52b86603a91c1a505ebaef50b3314de0213461c7a6e99c9a3beff90"}, @@ -3834,7 +3834,7 @@ files = [ {file = "packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484"}, {file = "packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f"}, ] -markers = {main = "(extra == \"bigquery\" or extra == \"hf\") and (extra == \"bigquery\" or extra == \"hf\" or extra == \"ray\")"} +markers = {main = "extra == \"ray\" or extra == \"bigquery\" or extra == \"hf\""} [[package]] name = "paginate" @@ -3859,7 +3859,7 @@ description = "Powerful data structures for data analysis, time series, and stat optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"bodo\" or extra == \"pandas\" or extra == \"ray\"" 
+markers = "extra == \"pandas\" or extra == \"ray\" or extra == \"bodo\"" files = [ {file = "pandas-2.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:52bc29a946304c360561974c6542d1dd628ddafa69134a7131fdfd6a5d7a1a35"}, {file = "pandas-2.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:220cc5c35ffaa764dd5bb17cf42df283b5cb7fdf49e10a7b053a06c9cb48ee2b"}, @@ -3907,9 +3907,9 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.22.4", markers = "python_version < \"3.11\""}, - {version = ">=1.23.2", markers = "python_version == \"3.11\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -4080,7 +4080,7 @@ description = "Accelerated property cache" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\"" +markers = "extra == \"s3fs\" or extra == \"adlfs\" or extra == \"gcsfs\"" files = [ {file = "propcache-0.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:22d9962a358aedbb7a2e36187ff273adeaab9743373a272976d2e348d08c7770"}, {file = "propcache-0.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0d0fda578d1dc3f77b6b5a5dce3b9ad69a8250a891760a548df850a5e8da87f3"}, @@ -4189,7 +4189,7 @@ description = "Beautiful, Pythonic protocol buffers" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "(python_version < \"3.12\" or python_version >= \"3.13\" or extra == \"bigquery\" or extra == \"gcsfs\") and (extra == \"gcsfs\" or extra == \"bigquery\")" +markers = "extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "proto_plus-1.26.1-py3-none-any.whl", hash = "sha256:13285478c2dcf2abb829db158e1047e2f1e8d63a077d94263c2b88b043c75a66"}, {file = "proto_plus-1.26.1.tar.gz", hash = "sha256:21a515a4c4c0088a773899e23c7bbade3d18f9c66c73edd4c7ee3816bc96a012"}, @@ -4208,7 +4208,7 @@ description = "" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "(extra == \"gcsfs\" or extra == \"bigquery\") and (extra == \"gcsfs\" or extra == \"bigquery\" or extra == \"ray\")" +markers = "extra == \"ray\" or extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "protobuf-6.31.1-cp310-abi3-win32.whl", hash = "sha256:7fa17d5a29c2e04b7d90e5e32388b8bfd0e7107cd8e616feef7ed3fa6bdab5c9"}, {file = "protobuf-6.31.1-cp310-abi3-win_amd64.whl", hash = "sha256:426f59d2964864a1a366254fa703b8632dcec0790d8862d30034d8245e1cd447"}, @@ -4358,14 +4358,14 @@ dev = ["black (==22.6.0)", "flake8", "mypy", "pytest"] [[package]] name = "py4j" -version = "0.10.9.7" +version = "0.10.9.9" description = "Enables Python programs to dynamically access arbitrary Java objects" optional = false python-versions = "*" groups = ["dev"] files = [ - {file = "py4j-0.10.9.7-py2.py3-none-any.whl", hash = "sha256:85defdfd2b2376eb3abf5ca6474b51ab7e0de341c75a02f46dc9b5976f5a5c1b"}, - {file = "py4j-0.10.9.7.tar.gz", hash = "sha256:0b6e5315bb3ada5cf62ac651d107bb2ebc02def3dee9d9548e3baac644ea8dbb"}, + {file = "py4j-0.10.9.9-py2.py3-none-any.whl", hash = "sha256:c7c26e4158defb37b0bb124933163641a2ff6e3a3913f7811b0ddbe07ed61533"}, + {file = "py4j-0.10.9.9.tar.gz", hash = "sha256:f694cad19efa5bd1dee4f3e5270eb406613c974394035e5bfc4ec1aba870b879"}, ] [[package]] @@ -4375,7 +4375,7 @@ description = "Python library for Apache Arrow" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"bodo\" 
or extra == \"daft\" or extra == \"datafusion\" or extra == \"duckdb\" or extra == \"pandas\" or extra == \"pyarrow\" or extra == \"ray\"" +markers = "extra == \"pyarrow\" or extra == \"pandas\" or extra == \"duckdb\" or extra == \"ray\" or extra == \"bodo\" or extra == \"daft\" or extra == \"datafusion\"" files = [ {file = "pyarrow-19.0.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:fc28912a2dc924dddc2087679cc8b7263accc71b9ff025a1362b004711661a69"}, {file = "pyarrow-19.0.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec"}, @@ -4431,7 +4431,7 @@ description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"gcsfs\" or extra == \"bigquery\" or extra == \"gcp-auth\"" +markers = "extra == \"gcp-auth\" or extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629"}, {file = "pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034"}, @@ -4444,7 +4444,7 @@ description = "A collection of ASN.1-based protocols modules" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"gcsfs\" or extra == \"bigquery\" or extra == \"gcp-auth\"" +markers = "extra == \"gcp-auth\" or extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a"}, {file = "pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6"}, @@ -4796,24 +4796,24 @@ files = [ [[package]] name = "pyspark" -version = "3.5.6" +version = "4.0.1" description = "Apache Spark Python API" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "pyspark-3.5.6.tar.gz", hash = "sha256:f8b1c4360e41ab398c64904fae08740503bcb6bd389457d659fa6d9f2952cc48"}, + {file = "pyspark-4.0.1.tar.gz", hash = "sha256:9d1f22d994f60369228397e3479003ffe2dd736ba79165003246ff7bd48e2c73"}, ] [package.dependencies] -py4j = "0.10.9.7" +py4j = "0.10.9.9" [package.extras] -connect = ["googleapis-common-protos (>=1.56.4)", "grpcio (>=1.56.0)", "grpcio-status (>=1.56.0)", "numpy (>=1.15,<2)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] -ml = ["numpy (>=1.15,<2)"] -mllib = ["numpy (>=1.15,<2)"] -pandas-on-spark = ["numpy (>=1.15,<2)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] -sql = ["numpy (>=1.15,<2)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] +connect = ["googleapis-common-protos (>=1.65.0)", "grpcio (>=1.67.0)", "grpcio-status (>=1.67.0)", "numpy (>=1.21)", "pandas (>=2.0.0)", "pyarrow (>=11.0.0)"] +ml = ["numpy (>=1.21)"] +mllib = ["numpy (>=1.21)"] +pandas-on-spark = ["numpy (>=1.21)", "pandas (>=2.0.0)", "pyarrow (>=11.0.0)"] +sql = ["numpy (>=1.21)", "pandas (>=2.0.0)", "pyarrow (>=11.0.0)"] [[package]] name = "pytest" @@ -4929,7 +4929,7 @@ description = "World timezone definitions, modern and historical" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"bodo\" or extra == \"pandas\" or extra == \"ray\"" +markers = "extra == \"pandas\" or extra == \"ray\" or extra == \"bodo\"" files = [ {file = "pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00"}, {file = "pytz-2025.2.tar.gz", hash = 
"sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3"}, @@ -5506,7 +5506,7 @@ description = "Pure-Python RSA implementation" optional = true python-versions = "<4,>=3.6" groups = ["main"] -markers = "extra == \"gcsfs\" or extra == \"bigquery\" or extra == \"gcp-auth\"" +markers = "extra == \"gcp-auth\" or extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762"}, {file = "rsa-4.9.1.tar.gz", hash = "sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75"}, @@ -5548,7 +5548,7 @@ files = [ {file = "s3transfer-0.11.3-py3-none-any.whl", hash = "sha256:ca855bdeb885174b5ffa95b9913622459d4ad8e331fc98eb01e6d5eb6a30655d"}, {file = "s3transfer-0.11.3.tar.gz", hash = "sha256:edae4977e3a122445660c7c114bba949f9d191bae3b34a096f18a1c8c354527a"}, ] -markers = {main = "extra == \"dynamodb\" or extra == \"glue\" or extra == \"rest-sigv4\""} +markers = {main = "extra == \"glue\" or extra == \"dynamodb\" or extra == \"rest-sigv4\""} [package.dependencies] botocore = ">=1.36.0,<2.0a.0" @@ -6056,7 +6056,7 @@ description = "Fast, Extensible Progress Meter" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"daft\" or extra == \"hf\"" +markers = "extra == \"hf\" or extra == \"daft\"" files = [ {file = "tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2"}, {file = "tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2"}, @@ -6107,7 +6107,7 @@ description = "Provider of IANA time zone data" optional = true python-versions = ">=2" groups = ["main"] -markers = "extra == \"bodo\" or extra == \"pandas\" or extra == \"ray\"" +markers = "extra == \"pandas\" or extra == \"ray\" or extra == \"bodo\"" files = [ {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, @@ -6341,7 +6341,7 @@ description = "Yet another URL library" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\"" +markers = "extra == \"s3fs\" or extra == \"adlfs\" or extra == \"gcsfs\"" files = [ {file = "yarl-1.20.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6032e6da6abd41e4acda34d75a816012717000fa6839f37124a47fcefc49bec4"}, {file = "yarl-1.20.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2c7b34d804b8cf9b214f05015c4fee2ebe7ed05cf581e7192c06555c71f4446a"}, @@ -6618,4 +6618,4 @@ zstandard = ["zstandard"] [metadata] lock-version = "2.1" python-versions = "^3.9.2, !=3.9.7" -content-hash = "432569eecac43492ae9237e594aefce04cab271ba9a4890f70e7c6086e82e594" +content-hash = "5082f096c8c6bef01cbba4470e2ed885af0f42045ce2e75c0f027be5287433a9" diff --git a/pyproject.toml b/pyproject.toml index 42bbf11e66..32a1322249 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,7 +101,7 @@ requests-mock = "1.12.1" moto = { version = "^5.0.2", extras = ["server"] } typing-extensions = "4.15.0" pytest-mock = "3.14.1" -pyspark = "3.5.6" +pyspark = "4.0.1" cython = "3.1.3" deptry = ">=0.14,<0.24" docutils = "!=0.21.post1" # https://github.com/python-poetry/poetry/issues/9248#issuecomment-2026240520 @@ -356,6 +356,7 @@ filterwarnings = [ "ignore:datetime.datetime.utcnow\\(\\) is deprecated and scheduled for 
removal in a future version.", # Latest PySpark version (v3.5.3) throws this error, remove in a future release of PySpark (possibly v4.0.0). "ignore:is_datetime64tz_dtype is deprecated and will be removed in a future version.", + "ignore::DeprecationWarning:jupyter_core.*" ] [tool.black] diff --git a/tests/conftest.py b/tests/conftest.py index 49320c32fd..9796d56859 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2533,7 +2533,7 @@ def spark() -> "SparkSession": # Remember to also update `dev/Dockerfile` spark_version = ".".join(importlib.metadata.version("pyspark").split(".")[:2]) - scala_version = "2.12" + scala_version = "2.13" iceberg_version = "1.10.0" hadoop_version = "3.3.4" aws_sdk_version = "1.12.753" From 66f20cf5aa221f0a51213fb42ea7bf3714e34b53 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 8 Sep 2025 12:07:08 -0700 Subject: [PATCH 04/11] fix warning --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 32a1322249..59d97c3c33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -356,7 +356,7 @@ filterwarnings = [ "ignore:datetime.datetime.utcnow\\(\\) is deprecated and scheduled for removal in a future version.", # Latest PySpark version (v3.5.3) throws this error, remove in a future release of PySpark (possibly v4.0.0). "ignore:is_datetime64tz_dtype is deprecated and will be removed in a future version.", - "ignore::DeprecationWarning:jupyter_core.*" + "ignore:.*Jupyter is migrating.*:DeprecationWarning", ] [tool.black] From 044fd651e26f625dd982a95c70f2da862f29707c Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 8 Sep 2025 15:14:13 -0700 Subject: [PATCH 05/11] use cdn, it's faster --- dev/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/Dockerfile b/dev/Dockerfile index 515cc6f0c4..721495e7af 100644 --- a/dev/Dockerfile +++ b/dev/Dockerfile @@ -43,7 +43,7 @@ ENV ICEBERG_VERSION=1.10.0 ENV PYICEBERG_VERSION=0.9.1 ENV BASE_ARTIFACT_URL=https://repository.apache.org/content/repositories/orgapacheiceberg-1269 -RUN curl --retry 5 -s -C - https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \ +RUN curl --retry 5 -s -C - https://downloads.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \ && tar xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \ && rm -rf spark-${SPARK_VERSION}-bin-hadoop3.tgz From c280a709a70b034f6a2aebce2a939628df55abe9 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 8 Sep 2025 15:34:05 -0700 Subject: [PATCH 06/11] fix spark 4 --- dev/Dockerfile | 3 ++- dev/provision.py | 1 + tests/conftest.py | 7 ++++++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/dev/Dockerfile b/dev/Dockerfile index 721495e7af..c14aae1496 100644 --- a/dev/Dockerfile +++ b/dev/Dockerfile @@ -21,7 +21,7 @@ RUN apt-get -qq update && \ curl \ vim \ unzip \ - openjdk-11-jdk \ + openjdk-17-jdk \ build-essential \ software-properties-common \ ssh && \ @@ -62,6 +62,7 @@ RUN chmod u+x /opt/spark/sbin/* && \ chmod u+x /opt/spark/bin/* RUN pip3 install -q ipython +RUN pip3 install py4j RUN pip3 install "pyiceberg[s3fs,hive,pyarrow]==${PYICEBERG_VERSION}" diff --git a/dev/provision.py b/dev/provision.py index 231f5123ce..6da7b70442 100644 --- a/dev/provision.py +++ b/dev/provision.py @@ -32,6 +32,7 @@ .builder .config("spark.sql.shuffle.partitions", "1")
.config("spark.default.parallelism", "1") + .config("spark.sql.ansi.enabled", "false") .getOrCreate() ) diff --git a/tests/conftest.py b/tests/conftest.py index 9796d56859..9389b4a87f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2535,7 +2535,7 @@ def spark() -> "SparkSession": spark_version = ".".join(importlib.metadata.version("pyspark").split(".")[:2]) scala_version = "2.13" iceberg_version = "1.10.0" - hadoop_version = "3.3.4" + hadoop_version = "3.3.6" aws_sdk_version = "1.12.753" os.environ["PYSPARK_SUBMIT_ARGS"] = ( @@ -2577,6 +2577,11 @@ def spark() -> "SparkSession": .config("spark.sql.catalog.spark_catalog.warehouse", "s3://warehouse/hive/") .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000") .config("spark.hadoop.fs.s3a.path.style.access", "true") + .config("spark.hadoop.fs.s3a.threads.keepalivetime", "60000") + .config("spark.hadoop.fs.s3a.connection.establish.timeout", "30000") + .config("spark.hadoop.fs.s3a.connection.timeout", "200000") + .config("spark.hadoop.fs.s3a.multipart.purge.age", str(24 * 60 * 60)) + .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.EnvironmentVariableCredentialsProvider") .config("spark.sql.catalogImplementation", "hive") .config("spark.sql.defaultCatalog", "integration") .config("spark.sql.execution.arrow.pyspark.enabled", "true") From 8b603ab562d1838bc89807ef667d5c42c26b7a20 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 8 Sep 2025 17:55:02 -0700 Subject: [PATCH 07/11] use java 17 in ci --- .github/workflows/python-ci.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 63559c35d4..60dafce45b 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -56,6 +56,11 @@ jobs: - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} + - name: Set up JDK 17 for Spark 4 + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: 17 - name: Install system dependencies run: sudo apt-get update && sudo apt-get install -y libkrb5-dev # for kerberos - name: Install From c659f6e3861a23b394e46a3ade9e0a094dcd6070 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 8 Sep 2025 18:09:19 -0700 Subject: [PATCH 08/11] use java 17 --- .github/workflows/python-ci.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 60dafce45b..3ed1bc5a1b 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -80,6 +80,11 @@ jobs: steps: - uses: actions/checkout@v5 + - name: Set up JDK 17 for Spark 4 + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: 17 - name: Install system dependencies run: sudo apt-get update && sudo apt-get install -y libkrb5-dev # for kerberos - name: Install From 1e47f0fab053708ed49733d06d552377bb3bf290 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 8 Sep 2025 19:48:44 -0700 Subject: [PATCH 09/11] 3.3.x < hadoop <= 3.4.1 --- dev/hive/Dockerfile | 4 ++-- tests/conftest.py | 7 +------ 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/dev/hive/Dockerfile b/dev/hive/Dockerfile index 2ff3dbce67..40a721e6a9 100644 --- a/dev/hive/Dockerfile +++ b/dev/hive/Dockerfile @@ -17,7 +17,7 @@ FROM openjdk:8-jre-slim AS build RUN apt-get update -qq && apt-get -qq -y install curl -ENV HADOOP_VERSION=3.3.6 +ENV HADOOP_VERSION=3.4.1 ENV AWS_SDK_BUNDLE=1.12.753 RUN curl 
https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -Lo /tmp/hadoop-aws-${HADOOP_VERSION}.jar @@ -25,7 +25,7 @@ RUN curl https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_ FROM apache/hive:4.0.0 -ENV HADOOP_VERSION=3.3.6 +ENV HADOOP_VERSION=3.4.1 ENV AWS_SDK_BUNDLE=1.12.753 COPY --from=build /tmp/hadoop-aws-${HADOOP_VERSION}.jar /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar diff --git a/tests/conftest.py b/tests/conftest.py index 9389b4a87f..e10df8fe75 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2535,7 +2535,7 @@ def spark() -> "SparkSession": spark_version = ".".join(importlib.metadata.version("pyspark").split(".")[:2]) scala_version = "2.13" iceberg_version = "1.10.0" - hadoop_version = "3.3.6" + hadoop_version = "3.4.1" aws_sdk_version = "1.12.753" os.environ["PYSPARK_SUBMIT_ARGS"] = ( @@ -2577,11 +2577,6 @@ def spark() -> "SparkSession": .config("spark.sql.catalog.spark_catalog.warehouse", "s3://warehouse/hive/") .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000") .config("spark.hadoop.fs.s3a.path.style.access", "true") - .config("spark.hadoop.fs.s3a.threads.keepalivetime", "60000") - .config("spark.hadoop.fs.s3a.connection.establish.timeout", "30000") - .config("spark.hadoop.fs.s3a.connection.timeout", "200000") - .config("spark.hadoop.fs.s3a.multipart.purge.age", str(24 * 60 * 60)) - .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.EnvironmentVariableCredentialsProvider") .config("spark.sql.catalogImplementation", "hive") .config("spark.sql.defaultCatalog", "integration") .config("spark.sql.execution.arrow.pyspark.enabled", "true") From 446f208bb5166979f5753c89e8b6ddf2738e953d Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 8 Sep 2025 21:03:28 -0700 Subject: [PATCH 10/11] don't change hive --- dev/hive/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/hive/Dockerfile b/dev/hive/Dockerfile index 40a721e6a9..2ff3dbce67 100644 --- a/dev/hive/Dockerfile +++ b/dev/hive/Dockerfile @@ -17,7 +17,7 @@ FROM openjdk:8-jre-slim AS build RUN apt-get update -qq && apt-get -qq -y install curl -ENV HADOOP_VERSION=3.4.1 +ENV HADOOP_VERSION=3.3.6 ENV AWS_SDK_BUNDLE=1.12.753 RUN curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -Lo /tmp/hadoop-aws-${HADOOP_VERSION}.jar @@ -25,7 +25,7 @@ RUN curl https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_ FROM apache/hive:4.0.0 -ENV HADOOP_VERSION=3.4.1 +ENV HADOOP_VERSION=3.3.6 ENV AWS_SDK_BUNDLE=1.12.753 COPY --from=build /tmp/hadoop-aws-${HADOOP_VERSION}.jar /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar From 77c978488c00403ae465796490f84226f41a3589 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Tue, 9 Sep 2025 09:03:39 -0700 Subject: [PATCH 11/11] reuse py4j --- dev/Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dev/Dockerfile b/dev/Dockerfile index c14aae1496..e4f3879be7 100644 --- a/dev/Dockerfile +++ b/dev/Dockerfile @@ -31,7 +31,7 @@ RUN apt-get -qq update && \ # Optional env variables ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"} ENV HADOOP_HOME=${HADOOP_HOME:-"/opt/hadoop"} -ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH +ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.9-src.zip:$PYTHONPATH RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/spark-events WORKDIR ${SPARK_HOME}
@@ -62,7 +62,6 @@ RUN chmod u+x /opt/spark/sbin/* && \ chmod u+x /opt/spark/bin/* RUN pip3 install -q ipython -RUN pip3 install py4j RUN pip3 install "pyiceberg[s3fs,hive,pyarrow]==${PYICEBERG_VERSION}"
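
For a quick standalone check of the staged artifacts outside the docker-compose setup, a local PySpark 4.0.1 session can pull the same Iceberg 1.10.0 runtime from the staging repository referenced in this series. The snippet below is only a rough sketch, not the integration-test fixture from tests/conftest.py; the catalog name "local" and the warehouse path are illustrative assumptions.

from pyspark.sql import SparkSession

# Versions mirror dev/Dockerfile and tests/conftest.py in this series.
iceberg_version = "1.10.0"
runtime_pkg = f"org.apache.iceberg:iceberg-spark-runtime-4.0_2.13:{iceberg_version}"
# Apache staging repository holding the 1.10.0 release-candidate jars.
staging_repo = "https://repository.apache.org/content/repositories/orgapacheiceberg-1269/"

spark = (
    SparkSession.builder.appName("iceberg-1.10.0-rc-smoke-test")
    .config("spark.jars.packages", runtime_pkg)
    .config("spark.jars.repositories", staging_repo)
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    # "local" is an illustrative catalog name backed by a Hadoop warehouse on local disk.
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", "/tmp/iceberg-rc-warehouse")
    .getOrCreate()
)

print(spark.version)  # expected: 4.0.1, running on a Java 17 JVM
spark.sql("CREATE TABLE IF NOT EXISTS local.default.rc_check (id BIGINT) USING iceberg")
spark.sql("SELECT * FROM local.default.rc_check").show()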