
Commit 5ee5eea

CI: Use Spark base image for Docker (#2540)
# Rationale for this change

Closes #1527

This PR modifies `dev/Dockerfile` to use the official Spark image as the base image. This should be better than downloading and unpacking a Spark distribution at build time, and likely faster on GitHub runners.

This PR also:

- modifies `provision.py` to use Spark Connect
- adds a healthcheck to the Spark Docker container

## Are these changes tested?

## Are there any user-facing changes?
1 parent 21416d6 commit 5ee5eea
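
With this change, provisioning runs from the host against the Spark Connect endpoint exposed by the `spark-iceberg` container; the compose healthcheck on port 15002 is what lets `docker compose up -d --wait` block until the server is ready. Below is a minimal sketch (not part of the diff) of that client-side flow, assuming the integration stack is up and `pyspark[connect]` plus `pyiceberg` are installed locally:

```python
from pyspark.sql import SparkSession

from pyiceberg.catalog import load_catalog

# Spark Connect server published by the spark-iceberg container on port 15002,
# the same endpoint dev/provision.py now targets.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
spark.sql("SELECT 1 AS ok").show()

# The REST catalog and MinIO are reachable from the host via localhost,
# matching the endpoints in the updated provision.py.
catalog = load_catalog(
    "rest",
    **{
        "type": "rest",
        "uri": "http://localhost:8181",
        "s3.endpoint": "http://localhost:9000",
        "s3.access-key-id": "admin",
        "s3.secret-access-key": "password",
    },
)
print(catalog.list_namespaces())
```

Because the script talks to Spark Connect over the network, it no longer needs to be copied into the container before running.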

9 files changed (+88, -145 lines)

.github/workflows/python-ci.yml

Lines changed: 3 additions & 0 deletions
```diff
@@ -75,6 +75,9 @@ jobs:
 
     steps:
       - uses: actions/checkout@v5
+      - uses: actions/setup-python@v6
+        with:
+          python-version: ${{ matrix.python }}
       - name: Install system dependencies
         run: sudo apt-get update && sudo apt-get install -y libkrb5-dev # for kerberos
       - name: Install
```

Makefile

Lines changed: 2 additions & 4 deletions
```diff
@@ -100,10 +100,8 @@ test-integration: test-integration-setup test-integration-exec test-integration-
 test-integration-setup: ## Start Docker services for integration tests
     docker compose -f dev/docker-compose-integration.yml kill
     docker compose -f dev/docker-compose-integration.yml rm -f
-    docker compose -f dev/docker-compose-integration.yml up -d
-    sleep 10
-    docker compose -f dev/docker-compose-integration.yml cp ./dev/provision.py spark-iceberg:/opt/spark/provision.py
-    docker compose -f dev/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py
+    docker compose -f dev/docker-compose-integration.yml up -d --wait
+    $(POETRY) run python dev/provision.py
 
 test-integration-exec: ## Run integration tests (excluding provision)
     $(TEST_RUNNER) pytest tests/ -m integration $(PYTEST_ARGS)
```

dev/Dockerfile

Lines changed: 47 additions & 76 deletions
```diff
@@ -13,86 +13,57 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-FROM python:3.12-bullseye
+ARG BASE_IMAGE_SPARK_VERSION=3.5.6
 
-RUN apt-get -qq update && \
-    apt-get -qq install -y --no-install-recommends \
-      sudo \
-      curl \
-      vim \
-      unzip \
-      openjdk-11-jdk \
-      build-essential \
-      software-properties-common \
-      ssh && \
-    apt-get -qq clean && \
-    rm -rf /var/lib/apt/lists/*
+FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}
 
-# Optional env variables
-ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"}
-ENV HADOOP_HOME=${HADOOP_HOME:-"/opt/hadoop"}
-ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH
+# Dependency versions - keep these compatible
+ARG ICEBERG_VERSION=1.10.0
+ARG ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12
+ARG SPARK_VERSION=3.5.6
+ARG SCALA_VERSION=2.12
+ARG HADOOP_VERSION=3.3.4
+ARG AWS_SDK_VERSION=1.12.753
+ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2
 
-RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/spark-events
+USER root
 WORKDIR ${SPARK_HOME}
 
-ENV SPARK_VERSION=3.5.6
-ENV SCALA_VERSION=2.12
-ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_${SCALA_VERSION}
-ENV ICEBERG_VERSION=1.10.0
-ENV PYICEBERG_VERSION=0.10.0
-ENV HADOOP_VERSION=3.3.4
-ENV AWS_SDK_VERSION=1.12.753
-
-# Try the primary Apache mirror (downloads.apache.org) first, then fall back to the archive
-RUN set -eux; \
-    FILE=spark-${SPARK_VERSION}-bin-hadoop3.tgz; \
-    URLS="https://downloads.apache.org/spark/spark-${SPARK_VERSION}/${FILE} https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${FILE}"; \
-    for url in $URLS; do \
-      echo "Attempting download: $url"; \
-      if curl --retry 3 --retry-delay 5 -f -s -C - "$url" -o "$FILE"; then \
-        echo "Downloaded from: $url"; \
-        break; \
-      else \
-        echo "Failed to download from: $url"; \
-      fi; \
-    done; \
-    if [ ! -f "$FILE" ]; then echo "Failed to download Spark from all mirrors" >&2; exit 1; fi; \
-    tar xzf "$FILE" --directory /opt/spark --strip-components 1; \
-    rm -rf "$FILE"
-
-# Download Spark Connect server JAR
-RUN curl --retry 5 -s -L https://repo1.maven.org/maven2/org/apache/spark/spark-connect_${SCALA_VERSION}/${SPARK_VERSION}/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar \
-    -Lo /opt/spark/jars/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar
-
-# Download iceberg spark runtime
-RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
-    -Lo /opt/spark/jars/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar
-
-# Download AWS bundle
-RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
-    -Lo /opt/spark/jars/iceberg-aws-bundle-${ICEBERG_VERSION}.jar
-
-# Download hadoop-aws (required for S3 support)
-RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
-    -Lo /opt/spark/jars/hadoop-aws-${HADOOP_VERSION}.jar
-
-# Download AWS SDK bundle
-RUN curl --retry 5 -s https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar \
-    -Lo /opt/spark/jars/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar
-
-COPY spark-defaults.conf /opt/spark/conf
-ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}"
-
-RUN chmod u+x /opt/spark/sbin/* && \
-    chmod u+x /opt/spark/bin/*
-
-RUN pip3 install -q ipython
-
-RUN pip3 install "pyiceberg[s3fs,hive,pyarrow]==${PYICEBERG_VERSION}"
+# Install curl for JAR downloads
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends curl && \
+    rm -rf /var/lib/apt/lists/*
 
-COPY entrypoint.sh .
-COPY provision.py .
+# Copy configuration (early for better caching)
+COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
+
+# Create event log directory
+RUN mkdir -p /home/iceberg/spark-events && \
+    chown -R spark:spark /home/iceberg
+
+# Required JAR dependencies
+ENV JARS_TO_DOWNLOAD="\
+    org/apache/spark/spark-connect_${SCALA_VERSION}/${SPARK_VERSION}/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar \
+    org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
+    org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
+    org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
+    com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar"
+
+# Download JARs with retry logic
+RUN set -e && \
+    cd "${SPARK_HOME}/jars" && \
+    for jar_path in ${JARS_TO_DOWNLOAD}; do \
+      jar_name=$(basename "${jar_path}") && \
+      echo "Downloading ${jar_name}..." && \
+      curl -fsSL --retry 3 --retry-delay 5 \
+        -o "${jar_name}" \
+        "${MAVEN_MIRROR}/${jar_path}" && \
+      echo "✓ Downloaded ${jar_name}"; \
+    done && \
+    chown -R spark:spark "${SPARK_HOME}/jars"
+
+USER spark
+WORKDIR ${SPARK_HOME}
 
-ENTRYPOINT ["./entrypoint.sh"]
-CMD ["notebook"]
+# Start Spark Connect server
+CMD ["sh", "-c", "SPARK_NO_DAEMONIZE=true ${SPARK_HOME}/sbin/start-connect-server.sh"]
```

dev/docker-compose-integration.yml

Lines changed: 7 additions & 2 deletions
```diff
@@ -17,7 +17,6 @@
 
 services:
   spark-iceberg:
-    image: python-integration
     container_name: pyiceberg-spark
     build: .
     networks:
@@ -37,6 +36,12 @@ services:
       - rest:rest
       - hive:hive
       - minio:minio
+    healthcheck:
+      test: ["CMD", "sh", "-c", "netstat -an | grep 15002 | grep LISTEN"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 90s
   rest:
     image: apache/iceberg-rest-fixture
     container_name: pyiceberg-rest
@@ -87,7 +92,7 @@ services:
       "
   hive:
     build: hive/
-    container_name: hive
+    container_name: pyiceberg-hive
     hostname: hive
     networks:
       iceberg_net:
```

dev/entrypoint.sh

Lines changed: 0 additions & 23 deletions
This file was deleted.

dev/provision.py

Lines changed: 21 additions & 35 deletions
```diff
@@ -14,7 +14,6 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import math
 
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import current_date, date_add, expr
@@ -23,35 +22,26 @@
 from pyiceberg.schema import Schema
 from pyiceberg.types import FixedType, NestedField, UUIDType
 
-# The configuration is important, otherwise we get many small
-# parquet files with a single row. When a positional delete
-# hits the Parquet file with one row, the parquet file gets
-# dropped instead of having a merge-on-read delete file.
-spark = (
-    SparkSession
-    .builder
-    .config("spark.sql.shuffle.partitions", "1")
-    .config("spark.default.parallelism", "1")
-    .getOrCreate()
-)
+# Create SparkSession against the remote Spark Connect server
+spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
 
 catalogs = {
-    'rest': load_catalog(
+    "rest": load_catalog(
         "rest",
         **{
             "type": "rest",
-            "uri": "http://rest:8181",
-            "s3.endpoint": "http://minio:9000",
+            "uri": "http://localhost:8181",
+            "s3.endpoint": "http://localhost:9000",
             "s3.access-key-id": "admin",
             "s3.secret-access-key": "password",
         },
     ),
-    'hive': load_catalog(
+    "hive": load_catalog(
         "hive",
         **{
             "type": "hive",
-            "uri": "thrift://hive:9083",
-            "s3.endpoint": "http://minio:9000",
+            "uri": "thrift://localhost:9083",
+            "s3.endpoint": "http://localhost:9000",
             "s3.access-key-id": "admin",
             "s3.secret-access-key": "password",
         },
@@ -119,7 +109,7 @@
     # v3: Using deletion vectors
 
     for format_version in [2, 3]:
-        identifier = f'{catalog_name}.default.test_positional_mor_deletes_v{format_version}'
+        identifier = f"{catalog_name}.default.test_positional_mor_deletes_v{format_version}"
         spark.sql(
             f"""
             CREATE OR REPLACE TABLE {identifier} (
@@ -137,10 +127,8 @@
             """
         )
 
-        spark.sql(
-            f"""
-            INSERT INTO {identifier}
-            VALUES
+        spark.sql("""
+            SELECT * FROM VALUES
             (CAST('2023-03-01' AS date), 1, 'a'),
             (CAST('2023-03-02' AS date), 2, 'b'),
             (CAST('2023-03-03' AS date), 3, 'c'),
@@ -152,9 +140,9 @@
             (CAST('2023-03-09' AS date), 9, 'i'),
             (CAST('2023-03-10' AS date), 10, 'j'),
             (CAST('2023-03-11' AS date), 11, 'k'),
-            (CAST('2023-03-12' AS date), 12, 'l');
-            """
-        )
+            (CAST('2023-03-12' AS date), 12, 'l')
+            AS t(dt, number, letter)
+        """).coalesce(1).writeTo(identifier).append()
 
         spark.sql(f"ALTER TABLE {identifier} CREATE TAG tag_12")
 
@@ -164,7 +152,7 @@
 
         spark.sql(f"DELETE FROM {identifier} WHERE number = 9")
 
-        identifier = f'{catalog_name}.default.test_positional_mor_double_deletes_v{format_version}'
+        identifier = f"{catalog_name}.default.test_positional_mor_double_deletes_v{format_version}"
 
         spark.sql(
             f"""
@@ -178,15 +166,13 @@
                 'write.delete.mode'='merge-on-read',
                 'write.update.mode'='merge-on-read',
                 'write.merge.mode'='merge-on-read',
-                'format-version'='2'
+                'format-version'='{format_version}'
             );
             """
         )
 
-        spark.sql(
-            f"""
-            INSERT INTO {identifier}
-            VALUES
+        spark.sql("""
+            SELECT * FROM VALUES
             (CAST('2023-03-01' AS date), 1, 'a'),
             (CAST('2023-03-02' AS date), 2, 'b'),
             (CAST('2023-03-03' AS date), 3, 'c'),
@@ -198,9 +184,9 @@
             (CAST('2023-03-09' AS date), 9, 'i'),
             (CAST('2023-03-10' AS date), 10, 'j'),
             (CAST('2023-03-11' AS date), 11, 'k'),
-            (CAST('2023-03-12' AS date), 12, 'l');
-            """
-        )
+            (CAST('2023-03-12' AS date), 12, 'l')
+            AS t(dt, number, letter)
+        """).coalesce(1).writeTo(identifier).append()
 
         # Perform two deletes, should produce:
         # v2: two positional delete files in v2
```
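
The rewritten inserts build the rows with `SELECT * FROM VALUES ... AS t(dt, number, letter)` and append through the DataFrameWriterV2 API; `coalesce(1)` appears to take over the role of the removed `spark.sql.shuffle.partitions`/`spark.default.parallelism` settings, keeping each insert in a single Parquet file so positional deletes produce merge-on-read delete files instead of dropping one-row data files. A minimal sketch of the pattern (not part of the diff), with a hypothetical table name:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

# Hypothetical identifier for illustration; provision.py derives it per catalog
# and format version, and creates the table with CREATE OR REPLACE TABLE first.
identifier = "rest.default.example_table"

df = spark.sql("""
    SELECT * FROM VALUES
        (CAST('2023-03-01' AS date), 1, 'a'),
        (CAST('2023-03-02' AS date), 2, 'b')
    AS t(dt, number, letter)
""")

# Keep all rows in one file so later positional deletes are stored as
# merge-on-read delete files rather than dropping tiny one-row data files.
df.coalesce(1).writeTo(identifier).append()
```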

mkdocs/docs/how-to-release.md

Lines changed: 0 additions & 4 deletions
```diff
@@ -389,10 +389,6 @@ Run the [`Release Docs` Github Action](https://github.com/apache/iceberg-python/
 
 Make sure to create a PR to update the [GitHub issues template](https://github.com/apache/iceberg-python/blob/main/.github/ISSUE_TEMPLATE/iceberg_bug_report.yml) with the latest version.
 
-### Update the integration tests
-
-Ensure to update the `PYICEBERG_VERSION` in the [Dockerfile](https://github.com/apache/iceberg-python/blob/main/dev/Dockerfile).
-
 ## Misc
 
 ### Set up GPG key and Upload to Apache Iceberg KEYS file
```

ruff.toml

Lines changed: 0 additions & 1 deletion
```diff
@@ -16,7 +16,6 @@
 # under the License.
 
 src = ['pyiceberg','tests']
-extend-exclude = ["dev/provision.py"]
 
 # Exclude a variety of commonly ignored directories.
 exclude = [
```

tests/integration/test_reads.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -432,6 +432,8 @@ def test_pyarrow_deletes(catalog: Catalog, format_version: int) -> None:
     # (11, 'k'),
     # (12, 'l')
     test_positional_mor_deletes = catalog.load_table(f"default.test_positional_mor_deletes_v{format_version}")
+    if format_version == 2:
+        assert len(test_positional_mor_deletes.inspect.delete_files()) > 0, "Table should produce position delete files"
     arrow_table = test_positional_mor_deletes.scan().to_arrow()
     assert arrow_table["number"].to_pylist() == [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12]
 
@@ -470,6 +472,8 @@ def test_pyarrow_deletes_double(catalog: Catalog, format_version: int) -> None:
     # (11, 'k'),
     # (12, 'l')
     test_positional_mor_double_deletes = catalog.load_table(f"default.test_positional_mor_double_deletes_v{format_version}")
+    if format_version == 2:
+        assert len(test_positional_mor_double_deletes.inspect.delete_files()) > 0, "Table should produce position delete files"
     arrow_table = test_positional_mor_double_deletes.scan().to_arrow()
     assert arrow_table["number"].to_pylist() == [1, 2, 3, 4, 5, 7, 8, 10, 11, 12]
 
@@ -508,6 +512,8 @@ def test_pyarrow_batches_deletes(catalog: Catalog, format_version: int) -> None:
     # (11, 'k'),
     # (12, 'l')
     test_positional_mor_deletes = catalog.load_table(f"default.test_positional_mor_deletes_v{format_version}")
+    if format_version == 2:
+        assert len(test_positional_mor_deletes.inspect.delete_files()) > 0, "Table should produce position delete files"
     arrow_table = test_positional_mor_deletes.scan().to_arrow_batch_reader().read_all()
     assert arrow_table["number"].to_pylist() == [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12]
 
@@ -550,6 +556,8 @@ def test_pyarrow_batches_deletes_double(catalog: Catalog, format_version: int) -> None:
     # (11, 'k'),
     # (12, 'l')
     test_positional_mor_double_deletes = catalog.load_table(f"default.test_positional_mor_double_deletes_v{format_version}")
+    if format_version == 2:
+        assert len(test_positional_mor_double_deletes.inspect.delete_files()) > 0, "Table should produce position delete files"
     arrow_table = test_positional_mor_double_deletes.scan().to_arrow_batch_reader().read_all()
     assert arrow_table["number"].to_pylist() == [1, 2, 3, 4, 5, 7, 8, 10, 11, 12]
```