
Commit 513295d

kevinjqliu and Fokko authored
infra: use spark connect to run pytests (#2491)
# Rationale for this change

Closes #2492

Run pytest using Spark Connect for a more consistent test env, plus a few general cleanup changes.

## Are these changes tested?

## Are there any user-facing changes?

Co-authored-by: Fokko Driesprong <[email protected]>
1 parent e3e0ec6 commit 513295d
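The tests no longer build a local JVM-backed SparkSession; they talk to the Spark Connect server that the compose stack exposes on port 15002. As a rough sketch (not the repository's actual conftest code; the fixture name and host are illustrative), a client-side session could be obtained like this:

```python
# Sketch: a session-scoped pytest fixture that connects over Spark Connect
# instead of starting a local JVM. Assumes pyspark is installed with the
# "connect" extra and the dev/ compose stack is up with port 15002 published.
import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark() -> SparkSession:
    # "sc://" is the Spark Connect URI scheme; localhost:15002 matches the
    # port mapping added in dev/docker-compose-integration.yml.
    return SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
```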

13 files changed: +153 −255 lines changed

.gitignore

Lines changed: 0 additions & 7 deletions
@@ -37,13 +37,6 @@ coverage.xml
 bin/
 .vscode/
 
-# Hive/metastore files
-metastore_db/
-
-# Spark/metastore files
-spark-warehouse/
-derby.log
-
 # Python stuff
 .mypy_cache/
 htmlcov

Makefile

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@
 # Configuration Variables
 # ========================
 
-PYTEST_ARGS ?= -v # Override with e.g. PYTEST_ARGS="-vv --tb=short"
+PYTEST_ARGS ?= -v -x # Override with e.g. PYTEST_ARGS="-vv --tb=short"
 COVERAGE ?= 0 # Set COVERAGE=1 to enable coverage: make test COVERAGE=1
 COVERAGE_FAIL_UNDER ?= 85 # Minimum coverage % to pass: make coverage-report COVERAGE_FAIL_UNDER=70
 KEEP_COMPOSE ?= 0 # Set KEEP_COMPOSE=1 to keep containers after integration tests
@@ -37,7 +37,7 @@ endif
 ifeq ($(KEEP_COMPOSE),1)
 CLEANUP_COMMAND = echo "Keeping containers running for debugging (KEEP_COMPOSE=1)"
 else
-CLEANUP_COMMAND = docker compose -f dev/docker-compose-integration.yml down -v --remove-orphans 2>/dev/null || true
+CLEANUP_COMMAND = docker compose -f dev/docker-compose-integration.yml down -v --remove-orphans --timeout 0 2>/dev/null || true
 endif
 
 # ============

dev/Dockerfile

Lines changed: 17 additions & 4 deletions
@@ -36,11 +36,13 @@ ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$
 RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/spark-events
 WORKDIR ${SPARK_HOME}
 
-# Remember to also update `tests/conftest`'s spark setting
 ENV SPARK_VERSION=3.5.6
-ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12
-ENV ICEBERG_VERSION=1.9.1
+ENV SCALA_VERSION=2.12
+ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_${SCALA_VERSION}
+ENV ICEBERG_VERSION=1.9.2
 ENV PYICEBERG_VERSION=0.10.0
+ENV HADOOP_VERSION=3.3.4
+ENV AWS_SDK_VERSION=1.12.753
 
 # Try the primary Apache mirror (downloads.apache.org) first, then fall back to the archive
 RUN set -eux; \
@@ -59,15 +61,26 @@ RUN set -eux; \
     tar xzf "$FILE" --directory /opt/spark --strip-components 1; \
     rm -rf "$FILE"
 
+# Download Spark Connect server JAR
+RUN curl --retry 5 -s -L https://repo1.maven.org/maven2/org/apache/spark/spark-connect_${SCALA_VERSION}/${SPARK_VERSION}/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar \
+    -Lo /opt/spark/jars/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar
+
 # Download iceberg spark runtime
 RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
     -Lo /opt/spark/jars/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar
 
-
 # Download AWS bundle
 RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
     -Lo /opt/spark/jars/iceberg-aws-bundle-${ICEBERG_VERSION}.jar
 
+# Download hadoop-aws (required for S3 support)
+RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
+    -Lo /opt/spark/jars/hadoop-aws-${HADOOP_VERSION}.jar
+
+# Download AWS SDK bundle
+RUN curl --retry 5 -s https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar \
+    -Lo /opt/spark/jars/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar
+
 COPY spark-defaults.conf /opt/spark/conf
 ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}"
 
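The hadoop-aws and AWS SDK bundle jars are what allow the server-side session catalog to use an `s3a://` warehouse against MinIO. A rough smoke test for that wiring (not part of this change; the path is illustrative) might look like:

```python
# Sketch: verify S3A access (hadoop-aws + AWS SDK bundle) against MinIO by
# writing and reading a tiny Parquet dataset through the Connect server.
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

path = "s3a://warehouse/smoke-test/"  # illustrative location in the MinIO bucket
spark.range(10).write.mode("overwrite").parquet(path)
assert spark.read.parquet(path).count() == 10
```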

dev/docker-compose-integration.yml

Lines changed: 2 additions & 4 deletions
@@ -26,15 +26,13 @@ services:
       - rest
       - hive
       - minio
-    volumes:
-      - ./warehouse:/home/iceberg/warehouse
     environment:
       - AWS_ACCESS_KEY_ID=admin
       - AWS_SECRET_ACCESS_KEY=password
       - AWS_REGION=us-east-1
     ports:
-      - 8888:8888
-      - 8080:8080
+      - 15002:15002 # Spark Connect
+      - 4040:4040 # Spark UI
     links:
       - rest:rest
       - hive:hive

dev/entrypoint.sh

Lines changed: 1 addition & 3 deletions
@@ -18,8 +18,6 @@
 # under the License.
 #
 
-start-master.sh -p 7077
-start-worker.sh spark://spark-iceberg:7077
-start-history-server.sh
+start-connect-server.sh
 
 tail -f /dev/null
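With only the Connect server running (no standalone master, worker, or history server), a test harness generally has to wait for port 15002 to accept connections before creating a session. A minimal sketch of such a readiness check (an assumption, not code from this change):

```python
# Sketch: block until the Spark Connect gRPC port accepts TCP connections.
# Host and port are assumptions matching the compose port mapping.
import socket
import time


def wait_for_spark_connect(host: str = "localhost", port: int = 15002, timeout: float = 120.0) -> None:
    deadline = time.monotonic() + timeout
    while True:
        try:
            with socket.create_connection((host, port), timeout=5):
                return  # the endpoint is reachable
        except OSError:
            if time.monotonic() > deadline:
                raise TimeoutError(f"Spark Connect not reachable at {host}:{port}")
            time.sleep(2)
```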

dev/provision.py

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@
     "hive",
     **{
         "type": "hive",
-        "uri": "http://hive:9083",
+        "uri": "thrift://hive:9083",
         "s3.endpoint": "http://minio:9000",
         "s3.access-key-id": "admin",
         "s3.secret-access-key": "password",

dev/spark-defaults.conf

Lines changed: 18 additions & 3 deletions
@@ -16,20 +16,35 @@
 #
 
 spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
+
+# Configure Iceberg REST catalog
 spark.sql.catalog.rest org.apache.iceberg.spark.SparkCatalog
 spark.sql.catalog.rest.type rest
 spark.sql.catalog.rest.uri http://rest:8181
 spark.sql.catalog.rest.io-impl org.apache.iceberg.aws.s3.S3FileIO
 spark.sql.catalog.rest.warehouse s3://warehouse/rest/
 spark.sql.catalog.rest.s3.endpoint http://minio:9000
+spark.sql.catalog.rest.cache-enabled false
+
+# Configure Iceberg Hive catalog
 spark.sql.catalog.hive org.apache.iceberg.spark.SparkCatalog
 spark.sql.catalog.hive.type hive
-spark.sql.catalog.hive.uri http://hive:9083
+spark.sql.catalog.hive.uri thrift://hive:9083
 spark.sql.catalog.hive.io-impl org.apache.iceberg.aws.s3.S3FileIO
 spark.sql.catalog.hive.warehouse s3://warehouse/hive/
 spark.sql.catalog.hive.s3.endpoint http://minio:9000
+
+# Configure Spark's default session catalog (spark_catalog) to use Iceberg backed by the Hive Metastore
+spark.sql.catalog.spark_catalog org.apache.iceberg.spark.SparkSessionCatalog
+spark.sql.catalog.spark_catalog.type hive
+spark.sql.catalog.spark_catalog.uri thrift://hive:9083
+spark.hadoop.fs.s3a.endpoint http://minio:9000
+spark.sql.catalogImplementation hive
+spark.sql.warehouse.dir s3a://warehouse/hive/
+
 spark.sql.defaultCatalog rest
+
+# Configure Spark UI and event logging
+spark.ui.enabled true
 spark.eventLog.enabled true
 spark.eventLog.dir /home/iceberg/spark-events
-spark.history.fs.logDirectory /home/iceberg/spark-events
-spark.sql.catalogImplementation in-memory
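With these defaults, a Connect client sees three catalogs: `rest` (the default), `hive`, and an Iceberg-backed `spark_catalog` session catalog, all pointing at MinIO for storage. A hedged example of exercising them through SQL (namespace and table names are made up):

```python
# Sketch: use the catalogs configured in spark-defaults.conf from a Connect
# session. Identifiers below are illustrative, not tables created by this change.
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

# "rest" is spark.sql.defaultCatalog, so unqualified names resolve there.
spark.sql("CREATE NAMESPACE IF NOT EXISTS rest.demo")
spark.sql("CREATE TABLE IF NOT EXISTS rest.demo.events (id BIGINT, name STRING) USING iceberg")
spark.sql("INSERT INTO rest.demo.events VALUES (1, 'a')")
spark.sql("SELECT * FROM rest.demo.events").show()

# "hive" and "spark_catalog" are separate catalogs backed by the Hive Metastore.
spark.sql("SHOW NAMESPACES IN hive").show()
```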
