
Commit b925c0f

[DOP-26758] Include .jar files into worker image
1 parent acc102c commit b925c0f

7 files changed: +118 -45 lines


.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
@@ -39,13 +39,13 @@ repos:
   - id: chmod
     args: ['644']
     exclude_types: [shell]
-    exclude: ^(.*__main__\.py|syncmaster/server/scripts/.*\.py)$
+    exclude: ^(.*__main__\.py|syncmaster/server/scripts/.*\.py|docker/.*\.py)$
   - id: chmod
     args: ['755']
     types: [shell]
   - id: chmod
     args: ['755']
-    files: ^(.*__main__\.py|syncmaster/server/scripts/.*\.py)$
+    files: ^(.*__main__\.py|syncmaster/server/scripts/.*\.py|docker/.*\.py)$
   - id: insert-license
     types: [python]
     exclude: ^(syncmaster/server/dependencies/stub.py|docs/.*\.py|tests/.*\.py)$
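For context, the widened patterns above now also cover the new build-time script under docker/, so it is skipped by the chmod 644 hook and picked up by the chmod 755 hook. A minimal sketch of the match, using only Python's standard re module; the file paths are just illustrative inputs:

    import re

    # Pattern copied from the updated chmod 755 hook in .pre-commit-config.yaml
    pattern = re.compile(r"^(.*__main__\.py|syncmaster/server/scripts/.*\.py|docker/.*\.py)$")

    print(bool(pattern.match("docker/download_maven_packages.py")))  # True -> gets mode 755
    print(bool(pattern.match("syncmaster/worker/spark.py")))         # False -> untouched by this hook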

docker-compose.test.yml

Lines changed: 0 additions & 1 deletion
@@ -114,7 +114,6 @@ services:
       - SYNCMASTER__SERVER__STATIC_FILES__ENABLED=false
     volumes:
       - ./syncmaster:/app/syncmaster
-      - ./cached_jars:/root/.ivy2
       - ./reports:/app/reports
       - ./tests:/app/tests
       - ./pyproject.toml:/app/pyproject.toml

docker/Dockerfile.worker

Lines changed: 19 additions & 6 deletions
@@ -40,19 +40,31 @@ RUN --mount=type=cache,target=/root/.cache/pypoetry \
     --without test,docs,dev \
     && python -m compileall -j 4 .venv
 
+FROM builder AS maven_packages
 
-FROM base AS prod
+ARG SPARK_CONNECTORS_TO_DOWNLOAD="all"
+ENV SPARK_CONNECTORS_TO_DOWNLOAD=$SPARK_CONNECTORS_TO_DOWNLOAD
+RUN --mount=type=bind,source=./syncmaster,target=/app/syncmaster \
+    --mount=type=bind,source=./docker/download_maven_packages.py,target=/app/docker/download_maven_packages.py \
+    mkdir /root/.ivy2 && \
+    python /app/docker/download_maven_packages.py
 
-# We don't need poetry and compilers in final image
-COPY --from=builder /app/.venv/ /app/.venv/
-COPY ./syncmaster/ /app/syncmaster/
-RUN python -m compileall syncmaster
+
+FROM base AS prod
 
 # Do not run production as root, to improve security.
 # Also user does not own anything inside the image, including venv and source code.
 RUN useradd syncmaster && \
-    mkdir -p /home/syncmaster && \
+    mkdir -p /home/syncmaster /home/syncmaster/.ivy2 && \
     chown -R syncmaster:syncmaster /home/syncmaster
+
+# We don't need poetry and compilers in final image
+COPY --from=builder /app/.venv/ /app/.venv/
+# custom Spark session function may download different jars, so syncmaster has to own them
+COPY --from=maven_packages --chown=syncmaster:syncmaster /root/.ivy2/ /home/syncmaster/.ivy2/
+
+COPY ./syncmaster/ /app/syncmaster/
+RUN python -m compileall syncmaster
 USER syncmaster
 
 
@@ -68,6 +80,7 @@ RUN --mount=type=cache,target=/root/.cache/pypoetry \
     --without docs,dev \
     && python -m compileall -j 4 .venv
 
+COPY --from=maven_packages /root/.ivy2/ /root/.ivy2/
ENV SYNCMASTER__WORKER__CREATE_SPARK_SESSION_FUNCTION=tests.spark.get_worker_spark_session
 
 # Collect coverage from worker
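The new maven_packages stage makes the set of pre-downloaded connectors configurable at build time through the SPARK_CONNECTORS_TO_DOWNLOAD build arg. A rough sketch of a narrowed local build; the image tag and the connector subset are hypothetical, only the Dockerfile path, the "prod" stage and the build arg come from this commit:

    import subprocess

    # Hypothetical local build restricted to Postgres and S3 connectors;
    # the default SPARK_CONNECTORS_TO_DOWNLOAD="all" bakes jars for every supported type.
    subprocess.run(
        [
            "docker", "build",
            "--file", "docker/Dockerfile.worker",
            "--target", "prod",
            "--build-arg", "SPARK_CONNECTORS_TO_DOWNLOAD=postgres,s3",
            "--tag", "syncmaster-worker:local",
            ".",
        ],
        check=True,
    )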

docker/download_maven_packages.py

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: 2024-2025 MTS PJSC
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import logging
+import os
+from typing import TYPE_CHECKING
+
+from syncmaster.worker.spark import get_excluded_packages, get_packages
+
+if TYPE_CHECKING:
+    from pyspark.sql import SparkSession
+
+log = logging.getLogger(__name__)
+
+
+def get_spark_session_conf_for_docker_image() -> dict:
+    types = os.getenv("SPARK_CONNECTORS_TO_DOWNLOAD", "all").split(",")
+    maven_packages: list[str] = get_packages(connection_types=set(types))
+    excluded_packages: list[str] = get_excluded_packages()
+
+    return {
+        "spark.jars.packages": ",".join(maven_packages),
+        "spark.jars.excludes": ",".join(excluded_packages),
+        "spark.sql.pyspark.jvmStacktrace.enabled": "true",
+        # use only minimal available resources
+        "spark.driver.cores": "1",
+        "spark.driver.memory": "512M",
+        "spark.executor.cores": "1",
+        "spark.executor.memory": "512M",
+        "spark.executor.instances": "1",
+    }
+
+
+def get_worker_spark_session_for_docker() -> SparkSession:
+    """
+    Construct a dummy Spark session with all .jars included.
+    Designed to be used in Dockerfile.worker to populate the image.
+    """
+    from pyspark.sql import SparkSession
+
+    spark_builder = SparkSession.builder.appName("syncmaster_jar_downloader")
+
+    for k, v in get_spark_session_conf_for_docker_image().items():
+        spark_builder = spark_builder.config(k, v)
+
+    return spark_builder.getOrCreate()
+
+
+def download_maven_packages():
+    log.info("Downloading Maven packages...")
+    with get_worker_spark_session_for_docker():
+        log.info("Done!")
+
+
+if __name__ == "__main__":
+    download_maven_packages()
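A rough illustration of how the script's connector filter behaves, assuming syncmaster and its onetl/pyspark dependencies are installed; the connector subset below is an example value, not a required setting:

    import os

    from syncmaster.worker.spark import get_packages

    # Example value; the Dockerfile default is "all", which resolves every supported connector type
    os.environ["SPARK_CONNECTORS_TO_DOWNLOAD"] = "postgres,s3"

    # Mirror the script's parsing of the environment variable
    types = set(os.environ["SPARK_CONNECTORS_TO_DOWNLOAD"].split(","))

    # Only Postgres, S3 and the shared file-format packages are resolved for this subset
    print(get_packages(connection_types=types))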

docs/changelog/0.2.4.rst

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+0.2.4 (2025-07-07)
+==================
+
+Improvements
+------------
+
+- Include all required Maven jars in the worker image. This increases image size, but drastically reduces Spark session startup time.
+

docs/changelog/index.rst

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@
     :caption: Changelog
 
     DRAFT
+    0.2.4
     0.2.3
     0.2.2
     0.2.1

syncmaster/worker/spark.py

Lines changed: 29 additions & 36 deletions
@@ -47,44 +47,42 @@ def get_worker_spark_session(
     return spark_builder.getOrCreate()
 
 
-def get_packages(connection_type: str) -> list[str]:  # noqa: WPS212
+def get_packages(connection_types: set[str]) -> list[str]:  # noqa: WPS212
     import pyspark
     from onetl.connection import MSSQL, Clickhouse, MySQL, Oracle, Postgres, SparkS3
     from onetl.file.format import XML, Excel
 
+    spark_version = pyspark.__version__
     # excel version is hardcoded due to https://github.com/nightscape/spark-excel/issues/902
     file_formats_spark_packages: list[str] = [
-        *XML.get_packages(spark_version=pyspark.__version__),
+        *XML.get_packages(spark_version=spark_version),
         *Excel.get_packages(spark_version="3.5.1"),
     ]
 
-    if connection_type == "postgres":
-        return Postgres.get_packages()
-    if connection_type == "oracle":
-        return Oracle.get_packages()
-    if connection_type == "clickhouse":
-        return [
-            "io.github.mtsongithub.doetl:spark-dialect-extension_2.12:0.0.2",
-            *Clickhouse.get_packages(),
-        ]
-    if connection_type == "mssql":
-        return MSSQL.get_packages()
-    if connection_type == "mysql":
-        return MySQL.get_packages()
-    if connection_type == "s3":
-        import pyspark
-
-        spark_version = pyspark.__version__
-        return SparkS3.get_packages(spark_version=spark_version) + file_formats_spark_packages
-
-    if connection_type in ("hdfs", "sftp", "ftp", "ftps", "samba", "webdav"):
-        return file_formats_spark_packages
-
-    # If the database type does not require downloading .jar packages
-    return []
-
-
-def get_excluded_packages(db_type: str) -> list[str]:
+    result = []
+
+    if connection_types & {"postgres", "all"}:
+        result.extend(Postgres.get_packages())
+    if connection_types & {"oracle", "all"}:
+        result.extend(Oracle.get_packages())
+    if connection_types & {"clickhouse", "all"}:
+        result.append("io.github.mtsongithub.doetl:spark-dialect-extension_2.12:0.0.2")
+        result.extend(Clickhouse.get_packages())
+    if connection_types & {"mssql", "all"}:
+        result.extend(MSSQL.get_packages())
+    if connection_types & {"mysql", "all"}:
+        result.extend(MySQL.get_packages())
+
+    if connection_types & {"s3", "all"}:
+        result.extend(SparkS3.get_packages(spark_version=spark_version))
+
+    if connection_types & {"s3", "hdfs", "sftp", "ftp", "ftps", "samba", "webdav", "all"}:
+        result.extend(file_formats_spark_packages)
+
+    return result
+
+
+def get_excluded_packages() -> list[str]:
     from onetl.connection import SparkS3
 
     return SparkS3.get_exclude_packages()

@@ -95,16 +93,11 @@ def get_spark_session_conf(
     target: ConnectionDTO,
     resources: dict,
 ) -> dict:
-    maven_packages: list[str] = []
-    excluded_packages: list[str] = []
-
-    for db_type in source, target:
-        maven_packages.extend(get_packages(connection_type=db_type.type))  # type: ignore
-        excluded_packages.extend(get_excluded_packages(db_type=db_type.type))  # type: ignore
+    maven_packages: list[str] = get_packages(connection_types={source.type, target.type})
+    excluded_packages: list[str] = get_excluded_packages()
 
     memory_mb = math.ceil(resources["ram_bytes_per_task"] / 1024 / 1024)
     config = {
-        "spark.jars.packages": ",".join(maven_packages),
         "spark.sql.pyspark.jvmStacktrace.enabled": "true",
         "spark.hadoop.mapreduce.fileoutputcommitter.marksuccessfuljobs": "false",
         "spark.executor.cores": resources["cpu_cores_per_task"],
