
Commit ecda51f

[DOP-26758] Include .jar files into worker image
1 parent d4b9a96 commit ecda51f

7 files changed: +129 additions, -45 deletions

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions

@@ -39,13 +39,13 @@ repos:
       - id: chmod
         args: ['644']
         exclude_types: [shell]
-        exclude: ^(.*__main__\.py|syncmaster/server/scripts/.*\.py)$
+        exclude: ^(.*__main__\.py|syncmaster/server/scripts/.*\.py|docker/.*\.py)$
       - id: chmod
         args: ['755']
         types: [shell]
       - id: chmod
         args: ['755']
-        files: ^(.*__main__\.py|syncmaster/server/scripts/.*\.py)$
+        files: ^(.*__main__\.py|syncmaster/server/scripts/.*\.py|docker/.*\.py)$
       - id: insert-license
         types: [python]
         exclude: ^(syncmaster/server/dependencies/stub.py|docs/.*\.py|tests/.*\.py)$

docker-compose.test.yml

Lines changed: 0 additions & 1 deletion

@@ -114,7 +114,6 @@ services:
       - SYNCMASTER__SERVER__STATIC_FILES__ENABLED=false
     volumes:
       - ./syncmaster:/app/syncmaster
-      - ./cached_jars:/root/.ivy2
      - ./reports:/app/reports
      - ./tests:/app/tests
      - ./pyproject.toml:/app/pyproject.toml

docker/Dockerfile.worker

Lines changed: 29 additions & 6 deletions

@@ -41,18 +41,41 @@ RUN --mount=type=cache,target=/root/.cache/pypoetry \
     && python -m compileall -j 4 .venv


-FROM base AS prod
+FROM builder AS maven_packages
+
+RUN --mount=type=bind,source=./syncmaster/,target=/app/syncmaster/ \
+    --mount=type=bind,source=./docker/download_maven_packages.py,target=/app/docker/download_maven_packages.py \
+    mkdir /root/.ivy2 && \
+    # Try to download all dependencies at once.
+    # If multiple packages depend on the same transitive dependency, Spark uses the maximum version of this dependency.
+    python /app/docker/download_maven_packages.py all && \
+    # Then try to download specific connectors to fetch the exact dependency versions specified within each connector.
+    # Yes, this is slow, but otherwise using the worker without internet access will fail, unless a custom ivysettings.xml is used.
+    python /app/docker/download_maven_packages.py s3 && \
+    python /app/docker/download_maven_packages.py hdfs && \
+    python /app/docker/download_maven_packages.py clickhouse && \
+    python /app/docker/download_maven_packages.py postgres && \
+    python /app/docker/download_maven_packages.py oracle && \
+    python /app/docker/download_maven_packages.py mssql && \
+    python /app/docker/download_maven_packages.py mysql
+    # If someone uses a custom worker image, they should download jars on their own.

-# We don't need poetry and compilers in final image
-COPY --from=builder /app/.venv/ /app/.venv/
-COPY ./syncmaster/ /app/syncmaster/
-RUN python -m compileall syncmaster
+
+FROM base AS prod

 # Do not run production as root, to improve security.
 # Also user does not own anything inside the image, including venv and source code.
 RUN useradd syncmaster && \
-    mkdir -p /home/syncmaster && \
+    mkdir -p /home/syncmaster /home/syncmaster/.ivy2 && \
     chown -R syncmaster:syncmaster /home/syncmaster
+
+# We don't need poetry and compilers in final image
+COPY --from=builder /app/.venv/ /app/.venv/
+# A custom Spark session function may download different jars, so syncmaster has to own them
+COPY --from=maven_packages --chown=syncmaster:syncmaster /root/.ivy2/ /home/syncmaster/.ivy2/
+
+COPY ./syncmaster/ /app/syncmaster/
+RUN python -m compileall syncmaster
 USER syncmaster


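The COPY --chown into /home/syncmaster/.ivy2 is what makes the pre-downloaded jars usable at runtime: Spark resolves spark.jars.packages through Ivy, which by default keeps its cache under ~/.ivy2, and the worker runs as the syncmaster user. A minimal sketch of that resolution path, not part of the commit, assuming pyspark, onetl and the syncmaster package are installed in the environment:

# Sketch only: a throwaway local session configured roughly the way the worker builds its own.
# With the cache baked into the image, Ivy serves every artifact from ~/.ivy2
# (/home/syncmaster/.ivy2 here), so session startup needs no internet access.
from pyspark.sql import SparkSession

from syncmaster.worker.spark import get_excluded_packages, get_packages

spark = (
    SparkSession.builder.appName("offline_resolution_check")
    .master("local")
    .config("spark.jars.packages", ",".join(get_packages({"postgres", "s3"})))
    .config("spark.jars.excludes", ",".join(get_excluded_packages()))
    .getOrCreate()
)
spark.stop()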

docker/download_maven_packages.py

Lines changed: 61 additions & 0 deletions

@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: 2024-2025 MTS PJSC
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import logging
+import sys
+from typing import TYPE_CHECKING
+
+from syncmaster.worker.spark import get_excluded_packages, get_packages
+
+if TYPE_CHECKING:
+    from pyspark.sql import SparkSession
+
+log = logging.getLogger(__name__)
+
+
+def get_spark_session_conf_for_docker_image(connection_types: set[str]) -> dict:
+    maven_packages: list[str] = get_packages(connection_types=connection_types or {"all"})
+    excluded_packages: list[str] = get_excluded_packages()
+
+    return {
+        "spark.jars.packages": ",".join(maven_packages),
+        "spark.jars.excludes": ",".join(excluded_packages),
+        "spark.sql.pyspark.jvmStacktrace.enabled": "true",
+        # use only the minimal available resources
+        "spark.driver.cores": "1",
+        "spark.driver.memory": "512M",
+        "spark.executor.cores": "1",
+        "spark.executor.memory": "512M",
+        "spark.executor.instances": "1",
+    }
+
+
+def get_worker_spark_session_for_docker(connection_types: set[str]) -> SparkSession:
+    """
+    Construct a dummy Spark session with all .jar files included.
+    Designed to be used in Dockerfile.worker to populate the image.
+    """
+    from pyspark.sql import SparkSession
+
+    spark_builder = SparkSession.builder.appName("syncmaster_jar_downloader").master("local")
+
+    for k, v in get_spark_session_conf_for_docker_image(connection_types).items():
+        spark_builder = spark_builder.config(k, v)
+
+    return spark_builder.getOrCreate()
+
+
+def download_maven_packages(connection_types: set[str]):
+    log.info("Downloading Maven packages for connectors %s...", connection_types)
+    with get_worker_spark_session_for_docker(connection_types):
+        log.info("Done!")
+
+
+if __name__ == "__main__":
+    connection_types = ["all"]
+    if len(sys.argv) > 1:
+        connection_types = sys.argv[1:]
+    download_maven_packages(set(connection_types))
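
The script is driven from the Dockerfile stage above as python /app/docker/download_maven_packages.py <connector> [<connector> ...]; several connector names can be passed in one call because sys.argv[1:] is turned into a set. A hypothetical direct use of the same helper, for example to warm a local ~/.ivy2 cache before building a derived image, assuming the docker/ directory is on sys.path and pyspark, onetl and syncmaster are importable:

# Hypothetical usage, not part of the commit; assumes docker/ is on sys.path.
from download_maven_packages import download_maven_packages

# Starts a throwaway local Spark session whose only job is to make Ivy fetch the jars
# needed for S3 and HDFS transfers, then stops it; the artifacts land under ~/.ivy2.
download_maven_packages({"s3", "hdfs"})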

docs/changelog/0.2.4.rst

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+0.2.4 (2025-07-07)
+==================
+
+Improvements
+------------
+
+- Include all required jars from Maven into the worker image. This increases the image size, but drastically reduces Spark session startup time.
+

docs/changelog/index.rst

Lines changed: 1 addition & 0 deletions

@@ -3,6 +3,7 @@
     :caption: Changelog

     DRAFT
+    0.2.4
     0.2.3
     0.2.2
     0.2.1

syncmaster/worker/spark.py

Lines changed: 28 additions & 36 deletions

@@ -47,44 +47,41 @@ def get_worker_spark_session(
     return spark_builder.getOrCreate()


-def get_packages(connection_type: str) -> list[str]:  # noqa: WPS212
+def get_packages(connection_types: set[str]) -> list[str]:  # noqa: WPS212
     import pyspark
     from onetl.connection import MSSQL, Clickhouse, MySQL, Oracle, Postgres, SparkS3
     from onetl.file.format import XML, Excel

+    spark_version = pyspark.__version__
     # excel version is hardcoded due to https://github.com/nightscape/spark-excel/issues/902
     file_formats_spark_packages: list[str] = [
-        *XML.get_packages(spark_version=pyspark.__version__),
+        *XML.get_packages(spark_version=spark_version),
         *Excel.get_packages(spark_version="3.5.1"),
     ]

-    if connection_type == "postgres":
-        return Postgres.get_packages()
-    if connection_type == "oracle":
-        return Oracle.get_packages()
-    if connection_type == "clickhouse":
-        return [
-            "io.github.mtsongithub.doetl:spark-dialect-extension_2.12:0.0.2",
-            *Clickhouse.get_packages(),
-        ]
-    if connection_type == "mssql":
-        return MSSQL.get_packages()
-    if connection_type == "mysql":
-        return MySQL.get_packages()
-    if connection_type == "s3":
-        import pyspark
-
-        spark_version = pyspark.__version__
-        return SparkS3.get_packages(spark_version=spark_version) + file_formats_spark_packages
-
-    if connection_type in ("hdfs", "sftp", "ftp", "ftps", "samba", "webdav"):
-        return file_formats_spark_packages
-
-    # If the database type does not require downloading .jar packages
-    return []
-
-
-def get_excluded_packages(db_type: str) -> list[str]:
+    result = []
+    if connection_types & {"postgres", "all"}:
+        result.extend(Postgres.get_packages())
+    if connection_types & {"oracle", "all"}:
+        result.extend(Oracle.get_packages())
+    if connection_types & {"clickhouse", "all"}:
+        result.append("io.github.mtsongithub.doetl:spark-dialect-extension_2.12:0.0.2")
+        result.extend(Clickhouse.get_packages())
+    if connection_types & {"mssql", "all"}:
+        result.extend(MSSQL.get_packages())
+    if connection_types & {"mysql", "all"}:
+        result.extend(MySQL.get_packages())
+
+    if connection_types & {"s3", "all"}:
+        result.extend(SparkS3.get_packages(spark_version=spark_version))
+
+    if connection_types & {"s3", "hdfs", "sftp", "ftp", "ftps", "samba", "webdav", "all"}:
+        result.extend(file_formats_spark_packages)
+
+    return result
+
+
+def get_excluded_packages() -> list[str]:
     from onetl.connection import SparkS3

     return SparkS3.get_exclude_packages()

@@ -95,16 +92,11 @@ def get_spark_session_conf(
     target: ConnectionDTO,
     resources: dict,
 ) -> dict:
-    maven_packages: list[str] = []
-    excluded_packages: list[str] = []
-
-    for db_type in source, target:
-        maven_packages.extend(get_packages(connection_type=db_type.type))  # type: ignore
-        excluded_packages.extend(get_excluded_packages(db_type=db_type.type))  # type: ignore
+    maven_packages: list[str] = get_packages(connection_types={source.type, target.type})
+    excluded_packages: list[str] = get_excluded_packages()

     memory_mb = math.ceil(resources["ram_bytes_per_task"] / 1024 / 1024)
     config = {
-        "spark.jars.packages": ",".join(maven_packages),
         "spark.sql.pyspark.jvmStacktrace.enabled": "true",
         "spark.hadoop.mapreduce.fileoutputcommitter.marksuccessfuljobs": "false",
         "spark.executor.cores": resources["cpu_cores_per_task"],

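With the per-type if/return chain replaced by set intersections, one call now covers both ends of a transfer, and the special value "all" pulls the packages for every supported connector (which is what docker/download_maven_packages.py relies on). A minimal sketch of the reworked helpers, using only names from the diff above and assuming onetl and pyspark are installed:

# A hypothetical Postgres -> S3 transfer: both connection types are resolved in one call;
# the file-format packages (XML, Excel) are included because of the "s3" entry.
from syncmaster.worker.spark import get_excluded_packages, get_packages

packages = get_packages(connection_types={"postgres", "s3"})
excluded = get_excluded_packages()

# Joined the same way get_spark_session_conf_for_docker_image() does above.
spark_conf = {
    "spark.jars.packages": ",".join(packages),
    "spark.jars.excludes": ",".join(excluded),
}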