
Commit 462fb89

[DOP-30579] Improve ivy2 package caching
1 parent 0d055e1 commit 462fb89

File tree: 4 files changed, +62 −58 lines

docker/Dockerfile.worker

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ RUN --mount=type=cache,target=/root/.cache/pypoetry \
 
 FROM builder AS maven_packages
 
-RUN --mount=type=bind,source=./syncmaster/,target=/app/syncmaster/ \
+RUN --mount=type=bind,source=./syncmaster/worker/ivy2.py,target=/app/syncmaster/worker/ivy2.py \
     --mount=type=bind,source=./docker/download_maven_packages.py,target=/app/docker/download_maven_packages.py \
     mkdir /root/.ivy2 && \
     # Try to download all dependencies at once.
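Note: the bind mount is narrowed from the whole ./syncmaster/ package to the single file syncmaster/worker/ivy2.py. With BuildKit, the contents of a bind mount feed into the layer's cache key, so the expensive Maven/Ivy download layer is now rebuilt only when ivy2.py (or the download script) changes, rather than on any edit anywhere under syncmaster/.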

docker/download_maven_packages.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 import sys
 from typing import TYPE_CHECKING
 
-from syncmaster.worker.spark import get_excluded_packages, get_packages
+from syncmaster.worker.ivy2 import get_excluded_packages, get_packages
 
 if TYPE_CHECKING:
     from pyspark.sql import SparkSession
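
For context, a minimal sketch of how the pre-download step might use these helpers. This is hypothetical (the body of download_maven_packages.py is not shown in this diff), but spark.jars.packages and spark.jars.excludes are the standard Spark options for Ivy resolution:

# Hypothetical sketch: start a throwaway SparkSession so Ivy resolves
# every JAR into /root/.ivy2 at image build time.
from pyspark.sql import SparkSession

from syncmaster.worker.ivy2 import get_excluded_packages, get_packages

spark = (
    SparkSession.builder.master("local")
    .config("spark.jars.packages", ",".join(get_packages({"all"})))
    .config("spark.jars.excludes", ",".join(get_excluded_packages()))
    .getOrCreate()
)
spark.stop()  # the resolved JARs remain cached under /root/.ivy2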

syncmaster/worker/ivy2.py

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+# SPDX-FileCopyrightText: 2023-2024 MTS PJSC
+# SPDX-License-Identifier: Apache-2.0
+
+# Using a dedicated module to avoid importing other SyncMaster modules,
+# for better docker caching
+from onetl.connection import (
+    MSSQL,
+    Clickhouse,
+    Iceberg,
+    MySQL,
+    Oracle,
+    Postgres,
+    SparkS3,
+)
+from onetl.file.format import XML, Excel
+
+
+def get_packages(connection_types: set[str]) -> list[str]:  # noqa: WPS212
+    import pyspark
+
+    spark_version = pyspark.__version__
+    # excel version is hardcoded due to https://github.com/nightscape/spark-excel/issues/902
+    file_formats_spark_packages: list[str] = [
+        *XML.get_packages(spark_version=spark_version),
+        *Excel.get_packages(package_version="0.31.2", spark_version="3.5.6"),
+    ]
+
+    result = []
+    if connection_types & {"postgres", "all"}:
+        result.extend(Postgres.get_packages())
+    if connection_types & {"oracle", "all"}:
+        result.extend(Oracle.get_packages())
+    if connection_types & {"clickhouse", "all"}:
+        result.append("io.github.mtsongithub.doetl:spark-dialect-extension_2.12:0.0.2")
+        result.extend(Clickhouse.get_packages())
+    if connection_types & {"mssql", "all"}:
+        result.extend(MSSQL.get_packages())
+    if connection_types & {"mysql", "all"}:
+        result.extend(MySQL.get_packages())
+
+    if connection_types & {"s3", "all"}:
+        result.extend(SparkS3.get_packages(spark_version=spark_version))
+
+    if connection_types & {"iceberg", "all"}:
+        result.extend(
+            [
+                *Iceberg.get_packages(package_version="1.10.0", spark_version=spark_version),
+                *Iceberg.S3Warehouse.get_packages(package_version="1.10.0"),
+            ],
+        )
+
+    if connection_types & {"s3", "hdfs", "sftp", "ftp", "ftps", "samba", "webdav", "all"}:
+        result.extend(file_formats_spark_packages)
+
+    return result
+
+
+def get_excluded_packages() -> list[str]:
+    return SparkS3.get_exclude_packages()
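
A quick usage sketch of the new module (illustrative; the connection-type strings come straight from the conditions above):

from syncmaster.worker.ivy2 import get_excluded_packages, get_packages

# Maven coordinates for a worker that only needs Postgres and S3
packages = get_packages({"postgres", "s3"})
excluded = get_excluded_packages()
print(",".join(packages))  # suitable for spark.jars.packages
print(",".join(excluded))  # suitable for spark.jars.excludes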

syncmaster/worker/spark.py

Lines changed: 1 addition & 56 deletions
@@ -14,6 +14,7 @@
     HDFSConnectionDTO,
     HiveConnectionDTO,
 )
+from syncmaster.worker.ivy2 import get_excluded_packages, get_packages
 from syncmaster.worker.settings import WorkerSettings
 
 if TYPE_CHECKING:
@@ -53,62 +54,6 @@ def get_worker_spark_session(
     return spark_builder.getOrCreate()
 
 
-def get_packages(connection_types: set[str]) -> list[str]:  # noqa: WPS212
-    import pyspark
-    from onetl.connection import (
-        MSSQL,
-        Clickhouse,
-        Iceberg,
-        MySQL,
-        Oracle,
-        Postgres,
-        SparkS3,
-    )
-    from onetl.file.format import XML, Excel
-
-    spark_version = pyspark.__version__
-    # excel version is hardcoded due to https://github.com/nightscape/spark-excel/issues/902
-    file_formats_spark_packages: list[str] = [
-        *XML.get_packages(spark_version=spark_version),
-        *Excel.get_packages(package_version="0.31.2", spark_version="3.5.6"),
-    ]
-
-    result = []
-    if connection_types & {"postgres", "all"}:
-        result.extend(Postgres.get_packages())
-    if connection_types & {"oracle", "all"}:
-        result.extend(Oracle.get_packages())
-    if connection_types & {"clickhouse", "all"}:
-        result.append("io.github.mtsongithub.doetl:spark-dialect-extension_2.12:0.0.2")
-        result.extend(Clickhouse.get_packages())
-    if connection_types & {"mssql", "all"}:
-        result.extend(MSSQL.get_packages())
-    if connection_types & {"mysql", "all"}:
-        result.extend(MySQL.get_packages())
-
-    if connection_types & {"s3", "all"}:
-        result.extend(SparkS3.get_packages(spark_version=spark_version))
-
-    if connection_types & {"iceberg", "all"}:
-        result.extend(
-            [
-                *Iceberg.get_packages(package_version="1.10.0", spark_version=spark_version),
-                *Iceberg.S3Warehouse.get_packages(package_version="1.10.0"),
-            ],
-        )
-
-    if connection_types & {"s3", "hdfs", "sftp", "ftp", "ftps", "samba", "webdav", "all"}:
-        result.extend(file_formats_spark_packages)
-
-    return result
-
-
-def get_excluded_packages() -> list[str]:
-    from onetl.connection import SparkS3
-
-    return SparkS3.get_exclude_packages()
-
-
 def get_spark_session_conf(
     spark_master: str | None,
     source: ConnectionDTO,
