Skip to content

Commit aaec0c0

Browse files
authored
update EFA version for pytorch 2.4 training (#5048)
1 parent 8fc599e commit aaec0c0

12 files changed

+317
-134
lines changed

dlc_developer_config.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,12 +72,15 @@ ec2_benchmark_tests = false
7272
### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true.
7373
### Off by default (set to false)
7474
ec2_tests_on_heavy_instances = false
75+
7576
### SM specific tests
7677
### On by default
7778
sagemaker_local_tests = true
79+
7880
### Set enable_ipv6 = true to run tests with IPv6-enabled resources
7981
### Off by default (set to false)
8082
enable_ipv6 = false
83+
8184
### Set the VPC name to be used for IPv6 testing; this variable is empty by default
8285
### To create an IPv6-enabled VPC and its related resources:
8386
### 1. Follow this AWS doc: https://docs.aws.amazon.com/vpc/latest/userguide/create-vpc.html#create-vpc-and-other-resources

pytorch/training/buildspec-2-4-ec2.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
55
version: &VERSION 2.4.0
66
short_version: &SHORT_VERSION "2.4"
77
arch_type: x86
8-
autopatch_build: "True"
8+
autopatch_build: "False"
99

1010
repository_info:
1111
training_repository: &TRAINING_REPOSITORY

pytorch/training/docker/2.4/py3/Dockerfile.cpu

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -166,15 +166,20 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \
166166
fsspec \
167167
"idna>=3.7" \
168168
"tqdm>=4.66.3" \
169-
"requests>=2.32.0" \
170-
"setuptools>=70.0.0" \
171-
"urllib3<2" \
169+
"requests>=2.32.4" \
170+
"setuptools>=78.1.1" \
171+
"urllib3==2.5.0" \
172172
"awscli<2" \
173+
"pip>=25.0" \
173174
&& /opt/conda/bin/mamba clean -afy \
174175
&& rm -rf /etc/apt/sources.list.d/*
175176

176177
# Install common pip packages (in case of conda package is not available)
177-
RUN pip install --no-cache-dir opencv-python mpi4py
178+
RUN pip install --no-cache-dir \
179+
# pin numpy version
180+
"numpy<2" \
181+
opencv-python \
182+
mpi4py
178183

179184
RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.4/license.txt
180185

@@ -217,9 +222,9 @@ RUN pip install --no-cache-dir -U \
217222
${TORCHAUDIO_URL} \
218223
${TORCHTEXT_URL} \
219224
torchtnt \
220-
s3torchconnector \
221-
fastai \
222-
accelerate \
225+
"s3torchconnector==1.2.6" \
226+
"fastai==2.7.18" \
227+
"accelerate==1.0.1" \
223228
# pin numpy requirement for fastai dependency
224229
# requires explicit declaration of spacy, thinc, blis
225230
spacy \

pytorch/training/docker/2.4/py3/Dockerfile.ec2.cpu.core_packages.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
"version_specifier": ">=70.0.0"
4444
},
4545
"urllib3": {
46-
"version_specifier": "<2"
46+
"version_specifier": "==2.5.0"
4747
},
4848
"awscli": {
4949
"version_specifier": "<2"

pytorch/training/docker/2.4/py3/Dockerfile.ec2.cpu.os_scan_allowlist.json

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
"severity": "CRITICAL",
2828
"status": "ACTIVE",
2929
"title": "CVE-2025-32434 - torch",
30-
"reason_to_ignore": "this container is specifically pytorch 2.5.x so we can’t upgrade to 2.6"
30+
"reason_to_ignore": "this container is specifically pytorch 2.4.x so we cannot upgrade to later minor versions"
3131
},
3232
{
3333
"description": "PyTorch is a Python package that provides tensor computation with strong GPU acceleration and deep neural networks built on a tape-based autograd system. In version 2.5.1 and prior, a Remote Command Execution (RCE) vulnerability exists in PyTorch when loading a model using torch.load with weights_only=True. This issue has been patched in version 2.6.0.",
@@ -57,6 +57,35 @@
5757
"status": "ACTIVE",
5858
"title": "CVE-2025-32434 - torch",
5959
"reason_to_ignore": "N/A"
60+
},
61+
{
62+
"description": "In PyTorch <=2.4.1, the RemoteModule has Deserialization RCE. NOTE: this is disputed by multiple parties because this is intended behavior in PyTorch distributed computing.",
63+
"vulnerability_id": "CVE-2024-48063",
64+
"name": "CVE-2024-48063",
65+
"package_name": "torch",
66+
"package_details": {
67+
"file_path": "/opt/conda/lib/python3.11/site-packages/torch-2.4.0+cpu.dist-info/METADATA",
68+
"name": "torch",
69+
"package_manager": "PYTHON",
70+
"version": "2.4.0+cpu",
71+
"release": null
72+
},
73+
"remediation": {
74+
"recommendation": {
75+
"text": "None Provided"
76+
}
77+
},
78+
"cvss_v3_score": 9.8,
79+
"cvss_v30_score": 0.0,
80+
"cvss_v31_score": 9.8,
81+
"cvss_v2_score": 0.0,
82+
"cvss_v3_severity": "CRITICAL",
83+
"source_url": "https://nvd.nist.gov/vuln/detail/CVE-2024-48063",
84+
"source": "NVD",
85+
"severity": "CRITICAL",
86+
"status": "ACTIVE",
87+
"title": "CVE-2024-48063 - torch",
88+
"reason_to_ignore": "this container is specifically pytorch 2.4.x so we cannot upgrade to later minor versions"
6089
}
6190
],
6291
"jupyter_core": [
@@ -89,5 +118,36 @@
89118
"title": "CVE-2025-30167 - jupyter_core",
90119
"reason_to_ignore": "N/A"
91120
}
121+
],
122+
"libgdk-pixbuf-2.0-0": [
123+
{
124+
"description": "A flaw exists in gdk\u2011pixbuf within the gdk_pixbuf__jpeg_image_load_increment function (io-jpeg.c) and in glib\u2019s g_base64_encode_step (glib/gbase64.c). When processing maliciously crafted JPEG images, a heap buffer overflow can occur during Base64 encoding, allowing out-of-bounds reads from heap memory, potentially causing application crashes or arbitrary code execution.",
125+
"vulnerability_id": "CVE-2025-7345",
126+
"name": "CVE-2025-7345",
127+
"package_name": "libgdk-pixbuf-2.0-0",
128+
"package_details": {
129+
"file_path": null,
130+
"name": "libgdk-pixbuf-2.0-0",
131+
"package_manager": "OS",
132+
"version": "2.42.8+dfsg",
133+
"release": "1ubuntu0.3"
134+
},
135+
"remediation": {
136+
"recommendation": {
137+
"text": "None Provided"
138+
}
139+
},
140+
"cvss_v3_score": 7.5,
141+
"cvss_v30_score": 0.0,
142+
"cvss_v31_score": 7.5,
143+
"cvss_v2_score": 0.0,
144+
"cvss_v3_severity": "HIGH",
145+
"source_url": "https://people.canonical.com/~ubuntu-security/cve/2025/CVE-2025-7345.html",
146+
"source": "UBUNTU_CVE",
147+
"severity": "HIGH",
148+
"status": "ACTIVE",
149+
"title": "CVE-2025-7345 - libgdk-pixbuf-2.0-0",
150+
"reason_to_ignore": "No fix"
151+
}
92152
]
93153
}

pytorch/training/docker/2.4/py3/Dockerfile.sagemaker.cpu.os_scan_allowlist.json

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,35 @@
5757
"status": "ACTIVE",
5858
"title": "CVE-2025-32434 - torch",
5959
"reason_to_ignore": "N/A"
60+
},
61+
{
62+
"description": "In PyTorch <=2.4.1, the RemoteModule has Deserialization RCE. NOTE: this is disputed by multiple parties because this is intended behavior in PyTorch distributed computing.",
63+
"vulnerability_id": "CVE-2024-48063",
64+
"name": "CVE-2024-48063",
65+
"package_name": "torch",
66+
"package_details": {
67+
"file_path": "/opt/conda/lib/python3.11/site-packages/torch-2.4.0+cpu.dist-info/METADATA",
68+
"name": "torch",
69+
"package_manager": "PYTHON",
70+
"version": "2.4.0+cpu",
71+
"release": null
72+
},
73+
"remediation": {
74+
"recommendation": {
75+
"text": "None Provided"
76+
}
77+
},
78+
"cvss_v3_score": 9.8,
79+
"cvss_v30_score": 0.0,
80+
"cvss_v31_score": 9.8,
81+
"cvss_v2_score": 0.0,
82+
"cvss_v3_severity": "CRITICAL",
83+
"source_url": "https://nvd.nist.gov/vuln/detail/CVE-2024-48063",
84+
"source": "NVD",
85+
"severity": "CRITICAL",
86+
"status": "ACTIVE",
87+
"title": "CVE-2024-48063 - torch",
88+
"reason_to_ignore": "this container is specifically pytorch 2.4.x so we cannot upgrade to later minor versions"
6089
}
6190
],
6291
"libxml2": [
@@ -149,5 +178,36 @@
149178
"title": "CVE-2025-30167 - jupyter_core",
150179
"reason_to_ignore": "N/A"
151180
}
181+
],
182+
"libgdk-pixbuf-2.0-0": [
183+
{
184+
"description": "A flaw exists in gdk\u2011pixbuf within the gdk_pixbuf__jpeg_image_load_increment function (io-jpeg.c) and in glib\u2019s g_base64_encode_step (glib/gbase64.c). When processing maliciously crafted JPEG images, a heap buffer overflow can occur during Base64 encoding, allowing out-of-bounds reads from heap memory, potentially causing application crashes or arbitrary code execution.",
185+
"vulnerability_id": "CVE-2025-7345",
186+
"name": "CVE-2025-7345",
187+
"package_name": "libgdk-pixbuf-2.0-0",
188+
"package_details": {
189+
"file_path": null,
190+
"name": "libgdk-pixbuf-2.0-0",
191+
"package_manager": "OS",
192+
"version": "2.42.8+dfsg",
193+
"release": "1ubuntu0.3"
194+
},
195+
"remediation": {
196+
"recommendation": {
197+
"text": "None Provided"
198+
}
199+
},
200+
"cvss_v3_score": 7.5,
201+
"cvss_v30_score": 0.0,
202+
"cvss_v31_score": 7.5,
203+
"cvss_v2_score": 0.0,
204+
"cvss_v3_severity": "HIGH",
205+
"source_url": "https://people.canonical.com/~ubuntu-security/cve/2025/CVE-2025-7345.html",
206+
"source": "UBUNTU_CVE",
207+
"severity": "HIGH",
208+
"status": "ACTIVE",
209+
"title": "CVE-2025-7345 - libgdk-pixbuf-2.0-0",
210+
"reason_to_ignore": "No fix"
211+
}
152212
]
153213
}

pytorch/training/docker/2.4/py3/cu124/Dockerfile.ec2.gpu.core_packages.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,15 @@
4747
"version_specifier": ">=70.0.0"
4848
},
4949
"urllib3": {
50-
"version_specifier": "<2"
50+
"version_specifier": "==2.5.0"
5151
},
5252
"awscli": {
5353
"version_specifier": "<2"
5454
},
5555
"numpy": {
5656
"version_specifier": "<2"
57+
},
58+
"triton": {
59+
"version_specifier": "==3.1.0"
5760
}
5861
}

pytorch/training/docker/2.4/py3/cu124/Dockerfile.ec2.gpu.os_scan_allowlist.json

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -906,6 +906,35 @@
906906
"status": "ACTIVE",
907907
"title": "CVE-2025-32434 - torch",
908908
"reason_to_ignore": "N/A"
909+
},
910+
{
911+
"description": "In PyTorch <=2.4.1, the RemoteModule has Deserialization RCE. NOTE: this is disputed by multiple parties because this is intended behavior in PyTorch distributed computing.",
912+
"vulnerability_id": "CVE-2024-48063",
913+
"name": "CVE-2024-48063",
914+
"package_name": "torch",
915+
"package_details": {
916+
"file_path": "/opt/conda/lib/python3.11/site-packages/torch-2.4.0+cu124.dist-info/METADATA",
917+
"name": "torch",
918+
"package_manager": "PYTHON",
919+
"version": "2.4.0+cu124",
920+
"release": null
921+
},
922+
"remediation": {
923+
"recommendation": {
924+
"text": "None Provided"
925+
}
926+
},
927+
"cvss_v3_score": 9.8,
928+
"cvss_v30_score": 0.0,
929+
"cvss_v31_score": 9.8,
930+
"cvss_v2_score": 0.0,
931+
"cvss_v3_severity": "CRITICAL",
932+
"source_url": "https://nvd.nist.gov/vuln/detail/CVE-2024-48063",
933+
"source": "NVD",
934+
"severity": "CRITICAL",
935+
"status": "ACTIVE",
936+
"title": "CVE-2024-48063 - torch",
937+
"reason_to_ignore": "this container is specifically pytorch 2.4.x so we cannot upgrade to later minor versions"
909938
}
910939
],
911940
"jupyter_core": [
@@ -938,5 +967,36 @@
938967
"title": "CVE-2025-30167 - jupyter_core",
939968
"reason_to_ignore": "N/A"
940969
}
970+
],
971+
"libgdk-pixbuf-2.0-0": [
972+
{
973+
"description": "A flaw exists in gdk\u2011pixbuf within the gdk_pixbuf__jpeg_image_load_increment function (io-jpeg.c) and in glib\u2019s g_base64_encode_step (glib/gbase64.c). When processing maliciously crafted JPEG images, a heap buffer overflow can occur during Base64 encoding, allowing out-of-bounds reads from heap memory, potentially causing application crashes or arbitrary code execution.",
974+
"vulnerability_id": "CVE-2025-7345",
975+
"name": "CVE-2025-7345",
976+
"package_name": "libgdk-pixbuf-2.0-0",
977+
"package_details": {
978+
"file_path": null,
979+
"name": "libgdk-pixbuf-2.0-0",
980+
"package_manager": "OS",
981+
"version": "2.42.8+dfsg",
982+
"release": "1ubuntu0.3"
983+
},
984+
"remediation": {
985+
"recommendation": {
986+
"text": "None Provided"
987+
}
988+
},
989+
"cvss_v3_score": 7.5,
990+
"cvss_v30_score": 0.0,
991+
"cvss_v31_score": 7.5,
992+
"cvss_v2_score": 0.0,
993+
"cvss_v3_severity": "HIGH",
994+
"source_url": "https://people.canonical.com/~ubuntu-security/cve/2025/CVE-2025-7345.html",
995+
"source": "UBUNTU_CVE",
996+
"severity": "HIGH",
997+
"status": "ACTIVE",
998+
"title": "CVE-2025-7345 - libgdk-pixbuf-2.0-0",
999+
"reason_to_ignore": "No fix"
1000+
}
9411001
]
9421002
}

0 commit comments

Comments
 (0)