Skip to content

Commit a25bfe5

Browse files
authored
[pytorch] [build] [training] [ec2, sagemaker] Upgrade pytorch 2.6 gdrcopy to 2.5 and add telemetry (#4919)
Upgrade pytorch 2.6 gdrcopy to 2.5 and add telemetry integration
1 parent 3c2f92d commit a25bfe5

9 files changed

+44
-21
lines changed

pytorch/training/buildspec-2-6-ec2.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
55
version: &VERSION 2.6.0
66
short_version: &SHORT_VERSION "2.6"
77
arch_type: x86
8-
autopatch_build: "True"
8+
# autopatch_build: "True"
99

1010
repository_info:
1111
training_repository: &TRAINING_REPOSITORY

pytorch/training/buildspec-2-6-sm.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
55
version: &VERSION 2.6.0
66
short_version: &SHORT_VERSION "2.6"
77
arch_type: x86
8-
autopatch_build: "True"
8+
# autopatch_build: "True"
99

1010
repository_info:
1111
training_repository: &TRAINING_REPOSITORY

pytorch/training/docker/2.6/py3/Dockerfile.cpu

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,10 @@ COPY sitecustomize.py /usr/local/lib/${PYTHON_SHORT_VERSION}/sitecustomize.py
198198

199199
RUN chmod +x /usr/local/bin/deep_learning_container.py
200200

201+
COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
202+
RUN chmod +x /usr/local/bin/bash_telemetry.sh
203+
RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc
204+
201205
# Removing the cache as it is needed for security verification
202206
RUN rm -rf /root/.cache | true
203207

@@ -244,7 +248,8 @@ RUN pip install --no-cache-dir -U \
244248
# pin numpy requirement for fastai dependency
245249
# requires explicit declaration of spacy, thinc, blis
246250
spacy \
247-
thinc \
251+
# thinc 8.3.6 is not compatible with numpy 1.26.4 (SageMaker doesn't support the latest numpy)
252+
thinc==8.3.4 \
248253
blis \
249254
numpy \
250255
&& pip uninstall -y dataclasses
@@ -262,6 +267,10 @@ RUN HOME_DIR=/root \
262267
# Removing the cache as it is needed for security verification
263268
RUN rm -rf /root/.cache | true
264269

270+
COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh
271+
RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh
272+
ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"]
273+
265274
# Starts framework
266275
CMD ["/bin/bash"]
267276

@@ -313,7 +322,8 @@ RUN pip install --no-cache-dir -U \
313322
# pin numpy requirement for fastai dependency
314323
# requires explicit declaration of spacy, thinc, blis
315324
spacy \
316-
thinc \
325+
# thinc 8.3.6 is not compatible with numpy 1.26.4 (SageMaker doesn't support the latest numpy)
326+
thinc==8.3.4 \
317327
blis \
318328
numpy \
319329
&& pip uninstall -y dataclasses
@@ -324,7 +334,7 @@ RUN pip install --no-cache-dir -U \
324334
"sagemaker>=2,<3" \
325335
"sagemaker-experiments<1" \
326336
sagemaker-pytorch-training \
327-
sagemaker-training
337+
"sagemaker-training==4.9.0"
328338

329339
# Install extra packages
330340
RUN pip install --no-cache-dir -U \

pytorch/training/docker/2.6/py3/Dockerfile.ec2.cpu.core_packages.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
{
22
"accelerate": {
3-
"version_specifier": "==1.4.0",
3+
"version_specifier": "==1.8.0",
44
"skip": "True"
55
},
66
"s3torchconnector": {
7-
"version_specifier": "==1.3.2",
7+
"version_specifier": "==1.4.1",
88
"skip": "True"
99
},
1010
"torch": {

pytorch/training/docker/2.6/py3/Dockerfile.sagemaker.cpu.core_packages.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
{
22
"accelerate": {
3-
"version_specifier": "==1.4.0",
3+
"version_specifier": "==1.8.0",
44
"skip": "True"
55
},
66
"s3torchconnector": {
7-
"version_specifier": "==1.3.2",
7+
"version_specifier": "==1.4.1",
88
"skip": "True"
99
},
1010
"torch": {

pytorch/training/docker/2.6/py3/cu126/Dockerfile.ec2.gpu.core_packages.json

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"accelerate": {
3-
"version_specifier": "==1.4.0",
3+
"version_specifier": "==1.8.0",
44
"skip": "True"
55
},
66
"flash-attn": {
@@ -12,7 +12,7 @@
1212
"skip": "True"
1313
},
1414
"s3torchconnector": {
15-
"version_specifier": "==1.3.2",
15+
"version_specifier": "==1.4.1",
1616
"skip": "True"
1717
},
1818
"torch": {
@@ -51,5 +51,8 @@
5151
},
5252
"awscli": {
5353
"version_specifier": "<2"
54+
},
55+
"triton": {
56+
"version_specifier": "==3.2.0"
5457
}
5558
}

pytorch/training/docker/2.6/py3/cu126/Dockerfile.gpu

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ ARG CUDA_VERSION=12.6.3
77
ARG CUDNN_VERSION=9.7.0.66
88
ARG NCCL_VERSION=2.23.4
99
ARG EFA_VERSION=1.38.0
10-
ARG GDRCOPY_VERSION=2.4.4
10+
ARG GDRCOPY_VERSION=2.5
1111
ARG TE_VERSION=2.0
1212
ARG FLASH_ATTN_VERSION=2.7.3
1313

@@ -247,6 +247,10 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py
247247
COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh
248248
RUN chmod +x /usr/local/bin/start_cuda_compat.sh
249249

250+
COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
251+
RUN chmod +x /usr/local/bin/bash_telemetry.sh
252+
RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc
253+
250254
# Removing the cache as it is needed for security verification
251255
RUN rm -rf /root/.cache | true
252256

@@ -288,7 +292,7 @@ RUN pip install --no-cache-dir -U \
288292
${TORCHTEXT_URL} \
289293
${TORCHDATA_URL} \
290294
torchtnt \
291-
triton \
295+
triton==3.2.0 \
292296
s3torchconnector \
293297
# fastai hasn't released a version compatible with torch 2.6.0
294298
# https://github.com/fastai/fastai/issues/4068
@@ -297,7 +301,8 @@ RUN pip install --no-cache-dir -U \
297301
# pin numpy requirement for fastai dependency
298302
# requires explicit declaration of spacy, thinc, blis
299303
spacy \
300-
thinc \
304+
# thinc 8.3.6 is not compatible with numpy 1.26.4 (SageMaker doesn't support the latest numpy)
305+
thinc==8.3.4 \
301306
blis \
302307
numpy \
303308
&& pip uninstall dataclasses
@@ -306,7 +311,7 @@ RUN pip install --no-cache-dir -U \
306311
# The test binaries require the CUDA driver library, which can be found in conda
307312
# So update the linker path to point to it to avoid -Lcuda not found
308313
RUN cd /tmp \
309-
&& git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \
314+
&& git clone https://github.com/NVIDIA/gdrcopy.git -b R${GDRCOPY_VERSION} \
310315
&& cd gdrcopy \
311316
&& sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \
312317
&& CUDA=${CUDA_HOME} make install \
@@ -396,7 +401,7 @@ RUN pip install --no-cache-dir -U \
396401
${TORCHTEXT_URL} \
397402
${TORCHDATA_URL} \
398403
torchtnt \
399-
triton \
404+
triton==3.2.0 \
400405
s3torchconnector \
401406
# fastai hasn't released a version compatible with torch 2.6.0
402407
# https://github.com/fastai/fastai/issues/4068
@@ -405,7 +410,8 @@ RUN pip install --no-cache-dir -U \
405410
# pin numpy requirement for fastai dependency
406411
# requires explicit declaration of spacy, thinc, blis
407412
spacy \
408-
thinc \
413+
# thinc 8.3.6 is not compatible with numpy 1.26.4 (SageMaker doesn't support the latest numpy)
414+
thinc==8.3.4 \
409415
blis \
410416
numpy \
411417
&& pip uninstall -y dataclasses
@@ -414,7 +420,7 @@ RUN pip install --no-cache-dir -U \
414420
# The test binaries require the CUDA driver library, which can be found in conda
415421
# So update the linker path to point to it to avoid -Lcuda not found
416422
RUN cd /tmp \
417-
&& git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \
423+
&& git clone https://github.com/NVIDIA/gdrcopy.git -b R${GDRCOPY_VERSION} \
418424
&& cd gdrcopy \
419425
&& sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \
420426
&& CUDA=${CUDA_HOME} make install \
@@ -442,7 +448,7 @@ RUN pip install --no-cache-dir -U \
442448
"sagemaker>=2,<3" \
443449
"sagemaker-experiments<1" \
444450
sagemaker-pytorch-training \
445-
sagemaker-training
451+
"sagemaker-training==4.9.0"
446452

447453
# Install extra packages
448454
RUN pip install --no-cache-dir -U \

pytorch/training/docker/2.6/py3/cu126/Dockerfile.sagemaker.gpu.core_packages.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"accelerate": {
3-
"version_specifier": "==1.4.0",
3+
"version_specifier": "==1.8.0",
44
"skip": "True"
55
},
66
"flash-attn": {
@@ -12,7 +12,7 @@
1212
"skip": "True"
1313
},
1414
"s3torchconnector": {
15-
"version_specifier": "==1.3.2",
15+
"version_specifier": "==1.4.1",
1616
"skip": "True"
1717
},
1818
"torch": {

test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_6.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ def test_pytorch_2_6_gpu(
3636
(common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)),
3737
(common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)),
3838
(common_cases.pytorch_telemetry_framework_gpu, (pytorch_training, ec2_connection)),
39+
(common_cases.pytorch_telemetry_entrypoint_gpu, (pytorch_training, ec2_connection)),
40+
(common_cases.pytorch_telemetry_bashrc_gpu, (pytorch_training, ec2_connection)),
3941
]
4042

4143
if "sagemaker" in pytorch_training:
@@ -126,6 +128,8 @@ def test_pytorch_2_6_cpu(pytorch_training___2__6, ec2_connection, cpu_only):
126128
(common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)),
127129
(common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)),
128130
(common_cases.pytorch_telemetry_framework_cpu, (pytorch_training, ec2_connection)),
131+
(common_cases.pytorch_telemetry_entrypoint_cpu, (pytorch_training, ec2_connection)),
132+
(common_cases.pytorch_telemetry_bashrc_cpu, (pytorch_training, ec2_connection)),
129133
]
130134

131135
if "sagemaker" in pytorch_training:

0 commit comments

Comments
 (0)