[pytorch] [build] [training] [ec2, sagemaker] Upgrade pytorch 2.6 gdrcopy to 2.5 and add telemetry (#4919)

jinyan-li1 · web-flow · commit a25bfe5bca57 · 2025-06-20T11:21:47.000-07:00
Upgrade GDRCopy from 2.4.4 to 2.5 in the PyTorch 2.6 training images and add telemetry integration
diff --git a/pytorch/training/buildspec-2-6-ec2.yml b/pytorch/training/buildspec-2-6-ec2.yml
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
 version: &VERSION 2.6.0
 short_version: &SHORT_VERSION "2.6"
 arch_type: x86
-autopatch_build: "True"
+# autopatch_build: "True"
 
 repository_info:
   training_repository: &TRAINING_REPOSITORY
diff --git a/pytorch/training/buildspec-2-6-sm.yml b/pytorch/training/buildspec-2-6-sm.yml
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
 version: &VERSION 2.6.0
 short_version: &SHORT_VERSION "2.6"
 arch_type: x86
-autopatch_build: "True"
+# autopatch_build: "True"
 
 repository_info:
   training_repository: &TRAINING_REPOSITORY
diff --git a/pytorch/training/docker/2.6/py3/Dockerfile.cpu b/pytorch/training/docker/2.6/py3/Dockerfile.cpu
@@ -198,6 +198,10 @@ COPY sitecustomize.py /usr/local/lib/${PYTHON_SHORT_VERSION}/sitecustomize.py
 
 RUN chmod +x /usr/local/bin/deep_learning_container.py
 
+COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
+RUN chmod +x /usr/local/bin/bash_telemetry.sh
+RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc
+
 # Removing the cache as it is needed for security verification
 RUN rm -rf /root/.cache | true
 
@@ -244,7 +248,8 @@ RUN pip install --no-cache-dir -U \
     # pin numpy requirement for fastai dependency
     # requires explicit declaration of spacy, thinc, blis
     spacy \
-    thinc \
+    #thinc 8.3.6 is not compatible with numpy 1.26.4 (sagemaker doesn't support latest numpy)
+    thinc==8.3.4 \
     blis \
     numpy \
  && pip uninstall -y dataclasses
@@ -262,6 +267,10 @@ RUN HOME_DIR=/root \
 # Removing the cache as it is needed for security verification
 RUN rm -rf /root/.cache | true
 
+COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh
+RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh
+ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"]
+
 # Starts framework
 CMD ["/bin/bash"]
 
@@ -313,7 +322,8 @@ RUN pip install --no-cache-dir -U \
     # pin numpy requirement for fastai dependency
     # requires explicit declaration of spacy, thinc, blis
     spacy \
-    thinc \
+    #thinc 8.3.6 is not compatible with numpy 1.26.4 (sagemaker doesn't support latest numpy)
+    thinc==8.3.4 \
     blis \
     numpy \
  && pip uninstall -y dataclasses
@@ -324,7 +334,7 @@ RUN pip install --no-cache-dir -U \
     "sagemaker>=2,<3" \
     "sagemaker-experiments<1" \
     sagemaker-pytorch-training \
-    sagemaker-training
+    "sagemaker-training==4.9.0"
 
 # Install extra packages
 RUN pip install --no-cache-dir -U \
diff --git a/pytorch/training/docker/2.6/py3/Dockerfile.ec2.cpu.core_packages.json b/pytorch/training/docker/2.6/py3/Dockerfile.ec2.cpu.core_packages.json
@@ -1,10 +1,10 @@
 {
   "accelerate": {
-    "version_specifier": "==1.4.0",
+    "version_specifier": "==1.8.0",
     "skip": "True"
   },
   "s3torchconnector": {
-    "version_specifier": "==1.3.2",
+    "version_specifier": "==1.4.1",
     "skip": "True"
   },
   "torch": {
diff --git a/pytorch/training/docker/2.6/py3/Dockerfile.sagemaker.cpu.core_packages.json b/pytorch/training/docker/2.6/py3/Dockerfile.sagemaker.cpu.core_packages.json
@@ -1,10 +1,10 @@
 {
   "accelerate": {
-    "version_specifier": "==1.4.0",
+    "version_specifier": "==1.8.0",
     "skip": "True"
   },
   "s3torchconnector": {
-    "version_specifier": "==1.3.2",
+    "version_specifier": "==1.4.1",
     "skip": "True"
   },
   "torch": {
diff --git a/pytorch/training/docker/2.6/py3/cu126/Dockerfile.ec2.gpu.core_packages.json b/pytorch/training/docker/2.6/py3/cu126/Dockerfile.ec2.gpu.core_packages.json
@@ -1,6 +1,6 @@
 {
   "accelerate": {
-    "version_specifier": "==1.4.0",
+    "version_specifier": "==1.8.0",
     "skip": "True"
   },
   "flash-attn": {
@@ -12,7 +12,7 @@
     "skip": "True"
   },
   "s3torchconnector": {
-    "version_specifier": "==1.3.2",
+    "version_specifier": "==1.4.1",
     "skip": "True"
   },
   "torch": {
@@ -51,5 +51,8 @@
   },
   "awscli": {
     "version_specifier": "<2"
+  },
+  "triton": {
+    "version_specifier": "==3.2.0"
   }
 }
diff --git a/pytorch/training/docker/2.6/py3/cu126/Dockerfile.gpu b/pytorch/training/docker/2.6/py3/cu126/Dockerfile.gpu
@@ -7,7 +7,7 @@ ARG CUDA_VERSION=12.6.3
 ARG CUDNN_VERSION=9.7.0.66
 ARG NCCL_VERSION=2.23.4
 ARG EFA_VERSION=1.38.0
-ARG GDRCOPY_VERSION=2.4.4
+ARG GDRCOPY_VERSION=2.5
 ARG TE_VERSION=2.0
 ARG FLASH_ATTN_VERSION=2.7.3
 
@@ -247,6 +247,10 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py
 COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh
 RUN chmod +x /usr/local/bin/start_cuda_compat.sh
 
+COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
+RUN chmod +x /usr/local/bin/bash_telemetry.sh
+RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc
+
 # Removing the cache as it is needed for security verification
 RUN rm -rf /root/.cache | true
 
@@ -288,7 +292,7 @@ RUN pip install --no-cache-dir -U \
     ${TORCHTEXT_URL} \
     ${TORCHDATA_URL} \
     torchtnt \
-    triton \
+    triton==3.2.0 \
     s3torchconnector \
     # fastai hasn't released a version compatible with torch 2.6.0
     # https://github.com/fastai/fastai/issues/4068
@@ -297,7 +301,8 @@ RUN pip install --no-cache-dir -U \
     # pin numpy requirement for fastai dependency
     # requires explicit declaration of spacy, thinc, blis
     spacy \
-    thinc \
+    #thinc 8.3.6 is not compatible with numpy 1.26.4 (sagemaker doesn't support latest numpy)
+    thinc==8.3.4 \
     blis \
     numpy \
  && pip uninstall dataclasses
@@ -306,7 +311,7 @@ RUN pip install --no-cache-dir -U \
 # The test binaries require the CUDA driver library, which can be found in conda,
 # so update the linker path to point to it to avoid the "-Lcuda not found" error
 RUN cd /tmp \
- && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \
+ && git clone https://github.com/NVIDIA/gdrcopy.git -b R${GDRCOPY_VERSION} \
  && cd gdrcopy \
  && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \
  && CUDA=${CUDA_HOME} make install \
@@ -396,7 +401,7 @@ RUN pip install --no-cache-dir -U \
     ${TORCHTEXT_URL} \
     ${TORCHDATA_URL} \
     torchtnt \
-    triton \
+    triton==3.2.0 \
     s3torchconnector \
     # fastai hasn't released a version compatible with torch 2.6.0
     # https://github.com/fastai/fastai/issues/4068
@@ -405,7 +410,8 @@ RUN pip install --no-cache-dir -U \
     # pin numpy requirement for fastai dependency
     # requires explicit declaration of spacy, thinc, blis
     spacy \
-    thinc \
+    #thinc 8.3.6 is not compatible with numpy 1.26.4 (sagemaker doesn't support latest numpy)
+    thinc==8.3.4 \
     blis \
     numpy \
  && pip uninstall -y dataclasses
@@ -414,7 +420,7 @@ RUN pip install --no-cache-dir -U \
 # The test binaries require the CUDA driver library, which can be found in conda,
 # so update the linker path to point to it to avoid the "-Lcuda not found" error
 RUN cd /tmp \
- && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \
+ && git clone https://github.com/NVIDIA/gdrcopy.git -b R${GDRCOPY_VERSION} \
  && cd gdrcopy \
  && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \
  && CUDA=${CUDA_HOME} make install \
@@ -442,7 +448,7 @@ RUN pip install --no-cache-dir -U \
     "sagemaker>=2,<3" \
     "sagemaker-experiments<1" \
     sagemaker-pytorch-training \
-    sagemaker-training
+    "sagemaker-training==4.9.0"
 
 # Install extra packages
 RUN pip install --no-cache-dir -U \
diff --git a/pytorch/training/docker/2.6/py3/cu126/Dockerfile.sagemaker.gpu.core_packages.json b/pytorch/training/docker/2.6/py3/cu126/Dockerfile.sagemaker.gpu.core_packages.json
@@ -1,6 +1,6 @@
 {
   "accelerate": {
-    "version_specifier": "==1.4.0",
+    "version_specifier": "==1.8.0",
     "skip": "True"
   },
   "flash-attn": {
@@ -12,7 +12,7 @@
     "skip": "True"
   },
   "s3torchconnector": {
-    "version_specifier": "==1.3.2",
+    "version_specifier": "==1.4.1",
     "skip": "True"
   },
   "torch": {
diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_6.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_6.py
@@ -36,6 +36,8 @@ def test_pytorch_2_6_gpu(
         (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)),
         (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)),
         (common_cases.pytorch_telemetry_framework_gpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_telemetry_entrypoint_gpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_telemetry_bashrc_gpu, (pytorch_training, ec2_connection)),
     ]
 
     if "sagemaker" in pytorch_training:
@@ -126,6 +128,8 @@ def test_pytorch_2_6_cpu(pytorch_training___2__6, ec2_connection, cpu_only):
         (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)),
         (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)),
         (common_cases.pytorch_telemetry_framework_cpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_telemetry_entrypoint_cpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_telemetry_bashrc_cpu, (pytorch_training, ec2_connection)),
     ]
 
     if "sagemaker" in pytorch_training: