Fix TF entrypoint and clean up patch script (#4972)

sallyseok · web-flow · commit ecdd9532fbf2 · 2025-06-30T10:31:44.000-07:00
* Fix TF entrypoint and clean up patch script * Reorder patch script * fix * debugging * fix * retest security * Restore ['dlc_developer_config.toml'] dlc_developer_config.toml: ('Restore to ' 'https://raw.githubusercontent.com/aws/deep-learning-containers/master/dlc_developer_config.toml') * revert buildspec
diff --git a/miscellaneous_dockerfiles/Dockerfile.autopatch b/miscellaneous_dockerfiles/Dockerfile.autopatch
@@ -8,7 +8,7 @@ ARG LATEST_RELEASED_IMAGE_URI=""
 COPY patching-info /opt/aws/dlc/patching-info
 COPY miscellaneous_scripts /opt/aws/dlc/miscellaneous_scripts
 
-COPY new-torchserve-entrypoint /tmp/new-torchserve-entrypoint
+COPY new-tf-entrypoint /tmp/new-tf-entrypoint
 COPY new_start_with_right_hostname /tmp/new_start_with_right_hostname
 COPY new_pytorch_inference_start_cuda_compat /tmp/new_pytorch_inference_start_cuda_compat
 COPY new_pytorch_training_start_cuda_compat /tmp/new_pytorch_training_start_cuda_compat
diff --git a/miscellaneous_scripts/dockerfile_patch_script.sh b/miscellaneous_scripts/dockerfile_patch_script.sh
@@ -12,7 +12,7 @@ if [ ! -d $PATCHING_INFO_PATH/patch-details-archive ] ; then \
     echo $LATEST_RELEASED_IMAGE_SHA >> $PATCHING_INFO_PATH/patch-details-archive/first_image_sha.txt ; \
 fi
 
-## We use > instead of >> since we want to override the contents of the previous file.
+# We use > instead of >> since we want to overwrite the contents of the previous file
 echo $LATEST_RELEASED_IMAGE_SHA > $PATCHING_INFO_PATH/patch-details-archive/last_released_image_sha.txt
 
 # If patch-details is present, move it to patch-details-archive and add image_sha to the folder
@@ -27,6 +27,19 @@ fi
 # Rename the patch-details-current folder to patch-details
 mv $PATCHING_INFO_PATH/patch-details-current $PATCHING_INFO_PATH/patch-details
 
+# Language patching
+chmod +x $PATCHING_INFO_PATH/patch-details/install_script_language.sh && \
+$PATCHING_INFO_PATH/patch-details/install_script_language.sh
+
+
+##### Temporary Fixes #####
+
+# For TF 2.18 training SageMaker GPU images, replace the entrypoint
+if [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazonaws\.com/tensorflow-training:2\.18\.0-gpu(.+)sagemaker ]]; then
+    mv /tmp/new-tf-entrypoint /usr/local/bin/dockerd-entrypoint.py
+    chmod +x /usr/local/bin/dockerd-entrypoint.py
+fi
+
 # For PT 2.4, 2.5 and 2.6 inference, install openssh-client to make mpi4py working
 if [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazonaws\.com/pytorch-inference:2\.[4-6]\.[0-9]+-gpu ]]; then
     apt update && apt install -y --no-install-recommends openssh-client openssh-server && echo "Installed openssh-client openssh-server"
@@ -39,29 +52,19 @@ elif [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazon
     curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.7/license.txt
 fi
 
-# Install packages and derive history and package diff data
-chmod +x $PATCHING_INFO_PATH/patch-details/install_script_language.sh && \
-$PATCHING_INFO_PATH/patch-details/install_script_language.sh
-
 # Upgrade sagemaker-training
 if [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazonaws\.com/pytorch-training:2\.[4-6](.+)sagemaker ]]; then
     pip install -U "sagemaker-training>4.7.4" "protobuf>=4.25.8,<6"
 fi
 
-# For PT inference sagemaker images, replace torchserve-entrypoint.py with the latest one
-# replace start_cuda_compat.sh if it's a gpu image
-if [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazonaws\.com/pytorch-inference(.+)gpu(.+)sagemaker ]]; then
-    mv /tmp/new-torchserve-entrypoint /usr/local/bin/dockerd-entrypoint.py
+# For PT 2.4-2.6 inference gpu sagemaker images, replace start_cuda_compat.sh
+if [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazonaws\.com/pytorch-inference:2\.[4-6]\.[0-9]+-gpu(.+)sagemaker ]]; then
     mv /tmp/new_pytorch_inference_start_cuda_compat /usr/local/bin/start_cuda_compat.sh
-    chmod +x /usr/local/bin/dockerd-entrypoint.py
     chmod +x /usr/local/bin/start_cuda_compat.sh
-elif [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazonaws\.com/pytorch-inference(.+)sagemaker ]]; then
-    mv /tmp/new-torchserve-entrypoint /usr/local/bin/dockerd-entrypoint.py
-    chmod +x /usr/local/bin/dockerd-entrypoint.py
 fi
 
 # For PT training gpu sagemaker images, add dynamic cuda compat mounting script to entrypoint
-if [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazonaws\.com/pytorch-training(.+)gpu(.+)sagemaker ]]; then
+if [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazonaws\.com/pytorch-training:2\.[4-6]\.[0-9]+-gpu(.+)sagemaker ]]; then
     mv /tmp/new_start_with_right_hostname /usr/local/bin/start_with_right_hostname.sh
     mv /tmp/new_pytorch_training_start_cuda_compat /usr/local/bin/start_cuda_compat.sh
     chmod +x /usr/local/bin/start_with_right_hostname.sh
@@ -74,7 +77,8 @@ if [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazonaw
     rm -rf /usr/local/cuda/bin/nvdisasm*
 fi
 
-pip cache purge
+###########################
+
 
 ## Update GPG key in case Nginx exists
 VARIABLE=$(apt-key list 2>&1  |  { grep -c nginx || true; }) && \
@@ -84,14 +88,12 @@ if [ $VARIABLE != 0 ]; then \
     apt-key add /usr/share/keyrings/nginx-archive-keyring.gpg;
 fi
 
+# OS patching
 chmod +x $PATCHING_INFO_PATH/patch-details/install_script_os.sh && \
 $PATCHING_INFO_PATH/patch-details/install_script_os.sh
 
-rm -rf /var/lib/apt/lists/* && \
-  apt-get clean
-
+# Derive history and package diff data
 python /opt/aws/dlc/miscellaneous_scripts/derive_history.py
-
 python /opt/aws/dlc/miscellaneous_scripts/extract_apt_patch_data.py --save-result-path $PATCHING_INFO_PATH/patch-details/os_summary.json --mode_type modify
 
 set -e
@@ -106,4 +108,9 @@ HOME_DIR=/root \
     && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python \
     && rm -rf ${HOME_DIR}/oss_compliance* || exit
 
+# Clean up
+echo "Cleaning up"
+pip cache purge
+rm -rf /var/lib/apt/lists/* && \
+    apt-get clean
 rm -rf /tmp/* && rm -rf /opt/aws/dlc/miscellaneous_scripts
diff --git a/src/patch_helper.py b/src/patch_helper.py
@@ -256,9 +256,14 @@ def conduct_autopatch_build_setup(pre_push_image_object: DockerImage, download_p
         "build_artifacts",
     )
 
-    torchserve_entrypoint_path = os.path.join(
-        pytorch_inference_artifacts_path,
-        "torchserve-entrypoint.py",
+    tf_entrypoint_path = os.path.join(
+        os.sep,
+        get_cloned_folder_path(),
+        "tensorflow",
+        "training",
+        "docker",
+        "build_artifacts",
+        "dockerd-entrypoint.py",
     )
 
     start_with_right_hostname_path = os.path.join(
@@ -304,9 +309,9 @@ def conduct_autopatch_build_setup(pre_push_image_object: DockerImage, download_p
             "source": complete_patching_info_dump_location,
             "target": "patching-info",
         },
-        "new-torchserve-entrypoint": {
-            "source": torchserve_entrypoint_path,
-            "target": "new-torchserve-entrypoint",
+        "new-tf-entrypoint": {
+            "source": tf_entrypoint_path,
+            "target": "new-tf-entrypoint",
         },
         "new_start_with_right_hostname": {
             "source": start_with_right_hostname_path,
diff --git a/test/dlc_tests/sanity/test_anaconda.py b/test/dlc_tests/sanity/test_anaconda.py
@@ -15,7 +15,7 @@ def test_repo_anaconda_not_present(image):
 
         # First check to see if image has conda installed, if not, skip test since no packages installed from conda present
         conda_present = test_utils.run_cmd_on_container(
-            container_name, ctx, 'find . -name conda -not -path "**/.github/*"'
+            container_name, ctx, 'find . -name conda -not -path "**/.github/*" -ignore_readdir_race'
         ).stdout.strip()
         if not conda_present:
             pytest.skip(f"Image {image} does not have conda installed, skipping test.")