Skip to content

Commit ecdd953

Browse files
authored
Fix TF entrypoint and clean up patch script (#4972)
* Fix TF entrypoint and clean up patch script
* Reorder patch script
* fix
* debugging
* fix
* retest security
* Restore ['dlc_developer_config.toml']
  dlc_developer_config.toml: ('Restore to ' 'https://raw.githubusercontent.com/aws/deep-learning-containers/master/dlc_developer_config.toml')
* revert buildspec
1 parent ac8313e commit ecdd953

File tree

4 files changed

+39
-27
lines changed

4 files changed

+39
-27
lines changed

miscellaneous_dockerfiles/Dockerfile.autopatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ ARG LATEST_RELEASED_IMAGE_URI=""
88
COPY patching-info /opt/aws/dlc/patching-info
99
COPY miscellaneous_scripts /opt/aws/dlc/miscellaneous_scripts
1010

11-
COPY new-torchserve-entrypoint /tmp/new-torchserve-entrypoint
11+
COPY new-tf-entrypoint /tmp/new-tf-entrypoint
1212
COPY new_start_with_right_hostname /tmp/new_start_with_right_hostname
1313
COPY new_pytorch_inference_start_cuda_compat /tmp/new_pytorch_inference_start_cuda_compat
1414
COPY new_pytorch_training_start_cuda_compat /tmp/new_pytorch_training_start_cuda_compat

miscellaneous_scripts/dockerfile_patch_script.sh

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ if [ ! -d $PATCHING_INFO_PATH/patch-details-archive ] ; then \
1212
echo $LATEST_RELEASED_IMAGE_SHA >> $PATCHING_INFO_PATH/patch-details-archive/first_image_sha.txt ; \
1313
fi
1414

15-
## We use > instead of >> since we want to override the contents of the previous file.
15+
# We use > instead of >> since we want to override the contents of the previous file
1616
echo $LATEST_RELEASED_IMAGE_SHA > $PATCHING_INFO_PATH/patch-details-archive/last_released_image_sha.txt
1717

1818
# If patch-details is present, move it to patch-details-archive and add image_sha to the folder
@@ -27,6 +27,19 @@ fi
2727
# Rename the patch-details-current folder to patch-details
2828
mv $PATCHING_INFO_PATH/patch-details-current $PATCHING_INFO_PATH/patch-details
2929

30+
# Language patching
31+
chmod +x $PATCHING_INFO_PATH/patch-details/install_script_language.sh && \
32+
$PATCHING_INFO_PATH/patch-details/install_script_language.sh
33+
34+
35+
##### Temporary Fixes #####
36+
37+
# For TF 2.18 training sm gpu, replace entrypoint
38+
if [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazonaws\.com/tensorflow-training:2\.18\.0-gpu(.+)sagemaker ]]; then
39+
mv /tmp/new-tf-entrypoint /usr/local/bin/dockerd-entrypoint.py
40+
chmod +x /usr/local/bin/dockerd-entrypoint.py
41+
fi
42+
3043
# For PT 2.4, 2.5 and 2.6 inference, install openssh-client and openssh-server so that mpi4py works
3144
if [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazonaws\.com/pytorch-inference:2\.[4-6]\.[0-9]+-gpu ]]; then
3245
apt update && apt install -y --no-install-recommends openssh-client openssh-server && echo "Installed openssh-client openssh-server"
@@ -39,29 +52,19 @@ elif [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazon
3952
curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.7/license.txt
4053
fi
4154

42-
# Install packages and derive history and package diff data
43-
chmod +x $PATCHING_INFO_PATH/patch-details/install_script_language.sh && \
44-
$PATCHING_INFO_PATH/patch-details/install_script_language.sh
45-
4655
# Upgrade sagemaker-training
4756
if [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazonaws\.com/pytorch-training:2\.[4-6](.+)sagemaker ]]; then
4857
pip install -U "sagemaker-training>4.7.4" "protobuf>=4.25.8,<6"
4958
fi
5059

51-
# For PT inference sagemaker images, replace torchserve-entrypoint.py with the latest one
52-
# replace start_cuda_compat.sh if it's a gpu image
53-
if [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazonaws\.com/pytorch-inference(.+)gpu(.+)sagemaker ]]; then
54-
mv /tmp/new-torchserve-entrypoint /usr/local/bin/dockerd-entrypoint.py
60+
# For PT inference gpu sagemaker images, replace start_cuda_compat.sh
61+
if [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazonaws\.com/pytorch-inference:2\.[4-6]\.[0-9]+-gpu(.+)sagemaker ]]; then
5562
mv /tmp/new_pytorch_inference_start_cuda_compat /usr/local/bin/start_cuda_compat.sh
56-
chmod +x /usr/local/bin/dockerd-entrypoint.py
5763
chmod +x /usr/local/bin/start_cuda_compat.sh
58-
elif [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazonaws\.com/pytorch-inference(.+)sagemaker ]]; then
59-
mv /tmp/new-torchserve-entrypoint /usr/local/bin/dockerd-entrypoint.py
60-
chmod +x /usr/local/bin/dockerd-entrypoint.py
6164
fi
6265

6366
# For PT training gpu sagemaker images, add dynamic cuda compat mounting script to entrypoint
64-
if [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazonaws\.com/pytorch-training(.+)gpu(.+)sagemaker ]]; then
67+
if [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazonaws\.com/pytorch-training:2\.[4-6]\.[0-9]+-gpu(.+)sagemaker ]]; then
6568
mv /tmp/new_start_with_right_hostname /usr/local/bin/start_with_right_hostname.sh
6669
mv /tmp/new_pytorch_training_start_cuda_compat /usr/local/bin/start_cuda_compat.sh
6770
chmod +x /usr/local/bin/start_with_right_hostname.sh
@@ -74,7 +77,8 @@ if [[ $LATEST_RELEASED_IMAGE_URI =~ ^763104351884\.dkr\.ecr\.us-west-2\.amazonaw
7477
rm -rf /usr/local/cuda/bin/nvdisasm*
7578
fi
7679

77-
pip cache purge
80+
###########################
81+
7882

7983
## Update GPG key in case Nginx exists
8084
VARIABLE=$(apt-key list 2>&1 | { grep -c nginx || true; }) && \
@@ -84,14 +88,12 @@ if [ $VARIABLE != 0 ]; then \
8488
apt-key add /usr/share/keyrings/nginx-archive-keyring.gpg;
8589
fi
8690

91+
# OS patching
8792
chmod +x $PATCHING_INFO_PATH/patch-details/install_script_os.sh && \
8893
$PATCHING_INFO_PATH/patch-details/install_script_os.sh
8994

90-
rm -rf /var/lib/apt/lists/* && \
91-
apt-get clean
92-
95+
# Derive history and package diff data
9396
python /opt/aws/dlc/miscellaneous_scripts/derive_history.py
94-
9597
python /opt/aws/dlc/miscellaneous_scripts/extract_apt_patch_data.py --save-result-path $PATCHING_INFO_PATH/patch-details/os_summary.json --mode_type modify
9698

9799
set -e
@@ -106,4 +108,9 @@ HOME_DIR=/root \
106108
&& ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python \
107109
&& rm -rf ${HOME_DIR}/oss_compliance* || exit
108110

111+
# Clean up
112+
echo "Cleaning up"
113+
pip cache purge
114+
rm -rf /var/lib/apt/lists/* && \
115+
apt-get clean
109116
rm -rf /tmp/* && rm -rf /opt/aws/dlc/miscellaneous_scripts

src/patch_helper.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -256,9 +256,14 @@ def conduct_autopatch_build_setup(pre_push_image_object: DockerImage, download_p
256256
"build_artifacts",
257257
)
258258

259-
torchserve_entrypoint_path = os.path.join(
260-
pytorch_inference_artifacts_path,
261-
"torchserve-entrypoint.py",
259+
tf_entrypoint_path = os.path.join(
260+
os.sep,
261+
get_cloned_folder_path(),
262+
"tensorflow",
263+
"training",
264+
"docker",
265+
"build_artifacts",
266+
"dockerd-entrypoint.py",
262267
)
263268

264269
start_with_right_hostname_path = os.path.join(
@@ -304,9 +309,9 @@ def conduct_autopatch_build_setup(pre_push_image_object: DockerImage, download_p
304309
"source": complete_patching_info_dump_location,
305310
"target": "patching-info",
306311
},
307-
"new-torchserve-entrypoint": {
308-
"source": torchserve_entrypoint_path,
309-
"target": "new-torchserve-entrypoint",
312+
"new-tf-entrypoint": {
313+
"source": tf_entrypoint_path,
314+
"target": "new-tf-entrypoint",
310315
},
311316
"new_start_with_right_hostname": {
312317
"source": start_with_right_hostname_path,

test/dlc_tests/sanity/test_anaconda.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def test_repo_anaconda_not_present(image):
1515

1616
# First check whether the image has conda installed; if not, skip the test, since no conda-installed packages can be present
1717
conda_present = test_utils.run_cmd_on_container(
18-
container_name, ctx, 'find . -name conda -not -path "**/.github/*"'
18+
container_name, ctx, 'find . -name conda -not -path "**/.github/*" -ignore_readdir_race'
1919
).stdout.strip()
2020
if not conda_present:
2121
pytest.skip(f"Image {image} does not have conda installed, skipping test.")

0 commit comments

Comments
 (0)