Skip to content

Commit 77d91cb

Browse files
post startup script: Add timeout to get domain exec role creds call to prevent hanging (#634)
Co-authored-by: apcho <[email protected]>
1 parent f636c45 commit 77d91cb

File tree

1 file changed

+12
-7
lines changed

1 file changed

+12
-7
lines changed

template/v2/dirs/etc/sagemaker-ui/sagemaker_ui_post_startup.sh

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -115,14 +115,14 @@ set +x
115115

116116
# Note: The $? check immediately follows the sagemaker-studio command to ensure we're checking its exit status.
117117
# Adding commands between these lines could lead to incorrect error handling.
118-
response=$( sagemaker-studio credentials get-domain-execution-role-credential-in-space --domain-id "$dataZoneDomainId" --profile default)
118+
response=$(timeout 30 sagemaker-studio credentials get-domain-execution-role-credential-in-space --domain-id "$dataZoneDomainId" --profile default)
119119
responseStatus=$?
120120

121121
set -x
122122

123123
if [ $responseStatus -ne 0 ]; then
124124
echo "Failed to fetch domain execution role credentials. Will skip adding new credentials profile: DomainExecutionRoleCreds."
125-
write_status_to_file "error" "Network issue detected. Your domain may be using a public subnet, which affects IDE functionality. Please contact your administrator."
125+
write_status_to_file "error" "Network issue detected. Your domain may be using a public subnet, which affects IDE functionality. Please contact your admin."
126126
else
127127
aws configure set credential_process "sagemaker-studio credentials get-domain-execution-role-credential-in-space --domain-id $dataZoneDomainId --profile default" --profile DomainExecutionRoleCreds
128128
echo "Successfully configured DomainExecutionRoleCreds profile"
@@ -177,18 +177,23 @@ else
177177
echo readonly LOGNAME >> ~/.bashrc
178178
fi
179179

180-
set -e
181-
182-
# write unexpected error to file if any of the remaining scripts fail.
183-
trap 'write_status_to_file "error" "An unexpected error occurred. Please stop and restart your space to retry."' ERR
184-
185180
# Generate sagemaker pysdk intelligent default config
186181
nohup python /etc/sagemaker/sm_pysdk_default_config.py &
187182
# Only run the following commands if SAGEMAKER_APP_TYPE_LOWERCASE is jupyterlab
188183
if [ "${SAGEMAKER_APP_TYPE_LOWERCASE}" = "jupyterlab" ]; then
184+
# do not fail immediately for non-zero exit code returned
185+
# by start-workflows-container. An expected non-zero exit
186+
# code will be returned if there is not a minimum of 2
187+
# CPU cores available.
189188
# Start workflows local runner
190189
bash /etc/sagemaker-ui/workflows/start-workflows-container.sh
191190

191+
# ensure functions inherit traps and fail immediately
192+
set -eE
193+
194+
# write unexpected error to file if any of the remaining scripts fail.
195+
trap 'write_status_to_file "error" "An unexpected error occurred. Please stop and restart your space to retry."' ERR
196+
192197
# Install conda and pip dependencies if the lib mgmt config exists
193198
bash /etc/sagemaker-ui/libmgmt/install-lib.sh $HOME/src
194199

0 commit comments

Comments
 (0)