#!/bin/bash
#
# Start the NVIDIA DCGM exporter as a Docker container on GPU nodes.
#
# Behavior:
#   - If a container named $CONTAINER_NAME is already running, do nothing.
#   - Otherwise remove any stale container with that name, and (only when
#     nvidia-smi succeeds) pull the exporter image with retries and run it.
#
# Exit status:
#   0  container already running, container started, or no GPU on this node
#   1  image pull failed after MAX_RETRIES attempts, or `docker run` failed

# Fail fast on typos in variable names. `-e`/`pipefail` are deliberately not
# enabled: several steps below (stale-container removal, metadata lookup) are
# best-effort and are expected to fail on some nodes.
set -u

# Define the container name and image version
readonly CONTAINER_NAME="dcgm-exporter"
readonly DCGM_EXPORTER_VERSION=3.3.8-3.6.0-ubuntu22.04
readonly IMAGE="nvcr.io/nvidia/k8s/dcgm-exporter:${DCGM_EXPORTER_VERSION}"

# Maximum number of retries
readonly MAX_RETRIES=5
readonly RETRY_DELAY=5 # Initial delay in seconds

#######################################
# Check whether a container named exactly $CONTAINER_NAME is running.
# Globals:   CONTAINER_NAME (read)
# Returns:   0 if such a container is running, non-zero otherwise
#######################################
container_is_running() {
  # The name filter is only used to narrow the listing; the exact comparison
  # is done by `grep -Fx` on the bare names so that e.g. "dcgm-exporter-old"
  # is not mistaken for "dcgm-exporter".
  # NOTE(review): `docker ps`/`docker rm` run without sudo while pull/run use
  # sudo, as in the original script — confirm the invoking user can reach the
  # Docker daemon directly.
  docker ps --filter "name=${CONTAINER_NAME}" --filter "status=running" \
    --format '{{.Names}}' \
    | grep -Fxq -- "${CONTAINER_NAME}"
}

#######################################
# Print the EC2 instance type obtained from the instance metadata service.
# Best-effort: prints nothing if the metadata endpoint is unreachable.
# Outputs:   instance type on stdout (possibly empty)
#######################################
get_instance_type() {
  local imds="http://169.254.169.254/latest"
  local token
  # A session token is requested first and passed to the metadata request.
  token=$(curl -s --max-time 5 -X PUT "${imds}/api/token" \
    -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") || token=""
  curl -s --max-time 5 -H "X-aws-ec2-metadata-token: ${token}" \
    "${imds}/meta-data/instance-type"
}

#######################################
# Pull $IMAGE, retrying with exponential backoff.
# Globals:   IMAGE, MAX_RETRIES, RETRY_DELAY (all read)
# Returns:   0 once a pull succeeds, 1 after MAX_RETRIES failed attempts
#######################################
pull_image_with_retry() {
  local attempt=1
  local delay=${RETRY_DELAY} # local copy so the backoff never alters the global

  while (( attempt <= MAX_RETRIES )); do
    echo "Attempting to pull image (${attempt}/${MAX_RETRIES})..."
    if sudo docker pull "${IMAGE}"; then
      echo "Successfully pulled image."
      return 0
    fi
    # Only wait when another attempt will actually follow.
    if (( attempt < MAX_RETRIES )); then
      echo "Pull failed. Retrying in ${delay} seconds..."
      sleep "${delay}"
      delay=$(( delay * 2 )) # Exponential backoff
    fi
    attempt=$(( attempt + 1 ))
  done

  echo "Failed to pull Docker image after ${MAX_RETRIES} attempts. Exiting..." >&2
  return 1
}

#######################################
# Entry point: ensure the DCGM exporter container is running on GPU nodes.
# Returns:   0 or 1 as described in the file header
#######################################
main() {
  # Check if the container exists and is running
  if container_is_running; then
    echo "Container ${CONTAINER_NAME} is already running."
    return 0
  fi

  echo "Container ${CONTAINER_NAME} is not running or does not exist..."
  echo "Checking if ${CONTAINER_NAME} container exists but is not running. If yes, removing it..."
  # Best-effort: a failure here (e.g. nothing to remove) must not stop the script.
  docker rm -f "${CONTAINER_NAME}" && echo "Container ${CONTAINER_NAME} has been removed."
  echo "Proceeding with script..."

  # Check for GPU, then proceed with script
  if ! nvidia-smi > /dev/null 2>&1; then
    echo "NVIDIA GPU not found. DCGM Exporter was not installed. If this is a controller node, you can safely ignore this warning. Exiting gracefully..."
    return 0
  fi
  echo "NVIDIA GPU found. Proceeding with script..."

  # Get the instance-type from EC2 instance metadata (used for logging only)
  local instance_type
  instance_type=$(get_instance_type)
  echo "Instance Type is recognized as ${instance_type:-unknown}, setting DCGM_EXPORTER_VERSION to ${DCGM_EXPORTER_VERSION}"

  # Retry logic for pulling the image
  pull_image_with_retry || return 1

  # Run the DCGM Exporter Docker container
  if sudo docker run -d --restart always \
    --name "${CONTAINER_NAME}" \
    --gpus all \
    --net host \
    --cap-add SYS_ADMIN \
    "${IMAGE}" \
    -f /etc/dcgm-exporter/dcp-metrics-included.csv; then
    echo "Running DCGM exporter in a Docker container on port 9400..."
  else
    echo "Failed to run DCGM Exporter Docker container" >&2
    return 1
  fi
}

main "$@"