From 4fa80065549d14e77d31afd7a76c1fa9a465ac3a Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Wed, 28 Feb 2024 21:20:53 -0800 Subject: [PATCH 1/9] Add base container image Jax TCPX --- .../jax-tcpx-base-image/Dockerfile | 42 +++++++++++++++++++ .../jax-tcpx-base-image/README.md | 3 ++ .../jax-tcpx-base-image/entrypoint.sh | 9 ++++ 3 files changed, 54 insertions(+) create mode 100644 sample_workloads/jax-tcpx-base-image/Dockerfile create mode 100644 sample_workloads/jax-tcpx-base-image/README.md create mode 100644 sample_workloads/jax-tcpx-base-image/entrypoint.sh diff --git a/sample_workloads/jax-tcpx-base-image/Dockerfile b/sample_workloads/jax-tcpx-base-image/Dockerfile new file mode 100644 index 00000000..0c4688ff --- /dev/null +++ b/sample_workloads/jax-tcpx-base-image/Dockerfile @@ -0,0 +1,42 @@ +FROM python:3.10-slim + +ENV VERSION=0.4.25 + +RUN pip install --no-cache-dir --upgrade pip + +RUN pip install --no-cache-dir --upgrade "jax[cuda12_pip]==${VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + +WORKDIR /workspace/ + +ADD entrypoint.sh . + +ENTRYPOINT ["bash", "entrypoint.sh"] + +# Environment variables required by TCPX +ENV NCCL_NVLS_ENABLE=0 +ENV NCCL_CROSS_NIC=0 +ENV NCCL_ALGO=Ring +ENV NCCL_PROTO=Simple +ENV NCCL_DEBUG=INFO +ENV NCCL_NET_GDR_LEVEL=PIX +ENV NCCL_P2P_PXN_LEVEL=0 +ENV NCCL_DEBUG_SUBSYS=INIT,GRAPH,ENV,TUNING,NET,VERSION +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/tcpx/lib64" +ENV NCCL_GPUDIRECTTCPX_FORCE_ACK=0 +ENV NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000 +ENV NCCL_DYNAMIC_CHUNK_SIZE=524288 +ENV NCCL_P2P_NET_CHUNKSIZE=524288 +ENV NCCL_P2P_PCI_CHUNKSIZE=524288 +ENV NCCL_P2P_NVL_CHUNKSIZE=1048576 +ENV NCCL_NSOCKS_PERTHREAD=4 +ENV NCCL_SOCKET_NTHREADS=1 +ENV NCCL_MAX_NCHANNELS=12 +ENV NCCL_MIN_NCHANNELS=12 +ENV NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=1000000 +ENV NCCL_SOCKET_IFNAME=eth0 +ENV NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177" +ENV NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,124-139;eth2:22-35,124-139;eth3:74-87,178-191;eth4:74-87,178-191" +ENV NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 +ENV NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0 +# Might require adjusting based on where the TCPX socket is mounted +ENV NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX=/run/tcpx diff --git a/sample_workloads/jax-tcpx-base-image/README.md b/sample_workloads/jax-tcpx-base-image/README.md new file mode 100644 index 00000000..9e01c406 --- /dev/null +++ b/sample_workloads/jax-tcpx-base-image/README.md @@ -0,0 +1,3 @@ +# Base image with JAX and TCPX config + +A base image to use with Jax diff --git a/sample_workloads/jax-tcpx-base-image/entrypoint.sh b/sample_workloads/jax-tcpx-base-image/entrypoint.sh new file mode 100644 index 00000000..f80c8f5b --- /dev/null +++ b/sample_workloads/jax-tcpx-base-image/entrypoint.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +function on_script_completion { + # semaphore to cleanly exit hardware utilization monitor + touch /run/tcpx/workload_terminated +} +trap on_script_completion EXIT + +exec "$@" From 311d263ef529cde8809483e5846f817eaa7082b0 Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Wed, 28 Feb 2024 21:35:00 -0800 Subject: [PATCH 2/9] change to ARG --- sample_workloads/jax-tcpx-base-image/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sample_workloads/jax-tcpx-base-image/Dockerfile b/sample_workloads/jax-tcpx-base-image/Dockerfile index 0c4688ff..5285009e 100644 --- a/sample_workloads/jax-tcpx-base-image/Dockerfile +++ b/sample_workloads/jax-tcpx-base-image/Dockerfile @@ -1,6 +1,6 @@ FROM python:3.10-slim -ENV VERSION=0.4.25 +ARG VERSION=0.4.25 RUN pip install --no-cache-dir --upgrade pip From f2da51dfa09453ce3890bcf1095a0e6a3e4e4c67 Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Thu, 29 Feb 2024 10:18:10 -0800 Subject: [PATCH 3/9] add cloudbuild example --- sample_workloads/jax-tcpx-base-image/README.md | 13 ++++++++++++- .../jax-tcpx-base-image/cloudbuild.yaml | 6 ++++++ 2 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 sample_workloads/jax-tcpx-base-image/cloudbuild.yaml diff --git a/sample_workloads/jax-tcpx-base-image/README.md b/sample_workloads/jax-tcpx-base-image/README.md index 9e01c406..17cb8aad 100644 --- a/sample_workloads/jax-tcpx-base-image/README.md +++ b/sample_workloads/jax-tcpx-base-image/README.md @@ -1,3 +1,14 @@ # Base image with JAX and TCPX config -A base image to use with Jax +A base image to use with Jax and optimized TCPX config + +Image location: +``` +us-docker.pkg.dev/$PROJECT_ID/jax-gpu/base-tcpx:0.4.21 +``` + +## Pushing new image +``` +gcloud builds submit --config=cloudbuild.yaml \ + --substitutions=_VERSION=0.4.21 +``` \ No newline at end of file diff --git a/sample_workloads/jax-tcpx-base-image/cloudbuild.yaml b/sample_workloads/jax-tcpx-base-image/cloudbuild.yaml new file mode 100644 index 00000000..edcd6959 --- /dev/null +++ b/sample_workloads/jax-tcpx-base-image/cloudbuild.yaml @@ -0,0 +1,6 @@ +# Build and push image to Artifact Registry +steps: +- name: 'gcr.io/cloud-builders/docker' + args: [ 'build', '--build-arg', 'VERSION=${_VERSION}', '-t', 'us-docker.pkg.dev/$PROJECT_ID/jax-gpu/base-tcpx:${_VERSION}', '.' ] +images: +- 'us-docker.pkg.dev/$PROJECT_ID/jax-gpu/base-tcpx:${_VERSION}' \ No newline at end of file From 8c6272d989bb1a7f156738055307aaaf83431f0a Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Thu, 29 Feb 2024 10:30:34 -0800 Subject: [PATCH 4/9] update README --- sample_workloads/jax-tcpx-base-image/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sample_workloads/jax-tcpx-base-image/README.md b/sample_workloads/jax-tcpx-base-image/README.md index 17cb8aad..70bdef20 100644 --- a/sample_workloads/jax-tcpx-base-image/README.md +++ b/sample_workloads/jax-tcpx-base-image/README.md @@ -10,5 +10,5 @@ us-docker.pkg.dev/$PROJECT_ID/jax-gpu/base-tcpx:0.4.21 ## Pushing new image ``` gcloud builds submit --config=cloudbuild.yaml \ - --substitutions=_VERSION=0.4.21 + --substitutions=_VERSION=0.4.21 --project gce-ai-infra ``` \ No newline at end of file From d91ada6b9cc028d87a8b518d181245873fddd11a Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Fri, 1 Mar 2024 17:56:04 -0800 Subject: [PATCH 5/9] switch to ubuntu as base image --- sample_workloads/jax-tcpx-base-image/Dockerfile | 9 +++++++-- sample_workloads/jax-tcpx-base-image/entrypoint.sh | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/sample_workloads/jax-tcpx-base-image/Dockerfile b/sample_workloads/jax-tcpx-base-image/Dockerfile index 5285009e..707f5050 100644 --- a/sample_workloads/jax-tcpx-base-image/Dockerfile +++ b/sample_workloads/jax-tcpx-base-image/Dockerfile @@ -1,8 +1,13 @@ -FROM python:3.10-slim +FROM ubuntu:22.04 ARG VERSION=0.4.25 -RUN pip install --no-cache-dir --upgrade pip +# Install python and pip +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 \ + python3-pip && \ + rm -rf /var/lib/apt/lists/* + RUN pip install --no-cache-dir --upgrade "jax[cuda12_pip]==${VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html diff --git a/sample_workloads/jax-tcpx-base-image/entrypoint.sh b/sample_workloads/jax-tcpx-base-image/entrypoint.sh index f80c8f5b..3b4686d4 100644 --- a/sample_workloads/jax-tcpx-base-image/entrypoint.sh +++ b/sample_workloads/jax-tcpx-base-image/entrypoint.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +set -x + function on_script_completion { # semaphore to cleanly exit hardware utilization monitor touch /run/tcpx/workload_terminated From 436275d0ae18c6ac74887de19e66fd282d76cd0a Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Fri, 1 Mar 2024 20:54:56 -0800 Subject: [PATCH 6/9] Revert "switch to ubuntu as base image" This reverts commit d91ada6b9cc028d87a8b518d181245873fddd11a. --- sample_workloads/jax-tcpx-base-image/Dockerfile | 9 ++------- sample_workloads/jax-tcpx-base-image/entrypoint.sh | 2 -- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/sample_workloads/jax-tcpx-base-image/Dockerfile b/sample_workloads/jax-tcpx-base-image/Dockerfile index 707f5050..5285009e 100644 --- a/sample_workloads/jax-tcpx-base-image/Dockerfile +++ b/sample_workloads/jax-tcpx-base-image/Dockerfile @@ -1,13 +1,8 @@ -FROM ubuntu:22.04 +FROM python:3.10-slim ARG VERSION=0.4.25 -# Install python and pip -RUN apt-get update && apt-get install -y --no-install-recommends \ - python3 \ - python3-pip && \ - rm -rf /var/lib/apt/lists/* - +RUN pip install --no-cache-dir --upgrade pip RUN pip install --no-cache-dir --upgrade "jax[cuda12_pip]==${VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html diff --git a/sample_workloads/jax-tcpx-base-image/entrypoint.sh b/sample_workloads/jax-tcpx-base-image/entrypoint.sh index 3b4686d4..f80c8f5b 100644 --- a/sample_workloads/jax-tcpx-base-image/entrypoint.sh +++ b/sample_workloads/jax-tcpx-base-image/entrypoint.sh @@ -1,7 +1,5 @@ #!/usr/bin/env bash -set -x - function on_script_completion { # semaphore to cleanly exit hardware utilization monitor touch /run/tcpx/workload_terminated From 762cd8534ca4285d5e5071f3fc82031aa71a94c0 Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Fri, 1 Mar 2024 17:56:04 -0800 Subject: [PATCH 7/9] switch to ubuntu as base image --- sample_workloads/jax-tcpx-base-image/Dockerfile | 9 +++++++-- sample_workloads/jax-tcpx-base-image/entrypoint.sh | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/sample_workloads/jax-tcpx-base-image/Dockerfile b/sample_workloads/jax-tcpx-base-image/Dockerfile index 5285009e..707f5050 100644 --- a/sample_workloads/jax-tcpx-base-image/Dockerfile +++ b/sample_workloads/jax-tcpx-base-image/Dockerfile @@ -1,8 +1,13 @@ -FROM python:3.10-slim +FROM ubuntu:22.04 ARG VERSION=0.4.25 -RUN pip install --no-cache-dir --upgrade pip +# Install python and pip +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 \ + python3-pip && \ + rm -rf /var/lib/apt/lists/* + RUN pip install --no-cache-dir --upgrade "jax[cuda12_pip]==${VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html diff --git a/sample_workloads/jax-tcpx-base-image/entrypoint.sh b/sample_workloads/jax-tcpx-base-image/entrypoint.sh index f80c8f5b..3b4686d4 100644 --- a/sample_workloads/jax-tcpx-base-image/entrypoint.sh +++ b/sample_workloads/jax-tcpx-base-image/entrypoint.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +set -x + function on_script_completion { # semaphore to cleanly exit hardware utilization monitor touch /run/tcpx/workload_terminated From c6bba7398631397e888c098041a02a8f10d3fc16 Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Fri, 1 Mar 2024 21:00:15 -0800 Subject: [PATCH 8/9] change version --- sample_workloads/jax-tcpx-base-image/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sample_workloads/jax-tcpx-base-image/Dockerfile b/sample_workloads/jax-tcpx-base-image/Dockerfile index 707f5050..cf76355a 100644 --- a/sample_workloads/jax-tcpx-base-image/Dockerfile +++ b/sample_workloads/jax-tcpx-base-image/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:22.04 -ARG VERSION=0.4.25 +ARG VERSION=0.4.21 # Install python and pip RUN apt-get update && apt-get install -y --no-install-recommends \ From 3bf0aee961a4176d53768658e439d8bb9edf37f7 Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Fri, 1 Mar 2024 22:31:37 -0800 Subject: [PATCH 9/9] use absolute path in case child image uses workdir --- sample_workloads/jax-tcpx-base-image/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sample_workloads/jax-tcpx-base-image/Dockerfile b/sample_workloads/jax-tcpx-base-image/Dockerfile index cf76355a..7c56a406 100644 --- a/sample_workloads/jax-tcpx-base-image/Dockerfile +++ b/sample_workloads/jax-tcpx-base-image/Dockerfile @@ -13,9 +13,9 @@ RUN pip install --no-cache-dir --upgrade "jax[cuda12_pip]==${VERSION}" -f https: WORKDIR /workspace/ -ADD entrypoint.sh . +ADD entrypoint.sh /entrypoint.sh -ENTRYPOINT ["bash", "entrypoint.sh"] +ENTRYPOINT ["bash", "/entrypoint.sh"] # Environment variables required by TCPX ENV NCCL_NVLS_ENABLE=0