Skip to content

Commit 68d2d3f

Browse files
committed
Introduce remote caching and optimize docker
Signed-off-by: Dong Hyuk Chang <donghyukc@nvidia.com>
1 parent 7261e73 commit 68d2d3f

File tree

4 files changed

+147
-72
lines changed

4 files changed

+147
-72
lines changed

.github/actions/test-template/action.yml

Lines changed: 8 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,11 @@ inputs:
5353
runs:
5454
using: "composite"
5555
steps:
# Check out the repository into ./NeMo-Automodel.
# checkout@v4 matches the version used by the build-container workflow added in
# this change; v2 runs on the deprecated Node 12 action runtime.
56+
- name: Checkout repository
57+
uses: actions/checkout@v4
58+
with:
59+
path: NeMo-Automodel
60+
5661
- name: Install Azure CLI
5762
if: ${{ inputs.has-azure-credentials == 'true' }}
5863
shell: bash
@@ -69,78 +74,11 @@ runs:
6974
tenant-id: ${{ inputs.azure-tenant-id }}
7075
subscription-id: ${{ inputs.azure-subscription-id }}
7176

72-
- name: Azure Fileshare
73-
if: ${{ inputs.has-azure-credentials == 'true' && inputs.is-unit-test == 'false' }}
74-
shell: bash
75-
id: azure-fileshare
76-
run: |
77-
echo "::group::Mount SMB drive"
78-
sudo apt update
79-
sudo apt install -y cifs-utils
80-
81-
RESOURCE_GROUP_NAME="azure-gpu-vm-runner_group"
82-
STORAGE_ACCOUNT_NAME="nemocistorageaccount2"
83-
FILE_SHARE_NAME="fileshare"
84-
85-
MNT_ROOT="/media"
86-
MNT_PATH="$MNT_ROOT/$STORAGE_ACCOUNT_NAME/$FILE_SHARE_NAME"
87-
88-
echo "MNT_PATH=$MNT_PATH" | tee -a "$GITHUB_OUTPUT"
89-
90-
sudo mkdir -p $MNT_PATH
91-
92-
# Create a folder to store the credentials for this storage account and
93-
# any other that you might set up.
94-
CREDENTIAL_ROOT="/etc/smbcredentials"
95-
sudo mkdir -p "/etc/smbcredentials"
96-
97-
# Get the storage account key for the indicated storage account.
98-
# You must be logged in with az login and your user identity must have
99-
# permissions to list the storage account keys for this command to work.
100-
STORAGE_ACCOUNT_KEY=$(az storage account keys list \
101-
--resource-group $RESOURCE_GROUP_NAME \
102-
--account-name $STORAGE_ACCOUNT_NAME \
103-
--query "[0].value" --output tsv | tr -d '"')
104-
105-
# Create the credential file for this individual storage account
106-
SMB_CREDENTIAL_FILE="$CREDENTIAL_ROOT/$STORAGE_ACCOUNT_NAME.cred"
107-
if [ ! -f $SMB_CREDENTIAL_FILE ]; then
108-
echo "username=$STORAGE_ACCOUNT_NAME" | sudo tee $SMB_CREDENTIAL_FILE > /dev/null
109-
echo "password=$STORAGE_ACCOUNT_KEY" | sudo tee -a $SMB_CREDENTIAL_FILE > /dev/null
110-
else
111-
echo "The credential file $SMB_CREDENTIAL_FILE already exists, and was not modified."
112-
fi
113-
114-
# Change permissions on the credential file so only root can read or modify the password file.
115-
sudo chmod 600 $SMB_CREDENTIAL_FILE
116-
117-
# This command assumes you have logged in with az login
118-
HTTP_ENDPOINT=$(az storage account show --resource-group $RESOURCE_GROUP_NAME --name $STORAGE_ACCOUNT_NAME --query "primaryEndpoints.file" --output tsv | tr -d '"')
119-
SMB_PATH=$(echo $HTTP_ENDPOINT | cut -c7-${#HTTP_ENDPOINT})$FILE_SHARE_NAME
120-
121-
STORAGE_ACCOUNT_KEY=$(az storage account keys list --resource-group $RESOURCE_GROUP_NAME --account-name $STORAGE_ACCOUNT_NAME --query "[0].value" --output tsv | tr -d '"')
122-
123-
sudo mount -t cifs $SMB_PATH $MNT_PATH -o credentials=$SMB_CREDENTIAL_FILE,serverino,nosharesock,actimeo=30,mfsymlinks
124-
125-
ls -al $MNT_PATH/TestData
126-
echo "::endgroup::"
127-
128-
- name: Checkout repository
129-
uses: actions/checkout@v2
130-
with:
131-
path: NeMo-Automodel
132-
133-
- name: Build container
77+
- name: Azure ACR Login
78+
if: ${{ inputs.has-azure-credentials == 'true' }}
13479
shell: bash
135-
env:
136-
GH_TOKEN: ${{ inputs.PAT }}
13780
run: |
138-
echo "::group::Build test container"
139-
docker system prune -af
140-
docker build -f docker/Dockerfile \
141-
--build-arg BASE_IMAGE=pytorch \
142-
--target automodel_final -t automodel .
143-
echo "::endgroup::"
81+
az acr login --name nemoci
14482
14583
- name: Start container
14684
shell: bash
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
# Reusable workflow (triggered only through workflow_call) that builds the
# container image for the repository named by the `repo_name` input.
15+
name: Build container
16+
on:
17+
workflow_call:
18+
inputs:
19+
repo_name:
20+
description: "The name of the repo to build container"
# NOTE(review): `repo_name` is declared required and also given a default of
# ""; a required input never falls back to its default, so one of the two is
# redundant - confirm which was intended.
21+
required: true
22+
default: ""
23+
type: string
24+
# NOTE(review): as captured here, `steps:` follows `jobs:` directly with no
# job id and no `runs-on:` in between. GitHub Actions expects
# jobs.<job_id>.runs-on plus jobs.<job_id>.steps - confirm the job header.
25+
jobs:
26+
steps:
# Check out the calling repository; the later docker build uses `context: .`.
27+
- name: Checkout
28+
uses: actions/checkout@v4
29+
# Install Python 3.12 on the runner.
# The version is quoted so YAML keeps it a string; an unquoted value is parsed
# as a float (e.g. 3.10 would silently become 3.1).
30+
- name: Setup python
31+
uses: actions/setup-python@v5
32+
with:
33+
python-version: "3.12"
34+
# Runs only for refs under refs/heads/pull-request/. Later steps read
# `steps.get-pr-info.outputs.pr-info` as JSON and use its `number` field.
# NOTE(review): the action is pinned to the mutable `@main` ref - consider
# pinning to a tag or commit SHA.
35+
- name: Get PR info
36+
id: get-pr-info
37+
if: startsWith(github.ref, 'refs/heads/pull-request/')
38+
uses: nv-gha-runners/get-pr-info@main
39+
# Install the Azure CLI, with the output wrapped in a collapsible log group.
40+
- name: Install Azure CLI
41+
shell: bash
42+
run: |
43+
echo "::group::Install Azure CLI"
44+
# Download the installer script from aka.ms/InstallAzureCLIDeb and run it as root.
45+
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
46+
echo "::endgroup::"
47+
# Log in to Azure with the client, tenant and subscription ids from secrets.
# NOTE(review): no client secret is passed, so this presumably relies on OIDC
# federated login, which needs `permissions: id-token: write` on the job -
# confirm. As a called workflow, these secrets must also be passed by the caller.
48+
- name: Azure Login
49+
uses: azure/login@v2
50+
with:
51+
client-id: ${{ secrets.AZURE_CLIENT_ID }}
52+
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
53+
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
54+
# Log in to the Azure Container Registry named `nemoci` via the Azure CLI.
55+
- name: Azure ACR Login
56+
shell: bash
57+
run: |
58+
az acr login --name nemoci
59+
# Install the GitHub CLI (`gh`) used by the "Get last merged PR" step.
# apt-get needs root; sudo is used here the same way the Azure CLI install
# step in this job already does.
60+
- name: Install GH CLI
61+
shell: bash
62+
run: |
63+
sudo apt-get update
64+
sudo apt-get install -y gh
65+
# Query the 100 most recently updated merged PRs of NVIDIA-NeMo/<repo_name> and
# emit one registry cache source per PR number as the multi-line step output
# `LAST_PRS` (consumed by the build step's `cache-from`).
# The repository name is passed as the GraphQL variable `$repo`: the query is a
# single-quoted shell string, so a "$REPO_NAME" written inside it is never
# expanded by the shell and would be sent to the API literally.
# NOTE(review): `env.container-registry` is not defined anywhere in this
# workflow file - confirm where it is expected to be set.
66+
- name: Get last merged PR
67+
id: cache_from
68+
env:
69+
GH_TOKEN: ${{ github.token }}
70+
REPO_NAME: ${{ inputs.repo_name }}
71+
run: |
72+
LAST_PRS=$(gh api graphql -f repo="$REPO_NAME" -f query='
73+
query($repo: String!) {
74+
repository(owner: "NVIDIA-NeMo", name: $repo) {
75+
pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) {
76+
nodes {
77+
number
78+
}
79+
}
80+
}
81+
}' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do
82+
echo "type=registry,ref=${{ env.container-registry }}/$REPO_NAME:$number-buildcache,mode=max"
83+
done)
84+
85+
echo "LAST_PRS<<EOF" | tee -a "$GITHUB_OUTPUT"
86+
echo "$LAST_PRS" | tee -a "$GITHUB_OUTPUT"
87+
echo "EOF" | tee -a "$GITHUB_OUTPUT"
88+
# Set up Docker Buildx ahead of the build-push step, which uses registry cache
# sources and BuildKit secrets.
89+
- name: Set up Docker Buildx
90+
uses: docker/setup-buildx-action@v3
91+
# Build docker/Dockerfile.ci and push it. Layer cache is read from the current
# PR, from main, and from recently merged PRs, and written back under the
# current PR's `-buildcache` tag.
# `with:` values are not passed through a shell, so "$REPO_NAME" would reach the
# action unexpanded; the repo name is taken from the `inputs` context instead.
# NOTE(review): image repository names must be lowercase, while the caller in
# this change passes "Automodel" - confirm the intended registry path.
# NOTE(review): outside a pull-request ref the PR number falls back to 0, so the
# image and cache are tagged `:0` / `:0-buildcache` - confirm that is intended.
# NOTE(review): `env.container-registry` is not defined anywhere in this
# workflow file - confirm where it is expected to be set.
92+
- name: Build and push
93+
uses: docker/build-push-action@v5
94+
env:
95+
REPO_NAME: ${{ inputs.repo_name }}
96+
with:
97+
file: ./docker/Dockerfile.ci
98+
push: true
99+
context: .
100+
build-args: |
101+
BASE_IMAGE=pytorch
102+
cache-from: |
103+
type=registry,ref=${{ env.container-registry }}/${{ inputs.repo_name }}:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max
104+
type=registry,ref=${{ env.container-registry }}/${{ inputs.repo_name }}:main-buildcache,mode=max
105+
${{ steps.cache_from.outputs.LAST_PRS }}
106+
cache-to: |
107+
type=registry,ref=${{ env.container-registry }}/${{ inputs.repo_name }}:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max
108+
no-cache: false
109+
tags: |
110+
${{ env.container-registry }}/${{ inputs.repo_name }}:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}
111+
${{ env.container-registry }}/${{ inputs.repo_name }}:${{ github.sha }}
112+
secrets: |
113+
GH_TOKEN=${{ secrets.PAT }}

.github/workflows/cicd-main.yml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,21 @@ jobs:
129129
run: |
130130
echo "Running CI tests"
131131
# Calls the reusable _build_container.yml workflow with repo_name "Automodel";
# the job depends on pre-flight and cicd-wait-in-queue.
# NOTE(review): a job that calls a reusable workflow through `uses:` accepts only
# name/needs/if/with/secrets/strategy/permissions/concurrency. `runs-on` and
# `environment` here are expected to fail workflow validation - confirm, and
# move them onto the job inside the called workflow.
# NOTE(review): the called workflow reads secrets.AZURE_* and secrets.PAT, but no
# `secrets:` mapping (or `secrets: inherit`) is passed here - confirm.
132+
cicd-container-build:
133+
needs: [pre-flight, cicd-wait-in-queue]
134+
runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
135+
environment: nemo-ci
136+
if: |
137+
(
138+
success()
139+
|| needs.pre-flight.outputs.is_ci_workload == 'true'
140+
|| needs.pre-flight.outputs.force_run_all == 'true'
141+
)
142+
&& !cancelled()
143+
uses: ./.github/workflows/_build_container.yml
144+
with:
145+
repo_name: "Automodel"
146+
132147
cicd-unit-tests:
133148
strategy:
134149
fail-fast: false

docker/Dockerfile

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,16 +123,25 @@ RUN if [ "$INSTALL_DEEPEP" = "True" ]; then \
123123

124124
FROM automodel_dep as automodel_final
125125

# Copy only the dependency manifests and two package metadata files before
# `uv sync`, and the full source tree afterwards, so that (presumably) source
# edits do not invalidate the dependency-install layer.
126+
WORKDIR /opt/Automodel
127+
128+
COPY pyproject.toml uv.lock /opt/Automodel/
# With more than one source, COPY requires a directory destination that ends
# with "/".
129+
COPY nemo_automodel/__init__.py nemo_automodel/package_info.py /opt/Automodel/nemo_automodel/
130+
126131
# Install Automodel
127132
ARG BASE_IMAGE=cuda
128133
ARG AUTOMODEL_INSTALL=all
129-
COPY . /opt/Automodel
130-
RUN cd /opt/Automodel && \
# `uv cache prune` is chained onto `uv sync` with `&& \`; left on its own line
# Docker would parse it as an unknown instruction, and it also has to run inside
# this RUN to see the mounted uv cache.
# NOTE(review): the pytorch branch reads /opt/Automodel/docker/common/uv-pytorch.*,
# but `COPY . /opt/Automodel` now runs after this RUN - confirm those files
# already exist in the automodel_dep stage, otherwise sed/mv will fail.
134+
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
135+
--mount=type=cache,target=/var/lib/apt,sharing=locked \
136+
--mount=type=cache,target=/root/.cache/uv \
131137
if [ "$BASE_IMAGE" = "pytorch" ]; then \
132138
sed -i '/\[tool\.uv\]/r /opt/Automodel/docker/common/uv-pytorch.toml' pyproject.toml && \
133139
mv /opt/Automodel/docker/common/uv-pytorch.lock /opt/Automodel/uv.lock; \
134140
fi && \
135141
uv sync --locked --extra $AUTOMODEL_INSTALL --all-groups && \
142+
uv cache prune
143+
144+
COPY . /opt/Automodel
136145

137146
COPY <<EOF /opt/venv/env.sh
138147
export UV_PROJECT_ENVIRONMENT=/opt/venv

0 commit comments

Comments
 (0)