Skip to content

Commit 4492999

Browse files
Copilot and mawad-amd committed
Sync with main, enforce no cache modifiers on remote stores, update tests
Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
1 parent 56ca569 commit 4492999

File tree

148 files changed

+14090
-2482
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

148 files changed

+14090
-2482
lines changed

.devcontainer/devcontainer.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
// Creates a stable agent socket at ~/.ssh/ssh-agent.sock and optionally loads ~/.ssh/id_rsa.
1010
"initializeCommand": "bash -lc \"bash '${localWorkspaceFolder}/.devcontainer/ensure-ssh-agent.sh'\"",
1111
"runArgs": [
12-
"--name=${localEnv:USER}-iris-dev",
12+
"--name=${localEnv:USER}-${localWorkspaceFolderBasename}-dev",
1313
"--network=host",
1414
"--device=/dev/kfd",
1515
"--device=/dev/dri",

.devcontainer/ensure-ssh-agent.sh

File mode changed: 100644 → 100755
Lines changed: 19 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -14,13 +14,29 @@ SOCK="${HOME}/.ssh/ssh-agent.sock"
1414

1515
mkdir -p "${HOME}/.ssh"
1616

17+
# Check if socket exists AND has keys loaded
1718
if [[ -S "${SOCK}" ]]; then
18-
exit 0
19+
if SSH_AUTH_SOCK="${SOCK}" ssh-add -l >/dev/null 2>&1; then
20+
# Agent is running and has keys loaded
21+
exit 0
22+
fi
23+
24+
# Check if agent is alive but just has no keys
25+
if SSH_AUTH_SOCK="${SOCK}" ssh-add -l 2>&1 | grep -q "no identities"; then
26+
# Agent is alive, just needs keys loaded - continue to key loading below
27+
:
28+
else
29+
# Agent is dead or socket is stale, remove it
30+
rm -f "${SOCK}" 2>/dev/null || true
31+
fi
1932
fi
2033

21-
rm -f "${SOCK}"
22-
ssh-agent -a "${SOCK}" -t 8h >/dev/null
34+
# Start agent if socket doesn't exist
35+
if [[ ! -S "${SOCK}" ]]; then
36+
ssh-agent -a "${SOCK}" -t 8h >/dev/null || true
37+
fi
2338

39+
# Load SSH key if it exists
2440
if [[ -f "${HOME}/.ssh/id_rsa" ]]; then
2541
SSH_AUTH_SOCK="${SOCK}" ssh-add "${HOME}/.ssh/id_rsa" >/dev/null 2>&1 || true
2642
fi

.github/scripts/acquire_gpus.sh

Lines changed: 40 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash
# SPDX-License-Identifier: MIT
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Acquire GPUs for CI workflows - to be called as a workflow step
# Usage: acquire_gpus.sh <num_gpus>
#
# Sources gpu_allocator.sh from this script's own directory, calls
# acquire_gpus with the requested count, and appends GPU_DEVICES and
# ALLOCATED_GPU_BITMAP to the file named by $GITHUB_ENV for use in
# subsequent steps.
# NOTE(review): acquire_gpus is presumably defined by gpu_allocator.sh and is
# expected to set GPU_DEVICES and ALLOCATED_GPU_BITMAP - confirm against that file.
#
# Exit status:
#   1  <num_gpus> is missing or is not an unsigned integer
#   Other failures abort the script because `set -e` is in effect.

set -e

NUM_GPUS=$1

# Diagnostics go to stderr so they cannot be mistaken for regular output.
if [ -z "$NUM_GPUS" ]; then
    echo "[ERROR] Missing required argument" >&2
    echo "Usage: $0 <num_gpus>" >&2
    exit 1
fi

# Fail early with a clear message instead of handing a malformed count
# to the allocator.
if ! [[ "$NUM_GPUS" =~ ^[0-9]+$ ]]; then
    echo "[ERROR] <num_gpus> must be an integer, got: $NUM_GPUS" >&2
    echo "Usage: $0 <num_gpus>" >&2
    exit 1
fi

# Resolve the directory holding this script so the allocator is found
# regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "[ACQUIRE-GPUS] Acquiring $NUM_GPUS GPU(s)"
source "$SCRIPT_DIR/gpu_allocator.sh"
acquire_gpus "$NUM_GPUS"

echo "[ACQUIRE-GPUS] Allocated GPUs: $GPU_DEVICES"
echo "[ACQUIRE-GPUS] GPU allocation details:"
echo "  GPU_DEVICES=$GPU_DEVICES"
echo "  ALLOCATED_GPU_BITMAP=$ALLOCATED_GPU_BITMAP"

# Export to GITHUB_ENV so subsequent steps can use these variables
if [ -n "$GITHUB_ENV" ]; then
    {
        echo "GPU_DEVICES=$GPU_DEVICES"
        echo "ALLOCATED_GPU_BITMAP=$ALLOCATED_GPU_BITMAP"
    } >> "$GITHUB_ENV"
    echo "[ACQUIRE-GPUS] Exported variables to GITHUB_ENV"
else
    # Not fatal: the allocation succeeded, but later steps will not see it.
    echo "[ACQUIRE-GPUS] WARNING: GITHUB_ENV not set, variables not exported" >&2
fi

.github/scripts/container_build.sh

Lines changed: 24 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -32,34 +32,34 @@ echo "✅ /dev/shm size OK (${shm_size_gb}GB)"
3232
if [ "$CONTAINER_RUNTIME" = "apptainer" ]; then
3333
echo "[INFO] Building with Apptainer..."
3434

35-
# Create persistent Apptainer directory
36-
mkdir -p ~/apptainer
37-
38-
# Define paths
39-
IMAGE_PATH=~/apptainer/iris-dev.sif
40-
DEF_FILE=apptainer/iris.def
41-
CHECKSUM_FILE=~/apptainer/iris-dev.sif.checksum
42-
4335
# Verify def file exists
36+
DEF_FILE=apptainer/iris.def
4437
if [ ! -f "$DEF_FILE" ]; then
4538
echo "[ERROR] Definition file $DEF_FILE not found"
4639
exit 1
4740
fi
4841

49-
# Calculate checksum of the def file
50-
NEW_CHECKSUM=$(sha256sum "$DEF_FILE" | awk '{print $1}')
42+
# Calculate checksum of the def file to use as subdirectory name
43+
DEF_CHECKSUM=$(sha256sum "$DEF_FILE" | awk '{print $1}')
44+
45+
# Create persistent Apptainer directory with checksum subdirectory
46+
mkdir -p "${HOME}/iris-apptainer-images/${DEF_CHECKSUM}"
47+
48+
# Define paths
49+
IMAGE_PATH="${HOME}/iris-apptainer-images/${DEF_CHECKSUM}/iris-dev.sif"
50+
CHECKSUM_FILE="${HOME}/iris-apptainer-images/${DEF_CHECKSUM}/iris-dev.sif.checksum"
5151

5252
# Check if image exists and has a valid checksum
5353
REBUILD_NEEDED=true
5454
if [ -f "$IMAGE_PATH" ] && [ -f "$CHECKSUM_FILE" ]; then
5555
OLD_CHECKSUM=$(head -n1 "$CHECKSUM_FILE" 2>/dev/null)
5656
# Validate checksum format (64 hex characters for SHA256)
57-
if [[ "$OLD_CHECKSUM" =~ ^[a-f0-9]{64}$ ]] && [ "$OLD_CHECKSUM" = "$NEW_CHECKSUM" ]; then
58-
echo "[INFO] Def file unchanged (checksum: $NEW_CHECKSUM)"
57+
if [[ "$OLD_CHECKSUM" =~ ^[a-f0-9]{64}$ ]] && [ "$OLD_CHECKSUM" = "$DEF_CHECKSUM" ]; then
58+
echo "[INFO] Def file unchanged (checksum: $DEF_CHECKSUM)"
5959
echo "[INFO] Skipping rebuild, using existing image at $IMAGE_PATH"
6060
REBUILD_NEEDED=false
6161
else
62-
echo "[INFO] Def file changed (old: ${OLD_CHECKSUM:-<invalid>}, new: $NEW_CHECKSUM)"
62+
echo "[INFO] Def file changed (old: ${OLD_CHECKSUM:-<invalid>}, new: $DEF_CHECKSUM)"
6363
echo "[INFO] Rebuilding Apptainer image..."
6464
fi
6565
else
@@ -70,9 +70,9 @@ if [ "$CONTAINER_RUNTIME" = "apptainer" ]; then
7070
if [ "$REBUILD_NEEDED" = true ]; then
7171
if apptainer build --force "$IMAGE_PATH" "$DEF_FILE"; then
7272
# Store the checksum only if build succeeded
73-
echo "$NEW_CHECKSUM" > "$CHECKSUM_FILE"
73+
echo "$DEF_CHECKSUM" > "$CHECKSUM_FILE"
7474
echo "[INFO] Built image: $IMAGE_PATH"
75-
echo "[INFO] Checksum saved: $NEW_CHECKSUM"
75+
echo "[INFO] Checksum saved: $DEF_CHECKSUM"
7676
else
7777
echo "[ERROR] Apptainer build failed"
7878
exit 1
@@ -83,14 +83,19 @@ elif [ "$CONTAINER_RUNTIME" = "docker" ]; then
8383
echo "[INFO] Checking Docker images..."
8484
# Use GitHub variable if set, otherwise default to iris-dev
8585
IMAGE_NAME=${DOCKER_IMAGE_NAME:-"iris-dev"}
86-
86+
8787
# Check if the image exists
8888
if docker image inspect "$IMAGE_NAME" &> /dev/null; then
8989
echo "[INFO] Using existing Docker image: $IMAGE_NAME"
9090
else
91-
echo "[WARNING] Docker image $IMAGE_NAME not found"
92-
echo "[INFO] Please build it using: ./build_triton_image.sh"
93-
echo "[INFO] Or pull it if available from registry"
91+
echo "[INFO] Docker image $IMAGE_NAME not found, building..."
92+
DOCKER_DIR="$(dirname "$(realpath "$0")")/../../docker"
93+
if docker build -t "$IMAGE_NAME" "$DOCKER_DIR"; then
94+
echo "[INFO] Built Docker image: $IMAGE_NAME"
95+
else
96+
echo "[ERROR] Docker build failed"
97+
exit 1
98+
fi
9499
fi
95100
fi
96101

.github/scripts/container_exec.sh

Lines changed: 15 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -51,13 +51,21 @@ if [ "$CONTAINER_RUNTIME" = "apptainer" ]; then
5151
# Find image
5252
if [ -n "$CUSTOM_IMAGE" ]; then
5353
IMAGE="$CUSTOM_IMAGE"
54-
elif [ -f ~/apptainer/iris-dev.sif ]; then
55-
IMAGE=~/apptainer/iris-dev.sif
56-
elif [ -f apptainer/images/iris.sif ]; then
57-
IMAGE="apptainer/images/iris.sif"
5854
else
59-
echo "[ERROR] Apptainer image not found" >&2
60-
exit 1
55+
# Calculate checksum of def file to find the correct subdirectory
56+
DEF_FILE=apptainer/iris.def
57+
if [ ! -f "$DEF_FILE" ]; then
58+
echo "[ERROR] Definition file $DEF_FILE not found" >&2
59+
exit 1
60+
fi
61+
DEF_CHECKSUM=$(sha256sum "$DEF_FILE" | awk '{print $1}')
62+
63+
if [ -f "${HOME}/iris-apptainer-images/${DEF_CHECKSUM}/iris-dev.sif" ]; then
64+
IMAGE="${HOME}/iris-apptainer-images/${DEF_CHECKSUM}/iris-dev.sif"
65+
else
66+
echo "[ERROR] Apptainer image not found" >&2
67+
exit 1
68+
fi
6169
fi
6270

6371
# Create temporary overlay in workspace with unique name based on PID and timestamp
@@ -99,24 +107,11 @@ elif [ "$CONTAINER_RUNTIME" = "docker" ]; then
99107
fi
100108

101109
# Build run command with proper GPU access
102-
# Get video and render group IDs from host
103-
VIDEO_GID=$(getent group video | cut -d: -f3)
104-
RENDER_GID=$(getent group render | cut -d: -f3)
105-
106110
RUN_CMD="docker run --rm --network=host --device=/dev/kfd --device=/dev/dri"
107111
RUN_CMD="$RUN_CMD --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
108112
RUN_CMD="$RUN_CMD -v ${PWD}:/iris_workspace -w /iris_workspace"
109113
RUN_CMD="$RUN_CMD --shm-size=16G --ulimit memlock=-1 --ulimit stack=67108864"
110-
RUN_CMD="$RUN_CMD --user $(id -u):$(id -g)"
111-
112-
# Add video and render groups for GPU access
113-
if [ -n "$VIDEO_GID" ]; then
114-
RUN_CMD="$RUN_CMD --group-add $VIDEO_GID"
115-
fi
116-
if [ -n "$RENDER_GID" ]; then
117-
RUN_CMD="$RUN_CMD --group-add $RENDER_GID"
118-
fi
119-
114+
120115
RUN_CMD="$RUN_CMD -e HOME=/iris_workspace"
121116
RUN_CMD="$RUN_CMD --entrypoint bash"
122117

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
{
2+
"31_message_passing": {
3+
"required_ranks": 2
4+
}
5+
}

0 commit comments

Comments
 (0)