Update Slurm workflows to use GitHub-hosted runners with SSH (#953) #3

.github/workflows/pr-review-and-slurm-test.yml

name: PR Review and Slurm Test
on:
pull_request:
branches:
- main
types: [opened, synchronize, reopened]
env:
AWS_REGION: us-east-1
SLURM_HOST: p5en.smml.aiml.aws.dev
SLURM_USER: ghactions
RESULTS_PATH: /fsx/agents/pr-reviews/awsome-distributed-training
AWS_ROLE_ARN: arn:aws:iam::159553542841:role/awslabs-AOSH-GitHubActionsRole
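# OIDC federation via aws-actions/configure-aws-credentials requires the
# id-token: write permission granted below.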
permissions:
id-token: write
contents: read
pull-requests: read
jobs:
code-review:
name: Code Review and Analysis
runs-on: ubuntu-latest
timeout-minutes: 30
steps:
- name: Checkout PR Code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.pull_request.head.sha }}
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Install Code Analysis Tools
run: |
# flake8-json supplies the "json" formatter used by the flake8 call below
pip install pylint flake8 flake8-json bandit semgrep
npm install -g @microsoft/eslint-formatter-sarif
- name: Run Static Analysis
id: static-analysis
run: |
echo "::group::Running Static Analysis"
# Create results directory
mkdir -p review-results
# Python linting
if find . -name "*.py" -type f | grep -q .; then
echo "Checking Python files..."
find . -name "*.py" -type f -print0 | xargs -0 pylint --output-format=json > review-results/pylint-results.json 2>/dev/null || true
flake8 --format=json --output-file=review-results/flake8-results.json . 2>/dev/null || true
fi
# Shell script checking
if find . -name "*.sh" -type f | grep -q .; then
echo "Checking Shell scripts..."
find . -name "*.sh" -type f | while read -r script; do
bash -n "$script" 2>&1 | tee -a review-results/shell-check.log || true
done
fi
# Check for common errors
echo "Checking for common issues..."
# Check for hardcoded secrets
if grep -r -i "password\|secret\|token\|key" --include="*.py" --include="*.sh" --include="*.json" . | grep -v "example\|test\|mock" | grep -E "(=|:).*[a-zA-Z0-9]{20,}" > review-results/potential-secrets.log 2>/dev/null; then
echo "⚠️ Potential hardcoded secrets found" >> review-results/issues.log
fi
# Check for syntax errors in Python
find . -name "*.py" -type f -print0 | xargs -0 python -m py_compile 2>&1 | tee review-results/python-syntax-errors.log || true
echo "::endgroup::"
- name: Check for Incompatibilities
id: compatibility-check
run: |
echo "::group::Checking for Incompatibilities"
# Check for API compatibility issues
if [ -f "requirements.txt" ]; then
echo "Checking requirements.txt for version conflicts..."
pip install --dry-run -r requirements.txt 2>&1 | tee review-results/pip-conflicts.log || true
fi
# Check for deprecated functions
if find . -name "*.py" -type f | grep -q .; then
grep -r "deprecated\|DeprecationWarning" --include="*.py" . > review-results/deprecation-warnings.log 2>/dev/null || true
fi
# Check Dockerfile syntax
if [ -f "Dockerfile" ]; then
echo "Checking Dockerfile..."
# "docker build --check" lints the Dockerfile without building it (needs a recent BuildKit)
docker build --check . 2>&1 | tee review-results/dockerfile-check.log || true
fi
echo "::endgroup::"
- name: Upload Review Results
uses: actions/upload-artifact@v4
with:
name: code-review-results
path: review-results/
retention-days: 30
security-scan:
name: Security Best Practices Review
runs-on: ubuntu-latest
timeout-minutes: 30
steps:
- name: Checkout PR Code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.pull_request.head.sha }}
- name: Run Security Scans
id: security-scan
run: |
echo "::group::Running Security Scans"
mkdir -p security-results
# Bandit for Python security issues
if find . -name "*.py" -type f | grep -q .; then
pip install bandit
bandit -r . -f json -o security-results/bandit-results.json || true
bandit -r . -f txt -o security-results/bandit-report.txt || true
fi
# Check for hardcoded credentials
if command -v trufflehog &> /dev/null; then
trufflehog filesystem . --json > security-results/trufflehog-results.json 2>/dev/null || true
else
# Manual check for common patterns
grep -r -E "(password|passwd|pwd)\s*=\s*[\"'][^\"']{8,}[\"']" --include="*.py" --include="*.sh" --include="*.json" --include="*.yaml" --include="*.yml" . > security-results/credentials-check.log 2>/dev/null || true
grep -r -E "(api_key|apikey|api-key)\s*=\s*[\"'][^\"']{10,}[\"']" --include="*.py" --include="*.sh" --include="*.json" --include="*.yaml" --include="*.yml" . >> security-results/credentials-check.log 2>/dev/null || true
grep -r -E "(secret|token)\s*=\s*[\"'][^\"']{15,}[\"']" --include="*.py" --include="*.sh" --include="*.json" --include="*.yaml" --include="*.yml" . >> security-results/credentials-check.log 2>/dev/null || true
fi
# Check for insecure configurations
echo "Checking for insecure configurations..."
# Check for HTTP instead of HTTPS
grep -r "http://" --include="*.py" --include="*.sh" --include="*.json" --include="*.yaml" --include="*.yml" --include="*.tf" . | grep -v "localhost\|127.0.0.1\|example.com" > security-results/insecure-http.log 2>/dev/null || true
# Check for overly permissive file permissions in scripts
if [ -f "install.sh" ]; then
if grep -E "chmod.*777|chmod.*a\+rw" install.sh > /dev/null 2>&1; then
echo "⚠️ Overly permissive file permissions found in install.sh" >> security-results/permission-issues.log
fi
fi
# Check for eval/exec of user input
grep -r "eval\|exec" --include="*.py" --include="*.sh" . | grep -v "# " | head -20 > security-results/code-execution-risks.log 2>/dev/null || true
# Check Dockerfile security
if [ -f "Dockerfile" ]; then
# Check for running as root
if ! grep -q "USER" Dockerfile; then
echo "⚠️ Dockerfile does not specify USER - container runs as root" >> security-results/dockerfile-security.log
fi
# Check for latest tag usage
if grep -E "FROM.*:latest" Dockerfile > /dev/null 2>&1; then
echo "⚠️ Dockerfile uses 'latest' tag - use specific versions" >> security-results/dockerfile-security.log
fi
# Check for ADD vs COPY
if grep -E "^ADD" Dockerfile > /dev/null 2>&1; then
echo "⚠️ Dockerfile uses ADD - prefer COPY for better security" >> security-results/dockerfile-security.log
fi
fi
echo "::endgroup::"
- name: Generate Security Report
run: |
echo "::group::Generating Security Report"
cat > security-results/security-report.md << 'EOF'
# Security Review Report
## Summary
EOF
# Count issues (Bandit's severity scale is LOW/MEDIUM/HIGH)
HIGH=0
MEDIUM=0
LOW=0
if [ -f "security-results/bandit-results.json" ]; then
HIGH=$(jq '[.results[] | select(.issue_severity == "HIGH")] | length' security-results/bandit-results.json 2>/dev/null || echo 0)
MEDIUM=$(jq '[.results[] | select(.issue_severity == "MEDIUM")] | length' security-results/bandit-results.json 2>/dev/null || echo 0)
LOW=$(jq '[.results[] | select(.issue_severity == "LOW")] | length' security-results/bandit-results.json 2>/dev/null || echo 0)
fi
cat >> security-results/security-report.md << EOF
- High Issues: $HIGH
- Medium Issues: $MEDIUM
- Low Issues: $LOW
## Detailed Findings
EOF
if [ -f "security-results/bandit-report.txt" ]; then
cat >> security-results/security-report.md << EOF
### Bandit Security Scan
\`\`\`
$(cat security-results/bandit-report.txt)
\`\`\`
EOF
fi
if [ -f "security-results/credentials-check.log" ] && [ -s "security-results/credentials-check.log" ]; then
cat >> security-results/security-report.md << EOF
### Potential Hardcoded Credentials
\`\`\`
$(cat security-results/credentials-check.log)
\`\`\`
EOF
fi
if [ -f "security-results/dockerfile-security.log" ]; then
cat >> security-results/security-report.md << EOF
### Dockerfile Security Issues
\`\`\`
$(cat security-results/dockerfile-security.log)
\`\`\`
EOF
fi
echo "::endgroup::"
- name: Upload Security Results
uses: actions/upload-artifact@v4
with:
name: security-scan-results
path: security-results/
retention-days: 30
- name: Check Security Gate
run: |
if [ -f "security-results/bandit-results.json" ]; then
CRITICAL=$(jq '[.results[] | select(.issue_severity == "CRITICAL")] | length' security-results/bandit-results.json 2>/dev/null || echo 0)
if [ "$CRITICAL" -gt 0 ]; then
echo "❌ Critical security issues found!"
exit 1
fi
fi
echo "✅ No critical security issues found"
version-check:
name: Version Requirements Check
runs-on: ubuntu-latest
timeout-minutes: 15
steps:
- name: Checkout PR Code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.pull_request.head.sha }}
- name: Check Driver and Library Versions
id: version-check
run: |
echo "::group::Checking Version Requirements"
mkdir -p version-results
# Define minimum versions
MIN_EFA="1.47.0"
MIN_NCCL="2.28"
MIN_CUDA="13.0"
echo "Minimum required versions:" > version-results/version-report.txt
echo " EFA Installer: $MIN_EFA" >> version-results/version-report.txt
echo " NCCL: $MIN_NCCL" >> version-results/version-report.txt
echo " CUDA: $MIN_CUDA" >> version-results/version-report.txt
echo "" >> version-results/version-report.txt
VIOLATIONS=0
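# Version checks below use the "sort -V" idiom: sort the minimum and the
# found version naturally; if the minimum sorts first, the found version
# meets or exceeds it.
# e.g. printf '%s\n' "1.47.0" "1.52.1" | sort -V | head -n1  ->  1.47.0 (pass)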
# Check Dockerfile
if [ -f "Dockerfile" ]; then
echo "Checking Dockerfile..." >> version-results/version-report.txt
# Check for EFA
if grep -i "efa" Dockerfile > /dev/null 2>&1; then
EFA_VERSION=$(grep -i "efa" Dockerfile | grep -oE "[0-9]+\.[0-9]+\.[0-9]+" | head -1)
if [ -n "$EFA_VERSION" ]; then
echo " Found EFA version: $EFA_VERSION" >> version-results/version-report.txt
if [ "$(printf '%s\n' "$MIN_EFA" "$EFA_VERSION" | sort -V | head -n1)" != "$MIN_EFA" ]; then
echo " ❌ EFA version $EFA_VERSION is below minimum $MIN_EFA" >> version-results/version-report.txt
VIOLATIONS=$((VIOLATIONS + 1))
else
echo " ✅ EFA version $EFA_VERSION meets requirement" >> version-results/version-report.txt
fi
fi
fi
# Check for NCCL
if grep -i "nccl" Dockerfile > /dev/null 2>&1; then
NCCL_VERSION=$(grep -i "nccl" Dockerfile | grep -oE "[0-9]+\.[0-9]+(\.[0-9]+)?" | head -1)
if [ -n "$NCCL_VERSION" ]; then
echo " Found NCCL version: $NCCL_VERSION" >> version-results/version-report.txt
if [ "$(printf '%s\n' "$MIN_NCCL" "$NCCL_VERSION" | sort -V | head -n1)" != "$MIN_NCCL" ]; then
echo " ❌ NCCL version $NCCL_VERSION is below minimum $MIN_NCCL" >> version-results/version-report.txt
VIOLATIONS=$((VIOLATIONS + 1))
else
echo " ✅ NCCL version $NCCL_VERSION meets requirement" >> version-results/version-report.txt
fi
fi
fi
# Check for CUDA
if grep -i "cuda" Dockerfile > /dev/null 2>&1; then
CUDA_VERSION=$(grep -i "cuda" Dockerfile | grep -oE "[0-9]+\.[0-9]+" | head -1)
if [ -n "$CUDA_VERSION" ]; then
echo " Found CUDA version: $CUDA_VERSION" >> version-results/version-report.txt
if [ "$(printf '%s\n' "$MIN_CUDA" "$CUDA_VERSION" | sort -V | head -n1)" != "$MIN_CUDA" ]; then
echo " ❌ CUDA version $CUDA_VERSION is below minimum $MIN_CUDA" >> version-results/version-report.txt
VIOLATIONS=$((VIOLATIONS + 1))
else
echo " ✅ CUDA version $CUDA_VERSION meets requirement" >> version-results/version-report.txt
fi
fi
fi
fi
# Check requirements.txt
if [ -f "requirements.txt" ]; then
echo "" >> version-results/version-report.txt
echo "Checking requirements.txt..." >> version-results/version-report.txt
# Check for relevant packages
if grep -i "nvidia\|cuda\|cupy" requirements.txt > /dev/null 2>&1; then
grep -i "nvidia\|cuda\|cupy" requirements.txt >> version-results/version-report.txt
fi
fi
# Check sbatch scripts
# Process substitution keeps the loop in the current shell so VIOLATIONS persists
while read -r script; do
echo "" >> version-results/version-report.txt
echo "Checking sbatch script: $script" >> version-results/version-report.txt
# Check for module loads
if grep -E "module load|module use" "$script" > /dev/null 2>&1; then
grep -E "module load|module use" "$script" >> version-results/version-report.txt
# Check for specific module versions
if grep -i "efa" "$script" | grep -oE "[0-9]+\.[0-9]+" > /dev/null 2>&1; then
EFA_MOD=$(grep -i "efa" "$script" | grep -oE "[0-9]+\.[0-9]+" | head -1)
if [ -n "$EFA_MOD" ]; then
if [ "$(printf '%s\n' "$MIN_EFA" "$EFA_MOD" | sort -V | head -n1)" != "$MIN_EFA" ]; then
echo " ❌ EFA module version $EFA_MOD is below minimum $MIN_EFA" >> version-results/version-report.txt
VIOLATIONS=$((VIOLATIONS + 1))
fi
fi
fi
fi
done < <(find . \( -name "*.sbatch" -o -name "*.sh" \) -type f | xargs grep -l "sbatch\|srun" 2>/dev/null)
# Check for environment configuration files
if [ -f "environment.yml" ]; then
echo "" >> version-results/version-report.txt
echo "Checking environment.yml..." >> version-results/version-report.txt
grep -E "cuda|nccl|efa" environment.yml >> version-results/version-report.txt 2>/dev/null || true
fi
if [ -f "pyproject.toml" ]; then
echo "" >> version-results/version-report.txt
echo "Checking pyproject.toml..." >> version-results/version-report.txt
grep -E "cuda|nccl|efa" pyproject.toml >> version-results/version-report.txt 2>/dev/null || true
fi
# Create JSON report
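# (unquoted heredoc: $MIN_*, $VIOLATIONS, and $(date) expand when the step runs)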
cat > version-results/version-check.json << EOF
{
"timestamp": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
"pr_number": "${{ github.event.pull_request.number }}",
"requirements": {
"efa_minimum": "$MIN_EFA",
"nccl_minimum": "$MIN_NCCL",
"cuda_minimum": "$MIN_CUDA"
},
"violations": $VIOLATIONS,
"status": "$([ $VIOLATIONS -eq 0 ] && echo "PASS" || echo "FAIL")"
}
EOF
echo "" >> version-results/version-report.txt
echo "Total violations: $VIOLATIONS" >> version-results/version-report.txt
cat version-results/version-report.txt
echo "::endgroup::"
if [ $VIOLATIONS -gt 0 ]; then
echo "❌ Version requirements not met"
exit 1
fi
- name: Upload Version Check Results
uses: actions/upload-artifact@v4
with:
name: version-check-results
path: version-results/
retention-days: 30
slurm-test:
name: Slurm Cluster Testing
runs-on: ubuntu-latest
timeout-minutes: 130
needs: [code-review, security-scan, version-check]
if: ${{ always() && needs.code-review.result == 'success' && needs.security-scan.result == 'success' && needs.version-check.result == 'success' }}
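# always() forces this condition to be evaluated even if an upstream job was
# skipped; the explicit result checks still require all three jobs to succeed.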
steps:
- name: Checkout PR Code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.pull_request.head.sha }}
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: ${{ env.AWS_ROLE_ARN }}
aws-region: ${{ env.AWS_REGION }}
- name: Setup SSH Key
run: |
mkdir -p ~/.ssh
echo "${{ secrets.SLURM_SSH_KEY }}" > ~/.ssh/slurm_key
chmod 600 ~/.ssh/slurm_key
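# Pre-populate known_hosts; StrictHostKeyChecking=no on the later ssh/scp
# calls is the fallback in case the keyscan fails.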
ssh-keyscan -H ${{ env.SLURM_HOST }} >> ~/.ssh/known_hosts 2>/dev/null || true
- name: Prepare Test Environment
run: |
echo "::group::Preparing Test Environment"
# Create test directory name
TEST_DIR="pr-${{ github.event.pull_request.number }}-$(date +%Y%m%d-%H%M%S)"
echo "TEST_DIR=$TEST_DIR" >> $GITHUB_ENV
echo "RESULTS_FILE=${{ github.event.pull_request.number }}-$(date +%Y%m%d)-results.json" >> $GITHUB_ENV
# Create local test directory
mkdir -p test-artifacts
# Copy PR code to test directory
cp -r . test-artifacts/source-code
echo "Test directory: $TEST_DIR"
echo "Results file: ${{ github.event.pull_request.number }}-$(date +%Y%m%d)-results.json"
echo "::endgroup::"
- name: Transfer Code to Slurm Cluster
run: |
echo "::group::Transferring Code to Slurm Cluster"
# Create remote directory
ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} "mkdir -p ${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}"
# Transfer code
scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -r test-artifacts/source-code/* ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/
echo "Code transferred successfully"
echo "::endgroup::"
- name: Execute Tests on Slurm
id: slurm-test
timeout-minutes: 120
run: |
echo "::group::Executing Tests on Slurm Cluster"
# Create test execution script
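# NOTE: the quoted 'TESTSCRIPT' delimiter keeps $VARS literal for the remote
# shell; ${{ ... }} expressions are substituted by Actions before this runs.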
cat > test-artifacts/run-tests.sh << 'TESTSCRIPT'
#!/bin/bash
#SBATCH --job-name=pr-test-${{ github.event.pull_request.number }}
#SBATCH --nodes=8
#SBATCH --ntasks-per-node=1
#SBATCH --time=02:00:00
#SBATCH --output=${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/slurm-%j.out
#SBATCH --error=${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/slurm-%j.err
set -e
cd ${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}
# Initialize results JSON
# Unquoted delimiter so $(date ...) and $SLURM_* expand when the job runs
cat > test-results.json << EOF
{
"pr_number": "${{ github.event.pull_request.number }}",
"test_date": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
"slurm_job_id": "$SLURM_JOB_ID",
"nodes_used": "$SLURM_NNODES",
"tests": {},
"status": "RUNNING"
}
EOF
# Check for README.md and follow instructions
if [ -f "README.md" ]; then
echo "Found README.md - following test instructions"
fi
# Run unit tests if available
echo "Running unit tests..."
START_TIME=$(date +%s)
if [ -f "requirements.txt" ]; then
pip install -r requirements.txt --user || true
fi
if [ -f "setup.py" ] || [ -f "pyproject.toml" ]; then
pip install -e . --user || true
fi
UNIT_TEST_STATUS="SKIPPED"
if [ -f "pytest.ini" ] || [ -d "tests" ] || find . -name "test_*.py" -type f | grep -q .; then
pip install pytest --user || true
if python3 -m pytest --tb=short -v > unit-test-output.log 2>&1; then
UNIT_TEST_STATUS="PASSED"
else
UNIT_TEST_STATUS="FAILED"
fi
elif [ -f "Makefile" ] && grep -q "test" Makefile; then
if make test > unit-test-output.log 2>&1; then
UNIT_TEST_STATUS="PASSED"
else
UNIT_TEST_STATUS="FAILED"
fi
fi
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
# Update results; read the test log in Python rather than splicing shell
# output into the source, which breaks on quotes and backslashes
python3 << PYEOF
import json, os
with open('test-results.json') as f:
    data = json.load(f)
output = ''
if os.path.exists('unit-test-output.log'):
    with open('unit-test-output.log') as f:
        output = f.read()
data['tests']['unit_tests'] = {
    'status': '$UNIT_TEST_STATUS',
    'duration_seconds': $DURATION,
    'output': output
}
with open('test-results.json', 'w') as f:
    json.dump(data, f, indent=2)
PYEOF
# Run execution tests
echo "Running execution tests..."
START_TIME=$(date +%s)
EXEC_TEST_STATUS="SKIPPED"
# Check for execution test scripts
if [ -f "execute.py" ]; then
if python3 execute.py > exec-test-output.log 2>&1; then
EXEC_TEST_STATUS="PASSED"
else
EXEC_TEST_STATUS="FAILED"
fi
elif [ -f "run.sh" ]; then
chmod +x run.sh
if ./run.sh > exec-test-output.log 2>&1; then
EXEC_TEST_STATUS="PASSED"
else
EXEC_TEST_STATUS="FAILED"
fi
elif [ -f "main.py" ]; then
if python3 main.py > exec-test-output.log 2>&1; then
EXEC_TEST_STATUS="PASSED"
else
EXEC_TEST_STATUS="FAILED"
fi
fi
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
# Update results and determine overall status
python3 << PYEOF
import json, os
with open('test-results.json') as f:
    data = json.load(f)
output = ''
if os.path.exists('exec-test-output.log'):
    with open('exec-test-output.log') as f:
        output = f.read()
data['tests']['execution_tests'] = {
    'status': '$EXEC_TEST_STATUS',
    'duration_seconds': $DURATION,
    'output': output
}
# Overall status: any FAILED wins; otherwise any SKIPPED marks PARTIAL
overall_status = 'PASSED'
for test_name, test_data in data['tests'].items():
    if test_data['status'] == 'FAILED':
        overall_status = 'FAILED'
        break
    elif test_data['status'] == 'SKIPPED':
        overall_status = 'PARTIAL'
data['status'] = overall_status
data['completion_time'] = "$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
with open('test-results.json', 'w') as f:
    json.dump(data, f, indent=2)
PYEOF
# Copy results to final location
cp test-results.json ${{ env.RESULTS_PATH }}/${{ env.RESULTS_FILE }}
echo "Tests completed. Results saved to: ${{ env.RESULTS_PATH }}/${{ env.RESULTS_FILE }}"
# Exit with error if tests failed
if [ "$UNIT_TEST_STATUS" == "FAILED" ] || [ "$EXEC_TEST_STATUS" == "FAILED" ]; then
exit 1
fi
TESTSCRIPT
chmod +x test-artifacts/run-tests.sh
# Transfer and submit job
scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no test-artifacts/run-tests.sh ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/
# Submit job and capture job ID
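# (sbatch prints "Submitted batch job <ID>"; grep extracts the numeric ID)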
JOB_ID=$(ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} "cd ${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }} && sbatch run-tests.sh" | grep -oE '[0-9]+')
echo "Submitted Slurm job: $JOB_ID"
echo "SLURM_JOB_ID=$JOB_ID" >> $GITHUB_ENV
# Wait for job completion with timeout
echo "Waiting for job completion (timeout: 120 minutes)..."
START_WAIT=$(date +%s)
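# Poll squeue once a minute; an empty response means the job has left the
# queue (squeue only lists pending/running jobs), so treat it as completed.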
while true; do
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_WAIT))
if [ $ELAPSED -gt 7200 ]; then
echo "Timeout reached - cancelling job"
ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} "scancel $JOB_ID"
exit 1
fi
JOB_STATUS=$(ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} "squeue -j $JOB_ID -h -o %T" 2>/dev/null || echo "COMPLETED")
if [ -z "$JOB_STATUS" ] || [ "$JOB_STATUS" == "COMPLETED" ]; then
echo "Job completed"
break
elif [ "$JOB_STATUS" == "FAILED" ] || [ "$JOB_STATUS" == "CANCELLED" ] || [ "$JOB_STATUS" == "TIMEOUT" ]; then
echo "Job failed with status: $JOB_STATUS"
exit 1
fi
echo "Job status: $JOB_STATUS (elapsed: $((ELAPSED / 60)) minutes)"
sleep 60
done
echo "::endgroup::"
- name: Retrieve Test Results
if: always()
run: |
echo "::group::Retrieving Test Results"
# Create local results directory
mkdir -p test-results
# Copy results from Slurm cluster
scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/test-results.json test-results/ 2>/dev/null || true
scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/slurm-*.out test-results/ 2>/dev/null || true
scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/slurm-*.err test-results/ 2>/dev/null || true
# Copy the final results file
scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.RESULTS_PATH }}/${{ env.RESULTS_FILE }} test-results/ 2>/dev/null || true
# Display results
if [ -f "test-results/test-results.json" ]; then
echo "Test Results:"
cat test-results/test-results.json | python3 -m json.tool || cat test-results/test-results.json
fi
echo "::endgroup::"
- name: Upload Test Results
if: always()
uses: actions/upload-artifact@v4
with:
name: slurm-test-results
path: test-results/
retention-days: 30
- name: Cleanup Slurm Resources
if: always()
run: |
echo "::group::Cleaning up Slurm Resources"
# Cancel job if still running
if [ -n "${{ env.SLURM_JOB_ID }}" ]; then
ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} "scancel ${{ env.SLURM_JOB_ID }}" 2>/dev/null || true
fi
echo "Cleanup completed"
echo "::endgroup::"
- name: Final Status Check
run: |
if [ -f "test-results/test-results.json" ]; then
STATUS=$(python3 -c "import json; data=json.load(open('test-results/test-results.json')); print(data.get('status', 'UNKNOWN'))")
if [ "$STATUS" == "FAILED" ]; then
echo "❌ Tests failed"
exit 1
elif [ "$STATUS" == "PASSED" ]; then
echo "✅ All tests passed"
else
echo "⚠️ Tests status: $STATUS"
fi
else
echo "❌ No test results found"
exit 1
fi
notify-on-failure:
name: Send Failure Notification
runs-on: ubuntu-latest
needs: [code-review, security-scan, version-check, slurm-test]
if: failure()
steps:
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: ${{ env.AWS_ROLE_ARN }}
aws-region: ${{ env.AWS_REGION }}
- name: Send Email Notification
run: |
# Get the email from GitHub secret to avoid exposing it in logs
NOTIFICATION_EMAIL="${{ secrets.NOTIFICATION_EMAIL }}"
if [ -z "$NOTIFICATION_EMAIL" ]; then
echo "No notification email configured"
exit 0
fi
# Determine which job failed
FAILED_JOBS=""
if [ "${{ needs.code-review.result }}" == "failure" ]; then
FAILED_JOBS="$FAILED_JOBS- Code Review
"
fi
if [ "${{ needs.security-scan.result }}" == "failure" ]; then
FAILED_JOBS="$FAILED_JOBS- Security Scan
"
fi
if [ "${{ needs.version-check.result }}" == "failure" ]; then
FAILED_JOBS="$FAILED_JOBS- Version Check
"
fi
if [ "${{ needs.slurm-test.result }}" == "failure" ]; then
FAILED_JOBS="$FAILED_JOBS- Slurm Test
"
fi
# Send email using AWS SES
aws ses send-email \
--from "github-actions@aws.dev" \
--to "$NOTIFICATION_EMAIL" \
--subject "PR #${{ github.event.pull_request.number }} - Workflow Failed" \
--text "Pull Request #${{ github.event.pull_request.number }} has failed workflow checks.
Repository: ${{ github.repository }}
PR Title: ${{ github.event.pull_request.title }}
Author: ${{ github.event.pull_request.user.login }}
Branch: ${{ github.event.pull_request.head.ref }}
Failed Jobs:
$FAILED_JOBS
View details: ${{ github.event.pull_request.html_url }}
Workflow Run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
- name: Fallback Notification
if: failure()
run: |
echo "::warning::Workflow failed for PR #${{ github.event.pull_request.number }}"
echo "Failed jobs may require manual review"