Update Slurm workflows to use GitHub-hosted runners with SSH (#953) #3

.github/workflows/pr-review-and-slurm-test.yml

name: PR Review and Slurm Test
on:
pull_request:
branches:
- main
types: [opened, synchronize, reopened]
env:
AWS_REGION: us-east-1
SLURM_HOST: p5en.smml.aiml.aws.dev
SLURM_USER: ghactions
RESULTS_PATH: /fsx/agents/pr-reviews/awsome-distributed-training
AWS_ROLE_ARN: arn:aws:iam::159553542841:role/awslabs-AOSH-GitHubActionsRole
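# OIDC federation via aws-actions/configure-aws-credentials requires the
# id-token: write permission granted below.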
permissions:
id-token: write
contents: read
pull-requests: read
jobs:
code-review:
name: Code Review and Analysis
runs-on: ubuntu-latest
timeout-minutes: 30
steps:
- name: Checkout PR Code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.pull_request.head.sha }}
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Install Code Analysis Tools
run: |
# flake8-json supplies the "json" formatter used by the flake8 call below
pip install pylint flake8 flake8-json bandit semgrep
npm install -g @microsoft/eslint-formatter-sarif
- name: Run Static Analysis
id: static-analysis
run: |
echo "::group::Running Static Analysis"
# Create results directory
mkdir -p review-results
# Python linting
if find . -name "*.py" -type f | grep -q .; then
echo "Checking Python files..."
find . -name "*.py" -type f -print0 | xargs -0 pylint --output-format=json > review-results/pylint-results.json 2>/dev/null || true
flake8 --format=json --output-file=review-results/flake8-results.json . 2>/dev/null || true
fi
# Shell script checking
if find . -name "*.sh" -type f | grep -q .; then
echo "Checking Shell scripts..."
find . -name "*.sh" -type f | while read -r script; do
bash -n "$script" 2>&1 | tee -a review-results/shell-check.log || true
done
fi
# Check for common errors
echo "Checking for common issues..."
# Check for hardcoded secrets
if grep -r -i "password\|secret\|token\|key" --include="*.py" --include="*.sh" --include="*.json" . | grep -v "example\|test\|mock" | grep -E "(=|:).*[a-zA-Z0-9]{20,}" > review-results/potential-secrets.log 2>/dev/null; then
echo "⚠️ Potential hardcoded secrets found" >> review-results/issues.log
fi
# Check for syntax errors in Python
find . -name "*.py" -type f -print0 | xargs -0 python -m py_compile 2>&1 | tee review-results/python-syntax-errors.log || true
echo "::endgroup::"
- name: Check for Incompatibilities
id: compatibility-check
run: |
echo "::group::Checking for Incompatibilities"
# Check for API compatibility issues
if [ -f "requirements.txt" ]; then
echo "Checking requirements.txt for version conflicts..."
pip install --dry-run -r requirements.txt 2>&1 | tee review-results/pip-conflicts.log || true
fi
# Check for deprecated functions
if find . -name "*.py" -type f | grep -q .; then
grep -r "deprecated\|DeprecationWarning" --include="*.py" . > review-results/deprecation-warnings.log 2>/dev/null || true
fi
# Check Dockerfile syntax
if [ -f "Dockerfile" ]; then
echo "Checking Dockerfile..."
# "docker build --check" lints the Dockerfile without building it (needs a recent BuildKit)
docker build --check . 2>&1 | tee review-results/dockerfile-check.log || true
fi
echo "::endgroup::"
- name: Upload Review Results
uses: actions/upload-artifact@v4
with:
name: code-review-results
path: review-results/
retention-days: 30
security-scan:
name: Security Best Practices Review
runs-on: ubuntu-latest
timeout-minutes: 30
steps:
- name: Checkout PR Code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.pull_request.head.sha }}
- name: Run Security Scans
id: security-scan
run: |
echo "::group::Running Security Scans"
mkdir -p security-results
# Bandit for Python security issues
if find . -name "*.py" -type f | grep -q .; then
pip install bandit
bandit -r . -f json -o security-results/bandit-results.json || true
bandit -r . -f txt -o security-results/bandit-report.txt || true
fi
# Check for hardcoded credentials
if command -v trufflehog &> /dev/null; then
trufflehog filesystem . --json > security-results/trufflehog-results.json 2>/dev/null || true
else
# Manual check for common patterns
grep -r -E "(password|passwd|pwd)\s*=\s*[\"'][^\"']{8,}[\"']" --include="*.py" --include="*.sh" --include="*.json" --include="*.yaml" --include="*.yml" . > security-results/credentials-check.log 2>/dev/null || true
grep -r -E "(api_key|apikey|api-key)\s*=\s*[\"'][^\"']{10,}[\"']" --include="*.py" --include="*.sh" --include="*.json" --include="*.yaml" --include="*.yml" . >> security-results/credentials-check.log 2>/dev/null || true
grep -r -E "(secret|token)\s*=\s*[\"'][^\"']{15,}[\"']" --include="*.py" --include="*.sh" --include="*.json" --include="*.yaml" --include="*.yml" . >> security-results/credentials-check.log 2>/dev/null || true
fi
# Check for insecure configurations
echo "Checking for insecure configurations..."
# Check for HTTP instead of HTTPS
grep -r "http://" --include="*.py" --include="*.sh" --include="*.json" --include="*.yaml" --include="*.yml" --include="*.tf" . | grep -v "localhost\|127.0.0.1\|example.com" > security-results/insecure-http.log 2>/dev/null || true
# Check for overly permissive file permissions in scripts
if [ -f "install.sh" ]; then
if grep -E "chmod.*777|chmod.*a\+rw" install.sh > /dev/null 2>&1; then
echo "⚠️ Overly permissive file permissions found in install.sh" >> security-results/permission-issues.log
fi
fi
# Check for eval/exec of user input
grep -r "eval\|exec" --include="*.py" --include="*.sh" . | grep -v "# " | head -20 > security-results/code-execution-risks.log 2>/dev/null || true
# Check Dockerfile security
if [ -f "Dockerfile" ]; then
# Check for running as root
if ! grep -q "USER" Dockerfile; then
echo "⚠️ Dockerfile does not specify USER - container runs as root" >> security-results/dockerfile-security.log
fi
# Check for latest tag usage
if grep -E "FROM.*:latest" Dockerfile > /dev/null 2>&1; then
echo "⚠️ Dockerfile uses 'latest' tag - use specific versions" >> security-results/dockerfile-security.log
fi
# Check for ADD vs COPY
if grep -E "^ADD" Dockerfile > /dev/null 2>&1; then
echo "⚠️ Dockerfile uses ADD - prefer COPY for better security" >> security-results/dockerfile-security.log
fi
fi
echo "::endgroup::"
- name: Generate Security Report
run: |
echo "::group::Generating Security Report"
cat > security-results/security-report.md << 'EOF'
# Security Review Report
## Summary
EOF
# Count issues (Bandit's severity scale is LOW/MEDIUM/HIGH)
HIGH=0
MEDIUM=0
LOW=0
if [ -f "security-results/bandit-results.json" ]; then
HIGH=$(jq '[.results[] | select(.issue_severity == "HIGH")] | length' security-results/bandit-results.json 2>/dev/null || echo 0)
MEDIUM=$(jq '[.results[] | select(.issue_severity == "MEDIUM")] | length' security-results/bandit-results.json 2>/dev/null || echo 0)
LOW=$(jq '[.results[] | select(.issue_severity == "LOW")] | length' security-results/bandit-results.json 2>/dev/null || echo 0)
fi
cat >> security-results/security-report.md << EOF
- High Issues: $HIGH
- Medium Issues: $MEDIUM
- Low Issues: $LOW
## Detailed Findings
EOF
if [ -f "security-results/bandit-report.txt" ]; then
cat >> security-results/security-report.md << EOF
### Bandit Security Scan
\`\`\`
$(cat security-results/bandit-report.txt)
\`\`\`
EOF
fi
if [ -f "security-results/credentials-check.log" ] && [ -s "security-results/credentials-check.log" ]; then
cat >> security-results/security-report.md << EOF
### Potential Hardcoded Credentials
\`\`\`
$(cat security-results/credentials-check.log)
\`\`\`
EOF
fi
if [ -f "security-results/dockerfile-security.log" ]; then
cat >> security-results/security-report.md << EOF
### Dockerfile Security Issues
\`\`\`
$(cat security-results/dockerfile-security.log)
\`\`\`
EOF
fi
echo "::endgroup::"
- name: Upload Security Results
uses: actions/upload-artifact@v4
with:
name: security-scan-results
path: security-results/
retention-days: 30
- name: Check Security Gate
run: |
if [ -f "security-results/bandit-results.json" ]; then
CRITICAL=$(jq '[.results[] | select(.issue_severity == "CRITICAL")] | length' security-results/bandit-results.json 2>/dev/null || echo 0)
if [ "$CRITICAL" -gt 0 ]; then
echo "❌ Critical security issues found!"
exit 1
fi
fi
echo "✅ No critical security issues found"
version-check:
name: Version Requirements Check
runs-on: ubuntu-latest
timeout-minutes: 15
steps:
- name: Checkout PR Code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.pull_request.head.sha }}
- name: Check Driver and Library Versions
id: version-check
run: |
echo "::group::Checking Version Requirements"
mkdir -p version-results
# Define minimum versions
MIN_EFA="1.47.0"
MIN_NCCL="2.28"
MIN_CUDA="13.0"
echo "Minimum required versions:" > version-results/version-report.txt
echo " EFA Installer: $MIN_EFA" >> version-results/version-report.txt
echo " NCCL: $MIN_NCCL" >> version-results/version-report.txt
echo " CUDA: $MIN_CUDA" >> version-results/version-report.txt
echo "" >> version-results/version-report.txt
VIOLATIONS=0
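# Version checks below use the "sort -V" idiom: sort the minimum and the
# found version naturally; if the minimum sorts first, the found version
# meets or exceeds it.
# e.g. printf '%s\n' "1.47.0" "1.52.1" | sort -V | head -n1  ->  1.47.0 (pass)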
# Check Dockerfile
if [ -f "Dockerfile" ]; then
echo "Checking Dockerfile..." >> version-results/version-report.txt
# Check for EFA
if grep -i "efa" Dockerfile > /dev/null 2>&1; then
EFA_VERSION=$(grep -i "efa" Dockerfile | grep -oE "[0-9]+\.[0-9]+\.[0-9]+" | head -1)
if [ -n "$EFA_VERSION" ]; then
echo " Found EFA version: $EFA_VERSION" >> version-results/version-report.txt
if [ "$(printf '%s\n' "$MIN_EFA" "$EFA_VERSION" | sort -V | head -n1)" != "$MIN_EFA" ]; then
echo " ❌ EFA version $EFA_VERSION is below minimum $MIN_EFA" >> version-results/version-report.txt
VIOLATIONS=$((VIOLATIONS + 1))
else
echo " ✅ EFA version $EFA_VERSION meets requirement" >> version-results/version-report.txt
fi
fi
fi
# Check for NCCL
if grep -i "nccl" Dockerfile > /dev/null 2>&1; then
NCCL_VERSION=$(grep -i "nccl" Dockerfile | grep -oE "[0-9]+\.[0-9]+(\.[0-9]+)?" | head -1)
if [ -n "$NCCL_VERSION" ]; then
echo " Found NCCL version: $NCCL_VERSION" >> version-results/version-report.txt
if [ "$(printf '%s\n' "$MIN_NCCL" "$NCCL_VERSION" | sort -V | head -n1)" != "$MIN_NCCL" ]; then
echo " ❌ NCCL version $NCCL_VERSION is below minimum $MIN_NCCL" >> version-results/version-report.txt
VIOLATIONS=$((VIOLATIONS + 1))
else
echo " ✅ NCCL version $NCCL_VERSION meets requirement" >> version-results/version-report.txt
fi
fi
fi
# Check for CUDA
if grep -i "cuda" Dockerfile > /dev/null 2>&1; then
CUDA_VERSION=$(grep -i "cuda" Dockerfile | grep -oE "[0-9]+\.[0-9]+" | head -1)
if [ -n "$CUDA_VERSION" ]; then
echo " Found CUDA version: $CUDA_VERSION" >> version-results/version-report.txt
if [ "$(printf '%s\n' "$MIN_CUDA" "$CUDA_VERSION" | sort -V | head -n1)" != "$MIN_CUDA" ]; then
echo " ❌ CUDA version $CUDA_VERSION is below minimum $MIN_CUDA" >> version-results/version-report.txt
VIOLATIONS=$((VIOLATIONS + 1))
else
echo " ✅ CUDA version $CUDA_VERSION meets requirement" >> version-results/version-report.txt
fi
fi
fi
fi
# Check requirements.txt
if [ -f "requirements.txt" ]; then
echo "" >> version-results/version-report.txt
echo "Checking requirements.txt..." >> version-results/version-report.txt
# Check for relevant packages
if grep -i "nvidia\|cuda\|cupy" requirements.txt > /dev/null 2>&1; then
grep -i "nvidia\|cuda\|cupy" requirements.txt >> version-results/version-report.txt
fi
fi
# Check sbatch scripts
# Process substitution keeps the loop in the current shell so VIOLATIONS persists
while read -r script; do
echo "" >> version-results/version-report.txt
echo "Checking sbatch script: $script" >> version-results/version-report.txt
# Check for module loads
if grep -E "module load|module use" "$script" > /dev/null 2>&1; then
grep -E "module load|module use" "$script" >> version-results/version-report.txt
# Check for specific module versions
if grep -i "efa" "$script" | grep -oE "[0-9]+\.[0-9]+" > /dev/null 2>&1; then
EFA_MOD=$(grep -i "efa" "$script" | grep -oE "[0-9]+\.[0-9]+" | head -1)
if [ -n "$EFA_MOD" ]; then
if [ "$(printf '%s\n' "$MIN_EFA" "$EFA_MOD" | sort -V | head -n1)" != "$MIN_EFA" ]; then
echo " ❌ EFA module version $EFA_MOD is below minimum $MIN_EFA" >> version-results/version-report.txt
VIOLATIONS=$((VIOLATIONS + 1))
fi
fi
fi
fi
done < <(find . \( -name "*.sbatch" -o -name "*.sh" \) -type f | xargs grep -l "sbatch\|srun" 2>/dev/null)
# Check for environment configuration files
if [ -f "environment.yml" ]; then
echo "" >> version-results/version-report.txt
echo "Checking environment.yml..." >> version-results/version-report.txt
grep -E "cuda|nccl|efa" environment.yml >> version-results/version-report.txt 2>/dev/null || true
fi
if [ -f "pyproject.toml" ]; then
echo "" >> version-results/version-report.txt
echo "Checking pyproject.toml..." >> version-results/version-report.txt
grep -E "cuda|nccl|efa" pyproject.toml >> version-results/version-report.txt 2>/dev/null || true
fi
# Create JSON report
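# (unquoted heredoc: $MIN_*, $VIOLATIONS, and $(date) expand when the step runs)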
cat > version-results/version-check.json << EOF
{
"timestamp": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
"pr_number": "${{ github.event.pull_request.number }}",
"requirements": {
"efa_minimum": "$MIN_EFA",
"nccl_minimum": "$MIN_NCCL",
"cuda_minimum": "$MIN_CUDA"
},
"violations": $VIOLATIONS,
"status": "$([ $VIOLATIONS -eq 0 ] && echo "PASS" || echo "FAIL")"
}
EOF
echo "" >> version-results/version-report.txt
echo "Total violations: $VIOLATIONS" >> version-results/version-report.txt
cat version-results/version-report.txt
echo "::endgroup::"
if [ $VIOLATIONS -gt 0 ]; then
echo "❌ Version requirements not met"
exit 1
fi
- name: Upload Version Check Results
uses: actions/upload-artifact@v4
with:
name: version-check-results
path: version-results/
retention-days: 30
slurm-test:
name: Slurm Cluster Testing
runs-on: ubuntu-latest
timeout-minutes: 130
needs: [code-review, security-scan, version-check]
if: ${{ always() && needs.code-review.result == 'success' && needs.security-scan.result == 'success' && needs.version-check.result == 'success' }}
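# always() forces this condition to be evaluated even if an upstream job was
# skipped; the explicit result checks still require all three jobs to succeed.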
steps:
- name: Checkout PR Code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.pull_request.head.sha }}
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: ${{ env.AWS_ROLE_ARN }}
aws-region: ${{ env.AWS_REGION }}
- name: Setup SSH Key
run: |
mkdir -p ~/.ssh
echo "${{ secrets.SLURM_SSH_KEY }}" > ~/.ssh/slurm_key
chmod 600 ~/.ssh/slurm_key
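# Pre-populate known_hosts; StrictHostKeyChecking=no on the later ssh/scp
# calls is the fallback in case the keyscan fails.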
ssh-keyscan -H ${{ env.SLURM_HOST }} >> ~/.ssh/known_hosts 2>/dev/null || true
- name: Prepare Test Environment
run: |
echo "::group::Preparing Test Environment"
# Create test directory name
TEST_DIR="pr-${{ github.event.pull_request.number }}-$(date +%Y%m%d-%H%M%S)"
echo "TEST_DIR=$TEST_DIR" >> $GITHUB_ENV
echo "RESULTS_FILE=${{ github.event.pull_request.number }}-$(date +%Y%m%d)-results.json" >> $GITHUB_ENV
# Create local test directory
mkdir -p test-artifacts
# Copy PR code to test directory
cp -r . test-artifacts/source-code
echo "Test directory: $TEST_DIR"
echo "Results file: ${{ github.event.pull_request.number }}-$(date +%Y%m%d)-results.json"
echo "::endgroup::"
- name: Transfer Code to Slurm Cluster
run: |
echo "::group::Transferring Code to Slurm Cluster"
# Create remote directory
ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} "mkdir -p ${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}"
# Transfer code
scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -r test-artifacts/source-code/* ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/
echo "Code transferred successfully"
echo "::endgroup::"
- name: Execute Tests on Slurm
id: slurm-test
timeout-minutes: 120
run: |
echo "::group::Executing Tests on Slurm Cluster"
# Create test execution script
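# NOTE: the quoted 'TESTSCRIPT' delimiter keeps $VARS literal for the remote
# shell; ${{ ... }} expressions are substituted by Actions before this runs.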
cat > test-artifacts/run-tests.sh << 'TESTSCRIPT'
#!/bin/bash
#SBATCH --job-name=pr-test-${{ github.event.pull_request.number }}
#SBATCH --nodes=8
#SBATCH --ntasks-per-node=1
#SBATCH --time=02:00:00
#SBATCH --output=${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/slurm-%j.out
#SBATCH --error=${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/slurm-%j.err
set -e
cd ${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}
# Initialize results JSON
# Unquoted delimiter so $(date ...) and $SLURM_* expand when the job runs
cat > test-results.json << EOF
{
"pr_number": "${{ github.event.pull_request.number }}",
"test_date": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
"slurm_job_id": "$SLURM_JOB_ID",
"nodes_used": "$SLURM_NNODES",
"tests": {},
"status": "RUNNING"
}
EOF
# Check for README.md and follow instructions
if [ -f "README.md" ]; then
echo "Found README.md - following test instructions"
fi
# Run unit tests if available
echo "Running unit tests..."
START_TIME=$(date +%s)
if [ -f "requirements.txt" ]; then
pip install -r requirements.txt --user || true
fi
if [ -f "setup.py" ] || [ -f "pyproject.toml" ]; then
pip install -e . --user || true
fi
UNIT_TEST_STATUS="SKIPPED"
if [ -f "pytest.ini" ] || [ -d "tests" ] || find . -name "test_*.py" -type f | grep -q .; then
pip install pytest --user || true
if python3 -m pytest --tb=short -v > unit-test-output.log 2>&1; then
UNIT_TEST_STATUS="PASSED"
else
UNIT_TEST_STATUS="FAILED"
fi
elif [ -f "Makefile" ] && grep -q "test" Makefile; then
if make test > unit-test-output.log 2>&1; then
UNIT_TEST_STATUS="PASSED"
else
UNIT_TEST_STATUS="FAILED"
fi
fi
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
# Update results; read the test log in Python rather than splicing shell
# output into the source, which breaks on quotes and backslashes
python3 << PYEOF
import json, os
with open('test-results.json') as f:
    data = json.load(f)
output = ''
if os.path.exists('unit-test-output.log'):
    with open('unit-test-output.log') as f:
        output = f.read()
data['tests']['unit_tests'] = {
    'status': '$UNIT_TEST_STATUS',
    'duration_seconds': $DURATION,
    'output': output
}
with open('test-results.json', 'w') as f:
    json.dump(data, f, indent=2)
PYEOF
# Run execution tests
echo "Running execution tests..."
START_TIME=$(date +%s)
EXEC_TEST_STATUS="SKIPPED"
# Check for execution test scripts
if [ -f "execute.py" ]; then
if python3 execute.py > exec-test-output.log 2>&1; then
EXEC_TEST_STATUS="PASSED"
else
EXEC_TEST_STATUS="FAILED"
fi
elif [ -f "run.sh" ]; then
chmod +x run.sh
if ./run.sh > exec-test-output.log 2>&1; then
EXEC_TEST_STATUS="PASSED"
else
EXEC_TEST_STATUS="FAILED"
fi
elif [ -f "main.py" ]; then
if python3 main.py > exec-test-output.log 2>&1; then
EXEC_TEST_STATUS="PASSED"
else
EXEC_TEST_STATUS="FAILED"
fi
fi
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
# Update results and determine overall status
python3 << PYEOF
import json, os
with open('test-results.json') as f:
    data = json.load(f)
output = ''
if os.path.exists('exec-test-output.log'):
    with open('exec-test-output.log') as f:
        output = f.read()
data['tests']['execution_tests'] = {
    'status': '$EXEC_TEST_STATUS',
    'duration_seconds': $DURATION,
    'output': output
}
# Overall status: any FAILED wins; otherwise any SKIPPED marks PARTIAL
overall_status = 'PASSED'
for test_name, test_data in data['tests'].items():
    if test_data['status'] == 'FAILED':
        overall_status = 'FAILED'
        break
    elif test_data['status'] == 'SKIPPED':
        overall_status = 'PARTIAL'
data['status'] = overall_status
data['completion_time'] = "$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
with open('test-results.json', 'w') as f:
    json.dump(data, f, indent=2)
PYEOF
# Copy results to final location
cp test-results.json ${{ env.RESULTS_PATH }}/${{ env.RESULTS_FILE }}
echo "Tests completed. Results saved to: ${{ env.RESULTS_PATH }}/${{ env.RESULTS_FILE }}"
# Exit with error if tests failed
if [ "$UNIT_TEST_STATUS" == "FAILED" ] || [ "$EXEC_TEST_STATUS" == "FAILED" ]; then
exit 1
fi
TESTSCRIPT
chmod +x test-artifacts/run-tests.sh
# Transfer and submit job
scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no test-artifacts/run-tests.sh ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/
# Submit job and capture job ID
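# (sbatch prints "Submitted batch job <ID>"; grep extracts the numeric ID)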
JOB_ID=$(ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} "cd ${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }} && sbatch run-tests.sh" | grep -oE '[0-9]+')
echo "Submitted Slurm job: $JOB_ID"
echo "SLURM_JOB_ID=$JOB_ID" >> $GITHUB_ENV
# Wait for job completion with timeout
echo "Waiting for job completion (timeout: 120 minutes)..."
START_WAIT=$(date +%s)
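# Poll squeue once a minute; an empty response means the job has left the
# queue (squeue only lists pending/running jobs), so treat it as completed.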
while true; do
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_WAIT))
if [ $ELAPSED -gt 7200 ]; then
echo "Timeout reached - cancelling job"
ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} "scancel $JOB_ID"
exit 1
fi
JOB_STATUS=$(ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} "squeue -j $JOB_ID -h -o %T" 2>/dev/null || echo "COMPLETED")
if [ -z "$JOB_STATUS" ] || [ "$JOB_STATUS" == "COMPLETED" ]; then
echo "Job completed"
break
elif [ "$JOB_STATUS" == "FAILED" ] || [ "$JOB_STATUS" == "CANCELLED" ] || [ "$JOB_STATUS" == "TIMEOUT" ]; then
echo "Job failed with status: $JOB_STATUS"
exit 1
fi
echo "Job status: $JOB_STATUS (elapsed: $((ELAPSED / 60)) minutes)"
sleep 60
done
echo "::endgroup::"
- name: Retrieve Test Results
if: always()
run: |
echo "::group::Retrieving Test Results"
# Create local results directory
mkdir -p test-results
# Copy results from Slurm cluster
scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/test-results.json test-results/ 2>/dev/null || true
scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/slurm-*.out test-results/ 2>/dev/null || true
scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/slurm-*.err test-results/ 2>/dev/null || true
# Copy the final results file
scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.RESULTS_PATH }}/${{ env.RESULTS_FILE }} test-results/ 2>/dev/null || true
# Display results
if [ -f "test-results/test-results.json" ]; then
echo "Test Results:"
cat test-results/test-results.json | python3 -m json.tool || cat test-results/test-results.json
fi
echo "::endgroup::"
- name: Upload Test Results
if: always()
uses: actions/upload-artifact@v4
with:
name: slurm-test-results
path: test-results/
retention-days: 30
- name: Cleanup Slurm Resources
if: always()
run: |
echo "::group::Cleaning up Slurm Resources"
# Cancel job if still running
if [ -n "${{ env.SLURM_JOB_ID }}" ]; then
ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} "scancel ${{ env.SLURM_JOB_ID }}" 2>/dev/null || true
fi
echo "Cleanup completed"
echo "::endgroup::"
- name: Final Status Check
run: |
if [ -f "test-results/test-results.json" ]; then
STATUS=$(python3 -c "import json; data=json.load(open('test-results/test-results.json')); print(data.get('status', 'UNKNOWN'))")
if [ "$STATUS" == "FAILED" ]; then
echo "❌ Tests failed"
exit 1
elif [ "$STATUS" == "PASSED" ]; then
echo "✅ All tests passed"
else
echo "⚠️ Tests status: $STATUS"
fi
else
echo "❌ No test results found"
exit 1
fi
notify-on-failure:
name: Send Failure Notification
runs-on: ubuntu-latest
needs: [code-review, security-scan, version-check, slurm-test]
if: failure()
steps:
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: ${{ env.AWS_ROLE_ARN }}
aws-region: ${{ env.AWS_REGION }}
- name: Send Email Notification
run: |
# Get the email from GitHub secret to avoid exposing it in logs
NOTIFICATION_EMAIL="${{ secrets.NOTIFICATION_EMAIL }}"
if [ -z "$NOTIFICATION_EMAIL" ]; then
echo "No notification email configured"
exit 0
fi
# Determine which job failed
FAILED_JOBS=""
if [ "${{ needs.code-review.result }}" == "failure" ]; then
FAILED_JOBS="$FAILED_JOBS- Code Review
"
fi
if [ "${{ needs.security-scan.result }}" == "failure" ]; then
FAILED_JOBS="$FAILED_JOBS- Security Scan
"
fi
if [ "${{ needs.version-check.result }}" == "failure" ]; then
FAILED_JOBS="$FAILED_JOBS- Version Check
"
fi
if [ "${{ needs.slurm-test.result }}" == "failure" ]; then
FAILED_JOBS="$FAILED_JOBS- Slurm Test
"
fi
# Send email using AWS SES
aws ses send-email \
--from "github-actions@aws.dev" \
--to "$NOTIFICATION_EMAIL" \
--subject "PR #${{ github.event.pull_request.number }} - Workflow Failed" \
--text "Pull Request #${{ github.event.pull_request.number }} has failed workflow checks.
Repository: ${{ github.repository }}
PR Title: ${{ github.event.pull_request.title }}
Author: ${{ github.event.pull_request.user.login }}
Branch: ${{ github.event.pull_request.head.ref }}
Failed Jobs:
$FAILED_JOBS
View details: ${{ github.event.pull_request.html_url }}
Workflow Run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
- name: Fallback Notification
if: failure()
run: |
echo "::warning::Workflow failed for PR #${{ github.event.pull_request.number }}"
echo "Failed jobs may require manual review"