# Update Slurm workflows to use GitHub-hosted runners with SSH (#953)
# CI workflow: on every PR against main, run static code review, security
# scanning and dependency-version checks, then execute integration tests on a
# remote Slurm cluster over SSH.
name: PR Review and Slurm Test

on:
  pull_request:
    branches:
      - main
    types: [opened, synchronize, reopened]

env:
  AWS_REGION: us-east-1
  SLURM_HOST: p5en.smml.aiml.aws.dev
  SLURM_USER: ghactions
  RESULTS_PATH: /fsx/agents/pr-reviews/awsome-distributed-training
  AWS_ROLE_ARN: arn:aws:iam::159553542841:role/awslabs-AOSH-GitHubActionsRole

# id-token: write is required for OIDC federation into AWS_ROLE_ARN.
permissions:
  id-token: write
  contents: read
  pull-requests: read

jobs:
| code-review: | ||
| name: Code Review and Analysis | ||
| runs-on: ubuntu-latest | ||
| timeout-minutes: 30 | ||
| steps: | ||
| - name: Checkout PR Code | ||
| uses: actions/checkout@v4 | ||
| with: | ||
| fetch-depth: 0 | ||
| ref: ${{ github.event.pull_request.head.sha }} | ||
| - name: Set up Python | ||
| uses: actions/setup-python@v5 | ||
| with: | ||
| python-version: '3.11' | ||
| - name: Install Code Analysis Tools | ||
| run: | | ||
| pip install pylint flake8 bandit semgrep | ||
| npm install -g @microsoft/eslint-formatter-sarif | ||
| - name: Run Static Analysis | ||
| id: static-analysis | ||
| run: | | ||
| echo "::group::Running Static Analysis" | ||
| # Create results directory | ||
| mkdir -p review-results | ||
| # Python linting | ||
| if find . -name "*.py" -type f | grep -q .; then | ||
| echo "Checking Python files..." | ||
| pylint --output-format=json $(find . -name "*.py" -type f) > review-results/pylint-results.json 2>/dev/null || true | ||
| flake8 --format=json --output-file=review-results/flake8-results.json . 2>/dev/null || true | ||
| fi | ||
| # Shell script checking | ||
| if find . -name "*.sh" -type f | grep -q .; then | ||
| echo "Checking Shell scripts..." | ||
| for script in $(find . -name "*.sh" -type f); do | ||
| bash -n "$script" 2>&1 | tee -a review-results/shell-check.log || true | ||
| done | ||
| fi | ||
| # Check for common errors | ||
| echo "Checking for common issues..." | ||
| # Check for hardcoded secrets | ||
| if grep -r -i "password\|secret\|token\|key" --include="*.py" --include="*.sh" --include="*.json" . | grep -v "example\|test\|mock" | grep -E "(=|:).*[a-zA-Z0-9]{20,}" > review-results/potential-secrets.log 2>/dev/null; then | ||
| echo "⚠️ Potential hardcoded secrets found" >> review-results/issues.log | ||
| fi | ||
| # Check for syntax errors in Python | ||
| python -m py_compile $(find . -name "*.py" -type f) 2>&1 | tee review-results/python-syntax-errors.log || true | ||
| echo "::endgroup::" | ||
| - name: Check for Incompatibilities | ||
| id: compatibility-check | ||
| run: | | ||
| echo "::group::Checking for Incompatibilities" | ||
| # Check for API compatibility issues | ||
| if [ -f "requirements.txt" ]; then | ||
| echo "Checking requirements.txt for version conflicts..." | ||
| pip install --dry-run -r requirements.txt 2>&1 | tee review-results/pip-conflicts.log || true | ||
| fi | ||
| # Check for deprecated functions | ||
| if find . -name "*.py" -type f | grep -q .; then | ||
| grep -r "deprecated\|DeprecationWarning" --include="*.py" . > review-results/deprecation-warnings.log 2>/dev/null || true | ||
| fi | ||
| # Check Dockerfile syntax | ||
| if [ -f "Dockerfile" ]; then | ||
| echo "Checking Dockerfile..." | ||
| docker build --dry-run -t test-build . 2>&1 | tee review-results/dockerfile-check.log || true | ||
| fi | ||
| echo "::endgroup::" | ||
| - name: Upload Review Results | ||
| uses: actions/upload-artifact@v4 | ||
| with: | ||
| name: code-review-results | ||
| path: review-results/ | ||
| retention-days: 30 | ||
| security-scan: | ||
| name: Security Best Practices Review | ||
| runs-on: ubuntu-latest | ||
| timeout-minutes: 30 | ||
| steps: | ||
| - name: Checkout PR Code | ||
| uses: actions/checkout@v4 | ||
| with: | ||
| fetch-depth: 0 | ||
| ref: ${{ github.event.pull_request.head.sha }} | ||
| - name: Run Security Scans | ||
| id: security-scan | ||
| run: | | ||
| echo "::group::Running Security Scans" | ||
| mkdir -p security-results | ||
| # Bandit for Python security issues | ||
| if find . -name "*.py" -type f | grep -q .; then | ||
| pip install bandit | ||
| bandit -r . -f json -o security-results/bandit-results.json || true | ||
| bandit -r . -f txt -o security-results/bandit-report.txt || true | ||
| fi | ||
| # Check for hardcoded credentials | ||
| if command -v trufflehog &> /dev/null; then | ||
| trufflehog filesystem . --json > security-results/trufflehog-results.json 2>/dev/null || true | ||
| else | ||
| # Manual check for common patterns | ||
| grep -r -E "(password|passwd|pwd)\s*=\s*[\"'][^\"']{8,}[\"']" --include="*.py" --include="*.sh" --include="*.json" --include="*.yaml" --include="*.yml" . > security-results/credentials-check.log 2>/dev/null || true | ||
| grep -r -E "(api_key|apikey|api-key)\s*=\s*[\"'][^\"']{10,}[\"']" --include="*.py" --include="*.sh" --include="*.json" --include="*.yaml" --include="*.yml" . >> security-results/credentials-check.log 2>/dev/null || true | ||
| grep -r -E "(secret|token)\s*=\s*[\"'][^\"']{15,}[\"']" --include="*.py" --include="*.sh" --include="*.json" --include="*.yaml" --include="*.yml" . >> security-results/credentials-check.log 2>/dev/null || true | ||
| fi | ||
| # Check for insecure configurations | ||
| echo "Checking for insecure configurations..." | ||
| # Check for HTTP instead of HTTPS | ||
| grep -r "http://" --include="*.py" --include="*.sh" --include="*.json" --include="*.yaml" --include="*.yml" --include="*.tf" . | grep -v "localhost\|127.0.0.1\|example.com" > security-results/insecure-http.log 2>/dev/null || true | ||
| # Check for overly permissive file permissions in scripts | ||
| if [ -f "install.sh" ]; then | ||
| if grep -E "chmod.*777|chmod.*a\+rw" install.sh > /dev/null 2>&1; then | ||
| echo "⚠️ Overly permissive file permissions found in install.sh" >> security-results/permission-issues.log | ||
| fi | ||
| fi | ||
| # Check for eval/exec of user input | ||
| grep -r "eval\|exec" --include="*.py" --include="*.sh" . | grep -v "# " | head -20 > security-results/code-execution-risks.log 2>/dev/null || true | ||
| # Check Dockerfile security | ||
| if [ -f "Dockerfile" ]; then | ||
| # Check for running as root | ||
| if ! grep -q "USER" Dockerfile; then | ||
| echo "⚠️ Dockerfile does not specify USER - container runs as root" >> security-results/dockerfile-security.log | ||
| fi | ||
| # Check for latest tag usage | ||
| if grep -E "FROM.*:latest" Dockerfile > /dev/null 2>&1; then | ||
| echo "⚠️ Dockerfile uses 'latest' tag - use specific versions" >> security-results/dockerfile-security.log | ||
| fi | ||
| # Check for ADD vs COPY | ||
| if grep -E "^ADD" Dockerfile > /dev/null 2>&1; then | ||
| echo "⚠️ Dockerfile uses ADD - prefer COPY for better security" >> security-results/dockerfile-security.log | ||
| fi | ||
| fi | ||
| echo "::endgroup::" | ||
| - name: Generate Security Report | ||
| run: | | ||
| echo "::group::Generating Security Report" | ||
| cat > security-results/security-report.md << 'EOF' | ||
| # Security Review Report | ||
| ## Summary | ||
| EOF | ||
| # Count issues | ||
| CRITICAL=0 | ||
| HIGH=0 | ||
| MEDIUM=0 | ||
| LOW=0 | ||
| if [ -f "security-results/bandit-results.json" ]; then | ||
| CRITICAL=$(jq '[.results[] | select(.issue_severity == "CRITICAL")] | length' security-results/bandit-results.json 2>/dev/null || echo 0) | ||
| HIGH=$(jq '[.results[] | select(.issue_severity == "HIGH")] | length' security-results/bandit-results.json 2>/dev/null || echo 0) | ||
| MEDIUM=$(jq '[.results[] | select(.issue_severity == "MEDIUM")] | length' security-results/bandit-results.json 2>/dev/null || echo 0) | ||
| LOW=$(jq '[.results[] | select(.issue_severity == "LOW")] | length' security-results/bandit-results.json 2>/dev/null || echo 0) | ||
| fi | ||
| cat >> security-results/security-report.md << EOF | ||
| - Critical Issues: $CRITICAL | ||
| - High Issues: $HIGH | ||
| - Medium Issues: $MEDIUM | ||
| - Low Issues: $LOW | ||
| ## Detailed Findings | ||
| EOF | ||
| if [ -f "security-results/bandit-report.txt" ]; then | ||
| cat >> security-results/security-report.md << EOF | ||
| ### Bandit Security Scan | ||
| \`\`\` | ||
| $(cat security-results/bandit-report.txt) | ||
| \`\`\` | ||
| EOF | ||
| fi | ||
| if [ -f "security-results/credentials-check.log" ] && [ -s "security-results/credentials-check.log" ]; then | ||
| cat >> security-results/security-report.md << EOF | ||
| ### Potential Hardcoded Credentials | ||
| \`\`\` | ||
| $(cat security-results/credentials-check.log) | ||
| \`\`\` | ||
| EOF | ||
| fi | ||
| if [ -f "security-results/dockerfile-security.log" ]; then | ||
| cat >> security-results/security-report.md << EOF | ||
| ### Dockerfile Security Issues | ||
| \`\`\` | ||
| $(cat security-results/dockerfile-security.log) | ||
| \`\`\` | ||
| EOF | ||
| fi | ||
| echo "::endgroup::" | ||
| - name: Upload Security Results | ||
| uses: actions/upload-artifact@v4 | ||
| with: | ||
| name: security-scan-results | ||
| path: security-results/ | ||
| retention-days: 30 | ||
| - name: Check Security Gate | ||
| run: | | ||
| if [ -f "security-results/bandit-results.json" ]; then | ||
| CRITICAL=$(jq '[.results[] | select(.issue_severity == "CRITICAL")] | length' security-results/bandit-results.json 2>/dev/null || echo 0) | ||
| if [ "$CRITICAL" -gt 0 ]; then | ||
| echo "❌ Critical security issues found!" | ||
| exit 1 | ||
| fi | ||
| fi | ||
| echo "✅ No critical security issues found" | ||
| version-check: | ||
| name: Version Requirements Check | ||
| runs-on: ubuntu-latest | ||
| timeout-minutes: 15 | ||
| steps: | ||
| - name: Checkout PR Code | ||
| uses: actions/checkout@v4 | ||
| with: | ||
| fetch-depth: 0 | ||
| ref: ${{ github.event.pull_request.head.sha }} | ||
| - name: Check Driver and Library Versions | ||
| id: version-check | ||
| run: | | ||
| echo "::group::Checking Version Requirements" | ||
| mkdir -p version-results | ||
| # Define minimum versions | ||
| MIN_EFA="1.47.0" | ||
| MIN_NCCL="2.28" | ||
| MIN_CUDA="13.0" | ||
| echo "Minimum required versions:" > version-results/version-report.txt | ||
| echo " EFA Installer: $MIN_EFA" >> version-results/version-report.txt | ||
| echo " NCCL: $MIN_NCCL" >> version-results/version-report.txt | ||
| echo " CUDA: $MIN_CUDA" >> version-results/version-report.txt | ||
| echo "" >> version-results/version-report.txt | ||
| VIOLATIONS=0 | ||
| # Check Dockerfile | ||
| if [ -f "Dockerfile" ]; then | ||
| echo "Checking Dockerfile..." >> version-results/version-report.txt | ||
| # Check for EFA | ||
| if grep -i "efa" Dockerfile > /dev/null 2>&1; then | ||
| EFA_VERSION=$(grep -i "efa" Dockerfile | grep -oE "[0-9]+\.[0-9]+\.[0-9]+" | head -1) | ||
| if [ -n "$EFA_VERSION" ]; then | ||
| echo " Found EFA version: $EFA_VERSION" >> version-results/version-report.txt | ||
| if [ "$(printf '%s\n' "$MIN_EFA" "$EFA_VERSION" | sort -V | head -n1)" != "$MIN_EFA" ]; then | ||
| echo " ❌ EFA version $EFA_VERSION is below minimum $MIN_EFA" >> version-results/version-report.txt | ||
| VIOLATIONS=$((VIOLATIONS + 1)) | ||
| else | ||
| echo " ✅ EFA version $EFA_VERSION meets requirement" >> version-results/version-report.txt | ||
| fi | ||
| fi | ||
| fi | ||
| # Check for NCCL | ||
| if grep -i "nccl" Dockerfile > /dev/null 2>&1; then | ||
| NCCL_VERSION=$(grep -i "nccl" Dockerfile | grep -oE "[0-9]+\.[0-9]+(\.[0-9]+)?" | head -1) | ||
| if [ -n "$NCCL_VERSION" ]; then | ||
| echo " Found NCCL version: $NCCL_VERSION" >> version-results/version-report.txt | ||
| if [ "$(printf '%s\n' "$MIN_NCCL" "$NCCL_VERSION" | sort -V | head -n1)" != "$MIN_NCCL" ]; then | ||
| echo " ❌ NCCL version $NCCL_VERSION is below minimum $MIN_NCCL" >> version-results/version-report.txt | ||
| VIOLATIONS=$((VIOLATIONS + 1)) | ||
| else | ||
| echo " ✅ NCCL version $NCCL_VERSION meets requirement" >> version-results/version-report.txt | ||
| fi | ||
| fi | ||
| fi | ||
| # Check for CUDA | ||
| if grep -i "cuda" Dockerfile > /dev/null 2>&1; then | ||
| CUDA_VERSION=$(grep -i "cuda" Dockerfile | grep -oE "[0-9]+\.[0-9]+" | head -1) | ||
| if [ -n "$CUDA_VERSION" ]; then | ||
| echo " Found CUDA version: $CUDA_VERSION" >> version-results/version-report.txt | ||
| if [ "$(printf '%s\n' "$MIN_CUDA" "$CUDA_VERSION" | sort -V | head -n1)" != "$MIN_CUDA" ]; then | ||
| echo " ❌ CUDA version $CUDA_VERSION is below minimum $MIN_CUDA" >> version-results/version-report.txt | ||
| VIOLATIONS=$((VIOLATIONS + 1)) | ||
| else | ||
| echo " ✅ CUDA version $CUDA_VERSION meets requirement" >> version-results/version-report.txt | ||
| fi | ||
| fi | ||
| fi | ||
| fi | ||
| # Check requirements.txt | ||
| if [ -f "requirements.txt" ]; then | ||
| echo "" >> version-results/version-report.txt | ||
| echo "Checking requirements.txt..." >> version-results/version-report.txt | ||
| # Check for relevant packages | ||
| if grep -i "nvidia\|cuda\|cupy" requirements.txt > /dev/null 2>&1; then | ||
| grep -i "nvidia\|cuda\|cupy" requirements.txt >> version-results/version-report.txt | ||
| fi | ||
| fi | ||
| # Check sbatch scripts | ||
| for script in $(find . -name "*.sbatch" -o -name "*.sh" | xargs grep -l "sbatch\|srun" 2>/dev/null); do | ||
| echo "" >> version-results/version-report.txt | ||
| echo "Checking sbatch script: $script" >> version-results/version-report.txt | ||
| # Check for module loads | ||
| if grep -E "module load|module use" "$script" > /dev/null 2>&1; then | ||
| grep -E "module load|module use" "$script" >> version-results/version-report.txt | ||
| # Check for specific module versions | ||
| if grep -i "efa" "$script" | grep -oE "[0-9]+\.[0-9]+" > /dev/null 2>&1; then | ||
| EFA_MOD=$(grep -i "efa" "$script" | grep -oE "[0-9]+\.[0-9]+" | head -1) | ||
| if [ -n "$EFA_MOD" ]; then | ||
| if [ "$(printf '%s\n' "$MIN_EFA" "$EFA_MOD" | sort -V | head -n1)" != "$MIN_EFA" ]; then | ||
| echo " ❌ EFA module version $EFA_MOD is below minimum $MIN_EFA" >> version-results/version-report.txt | ||
| VIOLATIONS=$((VIOLATIONS + 1)) | ||
| fi | ||
| fi | ||
| fi | ||
| fi | ||
| done | ||
| # Check for environment configuration files | ||
| if [ -f "environment.yml" ]; then | ||
| echo "" >> version-results/version-report.txt | ||
| echo "Checking environment.yml..." >> version-results/version-report.txt | ||
| grep -E "cuda|nccl|efa" environment.yml >> version-results/version-report.txt 2>/dev/null || true | ||
| fi | ||
| if [ -f "pyproject.toml" ]; then | ||
| echo "" >> version-results/version-report.txt | ||
| echo "Checking pyproject.toml..." >> version-results/version-report.txt | ||
| grep -E "cuda|nccl|efa" pyproject.toml >> version-results/version-report.txt 2>/dev/null || true | ||
| fi | ||
| # Create JSON report | ||
| cat > version-results/version-check.json << EOF | ||
| { | ||
| "timestamp": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")", | ||
| "pr_number": "${{ github.event.pull_request.number }}", | ||
| "requirements": { | ||
| "efa_minimum": "$MIN_EFA", | ||
| "nccl_minimum": "$MIN_NCCL", | ||
| "cuda_minimum": "$MIN_CUDA" | ||
| }, | ||
| "violations": $VIOLATIONS, | ||
| "status": "$([ $VIOLATIONS -eq 0 ] && echo "PASS" || echo "FAIL")" | ||
| } | ||
| EOF | ||
| echo "" >> version-results/version-report.txt | ||
| echo "Total violations: $VIOLATIONS" >> version-results/version-report.txt | ||
| cat version-results/version-report.txt | ||
| echo "::endgroup::" | ||
| if [ $VIOLATIONS -gt 0 ]; then | ||
| echo "❌ Version requirements not met" | ||
| exit 1 | ||
| fi | ||
| - name: Upload Version Check Results | ||
| uses: actions/upload-artifact@v4 | ||
| with: | ||
| name: version-check-results | ||
| path: version-results/ | ||
| retention-days: 30 | ||
| slurm-test: | ||
| name: Slurm Cluster Testing | ||
| runs-on: ubuntu-latest | ||
| timeout-minutes: 130 | ||
| needs: [code-review, security-scan, version-check] | ||
| if: ${{ always() && needs.code-review.result == 'success' && needs.security-scan.result == 'success' && needs.version-check.result == 'success' }} | ||
| steps: | ||
| - name: Checkout PR Code | ||
| uses: actions/checkout@v4 | ||
| with: | ||
| fetch-depth: 0 | ||
| ref: ${{ github.event.pull_request.head.sha }} | ||
| - name: Configure AWS Credentials | ||
| uses: aws-actions/configure-aws-credentials@v4 | ||
| with: | ||
| role-to-assume: ${{ env.AWS_ROLE_ARN }} | ||
| aws-region: ${{ env.AWS_REGION }} | ||
| - name: Setup SSH Key | ||
| run: | | ||
| mkdir -p ~/.ssh | ||
| echo "${{ secrets.SLURM_SSH_KEY }}" > ~/.ssh/slurm_key | ||
| chmod 600 ~/.ssh/slurm_key | ||
| ssh-keyscan -H ${{ env.SLURM_HOST }} >> ~/.ssh/known_hosts 2>/dev/null || true | ||
| - name: Prepare Test Environment | ||
| run: | | ||
| echo "::group::Preparing Test Environment" | ||
| # Create test directory name | ||
| TEST_DIR="pr-${{ github.event.pull_request.number }}-$(date +%Y%m%d-%H%M%S)" | ||
| echo "TEST_DIR=$TEST_DIR" >> $GITHUB_ENV | ||
| echo "RESULTS_FILE=${{ github.event.pull_request.number }}-$(date +%Y%m%d)-results.json" >> $GITHUB_ENV | ||
| # Create local test directory | ||
| mkdir -p test-artifacts | ||
| # Copy PR code to test directory | ||
| cp -r . test-artifacts/source-code | ||
| echo "Test directory: $TEST_DIR" | ||
| echo "Results file: ${{ github.event.pull_request.number }}-$(date +%Y%m%d)-results.json" | ||
| echo "::endgroup::" | ||
| - name: Transfer Code to Slurm Cluster | ||
| run: | | ||
| echo "::group::Transferring Code to Slurm Cluster" | ||
| # Create remote directory | ||
| ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} "mkdir -p ${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}" | ||
| # Transfer code | ||
| scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -r test-artifacts/source-code/* ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/ | ||
| echo "Code transferred successfully" | ||
| echo "::endgroup::" | ||
| - name: Execute Tests on Slurm | ||
| id: slurm-test | ||
| timeout-minutes: 120 | ||
| run: | | ||
| echo "::group::Executing Tests on Slurm Cluster" | ||
| # Create test execution script | ||
| cat > test-artifacts/run-tests.sh << 'TESTSCRIPT' | ||
| #!/bin/bash | ||
| #SBATCH --job-name=pr-test-${{ github.event.pull_request.number }} | ||
| #SBATCH --nodes=8 | ||
| #SBATCH --ntasks-per-node=1 | ||
| #SBATCH --time=02:00:00 | ||
| #SBATCH --output=${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/slurm-%j.out | ||
| #SBATCH --error=${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/slurm-%j.err | ||
| set -e | ||
| cd ${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }} | ||
| # Initialize results JSON | ||
| cat > test-results.json << 'EOF' | ||
| { | ||
| "pr_number": "${{ github.event.pull_request.number }}", | ||
| "test_date": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")", | ||
| "slurm_job_id": "$SLURM_JOB_ID", | ||
| "nodes_used": "$SLURM_NNODES", | ||
| "tests": {}, | ||
| "status": "RUNNING" | ||
| } | ||
| EOF | ||
| # Check for README.md and follow instructions | ||
| if [ -f "README.md" ]; then | ||
| echo "Found README.md - following test instructions" | ||
| fi | ||
| # Run unit tests if available | ||
| echo "Running unit tests..." | ||
| START_TIME=$(date +%s) | ||
| if [ -f "requirements.txt" ]; then | ||
| pip install -r requirements.txt --user || true | ||
| fi | ||
| if [ -f "setup.py" ] || [ -f "pyproject.toml" ]; then | ||
| pip install -e . --user || true | ||
| fi | ||
| UNIT_TEST_OUTPUT="" | ||
| UNIT_TEST_STATUS="SKIPPED" | ||
| if [ -f "pytest.ini" ] || [ -d "tests" ] || find . -name "test_*.py" -type f | grep -q .; then | ||
| pip install pytest --user || true | ||
| if python3 -m pytest --tb=short -v > unit-test-output.log 2>&1; then | ||
| UNIT_TEST_STATUS="PASSED" | ||
| UNIT_TEST_OUTPUT=$(cat unit-test-output.log) | ||
| else | ||
| UNIT_TEST_STATUS="FAILED" | ||
| UNIT_TEST_OUTPUT=$(cat unit-test-output.log) | ||
| fi | ||
| elif [ -f "Makefile" ] && grep -q "test" Makefile; then | ||
| if make test > unit-test-output.log 2>&1; then | ||
| UNIT_TEST_STATUS="PASSED" | ||
| UNIT_TEST_OUTPUT=$(cat unit-test-output.log) | ||
| else | ||
| UNIT_TEST_STATUS="FAILED" | ||
| UNIT_TEST_OUTPUT=$(cat unit-test-output.log) | ||
| fi | ||
| fi | ||
| END_TIME=$(date +%s) | ||
| DURATION=$((END_TIME - START_TIME)) | ||
| # Update results | ||
| python3 << PYEOF | ||
| import json | ||
| with open('test-results.json', 'r') as f: | ||
| data = json.load(f) | ||
| data['tests']['unit_tests'] = { | ||
| 'status': '$UNIT_TEST_STATUS', | ||
| 'duration_seconds': $DURATION, | ||
| 'output': """$UNIT_TEST_OUTPUT""" | ||
| } | ||
| with open('test-results.json', 'w') as f: | ||
| json.dump(data, f, indent=2) | ||
| PYEOF | ||
| # Run execution tests | ||
| echo "Running execution tests..." | ||
| START_TIME=$(date +%s) | ||
| EXEC_TEST_OUTPUT="" | ||
| EXEC_TEST_STATUS="SKIPPED" | ||
| # Check for execution test scripts | ||
| if [ -f "execute.py" ]; then | ||
| if python3 execute.py > exec-test-output.log 2>&1; then | ||
| EXEC_TEST_STATUS="PASSED" | ||
| EXEC_TEST_OUTPUT=$(cat exec-test-output.log) | ||
| else | ||
| EXEC_TEST_STATUS="FAILED" | ||
| EXEC_TEST_OUTPUT=$(cat exec-test-output.log) | ||
| fi | ||
| elif [ -f "run.sh" ]; then | ||
| chmod +x run.sh | ||
| if ./run.sh > exec-test-output.log 2>&1; then | ||
| EXEC_TEST_STATUS="PASSED" | ||
| EXEC_TEST_OUTPUT=$(cat exec-test-output.log) | ||
| else | ||
| EXEC_TEST_STATUS="FAILED" | ||
| EXEC_TEST_OUTPUT=$(cat exec-test-output.log) | ||
| fi | ||
| elif [ -f "main.py" ]; then | ||
| if python3 main.py > exec-test-output.log 2>&1; then | ||
| EXEC_TEST_STATUS="PASSED" | ||
| EXEC_TEST_OUTPUT=$(cat exec-test-output.log) | ||
| else | ||
| EXEC_TEST_STATUS="FAILED" | ||
| EXEC_TEST_OUTPUT=$(cat exec-test-output.log) | ||
| fi | ||
| fi | ||
| END_TIME=$(date +%s) | ||
| DURATION=$((END_TIME - START_TIME)) | ||
| # Update results | ||
| python3 << PYEOF | ||
| import json | ||
| with open('test-results.json', 'r') as f: | ||
| data = json.load(f) | ||
| data['tests']['execution_tests'] = { | ||
| 'status': '$EXEC_TEST_STATUS', | ||
| 'duration_seconds': $DURATION, | ||
| 'output': """$EXEC_TEST_OUTPUT""" | ||
| } | ||
| # Determine overall status | ||
| overall_status = "PASSED" | ||
| for test_name, test_data in data['tests'].items(): | ||
| if test_data['status'] == 'FAILED': | ||
| overall_status = 'FAILED' | ||
| break | ||
| elif test_data['status'] == 'SKIPPED': | ||
| overall_status = 'PARTIAL' | ||
| data['status'] = overall_status | ||
| data['completion_time'] = "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" | ||
| with open('test-results.json', 'w') as f: | ||
| json.dump(data, f, indent=2) | ||
| PYEOF | ||
| # Copy results to final location | ||
| cp test-results.json ${{ env.RESULTS_PATH }}/${{ env.RESULTS_FILE }} | ||
| echo "Tests completed. Results saved to: ${{ env.RESULTS_PATH }}/${{ env.RESULTS_FILE }}" | ||
| # Exit with error if tests failed | ||
| if [ "$UNIT_TEST_STATUS" == "FAILED" ] || [ "$EXEC_TEST_STATUS" == "FAILED" ]; then | ||
| exit 1 | ||
| fi | ||
| TESTSCRIPT | ||
| chmod +x test-artifacts/run-tests.sh | ||
| # Transfer and submit job | ||
| scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no test-artifacts/run-tests.sh ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/ | ||
| # Submit job and capture job ID | ||
| JOB_ID=$(ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} "cd ${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }} && sbatch run-tests.sh" | grep -oE '[0-9]+') | ||
| echo "Submitted Slurm job: $JOB_ID" | ||
| echo "SLURM_JOB_ID=$JOB_ID" >> $GITHUB_ENV | ||
| # Wait for job completion with timeout | ||
| echo "Waiting for job completion (timeout: 120 minutes)..." | ||
| START_WAIT=$(date +%s) | ||
| while true; do | ||
| CURRENT_TIME=$(date +%s) | ||
| ELAPSED=$((CURRENT_TIME - START_WAIT)) | ||
| if [ $ELAPSED -gt 7200 ]; then | ||
| echo "Timeout reached - cancelling job" | ||
| ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} "scancel $JOB_ID" | ||
| exit 1 | ||
| fi | ||
| JOB_STATUS=$(ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} "squeue -j $JOB_ID -h -o %T" 2>/dev/null || echo "COMPLETED") | ||
| if [ -z "$JOB_STATUS" ] || [ "$JOB_STATUS" == "COMPLETED" ]; then | ||
| echo "Job completed" | ||
| break | ||
| elif [ "$JOB_STATUS" == "FAILED" ] || [ "$JOB_STATUS" == "CANCELLED" ] || [ "$JOB_STATUS" == "TIMEOUT" ]; then | ||
| echo "Job failed with status: $JOB_STATUS" | ||
| exit 1 | ||
| fi | ||
| echo "Job status: $JOB_STATUS (elapsed: $((ELAPSED / 60)) minutes)" | ||
| sleep 60 | ||
| done | ||
| echo "::endgroup::" | ||
| - name: Retrieve Test Results | ||
| if: always() | ||
| run: | | ||
| echo "::group::Retrieving Test Results" | ||
| # Create local results directory | ||
| mkdir -p test-results | ||
| # Copy results from Slurm cluster | ||
| scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/test-results.json test-results/ 2>/dev/null || true | ||
| scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/slurm-*.out test-results/ 2>/dev/null || true | ||
| scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.RESULTS_PATH }}/${{ env.TEST_DIR }}/slurm-*.err test-results/ 2>/dev/null || true | ||
| # Copy the final results file | ||
| scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.RESULTS_PATH }}/${{ env.RESULTS_FILE }} test-results/ 2>/dev/null || true | ||
| # Display results | ||
| if [ -f "test-results/test-results.json" ]; then | ||
| echo "Test Results:" | ||
| cat test-results/test-results.json | python3 -m json.tool || cat test-results/test-results.json | ||
| fi | ||
| echo "::endgroup::" | ||
| - name: Upload Test Results | ||
| if: always() | ||
| uses: actions/upload-artifact@v4 | ||
| with: | ||
| name: slurm-test-results | ||
| path: test-results/ | ||
| retention-days: 30 | ||
| - name: Cleanup Slurm Resources | ||
| if: always() | ||
| run: | | ||
| echo "::group::Cleaning up Slurm Resources" | ||
| # Cancel job if still running | ||
| if [ -n "${{ env.SLURM_JOB_ID }}" ]; then | ||
| ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} "scancel ${{ env.SLURM_JOB_ID }}" 2>/dev/null || true | ||
| fi | ||
| echo "Cleanup completed" | ||
| echo "::endgroup::" | ||
| - name: Final Status Check | ||
| run: | | ||
| if [ -f "test-results/test-results.json" ]; then | ||
| STATUS=$(python3 -c "import json; data=json.load(open('test-results/test-results.json')); print(data.get('status', 'UNKNOWN'))") | ||
| if [ "$STATUS" == "FAILED" ]; then | ||
| echo "❌ Tests failed" | ||
| exit 1 | ||
| elif [ "$STATUS" == "PASSED" ]; then | ||
| echo "✅ All tests passed" | ||
| else | ||
| echo "⚠️ Tests status: $STATUS" | ||
| fi | ||
| else | ||
| echo "❌ No test results found" | ||
| exit 1 | ||
| fi | ||
| notify-on-failure: | ||
| name: Send Failure Notification | ||
| runs-on: ubuntu-latest | ||
| needs: [code-review, security-scan, version-check, slurm-test] | ||
| if: failure() | ||
| steps: | ||
| - name: Configure AWS Credentials | ||
| uses: aws-actions/configure-aws-credentials@v4 | ||
| with: | ||
| role-to-assume: ${{ env.AWS_ROLE_ARN }} | ||
| aws-region: ${{ env.AWS_REGION }} | ||
| - name: Send Email Notification | ||
| run: | | ||
| # Get the email from GitHub secret to avoid exposing it in logs | ||
| NOTIFICATION_EMAIL="${{ secrets.NOTIFICATION_EMAIL }}" | ||
| if [ -z "$NOTIFICATION_EMAIL" ]; then | ||
| echo "No notification email configured" | ||
| exit 0 | ||
| fi | ||
| # Determine which job failed | ||
| FAILED_JOBS="" | ||
| if [ "${{ needs.code-review.result }}" == "failure" ]; then | ||
| FAILED_JOBS="$FAILED_JOBS- Code Review | ||
| " | ||
| fi | ||
| if [ "${{ needs.security-scan.result }}" == "failure" ]; then | ||
| FAILED_JOBS="$FAILED_JOBS- Security Scan | ||
| " | ||
| fi | ||
| if [ "${{ needs.version-check.result }}" == "failure" ]; then | ||
| FAILED_JOBS="$FAILED_JOBS- Version Check | ||
| " | ||
| fi | ||
| if [ "${{ needs.slurm-test.result }}" == "failure" ]; then | ||
| FAILED_JOBS="$FAILED_JOBS- Slurm Test | ||
| " | ||
| fi | ||
| # Send email using AWS SES | ||
| aws ses send-email \ | ||
| --from "github-actions@aws.dev" \ | ||
| --to "$NOTIFICATION_EMAIL" \ | ||
| --subject "PR #${{ github.event.pull_request.number }} - Workflow Failed" \ | ||
| --text "Pull Request #${{ github.event.pull_request.number }} has failed workflow checks. | ||
| Repository: ${{ github.repository }} | ||
| PR Title: ${{ github.event.pull_request.title }} | ||
| Author: ${{ github.event.pull_request.user.login }} | ||
| Branch: ${{ github.event.pull_request.head.ref }} | ||
| Failed Jobs: | ||
| $FAILED_JOBS | ||
| View details: ${{ github.event.pull_request.html_url }} | ||
| Workflow Run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" | ||
| - name: Fallback Notification | ||
| if: failure() | ||
| run: | | ||
| echo "::warning::Workflow failed for PR #${{ github.event.pull_request.number }}" | ||
| echo "Failed jobs may require manual review" | ||