Update Slurm workflows to use GitHub-hosted runners with SSH (#953) #97
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: FSDP Regression Test (container)
# TODO: Additional configuration options as inputs (egs. number of nodes (auto change num GPUs and EFA variables accordingly), add support for g* instances etc)

# Trigger on changes to the FSDP test case, plus manual dispatch.
on:
  push:
    branches: [ "main" ]
    paths:
      - '3.test_cases/pytorch/FSDP/**'
  pull_request:
    paths:
      - '3.test_cases/pytorch/FSDP/**'
  workflow_dispatch:

# Shared connection/configuration values for all jobs.
env:
  AWS_REGION: us-east-1
  SLURM_HOST: p5en.smml.aiml.aws.dev
  SLURM_USER: ghactions
  AWS_ROLE_ARN: arn:aws:iam::159553542841:role/awslabs-AOSH-GitHubActionsRole
  BASE_PATH: /fsx/agents/pr-reviews
  HOME_PATH: /home/ghactions

# id-token: write is required for OIDC-based AWS credential exchange.
permissions:
  id-token: write
  contents: read
jobs:
  # Build the FSDP container on the Slurm cluster head node over SSH and
  # convert it to an enroot squashfs image for the test jobs to consume.
  build:
    strategy:
      fail-fast: true
      max-parallel: 3
      matrix:
        cluster: [p5, p5-smhp]
    runs-on: ubuntu-latest
    concurrency:
      group: ${{ github.workflow }}-${{ matrix.cluster }}-build
      cancel-in-progress: false
    timeout-minutes: 75
    # NOTE(review): with a matrix, each leg overwrites these job-level outputs,
    # so only the last-finishing cluster's values survive. run-tests does not
    # currently consume them (it recomputes the same paths) — confirm before
    # relying on these downstream.
    outputs:
      enroot_image_path: ${{ steps.build.outputs.enroot_image_path }}
      remote_build_path: ${{ steps.setup.outputs.remote_build_path }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          path: source-code

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ env.AWS_ROLE_ARN }}
          aws-region: ${{ env.AWS_REGION }}

      - name: Setup SSH Key
        id: setup
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SLURM_SSH_KEY }}" > ~/.ssh/slurm_key
          chmod 600 ~/.ssh/slurm_key
          # Add host to known hosts with retry (keyscan can be flaky right
          # after runner boot). Best-effort: later ssh calls also pass
          # StrictHostKeyChecking=no.
          for i in {1..5}; do
            if ssh-keyscan -H ${{ env.SLURM_HOST }} >> ~/.ssh/known_hosts 2>/dev/null; then
              echo "SSH keyscan successful"
              break
            fi
            echo "SSH keyscan attempt $i failed, retrying..."
            sleep 5
          done
          # Unique per-run, per-cluster build directory on the shared filesystem.
          REMOTE_BUILD_PATH="${{ env.BASE_PATH }}/container-builds/${{ github.run_id }}-${{ matrix.cluster }}"
          echo "remote_build_path=$REMOTE_BUILD_PATH" >> $GITHUB_OUTPUT
          echo "REMOTE_BUILD_PATH=$REMOTE_BUILD_PATH" >> $GITHUB_ENV

      - name: Transfer Code to Cluster
        run: |
          # Create remote directory
          ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
            "mkdir -p ${{ env.REMOTE_BUILD_PATH }}"
          # Transfer code with retry. Fail the step explicitly if every attempt
          # fails — previously the loop fell through and the step passed with
          # no code transferred.
          TRANSFERRED=false
          for i in {1..3}; do
            if scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 -r \
              source-code/* ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.REMOTE_BUILD_PATH }}/; then
              echo "Code transfer successful"
              TRANSFERRED=true
              break
            fi
            echo "Transfer attempt $i failed, retrying..."
            sleep 10
          done
          if [ "$TRANSFERRED" != "true" ]; then
            echo "Code transfer failed after 3 attempts" >&2
            exit 1
          fi

      - name: Build container on cluster
        id: build
        working-directory: source-code
        run: |
          FSDP_DIR="${{ env.REMOTE_BUILD_PATH }}/3.test_cases/pytorch/FSDP"
          ENROOT_IMAGE="${{ env.BASE_PATH }}/enroot-images/fsdp-${{ github.run_id }}-${{ matrix.cluster }}.sqsh"
          echo "Building FSDP image on cluster..."
          # Unquoted heredoc: $FSDP_DIR and $ENROOT_IMAGE expand on the runner
          # before the script is sent to the remote host.
          ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
          set -e
          cd $FSDP_DIR
          echo "Building Docker image..."
          docker build -t fsdp:pytorch .
          echo "Converting to enroot image..."
          mkdir -p ${{ env.BASE_PATH }}/enroot-images
          enroot import -o $ENROOT_IMAGE dockerd://fsdp:pytorch
          echo "Enroot image created at: $ENROOT_IMAGE"
          # Clean up Docker image to save space
          docker rmi fsdp:pytorch || true
          EOF
          echo "enroot_image_path=$ENROOT_IMAGE" >> $GITHUB_OUTPUT
          echo "Container build completed successfully!"

      - name: Cleanup Build Directory
        if: always()
        run: |
          # Best-effort: never fail the job on cleanup.
          ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
            "rm -rf ${{ env.REMOTE_BUILD_PATH }}" || true
| run-tests: | ||
| needs: build | ||
| strategy: | ||
| fail-fast: false | ||
| max-parallel: 6 | ||
| matrix: | ||
| cluster: [p5, p5-smhp] | ||
| model_config: [llama2_7b, llama2_13b, llama2_70b, llama3_1_8b, llama3_1_70b] | ||
| runs-on: ubuntu-latest | ||
| concurrency: | ||
| group: ${{ github.workflow }}-${{ matrix.cluster }}-${{ matrix.model_config }} | ||
| cancel-in-progress: false | ||
| timeout-minutes: 375 | ||
| steps: | ||
| - name: Checkout code | ||
| uses: actions/checkout@v4 | ||
| with: | ||
| path: source-code | ||
| - name: Configure AWS Credentials | ||
| uses: aws-actions/configure-aws-credentials@v4 | ||
| with: | ||
| role-to-assume: ${{ env.AWS_ROLE_ARN }} | ||
| aws-region: ${{ env.AWS_REGION }} | ||
| - name: Setup SSH Key | ||
| run: | | ||
| mkdir -p ~/.ssh | ||
| echo "${{ secrets.SLURM_SSH_KEY }}" > ~/.ssh/slurm_key | ||
| chmod 600 ~/.ssh/slurm_key | ||
| for i in {1..5}; do | ||
| if ssh-keyscan -H ${{ env.SLURM_HOST }} >> ~/.ssh/known_hosts 2>/dev/null; then | ||
| break | ||
| fi | ||
| echo "SSH keyscan attempt $i failed, retrying..." | ||
| sleep 5 | ||
| done | ||
| - name: Setup Environment Variables | ||
| id: setup | ||
| run: | | ||
| BUILD_ID="${{ github.run_id }}" | ||
| REMOTE_TEST_PATH="${{ env.BASE_PATH }}/container-tests/${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}" | ||
| LOG_DIR="${{ env.HOME_PATH }}/regression-logs-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}" | ||
| CHECKPOINT_DIR="${{ env.BASE_PATH }}/checkpoints-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}" | ||
| ENROOT_IMAGE="${{ env.BASE_PATH }}/enroot-images/fsdp-${{ github.run_id }}-${{ matrix.cluster }}.sqsh" | ||
| echo "remote_test_path=$REMOTE_TEST_PATH" >> $GITHUB_OUTPUT | ||
| echo "log_dir=$LOG_DIR" >> $GITHUB_OUTPUT | ||
| echo "checkpoint_dir=$CHECKPOINT_DIR" >> $GITHUB_OUTPUT | ||
| echo "enroot_image=$ENROOT_IMAGE" >> $GITHUB_OUTPUT | ||
| echo "REMOTE_TEST_PATH=$REMOTE_TEST_PATH" >> $GITHUB_ENV | ||
| echo "LOG_DIR=$LOG_DIR" >> $GITHUB_ENV | ||
| echo "CHECKPOINT_DIR=$CHECKPOINT_DIR" >> $GITHUB_ENV | ||
| echo "ENROOT_IMAGE=$ENROOT_IMAGE" >> $GITHUB_ENV | ||
| - name: Create Remote Directories | ||
| run: | | ||
| ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \ | ||
| ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF | ||
| mkdir -p ${{ env.REMOTE_TEST_PATH }} | ||
| mkdir -p ${{ env.LOG_DIR }} | ||
| mkdir -p ${{ env.CHECKPOINT_DIR }} | ||
| chmod 755 ${{ env.LOG_DIR }} ${{ env.CHECKPOINT_DIR }} | ||
| EOF | ||
| - name: Transfer Code to Cluster | ||
| run: | | ||
| for i in {1..3}; do | ||
| if scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 -r \ | ||
| source-code/* ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.REMOTE_TEST_PATH }}/; then | ||
| echo "Code transfer successful" | ||
| break | ||
| fi | ||
| echo "Transfer attempt $i failed, retrying..." | ||
| sleep 10 | ||
| done | ||
| - name: Prepare and Submit Slurm Job | ||
| id: submit_job | ||
| env: | ||
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | ||
| run: | | ||
| FSDP_DIR="${{ env.REMOTE_TEST_PATH }}/3.test_cases/pytorch/FSDP" | ||
| SBATCH_FILE="slurm/${{ matrix.model_config }}-training.sbatch" | ||
| TMP_SBATCH="slurm/regression_test_${{ matrix.model_config }}_${{ matrix.cluster }}.sbatch" | ||
| # Prepare job script on cluster | ||
| ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \ | ||
| ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF | ||
| set -e | ||
| cd $FSDP_DIR | ||
| if [ ! -f "$SBATCH_FILE" ]; then | ||
| echo "Error: sbatch file ${SBATCH_FILE} does not exist!" | ||
| exit 1 | ||
| fi | ||
| cp "$SBATCH_FILE" "$TMP_SBATCH" | ||
| # Modify sbatch script | ||
| sed -i "s|#SBATCH --output=.*|#SBATCH --output=${{ env.LOG_DIR }}/regression_test_%j.out|" "$TMP_SBATCH" | ||
| sed -i "s|#SBATCH --error=.*|#SBATCH --error=${{ env.LOG_DIR }}/regression_test_%j.err|" "$TMP_SBATCH" | ||
| sed -i "s|#export CONTAINER_IMAGE=.*|export CONTAINER_IMAGE=${{ env.ENROOT_IMAGE }}|" "$TMP_SBATCH" | ||
| sed -i "s|--checkpoint_dir=./checkpoints|--checkpoint_dir=/checkpoints|g" "$TMP_SBATCH" | ||
| sed -i "s|--container-mounts.*|--container-mounts \\$FSX_MOUNT,${{ env.CHECKPOINT_DIR }}:/checkpoints|" "$TMP_SBATCH" | ||
| # Submit job | ||
| echo "Submitting Slurm job..." | ||
| JOB_ID=\$(sbatch --parsable $TMP_SBATCH) | ||
| echo "JOB_ID=\$JOB_ID" >> ${{ env.REMOTE_TEST_PATH }}/job_info.txt | ||
| echo "Submitted job: \$JOB_ID" | ||
| EOF | ||
| # Get job ID | ||
| sleep 2 | ||
| JOB_ID=$(ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \ | ||
| ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \ | ||
| "cat ${{ env.REMOTE_TEST_PATH }}/job_info.txt | grep JOB_ID | cut -d= -f2") | ||
| echo "job_id=$JOB_ID" >> $GITHUB_OUTPUT | ||
| echo "JOB_ID=$JOB_ID" >> $GITHUB_ENV | ||
| echo "Submitted Slurm job: $JOB_ID" | ||
| - name: Monitor Job with Real-time Logs | ||
| id: monitor_job | ||
| run: | | ||
| echo "Monitoring job ${{ env.JOB_ID }}..." | ||
| START_TIME=$(date +%s) | ||
| TIMEOUT=21600 # 6 hours | ||
| # Get initial log file name (will be updated once job starts) | ||
| LOG_FILE="${{ env.LOG_DIR }}/regression_test_${{ env.JOB_ID }}.out" | ||
| while true; do | ||
| CURRENT_TIME=$(date +%s) | ||
| ELAPSED=$((CURRENT_TIME - START_TIME)) | ||
| if [ $ELAPSED -gt $TIMEOUT ]; then | ||
| echo "Timeout reached after 6 hours" | ||
| exit 1 | ||
| fi | ||
| # Check job status | ||
| JOB_STATUS=$(ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \ | ||
| ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \ | ||
| "squeue -j ${{ env.JOB_ID }} -h -o %T 2>/dev/null || echo 'COMPLETED'") | ||
| if [ -z "$JOB_STATUS" ] || [ "$JOB_STATUS" == "COMPLETED" ]; then | ||
| echo "Job completed successfully" | ||
| break | ||
| elif [ "$JOB_STATUS" == "FAILED" ] || [ "$JOB_STATUS" == "CANCELLED" ] || [ "$JOB_STATUS" == "TIMEOUT" ]; then | ||
| echo "Job failed with status: $JOB_STATUS" | ||
| exit 1 | ||
| fi | ||
| # Stream logs in real-time | ||
| ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \ | ||
| ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \ | ||
| "tail -n 50 $LOG_FILE 2>/dev/null || echo 'Waiting for log file...'" | ||
| echo "--- Job status: $JOB_STATUS (elapsed: $((ELAPSED / 60)) min) ---" | ||
| sleep 30 | ||
| done | ||
| - name: Retrieve Logs | ||
| if: always() | ||
| run: | | ||
| echo "Retrieving logs from cluster..." | ||
| mkdir -p ./logs | ||
| # Copy logs with retry | ||
| for i in {1..3}; do | ||
| if scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 -r \ | ||
| ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.LOG_DIR }}/* ./logs/ 2>/dev/null; then | ||
| echo "Logs retrieved successfully" | ||
| break | ||
| fi | ||
| echo "Log retrieval attempt $i failed, retrying..." | ||
| sleep 10 | ||
| done | ||
| - name: Upload logs as artifacts | ||
| if: always() | ||
| uses: actions/upload-artifact@v4 | ||
| with: | ||
| name: regression-logs-${{ matrix.model_config }}-${{ matrix.cluster }}-${{ github.run_id }} | ||
| path: ./logs | ||
| retention-days: 60 | ||
| - name: Cleanup | ||
| if: always() | ||
| run: | | ||
| echo "Cleaning up remote resources..." | ||
| # Cancel job if still running | ||
| if [ -n "${{ env.JOB_ID }}" ]; then | ||
| ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \ | ||
| ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \ | ||
| "scancel ${{ env.JOB_ID }} 2>/dev/null || true" | ||
| fi | ||
| # Clean up directories | ||
| ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \ | ||
| ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF | ||
| rm -rf ${{ env.REMOTE_TEST_PATH }} | ||
| rm -rf ${{ env.LOG_DIR }} | ||
| rm -rf ${{ env.CHECKPOINT_DIR }} | ||
| # Clean up enroot image if this is the last test for this build | ||
| if [ "${{ matrix.model_config }}" == "llama3_1_70b" ]; then | ||
| rm -f ${{ env.ENROOT_IMAGE }} || true | ||
| fi | ||
| EOF | ||
| rm -rf ./logs | ||
| echo "Cleanup completed!" | ||