Fix typo in val_batch_size and remove unused imports (#908) #94

.github/workflows/fsdp-regression-test-venv.yml

name: FSDP Regression Test (venv)
# TODO: Additional test cases to matrix. Change max-parallel.

on:
  push:
    branches: [ "main" ]
    paths:
      - '3.test_cases/pytorch/FSDP/**'
  pull_request:
    paths:
      - '3.test_cases/pytorch/FSDP/**'
  workflow_dispatch:

env:
  AWS_REGION: us-east-1
  SLURM_HOST: p5en.smml.aiml.aws.dev
  SLURM_USER: ghactions
  AWS_ROLE_ARN: arn:aws:iam::159553542841:role/awslabs-AOSH-GitHubActionsRole
  BASE_PATH: /fsx/agents/pr-reviews
  HOME_PATH: /home/ghactions

permissions:
  id-token: write
  contents: read
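
# id-token: write lets aws-actions/configure-aws-credentials request an OIDC token
# and assume AWS_ROLE_ARN, so no long-lived AWS keys need to be stored as secrets;
# contents: read is sufficient for actions/checkout.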

jobs:
  regression:
    strategy:
      fail-fast: true
      max-parallel: 3
      matrix:
        cluster: [p5, p5-smhp]
        model_config: [llama2_7b, llama2_13b, llama2_70b, llama3_1_8b, llama3_1_70b]
    runs-on: ubuntu-latest
    concurrency:
      group: ${{ github.workflow }}-${{ matrix.cluster }}-${{ matrix.model_config }}
      cancel-in-progress: false
    timeout-minutes: 375
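    # The matrix expands to 2 clusters x 5 model configs = 10 jobs per trigger, with at
    # most 3 running concurrently (max-parallel). The concurrency group queues, rather
    # than cancels, overlapping runs for the same cluster/model pair, and each job is
    # hard-capped at 375 minutes.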
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          path: source-code
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ env.AWS_ROLE_ARN }}
          aws-region: ${{ env.AWS_REGION }}
      - name: Setup SSH Key
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SLURM_SSH_KEY }}" > ~/.ssh/slurm_key
          chmod 600 ~/.ssh/slurm_key
          # Add host to known hosts with retry
          for i in {1..5}; do
            if ssh-keyscan -H ${{ env.SLURM_HOST }} >> ~/.ssh/known_hosts 2>/dev/null; then
              echo "SSH keyscan successful"
              break
            fi
            echo "SSH keyscan attempt $i failed, retrying..."
            sleep 5
          done
      - name: Setup Environment Variables
        id: setup
        run: |
          BUILD_ID="${{ github.run_id }}"
          REMOTE_TEST_PATH="${{ env.BASE_PATH }}/venv-tests/${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
          LOG_DIR="${{ env.HOME_PATH }}/regression-logs-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
          CHECKPOINT_DIR="${{ env.BASE_PATH }}/checkpoints-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
          echo "remote_test_path=$REMOTE_TEST_PATH" >> $GITHUB_OUTPUT
          echo "log_dir=$LOG_DIR" >> $GITHUB_OUTPUT
          echo "checkpoint_dir=$CHECKPOINT_DIR" >> $GITHUB_OUTPUT
          echo "REMOTE_TEST_PATH=$REMOTE_TEST_PATH" >> $GITHUB_ENV
          echo "LOG_DIR=$LOG_DIR" >> $GITHUB_ENV
          echo "CHECKPOINT_DIR=$CHECKPOINT_DIR" >> $GITHUB_ENV
      - name: Create Remote Directories
        run: |
          ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
          mkdir -p ${{ env.REMOTE_TEST_PATH }}
          mkdir -p ${{ env.LOG_DIR }}
          mkdir -p ${{ env.CHECKPOINT_DIR }}
          chmod 755 ${{ env.LOG_DIR }} ${{ env.CHECKPOINT_DIR }}
          EOF
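      # ${{ env.* }} expressions are substituted by the Actions runner before the shell
      # runs, so the remote host receives literal paths inside the heredoc.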

      - name: Transfer Code to Cluster
        run: |
          # Transfer code with retry
          for i in {1..3}; do
            if scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 -r \
              source-code/* ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.REMOTE_TEST_PATH }}/; then
              echo "Code transfer successful"
              break
            fi
            echo "Transfer attempt $i failed, retrying..."
            sleep 10
          done
      - name: Create Virtual Environment on Cluster
        run: |
          FSDP_SLURM_DIR="${{ env.REMOTE_TEST_PATH }}/3.test_cases/pytorch/FSDP/slurm"
          echo "Creating virtual environment on cluster..."
          ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
          set -e
          cd $FSDP_SLURM_DIR
          bash ./create_venv.sh
          echo "Virtual environment created successfully!"
          EOF
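      # The heredoc delimiter (EOF) is unquoted, so $FSDP_SLURM_DIR expands on the
      # GitHub runner before the script is sent, and "set -e" makes the remote script
      # stop at the first failing command.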
      - name: Prepare and Submit Slurm Job
        id: submit_job
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          FSDP_SLURM_DIR="${{ env.REMOTE_TEST_PATH }}/3.test_cases/pytorch/FSDP/slurm"
          SBATCH_FILE="${{ matrix.model_config }}-training.sbatch"
          TMP_SBATCH="regression_test_${{ matrix.model_config }}.sbatch"
          # Prepare and submit job on cluster
          ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
          set -e
          cd $FSDP_SLURM_DIR
          if [ ! -f "$SBATCH_FILE" ]; then
            echo "Error: sbatch file ${SBATCH_FILE} does not exist!"
            exit 1
          fi
          cp "$SBATCH_FILE" "$TMP_SBATCH"
          # Modify sbatch script: redirect logs and checkpoints to the per-run directories
          sed -i "s|#SBATCH --output=.*|#SBATCH --output=${{ env.LOG_DIR }}/regression_test_%j.out|" "$TMP_SBATCH"
          sed -i "s|#SBATCH --error=.*|#SBATCH --error=${{ env.LOG_DIR }}/regression_test_%j.err|" "$TMP_SBATCH"
          sed -i "s|--checkpoint_dir=./checkpoints|--checkpoint_dir=${{ env.CHECKPOINT_DIR }}|g" "$TMP_SBATCH"
          # Activate venv in the sbatch script
          sed -i '1a source env/bin/activate' "$TMP_SBATCH"
          # Submit job; $TMP_SBATCH is left unescaped so it expands on the runner (it is
          # not defined on the cluster), while \$(...) and \$JOB_ID are escaped so they
          # evaluate on the remote host.
          echo "Submitting Slurm job..."
          JOB_ID=\$(sbatch --parsable "$TMP_SBATCH")
          echo "JOB_ID=\$JOB_ID" >> ${{ env.REMOTE_TEST_PATH }}/job_info.txt
          echo "Submitted job: \$JOB_ID"
          EOF
          # Read the job ID back from the cluster
          sleep 2
          JOB_ID=$(ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
            "cat ${{ env.REMOTE_TEST_PATH }}/job_info.txt | grep JOB_ID | cut -d= -f2")
          echo "job_id=$JOB_ID" >> $GITHUB_OUTPUT
          echo "JOB_ID=$JOB_ID" >> $GITHUB_ENV
          echo "Submitted Slurm job: $JOB_ID"
      - name: Monitor Job with Real-time Logs
        id: monitor_job
        run: |
          echo "Monitoring job ${{ env.JOB_ID }}..."
          START_TIME=$(date +%s)
          TIMEOUT=21600 # 6 hours
          LOG_FILE="${{ env.LOG_DIR }}/regression_test_${{ env.JOB_ID }}.out"
          while true; do
            CURRENT_TIME=$(date +%s)
            ELAPSED=$((CURRENT_TIME - START_TIME))
            if [ $ELAPSED -gt $TIMEOUT ]; then
              echo "Timeout reached after 6 hours"
              exit 1
            fi
            # Check job status
            JOB_STATUS=$(ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
              ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
              "squeue -j ${{ env.JOB_ID }} -h -o %T 2>/dev/null || echo 'COMPLETED'")
            if [ -z "$JOB_STATUS" ] || [ "$JOB_STATUS" == "COMPLETED" ]; then
              echo "Job completed successfully"
              break
            elif [ "$JOB_STATUS" == "FAILED" ] || [ "$JOB_STATUS" == "CANCELLED" ] || [ "$JOB_STATUS" == "TIMEOUT" ]; then
              echo "Job failed with status: $JOB_STATUS"
              exit 1
            fi
            # Stream logs in real-time
            ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
              ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
              "tail -n 50 $LOG_FILE 2>/dev/null || echo 'Waiting for log file...'"
            echo "--- Job status: $JOB_STATUS (elapsed: $((ELAPSED / 60)) min) ---"
            sleep 30
          done
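      # Note: squeue only reports jobs still in the queue, so an empty result (or the
      # COMPLETED fallback when squeue errors out) is treated as success; a job that
      # failed and left the queue between polls would therefore not be flagged here.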
      - name: Retrieve Logs
        if: always()
        run: |
          echo "Retrieving logs from cluster..."
          mkdir -p ./logs
          # Copy logs with retry
          for i in {1..3}; do
            if scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 -r \
              ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.LOG_DIR }}/* ./logs/ 2>/dev/null; then
              echo "Logs retrieved successfully"
              break
            fi
            echo "Log retrieval attempt $i failed, retrying..."
            sleep 10
          done
      - name: Upload logs as artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: regression-logs-${{ github.run_id }}-${{ matrix.model_config }}-${{ matrix.cluster }}
          path: ./logs
          retention-days: 60
      - name: Cleanup
        if: always()
        run: |
          echo "Cleaning up remote resources..."
          # Cancel job if still running
          if [ -n "${{ env.JOB_ID }}" ]; then
            ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
              ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
              "scancel ${{ env.JOB_ID }} 2>/dev/null || true"
          fi
          # Clean up directories
          ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
          rm -rf ${{ env.REMOTE_TEST_PATH }}
          rm -rf ${{ env.LOG_DIR }}
          rm -rf ${{ env.CHECKPOINT_DIR }}
          EOF
          rm -rf ./logs
          echo "Cleanup completed!"