# Update comment for P5 FI_* in fsdp.yaml-template (#934) #83

name: FSDP Regression Test (venv)
# TODO: Add additional test cases to the matrix and adjust max-parallel.

# Run on pushes to main and on pull requests that touch the FSDP test case,
# and allow manual runs from the Actions UI.
on:
  push:
    branches: ["main"]
    paths:
      - '3.test_cases/pytorch/FSDP/**'
  pull_request:
    paths:
      - '3.test_cases/pytorch/FSDP/**'
  workflow_dispatch:

jobs:
  # One job per (cluster, model_config) pair; each submits the model's sbatch
  # file to Slurm and waits for it to finish.
  regression:
    strategy:
      fail-fast: true
      max-parallel: 3
      matrix:
        cluster: [p5, p5-smhp]
        model_config:
          - llama2_7b
          - llama2_13b
          - llama2_70b
          - llama3_1_8b
          - llama3_1_70b
    # Self-hosted runner carrying the label of the target cluster.
    runs-on: [self-hosted, "${{ matrix.cluster }}"]
    # Serialize runs of the same workflow/cluster/model combination; a queued
    # run waits instead of cancelling the one in progress.
    concurrency:
      group: ${{ github.workflow }}-${{ matrix.cluster }}-${{ matrix.model_config }}
      cancel-in-progress: false
    timeout-minutes: 360  # 6 hours for the full Llama 2 test
    steps:
      # Check out into a directory named after the run ID.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          path: ${{ github.run_id }}

      # Derive per-run paths and export them to later steps via GITHUB_ENV.
      - name: Set env vars
        run: |
          HOME_DIR="/home/github"
          BUILD_ID="${{ github.run_id }}"
          FSDP_DIR="$(pwd)/${BUILD_ID}/3.test_cases/pytorch/FSDP"
          LOG_DIR="${HOME_DIR}/regression-logs-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
          CHECKPOINT_DIR="${HOME_DIR}/regression-checkpoints-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
          {
            echo "HOME_DIR=$HOME_DIR"
            echo "BUILD_ID=$BUILD_ID"
            echo "FSDP_DIR=$FSDP_DIR"
            echo "LOG_DIR=$LOG_DIR"
            echo "CHECKPOINT_DIR=$CHECKPOINT_DIR"
          } >> "$GITHUB_ENV"
          echo "Env vars set successfully!"

      - name: Create directories
        run: |
          mkdir -p "$LOG_DIR" "$CHECKPOINT_DIR"
          chmod 755 "$LOG_DIR" "$CHECKPOINT_DIR"

      - name: Create virtual environment
        working-directory: ${{ env.FSDP_DIR }}/slurm
        run: |
          bash ./create_venv.sh
          echo "Virtual environment created successfully!"

      # Copy the model's sbatch file, point its log and checkpoint paths at the
      # per-run directories, then submit it and block until the job finishes.
      - name: Run regression test
        id: run_test
        working-directory: ${{ env.FSDP_DIR }}/slurm
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          source env/bin/activate
          SBATCH_FILE="${{ matrix.model_config }}-training.sbatch"
          TMP_SBATCH="regression_test_${{ matrix.model_config }}.sbatch"
          if [ ! -f "$SBATCH_FILE" ]; then
            echo "Error: sbatch file ${SBATCH_FILE} does not exist!"
            exit 1
          fi
          cp "$SBATCH_FILE" "$TMP_SBATCH"
          sed -i "s|#SBATCH --output=.*|#SBATCH --output=${LOG_DIR}/regression_test_%j.out|" "$TMP_SBATCH"
          sed -i "s|#SBATCH --error=.*|#SBATCH --error=${LOG_DIR}/regression_test_%j.err|" "$TMP_SBATCH"
          sed -i "s|--checkpoint_dir=./checkpoints|--checkpoint_dir=${CHECKPOINT_DIR}|g" "$TMP_SBATCH"
          echo "Submitting Slurm job..."
          # The default shell for run steps is `bash -e`, which would abort the
          # script at a failing sbatch before the status could be recorded.
          # Capturing it through `||` keeps the script alive so the output and
          # the failure message below are always written.
          exit_code=0
          sbatch --wait "$TMP_SBATCH" || exit_code=$?
          echo "exit_code=$exit_code" >> "$GITHUB_OUTPUT"
          echo "Slurm job completed with exit code: $exit_code"
          if [ "$exit_code" -ne 0 ]; then
            echo "Slurm job failed with exit code: $exit_code"
            exit "$exit_code"
          fi

      - name: Upload logs as artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: regression-logs-${{ github.run_id }}-${{ matrix.model_config }}-${{ matrix.cluster }}
          path: ${{ env.LOG_DIR }}
          retention-days: 60

      - name: Cleanup
        if: always()
        run: |
          echo "Cleaning up..."
          # This step runs even when an earlier step failed, so the paths may
          # be unset; skip empty values rather than passing them to rm.
          for dir in "${LOG_DIR:-}" "${CHECKPOINT_DIR:-}"; do
            if [ -n "$dir" ]; then
              rm -rf -- "$dir"
            fi
          done
          echo "Logs and checkpoints cleaned up successfully!"