# Update comment for P5 FI_* in fsdp.yaml-template (#934) #83
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: FSDP Regression Test (venv)

# Regression workflow for the PyTorch FSDP test case. Every matrix job checks
# the repository out on a self-hosted runner, runs create_venv.sh, submits the
# model's sbatch script to Slurm and waits for it, then uploads the job logs.
# TODO: Add more test cases to the matrix and revisit max-parallel.

# Triggers: pushes to main and pull requests that touch the FSDP test case,
# plus manual dispatch.
on:
  push:
    branches:
      - main
    paths:
      - '3.test_cases/pytorch/FSDP/**'
  pull_request:
    paths:
      - '3.test_cases/pytorch/FSDP/**'
  workflow_dispatch:

jobs:
  regression:
    strategy:
      # One failing matrix job cancels the remaining ones.
      fail-fast: true
      max-parallel: 3
      matrix:
        cluster: [p5, p5-smhp]
        model_config:
          - llama2_7b
          - llama2_13b
          - llama2_70b
          - llama3_1_8b
          - llama3_1_70b
    # Needs a self-hosted runner that also carries the cluster name as a label.
    runs-on: [self-hosted, "${{ matrix.cluster }}"]
    # Runs for the same cluster/model pair share one concurrency group; a run
    # that is already in progress is never cancelled by a newer one.
    concurrency:
      group: ${{ github.workflow }}-${{ matrix.cluster }}-${{ matrix.model_config }}
      cancel-in-progress: false
    timeout-minutes: 360  # 6 hours per matrix job
    steps:
      # The checkout lands in a sub-directory named after the run ID.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          path: ${{ github.run_id }}

      # Derive the per-run paths once and export them to all later steps.
      - name: Set env vars
        run: |
          HOME_DIR="/home/github"
          BUILD_ID="${{ github.run_id }}"
          FSDP_DIR="$(pwd)/${BUILD_ID}/3.test_cases/pytorch/FSDP"
          LOG_DIR="${HOME_DIR}/regression-logs-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
          CHECKPOINT_DIR="${HOME_DIR}/regression-checkpoints-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
          # Single quoted redirect: one append to the environment file.
          {
            echo "HOME_DIR=$HOME_DIR"
            echo "BUILD_ID=$BUILD_ID"
            echo "FSDP_DIR=$FSDP_DIR"
            echo "LOG_DIR=$LOG_DIR"
            echo "CHECKPOINT_DIR=$CHECKPOINT_DIR"
          } >> "$GITHUB_ENV"
          echo "Env vars set successfully!"

      - name: Create directories
        run: |
          mkdir -p "${{ env.LOG_DIR }}" "${{ env.CHECKPOINT_DIR }}"
          chmod 755 "${{ env.LOG_DIR }}" "${{ env.CHECKPOINT_DIR }}"

      - name: Create virtual environment
        working-directory: ${{ env.FSDP_DIR }}/slurm
        run: |
          bash ./create_venv.sh
          echo "Virtual environment created successfully!"

      - name: Run regression test
        id: run_test
        working-directory: ${{ env.FSDP_DIR }}/slurm
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # Presumably the environment produced by create_venv.sh - confirm.
          source env/bin/activate
          SBATCH_FILE="${{ matrix.model_config }}-training.sbatch"
          TMP_SBATCH="regression_test_${{ matrix.model_config }}.sbatch"
          if [ ! -f "$SBATCH_FILE" ]; then
            echo "Error: sbatch file ${SBATCH_FILE} does not exist!"
            exit 1
          fi
          # Patch a copy so the checked-in sbatch script stays untouched.
          cp "$SBATCH_FILE" "$TMP_SBATCH"
          # Redirect Slurm stdout/stderr and the checkpoints to this run's directories.
          sed -i "s|#SBATCH --output=.*|#SBATCH --output=${{ env.LOG_DIR }}/regression_test_%j.out|" "$TMP_SBATCH"
          sed -i "s|#SBATCH --error=.*|#SBATCH --error=${{ env.LOG_DIR }}/regression_test_%j.err|" "$TMP_SBATCH"
          sed -i "s|--checkpoint_dir=./checkpoints|--checkpoint_dir=${{ env.CHECKPOINT_DIR }}|g" "$TMP_SBATCH"
          echo "Submitting Slurm job..."
          # The default run shell is `bash -e`: a bare failing command would end
          # the script before the exit code is recorded, so capture it via `||`.
          exit_code=0
          sbatch --wait "${TMP_SBATCH}" || exit_code=$?
          echo "exit_code=$exit_code" >> "$GITHUB_OUTPUT"
          echo "Slurm job completed with exit code: $exit_code"
          if [ "$exit_code" -ne 0 ]; then
            echo "Slurm job failed with exit code: $exit_code"
            exit "$exit_code"
          fi

      # Runs even when an earlier step failed so the Slurm logs are preserved.
      - name: Upload logs as artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: regression-logs-${{ github.run_id }}-${{ matrix.model_config }}-${{ matrix.cluster }}
          path: ${{ env.LOG_DIR }}
          retention-days: 60

      # NOTE(review): the checkout directory named after the run ID is not
      # removed here - confirm whether the runner workspace is cleaned elsewhere.
      - name: Cleanup
        if: always()
        run: |
          echo "Cleaning up..."
          rm -rf ${{ env.LOG_DIR }} ${{ env.CHECKPOINT_DIR }}
          echo "Logs and checkpoints cleaned up successfully!"