[IntegTest]add sharding optimization integ tests (deepjavalibrary#2742)

HappyAmazonian · siddvenk · commit dcaa1870fd5a · 2025-02-27T09:03:13.000-08:00
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
@@ -135,6 +135,7 @@ jobs:
       - ${{ matrix.test.gh-runner && matrix.test.instance || format('RUN_ID-{0}', github.run_id) }}
       - ${{ matrix.test.gh-runner && matrix.test.instance || format('RUN_NUMBER-{0}', github.run_number) }}
       - ${{ matrix.test.gh-runner && matrix.test.instance || format('SHA-{0}', github.sha) }}
+      - ${{ matrix.test.gh-runner && matrix.test.instance || format('JOB-{0}', 'create-runners') }}
       - ${{ matrix.test.instance }}
     timeout-minutes: 120
     needs: create-runners
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
@@ -39,9 +39,19 @@ jobs:
         with:
             djl-version: ${{ needs.build.outputs.djl_version }}
             tag-suffix: ${{ inputs.mode == 'nightly' && format('{0}-{1}', 'nightly', github.sha) || github.sha }}
+
+    optimization-integration-test:
+        needs: [build]
+        uses: ./.github/workflows/optimization_integration.yml
+        secrets: inherit
+        with:
+            djl-version: ${{ needs.build.outputs.djl_version }}
+            tag-suffix: ${{ inputs.mode == 'nightly' && format('{0}-{1}', 'nightly', github.sha) || github.sha }}
+
+            
     determine_images_to_publish:
         if: always()
-        needs: [ integration-test ]
+        needs: [ integration-test, optimization-integration-test ]
         runs-on: ubuntu-latest
         outputs:
             images: ${{ steps.generate-images.outputs.images }}
@@ -60,9 +70,10 @@ jobs:
                   if [[ "${{ needs.integration-test.outputs.failure_aarch64 }}" == "0" ]]; then
                     images+=("aarch64")
                   fi
-                  if [[ "${{ needs.integration-test.outputs.failure_lmi }}" == "0" ]]; then
+                  if [[ "${{ needs.integration-test.outputs.failure_lmi }}" == "0" &&
+                        "${{ needs.optimization-integration-test.outputs.failure_lmi }}" == "0" ]]; then
                     images+=("lmi")
-                    fi
+                  fi
                   if [[ "${{ needs.integration-test.outputs.failure_trtllm }}" == "0" ]]; then
                     images+=("tensorrt-llm")
                   fi
diff --git a/.github/workflows/optimization_integration.yml b/.github/workflows/optimization_integration.yml
@@ -0,0 +1,257 @@
+name: Optimization Integration tests
+
+on:
+  workflow_dispatch:
+    inputs:
+      djl-version:
+        description: 'The released version of DJL.'
+        required: true
+        default: ''
+      tag-suffix:
+        description: 'Run tests on the specific tags suffix i.e. arch-{suffix}'
+        required: false
+        type: string
+        default: 'nightly'
+      image-repo:
+        description: 'The repository to fetch images from'
+        required: false
+        type: string
+        default: '185921645874.dkr.ecr.us-east-1.amazonaws.com/djl-ci-temp'
+  workflow_call:
+    inputs:
+      djl-version:
+        description: 'The released version of DJL.'
+        required: false
+        type: string
+        default: ''
+      tag-suffix:
+        description: 'Run tests on the specific tags suffix i.e. arch-{suffix}'
+        required: false
+        type: string
+        default: 'nightly'
+      image-repo:
+        description: 'The repository to fetch images from'
+        required: false
+        type: string
+        default: '185921645874.dkr.ecr.us-east-1.amazonaws.com/djl-ci-temp'
+    outputs:
+      failure_lmi:
+        value: ${{ jobs.neo-test.outputs.failure_lmi || '0' }}
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  create-optimization-runners:
+    runs-on: [self-hosted, scheduler]
+    steps:
+      - name: Create new G6 instance
+        id: create_g6
+        run: |
+          cd /home/ubuntu/djl_benchmark_script/scripts
+          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
+          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
+          --fail \
+          | jq '.token' | tr -d '"' )
+          ./start_instance.sh action_g6 $token djl-serving
+      - name: Create new G6 instance
+        id: create_g6_2
+        run: |
+          cd /home/ubuntu/djl_benchmark_script/scripts
+          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
+          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
+          --fail \
+          | jq '.token' | tr -d '"' )
+          ./start_instance.sh action_g6 $token djl-serving
+      - name: Create new P4D instance
+        id: create_p4d
+        run: |
+          cd /home/ubuntu/djl_benchmark_script/scripts
+          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
+          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
+          --fail \
+          | jq '.token' | tr -d '"' )
+          ./start_instance.sh action_lmic_p4d $token djl-serving
+    outputs:
+      gpu_instance_id_1: ${{ steps.create_g6.outputs.action_g6_instance_id }}
+      gpu_instance_id_2: ${{ steps.create_g6_2.outputs.action_g6_instance_id }}
+      gpu_instance_id_3: ${{ steps.create_p4d.outputs.action_lmic_p4d_instance_id }}
+
+  neo-test:
+    runs-on:
+      - ${{ matrix.test.gh-runner && matrix.test.instance || 'self-hosted' }}
+      - ${{ matrix.test.gh-runner && matrix.test.instance || format('RUN_ID-{0}', github.run_id) }}
+      - ${{ matrix.test.gh-runner && matrix.test.instance || format('RUN_NUMBER-{0}', github.run_number) }}
+      - ${{ matrix.test.gh-runner && matrix.test.instance || format('SHA-{0}', github.sha) }}
+      - ${{ matrix.test.gh-runner && matrix.test.instance || format('JOB-{0}', 'create-optimization-runners') }}
+      - ${{ matrix.test.instance }}
+    timeout-minutes: 120
+    needs: create-optimization-runners
+    strategy:
+      fail-fast: false
+      matrix:
+        test:
+          - test: MultinodeSharding
+            instance: g6
+            test_handler: vllm_neo
+            test_model_config: llama-3.1-8b-multi-node-sharding
+            test_serve_config: llama-3.1-8b
+            failure-prefix: lmi
+          - test: BasicSharding-g6
+            instance: g6
+            test_handler: vllm_neo
+            test_model_config: tiny-llama-fml
+            test_serve_config: tiny-llama-fml
+            include_fast_model_loading_s3_test: true
+            failure-prefix: lmi
+          - test: BasicSharding-p4d
+            instance: p4d
+            test_handler: vllm_neo
+            test_model_config: tiny-llama-fml
+            test_serve_config: tiny-llama-fml
+            include_fast_model_loading_s3_test: true
+            failure-prefix: lmi
+          - test: LoraSharding-g6
+            instance: g6
+            test_handler: vllm_neo
+            test_model_config: tiny-llama-lora-fml
+            test_serve_config: tiny-llama-lora-fml
+            include_fast_model_loading_s3_test: true
+            failure-prefix: lmi
+          - test: LoraSharding-p4d
+            instance: p4d
+            test_handler: vllm_neo
+            test_model_config: tiny-llama-lora-fml
+            test_serve_config: tiny-llama-lora-fml
+            include_fast_model_loading_s3_test: true
+            failure-prefix: lmi
+    outputs:
+      failure_lmi: ${{ steps.test-failure.outputs.failure_lmi }}
+    steps:
+      - name: Show environment
+        run: |
+          nvidia-smi -L
+      - name: Clean env
+        run: |
+          sudo rm -rf  tests/integration/models
+          yes | docker system prune -a --volumes
+          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
+          echo "wait dpkg lock..."
+          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
+      - uses: actions/checkout@v4
+      - name: Set up Python3
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10.x'
+      - name: Install pip dependencies
+        run: pip3 install requests numpy pillow huggingface_hub
+      - name: Install s5cmd
+        working-directory: serving/docker
+        run: sudo scripts/install_s5cmd.sh x64
+      - name: ECR Auth
+        working-directory: tests/integration
+        env:
+          TEST_DJL_VERSION: ${{ inputs.djl-version }}
+          IMAGE_TAG_SUFFIX: ${{ inputs.tag-suffix }}
+          IMAGE_REPO: ${{ inputs.image-repo }}
+        run: |
+          ECR_REGION=$(echo "$IMAGE_REPO" | awk -F. '{print $4}')
+          if [[ -n "$ECR_REGION" ]]; then
+            aws ecr get-login-password --region $ECR_REGION | docker login --username AWS --password-stdin "$IMAGE_REPO"
+          fi
+          mkdir logs
+      - name: "Compute Image Uri"
+        id: compute-image-uri
+        env:
+          TEST_DJL_VERSION: ${{ inputs.djl-version }}
+          IMAGE_TAG_SUFFIX: ${{ inputs.tag-suffix }}
+          IMAGE_REPO: ${{ inputs.image-repo }}
+          CONTAINER: "lmi"
+        run: |
+          DJL_VERSION=${TEST_DJL_VERSION:-"0.32.0"}
+          DJL_VERSION=$(echo $DJL_VERSION | xargs) # trim whitespace
+
+          if [ -n "$OVERRIDE_TEST_CONTAINER" ]; then
+            TEST_IMAGE_URI=$OVERRIDE_TEST_CONTAINER
+            echo "Warning: An override container has been specified - this container may not work for all tests, ensure you are only running tests compatible with the container" >&2
+          else
+            if [ -z "$IMAGE_REPO" ]; then
+                echo "Error: You must set the docker image repo via IMAGE_REPO environment variable. Ex: deepjavalibrary/djl-serving" >&2
+                exit 1
+            fi
+            CONTAINER_TAG="${DJL_VERSION}-${CONTAINER}"
+            if [ -n "$IMAGE_TAG_SUFFIX" ]; then
+                CONTAINER_TAG="${CONTAINER_TAG}-${IMAGE_TAG_SUFFIX}"
+            fi
+            TEST_IMAGE_URI="${IMAGE_REPO}:${CONTAINER_TAG}"
+          fi
+          echo "Computed image URI: $TEST_IMAGE_URI"
+          echo "TEST_IMAGE_URI=$TEST_IMAGE_URI" >> $GITHUB_OUTPUT
+
+      - name: "Model Optimization Step"
+        working-directory: tests/integration
+        run: |
+          echo ${{ steps.compute-image-uri.outputs.TEST_IMAGE_URI}}
+          # Prepare
+          sudo rm -rf models
+          python3 llm/prepare.py ${{ matrix.test.test_handler }} ${{ matrix.test.test_model_config }}
+          ./launch_container.sh ${{ steps.compute-image-uri.outputs.TEST_IMAGE_URI }} $PWD/models lmi sm_neo_context
+
+      - name: "Local Serving Test"
+        working-directory: tests/integration
+        run: |
+          # test inference
+          ./launch_container.sh ${{ steps.compute-image-uri.outputs.TEST_IMAGE_URI }} $PWD/models/compiled lmi ${{ contains(matrix.test.test_model_config, 'multi-node') && 'multi_node' || '' }} serve
+          python3 llm/client.py ${{ matrix.test.test_handler }} ${{ matrix.test.test_serve_config }}
+          # clean up
+          docker rm -f $(docker ps -aq) || true
+
+      - name: "Fast Model Loading S3 test"
+        if: ${{ matrix.test.include_fast_model_loading_s3_test == 'true' }}
+        env:
+          RUN_NUMBER: ${{ github.run_number }}
+        working-directory: tests/integration
+        run: |
+          aws s3 sync $PWD/models/compiled s3://djl-scratch-001-gamma-us-west-2/github-workflows/$RUN_NUMBER/${{ matrix.test.test_model_config }}-${{ matrix.test.instance }}-tp2
+          sudo find "$PWD/models/compiled/" -maxdepth 1 -type d -name "sagemaker-fast-model-loader-*" -exec sudo rm -rf {} +
+          echo "SM_FAST_MODEL_LOADER_S3_URI=s3://djl-scratch-001-gamma-us-west-2/github-workflows/$RUN_NUMBER/${{ matrix.test.test_model_config }}-${{ matrix.test.instance }}-tp2" >> $PWD/docker_env
+          # test inference
+          ./launch_container.sh $DJL_CONTAINER_REPO:$DJLSERVING_DOCKER_TAG $PWD/models/compiled lmi serve
+          python3 llm/client.py  ${{ matrix.test.test_handler }} ${{ matrix.test.test_serve_config }}
+          # clean up
+          docker rm -f $(docker ps -aq) || true
+          sudo rm -rf  $PWD/models
+      - name: On Failure
+        id: test-failure
+        if: ${{ failure() }}
+        working-directory: tests/integration
+        run: |
+          for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done
+          sudo rm -rf outputs && sudo rm -rf models
+          rm awscurl
+          ./remove_container.sh
+          failure_prefix="${{ matrix.test.failure-prefix }}"
+          echo "failure_${failure_prefix}=1" >> "$GITHUB_OUTPUT"
+          sudo rm -rf  $PWD/models
+      - name: Upload test logs
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-${{ matrix.test.test }}-logs
+          path: tests/integration/all_logs/
+
+  stop-runners:
+    if: always()
+    runs-on: [ self-hosted, scheduler ]
+    needs: [ create-optimization-runners, neo-test]
+    steps:
+      - name: Stop all instances
+        run: |
+          cd /home/ubuntu/djl_benchmark_script/scripts
+          instance_id=${{ needs.create-optimization-runners.outputs.gpu_instance_id_1 }}
+          ./stop_instance.sh $instance_id
+          instance_id=${{ needs.create-optimization-runners.outputs.gpu_instance_id_2 }}
+          ./stop_instance.sh $instance_id
+          instance_id=${{ needs.create-optimization-runners.outputs.gpu_instance_id_3 }}
+          ./stop_instance.sh $instance_id
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
@@ -1114,6 +1114,12 @@
     "llama-3.1-8b": {
         "option.model_id": "s3://djl-llm/llama-3.1-8b-hf/",
     },
+    "llama-3.1-8b-multi-node-sharding": {
+        "option.model_id": "s3://djl-llm/llama-3.1-8b-hf/",
+        "option.tensor_parallel_degree": "2",
+        "option.pipeline_parallel_degree": "2",
+        "option.load_format": "sagemaker_fast_model_loader",
+    },
     "llama-3.1-8b-awq-options": {
         "option.model_id": "s3://djl-llm/llama-3.1-8b-hf/",
         "option.tensor_parallel_degree": "4",