generated from amazon-archives/__template_Apache-2.0
-
Notifications
You must be signed in to change notification settings - Fork 87
246 lines (236 loc) · 10.2 KB
/
optimization_integration.yml
File metadata and controls
246 lines (236 loc) · 10.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
# Integration tests for model optimization (Neo sharding / fast model loading)
# on GPU instances. Runnable manually (workflow_dispatch) or from another
# workflow (workflow_call).
name: Optimization Integration tests
on:
  # Manual trigger from the Actions UI.
  workflow_dispatch:
    inputs:
      djl-version:
        description: 'The released version of DJL.'
        required: false
        default: ''
      tag-suffix:
        description: 'Run tests on the specific tags suffix i.e. arch-{suffix}'
        required: false
        type: string
        default: 'nightly'
  # Reusable-workflow trigger.
  # NOTE(review): the defaults are mirrored relative to workflow_dispatch
  # (here djl-version defaults to 'nightly' and tag-suffix to '', above it is
  # the reverse) — confirm this asymmetry is intentional.
  workflow_call:
    inputs:
      djl-version:
        description: 'The released version of DJL.'
        required: false
        type: string
        default: 'nightly'
      tag-suffix:
        description: 'Run tests on the specific tags suffix i.e. arch-{suffix}'
        required: false
        type: string
        default: ''
    # Exposed to the calling workflow: '1' when any lmi-prefixed test failed.
    outputs:
      failure_lmi:
        value: ${{ jobs.neo-test.outputs.failure_lmi || '0' }}
# id-token: write is required for AWS OIDC auth; contents: read for checkout.
permissions:
  id-token: write
  contents: read
env:
  # Temporary ECR repo all test images are pulled from.
  AWS_ECR_REPO: "185921645874.dkr.ecr.us-east-1.amazonaws.com/djl-ci-temp"
jobs:
  # Provisions the ephemeral self-hosted EC2 runners used by the neo-test
  # matrix: two g6 GPU instances and one p4d. Each step mints a fresh runner
  # registration token via the GitHub API, then boots an instance with
  # start_instance.sh, which registers it against djl-serving.
  create-optimization-runners:
    runs-on: [self-hosted, scheduler]
    steps:
      - name: Create new G6 instance
        id: create_g6
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq '.token' | tr -d '"' )
          ./start_instance.sh action_g6 $token djl-serving
      # Renamed from the duplicated "Create new G6 instance" so the two steps
      # are distinguishable in the Actions UI; the step id is unchanged.
      - name: Create second G6 instance
        id: create_g6_2
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq '.token' | tr -d '"' )
          ./start_instance.sh action_g6 $token djl-serving
      - name: Create new P4D instance
        id: create_p4d
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq '.token' | tr -d '"' )
          ./start_instance.sh action_lmic_p4d $token djl-serving
    # Instance ids are consumed by stop-runners for teardown.
    outputs:
      gpu_instance_id_1: ${{ steps.create_g6.outputs.action_g6_instance_id }}
      gpu_instance_id_2: ${{ steps.create_g6_2.outputs.action_g6_instance_id }}
      gpu_instance_id_3: ${{ steps.create_p4d.outputs.action_lmic_p4d_instance_id }}
neo-test:
runs-on:
- ${{ matrix.test.gh-runner && matrix.test.instance || 'self-hosted' }}
- ${{ matrix.test.gh-runner && matrix.test.instance || format('RUN_ID-{0}', github.run_id) }}
- ${{ matrix.test.gh-runner && matrix.test.instance || format('RUN_NUMBER-{0}', github.run_number) }}
- ${{ matrix.test.gh-runner && matrix.test.instance || format('SHA-{0}', github.sha) }}
- ${{ matrix.test.gh-runner && matrix.test.instance || format('JOB-{0}', 'create-optimization-runners') }}
- ${{ matrix.test.instance }}
timeout-minutes: 120
needs: create-optimization-runners
strategy:
fail-fast: false
matrix:
test:
- test: MultinodeSharding
instance: g6
test_handler: vllm_neo
test_model_config: llama-3.1-8b-multi-node-sharding
test_serve_config: llama-3.1-8b
failure-prefix: lmi
- test: BasicSharding-g6
instance: g6
test_handler: vllm_neo
test_model_config: tiny-llama-fml
test_serve_config: tiny-llama-fml
include_fast_model_loading_s3_test: true
failure-prefix: lmi
- test: BasicSharding-p4d
instance: p4d
test_handler: vllm_neo
test_model_config: tiny-llama-fml
test_serve_config: tiny-llama-fml
include_fast_model_loading_s3_test: true
failure-prefix: lmi
- test: LoraSharding-g6
instance: g6
test_handler: vllm_neo
test_model_config: tiny-llama-lora-fml
test_serve_config: tiny-llama-lora-fml
include_fast_model_loading_s3_test: true
failure-prefix: lmi
- test: LoraSharding-p4d
instance: p4d
test_handler: vllm_neo
test_model_config: tiny-llama-lora-fml
test_serve_config: tiny-llama-lora-fml
include_fast_model_loading_s3_test: true
failure-prefix: lmi
outputs:
failure_lmi: ${{ steps.test-failure.outputs.failure_lmi }}
steps:
- name: Show environment
run: |
nvidia-smi -L
- name: Clean env
run: |
sudo rm -rf tests/integration/models
yes | docker system prune -a --volumes
sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
echo "wait dpkg lock..."
while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
- uses: actions/checkout@v4
- name: Set up Python3
uses: actions/setup-python@v5
with:
python-version: '3.10.x'
- name: Install pip dependencies
run: pip3 install requests numpy pillow huggingface_hub
- name: Install s5cmd
working-directory: serving/docker
run: sudo scripts/install_s5cmd.sh x64
- name: ECR Auth
working-directory: tests/integration
env:
TEST_DJL_VERSION: ${{ inputs.djl-version }}
OVERRIDE_IMAGE_TAG_SUFFIX: ${{ inputs.tag-suffix }}
IMAGE_REPO: ${{ env.AWS_ECR_REPO }}
run: |
ECR_REGION=$(echo "${{ env.AWS_ECR_REPO }}" | awk -F. '{print $4}')
aws ecr get-login-password --region $ECR_REGION | docker login --username AWS --password-stdin ${{env.AWS_ECR_REPO}}
mkdir logs
- name: "Compute Image Uri"
id: compute-image-uri
env:
TEST_DJL_VERSION: ${{ inputs.djl-version }}
OVERRIDE_IMAGE_TAG_SUFFIX: ${{ inputs.tag-suffix }}
IMAGE_REPO: ${{ env.AWS_ECR_REPO }}
run: |
repo=$IMAGE_REPO
container="lmi"
if [ -z "${TEST_DJL_VERSION}" ] || [ "${TEST_DJL_VERSION}" == "nightly" ]; then
flavor="${container}-nightly"
elif [ "${TEST_DJL_VERSION}" == "temp" ]; then
flavor="${container}-temp-${GITHUB_SHA}"
else
flavor="${container}-${TEST_DJL_VERSION}-${GITHUB_SHA}"
fi
# Override flavor if OVERRIDE_IMAGE_TAG_SUFFIX is set
if [ -n "${OVERRIDE_IMAGE_TAG_SUFFIX}" ]; then
flavor="${container}-${OVERRIDE_IMAGE_TAG_SUFFIX}"
fi
# Compute final image URL
image="${repo}:${flavor}"
echo $image
echo "TEST_IMAGE_URI=$image" >> $GITHUB_OUTPUT
- name: "Model Optimization Step"
working-directory: tests/integration
run: |
echo ${{ steps.compute-image-uri.outputs.TEST_IMAGE_URI}}
# Prepare
sudo rm -rf models
python3 llm/prepare.py ${{ matrix.test.test_handler }} ${{ matrix.test.test_model_config }}
./launch_container.sh ${{ steps.compute-image-uri.outputs.TEST_IMAGE_URI }} $PWD/models lmi sm_neo_context
- name: "Local Serving Test"
working-directory: tests/integration
run: |
# test inference
./launch_container.sh ${{ steps.compute-image-uri.outputs.TEST_IMAGE_URI }} $PWD/models/compiled lmi ${{ contains(matrix.test.test_model_config, 'multi-node') && 'multi_node' || '' }} serve
python3 llm/client.py ${{ matrix.test.test_handler }} ${{ matrix.test.test_serve_config }}
# clean up
docker rm -f $(docker ps -aq) || true
- name: "Fast Model Loading S3 test"
if: ${{ matrix.test.include_fast_model_loading_s3_test == 'true' }}
env:
RUN_NUMBER: ${{ github.run_number }}
working-directory: tests/integration
run: |
aws s3 sync $PWD/models/compiled s3://djl-scratch-001-gamma-us-west-2/github-workflows/$RUN_NUMBER/${{ matrix.test.test_model_config }}-${{ matrix.test.instance }}-tp2
sudo find "$PWD/models/compiled/" -maxdepth 1 -type d -name "sagemaker-fast-model-loader-*" -exec sudo rm -rf {} +
echo "SM_FAST_MODEL_LOADER_S3_URI=s3://djl-scratch-001-gamma-us-west-2/github-workflows/$RUN_NUMBER/${{ matrix.test.test_model_config }}-${{ matrix.test.instance }}-tp2" >> $PWD/docker_env
# test inference
./launch_container.sh $DJL_CONTAINER_REPO:$DJLSERVING_DOCKER_TAG $PWD/models/compiled lmi serve
python3 llm/client.py ${{ matrix.test.test_handler }} ${{ matrix.test.test_serve_config }}
# clean up
docker rm -f $(docker ps -aq) || true
sudo rm -rf $PWD/models
- name: On Failure
id: test-failure
if: ${{ failure() }}
working-directory: tests/integration
run: |
for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done
sudo rm -rf outputs && sudo rm -rf models
rm awscurl
./remove_container.sh
failure_prefix="${{ matrix.test.failure-prefix }}"
echo "failure_${failure_prefix}=1" >> "$GITHUB_OUTPUT"
sudo rm -rf $PWD/models
- name: Upload test logs
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: test-${{ matrix.test.test }}-logs
path: tests/integration/all_logs/
stop-runners:
if: always()
runs-on: [ self-hosted, scheduler ]
needs: [ create-optimization-runners, neo-test]
steps:
- name: Stop all instances
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
instance_id=${{ needs.create-optimization-runners.outputs.gpu_instance_id_1 }}
./stop_instance.sh $instance_id
instance_id=${{ needs.create-optimization-runners.outputs.gpu_instance_id_2 }}
./stop_instance.sh $instance_id
instance_id=${{ needs.create-optimization-runners.outputs.gpu_instance_id_3 }}
./stop_instance.sh $instance_id