generated from amazon-archives/__template_Apache-2.0
-
Notifications
You must be signed in to change notification settings - Fork 87
246 lines (236 loc) · 10.2 KB
/
optimization_integration.yml
File metadata and controls
246 lines (236 loc) · 10.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
# Integration tests for model optimization (Neo sharding / fast model loading)
# on GPU instances. Runnable manually (workflow_dispatch) or from another
# workflow (workflow_call).
name: Optimization Integration tests
on:
  # Manual trigger from the Actions UI.
  workflow_dispatch:
    inputs:
      djl-version:
        description: 'The released version of DJL.'
        required: false
        default: ''
      tag-suffix:
        description: 'Run tests on the specific tags suffix i.e. arch-{suffix}'
        required: false
        type: string
        default: 'nightly'
  # Reusable-workflow trigger.
  # NOTE(review): the defaults are mirrored relative to workflow_dispatch
  # (here djl-version defaults to 'nightly' and tag-suffix to '', above it is
  # the reverse) — confirm this asymmetry is intentional.
  workflow_call:
    inputs:
      djl-version:
        description: 'The released version of DJL.'
        required: false
        type: string
        default: 'nightly'
      tag-suffix:
        description: 'Run tests on the specific tags suffix i.e. arch-{suffix}'
        required: false
        type: string
        default: ''
    # Exposed to the calling workflow: '1' when any lmi-prefixed test failed.
    outputs:
      failure_lmi:
        value: ${{ jobs.neo-test.outputs.failure_lmi || '0' }}
# id-token: write is required for AWS OIDC auth; contents: read for checkout.
permissions:
  id-token: write
  contents: read
env:
  # Temporary ECR repo all test images are pulled from.
  AWS_ECR_REPO: "185921645874.dkr.ecr.us-east-1.amazonaws.com/djl-ci-temp"
jobs:
  # Provisions the ephemeral self-hosted EC2 runners used by the neo-test
  # matrix: two g6 GPU instances and one p4d. Each step mints a fresh runner
  # registration token via the GitHub API, then boots an instance with
  # start_instance.sh, which registers it against djl-serving.
  create-optimization-runners:
    runs-on: [self-hosted, scheduler]
    steps:
      - name: Create new G6 instance
        id: create_g6
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq '.token' | tr -d '"' )
          ./start_instance.sh action_g6 $token djl-serving
      # Renamed from the duplicated "Create new G6 instance" so the two steps
      # are distinguishable in the Actions UI; the step id is unchanged.
      - name: Create second G6 instance
        id: create_g6_2
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq '.token' | tr -d '"' )
          ./start_instance.sh action_g6 $token djl-serving
      - name: Create new P4D instance
        id: create_p4d
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq '.token' | tr -d '"' )
          ./start_instance.sh action_lmic_p4d $token djl-serving
    # Instance ids are consumed by stop-runners for teardown.
    outputs:
      gpu_instance_id_1: ${{ steps.create_g6.outputs.action_g6_instance_id }}
      gpu_instance_id_2: ${{ steps.create_g6_2.outputs.action_g6_instance_id }}
      gpu_instance_id_3: ${{ steps.create_p4d.outputs.action_lmic_p4d_instance_id }}
neo-test:
runs-on:
- ${{ matrix.test.gh-runner && matrix.test.instance || 'self-hosted' }}
- ${{ matrix.test.gh-runner && matrix.test.instance || format('RUN_ID-{0}', github.run_id) }}
- ${{ matrix.test.gh-runner && matrix.test.instance || format('RUN_NUMBER-{0}', github.run_number) }}
- ${{ matrix.test.gh-runner && matrix.test.instance || format('SHA-{0}', github.sha) }}
- ${{ matrix.test.gh-runner && matrix.test.instance || format('JOB-{0}', 'create-optimization-runners') }}
- ${{ matrix.test.instance }}
timeout-minutes: 120
needs: create-optimization-runners
strategy:
fail-fast: false
matrix:
test:
- test: MultinodeSharding
instance: g6
test_handler: vllm_neo
test_model_config: llama-3.1-8b-multi-node-sharding
test_serve_config: llama-3.1-8b
failure-prefix: lmi
- test: BasicSharding-g6
instance: g6
test_handler: vllm_neo
test_model_config: tiny-llama-fml
test_serve_config: tiny-llama-fml
include_fast_model_loading_s3_test: true
failure-prefix: lmi
- test: BasicSharding-p4d
instance: p4d
test_handler: vllm_neo
test_model_config: tiny-llama-fml
test_serve_config: tiny-llama-fml
include_fast_model_loading_s3_test: true
failure-prefix: lmi
- test: LoraSharding-g6
instance: g6
test_handler: vllm_neo
test_model_config: tiny-llama-lora-fml
test_serve_config: tiny-llama-lora-fml
include_fast_model_loading_s3_test: true
failure-prefix: lmi
- test: LoraSharding-p4d
instance: p4d
test_handler: vllm_neo
test_model_config: tiny-llama-lora-fml
test_serve_config: tiny-llama-lora-fml
include_fast_model_loading_s3_test: true
failure-prefix: lmi
outputs:
failure_lmi: ${{ steps.test-failure.outputs.failure_lmi }}
steps:
- name: Show environment
run: |
nvidia-smi -L
- name: Clean env
run: |
sudo rm -rf tests/integration/models
yes | docker system prune -a --volumes
sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
echo "wait dpkg lock..."
while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
- uses: actions/checkout@v4
- name: Set up Python3
uses: actions/setup-python@v5
with:
python-version: '3.10.x'
- name: Install pip dependencies
run: pip3 install requests numpy pillow huggingface_hub
- name: Install s5cmd
working-directory: serving/docker
run: sudo scripts/install_s5cmd.sh x64
- name: ECR Auth
working-directory: tests/integration
env:
TEST_DJL_VERSION: ${{ inputs.djl-version }}
OVERRIDE_IMAGE_TAG_SUFFIX: ${{ inputs.tag-suffix }}
IMAGE_REPO: ${{ env.AWS_ECR_REPO }}
run: |
ECR_REGION=$(echo "${{ env.AWS_ECR_REPO }}" | awk -F. '{print $4}')
aws ecr get-login-password --region $ECR_REGION | docker login --username AWS --password-stdin ${{env.AWS_ECR_REPO}}
mkdir logs
- name: "Compute Image Uri"
id: compute-image-uri
env:
TEST_DJL_VERSION: ${{ inputs.djl-version }}
OVERRIDE_IMAGE_TAG_SUFFIX: ${{ inputs.tag-suffix }}
IMAGE_REPO: ${{ env.AWS_ECR_REPO }}
run: |
repo=$IMAGE_REPO
container="lmi"
if [ -z "${TEST_DJL_VERSION}" ] || [ "${TEST_DJL_VERSION}" == "nightly" ]; then
flavor="${container}-nightly"
elif [ "${TEST_DJL_VERSION}" == "temp" ]; then
flavor="${container}-temp-${GITHUB_SHA}"
else
flavor="${container}-${TEST_DJL_VERSION}-${GITHUB_SHA}"
fi
# Override flavor if OVERRIDE_IMAGE_TAG_SUFFIX is set
if [ -n "${OVERRIDE_IMAGE_TAG_SUFFIX}" ]; then
flavor="${container}-${OVERRIDE_IMAGE_TAG_SUFFIX}"
fi
# Compute final image URL
image="${repo}:${flavor}"
echo $image
echo "TEST_IMAGE_URI=$image" >> $GITHUB_OUTPUT
- name: "Model Optimization Step"
working-directory: tests/integration
run: |
echo ${{ steps.compute-image-uri.outputs.TEST_IMAGE_URI}}
# Prepare
sudo rm -rf models
python3 llm/prepare.py ${{ matrix.test.test_handler }} ${{ matrix.test.test_model_config }}
./launch_container.sh ${{ steps.compute-image-uri.outputs.TEST_IMAGE_URI }} $PWD/models lmi sm_neo_context
- name: "Local Serving Test"
working-directory: tests/integration
run: |
# test inference
./launch_container.sh ${{ steps.compute-image-uri.outputs.TEST_IMAGE_URI }} $PWD/models/compiled lmi ${{ contains(matrix.test.test_model_config, 'multi-node') && 'multi_node' || '' }} serve
python3 llm/client.py ${{ matrix.test.test_handler }} ${{ matrix.test.test_serve_config }}
# clean up
docker rm -f $(docker ps -aq) || true
- name: "Fast Model Loading S3 test"
if: ${{ matrix.test.include_fast_model_loading_s3_test == 'true' }}
env:
RUN_NUMBER: ${{ github.run_number }}
working-directory: tests/integration
run: |
aws s3 sync $PWD/models/compiled s3://djl-scratch-001-gamma-us-west-2/github-workflows/$RUN_NUMBER/${{ matrix.test.test_model_config }}-${{ matrix.test.instance }}-tp2
sudo find "$PWD/models/compiled/" -maxdepth 1 -type d -name "sagemaker-fast-model-loader-*" -exec sudo rm -rf {} +
echo "SM_FAST_MODEL_LOADER_S3_URI=s3://djl-scratch-001-gamma-us-west-2/github-workflows/$RUN_NUMBER/${{ matrix.test.test_model_config }}-${{ matrix.test.instance }}-tp2" >> $PWD/docker_env
# test inference
./launch_container.sh $DJL_CONTAINER_REPO:$DJLSERVING_DOCKER_TAG $PWD/models/compiled lmi serve
python3 llm/client.py ${{ matrix.test.test_handler }} ${{ matrix.test.test_serve_config }}
# clean up
docker rm -f $(docker ps -aq) || true
sudo rm -rf $PWD/models
- name: On Failure
id: test-failure
if: ${{ failure() }}
working-directory: tests/integration
run: |
for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done
sudo rm -rf outputs && sudo rm -rf models
rm awscurl
./remove_container.sh
failure_prefix="${{ matrix.test.failure-prefix }}"
echo "failure_${failure_prefix}=1" >> "$GITHUB_OUTPUT"
sudo rm -rf $PWD/models
- name: Upload test logs
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: test-${{ matrix.test.test }}-logs
path: tests/integration/all_logs/
stop-runners:
if: always()
runs-on: [ self-hosted, scheduler ]
needs: [ create-optimization-runners, neo-test]
steps:
- name: Stop all instances
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
instance_id=${{ needs.create-optimization-runners.outputs.gpu_instance_id_1 }}
./stop_instance.sh $instance_id
instance_id=${{ needs.create-optimization-runners.outputs.gpu_instance_id_2 }}
./stop_instance.sh $instance_id
instance_id=${{ needs.create-optimization-runners.outputs.gpu_instance_id_3 }}
./stop_instance.sh $instance_id