[ci] Add integration tests for new large model architectures

ethnzhng · ethnzhng · commit fc7e0d53da09 · 2025-11-08T01:12:03.000Z
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
@@ -37,14 +37,18 @@ on:
     outputs:
       failure_cpu:
         value: ${{ jobs.test.outputs.failure_cpu || '0' }}
-      failure_gpu:
-        value: ${{ jobs.test.outputs.failure_gpu || '0' }}
+      failure_g6:
+        value: ${{ jobs.test.outputs.failure_g6 || '0' }}
       failure_aarch64:
         value: ${{ jobs.test.outputs.failure_aarch64 || '0' }}
       failure_lmi:
         value: ${{ jobs.test.outputs.failure_lmi || '0' }}
       failure_trtllm:
         value: ${{ jobs.test.outputs.failure_trtllm || '0' }}
+      failure_p4d:
+        value: ${{ jobs.test.outputs.failure_p4d || '0' }}
+      # failure_p4de:
+      #   value: ${{ jobs.test.outputs.failure_p4de || '0' }}
 
 
 permissions:
@@ -56,7 +60,7 @@ jobs:
     runs-on: [self-hosted, scheduler]
     steps:
       - name: Create new G6 instance
-        id: create_gpu
+        id: create_g6_1
         run: |
           cd /home/ubuntu/djl_benchmark_script/scripts
           token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
@@ -65,7 +69,7 @@ jobs:
           | jq '.token' | tr -d '"' )
           ./start_instance.sh action_g6 $token djl-serving
       - name: Create new G6 instance
-        id: create_gpu2
+        id: create_g6_2
         run: |
           cd /home/ubuntu/djl_benchmark_script/scripts
           token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
@@ -74,7 +78,7 @@ jobs:
           | jq '.token' | tr -d '"' )
           ./start_instance.sh action_g6 $token djl-serving
       - name: Create new G6 instance
-        id: create_gpu3
+        id: create_g6_3
         run: |
           cd /home/ubuntu/djl_benchmark_script/scripts
           token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
@@ -83,7 +87,7 @@ jobs:
           | jq '.token' | tr -d '"' )
           ./start_instance.sh action_g6 $token djl-serving
       - name: Create new G6 instance
-        id: create_gpu4
+        id: create_g6_4
         run: |
           cd /home/ubuntu/djl_benchmark_script/scripts
           token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
@@ -110,14 +114,33 @@ jobs:
           --fail \
           | jq '.token' | tr -d '"' )
           ./start_instance.sh action_cpu $token djl-serving
+      - name: Create new P4D instance
+        id: create_p4d_1
+        run: |
+          cd /home/ubuntu/djl_benchmark_script/scripts
+          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
+          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
+          --fail \
+          | jq '.token' | tr -d '"' )
+          ./start_instance.sh action_lmic_p4d $token djl-serving
+      # - name: Create new P4DE instance
+      #   id: create_p4de_1
+      #   run: |
+      #     cd /home/ubuntu/djl_benchmark_script/scripts
+      #     token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
+      #     https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
+      #     --fail \
+      #     | jq '.token' | tr -d '"' )
+      #     ./start_instance.sh action_lmic_p4de $token djl-serving
     outputs:
-      gpu_instance_id_1: ${{ steps.create_gpu.outputs.action_g6_instance_id }}
-      gpu_instance_id_2: ${{ steps.create_gpu2.outputs.action_g6_instance_id }}
-      gpu_instance_id_3: ${{ steps.create_gpu3.outputs.action_g6_instance_id }}
-      gpu_instance_id_4: ${{ steps.create_gpu4.outputs.action_g6_instance_id }}
+      g6_instance_id_1: ${{ steps.create_g6_1.outputs.action_g6_instance_id }}
+      g6_instance_id_2: ${{ steps.create_g6_2.outputs.action_g6_instance_id }}
+      g6_instance_id_3: ${{ steps.create_g6_3.outputs.action_g6_instance_id }}
+      g6_instance_id_4: ${{ steps.create_g6_4.outputs.action_g6_instance_id }}
       aarch64_instance_id: ${{ steps.create_aarch64.outputs.action_graviton_instance_id }}
-
       cpu_instance_id: ${{ steps.create_cpu.outputs.action_cpu_instance_id }}
+      p4d_instance_id_1: ${{ steps.create_p4d_1.outputs.action_lmic_p4d_instance_id }}
+      # p4de_instance_id_1: ${{ steps.create_p4de_1.outputs.action_lmic_p4de_instance_id }}
 
   test:
     runs-on:
@@ -139,57 +162,67 @@ jobs:
           - test: TestCpuBoth
             instance: cpu
             failure-prefix: cpu
-          - test: TestGpu
+          - test: TestGpu_g6
             instance: g6
-            failure-prefix: gpu
+            failure-prefix: g6
           - test: TestAarch64
             instance: aarch64
             failure-prefix: aarch64
-          # - test: TestHfHandler
+          # - test: TestHfHandler_g6
           #   instance: g6
           #   failure-prefix: lmi
-          # - test: TestTrtLlmHandler1
+          # - test: TestTrtLlmHandler1_g6
           #   instance: g6
           #   failure-prefix: trtllm
-          # - test: TestTrtLlmHandler2
+          # - test: TestTrtLlmHandler2_g6
           #   instance: g6
           #   failure-prefix: trtllm
-          - test: TestVllm1
+          - test: TestVllm1_g6
             instance: g6
             failure-prefix: lmi
-          - test: TestVllm2
+          - test: TestVllm2_g6
             instance: g6
             failure-prefix: lmi
-          - test: TestVllmCustomHandlers
+          - test: TestVllmCustomHandlers_g6
             instance: g6
             failure-prefix: lmi
-          - test: TestVllmCustomFormatters
+          - test: TestVllmCustomFormatters_g6
             instance: g6
             failure-prefix: lmi
-          - test: TestVllmLora
+          - test: TestVllmLora_g6
             instance: g6
             failure-prefix: lmi
-          - test: TestVllmAsyncLora
+          - test: TestVllmAsyncLora_g6
             instance: g6
             failure-prefix: lmi
-          - test: TestMultiModalVllm
+          - test: TestMultiModalVllm_g6
             instance: g6
             failure-prefix: lmi
-          # - test: TestTextEmbedding
+          # - test: TestTextEmbedding_g6
           #   instance: g6
           #   failure-prefix: lmi
-          # - test: TestCorrectnessTrtLlm
+          # - test: TestCorrectnessTrtLlm_g6
           #   instance: g6
           #   failure-prefix: trtllm
-          - test: TestStatefulModel
+          - test: TestStatefulModel_g6
             instance: g6
             failure-prefix: lmi
+          # P4D instance tests (failures surface through the failure_p4d output)
+          - test: TestVllm_p4d
+            instance: p4d
+            failure-prefix: p4d
+          # P4DE instance tests
+          # - test: TestVllm_p4de
+          #   instance: p4de
+          #   failure-prefix: lmi
     outputs:
       failure_cpu: ${{ steps.test-failure.outputs.failure_cpu }}
-      failure_gpu: ${{ steps.test-failure.outputs.failure_gpu }}
+      failure_g6: ${{ steps.test-failure.outputs.failure_g6 }}
       failure_aarch64: ${{ steps.test-failure.outputs.failure_aarch64 }}
       failure_lmi: ${{ steps.test-failure.outputs.failure_lmi }}
       failure_trtllm: ${{ steps.test-failure.outputs.failure_trtllm }}
+      failure_p4d: ${{ steps.test-failure.outputs.failure_p4d }}
+      # failure_p4de: ${{ steps.test-failure.outputs.failure_p4de }}
 
     steps:
       - uses: actions/checkout@v4
@@ -269,16 +302,19 @@ jobs:
       - name: Stop all instances
         run: |
           cd /home/ubuntu/djl_benchmark_script/scripts
-          instance_id=${{ needs.create-runners.outputs.gpu_instance_id_1 }}
+          instance_id=${{ needs.create-runners.outputs.g6_instance_id_1 }}
           ./stop_instance.sh $instance_id
-          instance_id=${{ needs.create-runners.outputs.gpu_instance_id_2 }}
+          instance_id=${{ needs.create-runners.outputs.g6_instance_id_2 }}
           ./stop_instance.sh $instance_id
-          instance_id=${{ needs.create-runners.outputs.gpu_instance_id_3 }}
+          instance_id=${{ needs.create-runners.outputs.g6_instance_id_3 }}
           ./stop_instance.sh $instance_id
-          instance_id=${{ needs.create-runners.outputs.gpu_instance_id_4 }}
+          instance_id=${{ needs.create-runners.outputs.g6_instance_id_4 }}
           ./stop_instance.sh $instance_id
           instance_id=${{ needs.create-runners.outputs.aarch64_instance_id }}
           ./stop_instance.sh $instance_id
-
           instance_id=${{ needs.create-runners.outputs.cpu_instance_id }}
           ./stop_instance.sh $instance_id
+          instance_id=${{ needs.create-runners.outputs.p4d_instance_id_1 }}
+          ./stop_instance.sh $instance_id
+          # instance_id=${{ needs.create-runners.outputs.p4de_instance_id_1 }}
+          # ./stop_instance.sh $instance_id
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
@@ -229,6 +229,16 @@ def get_model_name():
         "seq_length": [25],
         "tokenizer": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
     },
+    "llama-4-scout-17b-16e-instruct": {
+        "batch_size": [1, 2],
+        "seq_length": [256],
+        "tokenizer": "unsloth/Llama-4-Scout-17B-16E-Instruct",
+    },
+    "minimax-m2": {
+        "batch_size": [1, 2],
+        "seq_length": [256],
+        "tokenizer": "MiniMaxAI/MiniMax-M2",
+    },
 }
 
 vllm_neo_model_spec = {
@@ -522,6 +532,10 @@ def get_model_name():
     "llama32-11b-multimodal": {
         "batch_size": [1],
     },
+    "qwen3-vl-32b-instruct": {
+        "batch_size": [1, 2],
+        "tokenizer": "Qwen/Qwen3-VL-32B-Instruct",
+    },
 }
 
 text_embedding_model_spec = {
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
@@ -442,6 +442,32 @@
         "option.max_rolling_batch_size": "1",
         "option.enforce_eager": True,
     },
+    "qwen3-vl-32b-instruct": {
+        "option.model_id": "s3://djl-llm/Qwen3-VL-32B-Instruct/",
+        "option.task": "text-generation",
+        "option.tensor_parallel_degree": 8,
+        "option.max_rolling_batch_size": 4,
+        "option.trust_remote_code": True,
+        "option.limit_mm_per_prompt": '{"image": 4, "video": 0}',
+    },
+    "minimax-m2": {
+        "option.model_id": "s3://djl-llm/MiniMax-M2/",
+        "option.task": "text-generation",
+        "option.tensor_parallel_degree": 8,
+        "option.max_rolling_batch_size": 4,
+        "option.trust_remote_code": True,
+        "option.max_model_len": 16384,
+        "option.gpu_memory_utilization": "0.9",
+    },
+    "llama-4-scout-17b-16e-instruct": {
+        "option.model_id": "s3://djl-llm/Llama-4-Scout-17B-16E-Instruct/",
+        "option.task": "text-generation",
+        "option.tensor_parallel_degree": 8,
+        "option.max_rolling_batch_size": 4,
+        "option.trust_remote_code": True,
+        "option.max_model_len": 16384,
+        "option.gpu_memory_utilization": "0.9",
+    },
 }
 
 vllm_neo_model_list = {
diff --git a/tests/integration/pytest.ini b/tests/integration/pytest.ini
@@ -2,8 +2,8 @@
 log_cli = true
 log_cli_level = INFO
 markers =
-    gpu: Runs on any gpu machine
-    gpu_4: Runs on a machine with at least 4 gpus (includes gpu mark)
+    gpu_4: Runs on a machine with 4 gpus
+    gpu_8: Runs on a machine with 8 gpus
 
     aarch64: Runs on aarch64
     cpu: Tests cpu
diff --git a/tests/integration/tests.py b/tests/integration/tests.py