triton-inference-server
diff --git a/‎ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt‎
Lines changed: 33 additions & 0 deletions b/‎ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎ci/L0_backend_vllm/vllm_backend/test.sh‎
Lines changed: 10 additions & 6 deletions b/‎ci/L0_backend_vllm/vllm_backend/test.sh‎
Lines changed: 10 additions & 6 deletions
diff --git a/‎ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py‎
Lines changed: 7 additions & 3 deletions b/‎ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎samples/basic_model/client.py‎ renamed to ‎samples/client.py‎ b/‎samples/basic_model/client.py‎ renamed to ‎samples/client.py‎
diff --git a/‎samples/ensemble_model/README.md‎
Lines changed: 0 additions & 18 deletions b/‎samples/ensemble_model/README.md‎
Lines changed: 0 additions & 18 deletions
diff --git a/‎samples/ensemble_model/client.py‎ b/‎samples/ensemble_model/client.py‎
diff --git a/‎samples/ensemble_model/model_repository/ensemble_model/config.pbtxt‎
Lines changed: 0 additions & 57 deletions b/‎samples/ensemble_model/model_repository/ensemble_model/config.pbtxt‎
Lines changed: 0 additions & 57 deletions
diff --git a/‎samples/ensemble_model/model_repository/gpt2/1/model.json‎
Lines changed: 0 additions & 5 deletions b/‎samples/ensemble_model/model_repository/gpt2/1/model.json‎
Lines changed: 0 additions & 5 deletions
diff --git a/‎samples/ensemble_model/model_repository/gpt2/config.pbtxt‎
Lines changed: 0 additions & 2 deletions b/‎samples/ensemble_model/model_repository/gpt2/config.pbtxt‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎samples/ensemble_model/model_repository/prefix_model/1/model.py‎
Lines changed: 0 additions & 18 deletions b/‎samples/ensemble_model/model_repository/prefix_model/1/model.py‎
Lines changed: 0 additions & 18 deletions
@@ -0,0 +1,33 @@
+name: "ensemble_model"
+platform: "ensemble"
+max_batch_size: 1
+input [
+  {
+    name: "text_input"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  }
+]
+output [
+  {
+    name: "text_output"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  }
+]
+ensemble_scheduling {
+  step [  # single-step pipeline: text_input goes to the vLLM model, its text_output is returned
+    {
+      model_name: "vllm_opt"  # test.sh installs the vLLM sample model as models/vllm_opt
+      model_version: -1
+      input_map {
+        key: "text_input"
+        value: "text_input"
+      }
+      output_map {
+        key: "text_output"
+        value: "text_output"
+      }
+    }
+  ]
+}
@@ -35,8 +35,7 @@ SERVER_LOG="./vllm_backend_server.log"
 CLIENT_LOG="./vllm_backend_client.log"
 TEST_RESULT_FILE='test_results.txt'
 CLIENT_PY="./vllm_backend_test.py"
-SAMPLE_BASIC_MODELS_REPO="../../../samples/basic_model/model_repository"
-SAMPLE_ENSEMBLE_MODELS_REPO="../../../samples/ensemble_model/model_repository"
+SAMPLE_MODELS_REPO="../../../samples/model_repository"
 EXPECTED_NUM_TESTS=6
 
 # Helpers =======================================
@@ -50,7 +49,7 @@ function assert_curl_success {
 }
 
 rm -rf models && mkdir -p models
-cp -r ${SAMPLE_BASIC_MODELS_REPO}/vllm_model models/vllm_opt
+cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt
 # `vllm_opt` model will be loaded on server start and stay loaded throughout
 # unittesting. To test vllm model load/unload we use a dedicated
 # `vllm_load_test`. To ensure that vllm's memory profiler will not error out
@@ -64,13 +63,18 @@ wget -P models/add_sub/1/ https://raw.githubusercontent.com/triton-inference-ser
 wget -P models/add_sub https://raw.githubusercontent.com/triton-inference-server/python_backend/main/examples/add_sub/config.pbtxt
 
 # Invalid model attribute
-cp -r ${SAMPLE_BASIC_MODELS_REPO}/vllm_model models/vllm_invalid_1/
+cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_invalid_1/
 sed -i 's/"disable_log_requests"/"invalid_attribute"/' models/vllm_invalid_1/1/model.json
 
 # Invalid model name
-cp -r ${SAMPLE_BASIC_MODELS_REPO}/vllm_model models/vllm_invalid_2/
+cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_invalid_2/
 sed -i 's/"facebook\/opt-125m"/"invalid_model"/' models/vllm_invalid_2/1/model.json
 
+# Ensemble model: an empty version directory plus the config kept next to this script
+mkdir -p models/ensemble_model/1
+
+cp -r ensemble_config.pbtxt models/ensemble_model/config.pbtxt
+
 RET=0
 
 run_server
@@ -167,4 +171,4 @@ fi
 
 collect_artifacts_from_subdir
 
-exit $RET
+exit $RET
@@ -48,6 +48,7 @@ def setUp(self):
         self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
         self.vllm_model_name = "vllm_opt"
         self.python_model_name = "add_sub"
+        self.enseble_model_name = "ensemble_model"
         self.vllm_load_test = "vllm_load_test"
 
     def test_vllm_triton_backend(self):
@@ -163,6 +164,12 @@ def test_exclude_input_in_output_true(self):
             expected_output=expected_output,
         )
 
+    def test_ensemble_model(self):
+        # Test to ensure that ensemble models are supported in vllm container.
+        # If ensemble support not present, triton will error out at model loading stage.
+        self.triton_client.load_model(self.enseble_model_name)
+        self.assertTrue(self.triton_client.is_model_ready(self.enseble_model_name))
+
     def _test_vllm_model(
         self,
         prompts,
@@ -247,9 +254,6 @@ def _test_python_model(self):
             np.allclose(input0_data - input1_data, response.as_numpy("OUTPUT1"))
         )
 
-    def _test_ensemble_model(self):
-        pass
-
     def tearDown(self):
         self.triton_client.close()