
Commit 91bceb8

Commit message: updating tests
1 parent: 2aee1fe

2 files changed: +17, -44 lines


src/c++/perf_analyzer/genai-perf/genai_perf/parser.py
Lines changed: 3 additions & 2 deletions

@@ -50,6 +50,7 @@
     "chat": "v1/chat/completions",
     "completions": "v1/completions",
     "generate": "v2/models/{MODEL_NAME}/generate",
+    "kserve": "v2/models/{MODEL_NAME}/infer",
 }


@@ -348,10 +349,10 @@ def _add_endpoint_args(parser):
     endpoint_group.add_argument(
         "--backend",
         type=str,
-        choices=utils.get_enum_names(OutputFormat)[2:],
+        choices=["tensorrtllm", "vllm"],
         default="tensorrtllm",
         required=False,
-        help=f'When using the "triton" service-kind, '
+        help=f'When using the "kserve" endpoint type, '
         "this is the backend of the model. "
         "For the TENSORRT-LLM backend, you currently must set "
         "'exclude_input_in_output' to true in the model config to "

src/c++/perf_analyzer/genai-perf/tests/test_cli.py
Lines changed: 14 additions & 42 deletions

@@ -72,19 +72,17 @@ def test_help_version_arguments_output_and_exit(
         [
             (["--concurrency", "3"], {"concurrency": 3}),
             (
-                ["--endpoint-type", "completions", "--service-kind", "openai"],
+                ["--endpoint-type", "completions"],
                 {"endpoint": "v1/completions"},
             ),
             (
-                ["--endpoint-type", "chat", "--service-kind", "openai"],
+                ["--endpoint-type", "chat"],
                 {"endpoint": "v1/chat/completions"},
             ),
             (
                 [
                     "--endpoint-type",
                     "chat",
-                    "--service-kind",
-                    "openai",
                     "--endpoint",
                     "custom/address",
                 ],
@@ -94,8 +92,6 @@ def test_help_version_arguments_output_and_exit(
                 [
                     "--endpoint-type",
                     "chat",
-                    "--service-kind",
-                    "openai",
                     "--endpoint",
                     " /custom/address",
                 ],
@@ -105,8 +101,6 @@ def test_help_version_arguments_output_and_exit(
                 [
                     "--endpoint-type",
                     "completions",
-                    "--service-kind",
-                    "openai",
                     "--endpoint",
                     "custom/address",
                 ],
@@ -159,9 +153,9 @@ def test_help_version_arguments_output_and_exit(
             ),
             (["--random-seed", "8"], {"random_seed": 8}),
             (["--request-rate", "9.0"], {"request_rate": 9.0}),
-            (["--service-kind", "triton"], {"service_kind": "triton"}),
+            (["--endpoint-type", "kserve"], {"service_kind": "triton"}),
             (
-                ["--service-kind", "openai", "--endpoint-type", "chat"],
+                ["--endpoint-type", "chat"],
                 {"service_kind": "openai", "endpoint": "v1/chat/completions"},
             ),
             (["--stability-percentage", "99.5"], {"stability_percentage": 99.5}),
@@ -209,25 +203,25 @@ def test_file_flags_parsed(self, monkeypatch, mocker):
         "arg, expected_path",
         [
             (
-                ["--service-kind", "openai", "--endpoint-type", "chat"],
+                ["--endpoint-type", "chat"],
                 "artifacts/test_model-openai-chat-concurrency1",
             ),
             (
-                ["--service-kind", "openai", "--endpoint-type", "completions"],
+                ["--endpoint-type", "completions"],
                 "artifacts/test_model-openai-completions-concurrency1",
             ),
             (
-                ["--service-kind", "triton", "--backend", "tensorrtllm"],
+                ["--endpoint-type", "kserve", "--backend", "tensorrtllm"],
                 "artifacts/test_model-triton-tensorrtllm-concurrency1",
             ),
             (
-                ["--service-kind", "triton", "--backend", "vllm"],
+                ["--endpoint-type", "kserve", "--backend", "vllm"],
                 "artifacts/test_model-triton-vllm-concurrency1",
             ),
             (
                 [
-                    "--service-kind",
-                    "triton",
+                    "--endpoint-type",
+                    "kserve",
                     "--backend",
                     "vllm",
                     "--concurrency",
@@ -263,8 +257,6 @@ def test_default_profile_export_filepath(
             [
                 "--model",
                 "hello/world/test_model",
-                "--service-kind",
-                "openai",
                 "--endpoint-type",
                 "chat",
             ],
@@ -351,22 +343,6 @@ def test_unrecognized_arg(self, monkeypatch, capsys):
     @pytest.mark.parametrize(
         "args, expected_output",
         [
-            (
-                ["genai-perf", "-m", "test_model", "--service-kind", "openai"],
-                "The --endpoint-type option is required when using the 'openai' service-kind.",
-            ),
-            (
-                [
-                    "genai-perf",
-                    "-m",
-                    "test_model",
-                    "--service-kind",
-                    "openai",
-                    "--endpoint",
-                    "custom/address",
-                ],
-                "The --endpoint-type option is required when using the 'openai' service-kind.",
-            ),
             (
                 ["genai-perf", "-m", "test_model", "--output-tokens-stddev", "5"],
                 "The --output-tokens-mean option is required when using --output-tokens-stddev.",
@@ -394,8 +370,6 @@ def test_unrecognized_arg(self, monkeypatch, capsys):
                     "genai-perf",
                     "-m",
                     "test_model",
-                    "--service-kind",
-                    "openai",
                     "--endpoint-type",
                     "chat",
                     "--output-tokens-mean",
@@ -420,17 +394,15 @@ def test_conditional_errors(self, args, expected_output, monkeypatch, capsys):
         "args, expected_format",
         [
             (
-                ["--service-kind", "openai", "--endpoint-type", "chat"],
+                ["--endpoint-type", "chat"],
                 OutputFormat.OPENAI_CHAT_COMPLETIONS,
             ),
             (
-                ["--service-kind", "openai", "--endpoint-type", "completions"],
+                ["--endpoint-type", "completions"],
                 OutputFormat.OPENAI_COMPLETIONS,
             ),
             (
                 [
-                    "--service-kind",
-                    "openai",
                     "--endpoint-type",
                     "completions",
                     "--endpoint",
@@ -439,10 +411,10 @@ def test_conditional_errors(self, args, expected_output, monkeypatch, capsys):
                 OutputFormat.OPENAI_COMPLETIONS,
             ),
             (
-                ["--service-kind", "triton", "--backend", "tensorrtllm"],
+                ["--endpoint-type", "kserve", "--backend", "tensorrtllm"],
                 OutputFormat.TENSORRTLLM,
             ),
-            (["--service-kind", "triton", "--backend", "vllm"], OutputFormat.VLLM),
+            (["--endpoint-type", "kserve", "--backend", "vllm"], OutputFormat.VLLM),
         ],
     )
     def test_inferred_output_format(self, monkeypatch, args, expected_format):
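Taken together, the test updates show the migration path for callers: drop --service-kind and select the Triton path with --endpoint-type kserve instead. An illustrative before/after in the argv-list style the tests themselves use (flags taken from the cases above; the exact command is illustrative, not prescribed by the commit):

# Before this commit: service kind passed explicitly.
["genai-perf", "-m", "test_model", "--service-kind", "triton", "--backend", "vllm"]

# After this commit: service kind is inferred from the endpoint type.
["genai-perf", "-m", "test_model", "--endpoint-type", "kserve", "--backend", "vllm"]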
