
Commit 8298d83

nv-braf and root authored

Merge LLM Metrics changes to main (#855)
* New Records for LLM metrics (#839)
  * Adding new LLM metrics
  * Adding base class for perf, inter_token, and time_to_first latency records
* Add --llm-mode option (#842)
  * Adding CLI hook for LLM
  * Changing to use --model-type
* Capture LLM metrics from genai-perf in MA (#844)
  * Successfully reading from LLM CSV
  * General cleanup
  * All unit tests passing
  * Fixing metric table typos
  * Fixing typos
* Update constraints for LLMs (#845)
  * Adding LLM values to list of possible constraints
  * Fixing typo
* Adding new output fields for LLM (#846)
* Profiling model using genai-perf (#849)
  * Initial changes to run genai-perf in MA
  * Gating call to get LLM records
  * Fixing capitalization issue
  * Removing debug
  * Adding TODO

  Co-authored-by: root <[email protected]>

* Add genai_perf CLI options to MA (#854)
  * Added support for genai_perf CLI
  * Remove dead code
  * Removing genai_perf collateral
  * Fixing codeQL issue
  * Adding streaming to genai_perf_config

  Co-authored-by: root <[email protected]>
1 parent 792f2a4 commit 8298d83


41 files changed: +2329 −285 lines (large commit; only some of the changed files are shown below)

docs/config.md

Lines changed: 14 additions & 8 deletions

@@ -236,6 +236,9 @@ cpu_only_composing_models: <comma-delimited-string-list>
 # Skips the generation of detailed reports and tables
 [ skip_detailed_reports: <bool> | default: false]

+# Type of model being profiled: generic or LLM
+[ model_type: <string> | default: generic]
+
 # Number of top configs to show in summary plots
 [ num_configs_per_model: <int> | default: 3]

@@ -364,14 +367,17 @@ Before proceeding, it will be helpful to see the documentation on [Model Analyze

 ### `<constraint>`

-A constraint, specifies the bounds that determine a successful run. There are
-three constraints allowed:
-
-| Option Name        | Units     | Constraint | Description                                          |
-| :----------------- | :-------: | :--------: | :--------------------------------------------------- |
-| `perf_throughput`  | inf / sec | min        | Specify minimum desired throughput.                  |
-| `perf_latency_p99` | ms        | max        | Specify maximum tolerable latency or latency budget. |
-| `gpu_used_memory`  | MB        | max        | Specify maximum GPU memory used by model.            |
+A constraint specifies the bounds that determine a successful run. The table below shows examples
+of the types of constraints allowed:
+
+| Option Name                | Units     | Constraint | Description                                             |
+| :------------------------- | :-------: | :--------: | :------------------------------------------------------ |
+| `perf_throughput`          | inf / sec | min        | Specify minimum desired throughput.                     |
+| `perf_latency_p99`         | ms        | max        | Specify maximum tolerable latency or latency budget.    |
+| `output_token_throughput`  | tok / sec | min        | Specify minimum desired output token throughput.        |
+| `inter_token_latency_p99`  | ms        | max        | Specify maximum tolerable inter-token latency.          |
+| `time_to_first_token_p99`  | ms        | max        | Specify maximum tolerable time to first token latency.  |
+| `gpu_used_memory`          | MB        | max        | Specify maximum GPU memory used by model.               |

 <br>
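For example, a profile config exercising the new LLM constraints might look like the following sketch; the model name and the threshold values are illustrative, not defaults from this commit:

model_repository: /path/to/model_repository
model_type: LLM
profile_models:
  - my_llm_model
constraints:
  output_token_throughput:
    min: 100
  time_to_first_token_p99:
    max: 200
  inter_token_latency_p99:
    max: 40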

model_analyzer/config/generate/brute_run_config_generator.py

Lines changed: 1 addition & 1 deletion

@@ -129,7 +129,7 @@ def _generate_subset(
         self._send_results_to_generator(index)

     def _make_run_config(self) -> RunConfig:
-        run_config = RunConfig(self._triton_env)
+        run_config = RunConfig(self._triton_env, self._models[0].genai_perf_flags())
         for index in range(len(self._models)):
             run_config.add_model_run_config(self._curr_model_run_configs[index])
         return run_config
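Note that in a multi-model brute search only the first model's genai-perf flags are forwarded to the RunConfig. A minimal sketch of the new call, assuming a flags dict whose only confirmed key is "streaming" (named in this commit's PR notes; the full key set lives in the hidden genai_perf_config.py):

# Sketch: RunConfig now carries genai-perf flags taken from the first profiled model.
flags = {"streaming": "True"}  # assumed key; values are strings per the config schema
run_config = RunConfig(triton_env={}, genai_perf_flags=flags)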

model_analyzer/config/input/config_command_profile.py

Lines changed: 131 additions & 0 deletions

@@ -31,6 +31,7 @@
 )
 from model_analyzer.constants import LOGGER_NAME
 from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
+from model_analyzer.perf_analyzer.genai_perf_config import GenaiPerfConfig
 from model_analyzer.perf_analyzer.perf_config import PerfAnalyzerConfig
 from model_analyzer.record.record import RecordType
 from model_analyzer.triton.server.server_config import TritonServerConfig

@@ -50,7 +51,9 @@
     DEFAULT_GPU_OUTPUT_FIELDS,
     DEFAULT_GPUS,
     DEFAULT_INFERENCE_OUTPUT_FIELDS,
+    DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS,
     DEFAULT_MAX_RETRIES,
+    DEFAULT_MODEL_TYPE,
     DEFAULT_MODEL_WEIGHTING,
     DEFAULT_MONITORING_INTERVAL,
     DEFAULT_NUM_CONFIGS_PER_MODEL,

@@ -297,6 +300,15 @@ def _fill_config(self):
                 description="Skips the generation of detailed summary reports and tables.",
             )
         )
+        self._add_config(
+            ConfigField(
+                "model_type",
+                flags=["--model-type"],
+                field_type=ConfigPrimitive(str),
+                default_value=DEFAULT_MODEL_TYPE,
+                description="Type of model being profiled: generic or LLM",
+            )
+        )

         self._add_repository_configs()
         self._add_client_configs()

@@ -364,6 +376,10 @@ def _add_profile_models_configs(self):
             }
         )

+        genai_perf_flags_scheme = ConfigObject(
+            schema={k: ConfigPrimitive(str) for k in GenaiPerfConfig.allowed_keys()}
+        )
+
        triton_server_environment_scheme = ConfigObject(
            schema={"*": ConfigPrimitive(str)}
        )

@@ -444,6 +460,13 @@ def _add_profile_models_configs(self):
                 description="Allows custom configuration of the perf analyzer instances used by model analyzer.",
             )
         )
+        self._add_config(
+            ConfigField(
+                "genai_perf_flags",
+                field_type=genai_perf_flags_scheme,
+                description="Allows custom configuration of the GenAI Perf instances used by model analyzer.",
+            )
+        )
         self._add_config(
             ConfigField(
                 "triton_server_flags",

@@ -484,6 +507,11 @@ def _add_profile_models_configs(self):
                         "min": ConfigPrimitive(int),
                     }
                 ),
+                "output_token_throughput": ConfigObject(
+                    schema={
+                        "min": ConfigPrimitive(int),
+                    }
+                ),
                 "perf_latency_avg": ConfigObject(
                     schema={
                         "max": ConfigPrimitive(int),

@@ -514,6 +542,96 @@ def _add_profile_models_configs(self):
                         "max": ConfigPrimitive(int),
                     }
                 ),
+                "inter_token_latency_p99": ConfigObject(
+                    schema={
+                        "max": ConfigPrimitive(int),
+                    }
+                ),
+                "inter_token_latency_p95": ConfigObject(
+                    schema={
+                        "max": ConfigPrimitive(int),
+                    }
+                ),
+                "inter_token_latency_p90": ConfigObject(
+                    schema={
+                        "max": ConfigPrimitive(int),
+                    }
+                ),
+                "inter_token_latency_p75": ConfigObject(
+                    schema={
+                        "max": ConfigPrimitive(int),
+                    }
+                ),
+                "inter_token_latency_p50": ConfigObject(
+                    schema={
+                        "max": ConfigPrimitive(int),
+                    }
+                ),
+                "inter_token_latency_p25": ConfigObject(
+                    schema={
+                        "max": ConfigPrimitive(int),
+                    }
+                ),
+                "inter_token_latency_min": ConfigObject(
+                    schema={
+                        "max": ConfigPrimitive(int),
+                    }
+                ),
+                "inter_token_latency_max": ConfigObject(
+                    schema={
+                        "max": ConfigPrimitive(int),
+                    }
+                ),
+                "inter_token_latency_avg": ConfigObject(
+                    schema={
+                        "max": ConfigPrimitive(int),
+                    }
+                ),
+                "time_to_first_token_p99": ConfigObject(
+                    schema={
+                        "max": ConfigPrimitive(int),
+                    }
+                ),
+                "time_to_first_token_p95": ConfigObject(
+                    schema={
+                        "max": ConfigPrimitive(int),
+                    }
+                ),
+                "time_to_first_token_p90": ConfigObject(
+                    schema={
+                        "max": ConfigPrimitive(int),
+                    }
+                ),
+                "time_to_first_token_p75": ConfigObject(
+                    schema={
+                        "max": ConfigPrimitive(int),
+                    }
+                ),
+                "time_to_first_token_p50": ConfigObject(
+                    schema={
+                        "max": ConfigPrimitive(int),
+                    }
+                ),
+                "time_to_first_token_p25": ConfigObject(
+                    schema={
+                        "max": ConfigPrimitive(int),
+                    }
+                ),
+                "time_to_first_token_min": ConfigObject(
+                    schema={
+                        "max": ConfigPrimitive(int),
+                    }
+                ),
+                "time_to_first_token_max": ConfigObject(
+                    schema={
+                        "max": ConfigPrimitive(int),
+                    }
+                ),
+                "time_to_first_token_avg": ConfigObject(
+                    schema={
+                        "max": ConfigPrimitive(int),
+                    }
+                ),
             }
         )
         self._add_config(

@@ -560,6 +678,7 @@ def _add_profile_models_configs(self):
                 "weighting": ConfigPrimitive(type_=int),
                 "model_config_parameters": model_config_fields,
                 "perf_analyzer_flags": perf_analyzer_flags_scheme,
+                "genai_perf_flags": genai_perf_flags_scheme,
                 "triton_server_flags": triton_server_flags_scheme,
                 "triton_server_environment": triton_server_environment_scheme,
                 "triton_docker_args": triton_docker_args_scheme,

@@ -1344,6 +1463,12 @@ def _autofill_values(self):
         if not self._fields["gpu_output_fields"].is_set_by_user():
             self.gpu_output_fields = DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS

+        # Switch default output fields if user specifies model type of LLM
+        # and the user didn't specify a custom output field
+        if self.model_type == "LLM":
+            if not self._fields["inference_output_fields"].is_set_by_user():
+                self.inference_output_fields = DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS
+
         new_profile_models = {}
         for i, model in enumerate(self.profile_models):
             new_model = {"cpu_only": (model.cpu_only() or cpu_only)}

@@ -1447,6 +1572,12 @@ def _autofill_values(self):
             else:
                 new_model["perf_analyzer_flags"] = model.perf_analyzer_flags()

+            # GenAI Perf flags
+            if not model.genai_perf_flags():
+                new_model["genai_perf_flags"] = self.genai_perf_flags
+            else:
+                new_model["genai_perf_flags"] = model.genai_perf_flags()
+
             # triton server flags
             if not model.triton_server_flags():
                 new_model["triton_server_flags"] = self.triton_server_flags

model_analyzer/config/input/config_defaults.py

Lines changed: 15 additions & 0 deletions

@@ -68,6 +68,7 @@
 DEFAULT_PERF_OUTPUT_FLAG = False
 DEFAULT_PERF_MAX_AUTO_ADJUSTS = 10
 DEFAULT_MEASUREMENT_MODE = "count_windows"
+DEFAULT_MODEL_TYPE = "generic"

 DEFAULT_ONLINE_PLOTS = {
     "throughput_v_latency": {

@@ -118,6 +119,20 @@
     "perf_throughput",
     "perf_latency_p99",
 ]
+DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS = [
+    "model_name",
+    "batch_size",
+    "concurrency",
+    "model_config_path",
+    "instance_group",
+    "max_batch_size",
+    "satisfies_constraints",
+    "perf_throughput",
+    "perf_latency_p99",
+    "inter_token_latency_p99",
+    "time_to_first_token_p99",
+    "output_token_throughput",
+]
 DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS = [
     "model_name",
     "batch_size",

model_analyzer/config/input/objects/config_model_profile_spec.py

Lines changed: 18 additions & 0 deletions

@@ -33,6 +33,7 @@ def __init__(
         parameters=None,
         model_config_parameters=None,
         perf_analyzer_flags=None,
+        genai_perf_flags=None,
         triton_server_flags=None,
         triton_server_environment=None,
         triton_docker_args=None,

@@ -58,6 +59,9 @@ def __init__(
         perf_analyzer_flags : dict
             The custom perf analyzer configuration
             for this model
+        genai_perf_flags : dict
+            The custom GenAI perf configuration
+            for this model
         triton_server_flags : dict
             The configuration for the triton server instance launched
             for this model

@@ -78,6 +82,7 @@ def __init__(
         self._parameters = parameters
         self._model_config_parameters = model_config_parameters
         self._perf_analyzer_flags = perf_analyzer_flags
+        self._genai_perf_flags = genai_perf_flags
         self._triton_server_flags = triton_server_flags
         self._triton_server_environment = triton_server_environment
         self._triton_docker_args = triton_docker_args

@@ -162,6 +167,16 @@ def perf_analyzer_flags(self):

         return self._perf_analyzer_flags

+    def genai_perf_flags(self):
+        """
+        Returns
+        -------
+        dict:
+            the genai_perf_flags
+        """
+
+        return self._genai_perf_flags
+
     def triton_server_flags(self):
         """
         Returns

@@ -304,4 +319,7 @@ def __repr__(self):
         if self._perf_analyzer_flags:
             model_object["perf_analyzer_flags"] = self._perf_analyzer_flags

+        if self._genai_perf_flags:
+            model_object["genai_perf_flags"] = self._genai_perf_flags
+
         return str(model_object)
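A small usage sketch of the new accessor; the positional parameters that precede parameters=None are cut off in this diff, so the model-name first argument is an assumption:

# Hypothetical construction; only the genai_perf_flags keyword is confirmed by this diff.
spec = ConfigModelProfileSpec(
    "my_llm_model",
    genai_perf_flags={"streaming": "True"},  # illustrative flag
)
print(spec.genai_perf_flags())  # {'streaming': 'True'}
print(spec)  # __repr__ now includes genai_perf_flags when it is set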

model_analyzer/config/run/run_config.py

Lines changed: 10 additions & 1 deletion

@@ -17,6 +17,7 @@
 from typing import List

 from model_analyzer.config.run.model_run_config import ModelRunConfig
+from model_analyzer.perf_analyzer.genai_perf_config import GenaiPerfConfig


 class RunConfig:

@@ -25,16 +26,21 @@ class RunConfig:
     at the same time in Perf Analyzer
     """

-    def __init__(self, triton_env):
+    def __init__(self, triton_env, genai_perf_flags=None):
         """
         Parameters
         ----------
         triton_env : dict
             A dictionary of environment variables to set
             when launching tritonserver
+
+        genai_perf_flags: dict
+            The set of flags used when calling genai_perf for LLM models
         """

         self._triton_env = triton_env
+        self._genai_perf_config = GenaiPerfConfig()
+        self._genai_perf_config.update_config(genai_perf_flags)
         self._model_run_configs: List[ModelRunConfig] = []

     def add_model_run_config(self, model_run_config):

@@ -103,6 +109,9 @@ def triton_environment(self):

         return self._triton_env

+    def genai_perf_config(self):
+        return self._genai_perf_config
+
     def models_name(self):
         """Returns a single comma-joined name of the original model names"""
         return ",".join([mrc.model_name() for mrc in self.model_run_configs()])

model_analyzer/constants.py

Lines changed: 4 additions & 0 deletions

@@ -70,3 +70,7 @@

 # Model analyzer package name
 PACKAGE_NAME = "triton-model-analyzer"
+
+# GENAI-PERF
+GENAI_PERF_CSV = "profile_export_genai_perf.csv"
+GENAI_PERF_COLLATERAL = ["llm_inputs.json", "profile_export.json"]
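The names and the PR note "Removing genai_perf collateral" suggest how these constants are used: genai-perf exports its metrics to the CSV, which MA parses, while the collateral files are cleanup targets. A sketch of that assumed flow (the results_dir path is illustrative, not taken from this commit):

import csv
import os

results_dir = "/tmp/ma_results"  # illustrative path

# Read the LLM metrics genai-perf exported for this run.
with open(os.path.join(results_dir, GENAI_PERF_CSV), newline="") as f:
    for row in csv.reader(f):
        print(row)

# Remove the side files genai-perf leaves behind.
for name in GENAI_PERF_COLLATERAL:
    path = os.path.join(results_dir, name)
    if os.path.exists(path):
        os.remove(path)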
