
Commit f15427e
nv-brafmc-nvnv-hwoo authored

Add MVP LLM support to MA (#783)
* Adding new options for LLM (#768)
  * Update README and versions for 23.09 branch (#761) (#767)
  * Adding new options for LLM
  * Fixing CodeQL issues
* Add LLM support to Brute Search (#769)
  * Initial coding complete; first unit test passing
  * Adding test for prompt length
  * Refactor PACG methods, plus further refactoring
  * Ensure early exit isn't enabled for LLM models
  * Fix type-checking errors
  * Attempt at fixing CodeQL issue, reverted (commit 2619b83), followed by a second attempt
  * Adding deepcopy back in, then removing it in an attempt to fix CodeQL errors
  * Apply review suggestions to config_command_profile.py and perf_analyzer_config_generator.py
  * Moving location of method
  * Changing parameter to inference load
  * Changing prompt length to text input length
  * Changing max_tokens to use request-parameter
  * Fix input-data typo
  * Changing non-parameter to parameter
* New LLM record types (#770)
  * New measurement fields created
  * Fixing omission in llm_metric_table
  * Changing name to be avg_token_to_token...
* New config options based on live run (#775)
  * Added new config options and modified existing options
  * Refactoring model parameter setting
  * Removing magic numbers
* Capture LLM metrics from PA (#774)
  * Initial code for aggregation of new LLM metrics
  * New measurement fields created
  * Fixing PA unit tests
  * Adding hooks in metrics to capture new LLM fields
  * Fixing CodeQL and type-checking errors
  * Changes needed post-merge from other branches
  * Revert naming mistake (due to merge)
  * Changes uncovered during live testing
  * Fixes based on hwoo review; fixing typo
  * Change to use lists and mean()
* Correct how periodic concurrency works in PACG (#777)
  * Created a new class ConfigRangeNumeric and used it for periodic-concurrency
  * Fixes and defaults for periodic concurrency
  * PACG changes complete; unit tests updated and passing
  * Removing unneeded class
  * Fixing CodeQL and hwoo's review suggestions
  * Adding missing else
* LLM testing live run (#778)
  * Changes to fix live run
  * Minor refactor and cleanup
  * Removing JSON files
  * Changing to use f-string
  * More cleanup from hwoo CR
  * Removing stale code for request period
  * Fix nit
* Changes to get LLM summary reports working (#779)
  * Addressing hwoo's CR
* Adding illegal LLM checks w/ unit testing + some minor cleanup (#781)
  * Updated with TMA
* Misc LLM cleanup (#782)
  * General cleanup
  * Add ticket nums to todos
  * Fix for breaking bug introduced for non-LLM models
  * Summary table in progress

Co-authored-by: Misha Chornyi <[email protected]>
Co-authored-by: Hyunjae Woo <[email protected]>
1 parent 32389de commit f15427e

32 files changed: +1828 −462 lines

model_analyzer/analyzer.py

Lines changed: 8 additions & 2 deletions
@@ -136,8 +136,14 @@ def profile(
 
         if not self._config.skip_summary_reports:
             self._create_summary_tables(verbose)
-            self._create_summary_reports(mode)
-            self._create_detailed_reports(mode)
+
+            # TODO TMA-1401: need to figure out summary reporting for LLMs
+            if not self._config.is_llm_model():
+                self._create_summary_reports(mode)
+
+            # TODO TMA-1443: need to figure out detailed reporting for LLMs
+            if not self._config.is_llm_model():
+                self._create_detailed_reports(mode)
 
         self._check_for_perf_analyzer_errors()
 
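
The new guards hinge on self._config.is_llm_model(), which is referenced throughout this commit but defined outside the diff shown here. A minimal sketch of the assumed shape, where the model_type option name is a guess:

# Hypothetical sketch -- the real method lives on ConfigCommandProfile,
# whose definition is not part of this diff.
class ConfigCommandProfile:
    # Assumed option recording how the user declared the profiled model
    model_type: str = "generic"

    def is_llm_model(self) -> bool:
        # True when the profiled model was declared an LLM
        return self.model_type == "LLM"
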
model_analyzer/config/generate/automatic_model_config_generator.py

Lines changed: 8 additions & 5 deletions
@@ -79,10 +79,7 @@ def __init__(
         logger.info("")
         AutomaticModelConfigGenerator._log_first_run = True
 
-        self._max_instance_count = config.run_config_search_max_instance_count
-        self._min_instance_count = config.run_config_search_min_instance_count
-        self._max_model_batch_size = config.run_config_search_max_model_batch_size
-        self._min_model_batch_size = config.run_config_search_min_model_batch_size
+        self._set_min_max_search_values(config)
 
         self._instance_kind = "KIND_CPU" if self._cpu_only else "KIND_GPU"
 
@@ -91,7 +88,7 @@ def __init__(
 
         self._reset_max_batch_size()
 
-        if not self._early_exit_enable:
+        if not self._early_exit_enable and not self._config.is_llm_model():
             raise TritonModelAnalyzerException(
                 "Early exit disable is not supported in automatic model config generator"
             )
@@ -162,3 +159,9 @@ def _get_curr_param_combo(self) -> Dict:
             config["dynamic_batching"] = {}
 
         return config
+
+    def _set_min_max_search_values(self, config: ConfigCommandProfile) -> None:
+        self._max_instance_count = config.run_config_search_max_instance_count
+        self._min_instance_count = config.run_config_search_min_instance_count
+        self._max_model_batch_size = config.run_config_search_max_model_batch_size
+        self._min_model_batch_size = config.run_config_search_min_model_batch_size
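
The bounds collected by _set_min_max_search_values() delimit the automatic sweep. GeneratorUtils.generate_doubled_list (extended later in this commit) shows the shape such a sweep takes, assuming here that the model batch size range is what gets doubled:

from model_analyzer.config.generate.generator_utils import GeneratorUtils

# E.g. run_config_search_min_model_batch_size=1,
# run_config_search_max_model_batch_size=16:
GeneratorUtils.generate_doubled_list(1, 16)  # [1, 2, 4, 8, 16]

# A minimum of 0 is promoted to 1 so the doubling can start:
GeneratorUtils.generate_doubled_list(0, 8)   # [1, 2, 4, 8]
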
Lines changed: 20 additions & 14 deletions
@@ -29,7 +29,7 @@
 from model_analyzer.config.run.run_config import RunConfig
 from model_analyzer.constants import LOGGER_NAME
 from model_analyzer.device.gpu_device import GPUDevice
-from model_analyzer.result.parameter_search import ParameterSearch
+from model_analyzer.result.inference_load_search import InferenceLoadSearch
 from model_analyzer.result.result_manager import ResultManager
 from model_analyzer.result.run_config_measurement import RunConfigMeasurement
 from model_analyzer.triton.client.client import TritonClient
@@ -39,10 +39,10 @@
 logger = logging.getLogger(LOGGER_NAME)
 
 
-class BrutePlusBinaryParameterSearchRunConfigGenerator(ConfigGeneratorInterface):
+class BrutePlusBinarySearchRunConfigGenerator(ConfigGeneratorInterface):
     """
     First run BruteRunConfigGenerator for a brute search, then for
-    automatic searches use ParameterSearch to perform a binary search
+    automatic searches use InferenceLoadSearch to perform a binary search
     """
 
     def __init__(
@@ -116,7 +116,11 @@ def _create_brute_run_config_generator(self) -> BruteRunConfigGenerator:
 
     def _can_binary_search_top_results(self) -> bool:
         for model in self._models:
-            if model.parameters()["concurrency"] or model.parameters()["request_rate"]:
+            if (
+                model.parameters()["concurrency"]
+                or model.parameters()["request_rate"]
+                or self._config.is_llm_model()
+            ):
                 return False
 
         return True
@@ -132,17 +136,19 @@ def _binary_search_over_top_results(self) -> Generator[RunConfig, None, None]:
         for result in top_results:
             run_config = deepcopy(result.run_config())
             model_parameters = self._get_model_parameters(model_name)
-            parameter_search = ParameterSearch(
+            inference_load_search = InferenceLoadSearch(
                 config=self._config,
                 model_parameters=model_parameters,
-                skip_parameter_sweep=True,
+                skip_inference_load_sweep=True,
             )
-            for parameter in parameter_search.search_parameters():
-                run_config = self._set_parameter(
-                    run_config, model_parameters, parameter
+            for inference_load in inference_load_search.search_inference_loads():
+                run_config = self._set_inference_load(
+                    run_config, model_parameters, inference_load
                 )
                 yield run_config
-                parameter_search.add_run_config_measurement(self._last_measurement)
+                inference_load_search.add_run_config_measurement(
+                    self._last_measurement
+                )
 
     def _get_model_parameters(self, model_name: str) -> Dict:
         for model in self._models:
@@ -151,14 +157,14 @@ def _get_model_parameters(self, model_name: str) -> Dict:
 
         return {}
 
-    def _set_parameter(
-        self, run_config: RunConfig, model_parameters: Dict, parameter: int
+    def _set_inference_load(
+        self, run_config: RunConfig, model_parameters: Dict, inference_load: int
     ) -> RunConfig:
         for model_run_config in run_config.model_run_configs():
            perf_config = model_run_config.perf_config()
            if self._config.is_request_rate_specified(model_parameters):
-                perf_config.update_config({"request-rate-range": parameter})
+                perf_config.update_config({"request-rate-range": inference_load})
             else:
-                perf_config.update_config({"concurrency-range": parameter})
+                perf_config.update_config({"concurrency-range": inference_load})
 
         return run_config
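
The renamed InferenceLoadSearch is driven as a coroutine-style generator: search_inference_loads() yields the next concurrency or request-rate value to try, and add_run_config_measurement() feeds the result back so the binary search can pick its next bound. A simplified restatement of that interplay, where profile() is a hypothetical stand-in for the real perf_analyzer measurement path:

# Simplified sketch of _binary_search_over_top_results() above;
# profile() is a stand-in, not a real Model Analyzer function.
inference_load_search = InferenceLoadSearch(
    config=config,
    model_parameters=model_parameters,
    skip_inference_load_sweep=True,
)
for inference_load in inference_load_search.search_inference_loads():
    # Apply the candidate load, e.g. concurrency-range=32, to the
    # perf_analyzer config before measuring.
    perf_config.update_config({"concurrency-range": inference_load})
    measurement = profile(run_config)
    # The measurement steers the search toward a tighter bound
    # before the next load is yielded.
    inference_load_search.add_run_config_measurement(measurement)
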

model_analyzer/config/generate/brute_run_config_generator.py

Lines changed: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ def __init__(
         self._curr_results: List = [[] for n in range(self._num_models)]
         self._curr_generators: Dict[int, ConfigGeneratorInterface] = {}
 
-        self._skip_default_config = skip_default_config
+        self._skip_default_config = skip_default_config or config.is_llm_model()
 
     def set_last_results(
         self, measurements: List[Optional[RunConfigMeasurement]]

model_analyzer/config/generate/generator_utils.py

Lines changed: 39 additions & 3 deletions
@@ -15,7 +15,7 @@
 # limitations under the License.
 
 from itertools import product
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 
 class GeneratorUtils:
@@ -80,8 +80,8 @@ def generate_combinations(value: object) -> List:
     @staticmethod
     def generate_parameter_combinations(params: Dict) -> List[Dict]:
         """
-        Generate a list of all possible subdictionaries
-        from given dictionary. The subdictionaries will
+        Generate a list of all possible sub-dictionaries
+        from given dictionary. The sub-dictionaries will
         have all the same keys, but only one value from
         each key.
         """
@@ -108,9 +108,45 @@ def generate_doubled_list(min_value: int, max_value: int) -> List[int]:
         The value that the generated list will not exceed
         """
 
+        assert min_value <= max_value
+
         list = []
         val = 1 if min_value == 0 else min_value
         while val <= max_value:
             list.append(val)
             val *= 2
         return list
+
+    @staticmethod
+    def extract_value_from_request_parameter(request_parameter: Optional[str]) -> int:
+        if not request_parameter:
+            return 0
+
+        # Format is: <parameter>:<value>:<type>
+        # Example: max_tokens:10:int
+        _, value, _ = request_parameter.split(":")
+
+        # this catches the case for non-LLM models where the user has specified request parameters
+        try:
+            int(value)
+        except ValueError:
+            return 0
+
+        return int(value)
+
+    @staticmethod
+    def extract_text_input_length_from_input_data(input_data: Optional[str]) -> int:
+        if not input_data:
+            return 0
+
+        # format is input-data-<num>.json
+        _, _, text_input_length = input_data.split("-")
+        text_input_length, _ = text_input_length.split(".")
+
+        # this catches the case for non-LLM models where the user has specified input data
+        try:
+            int(text_input_length)
+        except ValueError:
+            return 0
+
+        return int(text_input_length)
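
The two new helpers parse values back out of strings that Model Analyzer itself generates, so the formats in the comments can be taken at face value. Expected behavior, traced directly from the code above:

# <parameter>:<value>:<type> strings yield the value...
GeneratorUtils.extract_value_from_request_parameter("max_tokens:10:int")     # 10
# ...while None or a non-numeric value falls back to 0 (the non-LLM case).
GeneratorUtils.extract_value_from_request_parameter(None)                    # 0
GeneratorUtils.extract_value_from_request_parameter("max_tokens:none:int")   # 0

# input-data-<num>.json file names yield the text input length:
GeneratorUtils.extract_text_input_length_from_input_data("input-data-128.json")  # 128
GeneratorUtils.extract_text_input_length_from_input_data(None)                   # 0
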

model_analyzer/config/generate/model_run_config_generator.py

Lines changed: 10 additions & 2 deletions
@@ -150,5 +150,13 @@ def _determine_early_exit_enables(
         concurrency_specified = model.parameters()["concurrency"]
         config_parameters_exist = model.model_config_parameters()
 
-        self._pacg_early_exit_enable = early_exit_enable or not concurrency_specified
-        self._mcg_early_exit_enable = early_exit_enable or not config_parameters_exist
+        if config.is_llm_model():
+            self._pacg_early_exit_enable = False
+            self._mcg_early_exit_enable = False
+        else:
+            self._pacg_early_exit_enable = (
+                early_exit_enable or not concurrency_specified
+            )
+            self._mcg_early_exit_enable = (
+                early_exit_enable or not config_parameters_exist
+            )
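
For LLM profiles both early-exit flags are now forced off, so neither the perf analyzer config generator (PACG) nor the model config generator (MCG) will cut a sweep short; non-LLM behavior is unchanged. A condensed restatement of the decision:

# Condensed restatement of _determine_early_exit_enables() above;
# the free function form is for illustration only.
def early_exit_enables(
    is_llm: bool,
    early_exit_enable: bool,
    concurrency_specified: bool,
    config_parameters_exist: bool,
) -> tuple:
    if is_llm:
        # LLM sweeps (e.g. periodic concurrency) must run to completion
        return (False, False)
    return (
        early_exit_enable or not concurrency_specified,   # PACG
        early_exit_enable or not config_parameters_exist,  # MCG
    )
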
