
Commit d703e17

Commit message: brute working

1 parent: 398c688

File tree: 6 files changed (+58, -26 lines)

model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py

Lines changed: 7 additions & 0 deletions
@@ -136,9 +136,11 @@ def _binary_search_over_top_results(self) -> Generator[RunConfig, None, None]:
         for result in top_results:
             run_config = deepcopy(result.run_config())
             model_parameters = self._get_model_parameters(model_name)
+            perf_analyzer_flags = self._get_model_perf_analyzer_flags(model_name)
             inference_load_search = InferenceLoadSearch(
                 config=self._config,
                 model_parameters=model_parameters,
+                perf_analyzer_flags=perf_analyzer_flags,
                 skip_inference_load_sweep=True,
             )
             for inference_load in inference_load_search.search_inference_loads():
@@ -157,6 +159,11 @@ def _get_model_parameters(self, model_name: str) -> Dict:
 
         return {}
 
+    def _get_model_perf_analyzer_flags(self, model_name: str) -> Dict:
+        for model in self._models:
+            if model_name == model.model_name():
+                return model.perf_analyzer_flags()
+
     def _set_inference_load(
         self, run_config: RunConfig, model_parameters: Dict, inference_load: int
     ) -> RunConfig:
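For context, the new _get_model_perf_analyzer_flags helper mirrors _get_model_parameters: it scans the configured models for a name match and returns that model's perf_analyzer flags. A minimal self-contained sketch of the lookup pattern follows; FakeModel and get_model_perf_analyzer_flags are hypothetical stand-ins, not Model Analyzer types. Note that, as written in the diff above, a lookup miss falls through and returns None rather than an empty dict.

from typing import Dict, List, Optional


class FakeModel:
    """Hypothetical stand-in for a Model Analyzer per-model config entry."""

    def __init__(self, name: str, flags: Dict):
        self._name = name
        self._flags = flags

    def model_name(self) -> str:
        return self._name

    def perf_analyzer_flags(self) -> Dict:
        return self._flags


def get_model_perf_analyzer_flags(models: List[FakeModel], model_name: str) -> Optional[Dict]:
    # Same lookup shape as the helper added above: return the matching model's flags.
    # A miss falls through the loop and returns None.
    for model in models:
        if model_name == model.model_name():
            return model.perf_analyzer_flags()


models = [FakeModel("add_sub", {"request-intervals": "intervals.json"})]
print(get_model_perf_analyzer_flags(models, "add_sub"))  # {'request-intervals': 'intervals.json'}
print(get_model_perf_analyzer_flags(models, "missing"))  # None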

model_analyzer/config/generate/perf_analyzer_config_generator.py

Lines changed: 13 additions & 6 deletions
@@ -218,11 +218,16 @@ def _create_input_dict(self, model_perf_analyzer_flags: Dict) -> Dict:
         return {}
 
     def _create_inference_load_list(self) -> List[Any]:
-        # The three possible inference loads are request rate, concurrency or periodic concurrency
-        # For LLM models periodic concurrency is used for non-LLM models
-        # concurrency is the default and will be used unless the user specifies
-        # request rate, either as a model parameter or a config option
-        if self._cli_config.is_llm_model():
+        # The possible inference loads are concurrency, request rate, periodic concurrency, or custom (request-intervals)
+        # - If custom is specified, it is used
+        # - For LLM models, periodic concurrency is used
+        # - For non-LLM models, concurrency is the default and will be used unless
+        #   the user specifies request rate (either as a model parameter or a config option)
+        #
+
+        if "request-intervals" in self._perf_analyzer_flags:
+            return [self._perf_analyzer_flags["request-intervals"]]
+        elif self._cli_config.is_llm_model():
             return self._create_periodic_concurrency_list()
         elif self._cli_config.is_request_rate_specified(self._model_parameters):
             return self._create_request_rate_list()
@@ -413,7 +418,9 @@ def _update_perf_config_based_on_perf_analyzer_flags(
     def _update_perf_config_based_on_inference_load(
         self, perf_config: PerfAnalyzerConfig, inference_load: int
     ) -> None:
-        if self._cli_config.is_llm_model():
+        if "request-intervals" in self._perf_analyzer_flags:
+            pass
+        elif self._cli_config.is_llm_model():
             perf_config.update_config({"periodic-concurrency-range": inference_load})
             perf_config.update_config({"streaming": "True"})
         elif self._cli_config.is_request_rate_specified(self._model_parameters):
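The comment block above describes the new selection order in _create_inference_load_list. A rough standalone sketch of that ordering, with hypothetical arguments and placeholder sweep values standing in for the generator's internal state and helper methods:

from typing import Any, Dict, List


def create_inference_load_list(
    perf_analyzer_flags: Dict[str, Any],
    is_llm_model: bool,
    request_rate_specified: bool,
) -> List[Any]:
    # Custom load: the request-intervals file is passed through as the single
    # "load" to profile, so no sweep list is generated.
    if "request-intervals" in perf_analyzer_flags:
        return [perf_analyzer_flags["request-intervals"]]
    # LLM models sweep periodic concurrency; otherwise request rate or concurrency.
    elif is_llm_model:
        return [[16, 32, 8]]                  # placeholder periodic-concurrency-range values
    elif request_rate_specified:
        return [2**i for i in range(4, 8)]    # placeholder request rates: 16..128
    else:
        return [2**i for i in range(0, 7)]    # placeholder concurrencies: 1..64


print(create_inference_load_list({"request-intervals": "intervals.json"}, False, False))
# ['intervals.json']

Because the custom "load" is just the intervals file value, _update_perf_config_based_on_inference_load leaves the perf config untouched for it (the pass branch above), and the plot code later labels the x-axis "Request Intervals File".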

model_analyzer/perf_analyzer/perf_config.py

Lines changed: 1 addition & 0 deletions
@@ -291,6 +291,7 @@ def extract_model_specific_parameters(self):
             "concurrency-range": self._args["concurrency-range"],
             "request-rate-range": self._args["request-rate-range"],
             "periodic-concurrency-range": self._args["periodic-concurrency-range"],
+            "request-intervals": self._args["request-intervals"],
             "max-tokens": utils.extract_value_from_request_parameter(
                 self._args["request-parameter"]
             ),

model_analyzer/plots/detailed_plot.py

Lines changed: 25 additions & 16 deletions
@@ -89,7 +89,6 @@ def __init__(self, name, title, bar_width=0.5):
         self._fig.set_figheight(8)
         self._fig.set_figwidth(12)
 
-        self._ax_latency.set_xlabel("Concurrent Client Requests")
         self._ax_latency.set_ylabel(latency_axis_label)
         self._ax_throughput.set_ylabel(throughput_axis_label)
 
@@ -144,6 +143,18 @@ def add_run_config_measurement(self, run_config_measurement):
             ]
         )
 
+        if (
+            "request-intervals" in run_config_measurement.model_specific_pa_params()[0]
+            and run_config_measurement.model_specific_pa_params()[0][
+                "request-intervals"
+            ]
+        ):
+            self._data["request-intervals"].append(
+                run_config_measurement.model_specific_pa_params()[0][
+                    "request-intervals"
+                ]
+            )
+
         self._data["perf_throughput"].append(
             run_config_measurement.get_non_gpu_metric_value(tag="perf_throughput")
         )
@@ -164,19 +175,20 @@ def plot_data(self):
         on this plot's Axes object
         """
 
-        # Need to change the default x-axis plot title for request rates
-        if "request_rate" in self._data and self._data["request_rate"][0]:
+        # Update the x-axis plot title
+        if "request-intervals" in self._data and self._data["request-intervals"][0]:
+            self._ax_latency.set_xlabel("Request Intervals File")
+            sort_indices_key = "request-intervals"
+        elif "request_rate" in self._data and self._data["request_rate"][0]:
             self._ax_latency.set_xlabel("Client Request Rate")
-
-        # Sort the data by request rate or concurrency
-        if "request_rate" in self._data and self._data["request_rate"][0]:
-            sort_indices = list(
-                zip(*sorted(enumerate(self._data["request_rate"]), key=lambda x: x[1]))
-            )[0]
+            sort_indices_key = "request_rate"
         else:
-            sort_indices = list(
-                zip(*sorted(enumerate(self._data["concurrency"]), key=lambda x: x[1]))
-            )[0]
+            self._ax_latency.set_xlabel("Concurrent Client Requests")
+            sort_indices_key = "concurrency"
+
+        sort_indices = list(
+            zip(*sorted(enumerate(self._data[sort_indices_key]), key=lambda x: x[1]))
+        )[0]
 
         sorted_data = {
             key: [data_list[i] for i in sort_indices]
@@ -197,10 +209,7 @@ def plot_data(self):
             )
         bottoms = None
 
-        if "request_rate" in self._data:
-            sorted_data["indices"] = list(map(str, sorted_data["request_rate"]))
-        else:
-            sorted_data["indices"] = list(map(str, sorted_data["concurrency"]))
+        sorted_data["indices"] = list(map(str, sorted_data[sort_indices_key]))
 
         # Plot latency breakdown with concurrency casted as string to make uniform x
         for metric, label in labels.items():
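The sort_indices refactor above keys the sort on whichever load column is in use. The zip(*sorted(enumerate(...))) idiom it relies on produces the index order that sorts one column, which is then applied to every other column; a small worked example with made-up data:

# Worked example of the sort_indices idiom used in plot_data() (made-up data).
data = {"concurrency": [4, 1, 16], "perf_throughput": [220.0, 90.0, 310.0]}

sort_indices = list(
    zip(*sorted(enumerate(data["concurrency"]), key=lambda x: x[1]))
)[0]
print(sort_indices)  # (1, 0, 2): positions of the concurrency values in ascending order

sorted_data = {key: [values[i] for i in sort_indices] for key, values in data.items()}
print(sorted_data)   # {'concurrency': [1, 4, 16], 'perf_throughput': [90.0, 220.0, 310.0]}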

model_analyzer/record/metrics_manager.py

Lines changed: 5 additions & 1 deletion
@@ -775,7 +775,11 @@ def _get_triton_metrics_gpus(self):
     def _print_run_config_info(self, run_config):
         for model_run_config in run_config.model_run_configs():
             perf_config = model_run_config.perf_config()
-            if perf_config["request-rate-range"]:
+            if perf_config["request-intervals"]:
+                logger.info(
+                    f"Profiling {model_run_config.model_variant_name()}: client batch size={perf_config['batch-size']}, request-intervals={perf_config['request-intervals']}"
+                )
+            elif perf_config["request-rate-range"]:
                 logger.info(
                     f"Profiling {model_run_config.model_variant_name()}: client batch size={perf_config['batch-size']}, request-rate-range={perf_config['request-rate-range']}"
                 )

model_analyzer/result/inference_load_search.py

Lines changed: 7 additions & 3 deletions
@@ -45,6 +45,7 @@ def __init__(
         self,
         config: ConfigCommandProfile,
         model_parameters: dict = {},
+        perf_analyzer_flags: dict = {},
         skip_inference_load_sweep: bool = False,
     ) -> None:
         """
@@ -60,6 +61,8 @@ def __init__(
             model_parameters
         )
 
+        self._inference_load_is_custom = "request-intervals" in perf_analyzer_flags
+
         if self._inference_load_is_request_rate:
             self._min_inference_load_index = int(
                 log2(config.run_config_search_min_request_rate)
@@ -97,10 +100,11 @@ def search_inference_loads(self) -> Generator[int, None, None]:
         First performs an inference load sweep, and then, if necessary, perform
         a binary search around the point where the constraint was violated
         """
-        yield from self._perform_inference_load_sweep()
+        if not self._inference_load_is_custom:
+            yield from self._perform_inference_load_sweep()
 
-        if self._was_constraint_violated():
-            yield from self._perform_binary_search()
+            if self._was_constraint_violated():
+                yield from self._perform_binary_search()
 
     def _perform_inference_load_sweep(self) -> Generator[int, None, None]:
         for inference_load in (
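With the new guard, a custom (request-intervals) load opts out of both the inference load sweep and the follow-up binary search, so search_inference_loads yields nothing for it. A simplified sketch of that control flow; the function below and its arguments are illustrative stand-ins, not the real InferenceLoadSearch class:

from typing import Generator, List


def search_inference_loads(
    is_custom: bool, sweep_loads: List[int], constraint_violated: bool
) -> Generator[int, None, None]:
    # A custom request-intervals load has nothing to sweep or binary-search,
    # so the generator yields nothing at all.
    if not is_custom:
        yield from sweep_loads            # the normal inference load sweep
        if constraint_violated:
            yield from [24, 20, 22]       # placeholder binary-search probes


print(list(search_inference_loads(True, [16, 32, 64], True)))    # []
print(list(search_inference_loads(False, [16, 32, 64], False)))  # [16, 32, 64]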
