
Commit d703e17

Commit message: brute working

1 parent: 398c688

File tree: 6 files changed (+58, -26 lines)

model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py

Lines changed: 7 additions & 0 deletions
@@ -136,9 +136,11 @@ def _binary_search_over_top_results(self) -> Generator[RunConfig, None, None]:
         for result in top_results:
             run_config = deepcopy(result.run_config())
             model_parameters = self._get_model_parameters(model_name)
+            perf_analyzer_flags = self._get_model_perf_analyzer_flags(model_name)
             inference_load_search = InferenceLoadSearch(
                 config=self._config,
                 model_parameters=model_parameters,
+                perf_analyzer_flags=perf_analyzer_flags,
                 skip_inference_load_sweep=True,
             )
             for inference_load in inference_load_search.search_inference_loads():
@@ -157,6 +159,11 @@ def _get_model_parameters(self, model_name: str) -> Dict:
 
         return {}
 
+    def _get_model_perf_analyzer_flags(self, model_name: str) -> Dict:
+        for model in self._models:
+            if model_name == model.model_name():
+                return model.perf_analyzer_flags()
+
     def _set_inference_load(
         self, run_config: RunConfig, model_parameters: Dict, inference_load: int
     ) -> RunConfig:
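For context, the new _get_model_perf_analyzer_flags helper mirrors _get_model_parameters: it scans the configured models for a name match and returns that model's perf_analyzer flags. A minimal self-contained sketch of the lookup pattern follows; FakeModel and get_model_perf_analyzer_flags are hypothetical stand-ins, not Model Analyzer types. Note that, as written in the diff above, a lookup miss falls through and returns None rather than an empty dict.

from typing import Dict, List, Optional


class FakeModel:
    """Hypothetical stand-in for a Model Analyzer per-model config entry."""

    def __init__(self, name: str, flags: Dict):
        self._name = name
        self._flags = flags

    def model_name(self) -> str:
        return self._name

    def perf_analyzer_flags(self) -> Dict:
        return self._flags


def get_model_perf_analyzer_flags(models: List[FakeModel], model_name: str) -> Optional[Dict]:
    # Same lookup shape as the helper added above: return the matching model's flags.
    # A miss falls through the loop and returns None.
    for model in models:
        if model_name == model.model_name():
            return model.perf_analyzer_flags()


models = [FakeModel("add_sub", {"request-intervals": "intervals.json"})]
print(get_model_perf_analyzer_flags(models, "add_sub"))  # {'request-intervals': 'intervals.json'}
print(get_model_perf_analyzer_flags(models, "missing"))  # None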

model_analyzer/config/generate/perf_analyzer_config_generator.py

Lines changed: 13 additions & 6 deletions
@@ -218,11 +218,16 @@ def _create_input_dict(self, model_perf_analyzer_flags: Dict) -> Dict:
         return {}
 
     def _create_inference_load_list(self) -> List[Any]:
-        # The three possible inference loads are request rate, concurrency or periodic concurrency
-        # For LLM models periodic concurrency is used for non-LLM models
-        # concurrency is the default and will be used unless the user specifies
-        # request rate, either as a model parameter or a config option
-        if self._cli_config.is_llm_model():
+        # The possible inference loads are concurrency, request rate, periodic concurrency, or custom (request-intervals)
+        # - If custom is specified, it is used
+        # - For LLM models, periodic concurrency is used
+        # - For non-LLM models, concurrency is the default and will be used unless
+        #   the user specifies request rate (either as a model parameter or a config option)
+        #
+
+        if "request-intervals" in self._perf_analyzer_flags:
+            return [self._perf_analyzer_flags["request-intervals"]]
+        elif self._cli_config.is_llm_model():
             return self._create_periodic_concurrency_list()
         elif self._cli_config.is_request_rate_specified(self._model_parameters):
             return self._create_request_rate_list()
@@ -413,7 +418,9 @@ def _update_perf_config_based_on_perf_analyzer_flags(
     def _update_perf_config_based_on_inference_load(
         self, perf_config: PerfAnalyzerConfig, inference_load: int
     ) -> None:
-        if self._cli_config.is_llm_model():
+        if "request-intervals" in self._perf_analyzer_flags:
+            pass
+        elif self._cli_config.is_llm_model():
             perf_config.update_config({"periodic-concurrency-range": inference_load})
             perf_config.update_config({"streaming": "True"})
         elif self._cli_config.is_request_rate_specified(self._model_parameters):
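The comment block above describes the new selection order in _create_inference_load_list. A rough standalone sketch of that ordering, with hypothetical arguments and placeholder sweep values standing in for the generator's internal state and helper methods:

from typing import Any, Dict, List


def create_inference_load_list(
    perf_analyzer_flags: Dict[str, Any],
    is_llm_model: bool,
    request_rate_specified: bool,
) -> List[Any]:
    # Custom load: the request-intervals file is passed through as the single
    # "load" to profile, so no sweep list is generated.
    if "request-intervals" in perf_analyzer_flags:
        return [perf_analyzer_flags["request-intervals"]]
    # LLM models sweep periodic concurrency; otherwise request rate or concurrency.
    elif is_llm_model:
        return [[16, 32, 8]]                  # placeholder periodic-concurrency-range values
    elif request_rate_specified:
        return [2**i for i in range(4, 8)]    # placeholder request rates: 16..128
    else:
        return [2**i for i in range(0, 7)]    # placeholder concurrencies: 1..64


print(create_inference_load_list({"request-intervals": "intervals.json"}, False, False))
# ['intervals.json']

Because the custom "load" is just the intervals file value, _update_perf_config_based_on_inference_load leaves the perf config untouched for it (the pass branch above), and the plot code later labels the x-axis "Request Intervals File".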

model_analyzer/perf_analyzer/perf_config.py

Lines changed: 1 addition & 0 deletions
@@ -291,6 +291,7 @@ def extract_model_specific_parameters(self):
             "concurrency-range": self._args["concurrency-range"],
             "request-rate-range": self._args["request-rate-range"],
             "periodic-concurrency-range": self._args["periodic-concurrency-range"],
+            "request-intervals": self._args["request-intervals"],
             "max-tokens": utils.extract_value_from_request_parameter(
                 self._args["request-parameter"]
             ),

model_analyzer/plots/detailed_plot.py

Lines changed: 25 additions & 16 deletions
@@ -89,7 +89,6 @@ def __init__(self, name, title, bar_width=0.5):
         self._fig.set_figheight(8)
         self._fig.set_figwidth(12)
 
-        self._ax_latency.set_xlabel("Concurrent Client Requests")
         self._ax_latency.set_ylabel(latency_axis_label)
         self._ax_throughput.set_ylabel(throughput_axis_label)
 
@@ -144,6 +143,18 @@ def add_run_config_measurement(self, run_config_measurement):
             ]
         )
 
+        if (
+            "request-intervals" in run_config_measurement.model_specific_pa_params()[0]
+            and run_config_measurement.model_specific_pa_params()[0][
+                "request-intervals"
+            ]
+        ):
+            self._data["request-intervals"].append(
+                run_config_measurement.model_specific_pa_params()[0][
+                    "request-intervals"
+                ]
+            )
+
         self._data["perf_throughput"].append(
             run_config_measurement.get_non_gpu_metric_value(tag="perf_throughput")
         )
@@ -164,19 +175,20 @@ def plot_data(self):
         on this plot's Axes object
         """
 
-        # Need to change the default x-axis plot title for request rates
-        if "request_rate" in self._data and self._data["request_rate"][0]:
+        # Update the x-axis plot title
+        if "request-intervals" in self._data and self._data["request-intervals"][0]:
+            self._ax_latency.set_xlabel("Request Intervals File")
+            sort_indices_key = "request-intervals"
+        elif "request_rate" in self._data and self._data["request_rate"][0]:
             self._ax_latency.set_xlabel("Client Request Rate")
-
-        # Sort the data by request rate or concurrency
-        if "request_rate" in self._data and self._data["request_rate"][0]:
-            sort_indices = list(
-                zip(*sorted(enumerate(self._data["request_rate"]), key=lambda x: x[1]))
-            )[0]
+            sort_indices_key = "request_rate"
         else:
-            sort_indices = list(
-                zip(*sorted(enumerate(self._data["concurrency"]), key=lambda x: x[1]))
-            )[0]
+            self._ax_latency.set_xlabel("Concurrent Client Requests")
+            sort_indices_key = "concurrency"
+
+        sort_indices = list(
+            zip(*sorted(enumerate(self._data[sort_indices_key]), key=lambda x: x[1]))
+        )[0]
 
         sorted_data = {
             key: [data_list[i] for i in sort_indices]
@@ -197,10 +209,7 @@ def plot_data(self):
             )
         bottoms = None
 
-        if "request_rate" in self._data:
-            sorted_data["indices"] = list(map(str, sorted_data["request_rate"]))
-        else:
-            sorted_data["indices"] = list(map(str, sorted_data["concurrency"]))
+        sorted_data["indices"] = list(map(str, sorted_data[sort_indices_key]))
 
         # Plot latency breakdown with concurrency casted as string to make uniform x
         for metric, label in labels.items():
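The sort_indices refactor above keys the sort on whichever load column is in use. The zip(*sorted(enumerate(...))) idiom it relies on produces the index order that sorts one column, which is then applied to every other column; a small worked example with made-up data:

# Worked example of the sort_indices idiom used in plot_data() (made-up data).
data = {"concurrency": [4, 1, 16], "perf_throughput": [220.0, 90.0, 310.0]}

sort_indices = list(
    zip(*sorted(enumerate(data["concurrency"]), key=lambda x: x[1]))
)[0]
print(sort_indices)  # (1, 0, 2): positions of the concurrency values in ascending order

sorted_data = {key: [values[i] for i in sort_indices] for key, values in data.items()}
print(sorted_data)   # {'concurrency': [1, 4, 16], 'perf_throughput': [90.0, 220.0, 310.0]}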

model_analyzer/record/metrics_manager.py

Lines changed: 5 additions & 1 deletion
@@ -775,7 +775,11 @@ def _get_triton_metrics_gpus(self):
     def _print_run_config_info(self, run_config):
         for model_run_config in run_config.model_run_configs():
             perf_config = model_run_config.perf_config()
-            if perf_config["request-rate-range"]:
+            if perf_config["request-intervals"]:
+                logger.info(
+                    f"Profiling {model_run_config.model_variant_name()}: client batch size={perf_config['batch-size']}, request-intervals={perf_config['request-intervals']}"
+                )
+            elif perf_config["request-rate-range"]:
                 logger.info(
                     f"Profiling {model_run_config.model_variant_name()}: client batch size={perf_config['batch-size']}, request-rate-range={perf_config['request-rate-range']}"
                 )

model_analyzer/result/inference_load_search.py

Lines changed: 7 additions & 3 deletions
@@ -45,6 +45,7 @@ def __init__(
         self,
         config: ConfigCommandProfile,
         model_parameters: dict = {},
+        perf_analyzer_flags: dict = {},
         skip_inference_load_sweep: bool = False,
     ) -> None:
         """
@@ -60,6 +61,8 @@ def __init__(
             model_parameters
         )
 
+        self._inference_load_is_custom = "request-intervals" in perf_analyzer_flags
+
         if self._inference_load_is_request_rate:
             self._min_inference_load_index = int(
                 log2(config.run_config_search_min_request_rate)
@@ -97,10 +100,11 @@ def search_inference_loads(self) -> Generator[int, None, None]:
         First performs an inference load sweep, and then, if necessary, perform
         a binary search around the point where the constraint was violated
         """
-        yield from self._perform_inference_load_sweep()
+        if not self._inference_load_is_custom:
+            yield from self._perform_inference_load_sweep()
 
-        if self._was_constraint_violated():
-            yield from self._perform_binary_search()
+            if self._was_constraint_violated():
+                yield from self._perform_binary_search()
 
     def _perform_inference_load_sweep(self) -> Generator[int, None, None]:
         for inference_load in (
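With the new guard, a custom (request-intervals) load opts out of both the inference load sweep and the follow-up binary search, so search_inference_loads yields nothing for it. A simplified sketch of that control flow; the function below and its arguments are illustrative stand-ins, not the real InferenceLoadSearch class:

from typing import Generator, List


def search_inference_loads(
    is_custom: bool, sweep_loads: List[int], constraint_violated: bool
) -> Generator[int, None, None]:
    # A custom request-intervals load has nothing to sweep or binary-search,
    # so the generator yields nothing at all.
    if not is_custom:
        yield from sweep_loads            # the normal inference load sweep
        if constraint_violated:
            yield from [24, 20, 22]       # placeholder binary-search probes


print(list(search_inference_loads(True, [16, 32, 64], True)))    # []
print(list(search_inference_loads(False, [16, 32, 64], False)))  # [16, 32, 64]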
