Commit ac56b06

nv-braftgerdes authored and committed
Add L0 request rate test
1 parent 6de64a3 commit ac56b06

19 files changed: +13,261 −40 lines

docs/config.md

Lines changed: 7 additions & 5 deletions
@@ -513,10 +513,11 @@ cannot be specified globally.
 
 Options available under this parameter are described in table below:
 
-| Option Name   | Description                                              | Supporting Types                                   |
-| :------------ | :------------------------------------------------------ | :------------------------------------------------- |
-| `concurrency` | Request concurrency used for generating the input load. | `<range>`, `<comma-delimited-list>`, or a `<list>` |
-| `batch_sizes` | Static batch size used for generating requests.         | `<range>`, `<comma-delimited-list>`, or a `<list>` |
+| Option Name    | Description                                              | Supporting Types                                   |
+| :------------- | :------------------------------------------------------ | :------------------------------------------------- |
+| `concurrency`  | Request concurrency used for generating the input load. | `<range>`, `<comma-delimited-list>`, or a `<list>` |
+| `request_rate` | Request rate used for generating the input load.        | `<range>`, `<comma-delimited-list>`, or a `<list>` |
+| `batch_sizes`  | Static batch size used for generating requests.         | `<range>`, `<comma-delimited-list>`, or a `<list>` |
 
 An example `<parameter>` looks like below:
 
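For context, a minimal sketch of what a `parameters` block using the new option might look like, mirroring the shape of the existing `concurrency` examples in this doc; the model name `add_sub` and the values are illustrative, not from the commit:

```yaml
profile_models:
  add_sub:
    parameters:
      # Comma-delimited list of request rates to sweep (illustrative values)
      request_rate: 16,32,64
      batch_sizes: 1,2,4
```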
@@ -765,6 +766,7 @@ More information about this can be found in the
 - Model Analyzer also provides certain arguments to the `perf_analyzer`
   instances it launches. They are the following:
   - `concurrency-range`
+  - `request-rate-range`
   - `batch-size`
   - `model-name`
   - `measurement-mode`
@@ -773,7 +775,7 @@ More information about this can be found in the
   - `model-repository`
   - `protocol`
   - `url`
-  If provided under the `perf_analyzer_flags` section, their values will be overriden. Caution should therefore be exercised when overriding these.
+  If provided under the `perf_analyzer_flags` section, their values will be overridden. Caution should therefore be exercised when overriding these.
 <br>
 
 ## `<triton-server-flags>`
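To illustrate that caution, a hedged sketch of a `perf_analyzer_flags` section overriding one of the managed arguments; the values are invented, and the `start:end:step` form follows perf_analyzer's documented `--request-rate-range` syntax:

```yaml
profile_models:
  - add_sub
perf_analyzer_flags:
  # This value would take precedence over the request-rate-range
  # Model Analyzer computes for its sweep (illustrative only)
  request-rate-range: "32:256:32"
```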

docs/config_search.md

Lines changed: 8 additions & 2 deletions
@@ -117,12 +117,18 @@ You can also modify the minimum/maximum values that the automatic search space w
 
 ---
 
-### [Request Concurrency Search Space](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md#request-concurrency)
+### [Request Concurrency Search Space](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/docs/inference_load_modes.md#concurrency-mode)
 
 - `Default:` 1 to 1024 concurrencies, sweeping over powers of 2 (i.e. 1, 2, 4, 8, ...)
 - `--run-config-search-min-concurrency: <val>`: Changes the request concurrency minimum automatic search space value
 - `--run-config-search-max-concurrency: <val>`: Changes the request concurrency maximum automatic search space value
 
+### [Request Rate Search Space](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/docs/inference_load_modes.md#request-rate-mode)
+
+- `Default:` 1 to 1024 request rates, sweeping over powers of 2 (i.e. 1, 2, 4, 8, ...)
+- `--run-config-search-min-request-rate: <val>`: Changes the request rate minimum automatic search space value
+- `--run-config-search-max-request-rate: <val>`: Changes the request rate maximum automatic search space value
+
 ---
 
 _An example YAML config that limits the search space:_
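(The config that sentence introduces lies outside this hunk.) As a sketch of the new request-rate knobs, assuming the CLI flags above map to the usual snake_case YAML config keys the way the concurrency flags do; the model name and bounds are illustrative:

```yaml
# Restrict the automatic request-rate sweep (illustrative bounds)
run_config_search_min_request_rate: 16
run_config_search_max_request_rate: 1024
profile_models:
  - add_sub
```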
@@ -144,7 +150,7 @@ _This will perform an Automatic Brute Search with instance group counts: 3-5, ba
 
 ### **Interaction with Remote Triton Launch Mode**
 
-When the triton launch mode is remote, _\*\*only concurrency values can be swept._\*\*<br>
+When the triton launch mode is remote, _\*\*only concurrency or request rate values can be swept._\*\*<br>
 
 Model Analyzer will ignore any model config parameters because we have no way of accessing and modifying the model repository of the remote Triton Server.
 

model_analyzer/config/input/config_command_profile.py

Lines changed: 35 additions & 1 deletion
@@ -44,7 +44,8 @@
     DEFAULT_TRITON_SERVER_PATH, DEFAULT_PERF_ANALYZER_TIMEOUT, \
     DEFAULT_EXPORT_PATH, DEFAULT_FILENAME_MODEL_INFERENCE, DEFAULT_FILENAME_MODEL_GPU, \
     DEFAULT_FILENAME_SERVER_ONLY, DEFAULT_NUM_CONFIGS_PER_MODEL, DEFAULT_NUM_TOP_MODEL_CONFIGS, \
-    DEFAULT_INFERENCE_OUTPUT_FIELDS, DEFAULT_GPU_OUTPUT_FIELDS, DEFAULT_SERVER_OUTPUT_FIELDS, \
+    DEFAULT_INFERENCE_OUTPUT_FIELDS, DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS, \
+    DEFAULT_GPU_OUTPUT_FIELDS, DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS, DEFAULT_SERVER_OUTPUT_FIELDS, \
     DEFAULT_ONLINE_OBJECTIVES, DEFAULT_ONLINE_PLOTS, DEFAULT_OFFLINE_PLOTS, DEFAULT_MODEL_WEIGHTING
 
 from model_analyzer.constants import LOGGER_NAME
5051
from model_analyzer.constants import LOGGER_NAME
@@ -1074,6 +1075,15 @@ def _autofill_values(self):
                     'min': self.min_throughput
                 }})
 
+        # Switch default output fields if request rate is being used
+        # and the user didn't specify a custom output field
+        if self._using_request_rate():
+            if not self._fields['inference_output_fields'].is_set_by_user():
+                self.inference_output_fields = DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS
+
+            if not self._fields['gpu_output_fields'].is_set_by_user():
+                self.gpu_output_fields = DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS
+
         new_profile_models = {}
         for i, model in enumerate(self.profile_models):
             new_model = {'cpu_only': (model.cpu_only() or cpu_only)}
@@ -1197,3 +1207,27 @@ def _autofill_values(self):
 
             new_profile_models[model.model_name()] = new_model
         self._fields['profile_models'].set_value(new_profile_models)
+
+    def _using_request_rate(self) -> bool:
+        if self.request_rate or self.request_rate_search_enable:
+            return True
+        elif self._fields['run_config_search_max_request_rate'].is_set_by_user() or \
+                self._fields['run_config_search_min_request_rate'].is_set_by_user():
+            return True
+        else:
+            return self._are_models_using_request_rate()
+
+    def _are_models_using_request_rate(self) -> bool:
+        model_using_request_rate = False
+        model_using_concurrency = False
+        for i, model in enumerate(self.profile_models):
+            if model.parameters() and 'request_rate' in model.parameters():
+                model_using_request_rate = True
+            else:
+                model_using_concurrency = True
+
+        if model_using_request_rate and model_using_concurrency:
+            raise TritonModelAnalyzerException("Parameters in all profiled models must use request-rate-range. "\
+                "Model Analyzer does not support mixing concurrency-range and request-rate-range.")
+        else:
+            return model_using_request_rate
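A hedged sketch of the per-model configuration `_are_models_using_request_rate` inspects: every profiled model must opt into request rate, since mixing it with concurrency raises `TritonModelAnalyzerException` as shown above. Model names and values are illustrative:

```yaml
profile_models:
  model_a:
    parameters:
      request_rate: 32,64
  model_b:
    parameters:
      # Using concurrency here instead would trip the exception,
      # because model_a already selects request-rate mode
      request_rate: 32,64
```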

model_analyzer/config/input/config_defaults.py

Lines changed: 10 additions & 0 deletions
@@ -108,11 +108,21 @@
     'instance_group', 'max_batch_size', 'satisfies_constraints',
     'perf_throughput', 'perf_latency_p99'
 ]
+DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS = [
+    'model_name', 'batch_size', 'request_rate', 'model_config_path',
+    'instance_group', 'max_batch_size', 'satisfies_constraints',
+    'perf_throughput', 'perf_latency_p99'
+]
 DEFAULT_GPU_OUTPUT_FIELDS = [
     'model_name', 'gpu_uuid', 'batch_size', 'concurrency', 'model_config_path',
     'instance_group', 'satisfies_constraints', 'gpu_used_memory',
     'gpu_utilization', 'gpu_power_usage'
 ]
+DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS = [
+    'model_name', 'gpu_uuid', 'batch_size', 'request_rate', 'model_config_path',
+    'instance_group', 'satisfies_constraints', 'gpu_used_memory',
+    'gpu_utilization', 'gpu_power_usage'
+]
 DEFAULT_SERVER_OUTPUT_FIELDS = [
     'model_name', 'gpu_uuid', 'gpu_used_memory', 'gpu_utilization',
     'gpu_power_usage'
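These request-rate defaults only apply when the user has not set the fields themselves (see the `_autofill_values` change above). A sketch of pinning the columns explicitly in a profile config, which would suppress the automatic switch; the field list is illustrative:

```yaml
inference_output_fields:
  - model_name
  - batch_size
  - request_rate
  - perf_throughput
  - perf_latency_p99
```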

model_analyzer/perf_analyzer/perf_config.py

Lines changed: 2 additions & 1 deletion
@@ -225,7 +225,8 @@ def extract_model_specific_parameters(self):
 
         return {
             'batch-size': self._options['-b'],
-            'concurrency-range': self._args['concurrency-range']
+            'concurrency-range': self._args['concurrency-range'],
+            'request-rate-range': self._args['request-rate-range']
         }
 
     @classmethod

model_analyzer/plots/detailed_plot.py

Lines changed: 36 additions & 14 deletions
@@ -103,7 +103,7 @@ def data(self):
 
     def add_run_config_measurement(self, run_config_measurement):
         """
-        Adds a measurment to this plot
+        Adds a measurement to this plot
 
         Parameters
         ----------
@@ -113,9 +113,19 @@ def add_run_config_measurement(self, run_config_measurement):
         """
 
         # TODO-TMA-568: This needs to be updated because there will be multiple model configs
-        self._data['concurrency'].append(
-            run_config_measurement.model_specific_pa_params()[0]
-            ['concurrency-range'])
+        if 'concurrency-range' in run_config_measurement.model_specific_pa_params(
+        )[0] and run_config_measurement.model_specific_pa_params(
+        )[0]['concurrency-range']:
+            self._data['concurrency'].append(
+                run_config_measurement.model_specific_pa_params()[0]
+                ['concurrency-range'])
+
+        if 'request-rate-range' in run_config_measurement.model_specific_pa_params(
+        )[0] and run_config_measurement.model_specific_pa_params(
+        )[0]['request-rate-range']:
+            self._data['request_rate'].append(
+                run_config_measurement.model_specific_pa_params()[0]
+                ['request-rate-range'])
 
         self._data['perf_throughput'].append(
             run_config_measurement.get_non_gpu_metric_value(
@@ -135,13 +145,23 @@ def plot_data(self):
         on this plot's Axes object
         """
 
-        # Sort the data by concurrency
-        concurrency_sort_indices = list(
-            zip(*sorted(enumerate(self._data['concurrency']),
-                        key=lambda x: x[1])))[0]
+        # Need to change the default x-axis plot title for request rates
+        if 'request_rate' in self._data and self._data['request_rate'][0]:
+            self._ax_latency.set_xlabel('Client Request Rate')
+
+        # Sort the data by request rate or concurrency
+        if 'request_rate' in self._data and self._data['request_rate'][0]:
+            print(f"\n\nFound request rate: {self._data['request_rate']}\n\n")
+            sort_indices = list(
+                zip(*sorted(enumerate(self._data['request_rate']),
+                            key=lambda x: x[1])))[0]
+        else:
+            sort_indices = list(
+                zip(*sorted(enumerate(self._data['concurrency']),
+                            key=lambda x: x[1])))[0]
 
         sorted_data = {
-            key: [data_list[i] for i in concurrency_sort_indices
+            key: [data_list[i] for i in sort_indices
             ] for key, data_list in self._data.items()
         }
 
@@ -153,11 +173,14 @@ def plot_data(self):
         ]))
         bottoms = None
 
-        sorted_data['concurrency'] = list(map(str, sorted_data['concurrency']))
+        if 'request_rate' in self._data:
+            sorted_data['indices'] = list(map(str, sorted_data['request_rate']))
+        else:
+            sorted_data['indices'] = list(map(str, sorted_data['concurrency']))
 
         # Plot latency breakdown with concurrency casted as string to make uniform x
         for metric, label in labels.items():
-            self._ax_latency.bar(sorted_data['concurrency'],
+            self._ax_latency.bar(sorted_data['indices'],
                                  sorted_data[metric],
                                  width=self._bar_width,
                                  label=label,
@@ -171,7 +194,7 @@ def plot_data(self):
 
         # Plot the inference line
         inference_line = self._ax_throughput.plot(
-            sorted_data['concurrency'],
+            sorted_data['indices'],
             sorted_data['perf_throughput'],
             label='Inferences/second',
             marker='o',
@@ -190,8 +213,7 @@ def plot_data(self):
             bbox_to_anchor=(self._legend_x, self._legend_y),
             prop=dict(size=self._legend_font_size))
         # Annotate inferences
-        for x, y in zip(sorted_data['concurrency'],
-                        sorted_data['perf_throughput']):
+        for x, y in zip(sorted_data['indices'], sorted_data['perf_throughput']):
             self._ax_throughput.annotate(
                 str(round(y, 2)),
                 xy=(x, y),

model_analyzer/plots/plot_manager.py

Lines changed: 4 additions & 3 deletions
@@ -36,7 +36,7 @@ class PlotManager:
     of plots generated by model analyzer
     """
 
-    def __init__(self, config: Union[ConfigCommandProfile, ConfigCommandReport],
+    def __init__(self, config: Union[ConfigCommandProfile, ConfigCommandReport],
                  result_manager: ResultManager,
                  constraint_manager: ConstraintManager):
         """
@@ -63,7 +63,8 @@ def __init__(self, config: Union[ConfigCommandProfile, ConfigCommandReport],
         os.makedirs(self._plot_export_directory, exist_ok=True)
 
         # Dict of list of plots
-        self._simple_plots: DefaultDict[str, Dict[str, SimplePlot]] = defaultdict()
+        self._simple_plots: DefaultDict[str, Dict[str,
+                                                  SimplePlot]] = defaultdict()
         self._detailed_plots: Dict[str, DetailedPlot] = {}
 
     def create_summary_plots(self):
@@ -186,7 +187,7 @@ def export_summary_plots(self):
 
     def export_detailed_plots(self):
         """
-        Write detaild plots to disk
+        Write detailed plots to disk
         """
 
         detailed_plot_dir = os.path.join(self._plot_export_directory,

model_analyzer/reports/report_manager.py

Lines changed: 16 additions & 2 deletions
@@ -917,8 +917,13 @@ def _build_detailed_table(self, model_config_name):
                                   reverse=True)
         cpu_only = model_config.cpu_only()
 
-        first_column_header = 'Request Concurrency' if self._mode == 'online' else 'Client Batch Size'
-        first_column_tag = 'concurrency-range' if self._mode == 'online' else 'batch-size'
+        if self._was_measured_with_request_rate(measurements[0]):
+            first_column_header = 'Request Rate' if self._mode == 'online' else 'Client Batch Size'
+            first_column_tag = 'request-rate-range' if self._mode == 'online' else 'batch-size'
+        else:
+            first_column_header = 'Request Concurrency' if self._mode == 'online' else 'Client Batch Size'
+            first_column_tag = 'concurrency-range' if self._mode == 'online' else 'batch-size'
+
         if not cpu_only:
             headers = [
                 first_column_header, 'p99 Latency (ms)',
@@ -1124,3 +1129,12 @@ def _cpu_metrics_were_gathered(self):
         self._cpu_metrics_gathered_sticky = used_ram != 0
 
         return self._cpu_metrics_gathered_sticky
+
+    def _was_measured_with_request_rate(
+            self, measurement: RunConfigMeasurement) -> bool:
+        if 'request-rate-range' in measurement.model_specific_pa_params(
+        )[0] and measurement.model_specific_pa_params(
+        )[0]['request-rate-range']:
+            return True
+        else:
+            return False
