Skip to content

Commit 80bf70a

Browse files
MrGeva authored and dominicshanshan committed
[NVIDIA#8391][fix] check perf by device subtype (NVIDIA#8428)
Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com>
1 parent 3c44ca0 commit 80bf70a

File tree

7 files changed

+151
-17
lines changed

7 files changed

+151
-17
lines changed
Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,17 @@
1-
network_name,perf_case_name,test_name,threshold,absolute_threshold,metric_type,perf_metric
2-
"llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_inference_time[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_inference_time[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.1,50,INFERENCE_TIME,99133.65406
3-
"llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.1,10,SEQ_THROUGHPUT,82.63618
4-
"llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_token_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.1,10,TOKEN_THROUGHPUT,10577.431520000002
5-
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_inference_time[llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_inference_time[llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.3,50,INFERENCE_TIME,214410.6447
6-
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.3,50,KV_CACHE_SIZE,68.84
7-
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.3,10,SEQ_THROUGHPUT,38.2071
8-
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_token_throughput[llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.3,10,TOKEN_THROUGHPUT,4890.5035
1+
network_name,perf_case_name,test_name,threshold,absolute_threshold,metric_type,perf_metric,device_subtype
2+
"llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_inference_time[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_inference_time[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,5000,INFERENCE_TIME,109007.96,
3+
"llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,5,SEQ_THROUGHPUT,76.45,
4+
"llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_token_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,500,TOKEN_THROUGHPUT,9785.75,
5+
"llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,2,KV_CACHE_SIZE,55.64,
6+
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,5000,INFERENCE_TIME,171845.02,H100_PCIe
7+
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,2,KV_CACHE_SIZE,57.17,H100_PCIe
8+
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,5,SEQ_THROUGHPUT,48.09,H100_PCIe
9+
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,500,TOKEN_THROUGHPUT,6155.59,H100_PCIe
10+
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_NVL-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,5000,INFERENCE_TIME,139897.82,H100_NVL
11+
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_NVL-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,2,KV_CACHE_SIZE,69.59,H100_NVL
12+
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_NVL-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,5,SEQ_THROUGHPUT,58.63,H100_NVL
13+
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_NVL-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,500,TOKEN_THROUGHPUT,7504.07,H100_NVL
14+
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,5000,INFERENCE_TIME,125068.76,H100
15+
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,2,KV_CACHE_SIZE,57.09,H100
16+
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,5,SEQ_THROUGHPUT,65.50,H100
17+
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,500,TOKEN_THROUGHPUT,8384.00,H100

tests/integration/defs/perf/data_export.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@
7070
# Uses tuple to do (src, dest) key names
7171
TEST_DESCRIPTION_KEYS = [
7272
"perf_case_name", "network_name", "framework", "sm_clk", "mem_clk",
73-
"gpu_idx", "network_hash", "flags"
73+
"gpu_idx", "network_hash", "flags", "device_subtype"
7474
]
7575

7676
# Tuples are used if the internal dictionary keys are different from output dictionary keys.

tests/integration/defs/perf/gpu_clock_lock.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,9 @@
2929
import psutil # type: ignore
3030
# Nvidia
3131
import pynvml # type: ignore
32-
from defs.trt_test_alternative import print_info, print_warning
32+
from defs.trt_test_alternative import print_error, print_info, print_warning
3333

34-
from .misc import clean_device_product_name
34+
from .misc import clean_device_product_name, get_device_subtype
3535

3636

3737
class InvalidGPUMonitoringResultError(RuntimeError):
@@ -124,6 +124,12 @@ def get_cpu_properties(self):
124124
def get_gpu_properties(self):
125125
return self._gpu_properties
126126

127+
def get_device_subtype(self):
    """Return the primary GPU's device subtype, or None when unknown.

    The subtype is populated into ``self._gpu_properties`` during property
    setup; if properties were never collected (or the key is absent) this
    returns None rather than raising.
    """
    properties = self._gpu_properties
    if not properties:
        return None
    return properties.get("device_subtype")
132+
127133
def get_gpu_id(self):
128134
return self._gpu_id
129135

@@ -500,6 +506,10 @@ def _setup_properties(self):
500506
"device_product_name"] = clean_device_product_name(
501507
self._gpu_properties["device_product_name"])
502508

509+
# Add device subtype based on cleaned product name
510+
self._gpu_properties["device_subtype"] = get_device_subtype(
511+
self._gpu_properties["device_product_name"])
512+
503513
if "jetson" in self._gpu_properties[
504514
"device_product_name"] or "p3710" in self._gpu_properties[
505515
"device_product_name"]:

tests/integration/defs/perf/misc.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,23 @@
2121
_GPU_DEVICE_PRODUCT_NAME_MAPPING = {"A100-PCIE-80GB": "A100 80GB PCIe"}
2222

2323

24+
def get_device_subtype(device_product_name: str) -> str:
    """Normalize a cleaned device product name into a subtype identifier.

    Both spaces and hyphens are mapped to underscores so names such as
    "A100 80GB PCIe" or "H100-NVL" produce stable, comparable keys.

    Args:
        device_product_name: Cleaned device product name from NVML

    Returns:
        Device subtype string with consistent underscore formatting
    """
    # One-pass character translation: space and hyphen both become "_".
    normalization = str.maketrans({" ": "_", "-": "_"})
    return device_product_name.translate(normalization)
39+
40+
2441
def clean_device_product_name(device_product_name):
2542
cleaned_name = device_product_name
2643
cleaned_name = cleaned_name.replace("NVIDIA", "").strip()

tests/integration/defs/perf/sanity_perf_check.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,72 @@ def _is_performance_regression(self, base_value: float, target_value: float,
139139
# Negative threshold: higher is better - regression if target < base
140140
return target_value < base_value
141141

142+
def _filter_by_device_subtype(
143+
self, base_perf: pd.DataFrame,
144+
current_perf: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
145+
"""
146+
Filter performance data to match device subtypes for autodeploy tests.
147+
148+
For autodeploy tests, only compare against baselines with the same device subtype.
149+
For non-autodeploy tests, use the original behavior.
150+
151+
Args:
152+
base_perf: Baseline performance DataFrame
153+
current_perf: Current performance DataFrame
154+
155+
Returns:
156+
Tuple of (filtered_base_perf, filtered_current_perf)
157+
"""
158+
# If current performance data doesn't have device_subtype column, return as-is
159+
if 'device_subtype' not in current_perf.columns:
160+
return base_perf, current_perf
161+
162+
# Get the current device subtype from current performance data
163+
current_device_subtypes = current_perf['device_subtype'].dropna(
164+
).unique()
165+
166+
if len(current_device_subtypes) == 0:
167+
# No device subtype info in current data, return as-is
168+
return base_perf, current_perf
169+
170+
current_device_subtype = current_device_subtypes[
171+
0] # Assume single device type per run
172+
print(
173+
f"Filtering performance data for device subtype: {current_device_subtype}"
174+
)
175+
176+
# Filter base performance data to only include entries with matching device subtype
177+
# or entries without device subtype info (for backward compatibility)
178+
if 'device_subtype' in base_perf.columns:
179+
# Filter base data: keep entries with matching subtype or null subtype
180+
base_filtered = base_perf[
181+
(base_perf['device_subtype'] == current_device_subtype) |
182+
(base_perf['device_subtype'].isna())].copy()
183+
else:
184+
# Base data doesn't have device subtype column, keep all entries
185+
base_filtered = base_perf.copy()
186+
187+
# For autodeploy tests, only keep current entries with device subtype
188+
autodeploy_mask = current_perf['network_name'].str.contains(
189+
'_autodeploy', na=False)
190+
current_filtered = current_perf.copy()
191+
192+
# For autodeploy tests, ensure device subtype is present
193+
if autodeploy_mask.any():
194+
autodeploy_entries = current_perf[autodeploy_mask]
195+
non_autodeploy_entries = current_perf[~autodeploy_mask]
196+
197+
# Keep only autodeploy entries that have device subtype
198+
autodeploy_with_subtype = autodeploy_entries[
199+
autodeploy_entries['device_subtype'].notna()]
200+
201+
# Combine filtered autodeploy entries with non-autodeploy entries
202+
current_filtered = pd.concat(
203+
[autodeploy_with_subtype, non_autodeploy_entries],
204+
ignore_index=True)
205+
206+
return base_filtered, current_filtered
207+
142208
def __call__(self, *args, **kwargs):
143209
# Check if the base_perf_csv file exists
144210
if not self.base_perf_csv.exists():
@@ -150,6 +216,10 @@ def __call__(self, *args, **kwargs):
150216
base_perf = load_file(self.base_perf_csv.as_posix())
151217
current_perf = load_file(self.target_perf_csv.as_posix())
152218

219+
# Filter performance data by device subtype for autodeploy tests
220+
base_perf, current_perf = self._filter_by_device_subtype(
221+
base_perf, current_perf)
222+
153223
full_diff, new_base = get_diff(base_perf, current_perf)
154224
if not full_diff.empty:
155225
self.report_diff(full_diff)

0 commit comments

Comments
 (0)