Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/inference_endpoint/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,11 @@ def _add_shared_benchmark_args(parser):
parser.add_argument(
"--report-dir", type=Path, help="Path to save detailed benchmark report"
)
parser.add_argument(
"--ensure-submission-checker-compatibility",
action="store_true",
help="Enable loadgen compatibility mode for submission checker",
)


def _add_online_specific_args(parser):
Expand Down
28 changes: 27 additions & 1 deletion src/inference_endpoint/commands/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,11 @@
from transformers import AutoTokenizer
from transformers.utils import logging as transformers_logging

from inference_endpoint.commands.utils import get_default_report_path
from inference_endpoint.commands.utils import (
generate_mlperf_log_details_submission_checker,
generate_user_conf_submission_checker,
get_default_report_path,
)
from inference_endpoint.config.runtime_settings import RuntimeSettings
from inference_endpoint.config.schema import (
APIType,
Expand Down Expand Up @@ -291,6 +295,9 @@ def _build_config_from_cli(
timeout = getattr(args, "timeout", None)
verbose_level = getattr(args, "verbose", 0)
api_type = APIType(getattr(args, "api_type", "openai"))
ensure_submission_checker_compatibility = getattr(
args, "ensure_submission_checker_compatibility", False
)
# Build BenchmarkConfig from CLI params
return BenchmarkConfig(
name=f"cli_{benchmark_mode}",
Expand Down Expand Up @@ -349,6 +356,7 @@ def _build_config_from_cli(
report_dir=report_dir,
timeout=timeout,
verbose=verbose_level > 0,
ensure_submission_checker_compatibility=ensure_submission_checker_compatibility,
)


Expand Down Expand Up @@ -712,6 +720,24 @@ def signal_handler(signum, frame):
except Exception as e:
logger.error(f"Save failed: {e}")

if config.ensure_submission_checker_compatibility:
try:
# convert the runtime_settings.json to user.conf format
generate_user_conf_submission_checker(report_dir)
except Exception as e:
logger.error(
f"Failed to generate user conf for submission checker: {e}"
)
raise
try:
# generate mlperf_log_details.txt from summary.json
generate_mlperf_log_details_submission_checker(report_dir, strict=True)
except Exception as e:
logger.error(
f"Failed to generate mlperf_log_details.txt for submission checker: {e}"
)
raise

except KeyboardInterrupt:
logger.warning("Benchmark interrupted by user")
raise
Expand Down
94 changes: 94 additions & 0 deletions src/inference_endpoint/commands/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"""Utility commands: info, validate, init."""

import argparse
import json
import logging
import os
import platform
Expand All @@ -31,6 +32,7 @@
from pydantic import ValidationError as PydanticValidationError

from .. import __version__
from ..config.constants import ENDPOINTS_TO_LOADGEN_KEY_MAPPING
from ..config.schema import TEMPLATE_TYPE_MAP, BenchmarkConfig
from ..config.yaml_loader import ConfigError, ConfigLoader
from ..exceptions import InputValidationError, SetupError
Expand Down Expand Up @@ -313,3 +315,95 @@ def get_default_report_path() -> Path:
return Path(
f"{tempfile.gettempdir()}/reports_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
)


def generate_user_conf_submission_checker(report_dir: Path) -> None:
    """Write a user.conf for the submission checker from runtime_settings.json.

    Each runtime-settings entry is emitted as one ``*.*.<loadgen_key>=<value>``
    line, with the endpoints key translated through
    config.constants.ENDPOINTS_TO_LOADGEN_KEY_MAPPING; keys without a mapping
    pass through unchanged.

    Args:
        report_dir: Report directory that contains runtime_settings.json.

    Raises:
        FileNotFoundError: If runtime_settings.json does not exist in report_dir.
    """

    runtime_settings_path = report_dir / "runtime_settings.json"
    user_conf_path = report_dir / "user.conf"

    if not runtime_settings_path.exists():
        logger.error(f"runtime_settings.json not found in {report_dir}")
        raise FileNotFoundError(f"runtime_settings.json not found in {report_dir}")
    try:
        with runtime_settings_path.open() as settings_file:
            runtime_settings = json.load(settings_file)

        # Translate every key (identity for unmapped ones) and emit one
        # loadgen-style config line per setting.
        conf_lines = [
            f"*.*.{ENDPOINTS_TO_LOADGEN_KEY_MAPPING.get(key, key)}={value}\n"
            for key, value in runtime_settings.items()
        ]
        with user_conf_path.open("w") as conf_file:
            conf_file.writelines(conf_lines)

        logger.info(f"Generated user.conf at {user_conf_path}")

    except Exception as e:
        logger.error(f"Failed to generate user.conf: {e}")
        raise


def generate_mlperf_log_details_submission_checker(
    report_dir: Path, strict: bool = True
) -> None:
    """Generate mlperf_log_details.txt file for submission checker from summary.json.

    Reads summary.json line by line, keeps only the records prefixed with the
    ``:::ENDPTS`` marker, renames each record's ``key`` via
    config.constants.ENDPOINTS_TO_LOADGEN_KEY_MAPPING (unmapped keys are kept
    as-is), and writes the result in loadgen detail-log format.

    Args:
        report_dir: Path to the report directory containing summary.json.
        strict: If True (default), a marker line whose JSON payload cannot be
            parsed raises; if False, the line is logged and skipped.

    Raises:
        FileNotFoundError: If summary.json does not exist in report_dir.
        json.JSONDecodeError: If strict is True and a marker line carries an
            invalid JSON payload.
    """

    summary_path = report_dir / "summary.json"
    log_details_path = report_dir / "mlperf_log_details.txt"
    marker = ":::ENDPTS"

    if not summary_path.exists():
        logger.error(f"summary.json not found in {report_dir}")
        raise FileNotFoundError(f"summary.json not found in {report_dir}")
    try:
        with (
            open(summary_path) as summary_file,
            open(log_details_path, "w") as output_file,
        ):
            for line in summary_file:
                line = line.strip()
                # Only marker-prefixed lines are loadgen detail records;
                # anything else in summary.json is ignored with a warning.
                if line.startswith(marker):
                    try:
                        record = json.loads(line[len(marker) :])
                    except json.JSONDecodeError as e:
                        if strict:
                            logger.error(f"Encountered invalid line: {line} Error: {e}")
                            raise
                        logger.warning(f"Skipping invalid line: {line}")
                        continue
                    # Rename the endpoints key to its loadgen equivalent when mapped.
                    original_key = record.get("key")
                    if original_key in ENDPOINTS_TO_LOADGEN_KEY_MAPPING:
                        record["key"] = ENDPOINTS_TO_LOADGEN_KEY_MAPPING[original_key]
                    output_file.write(
                        f"{marker} {json.dumps(record, separators=(',', ':'))}\n"
                    )
                else:
                    logger.warning(f"Found invalid line {line}, skipping.")

        logger.info(f"Generated mlperf_log_details.txt at {log_details_path}")

    except Exception as e:
        logger.error(f"Failed to generate mlperf_log_details.txt: {e}")
        raise
86 changes: 86 additions & 0 deletions src/inference_endpoint/config/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Global constants and mappings for the inference endpoint package."""

# Mapping from endpoints results keys to MLPerf loadgen and submission checker supported keys
# This ensures compatibility when generating user.conf and mlperf_log_details.txt for submission checker
# Format: {"endpoints_key": "loadgen_key"}
# Keys on the left are produced by the endpoints benchmark reports; values on
# the right are the names loadgen / the MLPerf submission checker expect.
ENDPOINTS_TO_LOADGEN_KEY_MAPPING = {
    # --- Build / provenance metadata ---
    "endpoints_version": "loadgen_version",
    "endpoints_git_commit_date": "loadgen_git_commit_date",
    "endpoints_git_commit_hash": "loadgen_git_commit_hash",
    "test_datetime": "test_datetime",
    # --- QSL sample counts ---
    "n_samples_issued": "qsl_reported_total_count",
    "n_samples_from_dataset": "qsl_reported_performance_count",
    # --- Scenario / mode ---
    "effective_scenario": "effective_scenario",
    "mode": "effective_test_mode",
    "streaming": "streaming",
    "output_sequence_lengths.min": "min_output_tokens",
    "output_sequence_lengths.max": "max_output_tokens",
    "load_pattern": "load_pattern",
    # --- Durations, query counts, and RNG seeds ---
    "min_duration_ms": "effective_min_duration_ms",
    "max_duration_ms": "effective_max_duration_ms",
    "effective_target_duration_ms": "effective_target_duration_ms",
    "min_sample_count": "effective_min_query_count",
    "effective_sample_index_rng_seed": "effective_sample_index_rng_seed",
    "effective_schedule_rng_seed": "effective_schedule_rng_seed",
    "effective_sample_concatenate_permutation": "effective_sample_concatenate_permutation",
    "effective_samples_per_query": "effective_samples_per_query",
    "generated_query_count": "generated_query_count",
    "generated_query_duration": "generated_query_duration",
    # --- Throughput ---
    "target_qps": "effective_target_qps",  # (results_summary.json)
    "result_scheduled_samples_per_sec": "result_scheduled_samples_per_sec",
    "qps": "result_completed_samples_per_sec",
    "results_sample_per_second": "results_sample_per_second",
    "effective_max_concurrency": "effective_max_async_queries",
    # --- End-to-end latency targets and percentiles (ns) ---
    "effective_target_latency_ns": "effective_target_latency_ns",
    "effective_target_latency_percentile": "effective_target_latency_percentile",
    "latency.min": "result_min_latency_ns",
    "latency.max": "result_max_latency_ns",
    "latency.avg": "result_mean_latency_ns",
    "latency.percentiles.50": "result_50.00_percentile_latency_ns",
    "latency.percentiles.90": "result_90.00_percentile_latency_ns",
    "latency.percentiles.95": "result_95.00_percentile_latency_ns",
    "latency.percentiles.99": "result_99.00_percentile_latency_ns",
    "latency.percentiles.99.9": "result_99.90_percentile_latency_ns",
    # --- Time-to-first-token (TTFT) latency (ns) ---
    "ttft.min": "result_first_token_min_latency_ns",
    "ttft.max": "result_first_token_max_latency_ns",
    "ttft.avg": "result_first_token_mean_latency_ns",
    "ttft.percentiles.50": "result_first_token_50.00_percentile_latency_ns",
    "ttft.percentiles.90": "result_first_token_90.00_percentile_latency_ns",
    "ttft.percentiles.95": "result_first_token_95.00_percentile_latency_ns",
    "ttft.percentiles.99": "result_first_token_99.00_percentile_latency_ns",
    "ttft.percentiles.99.9": "result_first_token_99.90_percentile_latency_ns",
    # --- Time-per-output-token (TPOT) latency (ns) ---
    "tpot.percentiles.50": "result_time_per_output_token_50.00_percentile_ns",
    "tpot.percentiles.90": "result_time_per_output_token_90.00_percentile_ns",
    "tpot.percentiles.95": "result_time_per_output_token_95.00_percentile_ns",
    "tpot.percentiles.99": "result_time_per_output_token_99.00_percentile_ns",
    "tpot.percentiles.99.9": "result_time_per_output_token_99.90_percentile_ns",
    "tpot.min": "result_time_to_output_token_min",
    "tpot.max": "result_time_to_output_token_max",
    "tpot.avg": "result_time_to_output_token_mean",
    "tps": "result_completed_tokens_per_second",
    # --- Result validity / constraint flags ---
    "result_validity": "result_validity",
    "result_perf_constraints_met": "result_perf_constraints_met",
    "result_min_duration_met": "result_min_duration_met",
    "result_min_queries_met": "result_min_queries_met",
    "early_stopping_met": "early_stopping_met",
    "early_stopping_ttft_result": "early_stopping_ttft_result",
    "early_stopping_tpot_result": "early_stopping_tpot_result",
    "result.total": "result_query_count",
    "result_overlatency_query_count": "result_overlatency_query_count",
    "result.failed": "num_errors",
}
1 change: 1 addition & 0 deletions src/inference_endpoint/config/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,7 @@ class BenchmarkConfig(BaseModel):
report_dir: Path | None = None
timeout: float | None = None
verbose: bool = False
ensure_submission_checker_compatibility: bool = False
# CPU affinity for loadgen and worker processes:
# - True = auto (compute optimal NUMA-aware plan)
# - False = disabled (no CPU pinning)
Expand Down
Loading
Loading