diff --git a/.github/workflows/test-resnet50.yml b/.github/workflows/test-resnet50.yml
index a025d79889..f70ce9e44f 100755
--- a/.github/workflows/test-resnet50.yml
+++ b/.github/workflows/test-resnet50.yml
@@ -35,9 +35,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
- python3 -m pip install mlcflow
- mlc pull repo mlcommons@mlperf-automations --branch=dev
-#python3 -m pip install mlc-scripts
+ python3 -m pip install mlc-scripts
- name: Test Resnet50 and end to end submission generation
run: |
- mlcr run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=resnet50 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --adr.compiler.tags=gcc --adr.inference-src.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --adr.inference-src-loadgen.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src-loadgen.version=custom --adr.loadgen.version=custom ${{ matrix.loadgen-flag }}
+ mlcr run,mlperf,inference,generate-run-cmds,_submission,_short,_r6.0-dev --quiet --submitter="MLCommons" --hw_name=default --model=resnet50 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --adr.compiler.tags=gcc --adr.inference-src.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --adr.inference-src-loadgen.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src-loadgen.version=custom --adr.loadgen.version=custom ${{ matrix.loadgen-flag }}
\ No newline at end of file
diff --git a/.github/workflows/test-retinanet.yml b/.github/workflows/test-retinanet.yml
index 738c5a67c0..715853b42b 100755
--- a/.github/workflows/test-retinanet.yml
+++ b/.github/workflows/test-retinanet.yml
@@ -33,9 +33,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
- python3 -m pip install mlcflow
- mlc pull repo mlcommons@mlperf-automations --branch=dev
-#python3 -m pip install mlc-scripts
+ python3 -m pip install mlc-scripts
- name: Test Retinanet and end to end submission generation
run: |
- mlcr run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=retinanet --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=10 --adr.compiler.tags=gcc --adr.inference-src.version=custom --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }} --adr.inference-src-loadgen.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src-loadgen.version=custom --adr.loadgen.version=custom
+ mlcr run,mlperf,inference,generate-run-cmds,_submission,_short,_r6.0-dev --quiet --submitter="MLCommons" --hw_name=default --model=retinanet --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=10 --adr.compiler.tags=gcc --adr.inference-src.version=custom --adr.accuracy-check-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }} --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }} --adr.inference-src-loadgen.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src-loadgen.version=custom --adr.loadgen.version=custom
diff --git a/.github/workflows/test-rgat.yml b/.github/workflows/test-rgat.yml
index 5bc7628007..e84bd4ace3 100644
--- a/.github/workflows/test-rgat.yml
+++ b/.github/workflows/test-rgat.yml
@@ -32,9 +32,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
- python3 -m pip install mlcflow
- mlc pull repo mlcommons@mlperf-automations --branch=dev
-#python3 -m pip install mlc-scripts
+ python3 -m pip install mlc-scripts
- name: Test R-GAT and end to end submission generation
run: |
- mlcr run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --category=datacenter --hw_name=default --model=rgat --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --adr.compiler.tags=gcc --adr.inference-src.tags=_branch.$PR_HEAD_REF,_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src-loadgen.tags=_branch.$PR_HEAD_REF,_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src-loadgen.version=custom --adr.loadgen.version=custom ${{ matrix.loadgen-flag }}
+ mlcr run,mlperf,inference,generate-run-cmds,_submission,_short,_r6.0-dev --quiet --submitter="MLCommons" --category=datacenter --hw_name=default --model=rgat --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --adr.compiler.tags=gcc --adr.inference-src.tags=_branch.$PR_HEAD_REF,_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src-loadgen.tags=_branch.$PR_HEAD_REF,_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src-loadgen.version=custom --adr.loadgen.version=custom ${{ matrix.loadgen-flag }}
diff --git a/.github/workflows/test-submission-generation.yml b/.github/workflows/test-submission-generation.yml
index e34d23c29e..d541025076 100644
--- a/.github/workflows/test-submission-generation.yml
+++ b/.github/workflows/test-submission-generation.yml
@@ -13,9 +13,9 @@ on:
jobs:
run-tests:
- uses: mlcommons/mlperf-automations/.github/workflows/test-mlperf-inference-submission-generation.yml@subcheckmod_change
+ uses: mlcommons/mlperf-automations/.github/workflows/test-mlperf-inference-submission-generation.yml@dev
with:
inf-branch: ${{ github.event.pull_request.head.ref }}
inf-repo-url: ${{ github.event.pull_request.head.repo.html_url }}
automation-repo: mlcommons@mlperf-automations
- automation-repo-branch: subcheckmod_change
\ No newline at end of file
+ automation-repo-branch: dev
diff --git a/tools/submission/README.md b/tools/submission/README.md
index 5a31a304a2..6d620233b2 100644
--- a/tools/submission/README.md
+++ b/tools/submission/README.md
@@ -29,7 +29,7 @@ Output directory with submission with truncated `mlperf_log_accuracy.json` files
### Summary
The input submission directory is modified with empty directories removed and low accuracy results inferred. Multistream and offline scenario results are also wherever possible. The original input directory is saved in a timestamped directory.
-## `submission_checker.py` (Mandatory)
+## `submission_checker/main.py` (Mandatory)
### Inputs
**input**: Path to the directory containing one or several submissions.
**version**: Checker version. E.g v1.1, v2.0, v2.1, v3.0, v3.1.
@@ -50,17 +50,73 @@ The below input fields are off by default since v3.1 and are mandatory but can b
**skip-check-power-measure-files**: Flag to avoid checking if the required power measurement files are present
### Summary
-Checks a directory that contains one or several submissions. This script can be used by running the following command:
+Checks a directory that contains one or several submissions. This script can be used by running the following command (outside the inference repository):
```
-python3 submission_checker.py --input
+python3 -m inference.tools.submission.submission_checker.main
+ --input
[--version ]
[--submitter ]
[--csv ]
[--skip_compliance]
[--extra-model-benchmark-map ]
[--submission-exceptions]
+ [--skip-power-check]
+ [--skip-meaningful-fields-emptiness-check]
+ [--skip-check-power-measure-files]
+ [--skip-empty-files-check]
+ [--skip-extra-files-in-root-check]
+ [--skip-extra-accuracy-files-check]
+ [--scenarios-to-skip]
+ [--skip-all-systems-have-results-check]
+ [--skip-calibration-check]
```
+### Implemented checks
+**performance:**
+- Check performance detailed log exists
+- Check for loadgen errors
+- Check for equal issue mode when it is required
+- Check the performance sample count used for running the benchmark
+- Check loadgen seeds are correct
+- Check latency constraint is met
+- Check minimum query count is met
+- Check minimum duration is met
+- Check network requirements
+- Check LLM latencies are met (if applicable)
+- Check loadgen scenario matches with submission scenario or that result can be inferred
+
+**accuracy:**
+- Check the accuracy metric is correct and over the expected threshold (or within a range where applicable)
+- Check accuracy json exists and is truncated
+- Check for loadgen error
+- Check full dataset is used for the accuracy run
+
+**compliance:**
+- Check compliance directory exists
+- Run performance checks for compliance run
+- Check accuracy test passes
+- Check performance test passes
+
+**measurements:**
+- Check measurements files exist
+- Check the required files are there
+- Check the required fields are there
+
+**power:**
+- Check the required power files are there (if the submission has power)
+- Run the external power checks
+- Check power metric can be calculated
+
+**system:**
+- Check system json exists
+- Check availability is valid
+- Check system type is valid
+- Check network fields
+- Check required fields are included in system json file
+- Check submitter is correct
+- Check division is correct
+
+
### Outputs
- CSV file containing all the valid results in the directory.
- It raises several errors and logs invalid results.
diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py
index 453532178e..0101bbf12f 100644
--- a/tools/submission/preprocess_submission.py
+++ b/tools/submission/preprocess_submission.py
@@ -9,7 +9,7 @@
import shutil
import json
-import submission_checker as checker
+import submission_checker_old as checker
logging.basicConfig(level=logging.INFO)
@@ -120,6 +120,8 @@ def copy_submission_dir(src, dst, filter_submitter, keep_structure=True):
src, division, submitter, dir)):
target_dir = "results" if dir in [
"compliance", "measurements"] else dir
+ target_dir = "src" if dir in [
+ "code"] else target_dir
shutil.copytree(
os.path.join(src, division, submitter, dir),
os.path.join(dst, division, submitter, target_dir),
@@ -301,12 +303,10 @@ def clean_invalid_results(args, log_path, config, system_desc, system_json,
model_compliance_path = model_results_path
model_code_path = os.path.join(
change_folder_name_in_path(
- log_path, "results", "code"), model)
+ log_path, "results", "src"), model)
if not args.nomove_failed_to_open:
target_code_path = change_first_directory_to_open(
model_code_path)
- target_code_path = change_folder_name_in_path(
- log_path, "code", "src")
target_results_path = change_first_directory_to_open(
model_results_path)
target_measurements_path = change_first_directory_to_open(
diff --git a/tools/submission/submission_checker/__init__.py b/tools/submission/submission_checker/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tools/submission/submission_checker/checks/__init__.py b/tools/submission/submission_checker/checks/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tools/submission/submission_checker/checks/accuracy_check.py b/tools/submission/submission_checker/checks/accuracy_check.py
new file mode 100644
index 0000000000..9812434644
--- /dev/null
+++ b/tools/submission/submission_checker/checks/accuracy_check.py
@@ -0,0 +1,231 @@
+from .base import BaseCheck
+from ..constants import *
+from ..loader import SubmissionLogs
+from ..configuration.configuration import Config
+import re
+import os
+
+
+class AccuracyCheck(BaseCheck):
+ """Checks accuracy-related submission artifacts and reports issues.
+
+ The `AccuracyCheck` class performs a set of validations on submission
+ accuracy outputs. It inspects the parsed MLPerf log and accompanying
+ accuracy artifacts provided via `SubmissionLogs` and the test
+ `Config` to ensure that reported accuracy metrics meet configured
+ targets and limits, that the accuracy JSON exists and is properly
+ truncated, that Loadgen did not report blocking errors, and that the
+ accuracy run covered the expected dataset size.
+
+ Main check methods:
+ - `accuracy_result_check`: Parses `accuracy.txt` lines to validate
+ reported metrics against targets, upper limits, and hash presence.
+ - `accuracy_json_check`: Ensures the accuracy JSON file exists and is
+ within allowed size limits.
+ - `loadgen_errors_check`: Fails if Loadgen reported non-ignored errors.
+ - `dataset_check`: Verifies the reported sample count matches the
+ configured dataset size unless the check is skipped.
+
+ Attributes:
+ submission_logs (SubmissionLogs): Holder for submission log paths
+ and parsed contents (accuracy logs, results, json, loader data).
+ mlperf_log: Parsed MLPerf log object used to inspect errors and
+ run metadata.
+ accuracy_result (list[str]): Lines from `accuracy.txt` used to
+ extract reported accuracy values.
+ accuracy_json (str): Path to the accuracy JSON file.
+ config (Config): Configuration helper providing target values and
+ dataset sizes.
+ """
+
+ def __init__(
+ self, log, path, config: Config, submission_logs: SubmissionLogs
+ ):
+ """Initialize the accuracy check helper.
+
+ Args:
+ log: Logger instance used to report messages.
+ path: Path to the submission being checked.
+ config (Config): Configuration provider for targets and limits.
+ submission_logs (SubmissionLogs): Parsed submission logs and
+ artifact paths (accuracy logs, results, json, loader data).
+ """
+ super().__init__(log, path)
+ self.name = "accuracy checks"
+ self.submission_logs = submission_logs
+ self.mlperf_log = self.submission_logs.accuracy_log
+ self.accuracy_result = self.submission_logs.accuracy_result
+ self.accuracy_json = self. submission_logs.accuracy_json
+ self.config = config
+ self.model = self.submission_logs.loader_data.get("benchmark", "")
+ self.model_mapping = self.submission_logs.loader_data.get(
+ "model_mapping", {})
+ self.model = self.config.get_mlperf_model(
+ self.model, self.model_mapping)
+ self.scenario_fixed = self.submission_logs.loader_data.get(
+ "scenario", "")
+ self.scenario = self.mlperf_log["effective_scenario"]
+ self.division = self.submission_logs.loader_data.get("division", "")
+ self.setup_checks()
+
+ def setup_checks(self):
+ """Register individual accuracy-related checks.
+
+ Adds the per-submission validation callables to `self.checks` in
+ the order they should be executed.
+ """
+ self.checks.append(self.accuracy_result_check)
+ self.checks.append(self.accuracy_json_check)
+ self.checks.append(self.loadgen_errors_check)
+ self.checks.append(self.dataset_check)
+
+ def accuracy_result_check(self):
+ """Validate reported accuracy metrics in `accuracy.txt`.
+
+ Parses lines from `self.accuracy_result` using configured patterns
+ and compares found values against targets and optional upper
+ limits. Also ensures a hash value is present and records the
+ observed accuracy metrics in `submission_logs.loader_data`.
+
+ Returns:
+ bool: True if accuracy checks passed (or division is 'open'),
+ False otherwise.
+ """
+
+ patterns, acc_targets, acc_types, acc_limits, up_patterns, acc_upper_limit = self.config.get_accuracy_values(
+ self.model
+ )
+ acc = None
+ hash_val = None
+ result_acc = {}
+ acc_limit_check = True
+ all_accuracy_valid = True
+ acc_seen = [False for _ in acc_targets]
+ for line in self.accuracy_result:
+ for i, (pattern, acc_target, acc_type) in enumerate(
+ zip(patterns, acc_targets, acc_types)
+ ):
+ m = re.match(pattern, line)
+ if m:
+ acc = m.group(1)
+ m = re.match(r"^hash=([\w\d]+)$", line)
+ if m:
+ hash_val = m.group(1)
+ if acc is not None and float(acc) >= acc_target:
+ all_accuracy_valid &= True
+ acc_seen[i] = True
+ elif acc is not None:
+ all_accuracy_valid = False
+ self.log.warning(
+ "%s accuracy not met: expected=%f, found=%s",
+ self.path,
+ acc_target,
+ acc,
+ )
+ if acc:
+ result_acc[acc_type] = acc
+ acc = None
+
+ if acc_upper_limit is not None:
+ for i, (pattern, acc_limit) in enumerate(
+ zip(up_patterns, acc_limits)):
+ m = re.match(pattern, line)
+ if m:
+ acc = m.group(1)
+ m = re.match(r"^hash=([\w\d]+)$", line)
+ if m:
+ hash_val = m.group(1)
+ if (
+ acc is not None
+ and acc_upper_limit is not None
+ and float(acc) > acc_limit
+ ):
+ acc_limit_check = False
+ self.log.warning(
+ "%s accuracy not met: upper limit=%f, found=%s",
+ self.path,
+ acc_limit,
+ acc,
+ )
+ acc = None
+ if all(acc_seen) and hash_val:
+ break
+ is_valid = all_accuracy_valid & all(acc_seen)
+ if acc_upper_limit is not None:
+ is_valid &= acc_limit_check
+ if not hash_val:
+ self.log.error("%s not hash value for accuracy.txt", self.path)
+ is_valid = False
+ self.submission_logs.loader_data["accuracy_metrics"] = result_acc
+ if self.division.lower() == "open":
+ return True
+ return is_valid
+
+ def accuracy_json_check(self):
+ """Check that the accuracy JSON exists and is within size limits.
+
+ Returns:
+ bool: True if the JSON file exists and its size does not
+ exceed `MAX_ACCURACY_LOG_SIZE`, False otherwise.
+ """
+ if not os.path.exists(self.accuracy_json):
+ self.log.error("%s is missing", self.accuracy_json)
+ return False
+ else:
+ if os.stat(self.accuracy_json).st_size > MAX_ACCURACY_LOG_SIZE:
+ self.log.error("%s is not truncated", self.accuracy_json)
+ return False
+ return True
+
+ def loadgen_errors_check(self):
+ """Detect Loadgen errors reported in the MLPerf log.
+
+ If errors are present and not ignored by configuration, logs the
+ error messages and returns False to indicate failure.
+
+ Returns:
+ bool: True if no blocking Loadgen errors are present,
+ False otherwise.
+ """
+ if self.mlperf_log.has_error():
+ if self.config.ignore_uncommited:
+ has_other_errors = False
+ for error in self.mlperf_log.get_errors():
+ if "Loadgen built with uncommitted changes!" not in error["value"]:
+ has_other_errors = True
+ self.log.error("%s contains errors:", self.path)
+ for error in self.mlperf_log.get_errors():
+ self.log.error("%s", error["value"])
+
+ if not self.config.ignore_uncommited or has_other_errors:
+ self.log.error(
+ "%s has loadgen errors, number of errors: %s", self.path, self.mlperf_log.num_errors()
+ )
+ return False
+ return True
+
+ def dataset_check(self):
+ """Verify the accuracy run covered the expected dataset size.
+
+ If `skip_dataset_size_check` is enabled in the configuration,
+ this check is skipped and returns True. Otherwise compares the
+ `qsl_reported_total_count` from the MLPerf log to the expected
+ dataset size for the model.
+
+ Returns:
+ bool: True if the dataset sizes match or the check is skipped,
+ False if the reported count differs from expected.
+ """
+ if self.config.skip_dataset_size_check:
+ self.log.info(
+ "%s Skipping dataset size check", self.path
+ )
+ return True
+ qsl_total_count = self.mlperf_log["qsl_reported_total_count"]
+ expected_qsl_total_count = self.config.get_dataset_size(self.model)
+ if qsl_total_count != expected_qsl_total_count:
+ self.log.error(
+ "%s accurcy run does not cover all dataset, accuracy samples: %s, dataset size: %s", self.path, qsl_total_count, expected_qsl_total_count
+ )
+ return False
+ return True
diff --git a/tools/submission/submission_checker/checks/base.py b/tools/submission/submission_checker/checks/base.py
new file mode 100644
index 0000000000..8e2a678fb9
--- /dev/null
+++ b/tools/submission/submission_checker/checks/base.py
@@ -0,0 +1,51 @@
+from abc import ABC, abstractmethod
+
+
+class BaseCheck(ABC):
+ """
+ A generic check class meant to be inherited by concrete check implementations.
+ Subclasses must register their check methods into `self.checks`.
+ """
+
+ def __init__(self, log, path):
+ self.checks = []
+ self.log = log
+ self.path = path
+ self.name = "base checks"
+ pass
+
+ def run_checks(self):
+ """
+ Execute all registered checks. Returns True if all checks pass, False otherwise.
+ """
+ valid = True
+ errors = []
+ for check in self.checks:
+ try:
+ v = self.execute(check)
+ valid &= v
+ except BaseException:
+ valid &= False
+ self.log.error(
+ "Execution occurred in running check %s. Running %s in %s",
+ self.path,
+ check.__name__,
+ self.__class__.__name__)
+ return valid
+
+ def execute(self, check):
+ """Custom execution of a single check method."""
+ return check()
+
+ def __call__(self):
+ """Allows the check instance to be called like a function."""
+ self.log.info("Starting %s for: %s", self.name, self.path)
+ valid = self.run_checks()
+ if valid:
+ self.log.info("All %s checks passed for: %s", self.name, self.path)
+ else:
+ self.log.error(
+ "Some %s Checks failed for: %s",
+ self.name,
+ self.path)
+ return valid
diff --git a/tools/submission/submission_checker/checks/compliance_check.py b/tools/submission/submission_checker/checks/compliance_check.py
new file mode 100644
index 0000000000..ef2736759d
--- /dev/null
+++ b/tools/submission/submission_checker/checks/compliance_check.py
@@ -0,0 +1,365 @@
+
+from .base import BaseCheck
+from ..constants import *
+from ..loader import SubmissionLogs
+from ..configuration.configuration import Config
+from .performance_check import PerformanceCheck
+from .accuracy_check import AccuracyCheck
+from ..utils import *
+import re
+import os
+
+
+class ComplianceCheck(BaseCheck):
+ """Validate compliance test artifacts for a submission.
+
+ The `ComplianceCheck` class runs a set of validations against the
+ compliance directory produced with a submission. It verifies the
+ presence of required test subdirectories and files, runs delegated
+ performance and accuracy checks for compliance tests, and inspects
+ compliance-specific performance outputs.
+
+ The class delegates some checks to `PerformanceCheck` and
+ `AccuracyCheck` helpers when relevant. Results and file lists are
+ logged via the provided logger.
+ """
+
+ def __init__(self, log, path, config: Config,
+ submission_logs: SubmissionLogs):
+ """Initialize the compliance checker.
+
+ Args:
+ log: Logger used to emit informational, warning, and error
+ messages about the compliance checks.
+ path: Filesystem path to the submission root being checked.
+ config (Config): Configuration provider for models and
+ compliance expectations.
+ submission_logs (SubmissionLogs): Parsed submission log
+ artifacts and loader metadata.
+ """
+ super().__init__(log, path)
+ self.submission_logs = submission_logs
+ self.config = config
+ self.model = self.submission_logs.loader_data.get("benchmark", "")
+ self.model_mapping = self.submission_logs.loader_data.get(
+ "model_mapping", {})
+ self.compliance_dir = self.submission_logs.loader_data.get(
+ "compliance_path", {})
+ self.division = self.submission_logs.loader_data.get("division", "")
+ self.model = self.config.get_mlperf_model(
+ self.model, self.model_mapping)
+ self.test_list = self.get_test_list(self.model)
+ self.setup_checks()
+
+ def setup_checks(self):
+ """Register the sequence of compliance checks to run.
+
+ Appends the per-submission validation callables to `self.checks` in
+ the order they should be executed by the checking framework.
+ """
+ self.checks.append(self.dir_exists_check)
+ self.checks.append(self.performance_check)
+ self.checks.append(self.accuracy_check)
+ self.checks.append(self.compliance_performance_check)
+
+ def get_test_list(self, model):
+ """Return the list of compliance tests applicable to `model`.
+
+ The mapping of models to tests is read from the configuration
+ (`self.config.base`) using the pre-defined keys
+ `models_TEST01`, `models_TEST04`, and `models_TEST06`.
+
+ Args:
+ model (str): MLPerf benchmark/model identifier.
+
+ Returns:
+ list[str]: Ordered list of compliance test names to execute.
+ """
+
+ test_list = []
+ if model in self.config.base["models_TEST01"]:
+ test_list.append("TEST01")
+ if model in self.config.base["models_TEST04"]:
+ test_list.append("TEST04")
+ if model in self.config.base["models_TEST06"]:
+ test_list.append("TEST06")
+ return test_list
+
+ def dir_exists_check(self):
+ """Verify required compliance directories and files exist.
+
+ Skips checks for the 'open' division. For each test in
+ `self.test_list`, ensures the expected test directory exists and
+ that required verification files are present depending on the
+ test type (accuracy/performance files for specific tests).
+
+ Returns:
+ bool: True if all required files and directories are present,
+ False otherwise.
+ """
+
+ if self.division.lower() == "open":
+ self.log.info(
+ "Compliance tests not needed for open division. Skipping tests on %s",
+ self.path)
+ return True
+ is_valid = True
+ for test in self.test_list:
+ test_dir = os.path.join(self.compliance_dir, test)
+ acc_path = os.path.join(
+ self.compliance_dir, test, "verify_accuracy.txt")
+ perf_comp_path = os.path.join(
+ self.compliance_dir, test, "verify_performance.txt")
+ perf_path = os.path.join(
+ self.compliance_dir,
+ test,
+ "performance",
+ "run_1",
+ "mlperf_log_detail.txt")
+ if not os.path.exists(test_dir):
+ self.log.error(
+ "Missing %s in compliance dir %s",
+ test,
+ self.compliance_dir)
+ is_valid = False
+ if test in ["TEST01", "TEST06"]:
+ if not os.path.exists(acc_path):
+ self.log.error(
+ "Missing accuracy file in compliance dir. Needs file %s", acc_path)
+ is_valid = False
+ if test in ["TEST01", "TEST04"]:
+ if not os.path.exists(perf_comp_path):
+ self.log.error(
+ "Missing performance file in compliance dir. Needs file %s",
+ perf_comp_path)
+ is_valid = False
+ if not os.path.exists(perf_path):
+ self.log.error(
+ "Missing perfomance file in compliance dir. Needs file %s", perf_path)
+ is_valid = False
+ return is_valid
+
+ def performance_check(self):
+ """Run performance compliance checks for applicable tests.
+
+ For each test that requires a performance check (TEST01 and
+ TEST04), construct a `SubmissionLogs` object pointing at the
+ test's performance log and delegate to `PerformanceCheck`.
+
+ Returns:
+ bool: True if all delegated performance checks pass, False
+ if any fail.
+ """
+
+ if self.division.lower() == "open":
+ self.log.info(
+ "Compliance tests not needed for open division. Skipping tests on %s",
+ self.path)
+ return True
+ is_valid = True
+ for test in self.test_list:
+ if test in ["TEST01", "TEST04"]:
+ test_data = {
+ "division": self.submission_logs.loader_data.get("division", ""),
+ "benchmark": self.submission_logs.loader_data.get("benchmark", ""),
+ "scenario": self.submission_logs.loader_data.get("scenario", ""),
+ "model_mapping": self.submission_logs.loader_data.get("model_mapping", {})
+ }
+ test_logs = SubmissionLogs(
+ self.submission_logs.loader_data[f"{test}_perf_log"], None, None, None, self.submission_logs.system_json, None, test_data)
+ perf_check = PerformanceCheck(self.log, os.path.join(
+ self.compliance_dir, test), self.config, test_logs)
+ is_valid &= perf_check()
+ return is_valid
+
+ def accuracy_check(self):
+ """Run accuracy compliance checks for applicable tests.
+
+ For TEST01, verifies deterministic-mode pass lines and checks the
+ `accuracy` directory contents and baseline/compliance accuracy
+ values against model-specific delta thresholds.
+
+ For TEST06, inspects the pre-parsed result lines for first-token,
+ EOS, and sample-length checks.
+
+ Returns:
+ bool: True if all required accuracy checks pass, False
+ otherwise.
+ """
+
+ if self.division.lower() == "open":
+ self.log.info(
+ "Compliance tests not needed for open division. Skipping tests on %s",
+ self.path)
+ return True
+ is_valid = True
+ for test in self.test_list:
+ test_dir = os.path.join(self.compliance_dir, test)
+ if test == "TEST01":
+ lines = self.submission_logs.loader_data[f"{test}_acc_result"]
+ lines = [line.strip() for line in lines]
+ if "TEST PASS" in lines:
+ self.log.info(
+ "Compliance test accuracy check (deterministic mode) in %s passed",
+ test_dir,
+ )
+ else:
+ self.log.info(
+ "Compliance test accuracy check (deterministic mode) in %s failed",
+ test_dir,
+ )
+ test_acc_path = os.path.join(test_dir, "accuracy")
+ if not os.path.exists(test_acc_path):
+ self.log.error(
+ "%s has no accuracy directory", test_dir)
+ is_valid = False
+ else:
+ diff = files_diff(
+ list_files(
+ test_acc_path), REQUIRED_TEST01_ACC_FILES,
+ )
+ if diff:
+ self.log.error(
+ "%s has file list mismatch (%s)",
+ test_acc_path,
+ diff)
+ is_valid = False
+ else:
+ target = self.config.get_accuracy_target(
+ self.model)
+ patterns, acc_targets, acc_types, acc_limits, up_patterns, acc_upper_limit = self.config.get_accuracy_values(
+ self.model)
+ acc_limit_check = True
+
+ acc_seen = [False for _ in acc_targets]
+ acc_baseline = {
+ acc_type: 0 for acc_type in acc_types}
+ acc_compliance = {
+ acc_type: 0 for acc_type in acc_types}
+ with open(
+ os.path.join(
+ test_acc_path, "baseline_accuracy.txt"),
+ "r",
+ encoding="utf-8",
+ ) as f:
+ for line in f:
+ for acc_type, pattern in zip(
+ acc_types, patterns):
+ m = re.match(pattern, line)
+ if m:
+ acc_baseline[acc_type] = float(
+ m.group(1))
+ with open(
+ os.path.join(
+ test_acc_path, "compliance_accuracy.txt"),
+ "r",
+ encoding="utf-8",
+ ) as f:
+ for line in f:
+ for acc_type, pattern in zip(
+ acc_types, patterns):
+ m = re.match(pattern, line)
+ if m:
+ acc_compliance[acc_type] = float(
+ m.group(1))
+ for acc_type in acc_types:
+ if acc_baseline[acc_type] == 0 or acc_compliance[acc_type] == 0:
+ is_valid = False
+ break
+ else:
+ required_delta_perc = self.config.get_delta_perc(
+ self.model, acc_type
+ )
+ delta_perc = (
+ abs(
+ 1
+ - acc_baseline[acc_type] /
+ acc_compliance[acc_type]
+ )
+ * 100
+ )
+ if delta_perc <= required_delta_perc:
+ is_valid = True
+ else:
+ self.log.error(
+ "Compliance test accuracy check (non-deterministic mode) in %s failed",
+ test_dir,
+ )
+ is_valid = False
+ break
+ elif test == "TEST06":
+ lines = self.submission_logs.loader_data[f"{test}_acc_result"]
+ lines = [line.strip() for line in lines]
+ first_token_pass = (
+ "First token check pass: True" in lines
+ or "First token check pass: Skipped" in lines
+ )
+ eos_pass = "EOS check pass: True" in lines
+ length_check_pass = "Sample length check pass: True" in lines
+ is_valid &= (
+ first_token_pass and eos_pass and length_check_pass)
+ if not is_valid:
+ self.log.error(
+ f"TEST06 accuracy check failed. first_token_check: {first_token_pass} eos_check: {eos_pass} length_check: {length_check_pass}."
+ )
+ else:
+ self.log.info(f"{test_dir} does not require accuracy check")
+ return is_valid
+
+ def compliance_performance_check(self):
+ """Inspect compliance performance verification outputs.
+
+ For TEST01 and TEST04, checks the `verify_performance.txt` file for
+ a passing indicator and ensures the `performance/run_1` directory
+ contains the expected files (with optional exclusions).
+
+ Returns:
+ bool: True if all compliance performance checks pass, False
+ if any check fails.
+ """
+
+ if self.division.lower() == "open":
+ self.log.info(
+ "Compliance tests not needed for open division. Skipping tests on %s",
+ self.path)
+ return True
+ is_valid = True
+ for test in self.test_list:
+ test_dir = os.path.join(self.compliance_dir, test)
+ if test in ["TEST01", "TEST04"]:
+ fname = os.path.join(test_dir, "verify_performance.txt")
+ if not os.path.exists(fname):
+ self.log.error("%s is missing in %s", fname, test_dir)
+ is_valid = False
+ else:
+ with open(fname, "r") as f:
+ for line in f:
+ # look for: TEST PASS
+ if "TEST PASS" in line:
+ is_valid = True
+ break
+ if is_valid == False:
+ self.log.error(
+ "Compliance test performance check in %s failed",
+ test_dir)
+
+ # Check performance dir
+ test_perf_path = os.path.join(
+ test_dir, "performance", "run_1")
+ if not os.path.exists(test_perf_path):
+ self.log.error(
+ "%s has no performance/run_1 directory", test_dir)
+ is_valid = False
+ else:
+ diff = files_diff(
+ list_files(test_perf_path),
+ REQUIRED_COMP_PER_FILES,
+ ["mlperf_log_accuracy.json"],
+ )
+ if diff:
+ self.log.error(
+ "%s has file list mismatch (%s)",
+ test_perf_path,
+ diff)
+ is_valid = False
+ return is_valid
diff --git a/tools/submission/submission_checker/checks/measurements_checks.py b/tools/submission/submission_checker/checks/measurements_checks.py
new file mode 100644
index 0000000000..06b89f56fc
--- /dev/null
+++ b/tools/submission/submission_checker/checks/measurements_checks.py
@@ -0,0 +1,139 @@
+from .base import BaseCheck
+from ..constants import *
+from ..loader import SubmissionLogs
+from ..configuration.configuration import Config
+from ..utils import *
+import os
+
+
+class MeasurementsCheck(BaseCheck):
+ """Validate measurement artifacts included in a submission.
+
+ The `MeasurementsCheck` class verifies the presence and basic
+ correctness of measurement-related files and fields produced by a
+ submission. It ensures the measurements JSON exists, required files
+ are present (and optionally non-empty), the source directory exists,
+ and that required metadata fields inside the measurements JSON have
+ meaningful values.
+
+ Attributes:
+ submission_logs (SubmissionLogs): Parsed submission logs and
+ metadata used to locate measurement artifacts.
+ measurements_json (dict): Parsed contents of the measurements JSON.
+ measurements_dir (str): Path to the measurements directory to
+ validate file contents.
+ src_dir (str): Path to the submission source directory expected to
+ be present in the submission bundle.
+ config (Config): Configuration provider toggling optional checks.
+ """
+
+    def __init__(self, log, path, config: Config,
+                 submission_logs: SubmissionLogs):
+        """Initialize the measurements checker.
+
+        Args:
+            log: Logger used to emit info/warning/error messages.
+            path: Path to the submission root being validated.
+            config (Config): Configuration helper containing feature
+                toggles for skipped checks.
+            submission_logs (SubmissionLogs): Parsed submission artifacts
+                and loader metadata.
+        """
+        super().__init__(log, path)
+        self.name = "measurement checks"
+        self.submission_logs = submission_logs
+        self.measurements_json = self.submission_logs.measurements_json
+        # Loader-provided metadata; empty strings serve as safe fallbacks
+        # when the loader could not determine a value.
+        self.submitter = self.submission_logs.loader_data.get("submitter", "")
+        self.division = self.submission_logs.loader_data.get("division", "")
+        self.measurements_dir = self.submission_logs.loader_data.get(
+            "measurements_dir", "")
+        self.src_dir = self.submission_logs.loader_data.get("src_path", "")
+        self.config = config
+        # Populate self.checks so the framework can run them in order.
+        self.setup_checks()
+
+    def setup_checks(self):
+        """Register per-submission measurement checks.
+
+        Appends the callable checks to `self.checks` in the order they
+        should be executed by the submission validation framework.
+        """
+        self.checks.append(self.missing_check)
+        self.checks.append(self.directory_exist_check)
+        self.checks.append(self.required_files_check)
+        self.checks.append(self.required_fields_check)
+
+    def missing_check(self):
+        """Ensure a measurements JSON was provided.
+
+        Returns:
+            bool: True if `measurements_json` is present, False otherwise.
+        """
+        if self.measurements_json is None:
+            self.log.error(
+                "%s measurements json file not found",
+                self.path
+            )
+            return False
+        return True
+
+    def directory_exist_check(self):
+        """Verify the expected source directory exists in the submission.
+
+        Returns:
+            bool: True if `src_dir` exists, False otherwise.
+        """
+        # NOTE(review): src_dir defaults to "" when the loader did not
+        # provide it; os.path.exists("") is False, so that case fails here.
+        if not os.path.exists(self.src_dir):
+            self.log.error(
+                "%s src directory does not exist",
+                self.src_dir
+            )
+            return False
+        return True
+
+    def required_files_check(self):
+        """Confirm required measurement files exist and are non-empty.
+
+        Respects the `skip_empty_files_check` configuration flag; when that
+        flag is False, files with zero size will cause the check to fail.
+
+        Returns:
+            bool: True if all required files are present (and non-empty when
+            configured), False otherwise.
+        """
+        is_valid = True
+        files = list_files(self.measurements_dir)
+        for i in REQUIRED_MEASURE_FILES:
+            if i not in files:
+                self.log.error("%s is missing %s", self.measurements_dir, i)
+                is_valid = False
+            # Present but empty counts as a failure unless the config
+            # explicitly skips the emptiness check.
+            elif not self.config.skip_empty_files_check and (
+                os.stat(os.path.join(self.measurements_dir, i)).st_size == 0
+            ):
+                self.log.error(
+                    "%s is having empty %s",
+                    self.measurements_dir,
+                    i)
+                is_valid = False
+        return is_valid
+
+ def required_fields_check(self):
+ """Validate presence and meaningfulness of required JSON fields.
+
+ If `skip_meaningful_fields_emptiness_check` is False in the
+ configuration, this will also fail when required fields are empty.
+
+ Returns:
+ bool: True if all required fields exist (and contain meaningful
+ values when configured), False otherwise.
+ """
+ is_valid = True
+ check_empty_fields = False if self.config.skip_meaningful_fields_emptiness_check else True
+ for k in SYSTEM_IMP_REQUIRED_FILES:
+ if k not in self.measurements_json:
+ is_valid = False
+ self.log.error("%s, field %s is missing", self.path, k)
+ elif check_empty_fields and not self.measurements_json[k]:
+ is_valid = False
+ self.log.error(
+ "%s, field %s is missing meaningful value", self.path, k)
+ return is_valid
diff --git a/tools/submission/submission_checker/checks/performance_check.py b/tools/submission/submission_checker/checks/performance_check.py
new file mode 100644
index 0000000000..4923862b71
--- /dev/null
+++ b/tools/submission/submission_checker/checks/performance_check.py
@@ -0,0 +1,535 @@
+from .base import BaseCheck
+from ..constants import *
+from ..loader import SubmissionLogs
+from ..configuration.configuration import Config
+import os
+
+
+class PerformanceCheck(BaseCheck):
+ """Validate performance-related submission artifacts and metrics.
+
+ The `PerformanceCheck` class performs a comprehensive set of validations
+ on submission performance outputs. It inspects the parsed MLPerf log,
+ system JSON, and configuration to ensure that performance runs meet
+ required constraints such as sample counts, latency limits, seed values,
+ minimum durations, and scenario-specific rules. It also handles result
+ inference for edge cases and validates network mode configurations.
+
+ Attributes:
+ submission_logs (SubmissionLogs): Holder for submission log paths
+ and parsed contents (performance logs, system JSON, loader data).
+ mlperf_log: Parsed MLPerf log object for inspecting run metadata and
+ results.
+ system_json (dict): Parsed system description JSON for hardware
+ validation.
+ config (Config): Configuration provider for targets, constraints,
+ and feature toggles.
+ """
+
+    def __init__(self, log, path, config: Config,
+                 submission_logs: SubmissionLogs):
+        """Initialize the performance checker.
+
+        Args:
+            log: Logger instance used to report messages.
+            path: Path to the submission being checked.
+            config (Config): Configuration provider for performance targets
+                and constraints.
+            submission_logs (SubmissionLogs): Parsed submission logs and
+                artifact paths (performance logs, system JSON, loader data).
+        """
+        super().__init__(log, path)
+        self.name = "performance checks"
+        self.submission_logs = submission_logs
+        self.mlperf_log = self.submission_logs.performance_log
+        self.system_json = self.submission_logs.system_json
+        self.config = config
+        # Map the loader's benchmark name to the canonical MLPerf model
+        # name (model_mapping handles submitter-specific aliases).
+        self.model = self.submission_logs.loader_data.get("benchmark", "")
+        self.model_mapping = self.submission_logs.loader_data.get(
+            "model_mapping", {})
+        self.model = self.config.get_mlperf_model(
+            self.model, self.model_mapping)
+        # scenario_fixed: the scenario the result is submitted under;
+        # scenario: the scenario the run actually executed (may differ
+        # when results are inferred across scenarios).
+        self.scenario_fixed = self.submission_logs.loader_data.get(
+            "scenario", "")
+        self.scenario = self.mlperf_log["effective_scenario"]
+        self.division = self.submission_logs.loader_data.get("division", "")
+        self.setup_checks()
+
+    def setup_checks(self):
+        """Register individual performance-related checks.
+
+        Adds the per-submission validation callables to `self.checks` in
+        the order they should be executed.
+        """
+        self.checks.append(self.missing_check)
+        self.checks.append(self.loadgen_errors_check)
+        self.checks.append(self.equal_issue_check)
+        self.checks.append(self.performance_sample_count_check)
+        self.checks.append(self.seeds_check)
+        self.checks.append(self.latency_check)
+        self.checks.append(self.min_query_count_check)
+        self.checks.append(self.min_duration_check)
+        self.checks.append(self.network_check)
+        self.checks.append(self.llm_check)
+        self.checks.append(self.inferred_check)
+        self.checks.append(self.get_performance_metric_check)
+
+    def missing_check(self):
+        """Ensure the performance log was provided.
+
+        Returns:
+            bool: True if `mlperf_log` is present, False otherwise.
+        """
+        if self.mlperf_log is None:
+            self.log.error("Performance log missing at %s", self.path)
+            return False
+        return True
+
+    def loadgen_errors_check(self):
+        """Detect Loadgen errors reported in the MLPerf log.
+
+        If errors are present and not ignored by configuration, logs the
+        error messages and returns False to indicate failure.
+
+        Returns:
+            bool: True if no blocking Loadgen errors are present,
+            False otherwise.
+        """
+        if self.mlperf_log.has_error():
+            if self.config.ignore_uncommited:
+                # "uncommitted changes" warnings are tolerated; anything
+                # else still counts as a blocking error.
+                has_other_errors = False
+                for error in self.mlperf_log.get_errors():
+                    if "Loadgen built with uncommitted changes!" not in error["value"]:
+                        has_other_errors = True
+            self.log.error("%s contains errors:", self.path)
+            for error in self.mlperf_log.get_errors():
+                self.log.error("%s", error["value"])
+
+            # NOTE: has_other_errors is only bound when ignore_uncommited
+            # is truthy; the short-circuit `or` below guarantees it is not
+            # evaluated otherwise, so no NameError can occur here.
+            if not self.config.ignore_uncommited or has_other_errors:
+                self.log.error(
+                    "%s has loadgen errors, number of errors: %s", self.path, self.mlperf_log.num_errors()
+                )
+                return False
+        return True
+
+ def equal_issue_check(self):
+ """Verify equal-issue mode is enabled for required models.
+
+ For models requiring equal-issue mode, checks that
+ `sample_concatenate_permutation` is True.
+
+ Returns:
+ bool: True if equal-issue mode is correctly set or not required,
+ False otherwise.
+ """
+ if self.config.requires_equal_issue(
+ self.model, self.division) and self.mlperf_log["effective_sample_concatenate_permutation"]:
+ self.log.error(
+ "%s requires equal issue mode (sample_concatenate_permutation), expected=true, found=false",
+ self.path)
+ return False
+ return True
+
+    def performance_sample_count_check(self):
+        """Ensure the performance run used sufficient samples.
+
+        Compares the effective performance sample count against the
+        configured minimum for the model.
+
+        Returns:
+            bool: True if the sample count meets or exceeds the requirement,
+            False otherwise.
+        """
+        required_performance_sample_count = self.config.get_performance_sample_count(
+            self.model)
+        performance_sample_count = self.mlperf_log["effective_performance_sample_count"]
+        if performance_sample_count < required_performance_sample_count:
+            self.log.error(
+                "%s performance_sample_count, found %d, needs to be >= %d",
+                self.path,
+                performance_sample_count,
+                required_performance_sample_count,
+            )
+            return False
+        return True
+
+    def seeds_check(self):
+        """Validate RNG seeds match the submission fixed values.
+
+        Checks that QSL, sample index, and schedule RNG seeds from the log
+        match the expected values from `config.seeds`.
+
+        Returns:
+            bool: True if all seeds match, False if any mismatch.
+        """
+        config_seeds = self.config.seeds
+        qsl_rng_seed = self.mlperf_log["effective_qsl_rng_seed"]
+        sample_index_rng_seed = self.mlperf_log["effective_sample_index_rng_seed"]
+        schedule_rng_seed = self.mlperf_log["effective_schedule_rng_seed"]
+        # All three seeds are checked (no early return) so every mismatch
+        # is reported in a single pass.
+        is_valid = True
+        if qsl_rng_seed != config_seeds["qsl_rng_seed"]:
+            self.log.error(
+                "%s qsl_rng_seed is wrong, expected=%s, found=%s",
+                self.path,
+                config_seeds["qsl_rng_seed"],
+                qsl_rng_seed,
+            )
+            is_valid = False
+        if sample_index_rng_seed != config_seeds["sample_index_rng_seed"]:
+            self.log.error(
+                "%s sample_index_rng_seed is wrong, expected=%s, found=%s",
+                self.path,
+                config_seeds["sample_index_rng_seed"],
+                sample_index_rng_seed,
+            )
+            is_valid = False
+        if schedule_rng_seed != config_seeds["schedule_rng_seed"]:
+            self.log.error(
+                "%s schedule_rng_seed is wrong, expected=%s, found=%s",
+                self.path,
+                config_seeds["schedule_rng_seed"],
+                schedule_rng_seed,
+            )
+            is_valid = False
+        return is_valid
+
+    def latency_check(self):
+        """Enforce latency constraints based on scenario and early stopping.
+
+        For scenarios using early stopping, verifies the condition was met
+        and target latency constraints. For others, checks 99th percentile
+        latency against configured limits.
+
+        Returns:
+            bool: True if latency constraints are satisfied, False otherwise.
+        """
+        uses_early_stopping = self.config.uses_early_stopping(self.scenario)
+        if uses_early_stopping:
+            # check if early_stopping condition was met
+            if not self.mlperf_log["early_stopping_met"]:
+                early_stopping_result = self.mlperf_log["early_stopping_result"]
+                self.log.error(
+                    "Early stopping condition was not met, msg=%s",
+                    early_stopping_result,
+                )
+                return False
+            # If the scenario has a target latency (Server scenario), check
+            # that the target latency that was passed to the early stopping
+            # is less than the target latency.
+            target_latency = self.config.latency_constraint.get(
+                self.model, dict()).get(self.scenario)
+            if target_latency:
+                early_stopping_latency_ns = self.mlperf_log["effective_target_latency_ns"]
+                self.log.info(
+                    "Target latency: %s, Early Stopping Latency: %s, Scenario: %s",
+                    target_latency,
+                    early_stopping_latency_ns,
+                    self.scenario,
+                )
+                if early_stopping_latency_ns > target_latency:
+                    self.log.error(
+                        "%s Latency constraint with early stopping not met, expected=%s, found=%s",
+                        self.path,
+                        target_latency,
+                        early_stopping_latency_ns,
+                    )
+                    return False
+        else:
+            # check if the benchmark meets latency constraint
+            latency_99_percentile = self.mlperf_log["result_99.00_percentile_latency_ns"]
+            target_latency = self.config.latency_constraint.get(
+                self.model, dict()).get(self.scenario)
+            self.log.info(
+                "Target latency: %s, Latency: %s, Scenario: %s",
+                target_latency,
+                latency_99_percentile,
+                self.scenario,
+            )
+            # Models/scenarios without a configured constraint pass
+            # unconditionally (target_latency is None).
+            if target_latency:
+                if latency_99_percentile > target_latency:
+                    self.log.error(
+                        "%s Latency constraint not met, expected=%s, found=%s",
+                        self.path,
+                        target_latency,
+                        latency_99_percentile,
+                    )
+                    return False
+        return True
+
+    def min_query_count_check(self):
+        """Verify minimum query counts and samples per query are met.
+
+        Checks minimum query count for non-early-stopping scenarios and
+        enforces minimum samples per query for Offline scenarios in closed
+        division.
+
+        Returns:
+            bool: True if all minimum requirements are satisfied,
+            False otherwise.
+        """
+        uses_early_stopping = self.config.uses_early_stopping(self.scenario)
+        min_query_count = self.mlperf_log["effective_min_query_count"]
+        samples_per_query = self.mlperf_log["effective_samples_per_query"]
+        if not uses_early_stopping:
+            required_min_query_count = self.config.get_min_query_count(
+                self.model, self.scenario)
+            if required_min_query_count and min_query_count < required_min_query_count:
+                self.log.error(
+                    "%s Required minimum Query Count not met by user config, Expected=%s, Found=%s",
+                    self.path,
+                    required_min_query_count,
+                    min_query_count,
+                )
+                return False
+        # NOTE(review): OFFLINE_MIN_SPQ_SINCE_V4[self.model] raises KeyError
+        # for a model not present in the table — TODO confirm the table
+        # covers every closed-division model, or guard with .get().
+        if self.scenario.lower() == "offline" and (
+                samples_per_query < OFFLINE_MIN_SPQ_SINCE_V4[self.model]) and self.division.lower() == "closed":
+            self.log.error(
+                "%s Required minimum samples per query not met by user config, Expected=%s, Found=%s",
+                self.path,
+                OFFLINE_MIN_SPQ_SINCE_V4[self.model],
+                samples_per_query,
+            )
+            return False
+        return True
+
+    def min_duration_check(self):
+        """Ensure the test duration meets the minimum requirement.
+
+        Verifies that the effective minimum duration is at least
+        `TEST_DURATION_MS` (600 seconds).
+
+        Returns:
+            bool: True if duration meets the minimum, False otherwise.
+        """
+        required_min_duration = TEST_DURATION_MS
+        min_duration = self.mlperf_log["effective_min_duration_ms"]
+        if min_duration < required_min_duration:
+            self.log.error(
+                "%s Test duration less than 600s in user config. expected=%s, found=%s",
+                self.path,
+                required_min_duration,
+                min_duration,
+            )
+            return False
+        return True
+
+ def network_check(self):
+ """Validate network mode settings and SUT naming.
+
+ Ensures the system JSON indicates the correct network mode for the
+ division and that SUT names comply with network mode requirements.
+
+ Returns:
+ bool: True if network mode and naming are valid, False otherwise.
+ """
+ if self.system_json is None:
+ self.log.error(
+ "%s system json file not found",
+ self.path
+ )
+ return False
+ is_network_mode_sys_spec_str = self.system_json.get(
+ SYSTEM_DESC_IS_NETWORK_MODE)
+ is_network_system = (
+ is_network_mode_sys_spec_str.lower() == "true"
+ if is_network_mode_sys_spec_str is not None
+ else False
+ )
+ # verify that the system corresponds the division
+ is_valid = True
+ expected_state_by_division = {"network": True, "closed": False}
+ if self.division in expected_state_by_division:
+ is_valid = expected_state_by_division[self.division] is is_network_system
+ if not is_valid:
+ self.log.error(
+ f"{self.path} incorrect network mode(={is_network_system}) "
+ f"for division {self.division}"
+ )
+ return False
+
+ sut_name = self.mlperf_log["sut_name"]
+ if is_network_system:
+ # for network mode verify the SUT name is valid, according to the rules
+ # (must include "Network SUT" in name)
+ if NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME not in sut_name:
+ self.log.error(
+ f"{self.path} invalid sut name for network mode. expecting the substring '{NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}' got '{sut_name}'"
+ )
+ return False
+
+ return True
+
+    def llm_check(self):
+        """Perform LLM-specific latency validations for token latencies.
+
+        For LLM models, ensures token latencies are enabled and that TTFT
+        and TPOT metrics meet configured limits for applicable scenarios.
+
+        Returns:
+            bool: True if LLM checks pass or model is not an LLM,
+            False otherwise.
+        """
+        if self.model in self.config.get_llm_models():
+            if self.mlperf_log["requested_use_token_latencies"]:
+                if self.scenario not in ["Server", "Interactive"]:
+                    # For offline, singlestream and multistream no further checks are
+                    # necessary
+                    return True
+                else:
+                    limits = LLM_LATENCY_LIMITS[self.model][self.scenario]
+                    if (
+                        self.mlperf_log["result_first_token_99.00_percentile_latency_ns"]
+                        < limits["ttft"]
+                        and self.mlperf_log["result_time_per_output_token_99.00_percentile_ns"]
+                        < limits["tpot"]
+                    ):
+                        return True
+            else:
+                self.log.error(
+                    f"use_token_latencies flag needs to be enabled for Llama2 benchmark")
+                return False
+
+            # Only reachable when token latencies were requested, the
+            # scenario is Server/Interactive, and the TTFT/TPOT limits above
+            # were violated — so `limits` is always bound here.
+            self.log.error(
+                'Failed extra check for TTFT and TPOT. Obtained: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f. Required: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f',
+                self.mlperf_log["result_first_token_99.00_percentile_latency_ns"],
+                self.mlperf_log["result_time_per_output_token_99.00_percentile_ns"],
+                limits["ttft"],
+                limits["tpot"]
+            )
+            return False
+        return True
+
+    def inferred_check(self):
+        """Validate rules for inferring results across scenarios.
+
+        Ensures that result inference is only allowed for edge systems and
+        specific scenario pairs, preventing invalid cross-scenario reuse.
+
+        Returns:
+            bool: True if inference is valid or not attempted, False otherwise.
+        """
+        # (server -> interactive) is treated as the same scenario, not an
+        # inference. NOTE(review): assumes self.system_json is not None here
+        # (network_check validates that) — TODO confirm checks short-circuit.
+        if self.scenario.lower() != self.scenario_fixed.lower() and (
+                self.scenario.lower(), self.scenario_fixed.lower()) != ("server", "interactive"):
+            if "edge" not in self.system_json["system_type"].lower():
+                self.log.error(
+                    "Result can not be inferred for %s suite for: %s. Scenario: %s, Scenario fixed: %s",
+                    self.system_json["system_type"],
+                    self.path,
+                    self.scenario,
+                    self.scenario_fixed)
+                return False
+            # Allowed (run scenario, submitted scenario) inference pairs.
+            list_inferred = [
+                ("singlestream", "multistream"),
+                ("multistream", "offline"),
+                ("singlestream", "offline")
+            ]
+            if (self.scenario.lower(), self.scenario_fixed.lower()
+                ) not in list_inferred:
+                self.log.error(
+                    "Result for scenario %s can not be inferred from %s for: %s",
+                    self.scenario_fixed,
+                    self.scenario,
+                    self.path)
+                return False
+        return True
+
+ def get_performance_metric_check(self):
+ """Extract and validate the primary performance metric.
+
+ Parses the performance result from the log, applies any benchmark-
+ specific overwrites, and handles inferred results. Records the
+ metric in `submission_logs.loader_data`.
+
+ Returns:
+ bool: True if the metric is valid, False otherwise.
+ """
+ # Assumes new logging format
+ is_valid = True
+ version = self.config.version
+ if (
+ "result_validity" in self.mlperf_log.get_keys()
+ and self.mlperf_log["result_validity"] == "VALID"
+ ):
+ is_valid = True
+ scenario = self.mlperf_log["effective_scenario"]
+
+ res = float(self.mlperf_log[RESULT_FIELD_NEW[version][scenario]])
+ if (
+ version in RESULT_FIELD_BENCHMARK_OVERWRITE
+ and self.model in RESULT_FIELD_BENCHMARK_OVERWRITE[version]
+ and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[version][self.model]
+ ):
+ res = float(
+ self.mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[version]
+ [self.model][scenario]]
+ )
+
+ inferred = False
+ if self.scenario.lower() != self.scenario_fixed.lower() and (
+ self.scenario.lower(), self.scenario_fixed.lower()) != ("server", "interactive"):
+ res, is_valid = self.get_inferred_result(res)
+ self.submission_logs.loader_data["performance_metric"] = res
+ return is_valid
+
+ def get_inferred_result(self, res):
+ """Compute inferred performance result for cross-scenario reuse.
+
+ Calculates the performance metric for the fixed scenario based on
+ the run scenario's results, applying scenario-specific formulas.
+
+ Args:
+ res (float): The raw performance result from the log.
+
+ Returns:
+ tuple: (inferred_result, is_valid) where is_valid indicates if
+ inference was successful.
+ """
+ inferred = False
+ is_valid = True
+ # Check if current scenario (and version) uses early stopping
+ uses_early_stopping = self.config.uses_early_stopping(self.scenario)
+
+ latency_mean = self.mlperf_log["result_mean_latency_ns"]
+ if self.scenario in ["MultiStream"]:
+ latency_99_percentile = self.mlperf_log[
+ "result_99.00_percentile_per_query_latency_ns"
+ ]
+ latency_mean = self.mlperf_log["result_mean_query_latency_ns"]
+ samples_per_query = self.mlperf_log["effective_samples_per_query"]
+ if self.scenario == "SingleStream":
+ # qps_wo_loadgen_overhead is only used for inferring Offline from
+ # SingleStream; only for old submissions
+ qps_wo_loadgen_overhead = self.mlperf_log["result_qps_without_loadgen_overhead"]
+
+ # special case for results inferred from different scenario
+ if self.scenario_fixed in [
+ "Offline"] and self.scenario in ["SingleStream"]:
+ inferred = True
+ res = qps_wo_loadgen_overhead
+
+ if (self.scenario_fixed in ["Offline"]
+ ) and self.scenario in ["MultiStream"]:
+ inferred = True
+ res = samples_per_query * S_TO_MS / (latency_mean / MS_TO_NS)
+
+ if (self.scenario_fixed in ["MultiStream"]
+ ) and self.scenario in ["SingleStream"]:
+ inferred = True
+ # samples_per_query does not match with the one reported in the logs
+ # when inferring MultiStream from SingleStream
+ samples_per_query = 8
+ if uses_early_stopping:
+ early_stopping_latency_ms = self.mlperf_log["early_stopping_latency_ms"]
+ if early_stopping_latency_ms == 0:
+ self.log.error(
+ "Not enough samples were processed for early stopping to make an estimate"
+ )
+ is_valid = False
+ res = (early_stopping_latency_ms *
+ samples_per_query) / MS_TO_NS
+ else:
+ res = (latency_99_percentile * samples_per_query) / MS_TO_NS
+ if (self.scenario_fixed in ["Interactive"]
+ ) and self.scenario not in ["Server"]:
+ is_valid = False
+ return res, is_valid
diff --git a/tools/submission/submission_checker/checks/power/power_checker.py b/tools/submission/submission_checker/checks/power/power_checker.py
new file mode 100755
index 0000000000..bf6835133b
--- /dev/null
+++ b/tools/submission/submission_checker/checks/power/power_checker.py
@@ -0,0 +1,852 @@
+#!/usr/bin/env python3
+# Copyright 2018 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+from collections import OrderedDict
+from datetime import datetime, timezone, timedelta
+from typing import Dict, List, Tuple, Any, Optional, Callable
+import argparse
+import hashlib
+import json
+import os
+import re
+import traceback
+import uuid
+import logging
+
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("main")
+
+
+class LineWithoutTimeStamp(Exception):
+    """Raised when a scanned log line has no parsable timestamp."""
+    pass
+
+
+class CheckerWarning(Exception):
+    """Raised/collected for non-fatal power-checker findings."""
+    pass
+
+
+# PTDaemon versions this checker knows how to validate.
+SUPPORTED_VERSION = ["1.11.1"]
+# Supported power meters mapped to their PTDaemon device-type numbers.
+SUPPORTED_MODEL = {
+    "YokogawaWT210": 8,
+    "YokogawaWT500": 35,
+    "YokogawaWT500_multichannel": 48,
+    "YokogawaWT310": 49,
+    "YokogawaWT310E": 49,
+    "YokogawaWT330": 52,
+    "YokogawaWT330E": 52,
+    "YokogawaWT330_multichannel": 77,
+    "YokogawaWT210_DC": 508,
+    "YokogawaWT310_DC": 549,
+    "YokogawaWT330_DC": 586,
+}
+
+# Subdirectory names for the two power measurement phases.
+RANGING_MODE = "ranging"
+TESTING_MODE = "run_1"
+
+# Files every power submission directory is expected to contain.
+RESULT_PATHS = [
+    "power/client.json",
+    "power/client.log",
+    "power/ptd_logs.txt",
+    "power/server.json",
+    "power/server.log",
+    RANGING_MODE + "/mlperf_log_detail.txt",
+    RANGING_MODE + "/mlperf_log_summary.txt",
+    RANGING_MODE + "/spl.txt",
+    TESTING_MODE + "/mlperf_log_detail.txt",
+    TESTING_MODE + "/mlperf_log_summary.txt",
+    TESTING_MODE + "/spl.txt",
+]
+
+# PTD log messages tolerated during the ranging phase only.
+COMMON_ERROR_RANGING = [
+    "Can't evaluate uncertainty of this sample!",
+    "Bad watts reading nan from ",
+    "Bad amps reading nan from ",
+    "Bad pf reading nan from ",
+    "Bad volts reading nan from ",
+    "Current appears to be too high for set range",
+]
+COMMON_ERROR_TESTING = ["USB."]
+# Warnings that must be escalated to errors during the testing phase.
+WARNING_NEEDS_TO_BE_ERROR_TESTING_RE = [
+    re.compile(
+        r"Uncertainty \d+.\d+%, which is above 1.00% limit for the last sample!")
+]
+
+TIME_DELTA_TOLERANCE = 800  # in milliseconds
+
+
+def _normalize(path: str) -> str:
+    """Normalize a filesystem path to a forward-slash joined string.
+
+    Splits the path into components with os.path.split and rejoins them
+    with "/", so hashes keyed by path are comparable across platforms.
+    """
+    allparts: List[str] = []
+    while 1:
+        parts = os.path.split(path)
+        if parts[0] == path:  # sentinel for absolute paths
+            allparts.insert(0, parts[0])
+            break
+        if parts[1] == path:  # sentinel for relative paths
+            allparts.insert(0, parts[1])
+            break
+        path = parts[0]
+        allparts.insert(0, parts[1])
+    return "/".join(allparts)
+
+
+def _sort_dict(x: Dict[str, Any]) -> "OrderedDict[str, Any]":
+    """Return an OrderedDict of `x` with keys in sorted order."""
+    return OrderedDict(sorted(x.items()))
+
+
+def hash_dir(dirname: str) -> Dict[str, str]:
+    """Hash every file under `dirname` recursively.
+
+    Returns:
+        An OrderedDict mapping normalized relative paths to the SHA-1
+        hex digest of each file's contents, sorted by path.
+    """
+    result: Dict[str, str] = {}
+
+    for path, dirs, files in os.walk(dirname, topdown=True):
+        relpath = os.path.relpath(path, dirname)
+        if relpath == ".":
+            # Files directly in dirname get no directory prefix.
+            relpath = ""
+        for file in files:
+            fname = os.path.join(relpath, file)
+            with open(os.path.join(path, file), "rb") as f:
+                result[_normalize(fname)] = hashlib.sha1(f.read()).hexdigest()
+
+    return _sort_dict(result)
+
+
+def get_time_from_line(
+    line: str, data_regexp: str, file: str, timezone_offset: int
+) -> float:
+    """Extract a timestamp from a log line.
+
+    Args:
+        line: The log line to scan.
+        data_regexp: Regex whose full match is a "%m-%d-%Y %H:%M:%S.%f"
+            datetime string.
+        file: Source file name, used only in the exception message.
+        timezone_offset: Seconds added to the UTC epoch timestamp.
+
+    Returns:
+        The matched time as UTC epoch seconds plus `timezone_offset`.
+
+    Raises:
+        LineWithoutTimeStamp: If the line contains no matching timestamp.
+    """
+    log_time_str = re.search(data_regexp, line)
+    if log_time_str and log_time_str.group(0):
+        log_datetime = datetime.strptime(
+            log_time_str.group(0), "%m-%d-%Y %H:%M:%S.%f")
+        return log_datetime.replace(
+            tzinfo=timezone.utc).timestamp() + timezone_offset
+    raise LineWithoutTimeStamp(f"{line.strip()!r} in {file}.")
+
+
+class SessionDescriptor:
+    """Parsed client.json / server.json power session descriptor.
+
+    Loads the JSON file at `path` and immediately validates that all
+    required top-level fields are present.
+    """
+
+    def __init__(self, path: str):
+        self.path = path
+        with open(path, "r") as f:
+            self.json_object: Dict[str, Any] = json.loads(f.read())
+        self.required_fields_check()
+
+    def required_fields_check(self) -> None:
+        """Assert that every required top-level key exists in the JSON."""
+        required_fields = [
+            "version",
+            "timezone",
+            "modules",
+            "sources",
+            "messages",
+            "uuid",
+            "session_name",
+            "results",
+            "phases",
+        ]
+        absent_keys = set(required_fields) - self.json_object.keys()
+        assert (
+            len(absent_keys) == 0
+        ), f"Required fields {', '.join(absent_keys)!r} does not exist in {self.path!r}"
+
+
+def compare_dicts_values(
+        d1: Dict[str, str], d2: Dict[str, str], comment: str) -> None:
+    """Assert that no key common to `d1` and `d2` maps to differing values.
+
+    `comment` prefixes the assertion message; one line is appended per
+    mismatching key.
+    """
+    files_with_diff_check_sum = {k: d1[k]
+                                 for k in d1 if k in d2 and d1[k] != d2[k]}
+    assert len(files_with_diff_check_sum) == 0, f"{comment}" + "".join(
+        [
+            f"Expected {d1[i]}, but got {d2[i]} for {i}\n"
+            for i in files_with_diff_check_sum
+        ]
+    )
+
+
+def compare_dicts(s1: Dict[str, str],
+                  s2: Dict[str, str], comment: str) -> None:
+    """Assert `s1` and `s2` have identical key sets and identical values.
+
+    Reports missing keys (in s1 but not s2), extra keys (in s2 but not
+    s1), then delegates value comparison to compare_dicts_values.
+    """
+    assert (
+        not s1.keys() - s2.keys()
+    ), f"{comment} Missing {', '.join(sorted(s1.keys() - s2.keys()))!r}"
+    assert (
+        not s2.keys() - s1.keys()
+    ), f"{comment} Extra {', '.join(sorted(s2.keys() - s1.keys()))!r}"
+
+    compare_dicts_values(s1, s2, comment)
+
+
+def sources_check(sd: SessionDescriptor) -> None:
+    """Compare the current checksum of the code from client.json or server.json
+    against the standard checksum of the source code from sources_checksums.json.
+    """
+    s = sd.json_object["sources"]
+
+    # sources_checksums.json is expected to sit next to this script.
+    with open(os.path.join(os.path.dirname(__file__), "sources_checksums.json")) as f:
+        sources_samples = json.load(f)
+
+    assert s in sources_samples, f"{s} do not exist in 'sources_checksums.json'"
+
+
+def ptd_messages_check(sd: SessionDescriptor) -> None:
+    """Performs multiple checks:
+    - Check the ptd version number.
+    - Check the device model.
+    - Compare message replies with expected values.
+    - Check that initial values set after the test is completed.
+    """
+    # NOTE(review): reads "ptd_messages" while required_fields_check only
+    # requires "messages" — TODO confirm the key name against real
+    # server.json files.
+    msgs: List[Dict[str, str]] = sd.json_object["ptd_messages"]
+
+    def get_ptd_answer(command: str) -> str:
+        # Reply to the first message whose cmd equals `command`, else "".
+        for msg in msgs:
+            if msg["cmd"] == command:
+                return msg["reply"]
+        return ""
+
+    identify_answer = get_ptd_answer("Identify")
+    assert (
+        len(identify_answer) != 0
+    ), "There is no answer to the 'Identify' command for PTD."
+    power_meter_model = identify_answer.split(",")[0]
+    groups = re.search(r"(?<=version=)(.+?)-", identify_answer)
+    version = "" if groups is None else groups.group(1)
+
+    assert (
+        version in SUPPORTED_VERSION
+    ), f"PTD version {version!r} is not supported. Supported versions are {SUPPORTED_VERSION}"
+    assert (
+        power_meter_model in SUPPORTED_MODEL.keys()
+    ), f"Power meter {power_meter_model!r} is not supported. Only {', '.join(SUPPORTED_MODEL.keys())} are supported."
+
+    def check_reply(cmd: str, reply: str) -> None:
+        # Assert every message starting with `cmd` got the expected reply.
+        stop_counter = 0
+        for msg in msgs:
+            if msg["cmd"].startswith(cmd):
+                if msg["cmd"] == "Stop":
+                    # In normal flow the third answer to stop command is
+                    # `Error: no measurement to stop`
+                    if stop_counter == 2:
+                        reply = "Error: no measurement to stop"
+                    stop_counter += 1
+                assert (
+                    reply == msg["reply"]
+                ), f"Wrong reply for {msg['cmd']!r} command. Expected {reply!r}, but got {msg['reply']!r}"
+
+    check_reply("SR,A", "Range A changed")
+    check_reply("SR,V", "Range V changed")
+    check_reply(
+        "Go,1000,",
+        "Starting untimed measurement, maximum 500000 samples at 1000ms with 0 rampup samples",
+    )
+    check_reply("Stop", "Stopping untimed measurement")
+
+    def get_initial_range(param_num: int, reply: str) -> str:
+        # "0" followed by a positive number means a fixed range; anything
+        # else is treated as auto-ranging.
+        # NOTE(review): `assert False` is stripped under `python -O`.
+        reply_list = reply.split(",")
+        try:
+            if reply_list[param_num] == "0" and float(
+                    reply_list[param_num + 1]) > 0:
+                return reply_list[param_num + 1]
+        except (ValueError, IndexError):
+            assert False, f"Can not get power meters initial values from {reply!r}"
+        return "Auto"
+
+    def get_command_by_value_and_number(
+            cmd: str, number: int) -> Optional[str]:
+        # Return the `number`-th command starting with `cmd` (1-based).
+        command_counter = 0
+        for msg in msgs:
+            if msg["cmd"].startswith(cmd):
+                command_counter += 1
+                if command_counter == number:
+                    return msg["cmd"]
+        assert False, f"Can not find the {number} command starting with {cmd!r}."
+        return None
+
+    initial_amps = get_initial_range(1, msgs[2]["reply"])
+    initial_volts = get_initial_range(3, msgs[2]["reply"])
+
+    # sometimes the SR,A,Auto comes from 3rd or 4th response
+    initial_amps_command_3 = get_command_by_value_and_number("SR,A", 3)
+    initial_amps_command_4 = get_command_by_value_and_number("SR,A", 4)
+    initial_volts_command = get_command_by_value_and_number("SR,V", 3)
+    assert (initial_amps_command_3 == f"SR,A,{initial_amps}") or (
+        initial_amps_command_4 == f"SR,A,{initial_amps}"
+    ), f"Do not set Amps range as initial. Expected 'SR,A,{initial_amps}', got {initial_amps_command_3!r} and {initial_amps_command_4!r}."
+    assert (
+        initial_volts_command == f"SR,V,{initial_volts}"
+    ), f"Do not set Volts range as initial. Expected 'SR,V,{initial_volts}', got {initial_volts_command!r}."
+
+
def uuid_check(client_sd: SessionDescriptor,
               server_sd: SessionDescriptor) -> None:
    """Compare UUIDs from client.json and server.json. They should be the same."""
    client_uuids = client_sd.json_object["uuid"]
    server_uuids = server_sd.json_object["uuid"]

    # Both sides record a client UUID and a server UUID; each pair must match.
    for side in ("client", "server"):
        assert uuid.UUID(client_uuids[side]) == uuid.UUID(
            server_uuids[side]
        ), f"'{side} uuid' is not equal."
+
+
def _get_begin_end_time_from_mlperf_log_detail(
    path: str, client_sd: SessionDescriptor
) -> Tuple[float, float]:
    """Extract the POWER_BEGIN/POWER_END timestamps from mlperf_log_detail.txt.

    Scans `path`/mlperf_log_detail.txt for the first lines mentioning
    power_begin and power_end (case-insensitive) and parses their
    "MM-DD-YYYY HH:MM:SS.ffffff"-style timestamps with a zero offset.

    Note: `client_sd` is accepted for signature symmetry but not used here.
    """
    timestamp_re = r"(\d*-\d*-\d* \d*:\d*:\d*\.\d*)"
    log_file = os.path.join(path, "mlperf_log_detail.txt")

    begin = end = None
    with open(log_file) as f:
        for line in f:
            lowered = line.lower()
            if "power_begin" in lowered:
                begin = get_time_from_line(line, timestamp_re, log_file, 0)
            elif "power_end" in lowered:
                end = get_time_from_line(line, timestamp_re, log_file, 0)
            # Stop scanning once both markers have been found.
            if begin and end:
                break

    assert begin is not None, f"Can not get power_begin time from {log_file!r}"
    assert end is not None, f"Can not get power_end time from {log_file!r}"

    return begin, end
+
+
def phases_check(
    client_sd: SessionDescriptor, server_sd: SessionDescriptor, path: str
) -> None:
    """Check that the time difference between corresponding checkpoint values
    from client.json and server.json is less than or equal to TIME_DELTA_TOLERANCE ms.
    Check that the loadgen timestamps are within workload time interval.
    Check that the duration of loadgen test for the ranging mode is comparable
    with duration of loadgen test for the testing mode.
    """
    phases_ranging_c = client_sd.json_object["phases"]["ranging"]
    phases_testing_c = client_sd.json_object["phases"]["testing"]
    phases_ranging_s = server_sd.json_object["phases"]["ranging"]
    phases_testing_s = server_sd.json_object["phases"]["testing"]

    def compare_time(
        phases_client: List[List[float]], phases_server: List[List[float]], mode: str
    ) -> None:
        # Each phase entry's first element is an epoch timestamp in seconds;
        # client and server must agree within TIME_DELTA_TOLERANCE ms.
        assert len(phases_client) == len(
            phases_server
        ), f"Phases amount is not equal for {mode} mode."
        for i in range(len(phases_client)):
            time_difference = abs(phases_client[i][0] - phases_server[i][0])
            # TIME_DELTA_TOLERANCE is in ms, timestamps in seconds — hence /1000.
            assert time_difference <= TIME_DELTA_TOLERANCE / 1000, (
                f"The time difference for {i + 1} phase of {mode} mode is more than {TIME_DELTA_TOLERANCE}ms."
                f"Observed difference is {time_difference * 1000}ms"
            )

    compare_time(phases_ranging_c, phases_ranging_s, RANGING_MODE)
    compare_time(phases_testing_c, phases_testing_s, TESTING_MODE)

    def compare_duration(range_duration: float, test_duration: float) -> None:
        # Relative shortfall of the testing run compared to the ranging run.
        duration_diff = (range_duration - test_duration) / range_duration

        # NOTE(review): the threshold here is 0.5 (50%) while the warning text
        # claims a "5 percent limit" — confirm which limit is intended.
        if duration_diff > 0.5:
            raise CheckerWarning(
                f"Duration of the testing mode ({round(test_duration,2)}) is lower than that of "
                f"ranging mode ({round(range_duration,2)}) by {round(duration_diff*100,2)} "
                f"percent which is more than the expected 5 percent limit."
            )

    def compare_time_boundaries(
        begin: float, end: float, phases: List[Any], mode: str
    ) -> None:
        # Loadgen begin/end must fall between phase checkpoints 1 and 2; the
        # `- 3600` variants accept a one-hour DST offset.
        # TODO: temporary workaround, remove when proper DST handling is
        # implemented!
        assert (
            phases[1][0] < begin < phases[2][0]
            or phases[1][0] < begin - 3600 < phases[2][0]
        ), f"Loadgen test begin time is not within {mode} mode time interval."
        assert (
            phases[1][0] < end < phases[2][0]
            or phases[1][0] < end - 3600 < phases[2][0]
        ), f"Loadgen test end time is not within {mode} mode time interval."

    system_begin_r, system_end_r = _get_begin_end_time_from_mlperf_log_detail(
        os.path.join(path, "ranging"), client_sd
    )

    system_begin_t, system_end_t = _get_begin_end_time_from_mlperf_log_detail(
        os.path.join(path, "run_1"), client_sd
    )

    compare_time_boundaries(
        system_begin_r,
        system_end_r,
        phases_ranging_c,
        "ranging")
    compare_time_boundaries(
        system_begin_t,
        system_end_t,
        phases_testing_c,
        "testing")

    ranging_duration_d = system_end_r - system_begin_r
    testing_duration_d = system_end_t - system_begin_t

    compare_duration(ranging_duration_d, testing_duration_d)

    def get_avg_power(power_path: str, run_path: str) -> Tuple[float, float]:
        """Average power and power factor over the run's POWER_BEGIN..POWER_END
        window, read from spl.txt. Returns -1.0 for a component with no
        samples in the window.

        NOTE(review): `power_path` is currently unused.
        """
        # parse the power logs

        power_begin, power_end = _get_begin_end_time_from_mlperf_log_detail(
            os.path.join(path, os.path.basename(run_path)), client_sd
        )

        # convert to UTC
        power_begin = datetime.fromtimestamp(power_begin, tz=timezone.utc)
        power_end = datetime.fromtimestamp(power_end, tz=timezone.utc)

        # NOTE(review): `detail_log_fname` is computed but never used.
        detail_log_fname = os.path.join(run_path, "mlperf_log_detail.txt")
        datetime_format = "%m-%d-%Y %H:%M:%S.%f"

        spl_fname = os.path.join(run_path, "spl.txt")
        power_list = []
        pf_list = []

        with open(spl_fname) as f:
            for line in f:
                # Sample lines start with "Time"; everything else is skipped.
                if not line.startswith("Time"):
                    continue
                timestamp = (
                    datetime.strptime(line.split(",")[1], datetime_format)
                ).replace(tzinfo=timezone.utc)
                if timestamp > power_begin and timestamp < power_end:
                    # Column 3 is presumably watts and column 9 the power
                    # factor — TODO confirm against the PTD spl.txt format.
                    cpower = float(line.split(",")[3])
                    cpf = float(line.split(",")[9])
                    # Non-positive samples are dropped as invalid readings.
                    if cpower > 0:
                        power_list.append(cpower)
                    if cpf > 0:
                        pf_list.append(cpf)

        if len(power_list) == 0:
            power = -1.0
        else:
            power = sum(power_list) / len(power_list)
        if len(pf_list) == 0:
            pf = -1.0
        else:
            pf = sum(pf_list) / len(pf_list)
        return power, pf

    ranging_watts, ranging_pf = get_avg_power(
        os.path.join(path, "power"), os.path.join(path, "ranging")
    )
    testing_watts, testing_pf = get_avg_power(
        os.path.join(path, "power"), os.path.join(path, "run_1")
    )
    ranging_watts = round(ranging_watts, 5)
    testing_watts = round(testing_watts, 5)
    ranging_pf = round(ranging_pf, 5)
    testing_pf = round(testing_pf, 5)

    # Percentage change of testing power relative to ranging power.
    delta = round((float(testing_watts) / float(ranging_watts) - 1) * 100, 2)

    # Testing power may not undershoot the ranging power by more than 5%.
    assert delta > -5, (
        f"Average power during the testing mode run is lower than that during the ranging run by more than 5%. "
        f"Observed delta is {delta}% "
        f"with avg. ranging power {ranging_watts}, avg.testing power {testing_watts}, "
        f"avg. ranging power factor {ranging_pf} and avg. testing power factor {testing_pf}"
    )
    # print(f"{path},{ranging_watts},{testing_watts},{delta}%,{ranging_pf},{testing_pf}\n")
+
+
def session_name_check(
    client_sd: SessionDescriptor, server_sd: SessionDescriptor
) -> None:
    """Check that session names from client.json and server.json are equal."""
    name_c = client_sd.json_object["session_name"]
    name_s = server_sd.json_object["session_name"]
    assert name_c == name_s, (
        f"Session name is not equal. Client session name is {name_c!r}. "
        f"Server session name is {name_s!r}"
    )
+
+
def messages_check(client_sd: SessionDescriptor,
                   server_sd: SessionDescriptor) -> None:
    """Compare client and server messages list length.
    Compare messages values and replies from client.json and server.json.
    Compare client and server version.

    Raises:
        AssertionError: on any length, command, reply or version mismatch.
    """
    mc = client_sd.json_object["messages"]
    ms = server_sd.json_object["messages"]

    assert len(mc) == len(
        ms
    ), f"Client commands list length ({len(mc)}) should be equal to server commands list length ({len(ms)}). "

    # Check that server.json contains all client.json messages and replies.
    for i in range(len(mc)):
        assert (
            mc[i]["cmd"] == ms[i]["cmd"]
        ), f"Commands {i} are different. Server command is {ms[i]['cmd']!r}. Client command is {mc[i]['cmd']!r}."
        # Replies to the "time" command legitimately differ (clock readings),
        # so they are excluded from the comparison.
        if "time" != mc[i]["cmd"]:
            # Fixed: the message previously labeled the client *reply* as
            # "Client command".
            assert mc[i]["reply"] == ms[i]["reply"], (
                f"Replies on command {mc[i]['cmd']!r} are different. "
                f"Server reply is {ms[i]['reply']!r}. Client reply is {mc[i]['reply']!r}."
            )

    # Check client and server version from server.json.
    # Server.json contains all client.json messages and replies. Checked
    # earlier.
    def get_version(regexp: str, line: str) -> str:
        """Return the version number captured by `regexp` from `line`."""
        version_o = re.search(regexp, line)
        # Fixed: the old message always said "Server version" even when the
        # client version string failed to parse.
        assert version_o is not None, f"Version is not defined in:'{line}'"
        return version_o.group(1)

    client_version = get_version(
        r"mlcommons\/power client v(\d+)$",
        ms[0]["cmd"])
    server_version = get_version(
        r"mlcommons\/power server v(\d+)$",
        ms[0]["reply"])

    assert (
        client_version == server_version
    ), f"Client.py version ({client_version}) is not equal server.py version ({server_version})."
+
+
def results_check(
    server_sd: SessionDescriptor, client_sd: SessionDescriptor, result_path: str
) -> None:
    """Calculate the checksum for result files. Compare them with the checksums
    list formed from joined results from server.json and client.json.
    Check that results from client.json and server.json have no extra and absent files.
    Compare that results files from client.json and server.json have the same checksum.
    """

    # Hashes of the files in results directory
    results = dict(hash_dir(result_path))
    # Hashes recorded in server.json
    results_s = server_sd.json_object["results"]
    # Hashes recorded in client.json
    results_c = client_sd.json_object["results"]

    # TODO: server.json checksum
    # NOTE(review): pop() without a default raises KeyError if either JSON
    # file is missing from the results directory — presumably guaranteed
    # upstream; confirm.
    results.pop("power/server.json")
    # TODO: client.json checksum is no longer recorded
    results.pop("power/client.json")
    # Work on a copy so the module-level RESULT_PATHS list stays intact.
    result_paths_copy = RESULT_PATHS.copy()
    result_paths_copy.remove("power/server.json")
    result_paths_copy.remove("power/client.json")

    def remove_optional_path(res: Dict[str, str]) -> None:
        # Drop, in place, every entry that is not a required submission file.
        keys = list(res.keys())
        for path in keys:
            # Ignore all the optional files.
            if path not in result_paths_copy:
                del res[path]

    # We only check the hashes of the files required for submission.
    remove_optional_path(results_s)
    remove_optional_path(results_c)
    remove_optional_path(results)

    # Make sure the hashes match between server.json and client.json
    compare_dicts_values(
        results_s,
        results_c,
        f"{server_sd.path} and {client_sd.path} results checksum comparison",
    )
    compare_dicts_values(
        results_c,
        results_s,
        f"{client_sd.path} and {server_sd.path} results checksum comparison",
    )

    # Check if the hashes of the files in results directory match the ones
    # recorded in server.json/client.json.
    result_c_s = {**results_c, **results_s}

    compare_dicts(
        result_c_s,
        results,
        f"{server_sd.path} + {client_sd.path} results checksum values and "
        f"calculated {result_path} content checksum comparison:\n",
    )

    # Check if all the required files are present
    def result_files_compare(
        res: Dict[str, str], ref_res: List[str], path: str
    ) -> None:
        # If a file is required (in ref_res) but is not present in results directory (res),
        # then the submission is invalid.
        absent_files = set(ref_res) - set(res.keys())
        assert (
            len(absent_files) == 0
        ), f"There are absent files {', '.join(absent_files)!r} in the results of {path}"

    result_files_compare(
        result_c_s, result_paths_copy, f"{server_sd.path} + {client_sd.path}"
    )
    result_files_compare(results, result_paths_copy, result_path)
+
+
def check_ptd_logs(
    server_sd: SessionDescriptor, client_sd: SessionDescriptor, path: str
) -> None:
    """Check if ptd message starts with 'WARNING' or 'ERROR' in ptd logs.
    Check 'Uncertainty checking for Yokogawa... is activated' in PTD logs.
    """
    start_ranging_time = None
    stop_ranging_time = None
    # Marker the server sends when the ranging measurement starts.
    ranging_mark = f"{server_sd.json_object['session_name']}_ranging"

    start_load_time, stop_load_time = _get_begin_end_time_from_mlperf_log_detail(
        os.path.join(path, "run_1"), client_sd
    )

    file_path = os.path.join(path, "power", "ptd_logs.txt")
    date_regexp = r"(^\d\d-\d\d-\d\d\d\d \d\d:\d\d:\d\d.\d\d\d)"
    # NOTE(review): timezone_offset is only applied in get_msg_without_time;
    # the other get_time_from_line calls deliberately pass 0 (see the
    # commented-out arguments below).
    timezone_offset = int(server_sd.json_object["timezone"])

    with open(file_path, "r") as f:
        ptd_log_lines = f.readlines()

    def find_error_or_warning(reg_exp: str, line: str, error: bool) -> None:
        # Closes over start_ranging_time/stop_ranging_time, which are filled
        # in by the scan loop below before this function is first called.
        problem_line = re.search(reg_exp, line)

        if problem_line and problem_line.group(0):
            log_time = get_time_from_line(line, date_regexp, file_path, 0)
            if start_ranging_time is None or stop_ranging_time is None:
                assert False, "Can not find ranging time in ptd_logs.txt."
            if error:
                # Known-benign ERRORs during the testing stage are downgraded
                # to warnings.
                if problem_line.group(0).strip() in COMMON_ERROR_TESTING:
                    raise CheckerWarning(
                        f"{line.strip().replace('ERROR', 'Warning')!r} in ptd_log.txt during testing stage but it is accepted. Treated as WARNING"
                    )
                # Any other ERROR is only tolerated during the ranging window.
                assert (
                    start_ranging_time < log_time < stop_ranging_time
                ), f"{line.strip()!r} in ptd_log.txt"

                # Treat uncommon errors in ranging phase as warnings
                if all(
                    not problem_line.group(0).strip().startswith(
                        common_ranging_error)
                    for common_ranging_error in COMMON_ERROR_RANGING
                ):
                    raise CheckerWarning(
                        f"{line.strip().replace('ERROR', 'Warning')!r} in ptd_log.txt during ranging stage. Treated as WARNING"
                    )
            else:
                # NOTE(review): TIME_DELTA_TOLERANCE is added to epoch-second
                # timestamps here but divided by 1000 (ms) in phases_check —
                # confirm the intended unit.
                if (
                    start_load_time + TIME_DELTA_TOLERANCE
                    < log_time
                    < stop_load_time - TIME_DELTA_TOLERANCE
                ):
                    # Some warnings are fatal when they occur mid-measurement.
                    for warning_to_be_error in WARNING_NEEDS_TO_BE_ERROR_TESTING_RE:
                        warning_line = warning_to_be_error.search(
                            problem_line.group(0).strip()
                        )
                        if warning_line and warning_line.group(0):
                            assert (
                                False
                            ), f"{line.strip()!r} during testing phase. Test start time: {start_load_time}, Log time: {log_time}, Test stop time: {stop_load_time} "

                    raise CheckerWarning(
                        f"{line.strip()!r} in ptd_log.txt during load stage"
                    )

    start_ranging_line = f": Go with mark {ranging_mark!r}"

    def get_msg_without_time(line: str) -> Optional[str]:
        # Strip the leading timestamp; return the bare line when it carries
        # no timestamp at all, or None when the remainder cannot be isolated.
        try:
            get_time_from_line(line, date_regexp, file_path, timezone_offset)
        except LineWithoutTimeStamp:
            return line
        msg_o = re.search(f"(?<={date_regexp}).+", line)
        if msg_o is None:
            return None
        return msg_o.group(0).strip()

    # Locate the ranging window: from the "Go with mark ..._ranging" command
    # up to the first ": Completed test" that follows it.
    for line in ptd_log_lines:
        msg = get_msg_without_time(line)
        if msg is None:
            continue
        if (not start_ranging_time) and (start_ranging_line == msg):
            start_ranging_time = get_time_from_line(
                line, date_regexp, file_path, 0  # timezone_offset
            )
        if (not stop_ranging_time) and bool(start_ranging_time):
            if ": Completed test" == msg:
                stop_ranging_time = get_time_from_line(
                    line, date_regexp, file_path, 0  # timezone_offset
                )
                break

    if start_ranging_time is None or stop_ranging_time is None:
        assert False, "Can not find ranging time in ptd_logs.txt."

    is_uncertainty_check_activated = False

    # The uncertainty check must have been switched on before ranging started.
    for line in ptd_log_lines:
        msg_o = re.search(
            r"Uncertainty checking for Yokogawa\S+ is activated", line)
        if msg_o is not None:
            try:
                log_time = None
                log_time = get_time_from_line(
                    line, date_regexp, file_path, 0  # timezone_offset
                )
            except LineWithoutTimeStamp:
                assert (
                    log_time is not None
                ), "ptd_logs.txt: Can not get timestamp for 'Uncertainty checking for Yokogawa... is activated' message."
            assert (
                start_ranging_time is not None and log_time < start_ranging_time
            ), "ptd_logs.txt: Uncertainty checking Yokogawa... was activated after ranging mode was started."
            is_uncertainty_check_activated = True
            break

    assert (
        is_uncertainty_check_activated
    ), "ptd_logs.txt: Line 'Uncertainty checking for Yokogawa... is activated' is not found."

    # Finally scan every line for WARNING/ERROR markers.
    for line in ptd_log_lines:
        find_error_or_warning("(?<=WARNING:).+", line, error=False)
        find_error_or_warning("(?<=ERROR:).+", line, error=True)
+
+
def check_ptd_config(server_sd: SessionDescriptor) -> None:
    """Check the device number is supported.
    If the device is multichannel, check that two numbers are using for channel configuration.
    """
    for analyzer in server_sd.json_object["ptd_config"]:
        dev_num = analyzer["device_type"]
        assert dev_num in SUPPORTED_MODEL.values(), (
            f"Device number {dev_num} is not supported. Supported numbers are "
            + ", ".join([str(i) for i in set(SUPPORTED_MODEL.values())])
        )

        if dev_num != 77:
            continue

        # Device 77 is the multichannel model: the PTD command line must
        # select exactly two channels via its "-c" option.
        command = analyzer["command"]
        channels = ""
        for i, token in enumerate(command):
            if token == "-c":
                channels = command[i + 1]
                break

        # Resolve the human-readable model name for the error message.
        dev_name = ""
        for name, num in SUPPORTED_MODEL.items():
            if num == dev_num:
                dev_name = name
                break

        assert (
            len(channels.split(",")) == 2
            and analyzer["channel"]
            and len(analyzer["channel"]) == 2
        ), f"Expected multichannel mode for {dev_name}, but got 1-channel."
+
+
def debug_check(server_sd: SessionDescriptor) -> None:
    """Check debug is disabled on server-side"""
    # A missing "debug" key counts as disabled.
    debug_enabled = server_sd.json_object.get("debug", False)
    assert debug_enabled is False, "Server was running in debug mode"
+
+
def check_with_logging(
        check_name: str, check: Callable[[], None]) -> Tuple[bool, bool]:
    """Run a single named check and log its outcome.

    Args:
        check_name: Human-readable label used in the log output.
        check: Zero-argument callable; signals failure by raising
            AssertionError and a soft failure by raising CheckerWarning.

    Returns:
        (passed, warned): `passed` is False when the check raised
        AssertionError or any unexpected exception; `warned` is True only
        when the check raised CheckerWarning (which still counts as a pass).
    """
    try:
        check()
    except AssertionError as e:
        log.error(f"[ ] {check_name}")
        log.error(f"\t{e}\n")
        return False, False
    except CheckerWarning as e:
        log.warning(f"[x] {check_name}")
        log.warning(f"\t{e}\n")
        return True, True
    except Exception:
        # Fixed: the status line is now plain error (log.exception here logged
        # the traceback three times in total) and the "exeception" typo is
        # corrected; log.exception below records the traceback once.
        log.error(f"[ ] {check_name}")
        log.exception("Unhandled exception:")
        return False, False
    else:
        log.info(f"[x] {check_name}")
        return True, False
+
+
def check(path: str) -> int:
    """Run the full suite of session checks for the results directory `path`.

    Returns:
        0 when every check passed (warnings allowed), 1 otherwise.
    """
    client = SessionDescriptor(os.path.join(path, "power/client.json"))
    server = SessionDescriptor(os.path.join(path, "power/server.json"))

    # Description -> zero-argument check; executed in insertion order.
    check_with_description = {
        "Check client sources checksum": lambda: sources_check(client),
        "Check server sources checksum": lambda: sources_check(server),
        "Check PTD commands and replies": lambda: ptd_messages_check(server),
        "Check UUID": lambda: uuid_check(client, server),
        "Check session name": lambda: session_name_check(client, server),
        "Check time difference": lambda: phases_check(client, server, path),
        "Check client server messages": lambda: messages_check(client, server),
        "Check results checksum": lambda: results_check(server, client, path),
        "Check errors and warnings from PTD logs": lambda: check_ptd_logs(
            server, client, path
        ),
        "Check PTD configuration": lambda: check_ptd_config(server),
        "Check debug is disabled on server-side": lambda: debug_check(server),
    }

    result = True
    warnings = False

    for description in check_with_description.keys():
        check_result, check_warnings = check_with_logging(
            description, check_with_description[description]
        )
        # A single failing check fails the whole run; warnings only annotate.
        result &= check_result
        warnings |= check_warnings

    if result:
        log.info(
            "\nAll checks passed"
            f"{'. Warnings encountered, check for audit!' if warnings else ''}"
        )
    else:
        log.error(
            f"\nERROR: Not all checks passed"
            f"{'. Warnings encountered, check for audit!' if warnings else ''}"
        )

    return 0 if result else 1
+
+
if __name__ == "__main__":
    # CLI entry point: validate a single session results directory and
    # propagate the aggregated check result as the process exit status.
    parser = argparse.ArgumentParser(
        description="Check PTD client-server session results"
    )
    parser.add_argument(
        "session_directory",
        help="directory with session results data")
    cli_args = parser.parse_args()

    exit(check(cli_args.session_directory))
diff --git a/tools/submission/submission_checker/checks/power/sources_checksums.json b/tools/submission/submission_checker/checks/power/sources_checksums.json
new file mode 100644
index 0000000000..ec8e7bc8f2
--- /dev/null
+++ b/tools/submission/submission_checker/checks/power/sources_checksums.json
@@ -0,0 +1,19 @@
+[
+ {
+ "__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
+ "client.py": "33ca4f26368777ac06e01f9567b714a4b8063886",
+ "lib/__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
+ "lib/client.py": "ac2aa093c8e8bbc9569b9e2a3471bc64e58a2258",
+ "lib/common.py": "611d8b29633d331eb19c9455ea3b5fa3284ed6df",
+ "lib/external/__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
+ "lib/external/ntplib.py": "4da8f970656505a40483206ef2b5d3dd5e81711d",
+ "lib/server.py": "9bc9a3b6b9b716520658ac175913bcbbdb354336",
+ "lib/source_hashes.py": "60a2e02193209e8d392803326208d5466342da18",
+ "lib/summary.py": "602a9fedfa503e5544d27ab3b0067f235047efc3",
+ "lib/time_sync.py": "80894ef2389e540781ff78de94db16aa4203a14e",
+ "server.py": "c3f90f2f7eeb4db30727556d0c815ebc89b3d28b",
+ "tests/unit/__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
+ "tests/unit/test_server.py": "948c1995d4008bc2aa6c4046a34ffa3858d6d671",
+ "tests/unit/test_source_hashes.py": "00468a2907583c593e6574a1f6b404e4651c221a"
+ }
+]
\ No newline at end of file
diff --git a/tools/submission/submission_checker/checks/power_check.py b/tools/submission/submission_checker/checks/power_check.py
new file mode 100644
index 0000000000..d3519a3503
--- /dev/null
+++ b/tools/submission/submission_checker/checks/power_check.py
@@ -0,0 +1,239 @@
+from .base import BaseCheck
+from ..constants import *
+from ..loader import SubmissionLogs
+from ..configuration.configuration import Config
+from .power.power_checker import check as check_power_more
+from ..utils import *
+import os
+import sys
+import datetime
+
+
class PowerCheck(BaseCheck):
    """Validate power measurement artifacts and compute power metrics.

    The `PowerCheck` class verifies the presence and correctness of power-
    related files in submissions that include power measurements. It runs
    external power validation scripts, ensures required files are present,
    and computes power consumption metrics and efficiency ratios based on
    performance logs and power data.

    Attributes:
        submission_logs (SubmissionLogs): Parsed submission logs and
            metadata used to locate power and performance artifacts.
        mlperf_log: Parsed MLPerf performance log for timestamps and
            query counts.
        power_path (str): Path to the power measurement directory.
        testing_path (str): Path to the testing run directory.
        ranging_path (str): Path to the ranging run directory.
        has_power (bool): Whether power measurements are present in the
            submission.
        config (Config): Configuration provider for toggling power checks.
    """

    def __init__(self, log, path, config: Config,
                 submission_logs: SubmissionLogs):
        """Initialize the power checker.

        Args:
            log: Logger used to emit info/warning/error messages.
            path: Path to the submission root being validated.
            config (Config): Configuration helper containing feature
                toggles for power checks.
            submission_logs (SubmissionLogs): Parsed submission artifacts
                and loader metadata.
        """
        super().__init__(log, path)
        self.config = config
        self.submission_logs = submission_logs
        self.mlperf_log = self.submission_logs.performance_log
        self.scenario_fixed = self.submission_logs.loader_data.get(
            "scenario", "")
        self.power_path = self.submission_logs.loader_data.get(
            "power_dir_path", "")
        # The testing run dir is the parent of the performance log path;
        # the ranging run sits beside it in a sibling "ranging" directory.
        self.testing_path = os.path.dirname(
            self.submission_logs.loader_data.get(
                "perf_path", ""))
        self.ranging_path = os.path.join(
            os.path.dirname(self.testing_path), "ranging")
        self.has_power = os.path.exists(self.power_path)
        self.setup_checks()

    def setup_checks(self):
        """Register per-submission power checks.

        Appends the callable checks to `self.checks` in the order they
        should be executed by the submission validation framework.
        """
        self.checks.append(self.required_files_check)
        self.checks.append(self.external_power_check)
        self.checks.append(self.get_power_metric_check)

    def required_files_check(self):
        """Verify required files exist in power-related directories.

        Checks that testing, ranging, and power directories contain all
        expected files, skipping if no power measurements are present.

        Returns:
            bool: True if all required files are present, False otherwise.
        """
        if not self.has_power:
            return True

        self.log.info("Checking necessary power files for %s", self.path)
        is_valid = True
        # Testing and ranging runs share the same required-file list.
        required_files = REQUIRED_PERF_FILES + REQUIRED_PERF_POWER_FILES
        diff = files_diff(
            list_files(self.testing_path),
            required_files,
            OPTIONAL_PERF_FILES)
        if diff:
            self.log.error(
                "%s has file list mismatch (%s)",
                self.testing_path,
                diff)
            is_valid = False
        diff = files_diff(
            list_files(self.ranging_path),
            required_files,
            OPTIONAL_PERF_FILES)
        if diff:
            self.log.error(
                "%s has file list mismatch (%s)",
                self.ranging_path,
                diff)
            is_valid = False
        diff = files_diff(list_files(self.power_path), REQUIRED_POWER_FILES)
        if diff:
            self.log.error(
                "%s has file list mismatch (%s)",
                self.power_path,
                diff)
            is_valid = False
        return is_valid

    def external_power_check(self):
        """Run external Power WG validation script.

        Executes the power_checker.py script from the Power WG if power
        checks are enabled and power data is present.

        Returns:
            bool: True if the external check passes or is skipped,
            False otherwise.
        """
        if not self.config.skip_power_check and self.has_power:
            self.log.info("Running external power checks for %s", self.path)
            # NOTE(review): parses sys.version text; sys.version_info would be
            # the conventional probe. Also the concatenated message below
            # renders with doubled spaces.
            python_version_major = int(sys.version.split(" ")[0].split(".")[0])
            python_version_minor = int(sys.version.split(" ")[0].split(".")[1])
            assert python_version_major == 3 and python_version_minor >= 7, (
                "Power check " " only " "supports " "Python " "3.7+"
            )
            perf_path = os.path.dirname(self.power_path)
            check_power_result = check_power_more(perf_path)
            # Flush so the external checker's output interleaves correctly
            # with our own logging.
            sys.stdout.flush()
            sys.stderr.flush()
            if check_power_result != 0:
                self.log.error(
                    "Power WG power_checker.py did not pass for: %s",
                    perf_path)
                return False
        return True

    def get_power_metric_check(self):
        """Compute and validate power consumption metrics.

        Parses power logs to extract samples within the measurement window,
        calculates average power, and derives power metrics and efficiency
        based on scenario and performance data. Stores results in loader data.

        Returns:
            bool: True if power metrics are successfully computed,
            False otherwise.
        """
        if not self.has_power:
            return True
        # parse the power logs
        is_valid = True
        # Both offsets are fixed at zero — timestamps are assumed to already
        # share a common timezone. TODO confirm.
        server_timezone = datetime.timedelta(0)
        client_timezone = datetime.timedelta(0)

        datetime_format = "%m-%d-%Y %H:%M:%S.%f"
        power_begin = (
            datetime.datetime.strptime(
                self.mlperf_log["power_begin"], datetime_format)
            + client_timezone
        )
        power_end = (
            datetime.datetime.strptime(
                self.mlperf_log["power_end"], datetime_format)
            + client_timezone
        )
        # Obtain the scenario also from logs to check if power is inferred
        scenario = self.mlperf_log["effective_scenario"]

        log_path = self.testing_path
        spl_fname = os.path.join(log_path, "spl.txt")
        power_list = []
        with open(spl_fname) as f:
            for line in f:
                # Sample lines start with "Time"; anything else is skipped.
                if not line.startswith("Time"):
                    continue
                timestamp = (
                    datetime.datetime.strptime(
                        line.split(",")[1], datetime_format)
                    + server_timezone
                )
                if timestamp > power_begin and timestamp < power_end:
                    # Column 3 is presumably the watts reading — TODO confirm
                    # against the PTD spl.txt format.
                    value = float(line.split(",")[3])
                    if value > 0:
                        # NOTE(review): re-parses the same field; `value`
                        # already holds it.
                        power_list.append(float(line.split(",")[3]))

        if len(power_list) == 0:
            self.log.error(
                "%s has no power samples falling in power range: %s - %s",
                spl_fname,
                power_begin,
                power_end,
            )
            is_valid = False
        else:
            avg_power = sum(power_list) / len(power_list)
            power_duration = (power_end - power_begin).total_seconds()
            if self.scenario_fixed.lower() in [
                    "offline", "server", "interactive"]:
                # In Offline and Server scenarios, the power metric is in W.
                avg_power_efficiency = self.submission_logs.loader_data[
                    "performance_metric"] / avg_power

            else:
                # In SingleStream and MultiStream scenarios, the power metric is in
                # mJ/query.
                assert self.scenario_fixed.lower() in [
                    "multistream",
                    "singlestream",
                ], "Unknown scenario: {:}".format(self.scenario_fixed)

                num_queries = int(self.mlperf_log["result_query_count"])

                power_metric = avg_power * power_duration * 1000 / num_queries

                # samples_per_query is always bound here: the assert above
                # guarantees one of the two branches is taken.
                if self.scenario_fixed.lower() in ["singlestream"]:
                    samples_per_query = 1
                elif self.scenario_fixed.lower() in ["multistream"]:
                    samples_per_query = 8

                # A MultiStream submission measured with an inferred
                # SingleStream run is scaled by samples_per_query.
                if (self.scenario_fixed.lower() in ["multistream"]
                        ) and scenario.lower() in ["singlestream"]:
                    power_metric = (
                        avg_power * power_duration * samples_per_query * 1000 / num_queries
                    )

                avg_power_efficiency = (
                    samples_per_query * 1000) / power_metric

            self.submission_logs.loader_data["power_metric"] = power_metric
            self.submission_logs.loader_data["avg_power_efficiency"] = avg_power_efficiency
        return is_valid
diff --git a/tools/submission/submission_checker/checks/structure_check.py b/tools/submission/submission_checker/checks/structure_check.py
new file mode 100644
index 0000000000..e0667bb7f6
--- /dev/null
+++ b/tools/submission/submission_checker/checks/structure_check.py
@@ -0,0 +1,11 @@
+from .base import BaseCheck
+
+
class StructureCheck(BaseCheck):
    """Placeholder for submission directory-structure validation.

    Currently registers a single no-op check that always passes; real
    structure rules are expected to be added here later.
    """

    def __init__(self, log, path, parsed_log):
        super().__init__(log, path)
        # Parsed submission log, kept for future structure checks.
        self.parsed_log = parsed_log
        self.checks.append(self.sample_check)

    def sample_check(self):
        # No-op check: always passes.
        return True
diff --git a/tools/submission/submission_checker/checks/system_check.py b/tools/submission/submission_checker/checks/system_check.py
new file mode 100644
index 0000000000..54746c0408
--- /dev/null
+++ b/tools/submission/submission_checker/checks/system_check.py
@@ -0,0 +1,220 @@
+from .base import BaseCheck
+from ..constants import *
+from ..loader import SubmissionLogs
+from ..configuration.configuration import Config
+from ..utils import *
+
+
+class SystemCheck(BaseCheck):
+    """Validate system description JSON and metadata consistency.
+
+    The `SystemCheck` class verifies the presence and correctness of the
+    system description JSON file in submissions. It ensures the system
+    metadata matches submission expectations, validates required fields
+    and their types, checks system type and availability status, and
+    enforces consistency between JSON fields and submission directory
+    structure.
+
+    Attributes:
+        submission_logs (SubmissionLogs): Parsed submission logs and
+            metadata used to locate system JSON and compare fields.
+        system_json (dict): Parsed system description JSON contents.
+        submitter (str): Submitter name extracted from submission path.
+        division (str): Division extracted from submission path.
+        config (Config): Configuration provider for field validation rules.
+    """
+
+    def __init__(self, log, path, config: Config,
+                 submission_logs: SubmissionLogs):
+        """Initialize the system checker.
+
+        Args:
+            log: Logger used to emit info/warning/error messages.
+            path: Path to the submission being validated.
+            config (Config): Configuration helper containing validation
+                rules and toggles.
+            submission_logs (SubmissionLogs): Parsed submission artifacts
+                and loader metadata.
+        """
+        super().__init__(log, path)
+        self.name = "system checks"
+        self.submission_logs = submission_logs
+        self.system_json = self.submission_logs.system_json
+        self.submitter = self.submission_logs.loader_data.get("submitter", "")  # "" when loader metadata is absent
+        self.division = self.submission_logs.loader_data.get("division", "")  # "" when loader metadata is absent
+        self.config = config
+        self.setup_checks()
+
+    def setup_checks(self):
+        """Register per-submission system checks.
+
+        Appends the callable checks to `self.checks` in the order they
+        should be executed by the submission validation framework.
+        """
+        self.checks.append(self.missing_check)  # first: later checks read self.system_json directly
+        self.checks.append(self.availability_check)
+        self.checks.append(self.system_type_check)
+        self.checks.append(self.network_check)
+        self.checks.append(self.required_fields_check)
+        self.checks.append(self.submitter_check)
+        self.checks.append(self.division_check)
+
+    def missing_check(self):
+        """Ensure the system JSON file was provided.
+
+        Returns:
+            bool: True if `system_json` is present, False otherwise.
+        """
+        if self.system_json is None:  # loader failed to find/parse the file — TODO confirm loader contract
+            self.log.error(
+                "%s system json file not found",
+                self.path
+            )
+            return False
+        return True
+
+    def availability_check(self):
+        """Validate the system's availability status.
+
+        Checks that the 'status' field contains a valid availability value
+        from the predefined list.
+
+        Returns:
+            bool: True if status is valid, False otherwise.
+        """
+        availability = str(self.system_json.get("status") or "").lower()  # None-safe: missing/null key reports as invalid instead of AttributeError
+        if availability not in VALID_AVAILABILITIES:
+            self.log.error(
+                "%s has invalid status (%s)", self.path, availability
+            )
+            return False
+        return True
+
+    def system_type_check(self):
+        """Verify the system type is allowed.
+
+        Ensures 'system_type' is one of the valid types: datacenter, edge,
+        or combinations thereof.
+
+        Returns:
+            bool: True if system type is valid, False otherwise.
+        """
+        system_type = self.system_json.get("system_type")
+        valid_system_types = [
+            "datacenter", "edge", "datacenter,edge", "edge,datacenter"]
+
+        if system_type not in valid_system_types:
+            self.log.error(
+                "%s has invalid system type (%s)",
+                self.path,
+                system_type,
+            )
+            return False
+        # NOTE(review): enable if Config.set_type() is not invoked elsewhere — TODO confirm
+        # self.config.set_type(system_type)
+        return True
+
+    def network_check(self):
+        """Ensure network mode matches division requirements.
+
+        For closed division, network mode must be false; for network
+        division, it must be true.
+
+        Returns:
+            bool: True if network mode is correct for the division,
+                False otherwise.
+        """
+        is_network = self.system_json.get(SYSTEM_DESC_IS_NETWORK_MODE)
+        is_network = (
+            is_network.lower() == "true"
+            if is_network is not None
+            else False
+        )
+        expected_state_by_division = {"network": True, "closed": False}
+        if self.division in expected_state_by_division:
+            if expected_state_by_division[self.division] != is_network:
+                self.log.error(
+                    f"{self.path} incorrect network mode(={is_network})"
+                    f" for division '{self.division}'"
+                )
+                return False
+        return True
+
+    def required_fields_check(self):
+        """Validate presence and validity of required system fields.
+
+        Checks for required fields based on network mode, ensures they
+        exist, and validates meaningful responses and numeric types where
+        required, respecting configuration toggles.
+
+        Returns:
+            bool: True if all required fields are present and valid,
+                False otherwise.
+        """
+        required_fields = SYSTEM_DESC_REQUIRED_FIELDS.copy()
+        is_network = self.system_json.get(SYSTEM_DESC_IS_NETWORK_MODE)
+        # Field holds "true"/"false" strings; bare truthiness would treat "false" as network mode (cf. network_check).
+        if is_network is not None and is_network.lower() == "true":
+            required_fields += SYSTEM_DESC_REQUIRED_FIELDS_NETWORK_MODE
+        check_empty_fields = not self.config.skip_meaningful_fields_emptiness_check
+        is_valid = True
+        for k in required_fields:
+            if k not in self.system_json:
+                is_valid = False
+                self.log.error("%s, field %s is missing", self.path, k)
+            elif (
+                check_empty_fields
+                and k in SYSTEM_DESC_MEANINGFUL_RESPONSE_REQUIRED_FIELDS
+                and not self.system_json[k]
+            ):
+                is_valid = False
+                self.log.error(
+                    "%s, field %s requires a meaningful response but is empty", self.path, k
+                )
+            elif (
+                check_empty_fields
+                and k in SYSTEM_DESC_NUMERIC_RESPONSE_REQUIRED_FIELDS
+                and not is_number(str(self.system_json[k]))
+            ):
+                is_valid = False
+                self.log.error(
+                    "%s, field %s requires a numeric response", self.path, k)
+        return is_valid
+
+    def submitter_check(self):
+        """Verify submitter name consistency.
+
+        Ensures the 'submitter' field in system JSON matches the submitter
+        name from the submission directory structure.
+
+        Returns:
+            bool: True if submitter names match, False otherwise.
+        """
+        if str(self.system_json.get("submitter") or "").lower() != self.submitter.lower():  # None-safe: missing key reports a mismatch instead of AttributeError
+            self.log.error(
+                "%s has submitter %s, directory has %s",
+                self.path,
+                self.system_json.get("submitter"),
+                self.submitter,
+            )
+            return False
+        return True
+
+    def division_check(self):
+        """Verify division consistency.
+
+        Ensures the 'division' field in system JSON matches the division
+        from the submission directory structure.
+
+        Returns:
+            bool: True if divisions match, False otherwise.
+        """
+        if str(self.system_json.get("division") or "").lower() != self.division.lower():  # None-safe: missing key reports a mismatch instead of AttributeError
+            self.log.error(
+                "%s has division %s, directory has %s",
+                self.path,
+                self.system_json.get("division"),
+                self.division,
+            )
+            return False
+        return True
diff --git a/tools/submission/submission_checker/configuration/__init__.py b/tools/submission/submission_checker/configuration/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tools/submission/submission_checker/configuration/configuration.py b/tools/submission/submission_checker/configuration/configuration.py
new file mode 100644
index 0000000000..97b9351ee9
--- /dev/null
+++ b/tools/submission/submission_checker/configuration/configuration.py
@@ -0,0 +1,221 @@
+from ..constants import MODEL_CONFIG, ACC_PATTERN
+
+
+class Config:
+    """Select config values by MLPerf version and submission type."""
+
+    def __init__(
+        self,
+        version,
+        extra_model_benchmark_map,
+        ignore_uncommited=False,
+        skip_compliance=False,
+        skip_power_check=False,
+        skip_meaningful_fields_emptiness_check=False,
+        skip_check_power_measure_files=False,
+        skip_empty_files_check=False,
+        skip_extra_files_in_root_check=False,
+        skip_extra_accuracy_files_check=False,
+        skip_all_systems_have_results_check=False,
+        skip_calibration_check=False,
+        skip_dataset_size_check=False
+    ):
+        self.base = MODEL_CONFIG.get(version)
+        self.extra_model_benchmark_map = extra_model_benchmark_map
+        self.version = version
+        self.ignore_uncommited = ignore_uncommited  # (sic) spelling kept: public kwarg name
+
+        # Skip flags: all must be False for an official submission.
+        self.skip_compliance = skip_compliance
+        self.skip_power_check = skip_power_check
+        self.skip_meaningful_fields_emptiness_check = skip_meaningful_fields_emptiness_check
+        self.skip_check_power_measure_files = skip_check_power_measure_files
+        self.skip_empty_files_check = skip_empty_files_check
+        self.skip_extra_files_in_root_check = skip_extra_files_in_root_check
+        self.skip_extra_accuracy_files_check = skip_extra_accuracy_files_check
+        self.skip_all_systems_have_results_check = skip_all_systems_have_results_check
+        self.skip_calibration_check = skip_calibration_check
+        self.skip_dataset_size_check = skip_dataset_size_check
+        self.load_config(version)
+
+    def load_config(self, version):
+        # TODO: Load these values from a per-version external config file.
+        self.models = self.base["models"]
+        self.seeds = self.base["seeds"]
+        if self.base.get("test05_seeds"):  # key absent in newer versions
+            self.test05_seeds = self.base["test05_seeds"]  # NOTE(review): attribute stays unset when key is absent — callers must guard
+        self.accuracy_target = self.base["accuracy-target"]
+        self.accuracy_delta_perc = self.base["accuracy-delta-perc"]
+        self.accuracy_upper_limit = self.base.get("accuracy-upper-limit", {})
+        self.performance_sample_count = self.base["performance-sample-count"]
+        self.dataset_size = self.base["dataset-size"]
+        self.latency_constraint = self.base.get("latency-constraint", {})
+        self.min_queries = self.base.get("min-queries", {})
+        self.required = None  # populated by set_type()
+        self.optional = None  # populated by set_type()
+
+    def set_type(self, submission_type):  # must run before get_required/get_optional
+        if submission_type == "datacenter":
+            self.required = self.base["required-scenarios-datacenter"]
+            self.optional = self.base["optional-scenarios-datacenter"]
+        elif submission_type == "edge":
+            self.required = self.base["required-scenarios-edge"]
+            self.optional = self.base["optional-scenarios-edge"]
+        elif (
+            submission_type == "datacenter,edge" or submission_type == "edge,datacenter"
+        ):
+            self.required = self.base["required-scenarios-datacenter-edge"]
+            self.optional = self.base["optional-scenarios-datacenter-edge"]
+        else:
+            raise ValueError("invalid system type")  # same values validated by SystemCheck.system_type_check
+
+    def get_mlperf_model(self, model, extra_model_mapping=None):  # map a submitted name to the official MLPerf model name
+        # preferred - user is already using the official name
+        if model in self.models:
+            return model
+
+        # simple mapping, ie resnet50->resnet
+        mlperf_model = self.base["model_mapping"].get(model)
+        if mlperf_model:
+            return mlperf_model
+
+        # Custom mapping provided by the submitter
+        if extra_model_mapping is not None:
+            mlperf_model = extra_model_mapping.get(model)
+            if mlperf_model:
+                return mlperf_model
+
+        # try to guess, keep this for backwards compatibility
+        # TODO: Generalize this guess or remove it completely?
+
+        if "mobilenet" in model or "efficientnet" in model or "resnet50" in model:  # substring match
+            model = "resnet"
+        elif "bert-99.9" in model:
+            model = "bert-99.9"
+        elif "bert-99" in model:
+            model = "bert-99"
+        elif "llama3_1-405b" in model:
+            model = "llama3.1-405b"
+        # map again
+        mlperf_model = self.base["model_mapping"].get(model, model)  # falls back to the name itself
+        return mlperf_model
+
+    def get_required(self, model):
+        model = self.get_mlperf_model(model)
+        if not self.required or model not in self.required:  # self.required is None until set_type() runs
+            return None
+        return set(self.required[model])
+
+    def get_optional(self, model):
+        model = self.get_mlperf_model(model)
+        if not self.optional or model not in self.optional:  # self.optional is None until set_type() runs
+            return set()
+        return set(self.optional[model])
+
+    def get_accuracy_target(self, model):
+        if model not in self.accuracy_target:
+            raise ValueError("model not known: " + model)
+        return self.accuracy_target[model]  # flat tuple: (metric, target, metric, target, ...)
+
+    def get_accuracy_upper_limit(self, model):
+        return self.accuracy_upper_limit.get(model, None)  # None means no upper limit defined
+
+    def get_accuracy_values(self, model):
+        """Return accuracy patterns, targets, types and optional upper limits."""
+        patterns = []
+        acc_targets = []
+        acc_types = []
+        acc_limits = []
+        up_patterns = []
+
+        target = self.get_accuracy_target(model)
+        acc_upper_limit = self.get_accuracy_upper_limit(model)
+        if acc_upper_limit is not None:
+            for i in range(0, len(acc_upper_limit), 2):
+                acc_type, acc_target = acc_upper_limit[i: i + 2]
+                acc_limits.append(acc_target)
+                up_patterns.append(ACC_PATTERN[acc_type])
+
+        for i in range(0, len(target), 2):
+            acc_type, acc_target = target[i: i + 2]
+            patterns.append(ACC_PATTERN[acc_type])
+            acc_targets.append(acc_target)
+            acc_types.append(acc_type)
+
+        return patterns, acc_targets, acc_types, acc_limits, up_patterns, acc_upper_limit
+
+    def get_performance_sample_count(self, model):  # raises ValueError for unknown models
+        model = self.get_mlperf_model(model)
+        if model not in self.performance_sample_count:
+            raise ValueError("model not known: " + model)
+        return self.performance_sample_count[model]
+
+    def ignore_errors(self, line):  # True if this log line may be ignored
+        for error in self.base["ignore_errors"]:
+            if error in line:
+                return True
+        if (
+            self.ignore_uncommited
+            and ("ERROR : Loadgen built with uncommitted " "changes!") in line
+        ):
+            return True
+        return False
+
+    def get_min_query_count(self, model, scenario):
+        model = self.get_mlperf_model(model)
+        if model not in self.min_queries:
+            raise ValueError("model not known: " + model)
+        return self.min_queries[model].get(scenario)  # None when scenario has no minimum
+
+    def get_dataset_size(self, model):  # raises ValueError for unknown models
+        model = self.get_mlperf_model(model)
+        if model not in self.dataset_size:
+            raise ValueError("model not known: " + model)
+        return self.dataset_size[model]
+
+    def get_delta_perc(self, model, metric):  # per-model/metric override, else 0.1 for *-99.9 models, 1 otherwise
+        if model in self.accuracy_delta_perc:
+            if metric in self.accuracy_delta_perc[model]:
+                return self.accuracy_delta_perc[model][metric]
+
+        more_accurate = model.find("99.9")  # -1 when not a high-accuracy variant
+        if more_accurate == -1:
+            required_delta_perc = 1
+        else:
+            required_delta_perc = 0.1
+        return required_delta_perc
+
+    def has_new_logging_format(self):
+        return True  # all versions this checker supports use the new loadgen log format
+
+    def uses_early_stopping(self, scenario):
+        return scenario in ["Server", "SingleStream", "MultiStream"]  # not Offline
+
+    def requires_equal_issue(self, model, division):  # equal-issue mode required only for these v4.1 models
+        return (
+            division in ["closed", "network"]
+            and model
+            in [
+                "3d-unet-99",
+                "3d-unet-99.9",
+                "gptj-99",
+                "gptj-99.9",
+                "llama2-70b-99",
+                "llama2-70b-99.9",
+                "mixtral-8x7b",
+            ]
+            and self.version in ["v4.1"]
+        )
+
+    def get_llm_models(self):  # official LLM benchmark model names
+        return [
+            "llama2-70b-99",
+            "llama2-70b-99.9",
+            "llama2-70b-interactive-99",
+            "llama2-70b-interactive-99.9",
+            "mixtral-8x7b",
+            "llama3.1-405b",
+            "llama3.1-8b",
+            "llama3.1-8b-edge",
+            "deepseek-r1"
+        ]
diff --git a/tools/submission/submission_checker/constants.py b/tools/submission/submission_checker/constants.py
new file mode 100644
index 0000000000..0799529ad0
--- /dev/null
+++ b/tools/submission/submission_checker/constants.py
@@ -0,0 +1,1587 @@
+MODEL_CONFIG = {
+ "v6.0": {
+ "models": [
+ "resnet",
+ "retinanet",
+ "bert-99",
+ "bert-99.9",
+ "3d-unet-99",
+ "3d-unet-99.9",
+ "llama3.1-8b",
+ "llama3.1-8b-edge",
+ "llama2-70b-99",
+ "llama2-70b-99.9",
+ "stable-diffusion-xl",
+ "mixtral-8x7b",
+ "llama3.1-405b",
+ "rgat",
+ "pointpainting",
+ "deepseek-r1",
+ "whisper",
+ "gpt-oss-120b",
+ "wan-2.2-t2v-a14b",
+ "qwen3-vl-235b-a22b",
+ "dlrm-v3",
+ ],
+ "required-scenarios-datacenter": {
+ "retinanet": ["Server", "Offline"],
+ "dlrm-v3": ["Server", "Offline"],
+ "3d-unet-99": ["Offline"],
+ "3d-unet-99.9": ["Offline"],
+ "llama3.1-8b": ["Offline"],
+ "llama2-70b-99": ["Offline"],
+ "llama2-70b-99.9": ["Offline"],
+ "stable-diffusion-xl": ["Server", "Offline"],
+ "mixtral-8x7b": ["Server", "Offline"],
+ "llama3.1-405b": ["Offline"],
+ "rgat": ["Offline"],
+ "whisper": ["Offline"],
+ "deepseek-r1": ["Offline"],
+ "gpt-oss-120b": ["Offline"],
+ "qwen3-vl-235b-a22b": ["Server", "Offline"],
+ "dlrm-v3": ["Server", "Offline"],
+ },
+ "optional-scenarios-datacenter": {
+ "llama2-70b-99": ["Interactive", "Server"],
+ "llama2-70b-99.9": ["Interactive", "Server"],
+ "llama3.1-405b": ["Interactive", "Server"],
+ "llama3.1-8b": ["Interactive", "Server"],
+ "deepseek-r1": ["Interactive", "Server"],
+ "gpt-oss-120b": ["Interactive", "Server"],
+ },
+ "required-scenarios-edge": {
+ "resnet": ["SingleStream", "MultiStream", "Offline"],
+ "retinanet": ["SingleStream", "MultiStream", "Offline"],
+ "bert-99": ["SingleStream", "Offline"],
+ "bert-99.9": ["SingleStream", "Offline"],
+ "3d-unet-99": ["SingleStream", "Offline"],
+ "3d-unet-99.9": ["SingleStream", "Offline"],
+ "llama3.1-8b-edge": ["SingleStream", "Offline"],
+ "stable-diffusion-xl": ["SingleStream", "Offline"],
+ "pointpainting": ["SingleStream"],
+ "whisper": ["Offline"],
+ },
+ "optional-scenarios-edge": {},
+ "required-scenarios-datacenter-edge": {
+ "resnet": ["SingleStream", "MultiStream", "Offline", "Server"],
+ "retinanet": ["SingleStream", "Offline", "MultiStream", "Server"],
+ "bert-99": ["SingleStream", "Offline"],
+ "bert-99.9": ["SingleStream", "Offline"],
+ "3d-unet-99": ["SingleStream", "Offline"],
+ "3d-unet-99.9": ["SingleStream", "Offline"],
+ "llama3.1-8b": ["Offline"],
+ "llama3.1-8b-edge": ["SingleStream", "Offline"],
+ "llama2-70b-99": ["Offline"],
+ "llama2-70b-99.9": ["Offline"],
+ "stable-diffusion-xl": ["SingleStream", "Offline", "Server"],
+ "mixtral-8x7b": ["Server", "Offline"],
+ "llama3.1-405b": ["Offline"],
+ "rgat": ["Offline"],
+ "pointpainting": ["SingleStream"],
+ "deepseek-r1": ["Offline"],
+ "whisper": ["Offline"],
+ "gpt-oss-120b": ["Offline"],
+ "qwen3-vl-235b-a22b": ["Offline"],
+ "dlrm-v3": ["Offline", "Server"],
+ },
+ "optional-scenarios-datacenter-edge": {
+ "llama2-70b-99": ["Interactive", "Server"],
+ "llama2-70b-99.9": ["Interactive", "Server"],
+ "llama3.1-405b": ["Interactive", "Server"],
+ "llama3.1-8b": ["Interactive", "Server"],
+ "deepseek-r1": ["Interactive", "Server"],
+ "gpt-oss-120b": ["Interactive", "Server"],
+ "qwen3-vl-235b-a22b": ["Interactive", "Server"],
+ },
+ "accuracy-target": {
+ "resnet": ("acc", 76.46 * 0.99),
+ "retinanet": ("mAP", 37.55 * 0.99),
+ "bert-99": ("F1", 90.874 * 0.99),
+ "bert-99.9": ("F1", 90.874 * 0.999),
+ "dlrm-v2-99": ("AUC", 80.31 * 0.99),
+ "dlrm-v2-99.9": ("AUC", 80.31 * 0.999),
+ "3d-unet-99": ("DICE", 0.86170 * 0.99),
+ "3d-unet-99.9": ("DICE", 0.86170 * 0.999),
+
+ "llama3.1-8b": (
+ "ROUGE1",
+ 38.7792 * 0.99,
+ "ROUGE2",
+ 15.9075 * 0.99,
+ "ROUGEL",
+ 24.4957 * 0.99,
+ "ROUGELSUM",
+ 35.793 * 0.99,
+ "GEN_LEN",
+ 8167644 * 0.9,
+ ),
+ "llama3.1-8b-edge": (
+ "ROUGE1",
+ 39.06 * 0.99,
+ "ROUGE2",
+ 16.1147 * 0.99,
+ "ROUGEL",
+ 24.6375 * 0.99,
+ "ROUGELSUM",
+ 36.124 * 0.99,
+ "GEN_LEN",
+ 3051113 * 0.9,
+ ),
+ "llama2-70b-99": (
+ "ROUGE1",
+ 44.4312 * 0.99,
+ "ROUGE2",
+ 22.0352 * 0.99,
+ "ROUGEL",
+ 28.6162 * 0.99,
+ "TOKENS_PER_SAMPLE",
+ 294.45 * 0.9,
+ ),
+ "llama2-70b-99.9": (
+ "ROUGE1",
+ 44.4312 * 0.999,
+ "ROUGE2",
+ 22.0352 * 0.999,
+ "ROUGEL",
+ 28.6162 * 0.999,
+ "TOKENS_PER_SAMPLE",
+ 294.45 * 0.9,
+ ),
+ "stable-diffusion-xl": (
+ "CLIP_SCORE",
+ 31.68631873,
+ "FID_SCORE",
+ 23.01085758,
+ ),
+ "mixtral-8x7b": (
+ "ROUGE1",
+ 45.5989 * 0.99,
+ "ROUGE2",
+ 23.3526 * 0.99,
+ "ROUGEL",
+ 30.4608 * 0.99,
+ "TOKENS_PER_SAMPLE",
+ 144.84 * 0.9,
+ "gsm8k_accuracy",
+ 73.66 * 0.99,
+ "mbxp_accuracy",
+ 60.16 * 0.99,
+ ),
+ "llama3.1-405b": (
+ "ROUGEL",
+ 21.6666 * 0.99,
+ "exact_match",
+ 90.1335 * 0.99,
+ "TOKENS_PER_SAMPLE",
+ 684.68 * 0.9,
+ ),
+ "rgat": ("acc", 0.7286 * 0.99),
+ "pointpainting": ("mAP", 0.5425 * 0.999),
+ "deepseek-r1": ("exact_match", 0.99 * 81.3582, "TOKENS_PER_SAMPLE", 0.9 * 3886.2274),
+ "whisper": ("ACCURACY", (100.0 - 2.0671) * 0.99),
+ # TODO: Placeholder for now
+ "gpt-oss-120b": ("exact_match", 83.13 * 0.99),
+ # TODO: Placeholder for now
+ "qwen3-vl-235b-a22b": ("F1", 0.7903 * 0.99),
+ "dlrm-v3": ("AUC", 78.663 * 0.99), # TODO: Placeholder for now
+ },
+ "accuracy-upper-limit": {
+ "stable-diffusion-xl": (
+ "CLIP_SCORE",
+ 31.81331801,
+ "FID_SCORE",
+ 23.95007626,
+ ),
+ "llama2-70b-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
+ "llama2-70b-99.9": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
+ "mixtral-8x7b": ("TOKENS_PER_SAMPLE", 145.9 * 1.1),
+ "llama3.1-405b": ("TOKENS_PER_SAMPLE", 684.68 * 1.1),
+ "llama3.1-8b": ("GEN_LEN", 8167644 * 1.1),
+ "llama3.1-8b-edge": ("GEN_LEN", 3051113 * 1.1),
+ "deepseek-r1": ("TOKENS_PER_SAMPLE", 1.1 * 3886.2274),
+ # TODO: Placeholder for now
+ "gpt-oss-120b": ("TOKENS_PER_SAMPLE", 1.1 * 9999),
+ },
+ "accuracy-delta-perc": {
+ "stable-diffusion-xl": {"CLIP_SCORE": 1, "FID_SCORE": 2}
+ },
+ "performance-sample-count": {
+ "resnet": 1024,
+ "retinanet": 64,
+ "bert-99": 10833,
+ "bert-99.9": 10833,
+ "dlrm-v2-99": 204800,
+ "dlrm-v2-99.9": 204800,
+ "3d-unet-99": 43,
+ "3d-unet-99.9": 43,
+ "llama3.1-8b": 13368,
+ "llama3.1-8b-edge": 5000,
+ "llama2-70b-99": 24576,
+ "llama2-70b-99.9": 24576,
+ "stable-diffusion-xl": 5000,
+ "mixtral-8x7b": 15000,
+ "llama3.1-405b": 8313,
+ "rgat": 788379,
+ "pointpainting": 1024,
+ "deepseek-r1": 4388,
+ "whisper": 1633,
+ # TODO: Need to add accuracy sample count checkers as well (4395)
+ "gpt-oss-120b": 6396,
+ "qwen3-vl-235b-a22b": 48289,
+ "dlrm-v3": 34996,
+ },
+ "dataset-size": {
+ "resnet": 50000,
+ "retinanet": 24781,
+ "bert-99": 10833,
+ "bert-99.9": 10833,
+ "dlrm-v2-99": 330067,
+ "dlrm-v2-99.9": 330067,
+ "3d-unet-99": 43,
+ "3d-unet-99.9": 43,
+ "llama3.1-8b": 13368,
+ "llama3.1-8b-edge": 5000,
+ "llama2-70b-99": 24576,
+ "llama2-70b-99.9": 24576,
+ "stable-diffusion-xl": 5000,
+ "mixtral-8x7b": 15000,
+ "llama3.1-405b": 8313,
+ "rgat": 788379,
+ "pointpainting": 39987,
+ "deepseek-r1": 4388,
+ "whisper": 1633,
+ # TODO: Need to add accuracy sample count checkers as well (4395)
+ "gpt-oss-120b": 6396,
+ "qwen3-vl-235b-a22b": 48289,
+ "dlrm-v3": 34996,
+ },
+ # model_mapping.json is expected in the root directory of the
+ # submission folder for open submissions and so the below dictionary is
+ # not really needed
+ "model_mapping": {
+ # map model names to the official mlperf model class
+ "ssd-resnet34": "retinanet",
+ "mobilenet": "resnet",
+ "resnet50": "resnet",
+ "llama3_1-405b": "llama3.1-405b",
+ "llama3_1-8b": "llama3.1-8b",
+ "llama3_1-8b-edge": "llama3.1-8b-edge",
+ },
+ "seeds": {
+ # TODO: Update random seeds
+ "qsl_rng_seed": 2465351861681999779,
+ "sample_index_rng_seed": 14276810075590677512,
+ "schedule_rng_seed": 3936089224930324775,
+ },
+ "ignore_errors": [],
+ "latency-constraint": {
+ "resnet": {"Server": 15000000},
+ "retinanet": {"Server": 100000000},
+ "dlrm-v2-99": {"Server": 60000000},
+ "dlrm-v2-99.9": {"Server": 60000000},
+ "llama3.1-8b": {"Server": 20000000000},
+ "stable-diffusion-xl": {"Server": 20000000000},
+ "llama2-70b-99": {"Server": 20000000000},
+ "llama2-70b-99.9": {"Server": 20000000000},
+ "mixtral-8x7b": {"Server": 20000000000},
+ "llama3.1-405b": {"Server": 60000000000},
+ "deepseek-r1": {"Server": 60000000000},
+ "gpt-oss-120b": {"Server": 60000000000},
+ "qwen3-vl-235b-a22b": {"Server": 60000000000},
+ "dlrm-v3": {"Server": 60000000000},
+ },
+ "min-queries": {
+ "resnet": {
+ "SingleStream": 1024,
+ "MultiStream": 270336,
+ "Server": 270336,
+ "Offline": 1,
+ },
+ "retinanet": {
+ "SingleStream": 1024,
+ "MultiStream": 270336,
+ "Server": 270336,
+ "Offline": 1,
+ },
+ "bert-99": {"SingleStream": 1024, "Offline": 1},
+ "bert-99.9": {"SingleStream": 1024, "Offline": 1},
+ "dlrm-v2-99": {"Server": 270336, "Offline": 1},
+ "dlrm-v2-99.9": {"Server": 270336, "Offline": 1},
+ "3d-unet-99": {"SingleStream": 1024, "Offline": 1},
+ "3d-unet-99.9": {"SingleStream": 1024, "Offline": 1},
+ "llama3.1-8b": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "llama3.1-8b-edge": {"SingleStream": 1024, "Offline": 1},
+ "llama2-70b-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "llama2-70b-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "stable-diffusion-xl": {
+ "SingleStream": 1024,
+ "Server": 270336,
+ "Offline": 1,
+ },
+ "mixtral-8x7b": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "llama3.1-405b": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "rgat": {"SingleStream": 1024, "Offline": 1},
+ "pointpainting": {"SingleStream": 1024},
+ "deepseek-r1": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "whisper": {"SingleStream": 1024, "Offline": 1},
+ "gpt-oss-120b": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "qwen3-vl-235b-a22b": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "dlrm-v3": {"Server": 270336, "Offline": 1},
+ },
+ "models_TEST01": [
+ "resnet",
+ "retinanet",
+ "bert-99",
+ "bert-99.9",
+ "dlrm-v2-99",
+ "dlrm-v2-99.9",
+ "3d-unet-99",
+ "3d-unet-99.9",
+ "stable-diffusion-xl",
+ "rgat",
+ "pointpainting",
+ "whisper",
+ ],
+ "models_TEST04": [
+ "resnet",
+ "stable-diffusion-xl",
+ "pointpainting",
+ ],
+ "models_TEST06": [
+ "llama2-70b-99",
+ "llama2-70b-99.9",
+ "llama2-70b-interactive-99",
+ "llama2-70b-interactive-99.9",
+ "llama3.1-405b",
+ "llama3.1-8b",
+ "llama3.1-8b-interactive",
+ "llama3.1-405b-interactive",
+ "mixtral-8x7b",
+ "deepseek-r1",
+ ]
+ },
+ "v5.0": {
+ "models": [
+ "resnet",
+ "retinanet",
+ "bert-99",
+ "bert-99.9",
+ "dlrm-v2-99",
+ "dlrm-v2-99.9",
+ "3d-unet-99",
+ "3d-unet-99.9",
+ "gptj-99",
+ "gptj-99.9",
+ "llama2-70b-99",
+ "llama2-70b-99.9",
+ "llama2-70b-interactive-99",
+ "llama2-70b-interactive-99.9",
+ "stable-diffusion-xl",
+ "mixtral-8x7b",
+ "llama3.1-405b",
+ "rgat",
+ "pointpainting",
+ ],
+ "required-scenarios-datacenter": {
+ "resnet": ["Server", "Offline"],
+ "retinanet": ["Server", "Offline"],
+ "dlrm-v2-99": ["Server", "Offline"],
+ "dlrm-v2-99.9": ["Server", "Offline"],
+ "3d-unet-99": ["Offline"],
+ "3d-unet-99.9": ["Offline"],
+ "gptj-99": ["Server", "Offline"],
+ "gptj-99.9": ["Server", "Offline"],
+ "llama2-70b-99": ["Server", "Offline"],
+ "llama2-70b-99.9": ["Server", "Offline"],
+ "llama2-70b-interactive-99": ["Server", "Offline"],
+ "llama2-70b-interactive-99.9": ["Server", "Offline"],
+ "stable-diffusion-xl": ["Server", "Offline"],
+ "mixtral-8x7b": ["Server", "Offline"],
+ "llama3.1-405b": ["Server", "Offline"],
+ "rgat": ["Offline"],
+ },
+ "optional-scenarios-datacenter": {},
+ "required-scenarios-edge": {
+ "resnet": ["SingleStream", "MultiStream", "Offline"],
+ "retinanet": ["SingleStream", "MultiStream", "Offline"],
+ "bert-99": ["SingleStream", "Offline"],
+ "bert-99.9": ["SingleStream", "Offline"],
+ "3d-unet-99": ["SingleStream", "Offline"],
+ "3d-unet-99.9": ["SingleStream", "Offline"],
+ "gptj-99": ["SingleStream", "Offline"],
+ "gptj-99.9": ["SingleStream", "Offline"],
+ "stable-diffusion-xl": ["SingleStream", "Offline"],
+ "pointpainting": ["SingleStream"],
+ },
+ "optional-scenarios-edge": {},
+ "required-scenarios-datacenter-edge": {
+ "resnet": ["SingleStream", "Offline", "MultiStream", "Server"],
+ "retinanet": ["SingleStream", "Offline", "MultiStream", "Server"],
+ "bert-99": ["SingleStream", "Offline"],
+ "bert-99.9": ["SingleStream", "Offline"],
+ "dlrm-v2-99": ["Offline", "Server"],
+ "dlrm-v2-99.9": ["Offline", "Server"],
+ "3d-unet-99": ["SingleStream", "Offline"],
+ "3d-unet-99.9": ["SingleStream", "Offline"],
+ "gptj-99": ["SingleStream", "Offline", "Server"],
+ "gptj-99.9": ["SingleStream", "Offline", "Server"],
+ "llama2-70b-99": ["Server", "Offline"],
+ "llama2-70b-99.9": ["Server", "Offline"],
+ "llama2-70b-interactive-99": ["Server", "Offline"],
+ "llama2-70b-interactive-99.9": ["Server", "Offline"],
+ "stable-diffusion-xl": ["SingleStream", "Offline", "Server"],
+ "mixtral-8x7b": ["Server", "Offline"],
+ "llama3.1-405b": ["Server", "Offline"],
+ "rgat": ["Offline"],
+ "pointpainting": ["SingleStream"],
+ },
+ "optional-scenarios-datacenter-edge": {},
+ "accuracy-target": {
+ "resnet": ("acc", 76.46 * 0.99),
+ "retinanet": ("mAP", 37.55 * 0.99),
+ "bert-99": ("F1", 90.874 * 0.99),
+ "bert-99.9": ("F1", 90.874 * 0.999),
+ "dlrm-v2-99": ("AUC", 80.31 * 0.99),
+ "dlrm-v2-99.9": ("AUC", 80.31 * 0.999),
+ "3d-unet-99": ("DICE", 0.86170 * 0.99),
+ "3d-unet-99.9": ("DICE", 0.86170 * 0.999),
+
+ "gptj-99": (
+ "ROUGE1",
+ 42.9865 * 0.99,
+ "ROUGE2",
+ 20.1235 * 0.99,
+ "ROUGEL",
+ 29.9881 * 0.99,
+ "GEN_LEN",
+ 4016878 * 0.9,
+ ),
+ "gptj-99.9": (
+ "ROUGE1",
+ 42.9865 * 0.999,
+ "ROUGE2",
+ 20.1235 * 0.999,
+ "ROUGEL",
+ 29.9881 * 0.999,
+ "GEN_LEN",
+ 4016878 * 0.9,
+ ),
+ "llama2-70b-99": (
+ "ROUGE1",
+ 44.4312 * 0.99,
+ "ROUGE2",
+ 22.0352 * 0.99,
+ "ROUGEL",
+ 28.6162 * 0.99,
+ "TOKENS_PER_SAMPLE",
+ 294.45 * 0.9,
+ ),
+ "llama2-70b-99.9": (
+ "ROUGE1",
+ 44.4312 * 0.999,
+ "ROUGE2",
+ 22.0352 * 0.999,
+ "ROUGEL",
+ 28.6162 * 0.999,
+ "TOKENS_PER_SAMPLE",
+ 294.45 * 0.9,
+ ),
+ "llama2-70b-interactive-99": (
+ "ROUGE1",
+ 44.4312 * 0.99,
+ "ROUGE2",
+ 22.0352 * 0.99,
+ "ROUGEL",
+ 28.6162 * 0.99,
+ "TOKENS_PER_SAMPLE",
+ 294.45 * 0.9,
+ ),
+ "llama2-70b-interactive-99.9": (
+ "ROUGE1",
+ 44.4312 * 0.999,
+ "ROUGE2",
+ 22.0352 * 0.999,
+ "ROUGEL",
+ 28.6162 * 0.999,
+ "TOKENS_PER_SAMPLE",
+ 294.45 * 0.9,
+ ),
+ "stable-diffusion-xl": (
+ "CLIP_SCORE",
+ 31.68631873,
+ "FID_SCORE",
+ 23.01085758,
+ ),
+ "mixtral-8x7b": (
+ "ROUGE1",
+ 45.5989 * 0.99,
+ "ROUGE2",
+ 23.3526 * 0.99,
+ "ROUGEL",
+ 30.4608 * 0.99,
+ "TOKENS_PER_SAMPLE",
+ 144.84 * 0.9,
+ "gsm8k_accuracy",
+ 73.66 * 0.99,
+ "mbxp_accuracy",
+ 60.16 * 0.99,
+ ),
+ "llama3.1-405b": (
+ "ROUGEL",
+ 21.6666 * 0.99,
+ "exact_match",
+ 90.1335 * 0.99,
+ "TOKENS_PER_SAMPLE",
+ 684.68 * 0.9,
+ ),
+ "rgat": ("acc", 0.7286 * 0.99),
+ "pointpainting": ("mAP", 0.5425 * 0.999),
+ },
+ "accuracy-upper-limit": {
+ "stable-diffusion-xl": (
+ "CLIP_SCORE",
+ 31.81331801,
+ "FID_SCORE",
+ 23.95007626,
+ ),
+ "llama2-70b-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
+ "llama2-70b-99.9": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
+ "llama2-70b-interactive-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
+ "llama2-70b-interactive-99.9": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
+ "mixtral-8x7b": ("TOKENS_PER_SAMPLE", 145.9 * 1.1),
+ "llama3.1-405b": ("TOKENS_PER_SAMPLE", 684.68 * 1.1),
+ },
+ "accuracy-delta-perc": {
+ "stable-diffusion-xl": {"CLIP_SCORE": 1, "FID_SCORE": 2}
+ },
+ "performance-sample-count": {
+ "resnet": 1024,
+ "retinanet": 64,
+ "bert-99": 10833,
+ "bert-99.9": 10833,
+ "dlrm-v2-99": 204800,
+ "dlrm-v2-99.9": 204800,
+ "3d-unet-99": 43,
+ "3d-unet-99.9": 43,
+ "gptj-99": 13368,
+ "gptj-99.9": 13368,
+ "llama2-70b-99": 24576,
+ "llama2-70b-99.9": 24576,
+ "llama2-70b-interactive-99": 24576,
+ "llama2-70b-interactive-99.9": 24576,
+ "stable-diffusion-xl": 5000,
+ "mixtral-8x7b": 15000,
+ "llama3.1-405b": 8313,
+ "rgat": 788379,
+ "pointpainting": 1024,
+ },
+ "dataset-size": {
+ "resnet": 50000,
+ "retinanet": 24781,
+ "bert-99": 10833,
+ "bert-99.9": 10833,
+ "dlrm-v2-99": 330067,
+ "dlrm-v2-99.9": 330067,
+ "3d-unet-99": 43,
+ "3d-unet-99.9": 43,
+ "gptj-99": 13368,
+ "gptj-99.9": 13368,
+ "llama2-70b-99": 24576,
+ "llama2-70b-99.9": 24576,
+ "llama2-70b-interactive-99": 24576,
+ "llama2-70b-interactive-99.9": 24576,
+ "stable-diffusion-xl": 5000,
+ "mixtral-8x7b": 15000,
+ "llama3.1-405b": 8313,
+ "rgat": 788379,
+ "pointpainting": 39987,
+ },
+ # model_mapping.json is expected in the root directory of the
+ # submission folder for open submissions and so the below dictionary is
+ # not really needed
+ "model_mapping": {
+ # map model names to the official mlperf model class
+ "ssd-resnet34": "retinanet",
+ "mobilenet": "resnet",
+ "resnet50": "resnet",
+ "llama3_1-405b": "llama3.1-405b",
+ },
+ "seeds": {
+ # TODO: Update random seeds
+ "qsl_rng_seed": 6023615788873153749,
+ "sample_index_rng_seed": 15036839855038426416,
+ "schedule_rng_seed": 9933818062894767841,
+ },
+ "ignore_errors": [],
+ "latency-constraint": {
+ "resnet": {"Server": 15000000},
+ "retinanet": {"Server": 100000000},
+ "dlrm-v2-99": {"Server": 60000000},
+ "dlrm-v2-99.9": {"Server": 60000000},
+ "gptj-99": {"Server": 20000000000},
+ "gptj-99.9": {"Server": 20000000000},
+ "stable-diffusion-xl": {"Server": 20000000000},
+ "llama2-70b-99": {"Server": 20000000000},
+ "llama2-70b-99.9": {"Server": 20000000000},
+ "llama2-70b-interactive-99": {"Server": 20000000000},
+ "llama2-70b-interactive-99.9": {"Server": 20000000000},
+ "mixtral-8x7b": {"Server": 20000000000},
+ "llama3.1-405b": {"Server": 60000000000}
+ },
+ "min-queries": {
+ "resnet": {
+ "SingleStream": 1024,
+ "MultiStream": 270336,
+ "Server": 270336,
+ "Offline": 1,
+ },
+ "retinanet": {
+ "SingleStream": 1024,
+ "MultiStream": 270336,
+ "Server": 270336,
+ "Offline": 1,
+ },
+ "bert-99": {"SingleStream": 1024, "Offline": 1},
+ "bert-99.9": {"SingleStream": 1024, "Offline": 1},
+ "dlrm-v2-99": {"Server": 270336, "Offline": 1},
+ "dlrm-v2-99.9": {"Server": 270336, "Offline": 1},
+ "3d-unet-99": {"SingleStream": 1024, "Offline": 1},
+ "3d-unet-99.9": {"SingleStream": 1024, "Offline": 1},
+ "gptj-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "gptj-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "llama2-70b-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "llama2-70b-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "llama2-70b-interactive-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "llama2-70b-interactive-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "stable-diffusion-xl": {
+ "SingleStream": 1024,
+ "Server": 270336,
+ "Offline": 1,
+ },
+ "mixtral-8x7b": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "llama3.1-405b": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "rgat": {"SingleStream": 1024, "Offline": 1},
+ "pointpainting": {"SingleStream": 1024},
+ },
+ "models_TEST01": [
+ "resnet",
+ "retinanet",
+ "bert-99",
+ "bert-99.9",
+ "dlrm-v2-99",
+ "dlrm-v2-99.9",
+ "3d-unet-99",
+ "3d-unet-99.9",
+ "stable-diffusion-xl",
+ "rgat",
+ "pointpainting",
+ ],
+ "models_TEST04": [
+ "resnet",
+ "stable-diffusion-xl",
+ "pointpainting",
+ ],
+ "models_TEST06": [
+ "llama2-70b-99",
+ "llama2-70b-99.9",
+ "llama2-70b-interactive-99",
+ "llama2-70b-interactive-99.9",
+ "gptj-99",
+ "gptj-99.9",
+ "mixtral-8x7b",
+ ]
+ },
+ "v5.1": {
+ "models": [
+ "resnet",
+ "retinanet",
+ "bert-99",
+ "bert-99.9",
+ "dlrm-v2-99",
+ "dlrm-v2-99.9",
+ "3d-unet-99",
+ "3d-unet-99.9",
+ "llama3.1-8b",
+ "llama3.1-8b-edge",
+ "llama2-70b-99",
+ "llama2-70b-99.9",
+ "stable-diffusion-xl",
+ "mixtral-8x7b",
+ "llama3.1-405b",
+ "rgat",
+ "pointpainting",
+ "deepseek-r1",
+ "whisper",
+ ],
+ "required-scenarios-datacenter": {
+ "retinanet": ["Server", "Offline"],
+ "dlrm-v2-99": ["Server", "Offline"],
+ "dlrm-v2-99.9": ["Server", "Offline"],
+ "3d-unet-99": ["Offline"],
+ "3d-unet-99.9": ["Offline"],
+ "llama3.1-8b": ["Offline"],
+ "llama2-70b-99": ["Offline"],
+ "llama2-70b-99.9": ["Offline"],
+ "stable-diffusion-xl": ["Server", "Offline"],
+ "mixtral-8x7b": ["Server", "Offline"],
+ "llama3.1-405b": ["Offline"],
+ "rgat": ["Offline"],
+ "deepseek-r1": ["Server", "Offline"],
+ "whisper": ["Offline"],
+ },
+ "optional-scenarios-datacenter": {
+ "llama2-70b-99": ["Interactive", "Server"],
+ "llama2-70b-99.9": ["Interactive", "Server"],
+ "llama3.1-405b": ["Interactive", "Server"],
+ "llama3.1-8b": ["Interactive", "Server"],
+ },
+ "required-scenarios-edge": {
+ "resnet": ["SingleStream", "MultiStream", "Offline"],
+ "retinanet": ["SingleStream", "MultiStream", "Offline"],
+ "bert-99": ["SingleStream", "Offline"],
+ "bert-99.9": ["SingleStream", "Offline"],
+ "3d-unet-99": ["SingleStream", "Offline"],
+ "3d-unet-99.9": ["SingleStream", "Offline"],
+ "llama3.1-8b-edge": ["SingleStream", "Offline"],
+ "stable-diffusion-xl": ["SingleStream", "Offline"],
+ "pointpainting": ["SingleStream"],
+ "whisper": ["Offline"],
+ },
+ "optional-scenarios-edge": {},
+ "required-scenarios-datacenter-edge": {
+ "resnet": ["SingleStream", "MultiStream", "Offline", "Server"],
+ "retinanet": ["SingleStream", "Offline", "MultiStream", "Server"],
+ "bert-99": ["SingleStream", "Offline"],
+ "bert-99.9": ["SingleStream", "Offline"],
+ "dlrm-v2-99": ["Offline", "Server"],
+ "dlrm-v2-99.9": ["Offline", "Server"],
+ "3d-unet-99": ["SingleStream", "Offline"],
+ "3d-unet-99.9": ["SingleStream", "Offline"],
+ "llama3.1-8b": ["Offline"],
+ "llama3.1-8b-edge": ["SingleStream", "Offline"],
+ "llama2-70b-99": ["Offline"],
+ "llama2-70b-99.9": ["Offline"],
+ "stable-diffusion-xl": ["SingleStream", "Offline", "Server"],
+ "mixtral-8x7b": ["Server", "Offline"],
+ "llama3.1-405b": ["Offline"],
+ "rgat": ["Offline"],
+ "pointpainting": ["SingleStream"],
+ "deepseek-r1": ["SingleStream", "Server", "Offline"],
+ "whisper": ["Offline"],
+ },
+ "optional-scenarios-datacenter-edge": {
+ "llama2-70b-99": ["Interactive", "Server"],
+ "llama2-70b-99.9": ["Interactive", "Server"],
+ "llama3.1-405b": ["Interactive", "Server"],
+ "llama3.1-8b": ["Interactive", "Server"],
+ },
+ "accuracy-target": {
+ "resnet": ("acc", 76.46 * 0.99),
+ "retinanet": ("mAP", 37.55 * 0.99),
+ "bert-99": ("F1", 90.874 * 0.99),
+ "bert-99.9": ("F1", 90.874 * 0.999),
+ "dlrm-v2-99": ("AUC", 80.31 * 0.99),
+ "dlrm-v2-99.9": ("AUC", 80.31 * 0.999),
+ "3d-unet-99": ("DICE", 0.86170 * 0.99),
+ "3d-unet-99.9": ("DICE", 0.86170 * 0.999),
+
+ "llama3.1-8b": (
+ "ROUGE1",
+ 38.7792 * 0.99,
+ "ROUGE2",
+ 15.9075 * 0.99,
+ "ROUGEL",
+ 24.4957 * 0.99,
+ "ROUGELSUM",
+ 35.793 * 0.99,
+ "GEN_LEN",
+ 8167644 * 0.9,
+ ),
+ "llama3.1-8b-edge": (
+ "ROUGE1",
+ 39.06 * 0.99,
+ "ROUGE2",
+ 16.1147 * 0.99,
+ "ROUGEL",
+ 24.6375 * 0.99,
+ "ROUGELSUM",
+ 36.124 * 0.99,
+ "GEN_LEN",
+ 3051113 * 0.9,
+ ),
+ "llama2-70b-99": (
+ "ROUGE1",
+ 44.4312 * 0.99,
+ "ROUGE2",
+ 22.0352 * 0.99,
+ "ROUGEL",
+ 28.6162 * 0.99,
+ "TOKENS_PER_SAMPLE",
+ 294.45 * 0.9,
+ ),
+ "llama2-70b-99.9": (
+ "ROUGE1",
+ 44.4312 * 0.999,
+ "ROUGE2",
+ 22.0352 * 0.999,
+ "ROUGEL",
+ 28.6162 * 0.999,
+ "TOKENS_PER_SAMPLE",
+ 294.45 * 0.9,
+ ),
+ "stable-diffusion-xl": (
+ "CLIP_SCORE",
+ 31.68631873,
+ "FID_SCORE",
+ 23.01085758,
+ ),
+ "mixtral-8x7b": (
+ "ROUGE1",
+ 45.5989 * 0.99,
+ "ROUGE2",
+ 23.3526 * 0.99,
+ "ROUGEL",
+ 30.4608 * 0.99,
+ "TOKENS_PER_SAMPLE",
+ 144.84 * 0.9,
+ "gsm8k_accuracy",
+ 73.66 * 0.99,
+ "mbxp_accuracy",
+ 60.16 * 0.99,
+ ),
+ "llama3.1-405b": (
+ "ROUGEL",
+ 21.6666 * 0.99,
+ "exact_match",
+ 90.1335 * 0.99,
+ "TOKENS_PER_SAMPLE",
+ 684.68 * 0.9,
+ ),
+ "rgat": ("acc", 0.7286 * 0.99),
+ "pointpainting": ("mAP", 0.5425 * 0.999),
+ "deepseek-r1": ("exact_match", 0.99 * 81.3582, "TOKENS_PER_SAMPLE", 0.9 * 3886.2274),
+ "whisper": ("ACCURACY", (100.0 - 2.0671) * 0.99),
+ },
+ "accuracy-upper-limit": {
+ "stable-diffusion-xl": (
+ "CLIP_SCORE",
+ 31.81331801,
+ "FID_SCORE",
+ 23.95007626,
+ ),
+ "llama2-70b-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
+ "llama2-70b-99.9": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
+ "mixtral-8x7b": ("TOKENS_PER_SAMPLE", 145.9 * 1.1),
+ "llama3.1-405b": ("TOKENS_PER_SAMPLE", 684.68 * 1.1),
+ "llama3.1-8b": ("GEN_LEN", 8167644 * 1.1),
+ "llama3.1-8b-edge": ("GEN_LEN", 3051113 * 1.1),
+ "deepseek-r1": ("TOKENS_PER_SAMPLE", 1.1 * 3886.2274)
+ },
+ "accuracy-delta-perc": {
+ "stable-diffusion-xl": {"CLIP_SCORE": 1, "FID_SCORE": 2}
+ },
+ "performance-sample-count": {
+ "resnet": 1024,
+ "retinanet": 64,
+ "bert-99": 10833,
+ "bert-99.9": 10833,
+ "dlrm-v2-99": 204800,
+ "dlrm-v2-99.9": 204800,
+ "3d-unet-99": 43,
+ "3d-unet-99.9": 43,
+ "llama3.1-8b": 13368,
+ "llama3.1-8b-edge": 5000,
+ "llama2-70b-99": 24576,
+ "llama2-70b-99.9": 24576,
+ "stable-diffusion-xl": 5000,
+ "mixtral-8x7b": 15000,
+ "llama3.1-405b": 8313,
+ "rgat": 788379,
+ "pointpainting": 1024,
+ "deepseek-r1": 4388,
+ "whisper": 1633,
+ },
+ "dataset-size": {
+ "resnet": 50000,
+ "retinanet": 24781,
+ "bert-99": 10833,
+ "bert-99.9": 10833,
+ "dlrm-v2-99": 330067,
+ "dlrm-v2-99.9": 330067,
+ "3d-unet-99": 43,
+ "3d-unet-99.9": 43,
+ "llama3.1-8b": 13368,
+ "llama3.1-8b-edge": 5000,
+ "llama2-70b-99": 24576,
+ "llama2-70b-99.9": 24576,
+ "stable-diffusion-xl": 5000,
+ "mixtral-8x7b": 15000,
+ "llama3.1-405b": 8313,
+ "rgat": 788379,
+ "pointpainting": 39987,
+ "deepseek-r1": 4388,
+ "whisper": 1633,
+ },
+ # model_mapping.json is expected in the root directory of the
+ # submission folder for open submissions and so the below dictionary is
+ # not really needed
+ "model_mapping": {
+ # map model names to the official mlperf model class
+ "ssd-resnet34": "retinanet",
+ "mobilenet": "resnet",
+ "resnet50": "resnet",
+ "llama3_1-405b": "llama3.1-405b",
+ "llama3_1-8b": "llama3.1-8b",
+ "llama3_1-8b-edge": "llama3.1-8b-edge",
+ },
+ "seeds": {
+ # TODO: Update random seeds
+ "qsl_rng_seed": 1780908523862526354,
+ "sample_index_rng_seed": 14771362308971278857,
+ "schedule_rng_seed": 18209322760996052031,
+ },
+ "ignore_errors": [],
+ "latency-constraint": {
+ "resnet": {"Server": 15000000},
+ "retinanet": {"Server": 100000000},
+ "dlrm-v2-99": {"Server": 60000000},
+ "dlrm-v2-99.9": {"Server": 60000000},
+ "llama3.1-8b": {"Server": 20000000000},
+ "stable-diffusion-xl": {"Server": 20000000000},
+ "llama2-70b-99": {"Server": 20000000000},
+ "llama2-70b-99.9": {"Server": 20000000000},
+ "mixtral-8x7b": {"Server": 20000000000},
+ "llama3.1-405b": {"Server": 60000000000},
+ "deepseek-r1": {"Server": 60000000000},
+ },
+ "min-queries": {
+ "resnet": {
+ "SingleStream": 1024,
+ "MultiStream": 270336,
+ "Server": 270336,
+ "Offline": 1,
+ },
+ "retinanet": {
+ "SingleStream": 1024,
+ "MultiStream": 270336,
+ "Server": 270336,
+ "Offline": 1,
+ },
+ "bert-99": {"SingleStream": 1024, "Offline": 1},
+ "bert-99.9": {"SingleStream": 1024, "Offline": 1},
+ "dlrm-v2-99": {"Server": 270336, "Offline": 1},
+ "dlrm-v2-99.9": {"Server": 270336, "Offline": 1},
+ "3d-unet-99": {"SingleStream": 1024, "Offline": 1},
+ "3d-unet-99.9": {"SingleStream": 1024, "Offline": 1},
+ "llama3.1-8b": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "llama3.1-8b-edge": {"SingleStream": 1024, "Offline": 1},
+ "llama2-70b-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "llama2-70b-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "stable-diffusion-xl": {
+ "SingleStream": 1024,
+ "Server": 270336,
+ "Offline": 1,
+ },
+ "mixtral-8x7b": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "llama3.1-405b": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "rgat": {"SingleStream": 1024, "Offline": 1},
+ "pointpainting": {"SingleStream": 1024},
+ "deepseek-r1": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+ "whisper": {"SingleStream": 1024, "Offline": 1},
+ },
+ "models_TEST01": [
+ "resnet",
+ "retinanet",
+ "bert-99",
+ "bert-99.9",
+ "dlrm-v2-99",
+ "dlrm-v2-99.9",
+ "3d-unet-99",
+ "3d-unet-99.9",
+ "stable-diffusion-xl",
+ "rgat",
+ "pointpainting",
+ "whisper",
+ ],
+ "models_TEST04": [
+ "resnet",
+ "stable-diffusion-xl",
+ "pointpainting",
+ ],
+ "models_TEST06": [
+ "llama2-70b-99",
+ "llama2-70b-99.9",
+ "llama2-70b-interactive-99",
+ "llama2-70b-interactive-99.9",
+ "llama3.1-405b",
+ "llama3.1-8b",
+ "llama3.1-8b-interactive",
+ "llama3.1-405b-interactive",
+ "mixtral-8x7b",
+ "deepseek-r1",
+ ]
+ },
+}
+
+# Allowed submission divisions and system availability categories.
+VALID_DIVISIONS = ["open", "closed", "network"]
+VALID_AVAILABILITIES = ["available", "preview", "rdi"]
+# Loadgen output files that must / may accompany a performance run.
+REQUIRED_PERF_FILES = ["mlperf_log_summary.txt", "mlperf_log_detail.txt"]
+OPTIONAL_PERF_FILES = ["mlperf_log_accuracy.json"]
+# Additional files required when the performance run is power-measured.
+REQUIRED_PERF_POWER_FILES = ["spl.txt"]
+REQUIRED_POWER_FILES = [
+    "client.json",
+    "client.log",
+    "ptd_logs.txt",
+    "server.json",
+    "server.log",
+]
+# Files required for an accuracy run.
+REQUIRED_ACC_FILES = [
+    "mlperf_log_summary.txt",
+    "mlperf_log_detail.txt",
+    "accuracy.txt",
+    "mlperf_log_accuracy.json",
+]
+# Benchmark- and version-specific accuracy artifacts. For stable-diffusion-xl
+# these are sample IDs; presumably the generated images for these IDs must be
+# present in the accuracy submission — TODO confirm against the checker's use.
+REQUIRED_ACC_BENCHMARK = {
+    "stable-diffusion-xl": {
+        "v5.0": {
+            "images": [
+                "4459",
+                "4015",
+                "2705",
+                "1682",
+                "4048",
+                "4683",
+                "3757",
+                "1578",
+                "3319",
+                "95",
+            ]
+        },
+        "v5.1": {
+            "images": [
+                "2747",
+                "2235",
+                "2165",
+                "1515",
+                "1538",
+                "1367",
+                "2419",
+                "4629",
+                "3657",
+                "4532",
+            ]
+        },
+    }
+}
+# Files required in the measurements directory (power runs allow any extension
+# for the analyzer/power-settings files, hence the globs).
+REQUIRED_MEASURE_FILES = ["user.conf", "README.md"]
+REQUIRED_POWER_MEASURE_FILES = ["analyzer_table.*", "power_settings.*"]
+# Unit-conversion factors.
+MS_TO_NS = 1000 * 1000
+S_TO_MS = 1000
+# Maximum size of any single submitted file, in megabytes.
+FILE_SIZE_LIMIT_MB = 50
+MB_TO_BYTES = 1024 * 1024
+# Accuracy-log size cap; units not evident here (10 KiB or, via MB_TO_BYTES,
+# 10 GiB?) — TODO confirm at the point of use.
+MAX_ACCURACY_LOG_SIZE = 10 * 1024
+# Legacy global minimum samples-per-query for Offline runs; superseded by the
+# per-benchmark OFFLINE_MIN_SPQ_SINCE_V4 table below.
+OFFLINE_MIN_SPQ = 24576
+# Minimum test duration in milliseconds (pre-1.0 vs current rules).
+TEST_DURATION_MS_PRE_1_0 = 60000
+TEST_DURATION_MS = 600000
+# Compliance-run performance files and TEST01 accuracy artifacts.
+REQUIRED_COMP_PER_FILES = ["mlperf_log_summary.txt", "mlperf_log_detail.txt"]
+REQUIRED_TEST01_ACC_FILES_1 = ["mlperf_log_accuracy.json", "accuracy.txt"]
+REQUIRED_TEST01_ACC_FILES = REQUIRED_TEST01_ACC_FILES_1 + [
+    "baseline_accuracy.txt",
+    "compliance_accuracy.txt",
+]
+
+# Per-benchmark minimum samples-per-query for Offline scenario runs,
+# applicable since v4.x (per the name) in place of the global OFFLINE_MIN_SPQ.
+OFFLINE_MIN_SPQ_SINCE_V4 = {
+    "resnet": 24576,
+    "retinanet": 24576,
+    "bert-99": 10833,
+    "bert-99.9": 10833,
+    "dlrm-v2-99": 24576,
+    "dlrm-v2-99.9": 24576,
+    "3d-unet-99": 43,
+    "3d-unet-99.9": 43,
+    "rnnt": 2513,
+    "llama3.1-8b": 13368,
+    "llama3.1-8b-edge": 5000,
+    "llama2-70b-99": 24576,
+    "llama2-70b-99.9": 24576,
+    "llama2-70b-interactive-99": 24576,
+    "llama2-70b-interactive-99.9": 24576,
+    "stable-diffusion-xl": 5000,
+    "mixtral-8x7b": 15000,
+    "llama3.1-405b": 8313,
+    "rgat": 788379,
+    "deepseek-r1": 4388,
+    "whisper": 1633,
+    "pointpainting": 6636,
+}
+
+# Canonicalize lower-cased scenario names to their official spelling.
+SCENARIO_MAPPING = {
+    "singlestream": "SingleStream",
+    "multistream": "MultiStream",
+    "server": "Server",
+    "offline": "Offline",
+    "interactive": "Interactive",
+}
+
+# Legacy headline-result field name per scenario, as printed in
+# mlperf_log_summary.txt (see RESULT_FIELD_NEW for the detail-log keys).
+RESULT_FIELD = {
+    "Offline": "Samples per second",
+    "SingleStream": "90th percentile latency (ns)",
+    "MultiStream": "Samples per query",
+    "Server": "Scheduled samples per second",
+}
+
+# Per-version detail-log key used to extract the headline result for each
+# scenario; identical across v5.0/v5.1/v6.0 but kept per-version so future
+# rounds can diverge without code changes.
+RESULT_FIELD_NEW = {
+    "v5.0": {
+        "Offline": "result_samples_per_second",
+        "SingleStream": "early_stopping_latency_ss",
+        "MultiStream": "early_stopping_latency_ms",
+        "Server": "result_completed_samples_per_sec",
+    },
+    "v5.1": {
+        "Offline": "result_samples_per_second",
+        "SingleStream": "early_stopping_latency_ss",
+        "MultiStream": "early_stopping_latency_ms",
+        "Server": "result_completed_samples_per_sec",
+    },
+    "v6.0": {
+        "Offline": "result_samples_per_second",
+        "SingleStream": "early_stopping_latency_ss",
+        "MultiStream": "early_stopping_latency_ms",
+        "Server": "result_completed_samples_per_sec",
+    },
+}
+
+# Benchmark-specific overrides of RESULT_FIELD_NEW: token-throughput
+# benchmarks report tokens/s instead of samples/s. Keyed by version, then
+# benchmark, then scenario.
+RESULT_FIELD_BENCHMARK_OVERWRITE = {
+    "v5.0": {
+        "llama2-70b-99": {
+            "Offline": "result_tokens_per_second",
+            "Server": "result_completed_tokens_per_second",
+        },
+        "llama2-70b-99.9": {
+            "Offline": "result_tokens_per_second",
+            "Server": "result_completed_tokens_per_second",
+        },
+        "llama2-70b-interactive-99": {
+            "Offline": "result_tokens_per_second",
+            "Server": "result_completed_tokens_per_second",
+        },
+        "llama2-70b-interactive-99.9": {
+            "Offline": "result_tokens_per_second",
+            "Server": "result_completed_tokens_per_second",
+        },
+        # gptj uses "inferred" token counts, unlike the other LLM benchmarks.
+        "gptj-99": {
+            "Offline": "result_inferred_tokens_per_second",
+            "Server": "result_inferred_completed_tokens_per_second",
+        },
+        "gptj-99.9": {
+            "Offline": "result_inferred_tokens_per_second",
+            "Server": "result_inferred_completed_tokens_per_second",
+        },
+        "mixtral-8x7b": {
+            "Offline": "result_tokens_per_second",
+            "Server": "result_completed_tokens_per_second",
+        },
+        "llama3.1-405b": {
+            "Offline": "result_tokens_per_second",
+            "Server": "result_completed_tokens_per_second",
+        },
+    },
+    "v5.1": {
+        "llama2-70b-99": {
+            "Offline": "result_tokens_per_second",
+            "Server": "result_completed_tokens_per_second",
+        },
+        "llama2-70b-99.9": {
+            "Offline": "result_tokens_per_second",
+            "Server": "result_completed_tokens_per_second",
+        },
+        "llama3.1-8b": {
+            "Offline": "result_tokens_per_second",
+            "Server": "result_completed_tokens_per_second",
+        },
+        "llama3.1-8b-edge": {
+            "Offline": "result_tokens_per_second",
+            "SingleStream": "result_90.00_percentile_latency_ns",
+        },
+        "mixtral-8x7b": {
+            "Offline": "result_tokens_per_second",
+            "Server": "result_completed_tokens_per_second",
+        },
+        "llama3.1-405b": {
+            "Offline": "result_tokens_per_second",
+            "Server": "result_completed_tokens_per_second",
+        },
+        "deepseek-r1": {
+            "Offline": "result_tokens_per_second",
+            "Server": "result_completed_tokens_per_second",
+        },
+        "whisper": {
+            "Offline": "result_tokens_per_second",
+        }
+    },
+}
+
+# Per-benchmark latency limits for LLM scenarios, written as ms * 1e6 so the
+# stored values are nanoseconds (matching MS_TO_NS above).
+#   ttft: time to first token; tpot: time per output token.
+LLM_LATENCY_LIMITS = {
+    "llama2-70b-99": {
+        "Server": {
+            "ttft": 2000 * 1000000, "tpot": 200 * 1000000
+        },
+        "Interactive": {
+            "ttft": 450 * 1000000, "tpot": 40 * 1000000
+        },
+    },
+    "llama2-70b-99.9": {
+        "Server": {
+            "ttft": 2000 * 1000000, "tpot": 200 * 1000000
+        },
+        "Interactive": {
+            "ttft": 450 * 1000000, "tpot": 40 * 1000000
+        },
+    },
+    # The *-interactive-* variants are separate benchmark names used in v5.0;
+    # their Server limits equal the Interactive limits of the base benchmark.
+    "llama2-70b-interactive-99": {
+        "Server": {
+            "ttft": 450 * 1000000, "tpot": 40 * 1000000
+        },
+    },
+    # for v5.0
+    "llama2-70b-interactive-99.9": {
+        "Server": {
+            "ttft": 450 * 1000000, "tpot": 40 * 1000000
+        },
+    },
+    "mixtral-8x7b": {
+        "Server": {
+            "ttft": 2000 * 1000000, "tpot": 200 * 1000000
+        }
+    },
+    "llama3.1-405b": {
+        "Server": {
+            "ttft": 6000 * 1000000, "tpot": 175 * 1000000
+        },
+        "Interactive": {
+            "ttft": 4500 * 1000000, "tpot": 80 * 1000000
+        },
+    },
+    "llama3.1-8b": {
+        "Server": {
+            "ttft": 2000 * 1000000, "tpot": 100 * 1000000
+        },
+        "Interactive": {
+            "ttft": 500 * 1000000, "tpot": 30 * 1000000
+        }
+    },
+    "deepseek-r1": {
+        "Server": {
+            "ttft": 2000 * 1000000, "tpot": 80 * 1000000
+        }
+    }
+
+}
+
+# Regexes used to pull each accuracy metric out of accuracy.txt; every
+# pattern captures the numeric value in group 1. Keys correspond to the
+# metric names used in the per-version "accuracy-target" tables.
+ACC_PATTERN = {
+    "acc": r"^(?:\{\"accuracy|accuracy)[\": ]*=?\s*([\d\.]+).*",
+    "meanAcc": r".*'mean-accuracy':\s+'?([\d.]+)'?.*",
+    "AUC": r"^AUC=([\d\.]+).*",
+    "mAP": r".*(?:mAP=|'Total':)\s*([\d.]+)",
+    "bleu": r"^BLEU\:\s*([\d\.]+).*",
+    "F1": r"^{[\"\']exact_match[\"\']\:\s*[\d\.]+,\s*[\"\']f1[\"\']\:\s*([\d\.]+)}",
+    "ACCURACY": r"Word Error Rate\:.*, accuracy=([0-9\.]+)%",
+    "DICE": r"Accuracy\:\s*mean\s*=\s*([\d\.]+).*",
+    "ROUGE1": r".*'rouge1':\s+'?([\d.]+)'?.*",
+    "ROUGE2": r".*'rouge2':\s+'?([\d.]+)'?.*",
+    "ROUGEL": r".*'rougeL':\s+'?([\d.]+)'?.*",
+    "ROUGELSUM": r".*'rougeLsum':\s+'?([\d.]+)'?.*",
+    "GEN_LEN": r".*'gen_len':\s([\d.]+).*",
+    "TOKENS_PER_SAMPLE": r".*'tokens_per_sample':\s([\d.]+).*",
+    "CLIP_SCORE": r".*'CLIP_SCORE':\s+'?([\d.]+).*",
+    "FID_SCORE": r".*'FID_SCORE':\s+'?([\d.]+).*",
+    "gsm8k_accuracy": r".*'gsm8k':\s([\d.]+).*",
+    "mbxp_accuracy": r".*'mbxp':\s([\d.]+).*",
+    "exact_match": r".*'exact_match':\s([\d.]+).*"
+}
+
+# Keys that must be present in the system-description JSON (systems/*.json).
+SYSTEM_DESC_REQUIRED_FIELDS = [
+    "division",
+    "submitter",
+    "status",
+    "system_name",
+    "number_of_nodes",
+    "host_processor_model_name",
+    "host_processors_per_node",
+    "host_processor_core_count",
+    "host_memory_capacity",
+    "host_storage_capacity",
+    "host_storage_type",
+    "accelerators_per_node",
+    "accelerator_model_name",
+    "accelerator_memory_capacity",
+    "framework",
+    "operating_system",
+    "system_type",
+    "other_software_stack",
+    "host_processor_frequency",
+    "host_processor_caches",
+    "host_memory_configuration",
+    "host_processor_interconnect",
+    "host_networking",
+    "host_networking_topology",
+    "accelerator_frequency",
+    "accelerator_host_interconnect",
+    "accelerator_interconnect",
+    "accelerator_interconnect_topology",
+    "accelerator_memory_configuration",
+    "accelerator_on-chip_memories",
+    "cooling",
+    "hw_notes",
+    "sw_notes",
+    "host_network_card_count",
+    "system_type_detail",
+    # "network_speed_mbit",
+]
+
+# Subset of system-description keys whose values must be non-trivial (not
+# just present) — presumably checked against placeholder/empty answers.
+SYSTEM_DESC_MEANINGFUL_RESPONSE_REQUIRED_FIELDS = [
+    "division",
+    "submitter",
+    "system_type",
+    "status",
+    "system_name",
+    "number_of_nodes",
+    "host_processor_model_name",
+    "host_processors_per_node",
+    "host_processor_core_count",
+    "host_memory_capacity",
+    "host_memory_configuration",
+    "host_storage_capacity",
+    "host_storage_type",
+    "host_networking",
+    "host_network_card_count",
+    "host_networking_topology",
+    "accelerators_per_node",
+    "accelerator_model_name",
+    "accelerator_memory_capacity",
+    "accelerator_host_interconnect",
+    "accelerator_memory_configuration",
+    "accelerator_interconnect",
+    "cooling",
+    "framework",
+    "operating_system",
+    "other_software_stack",
+]
+
+# Keys whose values must be numeric; currently none are enforced.
+SYSTEM_DESC_NUMERIC_RESPONSE_REQUIRED_FIELDS = [
+    # "network_speed_mbit"
+]
+
+
+# Additional system-description keys required for power submissions.
+SYSTEM_DESC_REQUIRED_FIELDS_POWER = [
+    "power_management",
+    "filesystem",
+    "boot_firmware_version",
+    "management_firmware_version",
+    "other_hardware",
+    "number_of_type_nics_installed",
+    "nics_enabled_firmware",
+    "nics_enabled_os",
+    "nics_enabled_connected",
+    "network_speed_mbit",
+    "power_supply_quantity_and_rating_watts",
+    "power_supply_details",
+    "disk_drives",
+    "disk_controllers",
+    "system_power_only",
+]
+
+# No power-specific fields currently require meaningful (non-placeholder)
+# values.
+SYSTEM_DESC_MEANINGFUL_RESPONSE_REQUIRED_FIELDS_POWER = []
+
+# Network-division submissions: the flag key plus the keys that must
+# accompany it in the system description.
+SYSTEM_DESC_IS_NETWORK_MODE = "is_network"
+SYSTEM_DESC_REQUIRED_FIELDS_NETWORK_MODE = [
+    SYSTEM_DESC_IS_NETWORK_MODE,
+    "network_type",
+    "network_media",
+    "network_rate",
+    "nic_loadgen",
+    "number_nic_loadgen",
+    "net_software_stack_loadgen",
+    "network_protocol",
+    "number_connections",
+    "nic_sut",
+    "number_nic_sut",
+    "net_software_stack_sut",
+    "network_topology",
+]
+# Network-mode SUT names must contain this substring.
+NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME = "Network SUT"
+
+# Keys required in the per-benchmark implementation description (despite the
+# "_FILES" name these are metadata fields, not file names).
+SYSTEM_IMP_REQUIRED_FILES = [
+    "input_data_types",
+    "retraining",
+    "starting_weights_filename",
+    "weight_data_types",
+    "weight_transformations",
+]
+
+# Result-unit overrides for token-throughput benchmarks; anything not listed
+# here falls back to UNIT_DICT.
+SPECIAL_UNIT_DICT = {
+    "llama3.1-8b": {
+        "Offline": "Tokens/s",
+        "Server": "Tokens/s",
+    },
+    "llama3.1-8b-edge": {
+        "Offline": "Tokens/s",
+    },
+    "llama2-70b-99": {
+        "Offline": "Tokens/s",
+        "Server": "Tokens/s",
+        "Interactive": "Tokens/s",
+    },
+    "llama2-70b-99.9": {
+        "Offline": "Tokens/s",
+        "Server": "Tokens/s",
+        "Interactive": "Tokens/s",
+    },
+    "mixtral-8x7b": {
+        "Offline": "Tokens/s",
+        "Server": "Tokens/s",
+        "Interactive": "Tokens/s",
+    },
+    "llama3.1-405b": {
+        "Offline": "Tokens/s",
+        "Server": "Tokens/s",
+        "Interactive": "Tokens/s",
+    },
+    "deepseek-r1": {
+        "Offline": "Tokens/s",
+        "Server": "Tokens/s",
+        "Interactive": "Tokens/s",
+    },
+}
+# Default result unit per scenario; both capitalized and lower-case scenario
+# spellings are accepted as keys.
+UNIT_DICT = {
+    "SingleStream": "Latency (ms)",
+    "MultiStream": "Latency (ms)",
+    "Offline": "Samples/s",
+    "Server": "Queries/s",
+    "Interactive": "Queries/s",
+
+    "singlestream": "Latency (ms)",
+    "multistream": "Latency (ms)",
+    "offline": "Samples/s",
+    "server": "Queries/s",
+    "interactive": "Queries/s",
+}
+# Power-measurement unit per scenario (energy for latency scenarios,
+# average power for throughput scenarios).
+POWER_UNIT_DICT = {
+    "SingleStream": "millijoules",
+    "MultiStream": "millijoules",
+    "Offline": "Watts",
+    "Server": "Watts",
+    "Interactive": "Watts",
+
+    "singlestream": "millijoules",
+    "multistream": "millijoules",
+    "offline": "Watts",
+    "server": "Watts",
+    "interactive": "Watts",
+}
+
+
+PERFORMANCE_LOG_PATH = {
+ "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/mlperf_log_detail.txt",
+ "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/mlperf_log_detail.txt",
+ "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/mlperf_log_detail.txt",
+ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/mlperf_log_detail.txt",
+}
+
+PERFORMANCE_SUMMARY_PATH = {
+ "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/mlperf_log_summary.txt",
+ "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/mlperf_log_summary.txt",
+ "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/mlperf_log_summary.txt",
+ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/mlperf_log_summary.txt",
+}
+
+ACCURACY_LOG_PATH = {
+ "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_detail.txt",
+ "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_detail.txt",
+ "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_detail.txt",
+ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_detail.txt",
+}
+
+ACCURACY_RESULT_PATH = {
+ "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/accuracy.txt",
+ "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/accuracy.txt",
+ "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/accuracy.txt",
+ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/accuracy.txt",
+}
+
+ACCURACY_JSON_PATH = {
+ "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_accuracy.json",
+ "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_accuracy.json",
+ "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_accuracy.json",
+ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_accuracy.json",
+}
+
+POWER_DIR_PATH = {
+ "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/power",
+ "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/power",
+ "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/power",
+ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/power",
+}
+
+MEASUREMENTS_PATH = {
+ "v5.0": "{division}/{submitter}/measurements/{system}/{benchmark}/{scenario}/{file}",
+ "v5.1": "{division}/{submitter}/measurements/{system}/{benchmark}/{scenario}/{file}",
+ "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/measurements.json",
+ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/measurements.json",
+}
+
+TEST01_PERF_PATH = {
+ "v5.0": "{division}/{submitter}/compliance/{system}/{benchmark}/{scenario}/TEST01/performance/run_1/mlperf_log_detail.txt",
+ "v5.1": "{division}/{submitter}/compliance/{system}/{benchmark}/{scenario}/TEST01/performance/run_1/mlperf_log_detail.txt",
+ "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/TEST01/performance/run_1/mlperf_log_detail.txt",
+ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/TEST01/performance/run_1/mlperf_log_detail.txt",
+}
+
+TEST01_ACC_PATH = {
+ "v5.0": "{division}/{submitter}/compliance/{system}/{benchmark}/{scenario}/TEST01/verify_accuracy.txt",
+ "v5.1": "{division}/{submitter}/compliance/{system}/{benchmark}/{scenario}/TEST01/verify_accuracy.txt",
+ "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/TEST01/verify_accuracy.txt",
+ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/TEST01/verify_accuracy.txt",
+}
+
+TEST04_PERF_PATH = {
+ "v5.0": "{division}/{submitter}/compliance/{system}/{benchmark}/{scenario}/TEST04/performance/run_1/mlperf_log_detail.txt",
+ "v5.1": "{division}/{submitter}/compliance/{system}/{benchmark}/{scenario}/TEST04/performance/run_1/mlperf_log_detail.txt",
+ "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/TEST04/performance/run_1/mlperf_log_detail.txt",
+ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/TEST04/performance/run_1/mlperf_log_detail.txt",
+}
+
+TEST04_ACC_PATH = {
+ "v5.0": "{division}/{submitter}/compliance/{system}/{benchmark}/{scenario}/TEST04/verify_accuracy.txt",
+ "v5.1": "{division}/{submitter}/compliance/{system}/{benchmark}/{scenario}/TEST04/verify_accuracy.txt",
+ "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/TEST04/verify_accuracy.txt",
+ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/TEST04/verify_accuracy.txt",
+}
+
+TEST06_ACC_PATH = {
+ "v5.0": "{division}/{submitter}/compliance/{system}/{benchmark}/{scenario}/TEST06/verify_accuracy.txt",
+ "v5.1": "{division}/{submitter}/compliance/{system}/{benchmark}/{scenario}/TEST06/verify_accuracy.txt",
+ "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/TEST06/verify_accuracy.txt",
+ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/TEST06/verify_accuracy.txt",
+}
+
+COMPLIANCE_PATH = {
+ "v5.0": "{division}/{submitter}/compliance/{system}/{benchmark}/{scenario}/",
+ "v5.1": "{division}/{submitter}/compliance/{system}/{benchmark}/{scenario}/",
+ "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/",
+ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/",
+}
+
+SYSTEM_PATH = {
+ "v5.0": "{division}/{submitter}/systems/{system}.json",
+ "v5.1": "{division}/{submitter}/systems/{system}.json",
+ "v6.0": "{division}/{submitter}/systems/{system}.json",
+ "default": "{division}/{submitter}/systems/{system}.json",
+}
+
+SRC_PATH = {
+ "v5.0": "{division}/{submitter}/code",
+ "v5.1": "{division}/{submitter}/code",
+ "v6.0": "{division}/{submitter}/src",
+ "default": "{division}/{submitter}/src",
+}
diff --git a/tools/submission/submission_checker/loader.py b/tools/submission/submission_checker/loader.py
new file mode 100644
index 0000000000..99b028d8d0
--- /dev/null
+++ b/tools/submission/submission_checker/loader.py
@@ -0,0 +1,364 @@
+import os
+from .constants import *
+from .utils import list_dir
+from .parsers.loadgen_parser import LoadgenParser
+from typing import Generator, Literal
+from .utils import *
+import logging
+import json
+
+logging.basicConfig(
+ level=logging.INFO,
+ format="[%(asctime)s %(filename)s:%(lineno)d %(levelname)s] %(message)s",
+)
+
+
+class SubmissionLogs:
+    """Container for parsed submission log artifacts and metadata.
+
+    The `SubmissionLogs` class holds references to parsed log files and
+    associated metadata for a single submission. It serves as a data
+    transfer object passed between loading and validation phases.
+    """
+
+    def __init__(self, performance_log=None, accuracy_log=None, accuracy_result=None,
+                 accuracy_json=None, system_json=None, measurements_json=None, loader_data=None) -> None:
+        """Initialize the submission logs container.
+
+        Args:
+            performance_log: Parsed performance log object.
+            accuracy_log: Parsed accuracy log object.
+            accuracy_result (list[str]): Accuracy result file lines.
+            accuracy_json (str): Accuracy JSON file path.
+            system_json (dict): System description JSON contents.
+            measurements_json (dict): Measurements JSON contents.
+            loader_data (dict): Metadata dictionary with paths and info.
+                Defaults to a fresh empty dict (a ``{}`` default argument
+                would be shared across every instance and is a classic
+                Python mutable-default bug).
+        """
+        self.performance_log = performance_log
+        self.accuracy_log = accuracy_log
+        self.accuracy_result = accuracy_result
+        self.accuracy_json = accuracy_json
+        self.system_json = system_json
+        # Create a new dict per instance instead of sharing one default.
+        self.loader_data = {} if loader_data is None else loader_data
+        self.measurements_json = measurements_json
+
+
+class Loader:
+    """Loads and parses submission artifacts from the filesystem.
+
+    The `Loader` class traverses the submission directory structure,
+    identifies valid submissions, and parses their log files and metadata.
+    It yields `SubmissionLogs` objects for each valid submission found,
+    handling version-specific path formats and optional artifacts.
+    """
+
+    def __init__(self, root, version) -> None:
+        """Initialize the submission loader.
+
+        Resolves every version-dependent path template once, up front,
+        so the rest of the loader only does string formatting.
+
+        Args:
+            root (str): Root directory containing submissions.
+            version (str): MLPerf version for path resolution.
+        """
+        self.root = root
+        self.version = version
+        self.logger = logging.getLogger("LoadgenParser")
+
+        def resolve(template_table):
+            # Pick the version-specific template, falling back to "default",
+            # and anchor it at the submission root.
+            return os.path.join(
+                self.root, template_table.get(version, template_table["default"]))
+
+        self.perf_log_path = resolve(PERFORMANCE_LOG_PATH)
+        self.perf_summary_path = resolve(PERFORMANCE_SUMMARY_PATH)
+        self.acc_log_path = resolve(ACCURACY_LOG_PATH)
+        self.acc_result_path = resolve(ACCURACY_RESULT_PATH)
+        self.acc_json_path = resolve(ACCURACY_JSON_PATH)
+        self.system_log_path = resolve(SYSTEM_PATH)
+        self.measurements_path = resolve(MEASUREMENTS_PATH)
+        self.compliance_path = resolve(COMPLIANCE_PATH)
+        self.test01_perf_path = resolve(TEST01_PERF_PATH)
+        self.test01_acc_path = resolve(TEST01_ACC_PATH)
+        self.test04_perf_path = resolve(TEST04_PERF_PATH)
+        self.test04_acc_path = resolve(TEST04_ACC_PATH)
+        self.test06_acc_path = resolve(TEST06_ACC_PATH)
+        self.power_dir_path = resolve(POWER_DIR_PATH)
+        self.src_path = resolve(SRC_PATH)
+
+    def get_measurement_path(self, path, division,
+                             submitter, system, benchmark, scenario):
+        """Resolve the measurements JSON file path with dynamic filename.
+
+        For paths containing '{file}', searches the measurements directory
+        for JSON files matching the system and scenario. A scenario-specific
+        file ('<system>*_<scenario>.json') is preferred over a generic
+        '<system>*.json' regardless of directory listing order; the previous
+        first-match-wins loop could pick the generic file when it happened
+        to be listed first.
+
+        Args:
+            path (str): Template path that may contain '{file}'.
+            division (str): Submission division.
+            submitter (str): Submitter name.
+            system (str): System name.
+            benchmark (str): Benchmark name.
+            scenario (str): Scenario name.
+
+        Returns:
+            str: Resolved path to the measurements JSON file.
+        """
+        if "{file}" in path:
+            search_dir = str(os.path.dirname(path)).format(
+                division=division,
+                submitter=submitter,
+                system=system,
+                benchmark=benchmark,
+                scenario=scenario)
+            measurements_file = None
+            generic_fallback = None
+            for name in list_files(search_dir):
+                if not (name.startswith(system) and name.endswith(".json")):
+                    continue
+                if name.endswith("_" + scenario + ".json"):
+                    # Exact scenario match always wins.
+                    measurements_file = name
+                    break
+                if generic_fallback is None:
+                    generic_fallback = name
+            if measurements_file is None:
+                measurements_file = generic_fallback
+            return path.format(division=division, submitter=submitter, system=system,
+                               benchmark=benchmark, scenario=scenario, file=measurements_file)
+        return path.format(division=division, submitter=submitter,
+                           system=system, benchmark=benchmark, scenario=scenario)
+
+    def load_single_log(self, path, log_type: Literal["Performance", "Accuracy",
+                        "AccuracyResult", "AccuracyJSON", "Test", "System", "Measurements"]):
+        """Load and parse a single log file based on its type.
+
+        Handles different log types with appropriate parsing: Loadgen logs
+        are parsed with LoadgenParser, JSON files are loaded as dicts,
+        accuracy results as line lists, etc.
+
+        Args:
+            path (str): Filesystem path to the log file.
+            log_type (str): Type of log to load, determining parsing method.
+
+        Returns:
+            Parsed log object (dict, list, LoadgenParser, or str) or None
+            if loading fails.
+        """
+        # Guard clause: missing files are reported but never raise.
+        if not os.path.exists(path):
+            self.logger.info(
+                "Could not load %s log from %s, path does not exist",
+                log_type,
+                path)
+            return None
+        self.logger.info("Loading %s log from %s", log_type, path)
+        if log_type in ("Performance", "Accuracy", "Test"):
+            return LoadgenParser(path)
+        if log_type in ("System", "Measurements"):
+            with open(path) as handle:
+                return json.load(handle)
+        if log_type == "AccuracyResult":
+            with open(path) as handle:
+                return handle.readlines()
+        if log_type == "AccuracyJSON":
+            # Callers only need the path; the JSON itself is read later.
+            return path
+        self.logger.info(
+            "Could not load %s log from %s, log type not recognized",
+            log_type,
+            path)
+        return None
+
+    def load(self) -> Generator[SubmissionLogs, None, None]:
+        """Traverse submissions directory and yield parsed log containers.
+
+        Iterates through the directory structure (division/submitter/results/
+        system/benchmark/scenario), formats paths for each submission,
+        loads all available logs, and yields a SubmissionLogs object.
+
+        Yields:
+            SubmissionLogs: Container with parsed logs and metadata for
+            each valid submission found.
+        """
+        for division in list_dir(self.root):
+            if division not in VALID_DIVISIONS:
+                continue
+            division_path = os.path.join(self.root, division)
+            for submitter in list_dir(division_path):
+                results_path = os.path.join(
+                    division_path, submitter, "results")
+                model_mapping = {}
+                mapping_path = os.path.join(
+                    division_path, submitter, "model_mapping.json")
+                # Custom model mapping is only allowed in the open division.
+                if division == "open" and os.path.exists(mapping_path):
+                    model_mapping = self.load_single_log(
+                        mapping_path, "System")
+                for system in list_dir(results_path):
+                    system_path = os.path.join(results_path, system)
+                    system_json_path = self.system_log_path.format(
+                        division=division, submitter=submitter, system=system)
+                    system_json = self.load_single_log(
+                        system_json_path, "System")
+                    for benchmark in list_dir(system_path):
+                        benchmark_path = os.path.join(system_path, benchmark)
+                        for scenario in list_dir(benchmark_path):
+                            # Fixed: previously joined `benchmark` a second
+                            # time, producing results/<system>/<bm>/<bm>.
+                            scenario_path = os.path.join(
+                                benchmark_path, scenario)
+                            # One kwargs dict for every template;
+                            # str.format() ignores unused keyword args, so
+                            # templates that need fewer fields still work.
+                            fmt = {
+                                "division": division,
+                                "submitter": submitter,
+                                "system": system,
+                                "benchmark": benchmark,
+                                "scenario": scenario,
+                            }
+                            perf_path = self.perf_log_path.format(**fmt)
+                            acc_path = self.acc_log_path.format(**fmt)
+                            acc_result_path = self.acc_result_path.format(
+                                **fmt)
+                            acc_json_path = self.acc_json_path.format(**fmt)
+                            power_dir_path = self.power_dir_path.format(**fmt)
+                            measurements_path = self.get_measurement_path(
+                                self.measurements_path,
+                                division=division,
+                                submitter=submitter,
+                                system=system,
+                                benchmark=benchmark,
+                                scenario=scenario)
+                            compliance_path = self.compliance_path.format(
+                                **fmt)
+                            test01_perf_path = self.test01_perf_path.format(
+                                **fmt)
+                            test01_acc_path = self.test01_acc_path.format(
+                                **fmt)
+                            test04_perf_path = self.test04_perf_path.format(
+                                **fmt)
+                            test04_acc_path = self.test04_acc_path.format(
+                                **fmt)
+                            test06_acc_path = self.test06_acc_path.format(
+                                **fmt)
+                            src_path = self.src_path.format(
+                                division=division, submitter=submitter)
+
+                            # Load logs (each may be None if missing).
+                            perf_log = self.load_single_log(
+                                perf_path, "Performance")
+                            acc_log = self.load_single_log(
+                                acc_path, "Accuracy")
+                            acc_result = self.load_single_log(
+                                acc_result_path, "AccuracyResult")
+                            acc_json = self.load_single_log(
+                                acc_json_path, "AccuracyJSON")
+                            measurements_json = self.load_single_log(
+                                measurements_path, "Measurements")
+
+                            # Load compliance-test logs.
+                            test01_perf_log = self.load_single_log(
+                                test01_perf_path, "Performance")
+                            test01_acc_result = self.load_single_log(
+                                test01_acc_path, "AccuracyResult")
+                            test04_perf_log = self.load_single_log(
+                                test04_perf_path, "Performance")
+                            test04_acc_result = self.load_single_log(
+                                test04_acc_path, "AccuracyResult")
+                            test06_acc_result = self.load_single_log(
+                                test06_acc_path, "AccuracyResult")
+
+                            loader_data = {
+                                # Submission info
+                                "division": division,
+                                "submitter": submitter,
+                                "system": system,
+                                "benchmark": benchmark,
+                                "scenario": scenario,
+                                # Submission paths
+                                "perf_path": perf_path,
+                                "acc_path": acc_path,
+                                "system_path": system_path,
+                                "measurements_path": measurements_path,
+                                "measurements_dir": os.path.dirname(measurements_path),
+                                "compliance_path": compliance_path,
+                                "model_mapping": model_mapping,
+                                "power_dir_path": power_dir_path,
+                                "src_path": src_path,
+                                # Test paths
+                                "TEST01_perf_path": test01_perf_path,
+                                "TEST01_acc_path": test01_acc_path,
+                                "TEST04_perf_path": test04_perf_path,
+                                "TEST04_acc_path": test04_acc_path,
+                                "TEST06_acc_path": test06_acc_path,
+                                # Test logs
+                                "TEST01_perf_log": test01_perf_log,
+                                "TEST01_acc_result": test01_acc_result,
+                                "TEST04_perf_log": test04_perf_log,
+                                "TEST04_acc_result": test04_acc_result,
+                                "TEST06_acc_result": test06_acc_result,
+                            }
+                            yield SubmissionLogs(perf_log, acc_log, acc_result, acc_json, system_json, measurements_json, loader_data)
diff --git a/tools/submission/submission_checker/main.py b/tools/submission/submission_checker/main.py
new file mode 100644
index 0000000000..6346549d45
--- /dev/null
+++ b/tools/submission/submission_checker/main.py
@@ -0,0 +1,365 @@
+import argparse
+import logging
+import os
+
+if __name__ == "__main__" and __package__ is None:
+ import sys
+ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+ __package__ = "submission_checker"
+
+from .constants import MODEL_CONFIG
+from .configuration.configuration import Config
+from .loader import Loader
+from .checks.performance_check import PerformanceCheck
+from .checks.accuracy_check import AccuracyCheck
+from .checks.system_check import SystemCheck
+from .checks.measurements_checks import MeasurementsCheck
+from .checks.compliance_check import ComplianceCheck
+from .checks.power_check import PowerCheck
+from .results import ResultExporter
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("main")
+
+
+def get_args():
+    """Parse command-line arguments for the submission checker.
+
+    Sets up an ArgumentParser with options for input directory, version,
+    filtering, output files, and various skip flags for different checks.
+
+    Returns:
+        argparse.Namespace: Parsed command-line arguments.
+    """
+    parser = argparse.ArgumentParser()
+    # Core inputs: where the submission tree lives and which ruleset applies.
+    parser.add_argument("--input", required=True, help="submission directory")
+    parser.add_argument(
+        "--version",
+        default="v5.1",
+        choices=list(MODEL_CONFIG.keys()),
+        help="mlperf version",
+    )
+    parser.add_argument("--submitter", help="filter to submitter")
+    parser.add_argument(
+        "--csv",
+        default="summary.csv",
+        help="csv file with results")
+    parser.add_argument(
+        "--skip_compliance",
+        action="store_true",
+        help="Pass this cmdline option to skip checking compliance/ dir",
+    )
+    parser.add_argument(
+        "--extra-model-benchmark-map",
+        help="File containing extra custom model mapping. It is assumed to be inside the folder open/",
+        default="model_mapping.json",
+    )
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        help="extra debug output")
+    parser.add_argument(
+        "--submission-exceptions",
+        action="store_true",
+        help="ignore certain errors for submission",
+    )
+    # Skip flags: each disables one specific check; all default to off so
+    # the full check suite runs unless explicitly relaxed.
+    parser.add_argument(
+        "--skip-power-check",
+        action="store_true",
+        help="skips Power WG's check.py script on each power submission.",
+    )
+    parser.add_argument(
+        "--skip-meaningful-fields-emptiness-check",
+        action="store_true",
+        help="skips the check of empty values in required measurement field values",
+    )
+    parser.add_argument(
+        "--skip-check-power-measure-files",
+        action="store_true",
+        help="skips the check of required measure files for power runs",
+    )
+    parser.add_argument(
+        "--skip-empty-files-check",
+        action="store_true",
+        help="skips the check of empty required files",
+    )
+    parser.add_argument(
+        "--skip-extra-files-in-root-check",
+        action="store_true",
+        help="skips the check of extra files inside the root submission dir",
+    )
+    parser.add_argument(
+        "--skip-extra-accuracy-files-check",
+        action="store_true",
+        help="skips the check of extra accuracy files like the images folder of SDXL",
+    )
+    parser.add_argument(
+        "--scenarios-to-skip",
+        help="Delimited list input of scenarios to skip. i.e. if you only have Offline results, pass in 'Server'",
+        type=str,
+    )
+    parser.add_argument(
+        "--skip-all-systems-have-results-check",
+        action="store_true",
+        help="skips the check that all the systems in the systems and measurements folder should have results",
+    )
+    parser.add_argument(
+        "--skip-calibration-check",
+        action="store_true",
+        help="skips the check that the calibration documentation should exist",
+    )
+    parser.add_argument(
+        "--skip-dataset-size-check",
+        action="store_true",
+        help="skips dataset size check, only for backwards compatibility",
+    )
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    """Run the MLPerf submission checker on the provided directory.
+
+    Parses arguments, initializes configuration and loader, iterates
+    through all submissions, runs validation checks (performance,
+    accuracy, system, measurements, power), collects results, and
+    exports summaries. Logs pass/fail status and statistics.
+
+    Returns:
+        int: 0 if all submissions pass checks, 1 if any errors found.
+    """
+    args = get_args()
+
+    config = Config(
+        args.version,
+        args.extra_model_benchmark_map,
+        ignore_uncommited=args.submission_exceptions,
+        # Fixed: skip_compliance was previously wired to
+        # args.skip_power_check, so --skip_compliance had no effect and
+        # --skip-power-check silently disabled compliance checks too.
+        skip_compliance=args.skip_compliance,
+        skip_power_check=args.skip_power_check,
+        skip_meaningful_fields_emptiness_check=args.skip_meaningful_fields_emptiness_check,
+        skip_check_power_measure_files=args.skip_check_power_measure_files,
+        skip_empty_files_check=args.skip_empty_files_check,
+        skip_extra_files_in_root_check=args.skip_extra_files_in_root_check,
+        skip_extra_accuracy_files_check=args.skip_extra_accuracy_files_check,
+        skip_all_systems_have_results_check=args.skip_all_systems_have_results_check,
+        skip_calibration_check=args.skip_calibration_check,
+        skip_dataset_size_check=args.skip_dataset_size_check
+    )
+
+    # NOTE(review): scenarios_to_skip is parsed but not passed to any check
+    # below — confirm whether the flag is still honored.
+    if args.scenarios_to_skip:
+        scenarios_to_skip = args.scenarios_to_skip.split(",")
+    else:
+        scenarios_to_skip = []
+
+    loader = Loader(args.input, args.version)
+    exporter = ResultExporter(args.csv, config)
+    results = {}
+    # Per-division tallies of systems with and without power results.
+    systems = {}
+    for division in ["closed", "open", "network"]:
+        systems[division] = {}
+        systems[division]["power"] = {}
+        systems[division]["non_power"] = {}
+
+    # Main loop over all the submissions
+    for logs in loader.load():
+        # Initialize check classes (module-level `log` is the shared logger).
+        performance_checks = PerformanceCheck(
+            log, logs.loader_data["perf_path"], config, logs)
+        accuracy_checks = AccuracyCheck(
+            log, logs.loader_data["acc_path"], config, logs)
+        system_checks = SystemCheck(
+            log, logs.loader_data["system_path"], config, logs)
+        measurements_checks = MeasurementsCheck(
+            log, logs.loader_data["measurements_path"], config, logs)
+        power_checks = PowerCheck(
+            log, logs.loader_data["power_dir_path"], config, logs)
+        # Run checks; all must pass for the submission to count.
+        valid = True
+        valid &= performance_checks()
+        valid &= accuracy_checks()
+        valid &= system_checks()
+        valid &= measurements_checks()
+        valid &= power_checks()
+        # Add results to summary
+        if valid:
+            # NOTE(review): "performance_metric" is expected to be written
+            # into loader_data by one of the checks above — verify.
+            results[logs.loader_data.get("perf_path")] = logs.loader_data.get(
+                "performance_metric")
+            # System dictionary: bucket by power vs non-power submission.
+            system_id = logs.loader_data.get("system")
+            division_systems = systems[logs.loader_data.get("division")]
+            if os.path.exists(logs.loader_data.get("power_dir_path", "")):
+                bucket = division_systems["power"]
+            else:
+                bucket = division_systems["non_power"]
+            bucket[system_id] = bucket.get(system_id, 0) + 1
+            # CSV exporter
+            exporter.add_result(logs)
+        else:
+            results[logs.loader_data.get("perf_path")] = None
+    # Export results
+    exporter.export()
+
+    # log results
+    log.info("---")
+    with_results = 0
+    for k, v in sorted(results.items()):
+        if v:
+            log.info("Results %s %s", k, v)
+            with_results += 1
+    log.info("---")
+    for k, v in sorted(results.items()):
+        if v is None:
+            log.error("NoResults %s", k)
+
+    closed_systems = systems.get("closed", {})
+    open_systems = systems.get("open", {})
+    network_systems = systems.get("network", {})
+    closed_power_systems = closed_systems.get("power", {})
+    closed_non_power_systems = closed_systems.get("non_power", {})
+    open_power_systems = open_systems.get("power", {})
+    open_non_power_systems = open_systems.get("non_power", {})
+    network_power_systems = network_systems.get("power", {})
+    network_non_power_systems = network_systems.get("non_power", {})
+
+    number_closed_power_systems = len(closed_power_systems)
+    number_closed_non_power_systems = len(closed_non_power_systems)
+    number_closed_systems = (
+        number_closed_power_systems + number_closed_non_power_systems
+    )
+    number_open_power_systems = len(open_power_systems)
+    number_open_non_power_systems = len(open_non_power_systems)
+    number_open_systems = number_open_power_systems + number_open_non_power_systems
+    number_network_power_systems = len(network_power_systems)
+    number_network_non_power_systems = len(network_non_power_systems)
+    number_network_systems = (
+        number_network_power_systems + number_network_non_power_systems
+    )
+
+    def merge_two_dict(x, y):
+        # Union of two count dicts, summing counts on shared keys.
+        z = x.copy()
+        for key in y:
+            if key not in z:
+                z[key] = y[key]
+            else:
+                z[key] += y[key]
+        return z
+
+    # systems can be repeating in open, closed and network
+    unique_closed_systems = merge_two_dict(
+        closed_power_systems, closed_non_power_systems
+    )
+    unique_open_systems = merge_two_dict(
+        open_power_systems, open_non_power_systems)
+    unique_network_systems = merge_two_dict(
+        network_power_systems, network_non_power_systems
+    )
+
+    unique_systems = merge_two_dict(unique_closed_systems, unique_open_systems)
+    unique_systems = merge_two_dict(unique_systems, unique_network_systems)
+
+    # power systems can be repeating in open, closed and network
+    unique_power_systems = merge_two_dict(
+        closed_power_systems, open_power_systems)
+    unique_power_systems = merge_two_dict(
+        unique_power_systems, network_power_systems)
+
+    number_systems = len(unique_systems)
+    number_power_systems = len(unique_power_systems)
+
+    # Counting the number of closed,open and network results
+    def sum_dict_values(x):
+        return sum(x.values())
+
+    count_closed_power_results = sum_dict_values(closed_power_systems)
+    count_closed_non_power_results = sum_dict_values(closed_non_power_systems)
+    count_closed_results = count_closed_power_results + count_closed_non_power_results
+
+    count_open_power_results = sum_dict_values(open_power_systems)
+    count_open_non_power_results = sum_dict_values(open_non_power_systems)
+    count_open_results = count_open_power_results + count_open_non_power_results
+
+    count_network_power_results = sum_dict_values(network_power_systems)
+    count_network_non_power_results = sum_dict_values(
+        network_non_power_systems)
+    count_network_results = (
+        count_network_power_results + count_network_non_power_results
+    )
+
+    count_power_results = (
+        count_closed_power_results
+        + count_open_power_results
+        + count_network_power_results
+    )
+
+    # print summary
+    log.info("---")
+    log.info(
+        "Results=%d, NoResults=%d, Power Results=%d",
+        with_results,
+        len(results) - with_results,
+        count_power_results,
+    )
+
+    log.info("---")
+    log.info(
+        "Closed Results=%d, Closed Power Results=%d\n",
+        count_closed_results,
+        count_closed_power_results,
+    )
+    log.info(
+        "Open Results=%d, Open Power Results=%d\n",
+        count_open_results,
+        count_open_power_results,
+    )
+    log.info(
+        "Network Results=%d, Network Power Results=%d\n",
+        count_network_results,
+        count_network_power_results,
+    )
+    log.info("---")
+
+    log.info(
+        "Systems=%d, Power Systems=%d",
+        number_systems,
+        number_power_systems)
+    log.info(
+        "Closed Systems=%d, Closed Power Systems=%d",
+        number_closed_systems,
+        number_closed_power_systems,
+    )
+    log.info(
+        "Open Systems=%d, Open Power Systems=%d",
+        number_open_systems,
+        number_open_power_systems,
+    )
+    log.info(
+        "Network Systems=%d, Network Power Systems=%d",
+        number_network_systems,
+        number_network_power_systems,
+    )
+    log.info("---")
+    if len(results) != with_results:
+        log.error("SUMMARY: submission has errors")
+        return 1
+    else:
+        log.info("SUMMARY: submission looks OK")
+        return 0
+
+
+if __name__ == "__main__":
+    # `sys` is only imported above when __package__ is None (plain-script
+    # invocation); under `python -m submission_checker.main` that branch is
+    # skipped, so import locally to avoid a NameError.
+    import sys
+    sys.exit(main())
diff --git a/tools/submission/submission_checker/parsers/__init__.py b/tools/submission/submission_checker/parsers/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tools/submission/submission_checker/parsers/base.py b/tools/submission/submission_checker/parsers/base.py
new file mode 100644
index 0000000000..1af266b53b
--- /dev/null
+++ b/tools/submission/submission_checker/parsers/base.py
@@ -0,0 +1,40 @@
+
+from abc import ABC, abstractmethod
+
+
+class BaseParser(ABC):
+    """Abstract interface for MLPerf log parsers.
+
+    Subclasses must implement __getitem__, get, get_messages and get_keys.
+    Inheriting from ABC is required for @abstractmethod to actually
+    prevent instantiation of incomplete subclasses; without it the
+    decorators are inert.
+    """
+
+    def __init__(self, log_path):
+        """Store the path to the log file being parsed.
+
+        Args:
+            log_path: Path to the detail log.
+        """
+        self.path = log_path
+
+    @abstractmethod
+    def __getitem__(self, key):
+        """
+        Get the value of the message with the specific key. If a key appears multiple times, the first one is used.
+        """
+        pass
+
+    @abstractmethod
+    def get(self, key):
+        """
+        Get all the messages with specific key in the log.
+        """
+        pass
+
+    @abstractmethod
+    def get_messages(self):
+        """
+        Get all the messages in the log.
+        """
+        pass
+
+    @abstractmethod
+    def get_keys(self):
+        """
+        Get all the keys in the log.
+        """
+        pass
diff --git a/tools/submission/submission_checker/parsers/loadgen_parser.py b/tools/submission/submission_checker/parsers/loadgen_parser.py
new file mode 100644
index 0000000000..b2812c0b78
--- /dev/null
+++ b/tools/submission/submission_checker/parsers/loadgen_parser.py
@@ -0,0 +1,216 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+import logging
+import sys
+from .base import BaseParser
+
+# pylint: disable=missing-docstring
+
+
+class LoadgenParser(BaseParser):
+    def __init__(self, log_path, strict=True):
+        """
+        Helper class to parse the detail logs.
+        log_path: path to the detail log.
+        strict: whether to raise on lines with the marker prefix but invalid JSON format.
+        """
+        # Satisfy the BaseParser contract (sets self.path); the original
+        # never called super().__init__, leaving self.path unset.
+        super().__init__(log_path)
+        self.loadgen_marker = ":::MLLOG"
+        self.endpoints_marker = ":::ENDPTS"
+        self.marker = ""
+        self.log_is_endpoints = False
+        self.logger = logging.getLogger("MLPerfLog")
+        # key -> list of message dicts, in file order.
+        self.messages = {}
+        with open(log_path, "r", encoding="utf-8") as f:
+            for i, line in enumerate(f):
+                line = line.rstrip()
+                if i == 0:
+                    # The first line determines which marker the whole
+                    # file uses (loadgen vs endpoints format).
+                    if line.find(self.loadgen_marker) == 0:
+                        self.marker = self.loadgen_marker
+                    elif line.find(self.endpoints_marker) == 0:
+                        self.marker = self.endpoints_marker
+                        self.log_is_endpoints = True
+                    else:
+                        raise RuntimeError(
+                            "Marker not found in first line: {:}".format(line))
+                if line.find(self.marker) == 0:
+                    try:
+                        log = json.loads(line[len(self.marker):])
+                        if log["key"] in self.messages:
+                            self.messages[log["key"]].append(log)
+                        else:
+                            self.messages[log["key"]] = [log]
+                    # Narrowed from BaseException: only malformed JSON or a
+                    # missing "key" field count as an invalid line; a bare
+                    # BaseException would also swallow KeyboardInterrupt.
+                    except (ValueError, KeyError):
+                        if strict:
+                            raise RuntimeError(
+                                "Encountered invalid line: {:}".format(line)
+                            )
+                        else:
+                            self.logger.warning(
+                                "Skipping invalid line: {:}".format(line)
+                            )
+        self.keys = set(self.messages.keys())
+        self.logger.info(
+            "Successfully loaded MLPerf log from {:}.".format(log_path))
+
+    def __getitem__(self, key):
+        """
+        Get the value of the message with the specific key. If a key appears multiple times, the first one is used.
+        """
+        if key not in self.keys:
+            return None
+        entries = self.messages[key]
+        if len(entries) > 1:
+            self.logger.warning(
+                "There are multiple messages with key {:} in the log. Emprically choosing the first one.".format(
+                    key
+                )
+            )
+        return entries[0]["value"]
+
+    def get(self, key):
+        """
+        Get all the messages with specific key in the log.
+        """
+        return self.messages[key]
+
+    def get_messages(self):
+        """
+        Get all the messages in the log.
+        """
+        return self.messages
+
+    def get_keys(self):
+        """
+        Get all the keys in the log.
+        """
+        return self.keys
+
+    def get_dict(self):
+        """
+        Get a dict representing the log. If a key appears multiple times, the first one is used.
+
+        Fixed two bugs: the original iterated over self.messages (a dict),
+        so `message` was a key string and message["key"] raised TypeError;
+        it also never returned `result`.
+        """
+        result = {}
+        for key, entries in self.messages.items():
+            result[key] = entries[0]["value"]
+            if len(entries) > 1:
+                self.logger.warning(
+                    "There are multiple messages with key {:} in the log. Emprically choosing the first one.".format(
+                        key
+                    )
+                )
+        return result
+
+    def dump(self, output_path):
+        """
+        Dump the entire log as a json file.
+        """
+        with open(output_path, "w") as out:
+            json.dump(self.messages, out, indent=4)
+
+    def num_messages(self):
+        """Get number of messages (including errors and warnings) in the log.
+
+        Counts every parsed message; the original returned
+        len(self.messages), i.e. the number of *unique keys*, which
+        undercounts whenever a key repeats.
+        """
+        return sum(len(entries) for entries in self.messages.values())
+
+    def num_errors(self):
+        """Get number of errors in the log."""
+        return sum(
+            1
+            for entries in self.messages.values()
+            for message in entries
+            if message["metadata"]["is_error"]
+        )
+
+    def num_warnings(self):
+        """Get number of warnings in the log.
+
+        Fixed: the original iterated over self.messages (a dict), making
+        `message` a key string, so message["metadata"] raised TypeError.
+        Mirrors num_errors with the nested loop over each key's list.
+        """
+        count = 0
+        for key in self.messages:
+            for message in self.messages[key]:
+                if message["metadata"]["is_warning"]:
+                    count += 1
+        return count
+
+    def has_error(self):
+        """Check if the log contains any errors."""
+        return bool(self.num_errors())
+
+    def has_warning(self):
+        """Check if the log contains any warnings."""
+        return bool(self.num_warnings())
+
+    def get_errors(self):
+        """
+        Get all the error messages in the log.
+        """
+        return [
+            message
+            for entries in self.messages.values()
+            for message in entries
+            if message["metadata"]["is_error"]
+        ]
+
+    def get_warnings(self):
+        """
+        Get all the warning messages in the log.
+
+        Fixed: the original iterated over self.messages (a dict), so
+        `message` was a key string and message["metadata"] raised
+        TypeError. Mirrors get_errors with the nested iteration.
+        """
+        results = []
+        for key in self.messages:
+            for message in self.messages[key]:
+                if message["metadata"]["is_warning"]:
+                    results.append(message)
+        return results
+
+
+def get_args():
+    """Parse commandline.
+
+    Returns:
+        argparse.Namespace: Parsed arguments (input, ignore_invalid_lines).
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input",
+        required=True,
+        help="path to the detail log")
+    parser.add_argument(
+        "--ignore_invalid_lines",
+        action="store_true",
+        # Fixed help text: the flag *disables* strict mode, i.e. invalid
+        # lines are skipped with a warning instead of stopping the parse.
+        help="skip lines with invalid formats instead of stopping",
+    )
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    """
+    Inspect a detailed log.
+    """
+    args = get_args()
+    # Configure logging before constructing the parser so its INFO
+    # messages (emitted during parsing) are formatted consistently.
+    logging.basicConfig(
+        level=logging.INFO,
+        format="[%(asctime)s %(filename)s:%(lineno)d %(levelname)s] %(message)s",
+    )
+    mlperf_log = LoadgenParser(args.input,
+                               strict=not args.ignore_invalid_lines)
+    logger = logging.getLogger("main")
+    logger.info("Details of the log:")
+    logger.info("- Number of messages: {:d}".format(mlperf_log.num_messages()))
+    logger.info("- Number of errors: {:d}".format(mlperf_log.num_errors()))
+    logger.info("- Number of warnings: {:d}".format(mlperf_log.num_warnings()))
+    logger.info("- Contents:")
+    # get_messages() returns a dict of key -> list of message dicts; the
+    # original iterated the keys and then subscripted them, which raised
+    # TypeError on the first message.
+    messages = mlperf_log.get_messages()
+    for key, entries in messages.items():
+        for message in entries:
+            logger.info('"{:}": {:}'.format(message["key"], message["value"]))
+    logger.info("Done!")
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/tools/submission/submission_checker/results.py b/tools/submission/submission_checker/results.py
new file mode 100644
index 0000000000..30fb40ef40
--- /dev/null
+++ b/tools/submission/submission_checker/results.py
@@ -0,0 +1,154 @@
+
+from .loader import SubmissionLogs
+from .configuration.configuration import Config
+import os
+import csv
+from .constants import *
+import json
+
+
+class ResultExporter:
+    """Exports submission validation results to CSV format.
+
+    Collects validated submission rows and writes them out with the
+    standardized MLPerf summary columns. Power submissions get a
+    second row carrying the power metric and its unit.
+    """
+
+    def __init__(self, csv_path, config: Config) -> None:
+        """Initialize the result exporter.
+
+        Args:
+            csv_path (str): Path to the output CSV file.
+            config (Config): Configuration helper for model mappings.
+        """
+        # column order defines the CSV layout
+        self.head = [
+            "Organization",
+            "Availability",
+            "Division",
+            "SystemType",
+            "SystemName",
+            "Platform",
+            "Model",
+            "MlperfModel",
+            "Scenario",
+            "Result",
+            "Accuracy",
+            "number_of_nodes",
+            "host_processor_model_name",
+            "host_processors_per_node",
+            "host_processor_core_count",
+            "accelerator_model_name",
+            "accelerators_per_node",
+            "Location",
+            "framework",
+            "operating_system",
+            "notes",
+            "compliance",
+            "errors",
+            "version",
+            "inferred",
+            "has_power",
+            "Units",
+            "weight_data_types",
+        ]
+        self.rows = []
+        self.csv_path = csv_path
+        self.config = config
+
+    def add_result(self, submission_logs: SubmissionLogs):
+        """Add a validated submission result to the export queue.
+
+        Extracts fields from the submission's loader data and system
+        JSON into a CSV row; power submissions also append a second
+        row holding the power metric.
+
+        Args:
+            submission_logs (SubmissionLogs): Validated submission
+                data and metadata.
+        """
+        loader = submission_logs.loader_data
+        system = submission_logs.system_json
+        row = {key: "" for key in self.head}
+        row["Organization"] = loader["submitter"]
+        row["Availability"] = system["status"]
+        row["Division"] = loader["division"]
+        row["SystemType"] = system["system_type"]
+        row["SystemName"] = system["system_name"]
+        row["Platform"] = loader["system"]
+        row["Model"] = loader["benchmark"]
+        row["MlperfModel"] = self.config.get_mlperf_model(
+            row["Model"], loader.get("model_mapping", {}))
+        row["Scenario"] = loader["scenario"]
+        row["Result"] = loader["performance_metric"]
+        # flatten the accuracy dict into a comma-free display string
+        accuracy = json.dumps(loader["accuracy_metrics"]).replace(",", " ")
+        row["Accuracy"] = accuracy.translate(
+            str.maketrans("", "", '"{}')).strip()
+        row["number_of_nodes"] = system["number_of_nodes"]
+        row["host_processor_model_name"] = system["host_processor_model_name"]
+        row["host_processors_per_node"] = system["host_processors_per_node"]
+        row["host_processor_core_count"] = system["host_processor_core_count"]
+        row["accelerator_model_name"] = system["accelerator_model_name"]
+        row["accelerators_per_node"] = system["accelerators_per_node"]
+        row["Location"] = os.path.dirname(loader["perf_path"])
+        row["framework"] = system["framework"]
+        row["operating_system"] = system["operating_system"]
+        # join hardware and software notes with ". " when both exist
+        notes = system.get("hw_notes", "")
+        if system.get("sw_notes"):
+            notes = notes + ". " if notes else ""
+            notes = notes + system.get("sw_notes")
+        row["notes"] = notes
+        row["compliance"] = loader["division"]  # TODO
+        row["errors"] = 0
+        row["version"] = self.config.version
+        # a result is inferred when the log's effective scenario does
+        # not match the claimed one; ("server", "interactive") is the
+        # allowed exception (casing mirrors the upstream comparison)
+        effective = submission_logs.performance_log["effective_scenario"]
+        row["inferred"] = 1 if row["Scenario"] != effective and (
+            effective, row["Scenario"]) != ("server", "interactive") else 0
+        row["has_power"] = os.path.exists(loader["power_dir_path"])
+        # consult the per-model override table first, falling back to
+        # the generic scenario table; look up lazily so a scenario that
+        # only exists in the override table cannot raise KeyError
+        unit_table = SPECIAL_UNIT_DICT.get(row["MlperfModel"], UNIT_DICT)
+        if row["Scenario"] in unit_table:
+            row["Units"] = unit_table[row["Scenario"]]
+        else:
+            row["Units"] = UNIT_DICT[row["Scenario"]]
+        row["weight_data_types"] = submission_logs.measurements_json["weight_data_types"]
+        self.rows.append(row.copy())
+        if row["has_power"]:
+            # duplicate the row with the power metric and power unit
+            row["Result"] = loader["power_metric"]
+            row["Units"] = POWER_UNIT_DICT[row["Scenario"]]
+            self.rows.append(row.copy())
+
+    def export_row(self, row: dict):
+        """Append a single result row to the CSV file.
+
+        Uses csv.writer (QUOTE_ALL, LF line terminator) so embedded
+        commas, quotes and newlines are escaped correctly; the manual
+        quoting it replaces broke on such values (and the imported
+        csv module was previously unused).
+
+        Args:
+            row (dict): Result row data keyed by column headers.
+        """
+        with open(self.csv_path, "a", newline="") as out:
+            writer = csv.writer(out, quoting=csv.QUOTE_ALL, lineterminator="\n")
+            writer.writerow([row.get(key, "") for key in self.head])
+
+    def export(self):
+        """Export all accumulated results to the CSV file.
+
+        Truncates the file, writes the header row, then appends every
+        collected row via export_row.
+        """
+        with open(self.csv_path, "w", newline="") as out:
+            out.write(",".join(self.head) + "\n")
+        for row in self.rows:
+            self.export_row(row)
diff --git a/tools/submission/submission_checker/utils.py b/tools/submission/submission_checker/utils.py
new file mode 100644
index 0000000000..b2aabbdb87
--- /dev/null
+++ b/tools/submission/submission_checker/utils.py
@@ -0,0 +1,316 @@
+import os
+from .constants import *
+from .parsers.loadgen_parser import LoadgenParser
+
+
+def list_dir(*path):
+    """Sorted names of the immediate subdirectories of the joined path."""
+    base = os.path.join(*path)
+    return sorted(f for f in os.listdir(base) if os.path.isdir(os.path.join(base, f)))
+
+
+def list_files(*path):
+    """Sorted names of the regular files directly under the joined path."""
+    base = os.path.join(*path)
+    return sorted(f for f in os.listdir(base) if os.path.isfile(os.path.join(base, f)))
+
+
+def list_empty_dirs_recursively(*path):
+    """All directories under the joined path with no children at all."""
+    base = os.path.join(*path)
+    return [d for d, subdirs, files in os.walk(base) if not subdirs and not files]
+
+
+def list_dirs_recursively(*path):
+    """Every directory path under the joined path, including the root."""
+    return [dirpath for dirpath, _, _ in os.walk(os.path.join(*path))]
+
+
+def list_files_recursively(*path):
+    """Full paths of every file found under the joined path."""
+    return [
+        os.path.join(dirpath, name)
+        for dirpath, _, names in os.walk(os.path.join(*path))
+        for name in names
+    ]
+
+
+def files_diff(list1, list2, optional=None):
+    """Symmetric difference of two file lists, minus ignorable names."""
+    # these bookkeeping files may appear on either side without mattering
+    ignorable = set(optional or [])
+    ignorable.update(["mlperf_log_trace.json", "results.json", ".gitkeep"])
+    return set(list1).symmetric_difference(set(list2)) - ignorable
+
+
+def check_extra_files(path, target_files):
+    """Check folders/files under path against the target_files map.
+
+    NOTE(review): as in the original, a missing captions.txt is
+    reported in missing_files but does not flip check_pass."""
+    missing_files = []
+    check_pass = True
+    present = list_dir(path)
+    for folder, expected in target_files.items():
+        if folder not in present:
+            check_pass = False
+            missing_files.append(os.path.join(path, folder))
+            continue
+        stems = [f.split(".")[0] for f in list_files(os.path.join(path, folder))]
+        for stem in expected:
+            if stem not in stems:
+                check_pass = False
+                missing_files.append(f"{os.path.join(path, folder, stem)}.png")
+        if "captions" not in stems:
+            missing_files.append(f"{os.path.join(path, folder, 'captions.txt')}")
+    return check_pass, missing_files
+
+
+def split_path(m):
+    return m.replace("\\", "/").split("/")  # split into components, treating \ and / alike
+
+
+def get_boolean(s):
+    """Coerce None/bool/str/int to a bool; raise TypeError otherwise."""
+    if s is None:
+        return False
+    if isinstance(s, bool):
+        return s
+    if isinstance(s, str):
+        # only the literal (case-insensitive) "true" counts as True
+        return s.lower() == "true"
+    if isinstance(s, int):
+        return bool(s)
+    raise TypeError(
+        f"Variable should be bool, string or int, got {type(s)} instead")
+
+
+def merge_two_dict(x, y):
+    """Copy of x with y folded in; shared keys combine via "+"."""
+    merged = x.copy()
+    for key, value in y.items():
+        # mirror the original semantics: assign new keys directly,
+        # and apply "+" onto values already present (ints, lists, ...)
+        merged[key] = merged[key] + value if key in merged else value
+    return merged
+
+
+def sum_dict_values(x):
+    """Total of the mapping's values (numeric accumulation from 0)."""
+    # sum() starts from 0 just like the original manual accumulator,
+    # so behavior is identical for the numeric values stored here.
+    return sum(x.values())
+
+
+def is_number(s):
+    try:
+        float(s)  # accepts ints, floats, and strings like "1e5"/"nan"
+        return True
+    except ValueError:
+        return False  # NOTE(review): float(None) raises TypeError, which propagates
+
+
+def get_performance_metric(
+        config, model, path, scenario_fixed):
+    """Extract the performance metric from mlperf_log_detail.txt.
+
+    Raises ValueError when result_validity is not VALID (the original
+    fell through to a NameError on that path).
+    """
+    # Assumes new logging format
+    version = config.version
+
+    fname = os.path.join(path, "mlperf_log_detail.txt")
+    mlperf_log = LoadgenParser(fname)
+    if (
+        "result_validity" not in mlperf_log.get_keys()
+        or mlperf_log["result_validity"] != "VALID"
+    ):
+        raise ValueError(f"{fname}: result_validity is not VALID")
+    scenario = mlperf_log["effective_scenario"]
+
+    res = float(mlperf_log[RESULT_FIELD_NEW[version][scenario]])
+    # some benchmark/scenario pairs read their result from an
+    # overridden field instead of the default one
+    overrides = RESULT_FIELD_BENCHMARK_OVERWRITE.get(version, {})
+    if model in overrides and scenario in overrides[model]:
+        res = float(mlperf_log[overrides[model][scenario]])
+
+    # infer the metric when the log's scenario differs from the claim
+    if scenario_fixed != scenario:
+        _, res, _ = get_inferred_result(
+            scenario_fixed, scenario, res, mlperf_log, config, False
+        )
+    return res
+
+
+def get_inferred_result(
+    scenario_fixed, scenario, res, mlperf_log, config, log_error=False
+):
+    import logging
+    # fix: "log" was referenced below but never defined (NameError)
+    log = logging.getLogger("main")
+    inferred = False
+    is_valid = True
+    # Check if current scenario (and version) uses early stopping
+    uses_early_stopping = config.uses_early_stopping(scenario)
+
+    latency_mean = mlperf_log["result_mean_latency_ns"]
+    if scenario in ["MultiStream"]:
+        latency_99_percentile = mlperf_log[
+            "result_99.00_percentile_per_query_latency_ns"
+        ]
+        latency_mean = mlperf_log["result_mean_query_latency_ns"]
+        samples_per_query = mlperf_log["effective_samples_per_query"]
+    if scenario == "SingleStream":
+        # qps_wo_loadgen_overhead is only used for inferring Offline from
+        # SingleStream; only for old submissions
+        qps_wo_loadgen_overhead = mlperf_log["result_qps_without_loadgen_overhead"]
+        # fix: per-sample 99th percentile was unbound on the non-early-
+        # stopping MultiStream-from-SingleStream path below
+        latency_99_percentile = mlperf_log["result_99.00_percentile_latency_ns"]
+
+    # special case for results inferred from different scenario
+    if scenario_fixed in ["Offline"] and scenario in ["SingleStream"]:
+        inferred = True
+        res = qps_wo_loadgen_overhead
+    if (scenario_fixed in ["Offline"]) and scenario in ["MultiStream"]:
+        inferred = True
+        res = samples_per_query * S_TO_MS / (latency_mean / MS_TO_NS)
+    if (scenario_fixed in ["MultiStream"]) and scenario in ["SingleStream"]:
+        inferred = True
+        samples_per_query = 8  # fixed value when inferring MultiStream
+        if uses_early_stopping:
+            early_stopping_latency_ms = mlperf_log["early_stopping_latency_ms"]
+            if early_stopping_latency_ms == 0 and log_error:
+                log.error(
+                    "Not enough samples were processed for early stopping to make an estimate")
+                is_valid = False
+            res = (early_stopping_latency_ms * samples_per_query) / MS_TO_NS
+        else:
+            res = (latency_99_percentile * samples_per_query) / MS_TO_NS
+    if (scenario_fixed in ["Interactive"]) and scenario not in ["Server"]:
+        is_valid = False
+    return inferred, res, is_valid
+
+
+def check_compliance_perf_dir(test_dir):
+    """Validate a compliance test dir: verify_performance.txt must say
+    TEST PASS and performance/run_1 must hold the required files."""
+    import logging
+    log = logging.getLogger("main")
+    is_valid = False
+
+    fname = os.path.join(test_dir, "verify_performance.txt")
+    if not os.path.exists(fname):
+        log.error("%s is missing in %s", fname, test_dir)
+        is_valid = False
+    else:
+        with open(fname, "r") as f:
+            # the verifier script prints "TEST PASS" on success
+            if any("TEST PASS" in line for line in f):
+                is_valid = True
+        if not is_valid:
+            log.error(
+                "Compliance test performance check in %s failed",
+                test_dir)
+
+    # Check performance dir
+    test_perf_path = os.path.join(test_dir, "performance", "run_1")
+    if not os.path.exists(test_perf_path):
+        log.error("%s has no performance/run_1 directory", test_dir)
+        is_valid = False
+    else:
+        diff = files_diff(
+            list_files(test_perf_path),
+            REQUIRED_COMP_PER_FILES,
+            ["mlperf_log_accuracy.json"],
+        )
+        if diff:
+            log.error(
+                "%s has file list mismatch (%s)",
+                test_perf_path,
+                diff)
+            is_valid = False
+
+    return is_valid
+
+
+def get_power_metric(config, scenario_fixed, log_path, is_valid, res):
+    """Parse the power logs under log_path and compute the power metric.
+
+    Returns (is_valid, power_metric, scenario, avg_power_efficiency);
+    the metric is W for Offline/Server/Interactive and mJ/query for
+    SingleStream/MultiStream.
+    """
+    import datetime
+    import logging
+    log = logging.getLogger("main")
+    server_timezone = datetime.timedelta(0)
+    client_timezone = datetime.timedelta(0)
+    # fix: default the outputs so an empty sample list cannot raise
+    # NameError at the return statement
+    power_metric = 0
+    avg_power_efficiency = 0
+
+    detail_log_fname = os.path.join(log_path, "mlperf_log_detail.txt")
+    mlperf_log = LoadgenParser(detail_log_fname)
+    datetime_format = "%m-%d-%Y %H:%M:%S.%f"
+    power_begin = (
+        datetime.datetime.strptime(mlperf_log["power_begin"], datetime_format)
+        + client_timezone
+    )
+    power_end = (
+        datetime.datetime.strptime(mlperf_log["power_end"], datetime_format)
+        + client_timezone
+    )
+    # Obtain the scenario also from logs to check if power is inferred
+    scenario = mlperf_log["effective_scenario"]
+
+    spl_fname = os.path.join(log_path, "spl.txt")
+    power_list = []
+    with open(spl_fname) as f:
+        for line in f:
+            if not line.startswith("Time"):
+                continue
+            timestamp = (
+                datetime.datetime.strptime(line.split(",")[1], datetime_format)
+                + server_timezone
+            )
+            if timestamp > power_begin and timestamp < power_end:
+                value = float(line.split(",")[3])
+                if value > 0:
+                    power_list.append(value)
+
+    if len(power_list) == 0:
+        log.error(
+            "%s has no power samples falling in power range: %s - %s",
+            spl_fname,
+            power_begin,
+            power_end,
+        )
+        is_valid = False
+    else:
+        avg_power = sum(power_list) / len(power_list)
+        power_duration = (power_end - power_begin).total_seconds()
+        if scenario_fixed in ["Offline", "Server", "Interactive"]:
+            # In Offline and Server scenarios, the power metric is in W.
+            power_metric = avg_power
+            avg_power_efficiency = res / avg_power
+        else:
+            # In SingleStream and MultiStream scenarios, the power
+            # metric is in mJ/query.
+            assert scenario_fixed in [
+                "MultiStream",
+                "SingleStream",
+            ], "Unknown scenario: {:}".format(scenario_fixed)
+            num_queries = int(mlperf_log["result_query_count"])
+            power_metric = avg_power * power_duration * 1000 / num_queries
+            samples_per_query = 1 if scenario_fixed == "SingleStream" else 8
+            if (scenario_fixed in ["MultiStream"]
+                    ) and scenario in ["SingleStream"]:
+                power_metric = (
+                    avg_power * power_duration * samples_per_query * 1000 / num_queries
+                )
+            avg_power_efficiency = (samples_per_query * 1000) / power_metric
+
+    return is_valid, power_metric, scenario, avg_power_efficiency
diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker_old.py
similarity index 100%
rename from tools/submission/submission_checker.py
rename to tools/submission/submission_checker_old.py