1 change: 0 additions & 1 deletion .github/workflows/reset-branch.yml
@@ -1,5 +1,4 @@
name: Reset Current Branch to Upstream After Squash Merge

on:
workflow_dispatch:
inputs:
25 changes: 25 additions & 0 deletions README.md
@@ -16,6 +16,31 @@ Please see the [MLPerf Inference benchmark paper](https://arxiv.org/abs/1911.025

Please see [here](https://docs.mlcommons.org/inference/benchmarks/) for the MLPerf inference documentation website which includes automated commands to run MLPerf inference benchmarks using different implementations.

## MLPerf Inference v5.0 (submission deadline February 28, 2025)

For submissions, please use the master branch and any commit since the [5.0 seed release](https://github.com/mlcommons/inference/commit/5d83ed5de438ffb55bca4cdb2966fba90a9dbca6), although it is best to use the latest commit on the [master branch](https://github.com/mlcommons/inference).

For power submissions, please use [SPEC PTD 1.11.1](https://github.com/mlcommons/power) (needs special access) and any commit of the power-dev repository after the [code freeze](https://github.com/mlcommons/power-dev/commit/65eedd4a60b5c50ac44cbae061d2a428e9fb190a).


| model | reference app | framework | dataset | category |
| ---- | ---- | ---- | ---- | ---- |
| resnet50-v1.5 | [vision/classification_and_detection](https://github.com/mlcommons/inference/tree/master/vision/classification_and_detection) | tensorflow, onnx, tvm, ncnn | imagenet2012 | edge,datacenter |
| retinanet 800x800 | [vision/classification_and_detection](https://github.com/mlcommons/inference/tree/master/vision/classification_and_detection) | pytorch, onnx | openimages resized to 800x800 | edge,datacenter |
| bert | [language/bert](https://github.com/mlcommons/inference/tree/master/language/bert) | tensorflow, pytorch, onnx | squad-1.1 | edge |
| dlrm-v2 | [recommendation/dlrm_v2](https://github.com/mlcommons/inference/tree/master/recommendation/dlrm_v2/pytorch) | pytorch | Multihot Criteo Terabyte | datacenter |
| 3d-unet | [vision/medical_imaging/3d-unet-kits19](https://github.com/mlcommons/inference/tree/master/vision/medical_imaging/3d-unet-kits19) | pytorch, tensorflow, onnx | KiTS19 | edge,datacenter |
| gpt-j | [language/gpt-j](https://github.com/mlcommons/inference/tree/master/language/gpt-j) | pytorch | CNN-Daily Mail | edge,datacenter |
| stable-diffusion-xl | [text_to_image](https://github.com/mlcommons/inference/tree/master/text_to_image) | pytorch | COCO 2014 | edge,datacenter |
| llama2-70b | [language/llama2-70b](https://github.com/mlcommons/inference/tree/master/language/llama2-70b) | pytorch | OpenOrca | datacenter |
| llama3.1-405b | [language/llama3.1-405b](https://github.com/mlcommons/inference/tree/master/language/llama3.1-405b) | pytorch | LongBench, LongDataCollections, Ruler, GovReport | datacenter |
| mixtral-8x7b | [language/mixtral-8x7b](https://github.com/mlcommons/inference/tree/master/language/mixtral-8x7b) | pytorch | OpenOrca, MBXP, GSM8K | datacenter |
| rgat | [graph/rgat](https://github.com/mlcommons/inference/tree/master/graph/R-GAT) | pytorch | IGBH | datacenter |
| pointpainting | [automotive/3d-object-detection](https://github.com/mlcommons/inference/tree/master/automotive/3d-object-detection) | pytorch, onnx | Waymo Open Dataset | edge |

* The framework listed is the one used by the reference implementation. Submitters are free to use their own frameworks to run the benchmark.


## MLPerf Inference v4.1 (submission deadline July 26, 2024)

For submissions, please use the master branch and any commit since the [4.1 seed release](https://github.com/mlcommons/inference/pull/1736/files), although it is best to use the latest commit. The v4.1 tag will be created from the master branch after the results are published.
9 changes: 0 additions & 9 deletions automotive/3d-object-detection/main.py
@@ -111,10 +111,6 @@ def get_args():
help="device to run the benchmark",
)

# file to use mlperf rules compliant parameters
parser.add_argument(
"--mlperf_conf", default="mlperf.conf", help="mlperf rules config"
)
# file for user LoadGen settings such as target QPS
parser.add_argument(
"--user_conf",
@@ -350,11 +346,6 @@ def main():
"cmdline": str(args),
}

mlperf_conf = os.path.abspath(args.mlperf_conf)
if not os.path.exists(mlperf_conf):
log.error("{} not found".format(mlperf_conf))
sys.exit(1)

user_conf = os.path.abspath(args.user_conf)
if not os.path.exists(user_conf):
log.error("{} not found".format(user_conf))
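With `--mlperf_conf` gone, the harness relies on the copy of `mlperf.conf` that now ships inside LoadGen and only loads the submitter overrides from disk. A minimal sketch of the resulting LoadGen setup, assuming the `mlperf_loadgen` Python bindings and illustrative model/scenario names:

```python
import mlperf_loadgen as lg

user_conf = "user.conf"  # path resolved from --user_conf, as above

settings = lg.TestSettings()
# mlperf.conf defaults are compiled into LoadGen itself, so only the
# submitter-tunable overrides are read from user.conf.
settings.FromConfig(user_conf, "pointpainting", "SingleStream")
settings.scenario = lg.TestScenario.SingleStream
settings.mode = lg.TestMode.PerformanceOnly
```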
1 change: 1 addition & 0 deletions compliance/nvidia/README.md
@@ -41,3 +41,4 @@ The `run_verification.py` found in each test directory will copy the test files
| Llama3.1-405b | [TEST06](./TEST06/) |
| mixtral-8x7b | [TEST06](./TEST06/) |
| R-GAT | [TEST01](./TEST01/) |
| PointPainting | [TEST01](./TEST01/) [TEST04](./TEST04/) |
9 changes: 9 additions & 0 deletions compliance/nvidia/TEST01/pointpainting/audit.config
@@ -0,0 +1,9 @@
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard; in that case the value applies to all models.
# All times are in milliseconds

# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf)
*.*.mode = 2
*.*.accuracy_log_rng_seed = 720381539243781796
*.*.accuracy_log_sampling_target = 256
89 changes: 18 additions & 71 deletions language/mixtral-8x7b/SUT.py
@@ -30,11 +30,12 @@
log = logging.getLogger("Mixtral-8x7B-Instruct-v0.1")

gen_kwargs = {
"early_stopping": True,
"max_new_tokens": 1024,
# "min_new_tokens": 1,
"min_new_tokens": 2,
"num_beams": 1,
"max_new_tokens": 1024,
"do_sample": False,
"temperature": None,
"top_p": None,
}


@@ -238,80 +239,32 @@ def process_queries(self):
input_masks_tensor = []
input_len = []
input_dataset = []
batch_texts = []
datasets = []
for q in qitem:
input_ids_tensor.append(
pad(
self.data_object.input_ids[q.index],
(
max_seq_len -
self.data_object.input_lens[q.index],
0,
0,
0,
),
value=self.tokenizer.pad_token_id,
)
)
input_masks_tensor.append(
pad(
self.data_object.attention_masks[q.index],
(
max_seq_len -
self.data_object.input_lens[q.index],
0,
0,
0,
),
value=0,
)
)
batch_texts.append(self.data_object.input_texts[q.index])
input_len.append(self.data_object.input_lens[q.index])

# In case we predict code generation, we can specify an
# additional stop sequence
input_dataset.append(
self.data_object.dataset_names[q.index])
input_ids_tensor = torch.cat(input_ids_tensor)
input_masks_tensor = torch.cat(input_masks_tensor)

assert input_ids_tensor.shape == input_masks_tensor.shape
assert input_ids_tensor.shape[0] <= self.batch_size
batch_ids = self.tokenizer.batch_encode_plus(
batch_texts, return_tensors="pt", padding=True)
batch_ids = batch_ids.to(self.device)

tik2 = time.time()
logits_processor = LogitsProcessorList(
[StopAfterSequence(
self.tokenizer.eos_token_id, device=self.device)]
)
for i in range(len(input_ids_tensor)):
ids, masks, dataset = (
input_ids_tensor[i: i + 1],
input_masks_tensor[i: i + 1],
input_dataset[i],
)
pred_output_tokens = []
if dataset == "MBXP":
out = self.model.generate(
input_ids=ids,
attention_mask=masks,
pad_token_id=self.tokenizer.pad_token_id,
logits_processor=logits_processor,
**gen_kwargs,
)
else:
out = self.model.generate(
input_ids=ids,
attention_mask=masks,
pad_token_id=self.tokenizer.pad_token_id,
**gen_kwargs,
)
pred_output_tokens.append(out)
pred_output_tokens = torch.cat(pred_output_tokens)
_, length = batch_ids.input_ids.shape
out = self.model.generate(
**batch_ids, num_return_sequences=1, **gen_kwargs)
pred_output_tokens = out
tik3 = time.time()

processed_output = self.data_object.postProcess(
pred_output_tokens,
input_seq_lens=input_len,
length=length,
query_id_list=query_ids,
dataset_list=input_dataset,
)

for i in range(len(qitem)):
@@ -342,10 +295,7 @@ def process_queries(self):

def load_model(self):
self.model = AutoModelForCausalLM.from_pretrained(
self.model_path,
device_map="auto",
low_cpu_mem_usage=True,
torch_dtype=self.amp_dtype,
self.model_path, device_map="auto", trust_remote_code=True
)
print("Loaded model")

@@ -362,10 +312,7 @@ def load_model(self):
pass

self.tokenizer = AutoTokenizer.from_pretrained(
self.model_path,
model_max_length=1024,
padding_side="left",
use_fast=False,
self.model_path, padding_side="left", trust_remote_code=True
)

self.tokenizer.pad_token = self.tokenizer.eos_token
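Taken together, this hunk swaps the old per-sample loop (manual tensor padding plus a separate `generate` call for MBXP) for a single left-padded batch. A condensed sketch of the new flow, assuming `tokenizer`, `model`, `device`, `batch_texts`, and `gen_kwargs` as set up above:

```python
# One left-padded batch over the raw input texts replaces the manual
# per-sample padding of input_ids and attention masks.
batch = tokenizer.batch_encode_plus(
    batch_texts, return_tensors="pt", padding=True).to(device)
_, length = batch.input_ids.shape  # shared (padded) input length

out = model.generate(**batch, num_return_sequences=1, **gen_kwargs)
# Every output row begins with the `length` input tokens, so the
# generated tokens are out[:, length:] (see postProcess in dataset.py).
```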
26 changes: 22 additions & 4 deletions language/mixtral-8x7b/dataset.py
@@ -67,6 +67,7 @@ def load_processed_dataset(self):
processed_data = pd.read_pickle(self.dataset_path)

input_tokens = processed_data["tok_input"]
self.input_texts = processed_data["input"].to_list()

self.input_ids = []
self.input_lens = []
@@ -85,12 +86,28 @@
self.dataset_names.append(dataset)
print("Finished loading dataset.")

def remove_trailing_twos(self, lst, eos=2):
count = 0
for num in reversed(lst):
if num == eos or num == 0:
count += 1
else:
break
return lst[:-count] if count > 0 else lst

def mbxp_stop(self, lst, stop_tokens=[13, 13940, 28832, 13]):
for i in range(len(lst) - len(stop_tokens) + 1):
if (lst[i:i + len(stop_tokens)] == stop_tokens).all():
return lst[:i + len(stop_tokens)]
return lst

def postProcess(
self,
out_tokens,
input_seq_lens=None,
length=None,
query_id_list=None,
sample_index_list=None,
dataset_list=None,
):
"""Postprocesses output prediction"""

@@ -106,13 +123,14 @@
"""
# Everything is padded to max_len (1024), so prune the input and parse
# to numpy
output_seq = out_tokens[:, 1024:].cpu().numpy()
output_seq = out_tokens[:, length:].cpu().numpy()
aux_seq = []
assert len(query_id_list) == output_seq.shape[0]
for i in range(len(output_seq)):
aux = output_seq[i]
while len(output_seq[i]) <= 1:
aux = np.append(aux, self.tokenizer.eos_token_id)
aux = self.remove_trailing_twos(aux)
if (dataset_list[i] == "MBXP"):
aux = self.mbxp_stop(aux)
aux_seq.append(aux)
output_seq = np.stack(aux_seq)

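The two new helpers clean generated sequences before accuracy scoring: `remove_trailing_twos` strips trailing EOS (id 2) and pad (id 0) tokens, and `mbxp_stop` truncates at the first occurrence of the MBXP stop sequence. A small usage sketch with made-up token ids (`ds` stands for a loaded `Dataset` instance):

```python
import numpy as np

seq = np.array([5, 7, 9, 13, 13940, 28832, 13, 42, 2, 2, 0])
seq = ds.remove_trailing_twos(seq)  # drops the trailing 2, 2, 0
seq = ds.mbxp_stop(seq)             # cuts after [13, 13940, 28832, 13]
# seq is now [5, 7, 9, 13, 13940, 28832, 13]
```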
2 changes: 1 addition & 1 deletion loadgen/VERSION.txt
@@ -1 +1 @@
5.0.13
5.0.14
6 changes: 4 additions & 2 deletions loadgen/mlperf.conf
@@ -19,20 +19,22 @@ llama2-70b-interactive.*.performance_sample_count_override = 24576
llama3_1-405b.*.performance_sample_count_override = 8313
stable-diffusion-xl.*.performance_sample_count_override = 5000
rgat.*.performance_sample_count_override = 788379
pointpainting.*.performance_sample_count_override = 1024
# set to 0 to let entire sample set to be performance sample
3d-unet.*.performance_sample_count_override = 0

# Set seeds.
*.*.qsl_rng_seed = 6023615788873153749
*.*.sample_index_rng_seed = 15036839855038426416
*.*.schedule_rng_seed = 9933818062894767841
# Set seeds for TEST_05.

# Set seeds for TEST_05 (not needed from v5.0 onwards)
*.*.test05_qsl_rng_seed = 7975553102935885558
*.*.test05_sample_index_rng_seed = 11403566307062068064
*.*.test05_schedule_rng_seed = 15816800565822761601


*.SingleStream.target_latency_percentile = 90
pointpainting.SingleStream.target_latency_percentile = 99.9
*.SingleStream.min_duration = 600000

*.MultiStream.target_latency_percentile = 99
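Submitter-side overrides live in `user.conf` and use the same `model.scenario.key` syntax; hypothetical entries for a PointPainting SingleStream run might look like the following (keys exist in LoadGen, values are purely illustrative):

```
# Hypothetical user.conf overrides; keys follow the same
# 'model.scenario.key' format as mlperf.conf.
pointpainting.SingleStream.target_latency = 100
pointpainting.SingleStream.min_query_count = 1024
```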
14 changes: 9 additions & 5 deletions tools/submission/submission_checker.py
@@ -271,7 +271,7 @@
"mixtral-8x7b",
"llama3.1-405b",
"rgat",
# TODO: add automotive?
"pointpainting",
],
"required-scenarios-datacenter": {
"resnet": ["Server", "Offline"],
@@ -302,6 +302,7 @@
"gptj-99": ["SingleStream", "Offline"],
"gptj-99.9": ["SingleStream", "Offline"],
"stable-diffusion-xl": ["SingleStream", "Offline"],
"pointpainting": ["SingleStream"],
},
"optional-scenarios-edge": {},
"required-scenarios-datacenter-edge": {
@@ -323,6 +324,7 @@
"mixtral-8x7b": ["Server", "Offline"],
"llama3.1-405b": ["Server", "Offline"],
"rgat": ["Offline"],
"pointpainting": ["SingleStream"],
},
"optional-scenarios-datacenter-edge": {},
"accuracy-target": {
@@ -424,6 +426,7 @@
684.68 * 0.9,
),
"rgat": ("acc", 0.7286 * 0.99),
"pointpainting": ("mAP", 0.5425 * 0.999),
},
"accuracy-upper-limit": {
"stable-diffusion-xl": (
@@ -460,8 +463,8 @@
"stable-diffusion-xl": 5000,
"mixtral-8x7b": 15000,
"llama3.1-405b": 8313,
"rgat": 788379

"rgat": 788379,
"pointpainting": 1024,
},
# model_mapping.json is expected in the root directory of the
# submission folder for open submissions and so the below dictionary is
@@ -527,7 +530,8 @@
},
"mixtral-8x7b": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
"llama3.1-405b": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
"rgat": {"SingleStream": 1024, "Offline": 1}
"rgat": {"SingleStream": 1024, "Offline": 1},
"pointpainting": {"SingleStream": 1024},
},
},
}
@@ -731,7 +735,7 @@
ACC_PATTERN = {
"acc": r"^(?:\{\"accuracy|accuracy)[\": ]*=?\s*([\d\.]+).*",
"AUC": r"^AUC=([\d\.]+).*",
"mAP": r"^mAP=([\d\.]+).*",
"mAP": r".*'(?:mAP|Total)':\s*([\d\.]+)",
"bleu": r"^BLEU\:\s*([\d\.]+).*",
"F1": r"^{[\"\']exact_match[\"\']\:\s*[\d\.]+,\s*[\"\']f1[\"\']\:\s*([\d\.]+)}",
"WER": r"Word Error Rate\:.*, accuracy=([0-9\.]+)%",
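The relaxed `mAP` pattern accepts dictionary-style accuracy logs (e.g. a per-class results dict with a `'Total'` entry) in addition to plain `mAP=` lines. A quick sanity check of the new regex; the example log line is an assumption, not the accuracy script's verbatim output:

```python
import re

MAP_PATTERN = r".*'(?:mAP|Total)':\s*([\d\.]+)"
line = "{'Total': 0.5425, 'Vehicle': 0.61, 'Pedestrian': 0.48}"
match = re.match(MAP_PATTERN, line)
assert match is not None and float(match.group(1)) == 0.5425
```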
@@ -3,7 +3,7 @@
"""

# pylint: disable=unused-argument,missing-docstring
import torch # currently supports pytorch1.0
import torch
import torchvision
import backend

@@ -25,7 +25,7 @@ def image_format(self):
return "NCHW"

def load(self, model_path, inputs=None, outputs=None):
self.model = torch.load(model_path)
self.model = torch.load(model_path, weights_only=False)
self.model.eval()
# find inputs from the model if not passed in by config
if inputs:
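The explicit `weights_only=False` is needed because PyTorch 2.6 changed the default of `torch.load` to `weights_only=True`, which refuses to unpickle a full `nn.Module` such as this checkpoint. A minimal sketch with an illustrative path; pass `weights_only=False` only for checkpoints you trust:

```python
import torch

# The reference checkpoint pickles an entire nn.Module rather than a
# state_dict, so the safe weights-only loader cannot reconstruct it.
model = torch.load("resnet50.pt", weights_only=False)
model.eval()
```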