From 59fdcffa3c4c49b32ff62c1927d469f0435a6d9b Mon Sep 17 00:00:00 2001
From: lkk
Date: Tue, 18 Nov 2025 03:33:55 +0000
Subject: [PATCH 1/6] setup env and quantize model.

---
 .../mlperf/llama2-70b/quantize_70b.sh         |  24 ++++
 .../mlperf/llama2-70b/quantize_autoround.py   | 107 ++++++++++++++++++
 .../mlperf/llama2-70b/requirements.txt        |   4 +
 .../mlperf/llama2-70b/setup.sh                |  18 +++
 .../mlperf/llama2-70b/simple_autoround.py     |  23 ++++
 5 files changed, 176 insertions(+)
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_70b.sh
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_autoround.py
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/requirements.txt
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/setup.sh
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/simple_autoround.py

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_70b.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_70b.sh
new file mode 100644
index 00000000000..20afd8b22d3
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_70b.sh
@@ -0,0 +1,24 @@
+CHECKPOINT_PATH=meta-llama/Llama-2-70b-chat-hf
+CALIBRATION_DATA_PATH=./open_orca/open_orca_gpt4_tokenized_llama.calibration_1000.pkl
+NUM_GROUPS=-1
+NUM_SAMPLES=1000
+ITERS=200
+BATCH_SIZE=1
+NUM_CORES=$(($(lscpu | grep "Socket(s):" | awk '{print $2}') * $(lscpu | grep "Core(s) per socket:" | awk '{print $4}')))
+END_CORE=$(($NUM_CORES - 1))
+
+run_cmd="python -u"
+if (( XPU_COUNT < 1 )); then  # XPU_COUNT is read from the environment; unset or 0 selects the CPU path
+    export OMP_NUM_THREADS=$NUM_CORES
+    run_cmd="numactl -C 0-$END_CORE python -u"
+fi
+
+$run_cmd quantize_autoround.py \
+    --model_name ${CHECKPOINT_PATH} \
+    --dataset ${CALIBRATION_DATA_PATH} \
+    --group_size ${NUM_GROUPS} \
+    --bits 4 \
+    --iters ${ITERS} \
+    --batch_size ${BATCH_SIZE} \
+    --device auto \
+    --lr 2.5e-3 2>&1 | tee autoround_log_${NUM_GROUPS}g_${NUM_SAMPLES}nsamples_${ITERS}iters.log
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_autoround.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_autoround.py
new file mode 100644
index 00000000000..93a0483438f
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_autoround.py
@@ -0,0 +1,107 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+from auto_round import AutoRound
+from datasets import Dataset
+import os
+import pandas as pd
+import argparse
+import json
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_name", default="facebook/opt-125m"
+    )
+    parser.add_argument(
+        "--dataset", type=str, required=True
+    )
+    parser.add_argument("--bits", default=4, type=int,
+                        help="number of bits")
+    parser.add_argument("--group_size", default=128, type=int,
+                        help="group size")
+    parser.add_argument("--sym", default=False, action='store_true',
+                        help="symmetric quantization")
+    parser.add_argument("--iters", default=200, type=int)
+    parser.add_argument("--batch_size", default=1, type=int)
+    parser.add_argument("--lr", default=None, type=float,
+                        help="learning rate; if None, it is set to 1.0/iters automatically")
+    parser.add_argument("--minmax_lr", default=None, type=float,
+                        help="minmax learning rate; if None, it is set to the same value as lr")
+    parser.add_argument("--device", default='fake', type=str,
+                        help="targeted inference acceleration platform. The options are 'fake', 'cpu', 'gpu' and 'xpu'. "
+                             "Defaults to 'fake', meaning only fake quantization is performed and the model is not exported for any device.")
+    parser.add_argument("--fp8_kv", default=False, action='store_true',
+                        help="use an fp8 KV cache")
+
+    args = parser.parse_args()
+    return args
+
+def main():
+    args = get_args()
+    model = AutoModelForCausalLM.from_pretrained(args.model_name, torch_dtype="auto")
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    # Note: Using pickle with trusted dataset files only
+    # In production, consider using safer serialization formats like JSON or HDF5
+    dataframe = pd.read_pickle(args.dataset)  # nosec B301
+    dataframe_str = dataframe["input"]
+    print(len(dataframe_str))
+    dataset_list = []
+    token_list = []
+    for data_str in dataframe_str:
+        # print(data_str)
+        data_token = tokenizer.encode(
+            data_str,
+            max_length=1024,
+            truncation=True, add_special_tokens=False)
+        token_list.append(data_token)
+        data_str = tokenizer.decode(data_token)
+        dataset_list.append(data_str)
+
+    ds = Dataset.from_dict({"input_ids": token_list})  # note: unused below; the decoded strings in dataset_list are passed to AutoRound
+    recipe = """
+    quant_stage:
+        quant_modifiers:
+            QuantizationModifier:
+                kv_cache_scheme:
+                    num_bits: 8
+                    type: int
+                    strategy: tensor
+                    dynamic: false
+                    symmetric: true
+    """
+    NUM_CALIBRATION_SAMPLES = 512
+    MAX_SEQUENCE_LENGTH = 1024
+
+    autoround = AutoRound(
+        model,
+        tokenizer,
+        scheme="MXFP4",
+        iters=args.iters,
+        lr=args.lr,
+        minmax_lr=args.minmax_lr,
+        nsamples=len(dataframe_str),
+        seqlen=MAX_SEQUENCE_LENGTH,
+        batch_size=args.batch_size,
+        dataset=dataset_list,
+        enable_torch_compile=False,
+        amp=False,
+        device_map="0",
+        static_kv_dtype="fp8" if args.fp8_kv else None,
+        # device_map="0,1",
+        #device=args.device
+    )
+
+    orig_path = args.model_name
+    packing_format = "llm_compressor"
+    if orig_path.endswith("/"):
+        output_dir = orig_path[:-1] + f"-{packing_format}-w{args.bits}g{args.group_size}-iters{args.iters}"
+    else:
+        output_dir = orig_path + f"-{packing_format}-w{args.bits}g{args.group_size}-iters{args.iters}"
+    if args.fp8_kv:
+        output_dir += "-fp8kv"
+    autoround.quantize_and_save(output_dir, format=f'{packing_format}')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/requirements.txt
new file mode 100644
index 00000000000..bf2e5761647
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/requirements.txt
@@ -0,0 +1,4 @@
+lm-eval==0.4.9.1
+setuptools_scm
+torchao==0.12.0
+triton==3.3.1
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/setup.sh
new file mode 100644
index 00000000000..5966ef95ad7
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/setup.sh
@@ -0,0 +1,18 @@
+pip install -r requirements.txt
+
+mkdir -p /workspace/build_envs
+
+cd /workspace/build_envs
+
+pip install setuptools --upgrade
+pip install packaging --upgrade
+pip install -U "huggingface_hub[cli]"
+git clone -b mxfp4 https://github.com/mengniwang95/vllm-fork.git
+cd vllm-fork
+VLLM_USE_PRECOMPILED=1 pip install . -vvv --no-build-isolation
+cd ..
+
+git clone https://github.com/mlcommons/inference.git mlperf-inference && cd mlperf-inference && cd loadgen && \
+python3 -m pip install .
+
+pip install auto-round==0.9.0
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/simple_autoround.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/simple_autoround.py
new file mode 100644
index 00000000000..0a4e895a946
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/simple_autoround.py
@@ -0,0 +1,23 @@
+
+from auto_round import AutoRound
+
+# Load a model (supports FP8/BF16/FP16/FP32)
+model_name_or_path = "meta-llama/Llama-2-70b-chat-hf"
+#output_dir = "Llama-2-70b-chat-hf-mxfp4-fp8kv"
+output_dir = "Llama-2-70b-chat-hf-mxfp4"
+
+#ar = AutoRound(model_name_or_path, scheme="MXFP4", iters=200, static_kv_dtype="fp8",)
+ar = AutoRound(model_name_or_path, scheme="MXFP4", iters=200)
+
+# Highest accuracy (4–5× slower).
+# `low_gpu_mem_usage=True` saves ~20GB VRAM but runs ~30% slower.
+# ar = AutoRound(model_name_or_path, nsamples=512, iters=1000, low_gpu_mem_usage=True)
+
+# Faster quantization (2–3× speedup) with slight accuracy drop at W4G128.
+# ar = AutoRound(model_name_or_path, nsamples=128, iters=50, lr=5e-3)
+
+# Supported formats: "auto_round" (default), "auto_gptq", "auto_awq", "llm_compressor", "gguf:q4_k_m", etc.
+# ar.quantize_and_save(output_dir="./tmp_autoround", format="llm_compressor")
+ar.quantize_and_save(output_dir=output_dir, format="llm_compressor")
+
+

From 24f55a18d288d770d0d9473a274dfc137fd48e33 Mon Sep 17 00:00:00 2001
From: lkk
Date: Tue, 18 Nov 2025 05:50:40 +0000
Subject: [PATCH 2/6] add inference and evaluate code.

---
 .../mlperf/llama2-70b/SUT.py                  | 960 ++++++++++++++++++
 .../mlperf/llama2-70b/dataset.py              | 148 +++
 .../evaluate-accuracy-cnn.py                  | 140 +++
 .../evaluation_scripts/evaluate-accuracy.py   | 140 +++
 .../mlperf/llama2-70b/main.py                 | 162 +++
 .../mlperf/llama2-70b/run_quantization.py     | 116 +++
 .../mlperf/llama2-70b/user.conf               |  10 +
 .../mlperf/llama2-70b/utils.py                |  71 ++
 .../mlperf/llama2-70b/utils_model.py          | 100 ++
 9 files changed, 1847 insertions(+)
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/dataset.py
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/evaluation_scripts/evaluate-accuracy-cnn.py
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/evaluation_scripts/evaluate-accuracy.py
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/main.py
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/run_quantization.py
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/user.conf
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/utils.py
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/utils_model.py

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py
new file mode 100644
index 00000000000..8af65d15d99
--- /dev/null
+++ 
b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py @@ -0,0 +1,960 @@ +import os +import time +import numpy as np +import math +import array +import torch +from typing import Tuple +from transformers import AutoTokenizer, AutoConfig, AutoModel + +import time +import threading +import torch.multiprocessing as mp +import gc +import types + +import logging + +import mlperf_loadgen as lg +from dataset import Dataset + +import os +from tqdm import tqdm +import sys + +from vllm import LLM +from vllm.inputs import TokensPrompt +from vllm.config import CompilationConfig, CompilationLevel + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("SUT") + +#MAX_NUM_BATCHED_TOKENS=int(os.environ.get("MAX_NUM_BATCHED_TOKENS", 32768)) +MAX_NUM_BATCHED_TOKENS=4096 +MAX_MODEL_LEN=int(os.environ.get("MAX_MODEL_LEN", MAX_NUM_BATCHED_TOKENS)) +PREFILL_BLOCK_SIZE = int(os.environ.get("VLLM_PROMPT_SEQ_BUCKET_STEP", "128")) +PREFILL_MIN_SIZE = int(os.environ.get("VLLM_PROMPT_SEQ_BUCKET_MIN", "128")) +PREFILL_MAX_SIZE = int(os.environ.get("VLLM_PROMPT_SEQ_BUCKET_MAX", "1024")) +SERVER_TIME_LIMIT = float(os.environ.get("SERVER_TIME_LIMIT", "2")) +SERVER_COMPUTE_TIME = float(os.environ.get("SERVER_COMPUTE_TIME", "1")) +BATCHED_PREFILL = os.environ.get("BATCHED_PREFILL", "1")=="1" +MAX_BATCHED_PROMPT_LEN = int(os.environ.get('VLLM_MAX_BATCHED_PROMPT_LEN', sys.maxsize)) +ENABLE_TORCH_COMPILE = int(os.environ.get("ENABLE_TORCH_COMPILE", "0"))==1 + +MODEL_NAME = os.environ.get("MODEL_NAME", "").lower() + +MODEL_INIT_DIR = os.environ.get("MODEL_INIT_DIR", "") +XPU_COUNT = int(os.environ.get("XPU_COUNT", "0")) + +BLOCK_SIZE = 8 +sys.path.append(MODEL_INIT_DIR) + +from utils_model import * + +# Copied from vllm/worker/hpu_model_runner.py +def warmup_range(config: Tuple[int, int, int]): + """Generate a warmup range. + + Start from bmin and multiply by 2 until you reach bstep. + Then, increase the values in the range by the value of bstep until you + reach bmax. + + Example: + bmin = 2, bstep = 32, bmax = 64 + => ramp_up = (2, 4, 8, 16) + => stable = (32, 64) + => return ramp_up + stable => (2, 4, 8, 16, 32, 64) + """ + import itertools + import operator + bmin, bstep, bmax = config + if bmin > bmax: + raise ValueError("Min. batch size cannot be greater than max. " + "batch size. 
If you want to skip warmup, " + "set VLLM_SKIP_WARMUP=true") + base = itertools.repeat(2) + ramp_up_acc = itertools.accumulate(base, func=operator.mul, initial=bmin) + ramp_up_tw = itertools.takewhile(lambda x: x < bstep and x <= bmax, \ + ramp_up_acc) + stable = range(bstep, bmax + 1, bstep) + buckets = list(ramp_up_tw) + list(stable) + return list(filter(lambda bucket: bucket >= bmin, buckets)) + +PREFILL_BUCKETS = warmup_range((PREFILL_MIN_SIZE, PREFILL_BLOCK_SIZE, PREFILL_MAX_SIZE)) + +def len_to_bucket(length): + bucket = 0 + while length>PREFILL_BUCKETS[bucket]: + bucket+=1 + return bucket + +def void(*args, **kwargs): + pass + +# Disable prints if progress bar is active +PBAR = int(os.environ.get("PBAR", "1")) +# Update frequency of the progress bar +PBAR_FREQ = int(os.environ.get("PBAR_FREQ", "10")) +if PBAR==1: + print = void + +class Instance(mp.Process): + def __init__(self,**kwargs): + super(Instance, self).__init__() + for key, value in kwargs.items(): + # Set each keyword argument as an attribute of the instance + setattr(self, key, value) + + if hasattr(self, "lg_settings"): + self.max_ttft_latency = self.lg_settings.ttft_latency/1e9 + self.max_tpot_latency = self.lg_settings.tpot_latency/1e9 + else: + self.max_ttft_latency = 2 + self.max_tpot_latency = 0.2 + self.query_idx_mapping = [] + self.qid_mapping = [] + self.start_time_mapping = [] + self.wait_time_mapping = [] + self.first_time_mapping = [] + self.first_token_id = [] + self.finished = False + + self.tpot_list = [] + self.tprefill_list = [] + self.nprefill_list = [] + + + # Extend/override Instance methods if it's defined + try: + self.instance_override = types.MethodType(INSTANCE_OVERRIDE, self) + except NameError: + pass + else: + self.instance_override() + + def do_warmup(self): + dummy_input = torch.zeros((128,), dtype=torch.int32) + inputs = TokensPrompt(prompt_token_ids=dummy_input.tolist()) + from vllm import SamplingParams + sampling_params = SamplingParams( + max_tokens=8, + min_tokens=8, + temperature=0.0, + detokenize=False + ) + # Using a large value for request_id to avoid collisions + for i in range(1): + self.model.llm_engine.add_request(str(int(1e8)), + inputs, + sampling_params) + + while self.model.llm_engine.has_unfinished_requests(): + step_outputs = self.model.llm_engine.step() + for output in step_outputs: + request_id = int(output.request_id) + if output.finished: + token_ids = output.outputs[0].token_ids + log.info(f"[{time.time():.3f}] Warmup finished, " + f"Rank {self.rank}, " + f"Generated {len(token_ids)} tokens") + + with self.cond_var: + self.alive_counter.value += 1 + self.cond_var.notify() + + def step_engine_prompt(self, *args): + self.step_engine(*args) + + def run(self): + gc.disable() + + # Select correct XPU device + if XPU_COUNT>0: + os.environ["ONEAPI_DEVICE_SELECTOR"]=f'level_zero:{self.xpu_devices}' + pass + + self.load_dataset() + + log.info("Loading Model") + self.load_model() + + self.sampling_params = SAMPLING_PARAMS + + self.warmup_req_idx = set([]) + if self.warmup: + self.do_warmup() + else: + with self.cond_var: + self.alive_counter.value += 1 + self.cond_var.notify() + + keep_alive = True + self.req_counter = 0 # Request counter + + # Running seq_len array + self.running_seq_len = np.zeros(self.batch_size) + # Map ind within a batch to request id + self.running_ind_to_id = np.zeros(self.batch_size) + + while keep_alive: + keep_alive = self.process_queries() + + # V1 shutdown engine + self.model.llm_engine.engine_core.shutdown() + + with self.cond_var: + 
self.alive_counter.value -= 1 + # The instance needs to get restarted + if self.alive_counter.value == 0: + pass + else: + self.input_queue.put(None) + self.cond_var.notify() + + def load_model(self): + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_inductor=True, + # compile_sizes=[64,128,256], + ) + + model_kwargs={ + "model": self.model_path, + #"dtype": 'float16', + "dtype": "auto", + "skip_tokenizer_init": False, + "tensor_parallel_size": self.tp, + "pipeline_parallel_size": self.pp, + "max_num_seqs": self.batch_size, + "max_model_len": MAX_MODEL_LEN, + # "enforce_eager": True, + #"max_num_batched_tokens": MAX_NUM_BATCHED_TOKENS, + #"enable_prefix_caching": False, + #"distributed_executor_backend": "spawn", + # "calculate_kv_scales": True, + } + + #if XPU_COUNT>0: + # model_kwargs["kv_cache_dtype"] = "int8" + #else: + # model_kwargs["kv_cache_dtype"] = "fp8_e5m2" + # model_kwargs["kv_cache_dtype"] = "fp8" + if ENABLE_TORCH_COMPILE: + model_kwargs["compilation_config"] = compilation_config + if self.quantized: + pass + + log.info(model_kwargs) + # Model-specific params + #model_kwargs = model_kwargs | ADDITIONAL_MODEL_KWARGS + + self.model = LLM(**model_kwargs) + + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path + ) + + self.tokenizer.pad_token = self.tokenizer.eos_token + + def load_dataset(self): + if "llama3_1-8b" in self.workload_name: + self.data_object = Dataset(model_name=self.model_path, + dataset_path=self.dataset_path, + total_sample_count=self.total_sample_count) + else: + self.data_object = Dataset(self.model_path, + dataset_path=self.dataset_path, + total_sample_count=self.total_sample_count, + ) + #device="cpu") + + def step_engine( + self, + pred_output_tokens, + query_ids, + qid, + pred_first_tokens, + query_ids_first, + qid_first, + ): + if self.model.llm_engine.has_unfinished_requests(): + if len(self.first_token_id)>0: + end = time.time() + step_outputs = self.model.llm_engine.step() + if len(self.first_token_id)>0: + step_time = time.time()-end + for output in step_outputs: + request_id = int(output.request_id) + + if request_id in self.warmup_req_idx: # Skip requests that are from the warmup + continue + + if len(self.first_token_id) > 0: + # Add step time to everything other than the one being prefilled + if not PBAR: + for i in range(len(self.tprefill_list)-len(self.first_token_id)): + self.tprefill_list[i] += step_time + self.nprefill_list[i] += len(self.first_token_id) + + if output.finished: + token_ids = output.outputs[0].token_ids + pred_output_tokens.append(token_ids) + query_ids.append(self.query_idx_mapping[request_id]) + qid.append(self.qid_mapping[request_id]) + + finish_time = time.time() + time_elapsed = finish_time - self.start_time_mapping[request_id] + ttft = self.first_time_mapping[request_id] + wait = self.wait_time_mapping[request_id] + # Rest of the tokens + tpot = (time_elapsed - ttft)/(len(token_ids)-1) + with self.cond_var: + if ttft+wait >= self.max_ttft_latency: + self.over_ttft_counter.value += 1 + if tpot >= self.max_tpot_latency: + self.over_tpot_counter.value += 1 + over_ttft_count = self.over_ttft_counter.value + over_tpot_count = self.over_tpot_counter.value + self.cond_var.notify() + + if not PBAR: + self.tpot_list.append(tpot) + tpot_sorted = np.sort(self.tpot_list) + tpot_mean = np.mean(self.tpot_list) + print(f"[{time.time():.3f}] Query {request_id} finished after {time_elapsed:.3f}s, " + f"Rank {self.rank}, " + f"TTFT {ttft:.3f}s ({over_ttft_count}), " + f"TPOT {tpot:.3f}s 
({over_tpot_count}), " + f"prefill time {self.tprefill_list[request_id]:.3f}s, " + f"prefill count {self.nprefill_list[request_id]}, " + f"wait time {wait:.3f}s, " + f"bs_c {self.model.llm_engine.get_num_unfinished_requests()}, " + f"p99 tpot {tpot_sorted[int(0.99*len(tpot_sorted))]:.3f}s, " + f"p50 tpot {tpot_mean:.3f}s, " + f"generated {len(token_ids)} tokens") + + if request_id in self.first_token_id: + self.first_time_mapping[request_id] = time.time()-self.start_time_mapping[request_id] + if self.server: + token_ids = output.outputs[0].token_ids + pred_first_tokens.append(token_ids) + query_ids_first.append(self.query_idx_mapping[request_id]) + qid_first.append(self.qid_mapping[request_id]) + self.first_token_id.remove(request_id) + + def fetch_queries(self): + # For chunked prefill + # samples_to_fill = self.batch_size + self.prefill_batch_size - self.model.llm_engine.get_num_unfinished_requests() + samples_to_fill = self.batch_size - self.model.llm_engine.get_num_unfinished_requests() + + add_new_qitem = samples_to_fill>=self.prefill_batch_size + return_value = True + qitem = -1 + + if self.finished: + add_new_qitem = False + if not self.model.llm_engine.has_unfinished_requests(): + print(f"Rank {self.rank} setting return_value to False") + return_value = False + + # Receive request if there is an empty decode slot + qitem_list = [] + added_new_queries = 0 + + # Ensure that after insertion, the running batch size doesn't exceed max batch size + while add_new_qitem and added_new_queries+self.prefill_batch_size<=samples_to_fill: + try: + # Continue decoding if nothing in queue + qitem = self.input_queue.get(False) + except Exception: + qitem = -1 + else: + pass + + if qitem is None: + self.finished = True + add_new_qitem = False + elif type(qitem) == int: + add_new_qitem = False + else: + qitem_list.append(qitem) + added_new_queries += len(qitem[0]) + # In case main thread cannot keep up + time.sleep(0.01) + return qitem_list, return_value + + def run_one_step(self, qitem_list): + pred_output_tokens = [] + query_ids = [] + qid = [] + pred_first_tokens = [] + query_ids_first = [] + qid_first = [] + step_engine_args = ( + pred_output_tokens, + query_ids, + qid, + pred_first_tokens, + query_ids_first, + qid_first + ) + + if len(qitem_list)>0: + + for qitem in qitem_list: + qitem, start_time = qitem + # Construct / collate batch + + input_ids_tensor = [] + # q = qitem[0] # All qitems are of length 1 + for i,q in enumerate(qitem): + sampling_params = self.sampling_params + if "llama3_1-8b" in self.workload_name: + input_ids_tensor = [self.data_object.input_ids[q.index]] + else: + input_ids_tensor = self.data_object.input_ids[q.index].tolist() + + self.first_token_id.append(len(self.query_idx_mapping)) + self.wait_time_mapping.append(time.time()-start_time[i]) + self.query_idx_mapping.append(q.index) + self.qid_mapping.append(q.id) + self.start_time_mapping.append(time.time()) + self.first_time_mapping.append(0) + self.tprefill_list.append(0) + self.nprefill_list.append(0) + + for prompt_id in input_ids_tensor: + if isinstance(prompt_id, torch.Tensor): + prompt_id = prompt_id.tolist() + inputs = TokensPrompt(prompt_token_ids=prompt_id)#.tolist()) + + self.model.llm_engine.add_request(str(self.req_counter), + inputs, + sampling_params) + + self.req_counter += 1 + + # Find the ind that this request can be inserted + batch_ind = np.argmax(self.running_ind_to_id==0) + self.running_seq_len[batch_ind] = self.data_object.input_lens[q.index]+1 + self.running_ind_to_id[batch_ind] = q.id + # 
print(f"adding {q.id}. Mapped to {batch_ind}") + + print(f"Step prompt, req_counter {self.req_counter}") + self.step_engine_prompt(*step_engine_args) + elif self.model.llm_engine.has_unfinished_requests(): + self.step_engine(*step_engine_args) + self.running_seq_len = self.running_seq_len + 1 + else: + # Avoid excessive empty steps + time.sleep(0.05) + return pred_output_tokens, query_ids, qid, pred_first_tokens, query_ids_first, qid_first + + def process_step_outputs(self, step_outputs): + ( + pred_output_tokens, + query_ids, + qid, + pred_first_tokens, + query_ids_first, + qid_first + ) = step_outputs + # Process first tokens + if len(pred_first_tokens)>0: + processed_first = [] + first_time = time.time() + for pred_first_token in pred_first_tokens: + processed_first.append(np.array(pred_first_token).reshape(-1)) + self.first_queue.put((qid_first, processed_first, first_time)) + + # Process finished requests + if len(pred_output_tokens)>0: + processed_output = self.data_object.postProcess(pred_output_tokens, + input_seq_lens=0, + query_id_list=query_ids) + + # Zero-out corresponding entries + for qid_c in qid: + ind_c = np.argmax(self.running_ind_to_id==qid_c) + # print(f"zeroing out {qid_c}. Mapped to {ind_c}") + self.running_ind_to_id[ind_c] = 0 + self.running_seq_len[ind_c] = 0 + + self.output_queue.put((qid,processed_output)) + + with self.cond_var: + self.sample_counter.value += len(query_ids) + print(f"Samples run: {self.sample_counter.value}, rank {self.rank}, current_count {self.current_counter.value}") + self.current_counter.value -= len(query_ids) + self.cond_var.notify() + + # Update number of blocks + ind_valid = self.running_ind_to_id > 0 + # Don't update when idle + if np.sum(ind_valid)>0: + with self.cond_var: + self.blocks_counter.value = int(np.sum((self.running_seq_len[ind_valid]+BLOCK_SIZE-1)//BLOCK_SIZE)) + # print(f"Rank {self.rank} updated blocks_counter {self.blocks_counter.value}") + self.cond_var.notify() + + def process_queries(self): + # Attempt to fetch queries from input queue + qitem_list, return_value = self.fetch_queries() + # Run inference (prefill/decode) for one step + step_outputs = self.run_one_step(qitem_list) + # Process (finished) step outputs + self.process_step_outputs(step_outputs) + return return_value + +def find_best_bucket(query_lists, prefill_tokens, input_len): + query_lists_new = query_lists + [0] + query_lists_new = (np.array(query_lists_new)+input_len)%(prefill_tokens+1) + # Try to fill the bucket with the minimum number of total tokens + best_ind = np.argmax(query_lists_new) + ind_full = query_lists_new>(prefill_tokens-64) + if ind_full.any(): + best_ind_full = np.argmax(ind_full) + else: + best_ind_full = len(query_lists_new) + best_ind = min(best_ind, best_ind_full) + return best_ind + +class SUT(): + def __init__(self, + model_path=None, + workload_name="llama2-70b", + lg_settings=None, + scenario="offline", + dtype="bfloat16", + device="cpu", + batch_size=None, + total_sample_count=24576, + dataset_path=None, + use_cached_outputs=False, # Set this to True *only for test accuracy runs* in case your prior session was killed partway through + workers=1, + tp=1, + pp=1, + quantized=False, + warmup=False, + partial_output_dir_list=[]): + if XPU_COUNT>0: # XPU needs spawn method. 
Can be problematic when using INSTANCE_OVERRIDE + try: + mp.set_start_method('spawn') + except RuntimeError: + print("Start method can only be set once") + + # If ONEAPI_DEVICE_SELECTOR is set outside, use that to limit device selection within instances + preset_device_selector = os.environ.get("ONEAPI_DEVICE_SELECTOR", "") + if preset_device_selector=="": + self.xpu_devices = range(XPU_COUNT) + else: + self.xpu_devices = preset_device_selector.split("level_zero:")[1].split(",") + self.xpu_devices = [str(d) for d in self.xpu_devices] + + self.tp = tp + self.pp = pp + + self.model_path = model_path or "meta-llama/Llama-3.1-70B-Instruct" + self.workload_name = workload_name + self.lg_settings=lg_settings + self.scenario = scenario + self.device = device + + if not batch_size: + if device == "cpu": + batch_size = 1 + else: + batch_size = 32 # Reduce to 8 if using 4 GPUs, 16 for 8. + self.batch_size = batch_size + self.total_sample_count = total_sample_count + #self.tp = tp + self.quantized=quantized + self.warmup = warmup + + # dtype + if dtype == 'bfloat16': + self.amp_enabled = True + self.amp_dtype = torch.bfloat16 + elif dtype == 'float16': + self.amp_enabled = True + self.amp_dtype = torch.float16 + else: + self.amp_enabled = False + self.amp_dtype = torch.float32 + + self.dataset_path = dataset_path + self.qsl = lg.ConstructQSL(self.total_sample_count, self.total_sample_count, + self.LoadSamplesToRam, self.UnloadSamplesFromRam) + + self.num_workers = workers + self.worker_threads = [None] * self.num_workers + self.query_queue_list = [mp.JoinableQueue() for _ in range(self.num_workers)] + self.query_queue_int = mp.Queue() + self.output_queue = mp.Queue() + self.alive_counter = mp.Value("i", 0) + self.dead_counter = mp.Value("i", 0) + self.cond_var = mp.Condition(lock=mp.Lock()) + + self.use_cached_outputs = use_cached_outputs + self.sample_counter = mp.Value("i", 0) + self.current_counter_list = [mp.Value("i", 0) for _ in range(self.num_workers)] + self.blocks_counter_list = [mp.Value("i", 0) for _ in range(self.num_workers)] + self.over_ttft_counter = mp.Value("i", 0) + self.over_tpot_counter = mp.Value("i", 0) + + # TODO: CHANGE THIS + if BATCHED_PREFILL: + # Adjust for chunked prefill + self.prefill_batch_size = 1 + else: + self.prefill_batch_size = 1 + + self.progress = None + self.tp_sizes = [] + self.load_dataset() # In case SUT may balance load using query lens + self.core_lists = [[]]*self.num_workers + self.completed_queries = self.find_completed_queries(partial_output_dir_list) + self.first_token_queue = None + + # Shared worker kwargs. These will be passed as is to the workers + self.shared_worker_kwargs={ + "model_path": self.model_path, + "workload_name": self.workload_name, + "num_workers": self.num_workers, + # "lg_settings": self.lg_settings, # issue with spawn + "dataset_path": self.dataset_path, + #"device": self.device, + "batch_size": self.batch_size, + "total_sample_count": self.total_sample_count, + "dtype": self.amp_dtype, + "output_queue": self.output_queue, + "first_queue": self.first_token_queue, + "cond_var": self.cond_var, + "alive_counter": self.alive_counter, + "dead_counter": self.dead_counter, + "sample_counter": self.sample_counter, + "over_ttft_counter": self.over_ttft_counter, + "over_tpot_counter": self.over_tpot_counter, + "server": self.scenario.lower()=="server", + "tp": self.tp, + "pp": self.pp, + "quantized": self.quantized, + "warmup": self.warmup, + "prefill_batch_size": self.prefill_batch_size, + } + + # Individual worker kwargs. 
These will be lists of size self.num_workers. i-th worker will get i-th element of each list + self.individual_worker_kwargs={ + "input_queue": self.query_queue_list, + "current_counter": self.current_counter_list, + "blocks_counter": self.blocks_counter_list, + "rank": list(range(self.num_workers)) + } + + if XPU_COUNT>0: + world_size = self.tp * self.pp + self.individual_worker_kwargs["xpu_devices"]=[",".join(self.xpu_devices[j*world_size:(j+1)*world_size]) for j in range(self.num_workers)] + + # Extend/override SUT method if it's defined + try: + self.sut_override = types.MethodType(SUT_OVERRIDE, self) + except NameError: + pass + else: + self.sut_override() + + def load_dataset(self): + if self.workload_name=="llama3_1-8b": + self.data_object = Dataset(model_name=self.model_path, + dataset_path=self.dataset_path, + total_sample_count=self.total_sample_count) + else: + self.data_object = Dataset(self.model_path, + dataset_path=self.dataset_path, + total_sample_count=self.total_sample_count, + ) + #device="cpu") + + def find_completed_queries(self, partial_output_dir_list): + import json + completed_queries = set() + for directory in partial_output_dir_list: + with open(os.path.join(directory,"mlperf_log_accuracy.json"), "r", encoding="UTF-8") as f: + lines = f.readlines() + combined_line = "" + for i,line in enumerate(lines): + if i==len(lines)-1: + if line[-1]!="]": + line += "]" + combined_line += line + log.info(f"End of combined_line {combined_line[-10:]}") + results = json.loads(combined_line) + for pred in results: + qsl_idx = pred["qsl_idx"] + if qsl_idx not in completed_queries: + completed_queries.add(qsl_idx) + log.info(f"Completed_queries {completed_queries}, partial list {partial_output_dir_list}") + return completed_queries + + + def LoadSamplesToRam(self, query_samples): + pass + + def UnloadSamplesFromRam(self, query_samples): + pass + + def start(self): + cur_copies = self.num_workers + # Create worker threads + for j in range(self.num_workers): + + # Build kwargs for inidividual workers + private_worker_kwargs = {key_i: self.individual_worker_kwargs[key_i][j] for key_i in self.individual_worker_kwargs} + private_worker_kwargs = private_worker_kwargs | self.shared_worker_kwargs + + worker = Instance(**private_worker_kwargs) + worker.start() + self.worker_threads[j] = worker + # Prevent host OOM + if (j%cur_copies==cur_copies-1): + # For some nodes it takes a long time to load the checkpoint so all instance weight load overlap, causing memory stress + with self.cond_var: + self.cond_var.wait_for(lambda: self.alive_counter.value >= self.num_workers//2, 120) + + with self.cond_var: + print(f"Waiting for alive_counter to be equal to {self.num_workers}") + self.cond_var.wait_for(lambda: self.alive_counter.value == self.num_workers) + + log.info(f"Starting internal issue query thread") + self.query_thread = threading.Thread(target=self.issue_queries_int_merged) + self.query_thread.daemon = True + self.query_thread.start() + + log.info(f"Starting Loadgen response thread") + self.response_thread = threading.Thread(target=self.response_loadgen) + self.response_thread.daemon = True + self.response_thread.start() + + def stop(self): + self.output_queue.put(None) + + for worker in self.worker_threads: + worker.join() + + if self.scenario=="server": + with self.cond_var: + log.info(f"[{time.time():.3f}] Test finished " + f"TTFT {self.over_ttft_counter.value} " + f"TPOT {self.over_tpot_counter.value} ") + try: + log.info(f"Max first time {np.max(self.first_time_list):.6f}s") + 
log.info(f"Average first time {np.mean(self.first_time_list):.6f}s") + except (AttributeError, ValueError): + pass + + self.cond_var.notify() + if PBAR: + # Ensure the last progress bar is updated + time.sleep(0.5) + self.progress.close() + + def update_pbar(self, tok_count, last_count, update_value): + postfix_str = f"{tok_count/self.progress.format_dict['elapsed']:.1f}toks/s" + self.progress.set_postfix_str(postfix_str, refresh=False) + self.progress.update(update_value) + self.progress.refresh() + last_count += update_value + return last_count + + def response_loadgen(self): + keep_alive = True + processed_count = 0 + last_count = 0 + tok_count = 0 + while keep_alive: + result = self.output_queue.get() + if result is None: + keep_alive = False + else: + qid, processed_output = result + + for i in range(len(processed_output)): + n_tokens = processed_output[i].shape[0] + response_array = array.array("B", processed_output[i].tobytes()) + bi = response_array.buffer_info() + response = [lg.QuerySampleResponse(qid[i], bi[0], bi[1], n_tokens)] + lg.QuerySamplesComplete(response) + processed_count += 1 + tok_count += n_tokens + + if PBAR: + # Limit prints to once every PBAR_FREQ completed samples + if processed_count - last_count >= PBAR_FREQ: + last_count = self.update_pbar(tok_count, last_count, PBAR_FREQ) + + # Update progress bar at the end + if PBAR and (processed_count > last_count): + last_count = self.update_pbar(tok_count, last_count, processed_count-last_count) + + def get_sut(self): + self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) + return self.sut + + def get_qsl(self): + return self.qsl + + def predict(self,**kwargs): + raise NotImplementedError + + def get_best_rank(self, value_added): + current_counters = np.array([(self.current_counter_list[i].value+value_added) for i in range(self.num_workers)]) # Instances priority will be ordered by their respective in-flight queries + target_rank = np.argmin(current_counters) + # if current_counters[target_rank]>self.batch_size: + # return -1 + # Instead of trying to only fill up to batch size, allow one queued up + if self.query_queue_list[target_rank].qsize()>0: + return -1 + else: + return target_rank + + def issue_queries(self, query_samples): + """ Receives samples from loadgen and adds them to queue. 
Users may choose to batch here""" + start_time = time.time() + if PBAR and (self.progress is None): + if len(query_samples)>1: + self.progress = tqdm(total=len(query_samples), smoothing=0.0) + else: + self.progress = tqdm(smoothing=0.0) + + query_sample_list = [] + for query_sample in query_samples: + # Continuous batching + self.query_queue_int.put((query_sample, start_time)) + + def try_dispatch( + self, + query_list, + delete_list, + bucket, + input_len_bucket, + time_start_bucket, + server=False): # Inactive for now + target_rank = -1 + wait_to_dispatch = True + while wait_to_dispatch: + with self.cond_var: + target_rank = self.get_best_rank(1) # With prepacking, prefill batch size is always 1 + if target_rank!=-1: + self.current_counter_list[target_rank].value += 1 + # Always wait for an instance with an empty slot(for now) + if target_rank==-1: + time.sleep(0.01) + else: + wait_to_dispatch = False + if target_rank!=-1: + print(f"Sending to rank {target_rank}, num_queries {len(query_list)}, before add {self.current_counter_list[target_rank].value}") + self.query_queue_list[target_rank].put((query_list, time_start_bucket)) + delete_list.append(bucket) + return True + return False + + # Load balancer for merged prefill + def issue_queries_int_merged(self): + keep_alive = True + + query_list = [] + time_start_list = [] + + # TODO: use the actual latency target instead of the hard-coded numbers + time_left = 9999# if self.scenario=="offline" else SERVER_TIME_LIMIT + #time_compute = SERVER_COMPUTE_TIME + time_compute = 0 + + while keep_alive: + new_query = False + try: + query = self.query_queue_int.get(timeout=0.05) + except Exception: + # No query available within timeout, continue + pass + else: + if query is None: + keep_alive = False + else: + query, start_time = query + start_time_c = start_time + new_query = True + + if new_query: + time_start_list.append(start_time_c) + query_list.append(query) + + if len(query_list)>0: + time_wait = time.time()-time_start_list[0] + time_needed = time_wait + time_compute + # When to dispatch + # 1. Time limit passed + # 2. Reached prefill batch size + # 3. 
End of inference + if (time_needed > time_left) or (len(query_list)==self.prefill_batch_size) or (not keep_alive): + target_rank = -1 + # Continue trying to dispatch until there is an empty spot + while target_rank==-1: + with self.cond_var: + target_rank = self.get_best_rank(len(query_list)) + if target_rank!=-1: + self.current_counter_list[target_rank].value += len(query_list) + if target_rank==-1: + time.sleep(0.01) + print(f"Sending to rank {target_rank}, length {len(query_list)}, wait_time {time_wait:.2f}s") + self.query_queue_list[target_rank].put((query_list, time_start_list)) + query_list = [] + time_start_list = [] + + for i in range(self.num_workers): + print("Putting none in", i) + self.query_queue_list[i].put(None) + + def flush_queries(self): + self.query_queue_int.put(None) + pass + + def __del__(self): + pass + + +class SUTServer(SUT): + def __init__(self, **kwargs): + kwargs["scenario"]="server" + super().__init__(**kwargs) + self.first_token_queue = mp.Queue() + self.shared_worker_kwargs["first_queue"]=self.first_token_queue + self.first_time_list = [] + + def start(self): + super().start() + # Create first token response thread + log.info(f"Starting first-token response thread") + self.ft_response_thread = threading.Thread(target=self.process_first_tokens) + self.ft_response_thread.daemon = True + self.ft_response_thread.start() + + def process_first_tokens(self): + while True: + first_token_item = self.first_token_queue.get() + + if first_token_item is None: + log.info("Exiting First token response thread") + break + + qid, processed_output, send_time = first_token_item + + for i in range(len(processed_output)): + response_data = array.array("B", processed_output[i].tobytes()) + bi = response_data.buffer_info() + response = [lg.QuerySampleResponse(qid[i], bi[0], bi[1])] + lg.FirstTokenComplete(response) + receive_time = time.time()-send_time + self.first_time_list.append(receive_time) + + diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/dataset.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/dataset.py new file mode 100644 index 00000000000..bc66165a0cc --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/dataset.py @@ -0,0 +1,148 @@ +import os +import time +import numpy as np +import torch +from datasets import load_dataset, load_from_disk +from transformers import AutoModelForCausalLM, AutoTokenizer +from torch.nn.functional import pad +from torch.utils.data import DataLoader +from typing import Optional, Dict, Sequence +import io +#import utils +import copy + +import logging +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("Llama-70B-Dataset") + +import random + +class Dataset(): + def __init__(self, model_name=None, total_sample_count=24576, perf_count_override=None, dataset_path=None, device="cpu"): + self.model_name = model_name or "meta-llama/Llama-2-70b-chat-hf" + self.dataset_path = dataset_path + self.max_length = 1024 + self.device = device + + #self.total_sample_count = total_sample_count + + self.load_tokenizer() + self.load_processed_dataset() + + self.total_sample_count = min(len(self.input_ids), total_sample_count) + self.perf_count = perf_count_override or self.total_sample_count + + def load_tokenizer(self): + """ Returns tokenizer """ + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_name) + + self.tokenizer.pad_token = self.tokenizer.eos_token + + def load_processed_dataset(self): + if not 
os.path.isfile(self.dataset_path): + log.warn("Processed pickle file {} not found. Please check that the path is correct".format(self.dataset_path)) + + if "Llama-3" in self.model_name and "orca" not in self.dataset_path: + import pandas as pd + + self.processed_data = pd.read_json(self.dataset_path) + + self.input = self.processed_data.input.tolist() + self.input_ids = self.processed_data.tok_input.tolist() + self.input_lens = [len(x) for x in self.input_ids] + self.targets = self.processed_data.output.tolist() + + del self.processed_data + return + + import pandas as pd + # Note: Using pickle with trusted dataset files only + # In production, consider using safer serialization formats like JSON or HDF5 + processed_data = pd.read_pickle(self.dataset_path) # nosec B301 + # input_tokens = processed_data['tok_input'] + + if ("405b" in self.dataset_path) and ("llama" not in self.model_name): + # Running 405b dataset with a different model + # Tokenize the dataset instead of using the tokenized dataset + input_strs = processed_data['input'] + encode_bs = 128 + self.input_ids = [] + self.input_lens = [] + self.attention_masks = [] + input_strs_batch = [] + for i,strs in enumerate(input_strs): + # if i%10==9: + # print(i) + input_strs_batch.append(strs) + if len(input_strs_batch)>=encode_bs: + input_ids_batch = self.tokenizer.batch_encode_plus(input_strs_batch)['input_ids'] + for input_ids in input_ids_batch: + input_ids = torch.tensor([input_ids]) + attn_mask = torch.ones_like(input_ids) + self.input_ids.append(input_ids) + self.attention_masks.append(attn_mask) + self.input_lens.append(input_ids.shape[-1]) + input_strs_batch = [] + # exit() + if len(input_strs_batch)>0: + input_ids_batch = self.tokenizer.batch_encode_plus(input_strs_batch)['input_ids'] + for input_ids in input_ids_batch: + input_ids = torch.tensor([input_ids]) + attn_mask = torch.ones_like(input_ids) + self.input_ids.append(input_ids) + self.attention_masks.append(attn_mask) + self.input_lens.append(input_ids.shape[-1]) + input_strs_batch = [] + elif ("orca" in self.dataset_path) and ("llama2" not in self.model_name.lower()): + user_prompts = processed_data['question'] + system_prompts = processed_data['system_prompt'] + + self.input_ids = [] + self.input_lens = [] + self.attention_masks = [] + + for i in range(len(processed_data)): + user_prompt = user_prompts.iloc[i] + system_prompt = system_prompts.iloc[i] + message = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ] + input_ids = self.tokenizer.apply_chat_template(message, add_generation_prompt=True) + input_ids = torch.tensor(input_ids, dtype=torch.int32).view(1,-1).to(self.device) + attn_mask = torch.ones_like(input_ids) + self.input_ids.append(input_ids) + self.attention_masks.append(attn_mask) + self.input_lens.append(input_ids.shape[-1]) + else: + input_tokens = processed_data['tok_input'] + self.input_ids = [] + self.input_lens = [] + self.attention_masks = [] + + for ids in input_tokens: + input_ids = torch.tensor(ids, dtype=torch.int32).view(1,-1).to(self.device) + attn_mask = torch.ones_like(input_ids) + self.input_ids.append(input_ids) + self.attention_masks.append(attn_mask) + self.input_lens.append(input_ids.shape[-1]) + + def postProcess(self, out_tokens, input_seq_lens=None, query_id_list=None, sample_index_list=None): + """ Postprocesses output prediction """ + + output_seqs = [] + for i,out_token in enumerate(out_tokens): + output_seq = np.array(out_token).reshape(-1) + output_seqs.append(output_seq) + + return 
output_seqs + + def LoadSamplesToRam(self, sample_list): + pass + + def UnloadSamplesFromRam(self, sample_list): + pass + + def __del__(self): + pass diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/evaluation_scripts/evaluate-accuracy-cnn.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/evaluation_scripts/evaluate-accuracy-cnn.py new file mode 100644 index 00000000000..0731818808b --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/evaluation_scripts/evaluate-accuracy-cnn.py @@ -0,0 +1,140 @@ +import os +import time +import numpy as np +import json +import nltk +import array +import torch +from torch.nn.functional import pad +from torch.utils.data import DataLoader +import evaluate +import argparse +import nltk +from transformers import AutoModelForCausalLM, AutoTokenizer +from pathlib import Path +import sys + +file = Path(__file__).resolve() +parent, root = file.parent, file.parents[1] +sys.path.append(str(root)) + +from dataset import Dataset + +def get_args(): + """Parse commandline.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json" + ) + parser.add_argument( + "--dataset-file", + required=True, + help="path to cnn_eval.json") + parser.add_argument( + "--verbose", + action="store_true", + help="verbose messages") + parser.add_argument( + "--dtype", + default="int64", + help="dtype of the accuracy log", + choices=["int32", "int64"], + ) + parser.add_argument( + "--checkpoint-path", + default="meta-llama/Meta-Llama-3.1-8B-Instruct", + help="Model name") + parser.add_argument( + "--total-sample-count", + default=13368, + type=int, + help="Model name") + args = parser.parse_args() + return args + + +def postprocess_text(preds, targets): + preds = [pred.strip() for pred in preds] + targets = [target.strip() for target in targets] + + # rougeLSum expects newline after each sentence + preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] + targets = ["\n".join(nltk.sent_tokenize(target)) for target in targets] + + return preds, targets + + +def main(): + + args = get_args() + model_name = args.checkpoint_path + dataset_path = args.dataset_file + total_sample_count = args.total_sample_count + metric = evaluate.load("rouge") + nltk.download("punkt") + nltk.download('punkt_tab') + + tokenizer = AutoTokenizer.from_pretrained( + model_name, + model_max_length=2048, + padding_side="left", + use_fast=True, + ) + tokenizer.pad_token = tokenizer.eos_token + data_object = Dataset( + model_name=model_name, + dataset_path=dataset_path, + total_sample_count=total_sample_count, + ) + + targets = data_object.targets + + with open(args.mlperf_accuracy_file, "r") as f: + results = json.load(f) + + # Deduplicate the results loaded from the json + dedup_results = [] + seen = set() + for result in results: + item = result["qsl_idx"] + if item not in seen: + seen.add(item) + dedup_results.append(result) + results = dedup_results + + target_required = [] + preds_token_ids = [] + + eval_dtype = np.int64 + if args.dtype == "int32": + eval_dtype = np.int32 + + for pred in results: + qsl_idx = pred["qsl_idx"] + target = targets[qsl_idx] + target_required.append(target) + preds_token_ids.append( + np.frombuffer( + bytes.fromhex( + pred["data"]), + eval_dtype)) + + preds_decoded_text = tokenizer.batch_decode( + preds_token_ids, skip_special_tokens=True + ) + + preds, targets = 
postprocess_text(preds_decoded_text, target_required) + + result = metric.compute( + predictions=preds, references=targets, use_stemmer=True, use_aggregator=False + ) + result = {k: f"{round(np.mean(v) * 100, 4)}" for k, v in result.items()} + prediction_lens = [len(pred) for pred in preds] + result["gen_len"] = np.sum(prediction_lens) + result["gen_num"] = len(preds) + print("\nResults\n") + print(result) + + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/evaluation_scripts/evaluate-accuracy.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/evaluation_scripts/evaluate-accuracy.py new file mode 100644 index 00000000000..9efb9bd7763 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/evaluation_scripts/evaluate-accuracy.py @@ -0,0 +1,140 @@ +import argparse +from transformers import AutoTokenizer +import nltk +import evaluate +import numpy as np +import json + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--checkpoint-path", required=True, help="Path to Llama2-70b-hf-chat checkpoint" + ) + parser.add_argument( + "--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json" + ) + parser.add_argument( + "--dataset-file", + required=True, + help="path to processed openorca validation set", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="verbose messages") + parser.add_argument( + "--dtype", + default="int64", + help="dtype of the accuracy log", + choices=["int32", "int64", "float"], + ) + parser.add_argument("--estimate-performance", + action='store_true', + help="Estimate performance using mlperf_log_detail.txt" + ) + parser.add_argument("--mlperf-log-detail", + help="Path to mlperf_log_detail.txt. 
Only used if --estimate-performance is true" + ) + args = parser.parse_args() + return args + +def get_groundtruth(processed_dataset_file): + import pandas as pd + + # Note: Using pickle with trusted dataset files only + # In production, consider using safer serialization formats like JSON or HDF5 + data = pd.read_pickle(processed_dataset_file) # nosec B301 + ground_truths = data["output"] + return ground_truths + + +def postprocess_text(preds, targets): + preds = [pred.strip() for pred in preds] + targets = [target.strip() for target in targets] + + # rougeLSum expects newline after each sentence + preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] + targets = ["\n".join(nltk.sent_tokenize(target)) for target in targets] + + return preds, targets + + +def main(): + + args = get_args() + dataset_path = args.dataset_file + checkpoint_path = args.checkpoint_path + metric = evaluate.load("rouge") + nltk.download("punkt") + nltk.download("punkt_tab") + + tokenizer = AutoTokenizer.from_pretrained( + checkpoint_path, + # model_max_length=2048, + # padding_side="left", + # use_fast=False, + ) + + targets = get_groundtruth(args.dataset_file) + + target_required = [] + preds_token_ids = [] + + eval_dtype = np.int64 + if args.dtype == "int32": + eval_dtype = np.int32 + elif args.dtype == "float": + eval_dtype = np.float32 + + with open(args.mlperf_accuracy_file, "r") as f: + results = json.load(f) + + seen = set() + gen_tok_len = 0 + for pred in results: + qsl_idx = pred["qsl_idx"] + if qsl_idx in seen: + continue + + seen.add(qsl_idx) + target = targets[qsl_idx] + target_required.append(target) + pred = np.frombuffer(bytes.fromhex(pred["data"]), eval_dtype) + + gen_tok_len += len(pred) + preds_token_ids.append(pred) + + preds_decoded_text = tokenizer.batch_decode( + preds_token_ids, skip_special_tokens=True + ) + + preds, targets = postprocess_text(preds_decoded_text, target_required) + + result = metric.compute( + predictions=preds, references=targets, use_stemmer=True, use_aggregator=False + ) + result = {k: f"{round(np.mean(v) * 100, 4)}" for k, v in result.items()} + prediction_lens = [len(pred) for pred in preds] + gen_num = len(preds) + + result = { + **result, + "gen_len": f"{np.sum(prediction_lens)}", + "gen_num": gen_num, + "gen_tok_len": gen_tok_len, + "tokens_per_sample": round(gen_tok_len / gen_num, 1), + } + + if args.estimate_performance: + import sys + import os + from utils import estimate_performance + tokens_per_second = estimate_performance(args.mlperf_log_detail, result) + result["estimated_performance"] = tokens_per_second + + print("\nResults\n") + print(result) + + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/main.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/main.py new file mode 100644 index 00000000000..597ad43765f --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/main.py @@ -0,0 +1,162 @@ +import subprocess # nosec B404 +import mlperf_loadgen as lg +import argparse +import os +import logging +import sys +import hashlib +import time +from SUT import SUT, SUTServer + +sys.path.insert(0, os.getcwd()) + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--scenario", type=str, choices=["Offline", "Server", "SingleStream"], default="Offline", help="Scenario") + parser.add_argument("--model-path", type=str, default="meta-llama/Llama-2-70b-chat-hf", help="Model name") + 
parser.add_argument("--workload-name", type=str, default="llama2-70b") + parser.add_argument("--dataset-path", type=str, default="/software/users/mlperf/datasets/open_orca_gpt4_tokenized_llama.sampled_24576.pkl", help="") # TODO: Reset default before submission + parser.add_argument("--accuracy", action="store_true", help="Run accuracy mode") + parser.add_argument("--dtype", type=str, default="bfloat16", help="data type of the model, choose from float16, bfloat16 and float32") + parser.add_argument("--device", type=str, default="hpu", help="device to use") + parser.add_argument("--audit-conf", type=str, default="audit.conf", help="audit config for LoadGen settings during compliance runs") + parser.add_argument("--mlperf-conf", type=str, default="mlperf.conf", help="mlperf rules config") + parser.add_argument("--user-conf", type=str, default="user.conf", help="user config for user LoadGen settings such as target QPS") + parser.add_argument("--total-sample-count", type=int, default=24576, help="Number of samples to use in benchmark.") # TODO: This interpretation of 'total-sample-count' is a little misleading. Fix it + parser.add_argument("--output-log-dir", type=str, default="build/logs", help="Where logs are saved") + parser.add_argument("--enable-log-trace", action="store_true", help="Enable log tracing. This file can become quite large") + parser.add_argument("--tensor-parallel", type=int, default=1, help="Tensor parallel size") + parser.add_argument("--pipeline-parallel", type=int, default=1, help="Pipeline parallel size") + parser.add_argument("--num-workers", type=int, default=1, help="Number of workers to process queries") + parser.add_argument("--batch-size", type=int, default=1) + parser.add_argument("--quantized", action='store_true', help="If using a AWQ quantized model") + parser.add_argument("--warmup", action='store_true', help="Do warmup") + + args = parser.parse_args() + + print("### Benchmark parameters") + params_list = [action.dest for action in parser._actions] + for param in params_list: + try: + value = getattr(args, param) + print(f"{param} = {value}") + except AttributeError: + continue + # Define grep patterns for environment and package info + env_patterns = ['VLLM', 'PT_', 'HL_', 'MAX_'] + pip_patterns = ['vllm', 'neural', 'loadgen', 'intel', 'triton', 'ccl'] + + # Get environment variables + for pattern in env_patterns: + try: + # Using standard system command - trusted input + result = subprocess.run(['env'], capture_output=True, text=True, check=True) # nosec B607 B603 + filtered_output = '\n'.join([line for line in result.stdout.split('\n') if pattern in line]) + if filtered_output: + print(filtered_output) + except Exception: + # Environment variable lookup failed, continue + continue + + # Get pip package info + for pattern in pip_patterns: + try: + # Using standard system command - trusted input + result = subprocess.run(['pip', 'list'], capture_output=True, text=True, check=True) # nosec B607 B603 + filtered_output = '\n'.join([line for line in result.stdout.split('\n') if pattern in line]) + if filtered_output: + print(filtered_output) + except Exception: + # Package listing failed, continue + continue + + with open('user.conf', 'r') as file: + for line in file: + if line.strip(): + print(line, end='') + print("########################") + return args + +scenario_map = { + "offline": lg.TestScenario.Offline, + "server": lg.TestScenario.Server, + "singlestream": lg.TestScenario.SingleStream + } + +def main(): + args = get_args() + + 
logging.basicConfig(level=logging.INFO) + log = logging.getLogger(args.workload_name + "-MAIN") + + sut_map = { + "offline": SUT, + "server": SUTServer, + "singlestream": SUTServer + } + + settings = lg.TestSettings() + settings.scenario = scenario_map[args.scenario.lower()] + # Need to update the conf + settings.FromConfig(args.user_conf, args.workload_name, args.scenario) + + if args.accuracy: + settings.mode = lg.TestMode.AccuracyOnly + log.warning("Accuracy run will generate the accuracy logs, but the evaluation of the log is not completed yet") + else: + settings.mode = lg.TestMode.PerformanceOnly + + partial_output_dir_list = [] + cur_output_log_dir = args.output_log_dir + + os.makedirs(cur_output_log_dir, exist_ok=True) + log_output_settings = lg.LogOutputSettings() + log_output_settings.outdir = cur_output_log_dir + log_output_settings.copy_summary_to_stdout = True + log_settings = lg.LogSettings() + log_settings.log_output = log_output_settings + log_settings.enable_trace = args.enable_log_trace + + sut_cls = sut_map[args.scenario.lower()] + + sut = sut_cls( + model_path=args.model_path, + workload_name=args.workload_name, + lg_settings=settings, + dtype=args.dtype, + dataset_path=args.dataset_path, + total_sample_count=args.total_sample_count, + device=args.device, + workers=args.num_workers, + tp=args.tensor_parallel, + pp=args.pipeline_parallel, + batch_size=args.batch_size, + quantized=args.quantized, + warmup=args.warmup, + partial_output_dir_list=partial_output_dir_list + ) + # import pdb;pdb.set_trace() + + # Start sut before loadgen starts + sut.start() + lgSUT = lg.ConstructSUT(sut.issue_queries, sut.flush_queries) + log.info("Starting Benchmark run") + t_start = time.time() + lg.StartTestWithLogSettings(lgSUT, sut.qsl, settings, log_settings, args.audit_conf) + t_end = time.time() + log.info("Test took {:.1f} sec".format(t_end - t_start)) + + # Stop sut after completion + sut.stop() + + log.info("Run Completed!") + + log.info("Destroying SUT...") + lg.DestroySUT(lgSUT) + + log.info("Destroying QSL...") + lg.DestroyQSL(sut.qsl) + + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/run_quantization.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/run_quantization.py new file mode 100644 index 00000000000..9e25237f05f --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/run_quantization.py @@ -0,0 +1,116 @@ +from argparse import ArgumentParser +import transformers +from transformers import AutoModelForCausalLM, AutoTokenizer +#from auto_round import AutoRound +from dataset import Dataset +import torch +torch.use_deterministic_algorithms(True, warn_only=True) + +#========================= +import os +import subprocess +import pandas as pd + +os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" + +from transformers import set_seed + +import re + +os.environ["TOKENIZERS_PARALLELISM"] = "false" +#========================= + +def get_args(): + parser = ArgumentParser(description="AutoRound quantization script") + parser.add_argument("--model_name", type=str, required=True, help="Name of the model to quantize") + parser.add_argument("--dataset-path", type=str, required=True, help="Path to the dataset for quantization") + parser.add_argument("--bits", type=int, default=4, help="Number of bits for quantization") + parser.add_argument("--group_size", type=int, default=128, help="Group size for quantization") + 
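+    # Note: group_size=-1 typically selects per-channel quantization; positive values quantize weights in groups of that size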
parser.add_argument("--sym", action='store_true', help="Use symmetric quantization") + parser.add_argument("--nsamples", type=int, default=512, help="Number of samples for quantization") + parser.add_argument("--iters", type=int, default=1000, help="Number of iterations for quantization") + parser.add_argument("--output_dir", type=str, default="./tmp_autoround", help="Directory to save the quantized model") + parser.add_argument("--quant_lm_head", action='store_true', help="Quantize the language model head") + parser.add_argument("--act_bits", type=int, default=8, help="Number of bits for activation quantization") + parser.add_argument("--device", type=str, default="xpu", help="Device to run the quantization on") + return parser.parse_args() + +def main(): + args = get_args() + model_name = args.model_name + dataset_path = args.dataset_path + nsamples = args.nsamples + iters = args.iters + bits = args.bits + group_size = args.group_size + sym = args.sym + output_dir = args.output_dir + + # Load dataset + calib_dataset = Dataset(model_name=model_name, dataset_path=dataset_path, total_sample_count=nsamples) + tokenizer = calib_dataset.tokenizer + calib_dataset = [torch.tensor(input).reshape(1, -1) for input in calib_dataset.input_ids[:nsamples]] + + # Load model + model = AutoModelForCausalLM.from_pretrained(model_name, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + trust_remote_code=True,) + + model = model.eval() # Set model to evaluation mode + model = model.to(torch.float16) # Convert model to float16 for quantization + #model.seqlen = 3072 + + layer_config = {} + for n, m in model.named_modules(): + if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D): + if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: + layer_config[n] = {"bits": 32} + print( + f"{n} will not be quantized due to its shape not being divisible by 32, resulting in an exporting issue to autogptq") + lm_head_layer_name = "lm_head" + for n, _ in model.named_modules(): + lm_head_layer_name = n + if args.quant_lm_head: + from transformers import AutoConfig + + config = AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code) + if config.tie_word_embeddings and hasattr(model, "_tied_weights_keys"): + tied_keys = model._tied_weights_keys + for item in tied_keys: + if lm_head_layer_name in item: ##TODO extend to encoder-decoder layer, seq classification model + args.quant_lm_head = False + print( + f"warning, disable quant_lm_head as quantizing lm_head with tied weights has not been " + f"supported currently") + break + if args.quant_lm_head: + layer_config[lm_head_layer_name] = {"bits": args.bits} + transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]] + if transformers_version[0] == 4 and transformers_version[1] < 38: + error_message = "Please upgrade transformers>=4.38.0 to support lm-head quantization." 
+ raise EnvironmentError(error_message) + + from auto_round import AutoRound + + qmodel = AutoRound(model=model, + tokenizer=tokenizer, + bits=bits, + group_size=group_size, + sym=sym, + dataset=calib_dataset, + seqlen=256, + iters=iters, + device=args.device, + gradient_accumulate_steps=1, + batch_size=1, + layer_config=layer_config, + model_dtype="float16" + ) + + export_dir = output_dir + "/" + model_name.split('/')[-1] + f"-autoround-w{bits}g{group_size}-iters{iters}-{args.device}" + + qmodel.quantize_and_save(export_dir, format='auto_awq') + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/user.conf b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/user.conf new file mode 100644 index 00000000000..98958f6e0d7 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/user.conf @@ -0,0 +1,10 @@ +llama2-70b.Offline.min_query_count = 24576 +llama2-70b.Server.min_query_count = 24576 +llama2-70b.Server.target_qps = .90 + +llama3_1-8b.Server.target_qps = 44.0 +llama3_1-8b.Offline.target_qps = 50.0 + +llama3_1-8b-edge.Offline.target_qps = 12.50 +llama3_1-8b-edge.SingleStream.target_latency = 3500 + diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/utils.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/utils.py new file mode 100644 index 00000000000..2204c179be2 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/utils.py @@ -0,0 +1,71 @@ +import subprocess # nosec B404 +import json +import os +import io + +def _make_r_io_base(f, mode: str): + if not isinstance(f, io.IOBase): + f = open(f, mode=mode) + return f + +def jload(f, mode="r"): + """Load a .json file into a dictionary.""" + f = _make_r_io_base(f, mode) + jdict = json.load(f) + f.close() + return jdict + +def get_start_cores(): + # Use safer subprocess call without shell=True - single process approach + try: + # Using standard system command - trusted input + result = subprocess.run(['lscpu'], capture_output=True, text=True, check=True) # nosec B607 B603 + start_cores = [] + + # Process each line to find NUMA node CPU information + for line in result.stdout.split('\n'): + if 'NUMA node' in line and 'CPU' in line: + # Extract the CPU range (e.g., "0-15" from "NUMA node0 CPU(s): 0-15") + parts = line.split() + if len(parts) >= 3: + cpu_range = parts[-1] # Last part should be the CPU range + # Get the start core (before the '-') + if '-' in cpu_range: + start_core = int(cpu_range.split('-')[0]) + start_cores.append(start_core) + + return start_cores if start_cores else [0] + except Exception: + # Fallback to basic approach + return [0] + +def get_total_runtime(log_detail_file): + import re + from datetime import datetime + + with open(log_detail_file) as fid: + text = fid.read() + + bpat = '"power_begin", "value": "(.*?)",' + epat = '"power_end", "value": "(.*?)",' + + pbegin = re.search(bpat, text).group(1) + pend = re.search(epat, text).group(1) + + print(f"Test started at {pbegin}") + print(f"Test ended at {pend}") + + start_time = datetime.strptime(pbegin, "%m-%d-%Y %H:%M:%S.%f") + end_time = datetime.strptime(pend, "%m-%d-%Y %H:%M:%S.%f") + time_difference = end_time - start_time + total_time_in_seconds = time_difference.total_seconds() + print(f"Total time taken is {total_time_in_seconds}") + return total_time_in_seconds + +def estimate_performance(log_detail_file, 
acc_result_dict): + total_time_in_seconds = get_total_runtime(log_detail_file) + tokens_per_sample = acc_result_dict["tokens_per_sample"] + num_samples = acc_result_dict["gen_num"] + samples_per_second = round(num_samples / total_time_in_seconds, 3) + tokens_per_second = round(tokens_per_sample * samples_per_second, 2) + return tokens_per_second \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/utils_model.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/utils_model.py new file mode 100644 index 00000000000..1185f3ae19e --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/utils_model.py @@ -0,0 +1,100 @@ +import os +from vllm import SamplingParams +import numpy as np +import time +import types + +# Disable prints if progress bar is active +def void(*args, **kwargs): + pass + + +PBAR = int(os.environ.get("PBAR", "1")) +# Update frequency of the progress bar +PBAR_FREQ = int(os.environ.get("PBAR_FREQ", "100")) +XPU_COUNT = int(os.environ.get("XPU_COUNT", "0")) +WORKLOAD_NAME= os.environ.get("WORKLOAD_NAME", "llama2-70b") +GPU_MEMORY_UTILIZATION= float(os.environ.get("GPU_MEMORY_UTILIZATION", "0.95")) +if PBAR==1: + print = void + +#***********************Values to be imported into SUT.py + +SAMPLING_PARAMS = SamplingParams( + max_tokens=1024 if WORKLOAD_NAME=="llama2-70b" else 128, + min_tokens=1, + temperature=0.0, + detokenize=False +) + +ADDITIONAL_MODEL_KWARGS = { + "gpu_memory_utilization": GPU_MEMORY_UTILIZATION +} + +if XPU_COUNT>0: + ########################XPU-specific + #ADDITIONAL_MODEL_KWARGS["enforce_eager"] = True + pass +else: + ########################CPU-specific + + from numa import schedule, memory + import os + import math + import subprocess # nosec B404 + cores_per_inst = int(os.environ.get("CORES_PER_INST", "1")) + num_numa_nodes = int(os.environ.get("NUM_NUMA_NODES", "1")) + nodes_per_inst = int(os.environ["NUM_NUMA_NODES"])/int(os.environ["NUM_INSTS"]) + insts_per_node = int(os.environ["INSTS_PER_NODE"]) + # Only start workers in allowed NUMA nodes. 
Useful when node sharing + ALLOWED_NODES = os.environ.get("ALLOWED_NODES", "all") + + def get_start_cores(): + # Use safer subprocess call without shell=True - single process approach + try: + # Using standard system command - trusted input + result = subprocess.run(['lscpu'], capture_output=True, text=True, check=True) # nosec B607 B603 + start_cores = [] + + # Process each line to find NUMA node CPU information + for line in result.stdout.split('\n'): + if 'NUMA node' in line and 'CPU' in line: + # Extract the CPU range (e.g., "0-15" from "NUMA node0 CPU(s): 0-15") + parts = line.split() + if len(parts) >= 3: + cpu_range = parts[-1] # Last part should be the CPU range + # Get the start core (before the '-') + if '-' in cpu_range: + start_core = int(cpu_range.split('-')[0]) + start_cores.append(start_core) + + return start_cores if start_cores else [0] + except Exception: + # Fallback to basic approach + return [0] + + def SUT_OVERRIDE(self): + node_start_cores = get_start_cores() + core_lists = [] + if insts_per_node>0: + for i in range(num_numa_nodes): + for j in range(insts_per_node): + core_lists.append(tuple(range(node_start_cores[i]+j*cores_per_inst, node_start_cores[i]+(j+1)*cores_per_inst))) + + node_list = list(range(len(node_start_cores))) + allowed_nodes = [str(node) for node in node_list] + if ALLOWED_NODES!="all": + allowed_nodes = ALLOWED_NODES.split(",") + self.individual_worker_kwargs["core_list"] = core_lists + + instance_node_list = [] + for j in range(self.num_workers): + cur_node = math.floor(j*nodes_per_inst) + instance_node_list.append(tuple([cur_node])) + self.individual_worker_kwargs["node_list"] = instance_node_list + + + def INSTANCE_OVERRIDE(self): + print("self.node_list", self.node_list) + os.environ["VLLM_CPU_OMP_THREADS_BIND"]=f"{self.core_list[0]}-{self.core_list[-1]}" + memory.set_membind_nodes(*self.node_list) From 4555df1398cc1f12f7415ed00a6e1342b82334ec Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Nov 2025 05:54:09 +0000 Subject: [PATCH 3/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../language-modeling/mlperf/llama2-70b/SUT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py index 8af65d15d99..428788afa6f 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py @@ -708,7 +708,7 @@ def start(self): # Create worker threads for j in range(self.num_workers): - # Build kwargs for inidividual workers + # Build kwargs for individual workers private_worker_kwargs = {key_i: self.individual_worker_kwargs[key_i][j] for key_i in self.individual_worker_kwargs} private_worker_kwargs = private_worker_kwargs | self.shared_worker_kwargs From 1f5178557e58c3261cc58cd70c0f71a547e0e6bd Mon Sep 17 00:00:00 2001 From: lkk Date: Tue, 18 Nov 2025 06:29:48 +0000 Subject: [PATCH 4/6] update doc. 
--- .../mlperf/llama2-70b/README.md | 79 +++++++++++++++++++ .../mlperf/llama2-70b/SUT.py | 2 +- .../mlperf/llama2-70b/quantize_70b.sh | 4 +- .../mlperf/llama2-70b/quantize_autoround.py | 7 +- .../mlperf/llama2-70b/simple_autoround.py | 5 +- 5 files changed, 87 insertions(+), 10 deletions(-) create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md new file mode 100644 index 00000000000..4723276c1e4 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md @@ -0,0 +1,79 @@ +# Llama2 + +## Getting started + +Please first download the data, model and preprocess the data folloing the steps below _within the mlperf container_. Note that if you have already downloaded the model and data prior to v4.1, you don't need to redo them. But you _need to re-run_ the preprocess_data step for the updated calibration data. + +### Download Model + +Please download model files by following the mlcommons README.md with instructions: + +```bash +# following steps: https://github.com/mlcommons/inference/blob/master/language/llama2-70b/README.md#get-dataset +``` + +### Download and Prepare Data + +Please download data files by following the mlcommons README.md with instructions. +Please move the downloaded pickle into expected path and follow steps to run the required data pre-processing: + +```bash +# follow: https://github.com/mlcommons/inference/blob/master/language/llama2-70b/README.md#get-dataset +# to download file: open_orca_gpt4_tokenized_llama.sampled_24576.pkl.gz, open_orca_gpt4_tokenized_llama.calibration_1000.pkl.gz + +# unzip files +gzip -dk open_orca_gpt4_tokenized_llama.sampled_24576.pkl.gz +gzip -dk open_orca_gpt4_tokenized_llama.calibration_1000.pkl.gz + +# make sure you are in mlperf's container +make prebuild + +# move into right directory +mv open_orca_gpt4_tokenized_llama.*.pkl build/data/llama2-70b/ + +``` + +Make sure after the 2 steps above, you have: + +1. model downloaded at: `build/models/Llama2/Llama-2-70b-chat-hf/`, +2. preprocessed data at `build/preprocessed_data/llama2-70b/`: + +- `build/preprocessed_data/llama2-70b/open_orca_gpt4_tokenized_llama.sampled_24576.pkl` +- `build/preprocessed_data/llama2-70b/open_orca_gpt4_tokenized_llama.calibration_1000.pkl` + +## Build and run the benchmarks + +Please follow the steps below in MLPerf container. 
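+
+Before quantizing, you can optionally sanity-check that the core dependencies are importable once the environment below has been prepared. This assumes `setup.sh` (or your own environment) provides the `vllm`, `auto_round` and `mlperf_loadgen` modules used by the scripts in this directory:
+
+```bash
+python -c "import vllm, auto_round, mlperf_loadgen; print('env ok, vllm', vllm.__version__)"
+```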
+ +### Prepare environments + +```bash +# make sure you are in mlperf's container +bash setup.sh + +``` + +### Quantize model + +Use [Intel/auto-round](https://github.com/intel/auto-round) to make the quantization model + +```bash +# quantize model +bash quantize_70b.sh + +# you can also use the default quantization config of autoround +# run the `python simple_autoround.py` directly +``` + +### Benchmark + +```bash +# inference + +VLLM_USE_STANDALONE_COMPILE=0 VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1,2 python main.py --mlperf-conf mlperf.conf --model-path build/models/Llama2/Llama-2-70b-chat-hf-quantized --dataset-path build/preprocessed_data/llama2-70b/open_orca_gpt4_tokenized_llama.sampled_24576.pkl --tensor-parallel 2 --warmup --user-conf user.conf --accuracy --batch-size 128 + +# evaluation +CUDA_VISIBLE_DEVICES=2 python evaluation_scripts/evaluate-accuracy.py --checkpoint-path build/models/Llama2/Llama-2-70b-chat-hf-quantized --mlperf-accuracy-file build/logs/mlperf_log_accuracy.json --dataset-file build/preprocessed_data/llama2-70b/open_orca_gpt4_tokenized_llama.sampled_24576.pkl --dtype int64 + +``` + diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py index 428788afa6f..8af65d15d99 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py @@ -708,7 +708,7 @@ def start(self): # Create worker threads for j in range(self.num_workers): - # Build kwargs for individual workers + # Build kwargs for inidividual workers private_worker_kwargs = {key_i: self.individual_worker_kwargs[key_i][j] for key_i in self.individual_worker_kwargs} private_worker_kwargs = private_worker_kwargs | self.shared_worker_kwargs diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_70b.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_70b.sh index 20afd8b22d3..5bcec92be54 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_70b.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_70b.sh @@ -1,5 +1,5 @@ -CHECKPOINT_PATH=meta-llama/Llama-2-70b-chat-hf -CALIBRATION_DATA_PATH=./open_orca/open_orca_gpt4_tokenized_llama.calibration_1000.pkl +CHECKPOINT_PATH=build/models/Llama2/Llama-2-70b-chat-hf/ +CALIBRATION_DATA_PATH=build/preprocessed_data/llama2-70b/open_orca_gpt4_tokenized_llama.calibration_1000.pkl NUM_GROUPS=-1 NUM_SAMPLES=1000 ITERS=200 diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_autoround.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_autoround.py index 93a0483438f..cb5639bb163 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_autoround.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_autoround.py @@ -95,11 +95,10 @@ def main(): orig_path = args.model_name packing_format="llm_compressor" if orig_path.endswith("/"): - output_dir=orig_path[:-1]+f"-{packing_format}-w{args.bits}g{args.group_size}-iters{args.iters}" + output_dir=orig_path[:-1]+f"-quantized" else: - output_dir=orig_path+f"-{packing_format}-w{args.bits}g{args.group_size}-iters{args.iters}" - if args.fp8_kv: - output_dir += 
"-fp8kv" + output_dir=orig_path+f"-quantized" + autoround.quantize_and_save(output_dir, format=f'{packing_format}') diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/simple_autoround.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/simple_autoround.py index 0a4e895a946..3bc624b1cae 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/simple_autoround.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/simple_autoround.py @@ -2,9 +2,8 @@ from auto_round import AutoRound # Load a model (supports FP8/BF16/FP16/FP32) -model_name_or_path = "meta-llama/Llama-2-70b-chat-hf" -#output_dir = "Llama-2-70b-chat-hf-mxfp4-fp8kv" -output_dir = "Llama-2-70b-chat-hf-mxfp4" +model_name_or_path = "build/models/Llama2/Llama-2-70b-chat-hf/" +output_dir = "build/models/Llama2/Llama-2-70b-chat-hf-quantized" #ar = AutoRound(model_name_or_path, scheme="MXFP4", iters=200, static_kv_dtype="fp8",) ar = AutoRound(model_name_or_path, scheme="MXFP4", iters=200) From 70b8cfc763d1719fd48cc2f2514d553686abbeec Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Nov 2025 06:31:36 +0000 Subject: [PATCH 5/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../language-modeling/mlperf/llama2-70b/README.md | 2 +- .../language-modeling/mlperf/llama2-70b/SUT.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md index 4723276c1e4..09358bee2e1 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md @@ -2,7 +2,7 @@ ## Getting started -Please first download the data, model and preprocess the data folloing the steps below _within the mlperf container_. Note that if you have already downloaded the model and data prior to v4.1, you don't need to redo them. But you _need to re-run_ the preprocess_data step for the updated calibration data. +Please first download the data, model and preprocess the data following the steps below _within the mlperf container_. Note that if you have already downloaded the model and data prior to v4.1, you don't need to redo them. But you _need to re-run_ the preprocess_data step for the updated calibration data. 
### Download Model diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py index 8af65d15d99..428788afa6f 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py @@ -708,7 +708,7 @@ def start(self): # Create worker threads for j in range(self.num_workers): - # Build kwargs for inidividual workers + # Build kwargs for individual workers private_worker_kwargs = {key_i: self.individual_worker_kwargs[key_i][j] for key_i in self.individual_worker_kwargs} private_worker_kwargs = private_worker_kwargs | self.shared_worker_kwargs From 4284c70310f5252efa01cc5d7f6a19f6a6736ce3 Mon Sep 17 00:00:00 2001 From: lkk <33276950+lkk12014402@users.noreply.github.com> Date: Tue, 18 Nov 2025 14:32:46 +0800 Subject: [PATCH 6/6] fix typo --- .../language-modeling/mlperf/llama2-70b/README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md index 09358bee2e1..15d5a8e65b2 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md @@ -25,9 +25,6 @@ Please move the downloaded pickle into expected path and follow steps to run the gzip -dk open_orca_gpt4_tokenized_llama.sampled_24576.pkl.gz gzip -dk open_orca_gpt4_tokenized_llama.calibration_1000.pkl.gz -# make sure you are in mlperf's container -make prebuild - # move into right directory mv open_orca_gpt4_tokenized_llama.*.pkl build/data/llama2-70b/