From 59fdcffa3c4c49b32ff62c1927d469f0435a6d9b Mon Sep 17 00:00:00 2001
From: lkk
Date: Tue, 18 Nov 2025 03:33:55 +0000
Subject: [PATCH 1/6] setup env and quantize model.

---
 .../mlperf/llama2-70b/quantize_70b.sh         |  24 ++++
 .../mlperf/llama2-70b/quantize_autoround.py   | 107 ++++++++++++++++++
 .../mlperf/llama2-70b/requirements.txt        |   4 +
 .../mlperf/llama2-70b/setup.sh                |  18 +++
 .../mlperf/llama2-70b/simple_autoround.py     |  23 ++++
 5 files changed, 176 insertions(+)
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_70b.sh
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_autoround.py
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/requirements.txt
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/setup.sh
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/simple_autoround.py

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_70b.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_70b.sh
new file mode 100644
index 00000000000..20afd8b22d3
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_70b.sh
@@ -0,0 +1,24 @@
+CHECKPOINT_PATH=meta-llama/Llama-2-70b-chat-hf
+CALIBRATION_DATA_PATH=./open_orca/open_orca_gpt4_tokenized_llama.calibration_1000.pkl
+NUM_GROUPS=-1
+NUM_SAMPLES=1000
+ITERS=200
+BATCH_SIZE=1
+NUM_CORES=$(($(lscpu | grep "Socket(s):" | awk '{print $2}') * $(lscpu | grep "Core(s) per socket:" | awk '{print $4}')))
+END_CORE=$(($NUM_CORES - 1))
+
+run_cmd="python -u"
+if (( XPU_COUNT < 1 )); then  # XPU_COUNT is read from the environment; unset or 0 selects the CPU path
+    export OMP_NUM_THREADS=$NUM_CORES
+    run_cmd="numactl -C 0-$END_CORE python -u"
+fi
+
+$run_cmd quantize_autoround.py \
+    --model_name ${CHECKPOINT_PATH} \
+    --dataset ${CALIBRATION_DATA_PATH} \
+    --group_size ${NUM_GROUPS} \
+    --bits 4 \
+    --iters ${ITERS} \
+    --batch_size ${BATCH_SIZE} \
+    --device auto \
+    --lr 2.5e-3 2>&1 | tee autoround_log_${NUM_GROUPS}g_${NUM_SAMPLES}nsamples_${ITERS}iters.log
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_autoround.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_autoround.py
new file mode 100644
index 00000000000..93a0483438f
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_autoround.py
@@ -0,0 +1,107 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+from auto_round import AutoRound
+from datasets import Dataset
+import os
+import pandas as pd
+import argparse
+import json
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_name", default="facebook/opt-125m"
+    )
+    parser.add_argument(
+        "--dataset", type=str, required=True
+    )
+    parser.add_argument("--bits", default=4, type=int,
+                        help="number of bits")
+    parser.add_argument("--group_size", default=128, type=int,
+                        help="group size")
+    parser.add_argument("--sym", default=False, action='store_true',
+                        help="symmetric quantization")
+    parser.add_argument("--iters", default=200, type=int)
+    parser.add_argument("--batch_size", default=1, type=int)
+    parser.add_argument("--lr", default=None, type=float,
+                        help="learning rate; if None, it is set to 1.0/iters automatically")
+    parser.add_argument("--minmax_lr", default=None, type=float,
+                        help="minmax learning rate; if None, it is set to the same value as lr")
+    parser.add_argument("--device", default='fake', type=str,
+                        help="targeted inference acceleration platform. The options are 'fake', 'cpu', 'gpu' and 'xpu'. "
+                             "Defaults to 'fake', meaning only fake quantization is performed and the model is not exported for any device.")
+    parser.add_argument("--fp8_kv", default=False, action='store_true',
+                        help="use an fp8 KV cache")
+
+    args = parser.parse_args()
+    return args
+
+def main():
+    args = get_args()
+    model = AutoModelForCausalLM.from_pretrained(args.model_name, torch_dtype="auto")
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    # Note: Using pickle with trusted dataset files only
+    # In production, consider using safer serialization formats like JSON or HDF5
+    dataframe = pd.read_pickle(args.dataset)  # nosec B301
+    dataframe_str = dataframe["input"]
+    print(len(dataframe_str))
+    dataset_list = []
+    token_list = []
+    for data_str in dataframe_str:
+        # print(data_str)
+        data_token = tokenizer.encode(
+            data_str,
+            max_length=1024,
+            truncation=True, add_special_tokens=False)
+        token_list.append(data_token)
+        data_str = tokenizer.decode(data_token)
+        dataset_list.append(data_str)
+
+    ds = Dataset.from_dict({"input_ids": token_list})  # note: unused below; the decoded strings in dataset_list are passed to AutoRound
+    recipe = """
+    quant_stage:
+        quant_modifiers:
+            QuantizationModifier:
+                kv_cache_scheme:
+                    num_bits: 8
+                    type: int
+                    strategy: tensor
+                    dynamic: false
+                    symmetric: true
+    """
+    NUM_CALIBRATION_SAMPLES = 512
+    MAX_SEQUENCE_LENGTH = 1024
+
+    autoround = AutoRound(
+        model,
+        tokenizer,
+        scheme="MXFP4",
+        iters=args.iters,
+        lr=args.lr,
+        minmax_lr=args.minmax_lr,
+        nsamples=len(dataframe_str),
+        seqlen=MAX_SEQUENCE_LENGTH,
+        batch_size=args.batch_size,
+        dataset=dataset_list,
+        enable_torch_compile=False,
+        amp=False,
+        device_map="0",
+        static_kv_dtype="fp8" if args.fp8_kv else None,
+        # device_map="0,1",
+        #device=args.device
+    )
+
+    orig_path = args.model_name
+    packing_format = "llm_compressor"
+    if orig_path.endswith("/"):
+        output_dir = orig_path[:-1] + f"-{packing_format}-w{args.bits}g{args.group_size}-iters{args.iters}"
+    else:
+        output_dir = orig_path + f"-{packing_format}-w{args.bits}g{args.group_size}-iters{args.iters}"
+    if args.fp8_kv:
+        output_dir += "-fp8kv"
+    autoround.quantize_and_save(output_dir, format=f'{packing_format}')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/requirements.txt
new file mode 100644
index 00000000000..bf2e5761647
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/requirements.txt
@@ -0,0 +1,4 @@
+lm-eval==0.4.9.1
+setuptools_scm
+torchao==0.12.0
+triton==3.3.1
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/setup.sh
new file mode 100644
index 00000000000..5966ef95ad7
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/setup.sh
@@ -0,0 +1,18 @@
+pip install -r requirements.txt
+
+mkdir -p /workspace/build_envs
+
+cd /workspace/build_envs
+
+pip install setuptools --upgrade
+pip install packaging --upgrade
+pip install -U "huggingface_hub[cli]"
+git clone -b mxfp4 https://github.com/mengniwang95/vllm-fork.git
+cd vllm-fork
+VLLM_USE_PRECOMPILED=1 pip install . -vvv --no-build-isolation
+cd ..
+
+git clone https://github.com/mlcommons/inference.git mlperf-inference && cd mlperf-inference && cd loadgen && \
+python3 -m pip install .
+
+pip install auto-round==0.9.0
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/simple_autoround.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/simple_autoround.py
new file mode 100644
index 00000000000..0a4e895a946
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/simple_autoround.py
@@ -0,0 +1,23 @@
+
+from auto_round import AutoRound
+
+# Load a model (supports FP8/BF16/FP16/FP32)
+model_name_or_path = "meta-llama/Llama-2-70b-chat-hf"
+#output_dir = "Llama-2-70b-chat-hf-mxfp4-fp8kv"
+output_dir = "Llama-2-70b-chat-hf-mxfp4"
+
+#ar = AutoRound(model_name_or_path, scheme="MXFP4", iters=200, static_kv_dtype="fp8",)
+ar = AutoRound(model_name_or_path, scheme="MXFP4", iters=200)
+
+# Highest accuracy (4–5× slower).
+# `low_gpu_mem_usage=True` saves ~20GB VRAM but runs ~30% slower.
+# ar = AutoRound(model_name_or_path, nsamples=512, iters=1000, low_gpu_mem_usage=True)
+
+# Faster quantization (2–3× speedup) with slight accuracy drop at W4G128.
+# ar = AutoRound(model_name_or_path, nsamples=128, iters=50, lr=5e-3)
+
+# Supported formats: "auto_round" (default), "auto_gptq", "auto_awq", "llm_compressor", "gguf:q4_k_m", etc.
+# ar.quantize_and_save(output_dir="./tmp_autoround", format="llm_compressor")
+ar.quantize_and_save(output_dir=output_dir, format="llm_compressor")
+
+

From 24f55a18d288d770d0d9473a274dfc137fd48e33 Mon Sep 17 00:00:00 2001
From: lkk
Date: Tue, 18 Nov 2025 05:50:40 +0000
Subject: [PATCH 2/6] add inference and evaluate code.

---
 .../mlperf/llama2-70b/SUT.py                  | 960 ++++++++++++++++++
 .../mlperf/llama2-70b/dataset.py              | 148 +++
 .../evaluate-accuracy-cnn.py                  | 140 +++
 .../evaluation_scripts/evaluate-accuracy.py   | 140 +++
 .../mlperf/llama2-70b/main.py                 | 162 +++
 .../mlperf/llama2-70b/run_quantization.py     | 116 +++
 .../mlperf/llama2-70b/user.conf               |  10 +
 .../mlperf/llama2-70b/utils.py                |  71 ++
 .../mlperf/llama2-70b/utils_model.py          | 100 ++
 9 files changed, 1847 insertions(+)
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/dataset.py
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/evaluation_scripts/evaluate-accuracy-cnn.py
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/evaluation_scripts/evaluate-accuracy.py
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/main.py
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/run_quantization.py
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/user.conf
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/utils.py
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/utils_model.py

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py
new file mode 100644
index 00000000000..8af65d15d99
--- /dev/null
+++ 
b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py @@ -0,0 +1,960 @@ +import os +import time +import numpy as np +import math +import array +import torch +from typing import Tuple +from transformers import AutoTokenizer, AutoConfig, AutoModel + +import time +import threading +import torch.multiprocessing as mp +import gc +import types + +import logging + +import mlperf_loadgen as lg +from dataset import Dataset + +import os +from tqdm import tqdm +import sys + +from vllm import LLM +from vllm.inputs import TokensPrompt +from vllm.config import CompilationConfig, CompilationLevel + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("SUT") + +#MAX_NUM_BATCHED_TOKENS=int(os.environ.get("MAX_NUM_BATCHED_TOKENS", 32768)) +MAX_NUM_BATCHED_TOKENS=4096 +MAX_MODEL_LEN=int(os.environ.get("MAX_MODEL_LEN", MAX_NUM_BATCHED_TOKENS)) +PREFILL_BLOCK_SIZE = int(os.environ.get("VLLM_PROMPT_SEQ_BUCKET_STEP", "128")) +PREFILL_MIN_SIZE = int(os.environ.get("VLLM_PROMPT_SEQ_BUCKET_MIN", "128")) +PREFILL_MAX_SIZE = int(os.environ.get("VLLM_PROMPT_SEQ_BUCKET_MAX", "1024")) +SERVER_TIME_LIMIT = float(os.environ.get("SERVER_TIME_LIMIT", "2")) +SERVER_COMPUTE_TIME = float(os.environ.get("SERVER_COMPUTE_TIME", "1")) +BATCHED_PREFILL = os.environ.get("BATCHED_PREFILL", "1")=="1" +MAX_BATCHED_PROMPT_LEN = int(os.environ.get('VLLM_MAX_BATCHED_PROMPT_LEN', sys.maxsize)) +ENABLE_TORCH_COMPILE = int(os.environ.get("ENABLE_TORCH_COMPILE", "0"))==1 + +MODEL_NAME = os.environ.get("MODEL_NAME", "").lower() + +MODEL_INIT_DIR = os.environ.get("MODEL_INIT_DIR", "") +XPU_COUNT = int(os.environ.get("XPU_COUNT", "0")) + +BLOCK_SIZE = 8 +sys.path.append(MODEL_INIT_DIR) + +from utils_model import * + +# Copied from vllm/worker/hpu_model_runner.py +def warmup_range(config: Tuple[int, int, int]): + """Generate a warmup range. + + Start from bmin and multiply by 2 until you reach bstep. + Then, increase the values in the range by the value of bstep until you + reach bmax. + + Example: + bmin = 2, bstep = 32, bmax = 64 + => ramp_up = (2, 4, 8, 16) + => stable = (32, 64) + => return ramp_up + stable => (2, 4, 8, 16, 32, 64) + """ + import itertools + import operator + bmin, bstep, bmax = config + if bmin > bmax: + raise ValueError("Min. batch size cannot be greater than max. " + "batch size. 
If you want to skip warmup, " + "set VLLM_SKIP_WARMUP=true") + base = itertools.repeat(2) + ramp_up_acc = itertools.accumulate(base, func=operator.mul, initial=bmin) + ramp_up_tw = itertools.takewhile(lambda x: x < bstep and x <= bmax, \ + ramp_up_acc) + stable = range(bstep, bmax + 1, bstep) + buckets = list(ramp_up_tw) + list(stable) + return list(filter(lambda bucket: bucket >= bmin, buckets)) + +PREFILL_BUCKETS = warmup_range((PREFILL_MIN_SIZE, PREFILL_BLOCK_SIZE, PREFILL_MAX_SIZE)) + +def len_to_bucket(length): + bucket = 0 + while length>PREFILL_BUCKETS[bucket]: + bucket+=1 + return bucket + +def void(*args, **kwargs): + pass + +# Disable prints if progress bar is active +PBAR = int(os.environ.get("PBAR", "1")) +# Update frequency of the progress bar +PBAR_FREQ = int(os.environ.get("PBAR_FREQ", "10")) +if PBAR==1: + print = void + +class Instance(mp.Process): + def __init__(self,**kwargs): + super(Instance, self).__init__() + for key, value in kwargs.items(): + # Set each keyword argument as an attribute of the instance + setattr(self, key, value) + + if hasattr(self, "lg_settings"): + self.max_ttft_latency = self.lg_settings.ttft_latency/1e9 + self.max_tpot_latency = self.lg_settings.tpot_latency/1e9 + else: + self.max_ttft_latency = 2 + self.max_tpot_latency = 0.2 + self.query_idx_mapping = [] + self.qid_mapping = [] + self.start_time_mapping = [] + self.wait_time_mapping = [] + self.first_time_mapping = [] + self.first_token_id = [] + self.finished = False + + self.tpot_list = [] + self.tprefill_list = [] + self.nprefill_list = [] + + + # Extend/override Instance methods if it's defined + try: + self.instance_override = types.MethodType(INSTANCE_OVERRIDE, self) + except NameError: + pass + else: + self.instance_override() + + def do_warmup(self): + dummy_input = torch.zeros((128,), dtype=torch.int32) + inputs = TokensPrompt(prompt_token_ids=dummy_input.tolist()) + from vllm import SamplingParams + sampling_params = SamplingParams( + max_tokens=8, + min_tokens=8, + temperature=0.0, + detokenize=False + ) + # Using a large value for request_id to avoid collisions + for i in range(1): + self.model.llm_engine.add_request(str(int(1e8)), + inputs, + sampling_params) + + while self.model.llm_engine.has_unfinished_requests(): + step_outputs = self.model.llm_engine.step() + for output in step_outputs: + request_id = int(output.request_id) + if output.finished: + token_ids = output.outputs[0].token_ids + log.info(f"[{time.time():.3f}] Warmup finished, " + f"Rank {self.rank}, " + f"Generated {len(token_ids)} tokens") + + with self.cond_var: + self.alive_counter.value += 1 + self.cond_var.notify() + + def step_engine_prompt(self, *args): + self.step_engine(*args) + + def run(self): + gc.disable() + + # Select correct XPU device + if XPU_COUNT>0: + os.environ["ONEAPI_DEVICE_SELECTOR"]=f'level_zero:{self.xpu_devices}' + pass + + self.load_dataset() + + log.info("Loading Model") + self.load_model() + + self.sampling_params = SAMPLING_PARAMS + + self.warmup_req_idx = set([]) + if self.warmup: + self.do_warmup() + else: + with self.cond_var: + self.alive_counter.value += 1 + self.cond_var.notify() + + keep_alive = True + self.req_counter = 0 # Request counter + + # Running seq_len array + self.running_seq_len = np.zeros(self.batch_size) + # Map ind within a batch to request id + self.running_ind_to_id = np.zeros(self.batch_size) + + while keep_alive: + keep_alive = self.process_queries() + + # V1 shutdown engine + self.model.llm_engine.engine_core.shutdown() + + with self.cond_var: + 
self.alive_counter.value -= 1 + # The instance needs to get restarted + if self.alive_counter.value == 0: + pass + else: + self.input_queue.put(None) + self.cond_var.notify() + + def load_model(self): + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_inductor=True, + # compile_sizes=[64,128,256], + ) + + model_kwargs={ + "model": self.model_path, + #"dtype": 'float16', + "dtype": "auto", + "skip_tokenizer_init": False, + "tensor_parallel_size": self.tp, + "pipeline_parallel_size": self.pp, + "max_num_seqs": self.batch_size, + "max_model_len": MAX_MODEL_LEN, + # "enforce_eager": True, + #"max_num_batched_tokens": MAX_NUM_BATCHED_TOKENS, + #"enable_prefix_caching": False, + #"distributed_executor_backend": "spawn", + # "calculate_kv_scales": True, + } + + #if XPU_COUNT>0: + # model_kwargs["kv_cache_dtype"] = "int8" + #else: + # model_kwargs["kv_cache_dtype"] = "fp8_e5m2" + # model_kwargs["kv_cache_dtype"] = "fp8" + if ENABLE_TORCH_COMPILE: + model_kwargs["compilation_config"] = compilation_config + if self.quantized: + pass + + log.info(model_kwargs) + # Model-specific params + #model_kwargs = model_kwargs | ADDITIONAL_MODEL_KWARGS + + self.model = LLM(**model_kwargs) + + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path + ) + + self.tokenizer.pad_token = self.tokenizer.eos_token + + def load_dataset(self): + if "llama3_1-8b" in self.workload_name: + self.data_object = Dataset(model_name=self.model_path, + dataset_path=self.dataset_path, + total_sample_count=self.total_sample_count) + else: + self.data_object = Dataset(self.model_path, + dataset_path=self.dataset_path, + total_sample_count=self.total_sample_count, + ) + #device="cpu") + + def step_engine( + self, + pred_output_tokens, + query_ids, + qid, + pred_first_tokens, + query_ids_first, + qid_first, + ): + if self.model.llm_engine.has_unfinished_requests(): + if len(self.first_token_id)>0: + end = time.time() + step_outputs = self.model.llm_engine.step() + if len(self.first_token_id)>0: + step_time = time.time()-end + for output in step_outputs: + request_id = int(output.request_id) + + if request_id in self.warmup_req_idx: # Skip requests that are from the warmup + continue + + if len(self.first_token_id) > 0: + # Add step time to everything other than the one being prefilled + if not PBAR: + for i in range(len(self.tprefill_list)-len(self.first_token_id)): + self.tprefill_list[i] += step_time + self.nprefill_list[i] += len(self.first_token_id) + + if output.finished: + token_ids = output.outputs[0].token_ids + pred_output_tokens.append(token_ids) + query_ids.append(self.query_idx_mapping[request_id]) + qid.append(self.qid_mapping[request_id]) + + finish_time = time.time() + time_elapsed = finish_time - self.start_time_mapping[request_id] + ttft = self.first_time_mapping[request_id] + wait = self.wait_time_mapping[request_id] + # Rest of the tokens + tpot = (time_elapsed - ttft)/(len(token_ids)-1) + with self.cond_var: + if ttft+wait >= self.max_ttft_latency: + self.over_ttft_counter.value += 1 + if tpot >= self.max_tpot_latency: + self.over_tpot_counter.value += 1 + over_ttft_count = self.over_ttft_counter.value + over_tpot_count = self.over_tpot_counter.value + self.cond_var.notify() + + if not PBAR: + self.tpot_list.append(tpot) + tpot_sorted = np.sort(self.tpot_list) + tpot_mean = np.mean(self.tpot_list) + print(f"[{time.time():.3f}] Query {request_id} finished after {time_elapsed:.3f}s, " + f"Rank {self.rank}, " + f"TTFT {ttft:.3f}s ({over_ttft_count}), " + f"TPOT {tpot:.3f}s 
({over_tpot_count}), " + f"prefill time {self.tprefill_list[request_id]:.3f}s, " + f"prefill count {self.nprefill_list[request_id]}, " + f"wait time {wait:.3f}s, " + f"bs_c {self.model.llm_engine.get_num_unfinished_requests()}, " + f"p99 tpot {tpot_sorted[int(0.99*len(tpot_sorted))]:.3f}s, " + f"p50 tpot {tpot_mean:.3f}s, " + f"generated {len(token_ids)} tokens") + + if request_id in self.first_token_id: + self.first_time_mapping[request_id] = time.time()-self.start_time_mapping[request_id] + if self.server: + token_ids = output.outputs[0].token_ids + pred_first_tokens.append(token_ids) + query_ids_first.append(self.query_idx_mapping[request_id]) + qid_first.append(self.qid_mapping[request_id]) + self.first_token_id.remove(request_id) + + def fetch_queries(self): + # For chunked prefill + # samples_to_fill = self.batch_size + self.prefill_batch_size - self.model.llm_engine.get_num_unfinished_requests() + samples_to_fill = self.batch_size - self.model.llm_engine.get_num_unfinished_requests() + + add_new_qitem = samples_to_fill>=self.prefill_batch_size + return_value = True + qitem = -1 + + if self.finished: + add_new_qitem = False + if not self.model.llm_engine.has_unfinished_requests(): + print(f"Rank {self.rank} setting return_value to False") + return_value = False + + # Receive request if there is an empty decode slot + qitem_list = [] + added_new_queries = 0 + + # Ensure that after insertion, the running batch size doesn't exceed max batch size + while add_new_qitem and added_new_queries+self.prefill_batch_size<=samples_to_fill: + try: + # Continue decoding if nothing in queue + qitem = self.input_queue.get(False) + except Exception: + qitem = -1 + else: + pass + + if qitem is None: + self.finished = True + add_new_qitem = False + elif type(qitem) == int: + add_new_qitem = False + else: + qitem_list.append(qitem) + added_new_queries += len(qitem[0]) + # In case main thread cannot keep up + time.sleep(0.01) + return qitem_list, return_value + + def run_one_step(self, qitem_list): + pred_output_tokens = [] + query_ids = [] + qid = [] + pred_first_tokens = [] + query_ids_first = [] + qid_first = [] + step_engine_args = ( + pred_output_tokens, + query_ids, + qid, + pred_first_tokens, + query_ids_first, + qid_first + ) + + if len(qitem_list)>0: + + for qitem in qitem_list: + qitem, start_time = qitem + # Construct / collate batch + + input_ids_tensor = [] + # q = qitem[0] # All qitems are of length 1 + for i,q in enumerate(qitem): + sampling_params = self.sampling_params + if "llama3_1-8b" in self.workload_name: + input_ids_tensor = [self.data_object.input_ids[q.index]] + else: + input_ids_tensor = self.data_object.input_ids[q.index].tolist() + + self.first_token_id.append(len(self.query_idx_mapping)) + self.wait_time_mapping.append(time.time()-start_time[i]) + self.query_idx_mapping.append(q.index) + self.qid_mapping.append(q.id) + self.start_time_mapping.append(time.time()) + self.first_time_mapping.append(0) + self.tprefill_list.append(0) + self.nprefill_list.append(0) + + for prompt_id in input_ids_tensor: + if isinstance(prompt_id, torch.Tensor): + prompt_id = prompt_id.tolist() + inputs = TokensPrompt(prompt_token_ids=prompt_id)#.tolist()) + + self.model.llm_engine.add_request(str(self.req_counter), + inputs, + sampling_params) + + self.req_counter += 1 + + # Find the ind that this request can be inserted + batch_ind = np.argmax(self.running_ind_to_id==0) + self.running_seq_len[batch_ind] = self.data_object.input_lens[q.index]+1 + self.running_ind_to_id[batch_ind] = q.id + # 
print(f"adding {q.id}. Mapped to {batch_ind}") + + print(f"Step prompt, req_counter {self.req_counter}") + self.step_engine_prompt(*step_engine_args) + elif self.model.llm_engine.has_unfinished_requests(): + self.step_engine(*step_engine_args) + self.running_seq_len = self.running_seq_len + 1 + else: + # Avoid excessive empty steps + time.sleep(0.05) + return pred_output_tokens, query_ids, qid, pred_first_tokens, query_ids_first, qid_first + + def process_step_outputs(self, step_outputs): + ( + pred_output_tokens, + query_ids, + qid, + pred_first_tokens, + query_ids_first, + qid_first + ) = step_outputs + # Process first tokens + if len(pred_first_tokens)>0: + processed_first = [] + first_time = time.time() + for pred_first_token in pred_first_tokens: + processed_first.append(np.array(pred_first_token).reshape(-1)) + self.first_queue.put((qid_first, processed_first, first_time)) + + # Process finished requests + if len(pred_output_tokens)>0: + processed_output = self.data_object.postProcess(pred_output_tokens, + input_seq_lens=0, + query_id_list=query_ids) + + # Zero-out corresponding entries + for qid_c in qid: + ind_c = np.argmax(self.running_ind_to_id==qid_c) + # print(f"zeroing out {qid_c}. Mapped to {ind_c}") + self.running_ind_to_id[ind_c] = 0 + self.running_seq_len[ind_c] = 0 + + self.output_queue.put((qid,processed_output)) + + with self.cond_var: + self.sample_counter.value += len(query_ids) + print(f"Samples run: {self.sample_counter.value}, rank {self.rank}, current_count {self.current_counter.value}") + self.current_counter.value -= len(query_ids) + self.cond_var.notify() + + # Update number of blocks + ind_valid = self.running_ind_to_id > 0 + # Don't update when idle + if np.sum(ind_valid)>0: + with self.cond_var: + self.blocks_counter.value = int(np.sum((self.running_seq_len[ind_valid]+BLOCK_SIZE-1)//BLOCK_SIZE)) + # print(f"Rank {self.rank} updated blocks_counter {self.blocks_counter.value}") + self.cond_var.notify() + + def process_queries(self): + # Attempt to fetch queries from input queue + qitem_list, return_value = self.fetch_queries() + # Run inference (prefill/decode) for one step + step_outputs = self.run_one_step(qitem_list) + # Process (finished) step outputs + self.process_step_outputs(step_outputs) + return return_value + +def find_best_bucket(query_lists, prefill_tokens, input_len): + query_lists_new = query_lists + [0] + query_lists_new = (np.array(query_lists_new)+input_len)%(prefill_tokens+1) + # Try to fill the bucket with the minimum number of total tokens + best_ind = np.argmax(query_lists_new) + ind_full = query_lists_new>(prefill_tokens-64) + if ind_full.any(): + best_ind_full = np.argmax(ind_full) + else: + best_ind_full = len(query_lists_new) + best_ind = min(best_ind, best_ind_full) + return best_ind + +class SUT(): + def __init__(self, + model_path=None, + workload_name="llama2-70b", + lg_settings=None, + scenario="offline", + dtype="bfloat16", + device="cpu", + batch_size=None, + total_sample_count=24576, + dataset_path=None, + use_cached_outputs=False, # Set this to True *only for test accuracy runs* in case your prior session was killed partway through + workers=1, + tp=1, + pp=1, + quantized=False, + warmup=False, + partial_output_dir_list=[]): + if XPU_COUNT>0: # XPU needs spawn method. 
Can be problematic when using INSTANCE_OVERRIDE + try: + mp.set_start_method('spawn') + except RuntimeError: + print("Start method can only be set once") + + # If ONEAPI_DEVICE_SELECTOR is set outside, use that to limit device selection within instances + preset_device_selector = os.environ.get("ONEAPI_DEVICE_SELECTOR", "") + if preset_device_selector=="": + self.xpu_devices = range(XPU_COUNT) + else: + self.xpu_devices = preset_device_selector.split("level_zero:")[1].split(",") + self.xpu_devices = [str(d) for d in self.xpu_devices] + + self.tp = tp + self.pp = pp + + self.model_path = model_path or "meta-llama/Llama-3.1-70B-Instruct" + self.workload_name = workload_name + self.lg_settings=lg_settings + self.scenario = scenario + self.device = device + + if not batch_size: + if device == "cpu": + batch_size = 1 + else: + batch_size = 32 # Reduce to 8 if using 4 GPUs, 16 for 8. + self.batch_size = batch_size + self.total_sample_count = total_sample_count + #self.tp = tp + self.quantized=quantized + self.warmup = warmup + + # dtype + if dtype == 'bfloat16': + self.amp_enabled = True + self.amp_dtype = torch.bfloat16 + elif dtype == 'float16': + self.amp_enabled = True + self.amp_dtype = torch.float16 + else: + self.amp_enabled = False + self.amp_dtype = torch.float32 + + self.dataset_path = dataset_path + self.qsl = lg.ConstructQSL(self.total_sample_count, self.total_sample_count, + self.LoadSamplesToRam, self.UnloadSamplesFromRam) + + self.num_workers = workers + self.worker_threads = [None] * self.num_workers + self.query_queue_list = [mp.JoinableQueue() for _ in range(self.num_workers)] + self.query_queue_int = mp.Queue() + self.output_queue = mp.Queue() + self.alive_counter = mp.Value("i", 0) + self.dead_counter = mp.Value("i", 0) + self.cond_var = mp.Condition(lock=mp.Lock()) + + self.use_cached_outputs = use_cached_outputs + self.sample_counter = mp.Value("i", 0) + self.current_counter_list = [mp.Value("i", 0) for _ in range(self.num_workers)] + self.blocks_counter_list = [mp.Value("i", 0) for _ in range(self.num_workers)] + self.over_ttft_counter = mp.Value("i", 0) + self.over_tpot_counter = mp.Value("i", 0) + + # TODO: CHANGE THIS + if BATCHED_PREFILL: + # Adjust for chunked prefill + self.prefill_batch_size = 1 + else: + self.prefill_batch_size = 1 + + self.progress = None + self.tp_sizes = [] + self.load_dataset() # In case SUT may balance load using query lens + self.core_lists = [[]]*self.num_workers + self.completed_queries = self.find_completed_queries(partial_output_dir_list) + self.first_token_queue = None + + # Shared worker kwargs. These will be passed as is to the workers + self.shared_worker_kwargs={ + "model_path": self.model_path, + "workload_name": self.workload_name, + "num_workers": self.num_workers, + # "lg_settings": self.lg_settings, # issue with spawn + "dataset_path": self.dataset_path, + #"device": self.device, + "batch_size": self.batch_size, + "total_sample_count": self.total_sample_count, + "dtype": self.amp_dtype, + "output_queue": self.output_queue, + "first_queue": self.first_token_queue, + "cond_var": self.cond_var, + "alive_counter": self.alive_counter, + "dead_counter": self.dead_counter, + "sample_counter": self.sample_counter, + "over_ttft_counter": self.over_ttft_counter, + "over_tpot_counter": self.over_tpot_counter, + "server": self.scenario.lower()=="server", + "tp": self.tp, + "pp": self.pp, + "quantized": self.quantized, + "warmup": self.warmup, + "prefill_batch_size": self.prefill_batch_size, + } + + # Individual worker kwargs. 
These will be lists of size self.num_workers. i-th worker will get i-th element of each list + self.individual_worker_kwargs={ + "input_queue": self.query_queue_list, + "current_counter": self.current_counter_list, + "blocks_counter": self.blocks_counter_list, + "rank": list(range(self.num_workers)) + } + + if XPU_COUNT>0: + world_size = self.tp * self.pp + self.individual_worker_kwargs["xpu_devices"]=[",".join(self.xpu_devices[j*world_size:(j+1)*world_size]) for j in range(self.num_workers)] + + # Extend/override SUT method if it's defined + try: + self.sut_override = types.MethodType(SUT_OVERRIDE, self) + except NameError: + pass + else: + self.sut_override() + + def load_dataset(self): + if self.workload_name=="llama3_1-8b": + self.data_object = Dataset(model_name=self.model_path, + dataset_path=self.dataset_path, + total_sample_count=self.total_sample_count) + else: + self.data_object = Dataset(self.model_path, + dataset_path=self.dataset_path, + total_sample_count=self.total_sample_count, + ) + #device="cpu") + + def find_completed_queries(self, partial_output_dir_list): + import json + completed_queries = set() + for directory in partial_output_dir_list: + with open(os.path.join(directory,"mlperf_log_accuracy.json"), "r", encoding="UTF-8") as f: + lines = f.readlines() + combined_line = "" + for i,line in enumerate(lines): + if i==len(lines)-1: + if line[-1]!="]": + line += "]" + combined_line += line + log.info(f"End of combined_line {combined_line[-10:]}") + results = json.loads(combined_line) + for pred in results: + qsl_idx = pred["qsl_idx"] + if qsl_idx not in completed_queries: + completed_queries.add(qsl_idx) + log.info(f"Completed_queries {completed_queries}, partial list {partial_output_dir_list}") + return completed_queries + + + def LoadSamplesToRam(self, query_samples): + pass + + def UnloadSamplesFromRam(self, query_samples): + pass + + def start(self): + cur_copies = self.num_workers + # Create worker threads + for j in range(self.num_workers): + + # Build kwargs for inidividual workers + private_worker_kwargs = {key_i: self.individual_worker_kwargs[key_i][j] for key_i in self.individual_worker_kwargs} + private_worker_kwargs = private_worker_kwargs | self.shared_worker_kwargs + + worker = Instance(**private_worker_kwargs) + worker.start() + self.worker_threads[j] = worker + # Prevent host OOM + if (j%cur_copies==cur_copies-1): + # For some nodes it takes a long time to load the checkpoint so all instance weight load overlap, causing memory stress + with self.cond_var: + self.cond_var.wait_for(lambda: self.alive_counter.value >= self.num_workers//2, 120) + + with self.cond_var: + print(f"Waiting for alive_counter to be equal to {self.num_workers}") + self.cond_var.wait_for(lambda: self.alive_counter.value == self.num_workers) + + log.info(f"Starting internal issue query thread") + self.query_thread = threading.Thread(target=self.issue_queries_int_merged) + self.query_thread.daemon = True + self.query_thread.start() + + log.info(f"Starting Loadgen response thread") + self.response_thread = threading.Thread(target=self.response_loadgen) + self.response_thread.daemon = True + self.response_thread.start() + + def stop(self): + self.output_queue.put(None) + + for worker in self.worker_threads: + worker.join() + + if self.scenario=="server": + with self.cond_var: + log.info(f"[{time.time():.3f}] Test finished " + f"TTFT {self.over_ttft_counter.value} " + f"TPOT {self.over_tpot_counter.value} ") + try: + log.info(f"Max first time {np.max(self.first_time_list):.6f}s") + 
log.info(f"Average first time {np.mean(self.first_time_list):.6f}s") + except (AttributeError, ValueError): + pass + + self.cond_var.notify() + if PBAR: + # Ensure the last progress bar is updated + time.sleep(0.5) + self.progress.close() + + def update_pbar(self, tok_count, last_count, update_value): + postfix_str = f"{tok_count/self.progress.format_dict['elapsed']:.1f}toks/s" + self.progress.set_postfix_str(postfix_str, refresh=False) + self.progress.update(update_value) + self.progress.refresh() + last_count += update_value + return last_count + + def response_loadgen(self): + keep_alive = True + processed_count = 0 + last_count = 0 + tok_count = 0 + while keep_alive: + result = self.output_queue.get() + if result is None: + keep_alive = False + else: + qid, processed_output = result + + for i in range(len(processed_output)): + n_tokens = processed_output[i].shape[0] + response_array = array.array("B", processed_output[i].tobytes()) + bi = response_array.buffer_info() + response = [lg.QuerySampleResponse(qid[i], bi[0], bi[1], n_tokens)] + lg.QuerySamplesComplete(response) + processed_count += 1 + tok_count += n_tokens + + if PBAR: + # Limit prints to once every PBAR_FREQ completed samples + if processed_count - last_count >= PBAR_FREQ: + last_count = self.update_pbar(tok_count, last_count, PBAR_FREQ) + + # Update progress bar at the end + if PBAR and (processed_count > last_count): + last_count = self.update_pbar(tok_count, last_count, processed_count-last_count) + + def get_sut(self): + self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) + return self.sut + + def get_qsl(self): + return self.qsl + + def predict(self,**kwargs): + raise NotImplementedError + + def get_best_rank(self, value_added): + current_counters = np.array([(self.current_counter_list[i].value+value_added) for i in range(self.num_workers)]) # Instances priority will be ordered by their respective in-flight queries + target_rank = np.argmin(current_counters) + # if current_counters[target_rank]>self.batch_size: + # return -1 + # Instead of trying to only fill up to batch size, allow one queued up + if self.query_queue_list[target_rank].qsize()>0: + return -1 + else: + return target_rank + + def issue_queries(self, query_samples): + """ Receives samples from loadgen and adds them to queue. 
Users may choose to batch here""" + start_time = time.time() + if PBAR and (self.progress is None): + if len(query_samples)>1: + self.progress = tqdm(total=len(query_samples), smoothing=0.0) + else: + self.progress = tqdm(smoothing=0.0) + + query_sample_list = [] + for query_sample in query_samples: + # Continuous batching + self.query_queue_int.put((query_sample, start_time)) + + def try_dispatch( + self, + query_list, + delete_list, + bucket, + input_len_bucket, + time_start_bucket, + server=False): # Inactive for now + target_rank = -1 + wait_to_dispatch = True + while wait_to_dispatch: + with self.cond_var: + target_rank = self.get_best_rank(1) # With prepacking, prefill batch size is always 1 + if target_rank!=-1: + self.current_counter_list[target_rank].value += 1 + # Always wait for an instance with an empty slot(for now) + if target_rank==-1: + time.sleep(0.01) + else: + wait_to_dispatch = False + if target_rank!=-1: + print(f"Sending to rank {target_rank}, num_queries {len(query_list)}, before add {self.current_counter_list[target_rank].value}") + self.query_queue_list[target_rank].put((query_list, time_start_bucket)) + delete_list.append(bucket) + return True + return False + + # Load balancer for merged prefill + def issue_queries_int_merged(self): + keep_alive = True + + query_list = [] + time_start_list = [] + + # TODO: use the actual latency target instead of the hard-coded numbers + time_left = 9999# if self.scenario=="offline" else SERVER_TIME_LIMIT + #time_compute = SERVER_COMPUTE_TIME + time_compute = 0 + + while keep_alive: + new_query = False + try: + query = self.query_queue_int.get(timeout=0.05) + except Exception: + # No query available within timeout, continue + pass + else: + if query is None: + keep_alive = False + else: + query, start_time = query + start_time_c = start_time + new_query = True + + if new_query: + time_start_list.append(start_time_c) + query_list.append(query) + + if len(query_list)>0: + time_wait = time.time()-time_start_list[0] + time_needed = time_wait + time_compute + # When to dispatch + # 1. Time limit passed + # 2. Reached prefill batch size + # 3. 
End of inference + if (time_needed > time_left) or (len(query_list)==self.prefill_batch_size) or (not keep_alive): + target_rank = -1 + # Continue trying to dispatch until there is an empty spot + while target_rank==-1: + with self.cond_var: + target_rank = self.get_best_rank(len(query_list)) + if target_rank!=-1: + self.current_counter_list[target_rank].value += len(query_list) + if target_rank==-1: + time.sleep(0.01) + print(f"Sending to rank {target_rank}, length {len(query_list)}, wait_time {time_wait:.2f}s") + self.query_queue_list[target_rank].put((query_list, time_start_list)) + query_list = [] + time_start_list = [] + + for i in range(self.num_workers): + print("Putting none in", i) + self.query_queue_list[i].put(None) + + def flush_queries(self): + self.query_queue_int.put(None) + pass + + def __del__(self): + pass + + +class SUTServer(SUT): + def __init__(self, **kwargs): + kwargs["scenario"]="server" + super().__init__(**kwargs) + self.first_token_queue = mp.Queue() + self.shared_worker_kwargs["first_queue"]=self.first_token_queue + self.first_time_list = [] + + def start(self): + super().start() + # Create first token response thread + log.info(f"Starting first-token response thread") + self.ft_response_thread = threading.Thread(target=self.process_first_tokens) + self.ft_response_thread.daemon = True + self.ft_response_thread.start() + + def process_first_tokens(self): + while True: + first_token_item = self.first_token_queue.get() + + if first_token_item is None: + log.info("Exiting First token response thread") + break + + qid, processed_output, send_time = first_token_item + + for i in range(len(processed_output)): + response_data = array.array("B", processed_output[i].tobytes()) + bi = response_data.buffer_info() + response = [lg.QuerySampleResponse(qid[i], bi[0], bi[1])] + lg.FirstTokenComplete(response) + receive_time = time.time()-send_time + self.first_time_list.append(receive_time) + + diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/dataset.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/dataset.py new file mode 100644 index 00000000000..bc66165a0cc --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/dataset.py @@ -0,0 +1,148 @@ +import os +import time +import numpy as np +import torch +from datasets import load_dataset, load_from_disk +from transformers import AutoModelForCausalLM, AutoTokenizer +from torch.nn.functional import pad +from torch.utils.data import DataLoader +from typing import Optional, Dict, Sequence +import io +#import utils +import copy + +import logging +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("Llama-70B-Dataset") + +import random + +class Dataset(): + def __init__(self, model_name=None, total_sample_count=24576, perf_count_override=None, dataset_path=None, device="cpu"): + self.model_name = model_name or "meta-llama/Llama-2-70b-chat-hf" + self.dataset_path = dataset_path + self.max_length = 1024 + self.device = device + + #self.total_sample_count = total_sample_count + + self.load_tokenizer() + self.load_processed_dataset() + + self.total_sample_count = min(len(self.input_ids), total_sample_count) + self.perf_count = perf_count_override or self.total_sample_count + + def load_tokenizer(self): + """ Returns tokenizer """ + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_name) + + self.tokenizer.pad_token = self.tokenizer.eos_token + + def load_processed_dataset(self): + if not 
os.path.isfile(self.dataset_path): + log.warn("Processed pickle file {} not found. Please check that the path is correct".format(self.dataset_path)) + + if "Llama-3" in self.model_name and "orca" not in self.dataset_path: + import pandas as pd + + self.processed_data = pd.read_json(self.dataset_path) + + self.input = self.processed_data.input.tolist() + self.input_ids = self.processed_data.tok_input.tolist() + self.input_lens = [len(x) for x in self.input_ids] + self.targets = self.processed_data.output.tolist() + + del self.processed_data + return + + import pandas as pd + # Note: Using pickle with trusted dataset files only + # In production, consider using safer serialization formats like JSON or HDF5 + processed_data = pd.read_pickle(self.dataset_path) # nosec B301 + # input_tokens = processed_data['tok_input'] + + if ("405b" in self.dataset_path) and ("llama" not in self.model_name): + # Running 405b dataset with a different model + # Tokenize the dataset instead of using the tokenized dataset + input_strs = processed_data['input'] + encode_bs = 128 + self.input_ids = [] + self.input_lens = [] + self.attention_masks = [] + input_strs_batch = [] + for i,strs in enumerate(input_strs): + # if i%10==9: + # print(i) + input_strs_batch.append(strs) + if len(input_strs_batch)>=encode_bs: + input_ids_batch = self.tokenizer.batch_encode_plus(input_strs_batch)['input_ids'] + for input_ids in input_ids_batch: + input_ids = torch.tensor([input_ids]) + attn_mask = torch.ones_like(input_ids) + self.input_ids.append(input_ids) + self.attention_masks.append(attn_mask) + self.input_lens.append(input_ids.shape[-1]) + input_strs_batch = [] + # exit() + if len(input_strs_batch)>0: + input_ids_batch = self.tokenizer.batch_encode_plus(input_strs_batch)['input_ids'] + for input_ids in input_ids_batch: + input_ids = torch.tensor([input_ids]) + attn_mask = torch.ones_like(input_ids) + self.input_ids.append(input_ids) + self.attention_masks.append(attn_mask) + self.input_lens.append(input_ids.shape[-1]) + input_strs_batch = [] + elif ("orca" in self.dataset_path) and ("llama2" not in self.model_name.lower()): + user_prompts = processed_data['question'] + system_prompts = processed_data['system_prompt'] + + self.input_ids = [] + self.input_lens = [] + self.attention_masks = [] + + for i in range(len(processed_data)): + user_prompt = user_prompts.iloc[i] + system_prompt = system_prompts.iloc[i] + message = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ] + input_ids = self.tokenizer.apply_chat_template(message, add_generation_prompt=True) + input_ids = torch.tensor(input_ids, dtype=torch.int32).view(1,-1).to(self.device) + attn_mask = torch.ones_like(input_ids) + self.input_ids.append(input_ids) + self.attention_masks.append(attn_mask) + self.input_lens.append(input_ids.shape[-1]) + else: + input_tokens = processed_data['tok_input'] + self.input_ids = [] + self.input_lens = [] + self.attention_masks = [] + + for ids in input_tokens: + input_ids = torch.tensor(ids, dtype=torch.int32).view(1,-1).to(self.device) + attn_mask = torch.ones_like(input_ids) + self.input_ids.append(input_ids) + self.attention_masks.append(attn_mask) + self.input_lens.append(input_ids.shape[-1]) + + def postProcess(self, out_tokens, input_seq_lens=None, query_id_list=None, sample_index_list=None): + """ Postprocesses output prediction """ + + output_seqs = [] + for i,out_token in enumerate(out_tokens): + output_seq = np.array(out_token).reshape(-1) + output_seqs.append(output_seq) + + return 
output_seqs + + def LoadSamplesToRam(self, sample_list): + pass + + def UnloadSamplesFromRam(self, sample_list): + pass + + def __del__(self): + pass diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/evaluation_scripts/evaluate-accuracy-cnn.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/evaluation_scripts/evaluate-accuracy-cnn.py new file mode 100644 index 00000000000..0731818808b --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/evaluation_scripts/evaluate-accuracy-cnn.py @@ -0,0 +1,140 @@ +import os +import time +import numpy as np +import json +import nltk +import array +import torch +from torch.nn.functional import pad +from torch.utils.data import DataLoader +import evaluate +import argparse +import nltk +from transformers import AutoModelForCausalLM, AutoTokenizer +from pathlib import Path +import sys + +file = Path(__file__).resolve() +parent, root = file.parent, file.parents[1] +sys.path.append(str(root)) + +from dataset import Dataset + +def get_args(): + """Parse commandline.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json" + ) + parser.add_argument( + "--dataset-file", + required=True, + help="path to cnn_eval.json") + parser.add_argument( + "--verbose", + action="store_true", + help="verbose messages") + parser.add_argument( + "--dtype", + default="int64", + help="dtype of the accuracy log", + choices=["int32", "int64"], + ) + parser.add_argument( + "--checkpoint-path", + default="meta-llama/Meta-Llama-3.1-8B-Instruct", + help="Model name") + parser.add_argument( + "--total-sample-count", + default=13368, + type=int, + help="Model name") + args = parser.parse_args() + return args + + +def postprocess_text(preds, targets): + preds = [pred.strip() for pred in preds] + targets = [target.strip() for target in targets] + + # rougeLSum expects newline after each sentence + preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] + targets = ["\n".join(nltk.sent_tokenize(target)) for target in targets] + + return preds, targets + + +def main(): + + args = get_args() + model_name = args.checkpoint_path + dataset_path = args.dataset_file + total_sample_count = args.total_sample_count + metric = evaluate.load("rouge") + nltk.download("punkt") + nltk.download('punkt_tab') + + tokenizer = AutoTokenizer.from_pretrained( + model_name, + model_max_length=2048, + padding_side="left", + use_fast=True, + ) + tokenizer.pad_token = tokenizer.eos_token + data_object = Dataset( + model_name=model_name, + dataset_path=dataset_path, + total_sample_count=total_sample_count, + ) + + targets = data_object.targets + + with open(args.mlperf_accuracy_file, "r") as f: + results = json.load(f) + + # Deduplicate the results loaded from the json + dedup_results = [] + seen = set() + for result in results: + item = result["qsl_idx"] + if item not in seen: + seen.add(item) + dedup_results.append(result) + results = dedup_results + + target_required = [] + preds_token_ids = [] + + eval_dtype = np.int64 + if args.dtype == "int32": + eval_dtype = np.int32 + + for pred in results: + qsl_idx = pred["qsl_idx"] + target = targets[qsl_idx] + target_required.append(target) + preds_token_ids.append( + np.frombuffer( + bytes.fromhex( + pred["data"]), + eval_dtype)) + + preds_decoded_text = tokenizer.batch_decode( + preds_token_ids, skip_special_tokens=True + ) + + preds, targets = 
postprocess_text(preds_decoded_text, target_required) + + result = metric.compute( + predictions=preds, references=targets, use_stemmer=True, use_aggregator=False + ) + result = {k: f"{round(np.mean(v) * 100, 4)}" for k, v in result.items()} + prediction_lens = [len(pred) for pred in preds] + result["gen_len"] = np.sum(prediction_lens) + result["gen_num"] = len(preds) + print("\nResults\n") + print(result) + + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/evaluation_scripts/evaluate-accuracy.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/evaluation_scripts/evaluate-accuracy.py new file mode 100644 index 00000000000..9efb9bd7763 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/evaluation_scripts/evaluate-accuracy.py @@ -0,0 +1,140 @@ +import argparse +from transformers import AutoTokenizer +import nltk +import evaluate +import numpy as np +import json + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--checkpoint-path", required=True, help="Path to Llama2-70b-hf-chat checkpoint" + ) + parser.add_argument( + "--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json" + ) + parser.add_argument( + "--dataset-file", + required=True, + help="path to processed openorca validation set", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="verbose messages") + parser.add_argument( + "--dtype", + default="int64", + help="dtype of the accuracy log", + choices=["int32", "int64", "float"], + ) + parser.add_argument("--estimate-performance", + action='store_true', + help="Estimate performance using mlperf_log_detail.txt" + ) + parser.add_argument("--mlperf-log-detail", + help="Path to mlperf_log_detail.txt. 
Only used if --estimate-performance is true" + ) + args = parser.parse_args() + return args + +def get_groundtruth(processed_dataset_file): + import pandas as pd + + # Note: Using pickle with trusted dataset files only + # In production, consider using safer serialization formats like JSON or HDF5 + data = pd.read_pickle(processed_dataset_file) # nosec B301 + ground_truths = data["output"] + return ground_truths + + +def postprocess_text(preds, targets): + preds = [pred.strip() for pred in preds] + targets = [target.strip() for target in targets] + + # rougeLSum expects newline after each sentence + preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] + targets = ["\n".join(nltk.sent_tokenize(target)) for target in targets] + + return preds, targets + + +def main(): + + args = get_args() + dataset_path = args.dataset_file + checkpoint_path = args.checkpoint_path + metric = evaluate.load("rouge") + nltk.download("punkt") + nltk.download("punkt_tab") + + tokenizer = AutoTokenizer.from_pretrained( + checkpoint_path, + # model_max_length=2048, + # padding_side="left", + # use_fast=False, + ) + + targets = get_groundtruth(args.dataset_file) + + target_required = [] + preds_token_ids = [] + + eval_dtype = np.int64 + if args.dtype == "int32": + eval_dtype = np.int32 + elif args.dtype == "float": + eval_dtype = np.float32 + + with open(args.mlperf_accuracy_file, "r") as f: + results = json.load(f) + + seen = set() + gen_tok_len = 0 + for pred in results: + qsl_idx = pred["qsl_idx"] + if qsl_idx in seen: + continue + + seen.add(qsl_idx) + target = targets[qsl_idx] + target_required.append(target) + pred = np.frombuffer(bytes.fromhex(pred["data"]), eval_dtype) + + gen_tok_len += len(pred) + preds_token_ids.append(pred) + + preds_decoded_text = tokenizer.batch_decode( + preds_token_ids, skip_special_tokens=True + ) + + preds, targets = postprocess_text(preds_decoded_text, target_required) + + result = metric.compute( + predictions=preds, references=targets, use_stemmer=True, use_aggregator=False + ) + result = {k: f"{round(np.mean(v) * 100, 4)}" for k, v in result.items()} + prediction_lens = [len(pred) for pred in preds] + gen_num = len(preds) + + result = { + **result, + "gen_len": f"{np.sum(prediction_lens)}", + "gen_num": gen_num, + "gen_tok_len": gen_tok_len, + "tokens_per_sample": round(gen_tok_len / gen_num, 1), + } + + if args.estimate_performance: + import sys + import os + from utils import estimate_performance + tokens_per_second = estimate_performance(args.mlperf_log_detail, result) + result["estimated_performance"] = tokens_per_second + + print("\nResults\n") + print(result) + + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/main.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/main.py new file mode 100644 index 00000000000..597ad43765f --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/main.py @@ -0,0 +1,162 @@ +import subprocess # nosec B404 +import mlperf_loadgen as lg +import argparse +import os +import logging +import sys +import hashlib +import time +from SUT import SUT, SUTServer + +sys.path.insert(0, os.getcwd()) + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--scenario", type=str, choices=["Offline", "Server", "SingleStream"], default="Offline", help="Scenario") + parser.add_argument("--model-path", type=str, default="meta-llama/Llama-2-70b-chat-hf", help="Model name") + 
parser.add_argument("--workload-name", type=str, default="llama2-70b") + parser.add_argument("--dataset-path", type=str, default="/software/users/mlperf/datasets/open_orca_gpt4_tokenized_llama.sampled_24576.pkl", help="") # TODO: Reset default before submission + parser.add_argument("--accuracy", action="store_true", help="Run accuracy mode") + parser.add_argument("--dtype", type=str, default="bfloat16", help="data type of the model, choose from float16, bfloat16 and float32") + parser.add_argument("--device", type=str, default="hpu", help="device to use") + parser.add_argument("--audit-conf", type=str, default="audit.conf", help="audit config for LoadGen settings during compliance runs") + parser.add_argument("--mlperf-conf", type=str, default="mlperf.conf", help="mlperf rules config") + parser.add_argument("--user-conf", type=str, default="user.conf", help="user config for user LoadGen settings such as target QPS") + parser.add_argument("--total-sample-count", type=int, default=24576, help="Number of samples to use in benchmark.") # TODO: This interpretation of 'total-sample-count' is a little misleading. Fix it + parser.add_argument("--output-log-dir", type=str, default="build/logs", help="Where logs are saved") + parser.add_argument("--enable-log-trace", action="store_true", help="Enable log tracing. This file can become quite large") + parser.add_argument("--tensor-parallel", type=int, default=1, help="Tensor parallel size") + parser.add_argument("--pipeline-parallel", type=int, default=1, help="Pipeline parallel size") + parser.add_argument("--num-workers", type=int, default=1, help="Number of workers to process queries") + parser.add_argument("--batch-size", type=int, default=1) + parser.add_argument("--quantized", action='store_true', help="If using a AWQ quantized model") + parser.add_argument("--warmup", action='store_true', help="Do warmup") + + args = parser.parse_args() + + print("### Benchmark parameters") + params_list = [action.dest for action in parser._actions] + for param in params_list: + try: + value = getattr(args, param) + print(f"{param} = {value}") + except AttributeError: + continue + # Define grep patterns for environment and package info + env_patterns = ['VLLM', 'PT_', 'HL_', 'MAX_'] + pip_patterns = ['vllm', 'neural', 'loadgen', 'intel', 'triton', 'ccl'] + + # Get environment variables + for pattern in env_patterns: + try: + # Using standard system command - trusted input + result = subprocess.run(['env'], capture_output=True, text=True, check=True) # nosec B607 B603 + filtered_output = '\n'.join([line for line in result.stdout.split('\n') if pattern in line]) + if filtered_output: + print(filtered_output) + except Exception: + # Environment variable lookup failed, continue + continue + + # Get pip package info + for pattern in pip_patterns: + try: + # Using standard system command - trusted input + result = subprocess.run(['pip', 'list'], capture_output=True, text=True, check=True) # nosec B607 B603 + filtered_output = '\n'.join([line for line in result.stdout.split('\n') if pattern in line]) + if filtered_output: + print(filtered_output) + except Exception: + # Package listing failed, continue + continue + + with open('user.conf', 'r') as file: + for line in file: + if line.strip(): + print(line, end='') + print("########################") + return args + +scenario_map = { + "offline": lg.TestScenario.Offline, + "server": lg.TestScenario.Server, + "singlestream": lg.TestScenario.SingleStream + } + +def main(): + args = get_args() + + 
logging.basicConfig(level=logging.INFO) + log = logging.getLogger(args.workload_name + "-MAIN") + + sut_map = { + "offline": SUT, + "server": SUTServer, + "singlestream": SUTServer + } + + settings = lg.TestSettings() + settings.scenario = scenario_map[args.scenario.lower()] + # Need to update the conf + settings.FromConfig(args.user_conf, args.workload_name, args.scenario) + + if args.accuracy: + settings.mode = lg.TestMode.AccuracyOnly + log.warning("Accuracy run will generate the accuracy logs, but the evaluation of the log is not completed yet") + else: + settings.mode = lg.TestMode.PerformanceOnly + + partial_output_dir_list = [] + cur_output_log_dir = args.output_log_dir + + os.makedirs(cur_output_log_dir, exist_ok=True) + log_output_settings = lg.LogOutputSettings() + log_output_settings.outdir = cur_output_log_dir + log_output_settings.copy_summary_to_stdout = True + log_settings = lg.LogSettings() + log_settings.log_output = log_output_settings + log_settings.enable_trace = args.enable_log_trace + + sut_cls = sut_map[args.scenario.lower()] + + sut = sut_cls( + model_path=args.model_path, + workload_name=args.workload_name, + lg_settings=settings, + dtype=args.dtype, + dataset_path=args.dataset_path, + total_sample_count=args.total_sample_count, + device=args.device, + workers=args.num_workers, + tp=args.tensor_parallel, + pp=args.pipeline_parallel, + batch_size=args.batch_size, + quantized=args.quantized, + warmup=args.warmup, + partial_output_dir_list=partial_output_dir_list + ) + # import pdb;pdb.set_trace() + + # Start sut before loadgen starts + sut.start() + lgSUT = lg.ConstructSUT(sut.issue_queries, sut.flush_queries) + log.info("Starting Benchmark run") + t_start = time.time() + lg.StartTestWithLogSettings(lgSUT, sut.qsl, settings, log_settings, args.audit_conf) + t_end = time.time() + log.info("Test took {:.1f} sec".format(t_end - t_start)) + + # Stop sut after completion + sut.stop() + + log.info("Run Completed!") + + log.info("Destroying SUT...") + lg.DestroySUT(lgSUT) + + log.info("Destroying QSL...") + lg.DestroyQSL(sut.qsl) + + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/run_quantization.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/run_quantization.py new file mode 100644 index 00000000000..9e25237f05f --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/run_quantization.py @@ -0,0 +1,116 @@ +from argparse import ArgumentParser +import transformers +from transformers import AutoModelForCausalLM, AutoTokenizer +#from auto_round import AutoRound +from dataset import Dataset +import torch +torch.use_deterministic_algorithms(True, warn_only=True) + +#========================= +import os +import subprocess +import pandas as pd + +os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" + +from transformers import set_seed + +import re + +os.environ["TOKENIZERS_PARALLELISM"] = "false" +#========================= + +def get_args(): + parser = ArgumentParser(description="AutoRound quantization script") + parser.add_argument("--model_name", type=str, required=True, help="Name of the model to quantize") + parser.add_argument("--dataset-path", type=str, required=True, help="Path to the dataset for quantization") + parser.add_argument("--bits", type=int, default=4, help="Number of bits for quantization") + parser.add_argument("--group_size", type=int, default=128, help="Group size for quantization") + 
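+    # Note: group_size=-1 typically selects per-channel quantization; positive values quantize weights in groups of that size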
parser.add_argument("--sym", action='store_true', help="Use symmetric quantization") + parser.add_argument("--nsamples", type=int, default=512, help="Number of samples for quantization") + parser.add_argument("--iters", type=int, default=1000, help="Number of iterations for quantization") + parser.add_argument("--output_dir", type=str, default="./tmp_autoround", help="Directory to save the quantized model") + parser.add_argument("--quant_lm_head", action='store_true', help="Quantize the language model head") + parser.add_argument("--act_bits", type=int, default=8, help="Number of bits for activation quantization") + parser.add_argument("--device", type=str, default="xpu", help="Device to run the quantization on") + return parser.parse_args() + +def main(): + args = get_args() + model_name = args.model_name + dataset_path = args.dataset_path + nsamples = args.nsamples + iters = args.iters + bits = args.bits + group_size = args.group_size + sym = args.sym + output_dir = args.output_dir + + # Load dataset + calib_dataset = Dataset(model_name=model_name, dataset_path=dataset_path, total_sample_count=nsamples) + tokenizer = calib_dataset.tokenizer + calib_dataset = [torch.tensor(input).reshape(1, -1) for input in calib_dataset.input_ids[:nsamples]] + + # Load model + model = AutoModelForCausalLM.from_pretrained(model_name, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + trust_remote_code=True,) + + model = model.eval() # Set model to evaluation mode + model = model.to(torch.float16) # Convert model to float16 for quantization + #model.seqlen = 3072 + + layer_config = {} + for n, m in model.named_modules(): + if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D): + if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: + layer_config[n] = {"bits": 32} + print( + f"{n} will not be quantized due to its shape not being divisible by 32, resulting in an exporting issue to autogptq") + lm_head_layer_name = "lm_head" + for n, _ in model.named_modules(): + lm_head_layer_name = n + if args.quant_lm_head: + from transformers import AutoConfig + + config = AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code) + if config.tie_word_embeddings and hasattr(model, "_tied_weights_keys"): + tied_keys = model._tied_weights_keys + for item in tied_keys: + if lm_head_layer_name in item: ##TODO extend to encoder-decoder layer, seq classification model + args.quant_lm_head = False + print( + f"warning, disable quant_lm_head as quantizing lm_head with tied weights has not been " + f"supported currently") + break + if args.quant_lm_head: + layer_config[lm_head_layer_name] = {"bits": args.bits} + transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]] + if transformers_version[0] == 4 and transformers_version[1] < 38: + error_message = "Please upgrade transformers>=4.38.0 to support lm-head quantization." 
+ raise EnvironmentError(error_message) + + from auto_round import AutoRound + + qmodel = AutoRound(model=model, + tokenizer=tokenizer, + bits=bits, + group_size=group_size, + sym=sym, + dataset=calib_dataset, + seqlen=256, + iters=iters, + device=args.device, + gradient_accumulate_steps=1, + batch_size=1, + layer_config=layer_config, + model_dtype="float16" + ) + + export_dir = output_dir + "/" + model_name.split('/')[-1] + f"-autoround-w{bits}g{group_size}-iters{iters}-{args.device}" + + qmodel.quantize_and_save(export_dir, format='auto_awq') + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/user.conf b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/user.conf new file mode 100644 index 00000000000..98958f6e0d7 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/user.conf @@ -0,0 +1,10 @@ +llama2-70b.Offline.min_query_count = 24576 +llama2-70b.Server.min_query_count = 24576 +llama2-70b.Server.target_qps = .90 + +llama3_1-8b.Server.target_qps = 44.0 +llama3_1-8b.Offline.target_qps = 50.0 + +llama3_1-8b-edge.Offline.target_qps = 12.50 +llama3_1-8b-edge.SingleStream.target_latency = 3500 + diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/utils.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/utils.py new file mode 100644 index 00000000000..2204c179be2 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/utils.py @@ -0,0 +1,71 @@ +import subprocess # nosec B404 +import json +import os +import io + +def _make_r_io_base(f, mode: str): + if not isinstance(f, io.IOBase): + f = open(f, mode=mode) + return f + +def jload(f, mode="r"): + """Load a .json file into a dictionary.""" + f = _make_r_io_base(f, mode) + jdict = json.load(f) + f.close() + return jdict + +def get_start_cores(): + # Use safer subprocess call without shell=True - single process approach + try: + # Using standard system command - trusted input + result = subprocess.run(['lscpu'], capture_output=True, text=True, check=True) # nosec B607 B603 + start_cores = [] + + # Process each line to find NUMA node CPU information + for line in result.stdout.split('\n'): + if 'NUMA node' in line and 'CPU' in line: + # Extract the CPU range (e.g., "0-15" from "NUMA node0 CPU(s): 0-15") + parts = line.split() + if len(parts) >= 3: + cpu_range = parts[-1] # Last part should be the CPU range + # Get the start core (before the '-') + if '-' in cpu_range: + start_core = int(cpu_range.split('-')[0]) + start_cores.append(start_core) + + return start_cores if start_cores else [0] + except Exception: + # Fallback to basic approach + return [0] + +def get_total_runtime(log_detail_file): + import re + from datetime import datetime + + with open(log_detail_file) as fid: + text = fid.read() + + bpat = '"power_begin", "value": "(.*?)",' + epat = '"power_end", "value": "(.*?)",' + + pbegin = re.search(bpat, text).group(1) + pend = re.search(epat, text).group(1) + + print(f"Test started at {pbegin}") + print(f"Test ended at {pend}") + + start_time = datetime.strptime(pbegin, "%m-%d-%Y %H:%M:%S.%f") + end_time = datetime.strptime(pend, "%m-%d-%Y %H:%M:%S.%f") + time_difference = end_time - start_time + total_time_in_seconds = time_difference.total_seconds() + print(f"Total time taken is {total_time_in_seconds}") + return total_time_in_seconds + +def estimate_performance(log_detail_file, 
acc_result_dict): + total_time_in_seconds = get_total_runtime(log_detail_file) + tokens_per_sample = acc_result_dict["tokens_per_sample"] + num_samples = acc_result_dict["gen_num"] + samples_per_second = round(num_samples / total_time_in_seconds, 3) + tokens_per_second = round(tokens_per_sample * samples_per_second, 2) + return tokens_per_second \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/utils_model.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/utils_model.py new file mode 100644 index 00000000000..1185f3ae19e --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/utils_model.py @@ -0,0 +1,100 @@ +import os +from vllm import SamplingParams +import numpy as np +import time +import types + +# Disable prints if progress bar is active +def void(*args, **kwargs): + pass + + +PBAR = int(os.environ.get("PBAR", "1")) +# Update frequency of the progress bar +PBAR_FREQ = int(os.environ.get("PBAR_FREQ", "100")) +XPU_COUNT = int(os.environ.get("XPU_COUNT", "0")) +WORKLOAD_NAME= os.environ.get("WORKLOAD_NAME", "llama2-70b") +GPU_MEMORY_UTILIZATION= float(os.environ.get("GPU_MEMORY_UTILIZATION", "0.95")) +if PBAR==1: + print = void + +#***********************Values to be imported into SUT.py + +SAMPLING_PARAMS = SamplingParams( + max_tokens=1024 if WORKLOAD_NAME=="llama2-70b" else 128, + min_tokens=1, + temperature=0.0, + detokenize=False +) + +ADDITIONAL_MODEL_KWARGS = { + "gpu_memory_utilization": GPU_MEMORY_UTILIZATION +} + +if XPU_COUNT>0: + ########################XPU-specific + #ADDITIONAL_MODEL_KWARGS["enforce_eager"] = True + pass +else: + ########################CPU-specific + + from numa import schedule, memory + import os + import math + import subprocess # nosec B404 + cores_per_inst = int(os.environ.get("CORES_PER_INST", "1")) + num_numa_nodes = int(os.environ.get("NUM_NUMA_NODES", "1")) + nodes_per_inst = int(os.environ["NUM_NUMA_NODES"])/int(os.environ["NUM_INSTS"]) + insts_per_node = int(os.environ["INSTS_PER_NODE"]) + # Only start workers in allowed NUMA nodes. 
Useful when node sharing + ALLOWED_NODES = os.environ.get("ALLOWED_NODES", "all") + + def get_start_cores(): + # Use safer subprocess call without shell=True - single process approach + try: + # Using standard system command - trusted input + result = subprocess.run(['lscpu'], capture_output=True, text=True, check=True) # nosec B607 B603 + start_cores = [] + + # Process each line to find NUMA node CPU information + for line in result.stdout.split('\n'): + if 'NUMA node' in line and 'CPU' in line: + # Extract the CPU range (e.g., "0-15" from "NUMA node0 CPU(s): 0-15") + parts = line.split() + if len(parts) >= 3: + cpu_range = parts[-1] # Last part should be the CPU range + # Get the start core (before the '-') + if '-' in cpu_range: + start_core = int(cpu_range.split('-')[0]) + start_cores.append(start_core) + + return start_cores if start_cores else [0] + except Exception: + # Fallback to basic approach + return [0] + + def SUT_OVERRIDE(self): + node_start_cores = get_start_cores() + core_lists = [] + if insts_per_node>0: + for i in range(num_numa_nodes): + for j in range(insts_per_node): + core_lists.append(tuple(range(node_start_cores[i]+j*cores_per_inst, node_start_cores[i]+(j+1)*cores_per_inst))) + + node_list = list(range(len(node_start_cores))) + allowed_nodes = [str(node) for node in node_list] + if ALLOWED_NODES!="all": + allowed_nodes = ALLOWED_NODES.split(",") + self.individual_worker_kwargs["core_list"] = core_lists + + instance_node_list = [] + for j in range(self.num_workers): + cur_node = math.floor(j*nodes_per_inst) + instance_node_list.append(tuple([cur_node])) + self.individual_worker_kwargs["node_list"] = instance_node_list + + + def INSTANCE_OVERRIDE(self): + print("self.node_list", self.node_list) + os.environ["VLLM_CPU_OMP_THREADS_BIND"]=f"{self.core_list[0]}-{self.core_list[-1]}" + memory.set_membind_nodes(*self.node_list) From 4555df1398cc1f12f7415ed00a6e1342b82334ec Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Nov 2025 05:54:09 +0000 Subject: [PATCH 3/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../language-modeling/mlperf/llama2-70b/SUT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py index 8af65d15d99..428788afa6f 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py @@ -708,7 +708,7 @@ def start(self): # Create worker threads for j in range(self.num_workers): - # Build kwargs for inidividual workers + # Build kwargs for individual workers private_worker_kwargs = {key_i: self.individual_worker_kwargs[key_i][j] for key_i in self.individual_worker_kwargs} private_worker_kwargs = private_worker_kwargs | self.shared_worker_kwargs From 1f5178557e58c3261cc58cd70c0f71a547e0e6bd Mon Sep 17 00:00:00 2001 From: lkk Date: Tue, 18 Nov 2025 06:29:48 +0000 Subject: [PATCH 4/6] update doc. 
--- .../mlperf/llama2-70b/README.md | 79 +++++++++++++++++++ .../mlperf/llama2-70b/SUT.py | 2 +- .../mlperf/llama2-70b/quantize_70b.sh | 4 +- .../mlperf/llama2-70b/quantize_autoround.py | 7 +- .../mlperf/llama2-70b/simple_autoround.py | 5 +- 5 files changed, 87 insertions(+), 10 deletions(-) create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md new file mode 100644 index 00000000000..4723276c1e4 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md @@ -0,0 +1,79 @@ +# Llama2 + +## Getting started + +Please first download the data, model and preprocess the data folloing the steps below _within the mlperf container_. Note that if you have already downloaded the model and data prior to v4.1, you don't need to redo them. But you _need to re-run_ the preprocess_data step for the updated calibration data. + +### Download Model + +Please download model files by following the mlcommons README.md with instructions: + +```bash +# following steps: https://github.com/mlcommons/inference/blob/master/language/llama2-70b/README.md#get-dataset +``` + +### Download and Prepare Data + +Please download data files by following the mlcommons README.md with instructions. +Please move the downloaded pickle into expected path and follow steps to run the required data pre-processing: + +```bash +# follow: https://github.com/mlcommons/inference/blob/master/language/llama2-70b/README.md#get-dataset +# to download file: open_orca_gpt4_tokenized_llama.sampled_24576.pkl.gz, open_orca_gpt4_tokenized_llama.calibration_1000.pkl.gz + +# unzip files +gzip -dk open_orca_gpt4_tokenized_llama.sampled_24576.pkl.gz +gzip -dk open_orca_gpt4_tokenized_llama.calibration_1000.pkl.gz + +# make sure you are in mlperf's container +make prebuild + +# move into right directory +mv open_orca_gpt4_tokenized_llama.*.pkl build/data/llama2-70b/ + +``` + +Make sure after the 2 steps above, you have: + +1. model downloaded at: `build/models/Llama2/Llama-2-70b-chat-hf/`, +2. preprocessed data at `build/preprocessed_data/llama2-70b/`: + +- `build/preprocessed_data/llama2-70b/open_orca_gpt4_tokenized_llama.sampled_24576.pkl` +- `build/preprocessed_data/llama2-70b/open_orca_gpt4_tokenized_llama.calibration_1000.pkl` + +## Build and run the benchmarks + +Please follow the steps below in MLPerf container. 
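+
+Before quantizing, you can optionally sanity-check that the core dependencies are importable once the environment below has been prepared. This assumes `setup.sh` (or your own environment) provides the `vllm`, `auto_round` and `mlperf_loadgen` modules used by the scripts in this directory:
+
+```bash
+python -c "import vllm, auto_round, mlperf_loadgen; print('env ok, vllm', vllm.__version__)"
+```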
+ +### Prepare environments + +```bash +# make sure you are in mlperf's container +bash setup.sh + +``` + +### Quantize model + +Use [Intel/auto-round](https://github.com/intel/auto-round) to make the quantization model + +```bash +# quantize model +bash quantize_70b.sh + +# you can also use the default quantization config of autoround +# run the `python simple_autoround.py` directly +``` + +### Benchmark + +```bash +# inference + +VLLM_USE_STANDALONE_COMPILE=0 VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1,2 python main.py --mlperf-conf mlperf.conf --model-path build/models/Llama2/Llama-2-70b-chat-hf-quantized --dataset-path build/preprocessed_data/llama2-70b/open_orca_gpt4_tokenized_llama.sampled_24576.pkl --tensor-parallel 2 --warmup --user-conf user.conf --accuracy --batch-size 128 + +# evaluation +CUDA_VISIBLE_DEVICES=2 python evaluation_scripts/evaluate-accuracy.py --checkpoint-path build/models/Llama2/Llama-2-70b-chat-hf-quantized --mlperf-accuracy-file build/logs/mlperf_log_accuracy.json --dataset-file build/preprocessed_data/llama2-70b/open_orca_gpt4_tokenized_llama.sampled_24576.pkl --dtype int64 + +``` + diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py index 428788afa6f..8af65d15d99 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py @@ -708,7 +708,7 @@ def start(self): # Create worker threads for j in range(self.num_workers): - # Build kwargs for individual workers + # Build kwargs for inidividual workers private_worker_kwargs = {key_i: self.individual_worker_kwargs[key_i][j] for key_i in self.individual_worker_kwargs} private_worker_kwargs = private_worker_kwargs | self.shared_worker_kwargs diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_70b.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_70b.sh index 20afd8b22d3..5bcec92be54 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_70b.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_70b.sh @@ -1,5 +1,5 @@ -CHECKPOINT_PATH=meta-llama/Llama-2-70b-chat-hf -CALIBRATION_DATA_PATH=./open_orca/open_orca_gpt4_tokenized_llama.calibration_1000.pkl +CHECKPOINT_PATH=build/models/Llama2/Llama-2-70b-chat-hf/ +CALIBRATION_DATA_PATH=build/preprocessed_data/llama2-70b/open_orca_gpt4_tokenized_llama.calibration_1000.pkl NUM_GROUPS=-1 NUM_SAMPLES=1000 ITERS=200 diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_autoround.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_autoround.py index 93a0483438f..cb5639bb163 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_autoround.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/quantize_autoround.py @@ -95,11 +95,10 @@ def main(): orig_path = args.model_name packing_format="llm_compressor" if orig_path.endswith("/"): - output_dir=orig_path[:-1]+f"-{packing_format}-w{args.bits}g{args.group_size}-iters{args.iters}" + output_dir=orig_path[:-1]+f"-quantized" else: - output_dir=orig_path+f"-{packing_format}-w{args.bits}g{args.group_size}-iters{args.iters}" - if args.fp8_kv: - output_dir += 
"-fp8kv" + output_dir=orig_path+f"-quantized" + autoround.quantize_and_save(output_dir, format=f'{packing_format}') diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/simple_autoround.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/simple_autoround.py index 0a4e895a946..3bc624b1cae 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/simple_autoround.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/simple_autoround.py @@ -2,9 +2,8 @@ from auto_round import AutoRound # Load a model (supports FP8/BF16/FP16/FP32) -model_name_or_path = "meta-llama/Llama-2-70b-chat-hf" -#output_dir = "Llama-2-70b-chat-hf-mxfp4-fp8kv" -output_dir = "Llama-2-70b-chat-hf-mxfp4" +model_name_or_path = "build/models/Llama2/Llama-2-70b-chat-hf/" +output_dir = "build/models/Llama2/Llama-2-70b-chat-hf-quantized" #ar = AutoRound(model_name_or_path, scheme="MXFP4", iters=200, static_kv_dtype="fp8",) ar = AutoRound(model_name_or_path, scheme="MXFP4", iters=200) From 70b8cfc763d1719fd48cc2f2514d553686abbeec Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Nov 2025 06:31:36 +0000 Subject: [PATCH 5/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../language-modeling/mlperf/llama2-70b/README.md | 2 +- .../language-modeling/mlperf/llama2-70b/SUT.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md index 4723276c1e4..09358bee2e1 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md @@ -2,7 +2,7 @@ ## Getting started -Please first download the data, model and preprocess the data folloing the steps below _within the mlperf container_. Note that if you have already downloaded the model and data prior to v4.1, you don't need to redo them. But you _need to re-run_ the preprocess_data step for the updated calibration data. +Please first download the data, model and preprocess the data following the steps below _within the mlperf container_. Note that if you have already downloaded the model and data prior to v4.1, you don't need to redo them. But you _need to re-run_ the preprocess_data step for the updated calibration data. 
### Download Model diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py index 8af65d15d99..428788afa6f 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/SUT.py @@ -708,7 +708,7 @@ def start(self): # Create worker threads for j in range(self.num_workers): - # Build kwargs for inidividual workers + # Build kwargs for individual workers private_worker_kwargs = {key_i: self.individual_worker_kwargs[key_i][j] for key_i in self.individual_worker_kwargs} private_worker_kwargs = private_worker_kwargs | self.shared_worker_kwargs From 4284c70310f5252efa01cc5d7f6a19f6a6736ce3 Mon Sep 17 00:00:00 2001 From: lkk <33276950+lkk12014402@users.noreply.github.com> Date: Tue, 18 Nov 2025 14:32:46 +0800 Subject: [PATCH 6/6] fix typo --- .../language-modeling/mlperf/llama2-70b/README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md index 09358bee2e1..15d5a8e65b2 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/mlperf/llama2-70b/README.md @@ -25,9 +25,6 @@ Please move the downloaded pickle into expected path and follow steps to run the gzip -dk open_orca_gpt4_tokenized_llama.sampled_24576.pkl.gz gzip -dk open_orca_gpt4_tokenized_llama.calibration_1000.pkl.gz -# make sure you are in mlperf's container -make prebuild - # move into right directory mv open_orca_gpt4_tokenized_llama.*.pkl build/data/llama2-70b/