diff --git a/devops/scripts/benchmarks/benches/llamacpp.py b/devops/scripts/benchmarks/benches/llamacpp.py
index d5b250b9eb943..a2e8f4e2a4081 100644
--- a/devops/scripts/benchmarks/benches/llamacpp.py
+++ b/devops/scripts/benchmarks/benches/llamacpp.py
@@ -29,7 +29,7 @@ def git_url(self) -> str:
         return "https://github.com/ggerganov/llama.cpp"
 
     def git_hash(self) -> str:
-        return "1ee9eea094fe5846c7d8d770aa7caa749d246b23"
+        return "916c83bfe7f8b08ada609c3b8e583cf5301e594b"
 
     def setup(self):
         if options.sycl is None:
@@ -47,9 +47,9 @@ def setup(self):
 
         self.model = download(
             self.models_dir,
-            "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf",
-            "Phi-3-mini-4k-instruct-q4.gguf",
-            checksum="fc4f45c9729874a33a527465b2ec78189a18e5726b7121182623feeae38632ace4f280617b01d4a04875acf49d263ee4",
+            "https://huggingface.co/ggml-org/DeepSeek-R1-Distill-Qwen-1.5B-Q4_0-GGUF/resolve/main/deepseek-r1-distill-qwen-1.5b-q4_0.gguf",
+            "deepseek-r1-distill-qwen-1.5b-q4_0.gguf",
+            checksum="791f6091059b653a24924b9f2b9c3141c8f892ae13fff15725f77a2bf7f9b1b6b71c85718f1e9c0f26c2549aba44d191",
         )
 
         self.oneapi = get_oneapi()
@@ -64,10 +64,11 @@ def setup(self):
             f"-DGGML_SYCL=ON",
             f"-DCMAKE_C_COMPILER=clang",
             f"-DCMAKE_CXX_COMPILER=clang++",
-            f"-DDNNL_DIR={self.oneapi.dnn_cmake()}",
+            f"-DDNNL_GPU_VENDOR=INTEL",
             f"-DTBB_DIR={self.oneapi.tbb_cmake()}",
-            f'-DCMAKE_CXX_FLAGS=-I"{self.oneapi.mkl_include()}"',
-            f"-DCMAKE_SHARED_LINKER_FLAGS=-L{self.oneapi.compiler_lib()} -L{self.oneapi.mkl_lib()}",
+            f"-DDNNL_DIR={self.oneapi.dnn_cmake()}",
+            f"-DSYCL_COMPILER=ON",
+            f"-DMKL_DIR={self.oneapi.mkl_cmake()}",
         ]
 
         run(configure_command, add_sycl=True)
@@ -96,14 +97,17 @@ def __init__(self, bench):
 
     def setup(self):
         self.benchmark_bin = os.path.join(self.bench.build_path, "bin", "llama-bench")
 
+    def model(self):
+        return "DeepSeek-R1-Distill-Qwen-1.5B-Q4_0.gguf"
+
     def name(self):
-        return f"llama.cpp"
+        return f"llama.cpp {self.model()}"
 
     def description(self) -> str:
         return (
             "Performance testing tool for llama.cpp that measures LLM inference speed in tokens per second. "
             "Runs both prompt processing (initial context processing) and text generation benchmarks with "
-            "different batch sizes. Higher values indicate better performance. Uses the Phi-3-mini-4k-instruct "
+            f"different batch sizes. Higher values indicate better performance. Uses the {self.model()} "
             "quantized model and leverages SYCL with oneDNN for acceleration."
         )
@@ -122,12 +126,18 @@ def run(self, env_vars) -> list[Result]:
             "128",
             "-p",
             "512",
-            "-b",
-            "128,256,512",
+            "-pg",
+            "0,0",
+            "-sm",
+            "none",
+            "-ngl",
+            "99",
             "--numa",
             "isolate",
             "-t",
-            "56",  # TODO: use only as many threads as numa node 0 has cpus
+            "8",
+            "--mmap",
+            "0",
             "--model",
             f"{self.bench.model}",
         ]
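
For reviewers who want to reproduce the new benchmark settings outside the harness, below is a minimal sketch of the llama-bench call that run() now assembles. The binary path and the leading "-n" "128" pair are assumptions (the hunk starts mid-list at "128"); the remaining flags mirror the ones visible in the diff above.

# Minimal sketch, not part of the PR; paths and the "-n 128" prefix are assumed.
import subprocess

command = [
    "./build/bin/llama-bench",  # assumed location of the built llama-bench binary
    "-n", "128",                # assumed: tokens generated per text-generation test
    "-p", "512",                # prompt length for the prompt-processing test
    "-pg", "0,0",               # skip the combined prompt+generation test
    "-sm", "none",              # no layer/row splitting across multiple GPUs
    "-ngl", "99",               # offload effectively all layers to the GPU
    "--numa", "isolate",
    "-t", "8",                  # fixed thread count, replacing the hardcoded 56
    "--mmap", "0",              # load the model eagerly instead of mmap-ing it
    "--model", "deepseek-r1-distill-qwen-1.5b-q4_0.gguf",
]
subprocess.run(command, check=True)
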
diff --git a/devops/scripts/benchmarks/utils/oneapi.py b/devops/scripts/benchmarks/utils/oneapi.py
index fc27b9a8b2d3e..f68aa84a003c3 100644
--- a/devops/scripts/benchmarks/utils/oneapi.py
+++ b/devops/scripts/benchmarks/utils/oneapi.py
@@ -16,16 +16,10 @@ def __init__(self):
         Path(self.oneapi_dir).mkdir(parents=True, exist_ok=True)
         self.oneapi_instance_id = self.generate_unique_oneapi_id(self.oneapi_dir)
 
-        # can we just hardcode these links?
         self.install_package(
-            "dnnl",
-            "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/87e117ab-039b-437d-9c80-dcd5c9e675d5/intel-onednn-2025.0.0.862_offline.sh",
-            "6866feb5b8dfefd6ff45d6bfabed44f01d7fba8fd452480ae1fd86b92e9481ae052c24842da14f112f672f5c4859945b",
-        )
-        self.install_package(
-            "mkl",
-            "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940_offline.sh",
-            "122bb84cf943ea27753cb399c81ab2ae218ebd51b789c74d273240157722925ab4d5a43cb0b5de41b854f2c5a59a4002",
+            "base",
+            "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/cca951e1-31e7-485e-b300-fe7627cb8c08/intel-oneapi-base-toolkit-2025.1.0.651_offline.sh",
+            "98cad2489f2c90a2b328568a59371cf35855a3338643f61a9fc2d16a265d29f22feb2d673916dd7be18fa12a5e6d2475",
         )
         return
 
diff --git a/devops/scripts/benchmarks/utils/utils.py b/devops/scripts/benchmarks/utils/utils.py
index 54f2ef7fb9c1f..524afe32f5edf 100644
--- a/devops/scripts/benchmarks/utils/utils.py
+++ b/devops/scripts/benchmarks/utils/utils.py
@@ -9,10 +9,11 @@
 import subprocess
 import tarfile
 
-import urllib  # nosec B404
 from options import options
 from pathlib import Path
 import hashlib
+from urllib.request import urlopen  # nosec B404
+from shutil import copyfileobj
 
 
 def run(
@@ -147,7 +148,9 @@ def download(dir, url, file, untar=False, unzip=False, checksum=""):
     data_file = os.path.join(dir, file)
     if not Path(data_file).exists():
         print(f"{data_file} does not exist, downloading")
-        urllib.request.urlretrieve(url, data_file)
+        with urlopen(url) as in_stream, open(data_file, "wb") as out_file:
+            copyfileobj(in_stream, out_file)
+
         calculated_checksum = calculate_checksum(data_file)
         if calculated_checksum != checksum:
             print(
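
The utils.py change replaces the legacy urllib.request.urlretrieve call with an explicit urlopen plus shutil.copyfileobj stream, which copies the response to disk in chunks rather than going through the deprecated-style convenience wrapper. Below is a self-contained sketch of the same pattern; the fetch() helper is illustrative and not part of the repo, and the SHA-384 choice is an assumption inferred from the 96-hex-digit checksums used throughout this PR (the repo's own calculate_checksum is not shown in the diff).

# Stand-alone sketch of the new download path; fetch() and the SHA-384
# assumption are illustrative, not taken from the repo's utils.py.
import hashlib
from pathlib import Path
from shutil import copyfileobj
from urllib.request import urlopen  # nosec B404


def fetch(url: str, dest: Path, checksum: str) -> Path:
    if not dest.exists():
        # Stream the HTTP response to disk in chunks instead of buffering it.
        with urlopen(url) as in_stream, open(dest, "wb") as out_file:  # nosec B310
            copyfileobj(in_stream, out_file)
        # 96 hex digits == 384 bits, matching the checksums in this diff.
        digest = hashlib.sha384(dest.read_bytes()).hexdigest()
        if digest != checksum:
            dest.unlink()
            raise ValueError(f"checksum mismatch for {dest}")
    return dest
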