@@ -29,7 +29,7 @@ def git_url(self) -> str:
         return "https://github.com/ggerganov/llama.cpp"
 
     def git_hash(self) -> str:
-        return "1ee9eea094fe5846c7d8d770aa7caa749d246b23"
+        return "916c83bfe7f8b08ada609c3b8e583cf5301e594b"
 
     def setup(self):
         if options.sycl is None:
@@ -47,9 +47,9 @@ def setup(self):
 
         self.model = download(
             self.models_dir,
-            "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf",
-            "Phi-3-mini-4k-instruct-q4.gguf",
-            checksum="fc4f45c9729874a33a527465b2ec78189a18e5726b7121182623feeae38632ace4f280617b01d4a04875acf49d263ee4",
+            "https://huggingface.co/ggml-org/DeepSeek-R1-Distill-Qwen-1.5B-Q4_0-GGUF/resolve/main/deepseek-r1-distill-qwen-1.5b-q4_0.gguf",
+            "deepseek-r1-distill-qwen-1.5b-q4_0.gguf",
+            checksum="791f6091059b653a24924b9f2b9c3141c8f892ae13fff15725f77a2bf7f9b1b6b71c85718f1e9c0f26c2549aba44d191",
         )
 
         self.oneapi = get_oneapi()
@@ -64,10 +64,11 @@ def setup(self):
             f"-DGGML_SYCL=ON",
             f"-DCMAKE_C_COMPILER=clang",
             f"-DCMAKE_CXX_COMPILER=clang++",
-            f"-DDNNL_DIR={self.oneapi.dnn_cmake()}",
+            f"-DDNNL_GPU_VENDOR=INTEL",
             f"-DTBB_DIR={self.oneapi.tbb_cmake()}",
-            f'-DCMAKE_CXX_FLAGS=-I"{self.oneapi.mkl_include()}"',
-            f"-DCMAKE_SHARED_LINKER_FLAGS=-L{self.oneapi.compiler_lib()} -L{self.oneapi.mkl_lib()}",
+            f"-DDNNL_DIR={self.oneapi.dnn_cmake()}",
+            f"-DSYCL_COMPILER=ON",
+            f"-DMKL_DIR={self.oneapi.mkl_cmake()}",
         ]
 
         run(configure_command, add_sycl=True)
@@ -96,14 +97,17 @@ def __init__(self, bench):
     def setup(self):
         self.benchmark_bin = os.path.join(self.bench.build_path, "bin", "llama-bench")
 
+    def model(self):
+        return "DeepSeek-R1-Distill-Qwen-1.5B-Q4_0.gguf"
+
     def name(self):
-        return f"llama.cpp"
+        return f"llama.cpp {self.model()}"
 
     def description(self) -> str:
         return (
             "Performance testing tool for llama.cpp that measures LLM inference speed in tokens per second. "
             "Runs both prompt processing (initial context processing) and text generation benchmarks with "
-            "different batch sizes. Higher values indicate better performance. Uses the Phi-3-mini-4k-instruct "
+            f"different batch sizes. Higher values indicate better performance. Uses the {self.model()} "
             "quantized model and leverages SYCL with oneDNN for acceleration."
         )
 
@@ -122,12 +126,18 @@ def run(self, env_vars) -> list[Result]:
             "128",
             "-p",
             "512",
-            "-b",
-            "128,256,512",
+            "-pg",
+            "0,0",
+            "-sm",
+            "none",
+            "-ngl",
+            "99",
             "--numa",
             "isolate",
             "-t",
-            "56",  # TODO: use only as many threads as numa node 0 has cpus
+            "8",
+            "--mmap",
+            "0",
             "--model",
             f"{self.bench.model}",
         ]
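
For context, the added arguments amount to a llama-bench invocation along the lines of the sketch below. This is an illustration rather than part of the patch: the binary and model paths stand in for self.benchmark_bin and self.bench.model, and the flag comments paraphrase llama-bench's documented options.

    import subprocess

    # Rough equivalent of the command the benchmark now builds (paths are placeholders).
    cmd = [
        "./llama.cpp/build/bin/llama-bench",
        "-n", "128",          # tokens generated per text-generation test
        "-p", "512",          # prompt tokens per prompt-processing test
        "-pg", "0,0",         # skip the combined prompt+generation test
        "-sm", "none",        # no multi-GPU layer/row splitting
        "-ngl", "99",         # offload effectively all layers to the GPU
        "--numa", "isolate",
        "-t", "8",            # fixed thread count, replacing the hard-coded 56
        "--mmap", "0",        # load the model without memory-mapping it
        "--model", "deepseek-r1-distill-qwen-1.5b-q4_0.gguf",
    ]
    subprocess.run(cmd, check=True)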