
Commit 911da87

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#   README.md
#   examples/llama.android/llama/src/main/cpp/llama-android.cpp
#   examples/run/run.cpp
#   examples/server/README.md
#   examples/server/bench/README.md
#   examples/server/tests/README.md
#   ggml/src/CMakeLists.txt
#   ggml/src/ggml-cpu/CMakeLists.txt
#   tests/test-backend-ops.cpp
2 parents 22fd7a0 + 2f0ee84 commit 911da87


46 files changed: +82450 / -73482 lines

common/common.cpp

Lines changed: 17 additions & 3 deletions
@@ -20,6 +20,7 @@
 #include <cstdarg>
 #include <cstring>
 #include <ctime>
+#include <filesystem>
 #include <fstream>
 #include <iostream>
 #include <iterator>
@@ -64,7 +65,9 @@
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
-#define PATH_MAX MAX_PATH
+#  if !defined(PATH_MAX)
+#    define PATH_MAX MAX_PATH
+#  endif
 #else
 #include <sys/syslimits.h>
 #endif
@@ -1150,8 +1153,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
 #endif

     // Check if the file already exists locally
-    struct stat model_file_info;
-    auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
+    auto file_exists = std::filesystem::exists(path);

     // If the file exists, check its JSON metadata companion file.
     std::string metadata_path = path + ".json";
@@ -1614,6 +1616,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 // Chat template utils
 //

+std::string common_get_builtin_chat_template(const struct llama_model * model) {
+    static const char * template_key = "tokenizer.chat_template";
+    // call with NULL buffer to get the total size of the string
+    int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
+    if (res > 0) {
+        std::vector<char> model_template(res + 1, 0);
+        llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
+        return std::string(model_template.data(), model_template.size() - 1);
+    }
+    return "";
+}
+
 bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
     int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);

common/common.h

Lines changed: 3 additions & 0 deletions
@@ -567,6 +567,9 @@ struct common_chat_msg {
     std::string content;
 };

+// Get the built-in chat template for the model. Return empty string if not present.
+std::string common_get_builtin_chat_template(const struct llama_model * model);
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl);


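For reference, a minimal usage sketch (not part of this commit) of how a caller might combine the two helpers declared above; the `model` handle and the "chatml" fallback name are assumptions for illustration only:

// Hypothetical caller sketch: prefer the model's built-in chat template,
// fall back to a named default when the GGUF metadata carries none.
// `model` is assumed to be an already-loaded llama_model handle.
std::string tmpl = common_get_builtin_chat_template(model);
if (tmpl.empty()) {
    tmpl = "chatml";  // assumed fallback, purely illustrative
}
if (!common_chat_verify_template(tmpl)) {
    // template is not supported by llama_chat_apply_template; handle the error here
}
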
convert_hf_to_gguf.py

Lines changed: 3 additions & 9 deletions
@@ -1764,25 +1764,19 @@ def set_vocab(self):
             self.gguf_writer.add_token_list(tokens)
             self.gguf_writer.add_token_types(toktypes)

-            special_vocab = gguf.SpecialVocab(
-                self.dir_model, load_merges=True,
-                special_token_types = ['bos', 'eos', 'eom', 'eot']
-            )
-            special_vocab._set_special_token("bos", 128000)
-            special_vocab._set_special_token("eos", 128001)
-            special_vocab._set_special_token("eom", 128008)
-            special_vocab._set_special_token("eot", 128009)
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
             special_vocab.add_to_gguf(self.gguf_writer)
         else:
             # DeciLM-7B
             self._set_vocab_llama_hf()
-            # self._set_vocab_gpt2()

     def set_gguf_parameters(self):
         if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
             assert self.block_count == len(self._num_kv_heads)
             assert self.block_count == len(self._num_heads)
             assert self.block_count == len(self._ffn_dims)
+            if (rope_theta := self.hparams.get("rope_theta")) is not None:
+                self.gguf_writer.add_rope_freq_base(rope_theta)
             self.gguf_writer.add_head_count_kv(self._num_kv_heads)
             self.gguf_writer.add_head_count(self._num_heads)
             self.gguf_writer.add_feed_forward_length(self._ffn_dims)

examples/server/bench/bench.py

Lines changed: 21 additions & 9 deletions
@@ -189,12 +189,12 @@ def main(args_in: list[str] | None = None) -> None:
         "pp": {
             "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
             "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
-            "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
+            "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2) if 'prompt_tokens_seconds' in prometheus_metrics else 0,
         },
         "tg": {
             "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
             "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
-            "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
+            "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2) if 'predicted_tokens_seconds' in prometheus_metrics else 0,
         },
     }
     with open("results.github.env", 'a') as github_env:
@@ -214,11 +214,14 @@ def start_benchmark(args):
     k6_args = [
         'run', args.scenario,
         '--no-color',
+        '--no-connection-reuse',
+        '--no-vu-connection-reuse',
     ]
     k6_args.extend(['--duration', args.duration])
     k6_args.extend(['--iterations', args.n_prompts])
     k6_args.extend(['--vus', args.parallel])
     k6_args.extend(['--summary-export', 'k6-results.json'])
+    k6_args.extend(['--out', 'csv=k6-results.csv'])
     args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
     args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
     print(f"bench: starting k6 with: {args}")
@@ -231,7 +234,7 @@ def start_server(args):
     server_process = start_server_background(args)

     attempts = 0
-    max_attempts = 20
+    max_attempts = 600
     if 'GITHUB_ACTIONS' in os.environ:
         max_attempts *= 2

@@ -242,7 +245,15 @@ def start_server(args):
         print(f"bench: waiting for server to start ...")
         time.sleep(0.5)

-    print("bench: server started.")
+    attempts = 0
+    while not is_server_ready(args.host, args.port):
+        attempts += 1
+        if attempts > max_attempts:
+            assert False, "server not ready"
+        print(f"bench: waiting for server to be ready ...")
+        time.sleep(0.5)
+
+    print("bench: server started and ready.")
     return server_process


@@ -255,11 +266,6 @@ def start_server_background(args):
         '--host', args.host,
         '--port', args.port,
     ]
-    model_file = args.model_path_prefix + os.path.sep + args.hf_file
-    model_dir = os.path.dirname(model_file)
-    if not os.path.exists(model_dir):
-        os.makedirs(model_dir)
-    server_args.extend(['--model', model_file])
     server_args.extend(['--hf-repo', args.hf_repo])
     server_args.extend(['--hf-file', args.hf_file])
     server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
@@ -303,6 +309,12 @@ def is_server_listening(server_fqdn, server_port):
     return _is_server_listening


+def is_server_ready(server_fqdn, server_port):
+    url = f"http://{server_fqdn}:{server_port}/health"
+    response = requests.get(url)
+    return response.status_code == 200
+
+
 def escape_metric_name(metric_name):
     return re.sub('[^A-Z0-9]', '_', metric_name.upper())


examples/server/bench/script.js

Lines changed: 15 additions & 3 deletions
@@ -56,6 +56,7 @@ const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')

 const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
 const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
+const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second')

 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -89,6 +90,9 @@ export default function () {
         ],
         "model": model,
         "stream": true,
+        "stream_options": {
+            "include_usage": true, // False to be supported in llama.cpp server
+        },
         "seed": 42,
         "max_tokens": max_tokens,
         "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
@@ -105,12 +109,20 @@ export default function () {
     client.on('event', function (event) {
         if (promptEvalEndTime == null) {
             promptEvalEndTime = new Date()
+            llamacpp_emit_first_token_second.add((promptEvalEndTime - startTime) / 1.e3)
+        }
+
+        if (event.data === '[DONE]' || event.data === '') {
+            return
         }

         let chunk = JSON.parse(event.data)
-        let choice = chunk.choices[0]
-        if (choice.finish_reason) {
-            finish_reason = choice.finish_reason
+
+        if (chunk.choices && chunk.choices.length > 0) {
+            let choice = chunk.choices[0]
+            if (choice.finish_reason) {
+                finish_reason = choice.finish_reason
+            }
         }

         if (chunk.usage) {
