
Commit a684bef

Merge pull request #94 from menloresearch/update-dev-from-master-2025-05-18-00-09
Sync master with upstream release b5415
2 parents e6eb54f + 6a2bc8b commit a684bef

13 files changed: +299, -435 lines


common/arg.cpp

Lines changed: 12 additions & 2 deletions

```diff
@@ -2585,7 +2585,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_junk = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
+    ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"--pos"}, "N",
         string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
@@ -2648,7 +2648,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.is_pp_shared = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"-npp"}, "n0,n1,...",
         "number of prompt tokens",
@@ -2880,6 +2880,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.chat_template = read_file(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    add_opt(common_arg(
+        {"--no-prefill-assistant"},
+        string_format(
+            "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
+            "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
+        ),
+        [](common_params & params) {
+            params.prefill_assistant = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
     add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
```
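
The new `--no-prefill-assistant` flag is registered for the server example only and also registers the `LLAMA_ARG_NO_PREFILL_ASSISTANT` environment variable. A minimal usage sketch (the model path is a placeholder, and the env-var form is an assumption based on the `set_env` registration above):

```bash
# disable assistant prefill via the CLI flag
llama-server -m model.gguf --no-prefill-assistant

# presumably equivalent, via the registered environment variable
LLAMA_ARG_NO_PREFILL_ASSISTANT=1 llama-server -m model.gguf
```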

common/common.h

Lines changed: 1 addition & 0 deletions

```diff
@@ -368,6 +368,7 @@ struct common_params {
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response

     std::vector<std::string> api_keys;
```

examples/parallel/README.md

Lines changed: 11 additions & 0 deletions

```diff
@@ -1,3 +1,14 @@
 # llama.cpp/example/parallel

 Simplified simulation of serving incoming requests in parallel
+
+## Example
+
+Generate 128 client requests (`-ns 128`), simulating 8 concurrent clients (`-np 8`). The system prompt is shared (`-pps`), meaning that it is computed once at the start. The client requests consist of 10 junk questions (`-j 10`) followed by the actual question.
+
+```bash
+llama-parallel -m model.gguf -np 8 -ns 128 --top-k 1 -pps --junk 10 -c 16384
+```
+
+> [!NOTE]
+> It's recommended to use base models with this example. Instruction tuned models might not be able to properly follow the custom chat template specified here, so the results might not be as expected.
```

examples/parallel/parallel.cpp

Lines changed: 86 additions & 14 deletions

```diff
@@ -34,11 +34,61 @@ static std::string k_system =
 R"(Transcript of a never ending dialog, where the User interacts with an Assistant.
 The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.

-User: Recommend a nice restaurant in the area.
-Assistant: I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
-User: Who is Richard Feynman?
-Assistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
-User:)";
+User:
+Recommend a nice restaurant in the area.
+Assistant:
+I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
+User:
+Who is Richard Feynman?
+Assistant:
+Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
+)";
+
+static std::vector<std::string> k_questions = {
+    "What is the tallest mountain in the world?",
+    "Who was the first person to win two Nobel Prizes?",
+    "Which country invented paper?",
+    "What organ is primarily responsible for pumping blood throughout the body?",
+    "Which planet is known for its prominent ring system?",
+    "Who directed the movie 'Inception'?",
+    "What is the freezing point of water in Fahrenheit?",
+    "Which animal is known to have the longest lifespan?",
+    "What language has the most native speakers worldwide?",
+    "What is the capital city of Canada?",
+    "Who is credited with inventing the World Wide Web?",
+    "Which metal is liquid at room temperature?",
+    "What is the term for an animal that eats both plants and meat?",
+    "Who painted 'The Starry Night'?",
+    "What gas do humans exhale that plants use for photosynthesis?",
+    "What year did World War II end?",
+    "Which continent has the most countries?",
+    "Who wrote the novel 'Frankenstein'?",
+    "What does DNA stand for?",
+    "What is the main ingredient in traditional Japanese miso soup?"
+};
+
+static std::vector<std::string> k_answers = {
+    "The tallest mountain in the world is Mount Everest.",
+    "Marie Curie was the first person to win two Nobel Prizes.",
+    "Paper was invented in China.",
+    "The heart is the organ responsible for pumping blood.",
+    "Saturn is known for its prominent ring system.",
+    "Christopher Nolan directed the movie 'Inception'.",
+    "The freezing point of water in Fahrenheit is 32°F.",
+    "The bowhead whale is known to have the longest lifespan among mammals.",
+    "Mandarin Chinese has the most native speakers in the world.",
+    "The capital city of Canada is Ottawa.",
+    "Tim Berners-Lee is credited with inventing the World Wide Web.",
+    "Mercury is the metal that is liquid at room temperature.",
+    "An animal that eats both plants and meat is called an omnivore.",
+    "'The Starry Night' was painted by Vincent van Gogh.",
+    "Humans exhale carbon dioxide, which plants use in photosynthesis.",
+    "World War II ended in 1945.",
+    "Africa is the continent with the most countries.",
+    "The novel 'Frankenstein' was written by Mary Shelley.",
+    "DNA stands for Deoxyribonucleic Acid.",
+    "The main ingredient in traditional Japanese miso soup is fermented soybean paste."
+};

 static std::vector<std::string> k_prompts = {
     "What is the meaning of life?",
@@ -49,7 +99,7 @@ static std::vector<std::string> k_prompts = {
     "What is the best way to learn a new language?",
     "How to get a job at Google?",
     "If you could have any superpower, what would it be?",
-    "I want to learn how to play the piano.",
+    "I want to learn how to play the piano. What would be the best way to do it?",
 };

 struct client {
@@ -68,6 +118,7 @@ struct client {
     int64_t t_start_prompt;
     int64_t t_start_gen;

+    int32_t n_past    = 0;
     int32_t n_prompt  = 0;
     int32_t n_decoded = 0;
     int32_t i_batch   = -1;
@@ -107,6 +158,7 @@ int main(int argc, char ** argv) {
     common_params params;

     params.n_predict = 128;
+    params.n_junk    = 0;

     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
         return 1;
@@ -128,6 +180,12 @@ int main(int argc, char ** argv) {

     const bool dump_kv_cache = params.dump_kv_cache;

+    // is the system prompt shared in the cache
+    const bool is_sp_shared = params.is_pp_shared;
+
+    // extra text to insert in each client's prompt in order to make it larger
+    const int32_t n_junk = params.n_junk;
+
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -169,6 +227,7 @@ int main(int argc, char ** argv) {
     }

     std::vector<llama_token> tokens_system;
+
     tokens_system = common_tokenize(ctx, k_system, true);
     const int32_t n_tokens_system = tokens_system.size();

@@ -190,7 +249,7 @@ int main(int argc, char ** argv) {
     LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
     LOG_INF("\n");

-    {
+    if (is_sp_shared) {
         LOG_INF("%s: Evaluating the system prompt ...\n", __func__);

         for (int32_t i = 0; i < n_tokens_system; ++i) {
@@ -228,7 +287,7 @@ int main(int argc, char ** argv) {

             client.i_batch = batch.n_tokens;

-            common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
+            common_batch_add(batch, client.sampled, client.n_past++, { client.id + 1 }, true);

             client.n_decoded += 1;
         }
@@ -254,17 +313,31 @@ int main(int argc, char ** argv) {
                 client.t_start_gen    = 0;

                 client.input    = k_prompts[rand() % k_prompts.size()];
-                client.prompt   = client.input + "\nAssistant:";
                 client.response = "";

+                // construct the prompt:
+                // [system prompt] + [junk] + [user prompt]
+                client.n_past = 0;
+                client.prompt = "";
+                if (is_sp_shared) {
+                    client.n_past = n_tokens_system;
+                } else {
+                    client.prompt += k_system;
+                }
+                for (int i = 0; i < n_junk; ++i) {
+                    const int r = rand() % k_questions.size();
+                    client.prompt += "User:\n" + k_questions[r] + "\nAssistant:\n " + k_answers[r] + "\n";
+                }
+                client.prompt += "User:\n" + client.input + "\nAssistant:\n";
+
                 common_sampler_reset(client.smpl);

                 // do not prepend BOS because we have a system prompt!
                 std::vector<llama_token> tokens_prompt;
                 tokens_prompt = common_tokenize(ctx, client.prompt, false);

                 for (size_t i = 0; i < tokens_prompt.size(); ++i) {
-                    common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
+                    common_batch_add(batch, tokens_prompt[i], client.n_past++, { client.id + 1 }, false);
                 }

                 // extract the logits only for the last token
@@ -363,10 +436,9 @@ int main(int argc, char ** argv) {
                 //        client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());

                 if (client.n_decoded > 2 &&
-                        (llama_vocab_is_eog(vocab, id) ||
-                         (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
-                         client.response.find("User:") != std::string::npos ||
-                         client.response.find('\n') != std::string::npos)) {
+                        (llama_vocab_is_eog(vocab, id) ||
+                         (params.n_predict > 0 && client.n_decoded >= params.n_predict) ||
+                         client.response.find("User:") != std::string::npos)) {
                     // basic reverse prompt
                     const size_t pos = client.response.find("User:");
                     if (pos != std::string::npos) {
```
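
With these changes the shared-system-prompt path becomes opt-in: the pre-evaluation block only runs when `params.is_pp_shared` is set, otherwise each client embeds `k_system` (plus `n_junk` junk Q/A pairs) in its own prompt, and `client.n_past` tracks the KV position in both cases. A hedged invocation sketch contrasting the two modes, using the flags documented in the README above (model path and sizes are placeholders):

```bash
# shared system prompt: evaluated once, each client starts at n_tokens_system
llama-parallel -m model.gguf -np 8 -ns 128 -pps --junk 10 -c 16384

# per-client system prompt: k_system is prepended to every client's prompt
llama-parallel -m model.gguf -np 8 -ns 128 --junk 10 -c 16384
```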

ggml/src/ggml-vulkan/CMakeLists.txt

Lines changed: 6 additions & 1 deletion

```diff
@@ -54,6 +54,11 @@ if (Vulkan_FOUND)
            -DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${CMAKE_RUNTIME_OUTPUT_DIRECTORY}
     )

+    set(VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS "")
+    if (CMAKE_BUILD_TYPE AND CMAKE_BUILD_TYPE MATCHES "Debug|Release|MinSizeRel|RelWithDebInfo")
+        list(APPEND VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS --config=${CMAKE_BUILD_TYPE})
+    endif()
+
     # Test all shader extensions
     test_shader_extension_support(
         "GL_KHR_cooperative_matrix"
@@ -149,7 +154,7 @@ if (Vulkan_FOUND)
         vulkan-shaders-gen
         SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders
         CMAKE_ARGS ${VULKAN_SHADER_GEN_CMAKE_ARGS}
-        BUILD_COMMAND ${CMAKE_COMMAND} --build .
+        BUILD_COMMAND ${CMAKE_COMMAND} --build . ${VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS}
         INSTALL_COMMAND ${CMAKE_COMMAND} --install .
         INSTALL_DIR ${CMAKE_BINARY_DIR}
     )
```
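
My reading of this change: the extra `--config` argument matters mainly when the `vulkan-shaders-gen` external project ends up with a multi-config generator, in which case the host build's `CMAKE_BUILD_TYPE` is now forwarded so the shader generator is built in the same configuration. A hedged build sketch (only the Vulkan toggle and build type are taken from this change; other details are illustrative):

```bash
# configure with the Vulkan backend and an explicit build type, then build;
# the vulkan-shaders-gen external project now also receives --config=Release
cmake -B build -DGGML_VULKAN=ON -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release
```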

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 7 additions & 0 deletions

```diff
@@ -5872,10 +5872,17 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
     vk_pipeline *pipelines;
     bool small_rows = N <= get_fa_num_small_rows(path);

+    // coopmat1 does not actually support "small rows" (it needs 16 rows).
+    // So use scalar instead.
     if (small_rows && path == FA_COOPMAT1) {
         path = FA_SCALAR;
     }

+    // scalar is faster than coopmat2 when N==1
+    if (N == 1 && path == FA_COOPMAT2) {
+        path = FA_SCALAR;
+    }
+
     bool f32acc = path == FA_SCALAR || dst->op_params[3] == GGML_PREC_F32;

     switch (path) {
```
