Skip to content

Commit 2819f78

Browse files
committed
use a threadpool; this seems to improve text-generation (tg) performance
1 parent 40eb3a5 commit 2819f78

File tree

1 file changed

+15
-0
lines changed

1 file changed

+15
-0
lines changed

gpttype_adapter.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2343,6 +2343,21 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
23432343
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, kcpp_data->model_filename.c_str());
23442344
return ModelLoadResult::FAIL;
23452345
}
2346+
2347+
//use a dedicated ggml threadpool; this greatly speeds up Qwen3-MoE text generation (tg)
2348+
ggml_threadpool_params threadpool1_params, threadpool2_params;
2349+
ggml_threadpool_params_init(&threadpool1_params,kcpp_data->n_threads);
2350+
ggml_threadpool_params_init(&threadpool2_params,kcpp_data->n_blasthreads);
2351+
2352+
printf("Threadpool set to %d threads and %d blasthreads...\n", kcpp_data->n_threads,kcpp_data->n_blasthreads);
2353+
struct ggml_threadpool * threadpool1 = ggml_threadpool_new(&threadpool1_params);
2354+
struct ggml_threadpool * threadpool2 = ggml_threadpool_new(&threadpool2_params);
2355+
if (!threadpool1 || !threadpool2) {
2356+
fprintf(stderr, "%s: error: failed to create threadpool.\n", __func__);
2357+
return ModelLoadResult::FAIL;
2358+
}
2359+
llama_attach_threadpool(llama_ctx_v4, threadpool1, threadpool2);
2360+
23462361
if (lora_filename != "")
23472362
{
23482363
printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());

0 commit comments

Comments
 (0)