@@ -2995,7 +2995,7 @@ void llama_opt_epoch(
         callback_eval);
 }
 
-llama_token llama_build_and_execute_mtp_graph(struct llama_context * ctx,
+void llama_build_and_execute_mtp_graph(struct llama_context * ctx,
         const llama_batch batch_inp, llama_token last_token_id, int32_t n_past, int32_t last_tok_idx) {
 
     const auto * model = llama_get_model(ctx);
@@ -3033,6 +3033,12 @@ llama_token llama_build_and_execute_mtp_graph(struct llama_context * ctx,
 
     auto * gf = model->build_mtp_graph(*params_mtp, last_token_id, n_past);
 
+    if (!gf) {
+        LLAMA_LOG_ERROR("%s: failed to build the MTP graph (returned null)\n", __func__);
+        if (sched) ggml_backend_sched_free(sched);
+        return;
+    }
+
     ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
     ggml_backend_sched_alloc_graph(sched, gf); // explicitly allocate the new graph but do not execute it
 
@@ -3044,29 +3050,24 @@ llama_token llama_build_and_execute_mtp_graph(struct llama_context * ctx,
 
     ggml_backend_sched_graph_compute(sched, gf); // execute the graph
 
-    // struct ggml_tensor * logits_mtp = res_mtp->get_logits();
-
-    // LLAMA_LOG_INFO("logits_mtp pointer address: %p\n", (void*)logits_mtp);
-
-    // if (logits_mtp) {
-    //     ctx->set_logits_ith(logits_mtp, sched, last_tok_idx);
-    // }
-    struct ggml_tensor * token_id_tensor = ggml_get_tensor(res_mtp->get_ctx(), "mtp_argmax_result");
-
-
-    llama_token token_id = 0; // The C++ variable to hold the result.
-
-    // ggml_backend_tensor_get is the function for GPU->CPU copies.
-    // We are copying a single 32-bit integer.
-    ggml_backend_tensor_get(
-        token_id_tensor,
-        &token_id,          // Pointer to our C++ variable
-        0,                  // Starting offset in bytes
-        sizeof(llama_token) // Number of bytes to copy
-    );
+    struct ggml_tensor * logits_mtp = res_mtp->get_logits();
+
+    if (logits_mtp) {
+        float * logits_dest = ctx->get_logits_ith(last_tok_idx);
+        ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched, logits_mtp);
+        if (backend_res) {
+            // ggml_backend_tensor_get performs the backend (e.g. GPU) -> CPU copy;
+            // here it copies the full logits row produced by the MTP head.
+            ggml_backend_tensor_get(logits_mtp,
+                logits_dest,              // destination in the context's logits buffer
+                0,                        // starting offset in bytes
+                ggml_nbytes(logits_mtp)); // number of bytes to copy
+        } else {
+            LLAMA_LOG_ERROR("%s: failed to obtain the backend for the logits tensor\n", __func__);
+        }
+    } else {
+        LLAMA_LOG_WARN("%s: the MTP graph did not produce a logits tensor\n", __func__);
+    }
 
     ggml_backend_sched_free(sched);
-
-    return token_id;
-}
-
+}
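
Since the function now returns void and publishes the MTP logits through the context at last_tok_idx instead of handing back a drafted token, callers have to sample from the context themselves. A minimal sketch of what that could look like, assuming the public llama.h sampler API and that llama_build_and_execute_mtp_graph is visible to the caller; the helper name draft_one_token and the greedy sampling choice are illustrative, not part of this patch:

```cpp
#include "llama.h"

// Illustrative helper (not part of this patch): runs the MTP graph and then
// greedily samples the drafted token from the logits it wrote into the context.
static llama_token draft_one_token(llama_context * ctx, const llama_batch & batch,
                                   llama_token last_token_id, int32_t n_past, int32_t last_tok_idx) {
    // Writes the MTP head's logits into the context's output buffer at last_tok_idx.
    llama_build_and_execute_mtp_graph(ctx, batch, last_token_id, n_past, last_tok_idx);

    // Sample from those logits through the regular sampler API (greedy here for simplicity).
    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
    const llama_token drafted = llama_sampler_sample(smpl, ctx, last_tok_idx);
    llama_sampler_free(smpl);
    return drafted;
}
```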