|
@@ -1,5 +1,6 @@
 #include "llama-model.h"
 
+#include "gguf.h"
 #include "llama-impl.h"
 #include "llama-mmap.h"
 #include "llama-batch.h"
@@ -2428,6 +2429,115 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         return ml.create_tensor(ctx, tn, ne, flags);
     };
 
+    struct tensor_def {
+        LLM_TN_IMPL tn;
+        std::vector<int64_t> ne;
+        int flags;
+        ggml_tensor ** out;
+    };
+
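+    // try to create one contiguous tensor covering all of the given source tensors (e.g. Q/K/V);
+    // returns nullptr when fusion is not possible (missing tensor, mismatched buffer type or data type),
+    // in which case the caller is expected to fall back to loading the tensors separately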
+    auto create_contiguous = [&](const LLM_TN_IMPL & fused_tn,
+                                 std::initializer_list<int64_t> ne,
+                                 std::initializer_list<tensor_def> reqs) -> ggml_tensor * {
+        ggml_backend_buffer_type_t fused_buft = nullptr;
+
+        std::vector<const ggml_tensor*> tensor_metas;
+
+        for (size_t i = 0; i < reqs.size(); ++i) {
+            const tensor_def & req = reqs.begin()[i];
+            const bool required = (req.flags & llama_model_loader::TENSOR_NOT_REQUIRED) == 0;
+            const ggml_tensor * tensor_meta = ml.check_tensor_dims(req.tn.str(), req.ne, required);
+
+            if (!tensor_meta) {
+                return nullptr;
+            }
+
+            tensor_metas.push_back(tensor_meta);
+
+            *req.out = const_cast<ggml_tensor*>(tensor_meta);
+
+            if (!*req.out) {
+                return nullptr;
+            }
+
+            llm_tensor tn_tensor = req.tn.tensor;
+            if (tn_tensor == LLM_TENSOR_TOKEN_EMBD && (req.flags & llama_model_loader::TENSOR_DUPLICATED)) {
+                tn_tensor = LLM_TENSOR_OUTPUT;
+            }
+
+            llm_tensor_info info;
+            try {
+                info = llm_tensor_info_for(tn_tensor);
+            } catch (const std::out_of_range &) {
+                throw std::runtime_error(format("missing tensor info mapping for %s", req.tn.str().c_str()));
+            }
+
+            bool bias = req.tn.suffix != nullptr && strcmp(req.tn.suffix, "bias") == 0;
+            ggml_op op = bias ? (info.op == GGML_OP_MUL_MAT_ID ? GGML_OP_ADD_ID : GGML_OP_ADD) : info.op;
+
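+            // select the buffer type for this source tensor the same way the regular create_tensor path does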
+            buft_list_t * buft_list = nullptr;
+            switch (info.layer) {
+                case LLM_TENSOR_LAYER_INPUT:
+                    buft_list = pimpl->dev_input.buft_list;
+                    break;
+                case LLM_TENSOR_LAYER_OUTPUT:
+                    buft_list = pimpl->dev_output.buft_list;
+                    break;
+                case LLM_TENSOR_LAYER_REPEATING:
+                    buft_list = pimpl->dev_layer.at(req.tn.bid).buft_list;
+                    break;
+                default:
+                    GGML_ABORT("invalid layer %d for tensor %s", info.layer, req.tn.str().c_str());
+            }
+
+            ggml_backend_buffer_type_t buft = select_weight_buft(hparams, *req.out, op, *buft_list);
+            if (!buft) {
+                return nullptr;
+            }
+
+            auto * buft_dev = ggml_backend_buft_get_device(buft);
+            if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
+                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+                if (!cpu_dev) {
+                    throw std::runtime_error("no CPU backend found");
+                }
+                buft = ggml_backend_dev_buffer_type(cpu_dev);
+            }
+
+            // TODO: check buft overrides
+
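+            // all source tensors must resolve to the same buffer type, otherwise they cannot share one allocation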
+            if (!fused_buft) {
+                fused_buft = buft;
+            } else if (fused_buft != buft) {
+                return nullptr;
+            }
+        }
+
+        if (!fused_buft) {
+            return nullptr;
+        }
+
+        ggml_context * ctx = ctx_for_buft(fused_buft);
+
+        std::vector<ggml_tensor**> tensor_req(reqs.size());
+
+        ggml_type type = tensor_metas[0]->type;
+        for (size_t i = 0; i < reqs.size(); ++i) {
+
+            // all source tensors must share the same data type
+            if (tensor_metas[i]->type != type) {
+                return nullptr;
+            }
+
+            const auto & req = reqs.begin()[i];
+            tensor_req[i] = req.out;
+        }
+
+        ggml_tensor * fused = ml.create_contiguous_tensor(ctx, fused_tn.str(), ne, tensor_req, 0);
+
+        return fused;
+    };
+
     layers.resize(n_layer);
 
     // TODO: move to a separate function
@@ -3297,9 +3407,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                     layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
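+                    // prefer loading a fused, contiguous QKV tensor; fall back to separate Q/K/V tensors below if fusion is not possible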
+                    layer.wqkv = create_contiguous(
+                        tn(LLM_TENSOR_ATTN_QKV, "weight", i),
+                        {n_embd, n_embd_head_k * n_head + n_embd_gqa * 2},
+                        {
+                            { tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0, &layer.wq },
+                            { tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0, &layer.wk },
+                            { tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0, &layer.wv },
+                        });
+                    if (!layer.wqkv) {
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                    }
                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
 
                     layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
@@ -3328,9 +3448,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                     layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
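+                    // prefer loading a fused, contiguous QKV tensor; fall back to separate Q/K/V tensors below if fusion is not possible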
+                    layer.wqkv = create_contiguous(
+                        tn(LLM_TENSOR_ATTN_QKV, "weight", i),
+                        {n_embd, n_embd_head_k * n_head + n_embd_gqa * 2},
+                        {
+                            { tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0, &layer.wq },
+                            { tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0, &layer.wk },
+                            { tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0, &layer.wv },
+                        });
+                    if (!layer.wqkv) {
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                    }
                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
 
                     layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
@@ -9388,18 +9518,15 @@ struct llm_build_qwen3 : public llm_graph_context {
         // self-attention
         {
             // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            cb(Qcur, "Qcur", il);
-
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            cb(Kcur, "Kcur", il);
 
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            cb(Vcur, "Vcur", il);
+            ggml_tensor * Qcur = nullptr;
+            ggml_tensor * Kcur = nullptr;
+            ggml_tensor * Vcur = nullptr;
 
-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
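+            // Q/K/V are computed by the shared build_qkv helper, which uses the fused wqkv tensor when it was loaded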
+            build_qkv(model.layers[il], cur, n_embd_head,
+                    n_embd_head_k, n_embd_head_v, n_head, n_head_kv,
+                    &Qcur, &Kcur, &Vcur, il
+            );
 
             Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
             cb(Qcur, "Qcur_normed", il);
@@ -9509,18 +9636,15 @@ struct llm_build_qwen3moe : public llm_graph_context {
         // self_attention
         {
            // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            cb(Qcur, "Qcur", il);
 
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            cb(Kcur, "Kcur", il);
-
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            cb(Vcur, "Vcur", il);
+            ggml_tensor * Qcur = nullptr;
+            ggml_tensor * Kcur = nullptr;
+            ggml_tensor * Vcur = nullptr;
 
-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
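+            // Q/K/V are computed by the shared build_qkv helper, which uses the fused wqkv tensor when it was loaded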
+            build_qkv(model.layers[il], cur, n_embd_head,
+                    n_embd_head_k, n_embd_head_v, n_head, n_head_kv,
+                    &Qcur, &Kcur, &Vcur, il
+            );
 
             Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
             cb(Qcur, "Qcur_normed", il);
|