|
1 | 1 | #include "models.h" |
2 | 2 |
|
3 | | -llm_build_rwkv6::llm_build_rwkv6(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) { |
4 | | - GGML_ASSERT(hparams.token_shift_count == 2); |
5 | | - |
6 | | - ggml_tensor * cur; |
7 | | - ggml_tensor * inpL; |
8 | | - |
9 | | - inpL = build_inp_embd(model.tok_embd); |
10 | | - inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); |
11 | | - |
12 | | - auto * rs_inp = build_rs_inp(); |
13 | | - |
14 | | - const auto n_embd = hparams.n_embd; |
15 | | - const auto n_seq_tokens = ubatch.n_seq_tokens; |
16 | | - const auto n_seqs = ubatch.n_seqs; |
17 | | - |
18 | | - ggml_tensor * inp_out_ids = build_inp_out_ids(); |
19 | | - |
20 | | - for (int il = 0; il < n_layer; ++il) { |
21 | | - const llama_layer * layer = &model.layers[il]; |
22 | | - inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); |
23 | | - |
24 | | - ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il); |
25 | | - |
26 | | - ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); |
27 | | - ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); |
28 | | - |
29 | | - ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il); |
30 | | - cb(att_norm, "attn_norm", il); |
31 | | - |
32 | | - ggml_tensor * x_prev = ggml_concat( |
33 | | - ctx0, |
34 | | - att_shift, |
35 | | - ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), |
36 | | - 1 |
37 | | - ); |
38 | | - |
39 | | - cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il); |
40 | | - |
41 | | - ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); |
42 | | - cb(ffn_inp, "ffn_inp", il); |
43 | | - |
44 | | - ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il); |
45 | | - cb(ffn_norm, "ffn_norm", il); |
46 | | - |
47 | | - x_prev = ggml_concat( |
48 | | - ctx0, |
49 | | - ffn_shift, |
50 | | - ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0), |
51 | | - 1 |
52 | | - ); |
53 | | - |
54 | | - token_shift = ggml_concat(ctx0, |
55 | | - ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)), |
56 | | - ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), |
57 | | - 1 |
58 | | - ); |
59 | | - ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il)); |
60 | | - |
61 | | - ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens); |
62 | | - ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens); |
63 | | - x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens); |
64 | | - cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); |
65 | | - |
66 | | - if (il == n_layer - 1 && inp_out_ids) { |
67 | | - ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); |
68 | | - ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids); |
69 | | - x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids); |
70 | | - cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
71 | | - } |
72 | | - cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6); |
73 | | - cur = ggml_add(ctx0, cur, ffn_inp); |
74 | | - |
75 | | - if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { |
76 | | - cur = ggml_scale(ctx0, cur, 0.5F); |
77 | | - } |
78 | | - cur = build_cvec(cur, il); |
79 | | - cb(cur, "l_out", il); |
80 | | - |
81 | | - // input for next layer |
82 | | - inpL = cur; |
83 | | - } |
84 | | - cur = inpL; |
85 | | - cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); |
| 3 | +llm_build_rwkv6::llm_build_rwkv6(const llama_model & model, const llm_graph_params & params) : |
| 4 | + llm_build_rwkv6_base(model, params) { |
| 5 | + GGML_ASSERT(hparams.token_shift_count == 2); |
| 6 | + |
| 7 | + ggml_tensor * cur; |
| 8 | + ggml_tensor * inpL; |
| 9 | + |
| 10 | + inpL = build_inp_embd(model.tok_embd); |
| 11 | + inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); |
| 12 | + |
| 13 | + auto * rs_inp = build_rs_inp(); |
| 14 | + |
| 15 | + const auto n_embd = hparams.n_embd; |
| 16 | + const auto n_seq_tokens = ubatch.n_seq_tokens; |
| 17 | + const auto n_seqs = ubatch.n_seqs; |
| 18 | + |
| 19 | + ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| 20 | + |
| 21 | + for (int il = 0; il < n_layer; ++il) { |
| 22 | + const llama_layer * layer = &model.layers[il]; |
| 23 | + inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); |
| 24 | + |
| 25 | + ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il); |
| 26 | + |
| 27 | + ggml_tensor * att_shift = |
| 28 | + ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); |
| 29 | + ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], |
| 30 | + token_shift->nb[2], n_embd * ggml_element_size(token_shift)); |
| 31 | + |
| 32 | + ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il); |
| 33 | + cb(att_norm, "attn_norm", il); |
| 34 | + |
| 35 | + ggml_tensor * x_prev = ggml_concat( |
| 36 | + ctx0, att_shift, |
| 37 | + ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), 1); |
86 | 38 |
|
87 | | - cb(cur, "result_norm", -1); |
88 | | - res->t_embd = cur; |
| 39 | + cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il); |
89 | 40 |
|
90 | | - cur = build_lora_mm(model.output, cur); |
| 41 | + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); |
| 42 | + cb(ffn_inp, "ffn_inp", il); |
91 | 43 |
|
92 | | - cb(cur, "result_output", -1); |
93 | | - res->t_logits = cur; |
| 44 | + ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il); |
| 45 | + cb(ffn_norm, "ffn_norm", il); |
94 | 46 |
|
95 | | - ggml_build_forward_expand(gf, cur); |
| 47 | + x_prev = ggml_concat( |
| 48 | + ctx0, ffn_shift, |
| 49 | + ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0), 1); |
| 50 | + |
| 51 | + token_shift = ggml_concat(ctx0, |
| 52 | + ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], |
| 53 | + (n_seq_tokens - 1) * n_embd * ggml_element_size(att_norm)), |
| 54 | + ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], |
| 55 | + (n_seq_tokens - 1) * n_embd * ggml_element_size(ffn_norm)), |
| 56 | + 1); |
| 57 | + ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il)); |
| 58 | + |
| 59 | + ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens); |
| 60 | + ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens); |
| 61 | + x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens); |
| 62 | + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); |
| 63 | + |
| 64 | + if (il == n_layer - 1 && inp_out_ids) { |
| 65 | + ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); |
| 66 | + ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids); |
| 67 | + x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids); |
| 68 | + cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| 69 | + } |
| 70 | + cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6); |
| 71 | + cur = ggml_add(ctx0, cur, ffn_inp); |
| 72 | + |
| 73 | + if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { |
| 74 | + cur = ggml_scale(ctx0, cur, 0.5F); |
| 75 | + } |
| 76 | + cur = build_cvec(cur, il); |
| 77 | + cb(cur, "l_out", il); |
| 78 | + |
| 79 | + // input for next layer |
| 80 | + inpL = cur; |
96 | 81 | } |
| 82 | + cur = inpL; |
| 83 | + cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); |
| 84 | + |
| 85 | + cb(cur, "result_norm", -1); |
| 86 | + res->t_embd = cur; |
| 87 | + |
| 88 | + cur = build_lora_mm(model.output, cur); |
| 89 | + |
| 90 | + cb(cur, "result_output", -1); |
| 91 | + res->t_logits = cur; |
| 92 | + |
| 93 | + ggml_build_forward_expand(gf, cur); |
| 94 | +} |
0 commit comments