@@ -7127,7 +7127,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
             } break;
         case GGML_OP_MUL_MAT:
             {
-                ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512);
+                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
                 op_tensor = ggml_mul_mat(ctx, w, b);
             } break;
         case GGML_OP_MUL_MAT_ID:
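Why the 4D dummy: ggml_mul_mat asserts that the activations' ne[2] and ne[3] are multiples of the weight's, so the old 2D dummy (ne[2] == ne[3] == 1) tripped the assert whenever w was a 3D or 4D weight. A minimal sketch of the idea (make_mul_mat_probe is an illustrative name, not from the patch; 512 is an arbitrary column count, and nothing is allocated in a no_alloc context, only shapes and types matter):

#include "ggml.h"

// illustrative helper: build a MUL_MAT node against a dummy activation
// whose broadcast dims copy the weight's, so the node is valid for
// 2D, 3D and 4D weights alike
static ggml_tensor * make_mul_mat_probe(ggml_context * ctx, ggml_tensor * w) {
    // ggml requires b->ne[0] == w->ne[0], b->ne[2] % w->ne[2] == 0 and
    // b->ne[3] % w->ne[3] == 0; copying ne[2]/ne[3] satisfies this
    ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
    return ggml_mul_mat(ctx, w, b);
}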
@@ -7167,18 +7167,38 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
             } break;
         case GGML_OP_SSM_CONV:
             {
-                // TODO: ggml_ssm_conv(ctx, conv_x, model.layers[il].ssm_conv1d);
-                op_tensor = ggml_ssm_conv(ctx, nullptr, w);
+                // FIXME
+                ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789);
+                op_tensor = ggml_ssm_conv(ctx, conv_x, w);
             } break;
         case GGML_OP_SSM_SCAN:
             {
-                // TODO: ggml_ssm_scan(ctx, ssm, x, dt, model.layers[il].ssm_a, B, C);
-                op_tensor = ggml_ssm_scan(ctx, nullptr, nullptr, nullptr, w, nullptr, nullptr);
+                // FIXME
+                const int64_t d_state      = w->ne[0];
+                const int64_t d_inner      = w->ne[1];
+                const int64_t n_seq_tokens = 512;
+                const int64_t n_seqs       = 1;
+                ggml_tensor * s  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs);
+                ggml_tensor * x  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
+                ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
+                ggml_tensor * B  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
+                ggml_tensor * C  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
+                op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
             } break;
         case GGML_OP_RWKV_WKV:
             {
-                // TODO: ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
-                op_tensor = ggml_rwkv_wkv(ctx, nullptr, nullptr, nullptr, w, nullptr, nullptr);
+                // FIXME
+                const int64_t S        = 123;
+                const int64_t H        = 123;
+                const int64_t n_tokens = 123;
+                const int64_t n_seqs   = 123;
+                ggml_tensor * k     = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, 1, H, n_tokens);
+                ggml_tensor * v     = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
+                ggml_tensor * r     = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
+                ggml_tensor * tf    = w;
+                ggml_tensor * td    = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
+                ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
+                op_tensor = ggml_rwkv_wkv(ctx, k, v, r, tf, td, state);
             } break;
         default:
             GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
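All three cases follow one pattern: the old nullptr placeholders crashed inside the op constructors, which dereference their inputs in shape asserts, so each op now gets dummy inputs shaped to its contract. The magic sizes (512, 1, 123, 12345, 6789) are arbitrary, because the probe context is created with no_alloc and only shapes and types are inspected. A simplified sketch of how such a probe fits together, using GGML_OP_SSM_SCAN as the example (the function name and the ggml_tensor_overhead()*16 budget are my choices, not from the patch; ggml_backend_dev_supports_op is the device-level support query from ggml-backend.h):

#include "ggml.h"
#include "ggml-backend.h"

// sketch: probe whether `dev` supports SSM_SCAN on the A matrix `w`
// ({d_state, d_inner}) by building the op on dummy tensors in a
// no_alloc context; no tensor data is ever allocated
static bool probe_ssm_scan(ggml_backend_dev_t dev, ggml_tensor * w) {
    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*16,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true,   // shapes/types only, no data buffers
    };
    ggml_context * ctx = ggml_init(params);
    const int64_t d_state      = w->ne[0];
    const int64_t d_inner      = w->ne[1];
    const int64_t n_seq_tokens = 512;     // arbitrary: any valid size works
    const int64_t n_seqs       = 1;
    // shapes follow ggml_ssm_scan's asserts:
    //   s  : {d_state, d_inner, n_seqs}       recurrent state
    //   x  : {d_inner, n_seq_tokens, n_seqs}  input
    //   dt : {d_inner, n_seq_tokens, n_seqs}  timestep
    //   B/C: {d_state, n_seq_tokens, n_seqs}
    ggml_tensor * s  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs);
    ggml_tensor * x  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
    ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
    ggml_tensor * B  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
    ggml_tensor * C  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
    ggml_tensor * op = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
    const bool ok = ggml_backend_dev_supports_op(dev, op);
    ggml_free(ctx);
    return ok;
}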
@@ -7453,7 +7473,7 @@ static bool llm_load_tensors(
 
             // tensors with "bias" suffix are always used with GGML_OP_ADD
             ggml_op op;
-            bool bias = strcmp(tn.suffix, "bias") == 0;
+            bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
             if (bias) {
                 op = GGML_OP_ADD;
             } else {
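The null guard matters because tn.suffix is nullptr for tensor names built without a suffix, and passing nullptr to strcmp is undefined behavior. The short-circuiting && guarantees strcmp only ever sees a valid pointer; the same pattern as a standalone helper (name is illustrative):

#include <cstring>

// null-safe suffix comparison: && short-circuits before strcmp
// can dereference a null pointer
static bool suffix_is(const char * suffix, const char * expected) {
    return suffix != nullptr && std::strcmp(suffix, expected) == 0;
}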
@@ -19681,7 +19701,7 @@ struct llama_context * llama_new_context_with_model(
             int n_nodes_tg = ggml_graph_n_nodes(gf_tg);
 
             // reserve again with pp graph to avoid ggml-alloc reallocations during inference
-            gf_pp = llama_build_graph(*ctx, ubatch_pp, false);
+            gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
             if (!ggml_backend_sched_reserve(ctx->sched, gf_pp)) {
                 LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
                 llama_free(ctx);
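The last hunk flips llama_build_graph's third argument (worst-case mode) to true: the graph handed to the final ggml_backend_sched_reserve must be the worst-case prompt-processing graph, otherwise ggml-alloc sizes its compute buffers for a smaller graph and is forced to reallocate during inference, which is exactly what the comment above the call is meant to prevent. Roughly, the measure-then-reserve pattern looks like this (sketch using names from this file; ggml_backend_sched_alloc_graph is the per-step allocation call from ggml-backend.h):

// measure once with the worst-case graph ...
ggml_cgraph * gf_worst = llama_build_graph(*ctx, ubatch_pp, /*worst_case=*/ true);
if (!ggml_backend_sched_reserve(ctx->sched, gf_worst)) {
    // compute buffers could not be allocated for the worst case
}
// ... then per decode step, graphs no larger than the measured one fit
// into the reserved buffers without reallocation:
// ggml_backend_sched_alloc_graph(ctx->sched, gf);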