@@ -1056,13 +1056,14 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
10561056 const auto & model = ctx->vision_model ;
10571057 const auto & hparams = model.hparams ;
10581058
1059- int n_step = img.nx ;
1060- int n_mel = img.ny ;
1059+ const int n_step = img.nx ;
1060+ const int n_mel = img.ny ;
10611061
10621062 const int n_embd = hparams.hidden_size ;
10631063 const int n_head = hparams.n_head ;
10641064 const int d_head = n_embd / n_head;
10651065 const int n_layer = hparams.n_layer ;
1066+ const int n_pos = n_step / 2 ;
10661067 const float eps = hparams.eps ;
10671068
10681069 ggml_init_params params = {
@@ -1080,7 +1081,7 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
10801081 ggml_set_name (inp_raw, " inp_raw" );
10811082 ggml_set_input (inp_raw);
10821083
1083- struct ggml_tensor * positions = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, n_step );
1084+ struct ggml_tensor * positions = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, n_pos );
10841085 ggml_set_name (positions, " positions" );
10851086 ggml_set_input (positions);
10861087
@@ -1125,20 +1126,17 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
11251126 ggml_tensor * k = ggml_mul_mat (ctx0, layer.k_w , cur); // no bias for key
11261127 ggml_tensor * v = ggml_add (ctx0, ggml_mul_mat (ctx0, layer.v_w , cur), layer.v_b );
11271128
1128- q = ggml_reshape_3d (ctx0, q, d_head, n_head, n_step );
1129- k = ggml_reshape_3d (ctx0, k, d_head, n_head, n_step );
1130- v = ggml_reshape_3d (ctx0, v, d_head, n_head, n_step );
1129+ q = ggml_reshape_3d (ctx0, q, d_head, n_head, n_pos );
1130+ k = ggml_reshape_3d (ctx0, k, d_head, n_head, n_pos );
1131+ v = ggml_reshape_3d (ctx0, v, d_head, n_head, n_pos );
11311132
11321133 q = ggml_cont (ctx0, ggml_permute (ctx0, q, 0 , 2 , 1 , 3 ));
11331134 q = ggml_scale (ctx0, q, 1 .0f / std::sqrt (d_head));
1134- // utils.debug_print(q, "q rope");
11351135
11361136 k = ggml_cont (ctx0, ggml_permute (ctx0, k, 0 , 2 , 1 , 3 ));
1137- // utils.debug_print(k, "k rope");
11381137
11391138 ggml_tensor * kq = ggml_mul_mat (ctx0, k, q);
11401139 kq = ggml_soft_max_ext (ctx0, kq, nullptr , 1 .0f , 0 .0f );
1141- // utils.debug_print(kq, "kq softmax");
11421140
11431141 v = ggml_cont (ctx0, ggml_permute (ctx0, v, 1 , 2 , 0 , 3 ));
11441142
@@ -2217,6 +2215,10 @@ struct clip_model_loader {
22172215 } break ;
22182216 case PROJECTOR_TYPE_ULTRAVOX:
22192217 {
2218+ vision_model.conv1d_1_w = get_tensor (string_format (TN_CONV1D, 1 , " weight" ));
2219+ vision_model.conv1d_1_b = get_tensor (string_format (TN_CONV1D, 1 , " bias" ));
2220+ vision_model.conv1d_2_w = get_tensor (string_format (TN_CONV1D, 2 , " weight" ));
2221+ vision_model.conv1d_2_b = get_tensor (string_format (TN_CONV1D, 2 , " bias" ));
22202222 vision_model.mm_1_w = get_tensor (string_format (TN_MM_AUDIO_MLP, 1 , " weight" ));
22212223 vision_model.mm_2_w = get_tensor (string_format (TN_MM_AUDIO_MLP, 2 , " weight" ));
22222224 vision_model.mm_norm_pre_w = get_tensor (string_format (TN_MM_NORM_PRE, " weight" ));
0 commit comments