
Commit 8b73116

warmup ok
1 parent 4fa0c27 commit 8b73116

3 files changed: +25 -14 lines changed

convert_hf_to_gguf.py

Lines changed: 11 additions & 2 deletions

@@ -5827,8 +5827,8 @@ def __init__(self, *args, **kwargs):
 class UltravoxAudioModel(VisionModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.hparams["image_size"] = 0
-        self.hparams["patch_size"] = 0
+        self.hparams["image_size"] = self.hparams["num_mel_bins"]
+        self.hparams["patch_size"] = self.hparams["num_mel_bins"]
         self.hparams["hidden_size"] = self.hparams["d_model"]
         self.hparams["intermediate_size"] = self.hparams["d_model"]
         self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
@@ -5847,6 +5847,15 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
         if ".conv" in name:
             return gguf.GGMLQuantizationType.F16
         return False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if "conv1.bias" in name or "conv2.bias" in name:
+            # transpose conv1 and conv2 bias
+            data_torch = data_torch.unsqueeze(-1)
+
+        return [(self.map_tensor_name(name), data_torch)]
 
 ###### CONVERSION LOGIC ######
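The new modify_tensors override only touches the two Whisper-style conv biases: an HF checkpoint stores a Conv1d bias as a 1-D (out_channels,) tensor, and the appended trailing axis is what the "transpose conv1 and conv2 bias" comment refers to. A minimal standalone sketch of the shape change (PyTorch; the 1280-channel count is illustrative, the real value comes from the checkpoint):

import torch

# HF checkpoint: Conv1d bias is 1-D, one value per output channel.
bias = torch.zeros(1280)
print(bias.shape)           # torch.Size([1280])

# unsqueeze(-1) appends a trailing singleton axis, (1280,) -> (1280, 1),
# so the writer serializes a 2-D tensor that can broadcast over the
# time axis on the ggml side (an assumption based on the diff comment).
bias = bias.unsqueeze(-1)
print(bias.shape)           # torch.Size([1280, 1])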

gguf-py/gguf/tensor_mapping.py

Lines changed: 3 additions & 3 deletions

@@ -1088,12 +1088,12 @@ class TensorNameMap:
            "audio_tower.conv{bid}", # ultravox
        ),
 
-        MODEL_TENSOR.A_PRE_NORM: (
+        MODEL_TENSOR.A_PRE_NORM: (),
+
+        MODEL_TENSOR.A_POST_NORM: (
            "audio_tower.layer_norm", # ultravox
        ),
 
-        MODEL_TENSOR.A_POST_NORM: (),
-
        MODEL_TENSOR.A_ENC_ATTN_Q: (
            "audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
        ),
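The swap moves Ultravox's single audio_tower.layer_norm from the pre-norm slot to the post-norm slot (in a Whisper-style encoder this norm runs after the final layer, not before the first). A toy lookup showing the resolved mapping after the change (plain Python illustration, not the real TensorNameMap API):

block_mappings = {
    "A_PRE_NORM": (),                            # ultravox has no pre-norm
    "A_POST_NORM": ("audio_tower.layer_norm",),  # moved here by this commit
}

def resolve(hf_name: str) -> str | None:
    # A linear scan stands in for TensorNameMap's reverse lookup.
    for slot, names in block_mappings.items():
        if hf_name in names:
            return slot
    return None

print(resolve("audio_tower.layer_norm"))         # A_POST_NORM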

tools/llava/clip.cpp

Lines changed: 11 additions & 9 deletions

@@ -1056,13 +1056,14 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;
 
-    int n_step = img.nx;
-    int n_mel = img.ny;
+    const int n_step = img.nx;
+    const int n_mel = img.ny;
 
     const int n_embd = hparams.hidden_size;
     const int n_head = hparams.n_head;
     const int d_head = n_embd / n_head;
     const int n_layer = hparams.n_layer;
+    const int n_pos = n_step / 2;
     const float eps = hparams.eps;
 
     ggml_init_params params = {
@@ -1080,7 +1081,7 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
     ggml_set_name(inp_raw, "inp_raw");
     ggml_set_input(inp_raw);
 
-    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_step);
+    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
     ggml_set_name(positions, "positions");
     ggml_set_input(positions);
 
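n_pos = n_step / 2 reflects the stride-2 second conv in the Whisper-style front end: the encoder attends over half as many positions as there are input mel frames, so the positions tensor has to shrink to match. A quick check of the arithmetic (Python; the kernel/stride/padding values are the standard Whisper encoder ones, assumed here rather than taken from this diff):

def conv1d_out_len(n_in: int, kernel: int, stride: int, pad: int) -> int:
    # standard conv output-length formula
    return (n_in + 2 * pad - kernel) // stride + 1

n_step = 3000                                        # mel frames (img.nx)
after_conv1 = conv1d_out_len(n_step, 3, 1, 1)        # k=3 s=1 p=1 -> 3000
after_conv2 = conv1d_out_len(after_conv1, 3, 2, 1)   # k=3 s=2 p=1 -> 1500
assert after_conv2 == n_step // 2                    # matches n_pos = n_step / 2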

@@ -1125,20 +1126,17 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
         ggml_tensor * k = ggml_mul_mat(ctx0, layer.k_w, cur); // no bias for key
         ggml_tensor * v = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
 
-        q = ggml_reshape_3d(ctx0, q, d_head, n_head, n_step);
-        k = ggml_reshape_3d(ctx0, k, d_head, n_head, n_step);
-        v = ggml_reshape_3d(ctx0, v, d_head, n_head, n_step);
+        q = ggml_reshape_3d(ctx0, q, d_head, n_head, n_pos);
+        k = ggml_reshape_3d(ctx0, k, d_head, n_head, n_pos);
+        v = ggml_reshape_3d(ctx0, v, d_head, n_head, n_pos);
 
         q = ggml_cont(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3));
         q = ggml_scale(ctx0, q, 1.0f / std::sqrt(d_head));
-        // utils.debug_print(q, "q rope");
 
         k = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3));
-        // utils.debug_print(k, "k rope");
 
         ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
         kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f, 0.0f);
-        // utils.debug_print(kq, "kq softmax");
 
         v = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3));
 
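Besides switching n_step to n_pos, this hunk drops leftover debug prints. The reshape/permute pair is the usual multi-head layout shuffle; a shape-level sketch in PyTorch (dimension sizes are illustrative, and note that ggml lists dimensions innermost-first, so ggml's (d_head, n_head, n_pos) is torch's (n_pos, n_head, d_head)):

import torch

d_head, n_head, n_pos = 64, 20, 1500
q = torch.randn(n_pos, n_head * d_head)   # flat [n_pos, n_embd] activations

# ggml_reshape_3d(q, d_head, n_head, n_pos), written in torch dim order:
q = q.view(n_pos, n_head, d_head)

# ggml_permute(q, 0, 2, 1, 3) swaps the two outer axes, putting heads
# first so each head is a contiguous [n_pos, d_head] matrix:
q = q.permute(1, 0, 2).contiguous()
q = q / (d_head ** 0.5)                   # ggml_scale by 1/sqrt(d_head)
print(q.shape)                            # torch.Size([20, 1500, 64])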

@@ -2217,6 +2215,10 @@ struct clip_model_loader {
                } break;
            case PROJECTOR_TYPE_ULTRAVOX:
                {
+                    vision_model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+                    vision_model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+                    vision_model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+                    vision_model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
                    vision_model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
                    vision_model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
                    vision_model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
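The four new get_tensor calls load the conv front end that the converter now emits. Assuming TN_CONV1D is a printf-style pattern along the lines of "a.conv1d.%d.%s" (an assumption; check the TN_* defines in clip.cpp), the requested names expand like this:

TN_CONV1D = "a.conv1d.%d.%s"   # assumed pattern, mirroring the string_format() usage

names = [TN_CONV1D % (i, kind) for i in (1, 2) for kind in ("weight", "bias")]
print(names)
# ['a.conv1d.1.weight', 'a.conv1d.1.bias', 'a.conv1d.2.weight', 'a.conv1d.2.bias']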
