Skip to content

Commit 4ac7940

Browse files
committed
test
1 parent 8b73116 commit 4ac7940

File tree

5 files changed

+73
-24
lines changed

5 files changed

+73
-24
lines changed

convert_hf_to_gguf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5844,7 +5844,7 @@ def set_gguf_parameters(self):
58445844

58455845
def tensor_force_quant(self, name, new_name, bid, n_dims):
58465846
del bid, new_name, n_dims # unused
5847-
if ".conv" in name:
5847+
if ".conv" in name and ".weight" in name:
58485848
return gguf.GGMLQuantizationType.F16
58495849
return False
58505850

tools/llava/clip-impl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ struct clip_image_u8 {
151151

152152
// RGB float32 image (NHWC)
153153
// Memory layout: RGBRGBRGB...
154+
// For audio, only one channel is used, buf.size() == nx*ny
154155
struct clip_image_f32 {
155156
int nx;
156157
int ny;
@@ -244,6 +245,7 @@ struct clip_image_u8_batch {
244245

245246
struct clip_image_f32_batch {
246247
std::vector<clip_image_f32_ptr> entries;
248+
bool is_audio = false;
247249
};
248250

249251
//

tools/llava/clip.cpp

Lines changed: 53 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1087,13 +1087,6 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
10871087

10881088
ggml_tensor * inp;
10891089

1090-
auto layer_norm = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) {
1091-
x = ggml_norm(ctx0, x, eps);
1092-
x = ggml_mul(ctx0, x, w);
1093-
x = ggml_add(ctx0, x, b);
1094-
return x;
1095-
};
1096-
10971090
// conv1d block
10981091
{
10991092
// convolution + gelu
@@ -1118,7 +1111,8 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
11181111
auto & layer = model.layers[il];
11191112
ggml_tensor * cur = inp;
11201113

1121-
cur = layer_norm(cur, layer.ln_1_w, layer.ln_1_b);
1114+
cur = ggml_norm(ctx0, cur, eps);
1115+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_1_w), layer.ln_1_b);
11221116

11231117
// attention
11241118
{
@@ -1131,19 +1125,17 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
11311125
v = ggml_reshape_3d(ctx0, v, d_head, n_head, n_pos);
11321126

11331127
q = ggml_cont(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3));
1134-
q = ggml_scale(ctx0, q, 1.0f / std::sqrt(d_head));
1135-
11361128
k = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3));
11371129

11381130
ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
1139-
kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f, 0.0f);
1131+
kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f / std::sqrt(d_head), 0.0f);
11401132

11411133
v = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3));
11421134

11431135
ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
1144-
//kqv = ggml_reshape_3d(ctx0, kqv, d_head, n_tokens, n_head);
1136+
//kqv = ggml_reshape_3d(ctx0, kqv, d_head, n_pos, n_head);
11451137
kqv = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
1146-
kqv = ggml_cont_2d(ctx0, kqv, n_embd, d_head);
1138+
kqv = ggml_cont_2d(ctx0, kqv, n_embd, n_pos);
11471139

11481140
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.o_w, kqv), layer.o_b);
11491141
}
@@ -1152,7 +1144,8 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
11521144
cur = ggml_add(ctx0, cur, inp);
11531145

11541146
inp = cur; // inp = residual, cur = hidden_states
1155-
cur = layer_norm(cur, layer.ln_2_w, layer.ln_2_b);
1147+
cur = ggml_norm(ctx0, cur, eps);
1148+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_2_w), layer.ln_2_b);
11561149

11571150
// mlp
11581151
{
@@ -1170,10 +1163,11 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
11701163
ggml_tensor * embeddings = inp;
11711164

11721165
// output norm
1173-
embeddings = layer_norm(embeddings, model.post_ln_w, model.post_ln_b);
1166+
embeddings = ggml_norm(ctx0, embeddings, eps);
1167+
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
11741168

1175-
// pad and stack
1176-
// https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py#L520
1169+
// StackAudioFrames
1170+
// https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
11771171
{
11781172
int64_t stride = n_embd * hparams.proj_stack_factor;
11791173
int64_t padded_len = GGML_PAD(ggml_nelements(embeddings), stride);
@@ -1186,11 +1180,11 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
11861180
ggml_row_size(embeddings->type, stride), 0);
11871181
}
11881182

1189-
// projection
1183+
// UltravoxProjector
11901184
{
11911185
ggml_tensor * cur = embeddings;
11921186
// pre-norm
1193-
cur = ggml_rms_norm(ctx0, cur, eps);
1187+
cur = ggml_rms_norm(ctx0, cur, 1e-6);
11941188
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
11951189

11961190
// ffn in
@@ -1202,12 +1196,13 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
12021196
ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
12031197
ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
12041198

1205-
x0 = ggml_silu(ctx0, x0);
1199+
// see SwiGLU in ultravox_model.py: silu is applied to the second half of the split, not the first
1200+
x1 = ggml_silu(ctx0, x1);
12061201
cur = ggml_mul(ctx0, x0, x1);
12071202
}
12081203

12091204
// mid-norm
1210-
cur = ggml_rms_norm(ctx0, cur, eps);
1205+
cur = ggml_rms_norm(ctx0, cur, 1e-6);
12111206
cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
12121207

12131208
// ffn out
@@ -1216,6 +1211,11 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
12161211
embeddings = cur;
12171212
}
12181213

1214+
embeddings = ggml_view_2d(ctx0, embeddings, 2048, n_step / 16,
1215+
ggml_row_size(embeddings->type, 2048), 0);
1216+
1217+
printf("shape of embd: %lld %lld %lld\n", embeddings->ne[0], embeddings->ne[1], embeddings->ne[2]);
1218+
12191219
// build the graph
12201220
ggml_build_forward_expand(gf, embeddings);
12211221

@@ -3350,7 +3350,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
33503350
};
33513351

33523352
// set input pixel values
3353-
{
3353+
if (!imgs.is_audio) {
33543354
size_t nelem = 0;
33553355
for (const auto & img : imgs.entries) {
33563356
nelem += img->nx * img->ny * 3;
@@ -3387,6 +3387,16 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
33873387
}
33883388
}
33893389
set_input_f32("inp_raw", inp_raw);
3390+
3391+
} else {
3392+
// audio input
3393+
GGML_ASSERT(imgs.entries.size() == 1);
3394+
const auto & mel_inp = imgs.entries[0]; // 3 channels, but only one is used
3395+
const int n_step = mel_inp->nx;
3396+
const int n_mel = mel_inp->ny;
3397+
std::vector<float> inp_raw(n_step * n_mel);
3398+
std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float));
3399+
set_input_f32("inp_raw", inp_raw);
33903400
}
33913401

33923402
// set input per projector
@@ -3585,6 +3595,16 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
35853595
{
35863596
// do nothing
35873597
} break;
3598+
case PROJECTOR_TYPE_ULTRAVOX:
3599+
{
3600+
const auto & mel_inp = imgs.entries[0];
3601+
const int n_pos = mel_inp->nx / 2;
3602+
std::vector<int32_t> positions(n_pos);
3603+
for (int i = 0; i < n_pos; i++) {
3604+
positions[i] = i;
3605+
}
3606+
set_input_i32("positions", positions);
3607+
} break;
35883608
default:
35893609
GGML_ABORT("Unknown projector type");
35903610
}
@@ -3820,3 +3840,14 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,
38203840
projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
38213841
return ctx->proj_type;
38223842
}
3843+
3844+
void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_step, float * mel) {
3845+
clip_image_f32 * audio = new clip_image_f32;
3846+
audio->nx = n_step;
3847+
audio->ny = n_mel;
3848+
audio->buf.resize(n_step * n_mel);
3849+
std::memcpy(audio->buf.data(), mel, n_step * n_mel * sizeof(float));
3850+
3851+
batch->entries.push_back(clip_image_f32_ptr(audio));
3852+
batch->is_audio = true;
3853+
}

tools/llava/clip.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,8 @@ CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
127127

128128
CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
129129

130+
// used for audio input
131+
CLIP_API void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_step, float * mel);
130132

131133
#ifdef __cplusplus
132134
}

tools/llava/mtmd.cpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,21 @@ mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
138138
const struct llama_model * text_model,
139139
const struct mtmd_context_params ctx_params) {
140140
try {
141-
return new mtmd_context(mmproj_fname, text_model, ctx_params);
141+
auto * test = new mtmd_context(mmproj_fname, text_model, ctx_params);
142+
143+
//// TEST, TO BE REMOVED LATER
144+
clip_image_f32_batch * batch = clip_image_f32_batch_init();
145+
std::vector<float> mel(128 * 1024);
146+
clip_image_f32_batch_add_mel(batch, 128, 1024, mel.data());
147+
std::vector<float> output(64 * 2048);
148+
clip_image_batch_encode(test->ctx_clip, 8, batch, output.data());
149+
for (int i = 0; i < 3; i++) printf("%f ", output[i]); printf("\n");
150+
for (int i = 0; i < 3; i++) printf("%f ", output[i+2048]); printf("\n");
151+
for (int i = 0; i < 3; i++) printf("%f ", output[i+2048*2]); printf("\n");
152+
float sum = 0.0;
153+
for (size_t i = 0; i < 1000; i++) sum += output[i];
154+
printf("sum: %f\n", sum);
155+
GGML_ABORT("test");
142156
} catch (const std::exception & e) {
143157
LOG_ERR("%s: error: %s\n", __func__, e.what());
144158
return nullptr;

0 commit comments

Comments (0)