
Commit 0f39770

Merge branch 'master' into xsn/server_mtmd
2 parents: add9e21 + edb18b6

File tree: 9 files changed, +135 / -113 lines


examples/embedding/embedding.cpp

Lines changed: 7 additions & 1 deletion

```diff
@@ -89,6 +89,13 @@ int main(int argc, char ** argv) {
     common_init();

     params.embedding = true;
+
+    // utilize the full context
+    if (params.n_batch < params.n_ctx) {
+        LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);
+        params.n_batch = params.n_ctx;
+    }
+
     // For non-causal models, batch size must be equal to ubatch size
     params.n_ubatch = params.n_batch;

@@ -134,7 +141,6 @@ int main(int argc, char ** argv) {

     // max batch size
     const uint64_t n_batch = params.n_batch;
-    GGML_ASSERT(params.n_batch >= params.n_ctx);

     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
```
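Taken together, these two hunks turn a hard failure into an automatic adjustment: instead of asserting that the batch covers the whole context, the example now grows the batch to fit. A minimal sketch of the resulting behavior (values hypothetical, not from the commit):

```cpp
// sketch only: mirrors the new embedding.cpp logic with made-up numbers
#include <cstdio>

int main() {
    int n_ctx   = 8192; // hypothetical context size
    int n_batch = 2048; // hypothetical default logical batch size

    // previously GGML_ASSERT(n_batch >= n_ctx) would abort here;
    // now the batch is raised (with a warning) instead
    if (n_batch < n_ctx) {
        printf("setting batch size to %d\n", n_ctx);
        n_batch = n_ctx;
    }

    // for non-causal models, batch size must equal ubatch size
    int n_ubatch = n_batch;
    printf("n_batch = %d, n_ubatch = %d\n", n_batch, n_ubatch);
    return 0;
}
```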

examples/llava/clip-impl.h

Lines changed: 0 additions & 2 deletions

```diff
@@ -90,8 +90,6 @@
 #define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
 #define TN_GLM_ADAPTER_GATE     "adapter.linear.gate.%s"
 #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
-#define TN_GLM_BOI_W            "adapter.boi"
-#define TN_GLM_EOI_W            "adapter.eoi"

 enum projector_type {
     PROJECTOR_TYPE_MLP,
```

examples/llava/clip.cpp

Lines changed: 47 additions & 38 deletions

```diff
@@ -244,8 +244,6 @@ struct clip_vision_model {
     //GLMV-Edge projection
     struct ggml_tensor * mm_model_adapter_conv_w = nullptr;
    struct ggml_tensor * mm_model_adapter_conv_b = nullptr;
-    struct ggml_tensor * boi_w = nullptr;
-    struct ggml_tensor * eoi_w = nullptr;

     // MobileVLM projection
     struct ggml_tensor * mm_model_mlp_1_w = nullptr;
```
```diff
@@ -556,15 +554,15 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
 }

 // implementation of the 2D RoPE without adding a new op in ggml
+// this is not efficient (use double the memory), but works on all backends
+// TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
 static ggml_tensor * build_rope_2d(
-    ggml_cgraph * gf,
     ggml_context * ctx0,
     ggml_tensor * cur,
     ggml_tensor * pos_h,
     ggml_tensor * pos_w,
     const float freq_base
 ) {
-    ggml_tensor * tmp;
     const int64_t n_dim  = cur->ne[0];
     const int64_t n_head = cur->ne[1];
     const int64_t n_pos  = cur->ne[2];
@@ -573,18 +571,23 @@ static ggml_tensor * build_rope_2d(
     // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
     // first half of cur will use 1e-0, 1e-2 (even)
     // second half of cur will use 1e-1, 1e-3 (odd)
-    //
-    // for the first half, the trick here is to rotate n_dim/2, so inv_freq will be even
+    // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
     // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
     // then for the second half, we use freq_scale to shift the inv_freq
     // ^ why? replace (2i) with (2i+1) in the above equation
     const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);

     // first half
+    ggml_tensor * first;
     {
-        cur = ggml_rope_ext_inplace(
+        first = ggml_view_3d(ctx0, cur,
+            n_dim/2, n_head, n_pos,
+            ggml_row_size(cur->type, n_dim),
+            ggml_row_size(cur->type, n_dim*n_head),
+            0);
+        first = ggml_rope_ext(
             ctx0,
-            cur,
+            first,
             pos_h,      // positions
             nullptr,    // freq factors
             n_dim/2,    // n_dims
@@ -594,26 +597,27 @@ static ggml_tensor * build_rope_2d(
     }

     // second half
+    ggml_tensor * second;
     {
-        tmp = ggml_view_3d(ctx0, cur,
+        second = ggml_view_3d(ctx0, cur,
             n_dim/2, n_head, n_pos,
             ggml_row_size(cur->type, n_dim),
             ggml_row_size(cur->type, n_dim*n_head),
             n_dim/2 * ggml_element_size(cur));
-        tmp = ggml_rope_ext_inplace(
+        second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors
+        second = ggml_rope_ext(
             ctx0,
-            tmp,
+            second,
             pos_w,      // positions
             nullptr,    // freq factors
             n_dim/2,    // n_dims
             0, 0, freq_base,
             freq_scale_odd,
             0.0f, 1.0f, 0.0f, 0.0f
         );
-        // calculate inplace (modify cur directly)
-        ggml_build_forward_expand(gf, tmp);
     }

+    cur = ggml_concat(ctx0, first, second, 0);
     return cur;
 }
```

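The identity the comments lean on can be verified numerically: rotating only n_dim/2 dimensions yields exactly the even-indexed inverse frequencies of a full n_dim rotation, and scaling by freq_base^(-2/n_dim) shifts them onto the odd-indexed ones. A standalone check (illustration only, not part of the commit):

```cpp
// Checks: -2*(2i)/n_dim == -2i/(n_dim/2), and multiplying by
// freq_base^(-2/n_dim) turns the even frequencies into the odd ones.
#include <cmath>
#include <cstdio>

int main() {
    const double base  = 10000.0; // rope_theta
    const int    n_dim = 8;       // head dim, so 4 inv_freq values in total
    const double freq_scale_odd = std::pow(base, -2.0/n_dim);

    for (int i = 0; i < n_dim/4; i++) {
        const double f_half = std::pow(base, -2.0*i/(n_dim/2));   // rotate n_dim/2 dims only
        const double f_even = std::pow(base, -2.0*(2*i)/n_dim);   // even inv_freq of full rotation
        const double f_odd  = std::pow(base, -2.0*(2*i+1)/n_dim); // odd inv_freq of full rotation
        printf("i=%d  half=%g even=%g | half*scale=%g odd=%g\n",
               i, f_half, f_even, f_half*freq_scale_odd, f_odd);
    }
    // prints the 1e-0, 1e-1, 1e-2, 1e-3 values mentioned in the comments
    return 0;
}
```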
```diff
@@ -682,13 +686,13 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i
         struct ggml_tensor * Q = ggml_mul_mat(ctx0, model.layers[il].q_w, cur);

         Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches);
-        Q = build_rope_2d(gf, ctx0, Q, pos_h, pos_w, hparams.rope_theta);
+        Q = build_rope_2d(ctx0, Q, pos_h, pos_w, hparams.rope_theta);
         Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));

         struct ggml_tensor * K = ggml_mul_mat(ctx0, model.layers[il].k_w, cur);

         K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches);
-        K = build_rope_2d(gf, ctx0, K, pos_h, pos_w, hparams.rope_theta);
+        K = build_rope_2d(ctx0, K, pos_h, pos_w, hparams.rope_theta);
         K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));

         struct ggml_tensor * V = ggml_mul_mat(ctx0, model.layers[il].v_w, cur);
```
```diff
@@ -1697,8 +1701,6 @@ struct clip_model_loader {
                     vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
                     vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight"));
                     vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
-                    vision_model.boi_w = get_tensor(TN_GLM_BOI_W);
-                    vision_model.eoi_w = get_tensor(TN_GLM_EOI_W);
                 } break;
             case PROJECTOR_TYPE_MERGER:
                 {
```
```diff
@@ -2593,8 +2595,7 @@ void clip_free(clip_ctx * ctx) {
 }

 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
-    int extra_tokens = ctx->has_glm_projector ? 2 : 0;
-    return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }

 size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
```
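With the boi_w/eoi_w tensors gone, the GLM projector no longer reserves two extra embedding rows in clip's output buffer. A quick sanity check of what the simplified formula changes (a sketch with hypothetical numbers, not from the commit):

```cpp
#include <cstddef>
#include <cstdio>

int main() {
    // hypothetical values for a GLM-Edge style projector
    const int n_patches     = 576;
    const int n_mmproj_embd = 4096;

    // old: GLM added 2 extra token slots; new: same formula for every projector
    const size_t old_bytes = (size_t)(n_patches + 2) * n_mmproj_embd * sizeof(float);
    const size_t new_bytes = (size_t) n_patches      * n_mmproj_embd * sizeof(float);
    printf("old = %zu bytes, new = %zu bytes (2 embedding rows = %zu bytes)\n",
           old_bytes, new_bytes, old_bytes - new_bytes);
    return 0;
}
```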
```diff
@@ -2790,9 +2791,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
     if (ctx->has_glm_projector) {
         GGML_ASSERT(batch_size == 1);
-        ggml_tensor * boi = ctx->vision_model.boi_w;
-        ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi));
-        vec = (float*)(vec+ggml_nelements(boi)); //offset for boi
     }

     // build the inference graph
@@ -2804,10 +2802,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;

+    // TODO @ngxson : this is ugly, need to refactor later
+    bool support_dynamic_size = ctx->has_minicpmv_projector
+        || ctx->has_qwen2vl_merger
+        || ctx->proj_type == PROJECTOR_TYPE_PIXTRAL;
+
     const int image_size = hparams.image_size;
     int image_size_width  = image_size;
     int image_size_height = image_size;
-    if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
+    if (support_dynamic_size) {
         image_size_width  = imgs.entries[0]->nx;
         image_size_height = imgs.entries[0]->ny;
     }
```
```diff
@@ -2819,9 +2822,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima

     {
         struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
-        float * data = (float *)malloc(ggml_nbytes(inp_raw));
+        std::vector<float> inp_data(ggml_nelements(inp_raw));
+        float * data = inp_data.data();
+
+        // layout of data (note: the channel dim is unrolled to better visualize the layout):
+        //
+        // ┌──W──┐
+        // │ H   │  channel = R
+        // ├─────┤    │
+        // │ H   │  channel = G
+        // ├─────┤    │
+        // │ H   │  channel = B
+        // └─────┘    │
+        //   ─────────┘ x B

-        // TODO @ngxson : this whole code block is ugly, will need to be refactored
         for (size_t i = 0; i < imgs.entries.size(); i++) {
             const int nx = imgs.entries[i]->nx;
             const int ny = imgs.entries[i]->ny;
@@ -2836,17 +2850,19 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             const int n = nx * ny;

             for (int b = 0; b < batch_size; b++) {
-                for (int k = 0; k < 3; k++) {
-                    for (int y = 0; y < ny; y++) {
-                        for (int x = 0; x < nx; x++) {
-                            data[(b * 3 * n) + k * n + y * nx + x] = imgs.entries[b]->buf[3 * (y * nx + x) + k];
-                        }
+                float * batch_entry = data + b * (3*n);
+                for (int y = 0; y < ny; y++) {
+                    for (int x = 0; x < nx; x++) {
+                        size_t base_src = 3*(y * nx + x); // idx of the first channel
+                        size_t base_dst = y * nx + x;     // idx of the first channel
+                        batch_entry[      base_dst] = imgs.entries[b]->buf[base_src    ];
+                        batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
+                        batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
                     }
                 }
             }
         }
         ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
-        free(data);
     }
     if (ctx->has_minicpmv_projector) {
         {
```
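For intuition, here is the new interleaved-to-planar copy run on a single 2x2 image as a standalone program (illustration only; the variable names mirror the diff, the pixel values are made up):

```cpp
#include <array>
#include <cstddef>
#include <cstdio>

int main() {
    const int nx = 2, ny = 2, n = nx * ny;
    // interleaved input, R G B per pixel (as in the clip image buffers)
    const std::array<float, 3*4> buf = {
        0.f, 100.f, 200.f,   1.f, 101.f, 201.f,
        2.f, 102.f, 202.f,   3.f, 103.f, 203.f,
    };
    std::array<float, 3*4> planar{}; // all R, then all G, then all B

    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            const std::size_t base_src = 3*(y * nx + x); // idx of the first channel
            const std::size_t base_dst = y * nx + x;
            planar[      base_dst] = buf[base_src    ]; // R plane
            planar[1*n + base_dst] = buf[base_src + 1]; // G plane
            planar[2*n + base_dst] = buf[base_src + 2]; // B plane
        }
    }

    for (float v : planar) printf("%g ", v);
    printf("\n"); // 0 1 2 3 100 101 102 103 200 201 202 203
    return 0;
}
```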
```diff
@@ -3001,13 +3017,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     // copy the embeddings to the location passed by the user
     ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));

-    if (ctx->has_glm_projector) {
-        //eoi
-        ggml_tensor * eoi = ctx->vision_model.eoi_w;
-        int offset = ggml_nelements(embeddings);
-        ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi));
-    }
-
     return true;
 }
```

examples/llava/mtmd.cpp

Lines changed: 5 additions & 0 deletions

```diff
@@ -187,6 +187,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
         marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);

+    } else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+        // <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
+        marker_modified = "<|begin_of_image|>" + ctx->image_marker + "<|end_of_image|>";
+        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+
     } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
         // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
         marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
```
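This is the other half of the GLM-Edge cleanup: the image delimiters move from embedding space (the boi_w/eoi_w tensors deleted in clip.cpp above) into plain text around the image marker at tokenization time. A self-contained sketch of the expansion (the marker value and the helper are stand-ins for illustration, not the mtmd API):

```cpp
#include <cstdio>
#include <string>

// minimal stand-in for llama.cpp's string_replace_all helper
static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

int main() {
    const std::string image_marker = "<__image__>"; // assumed marker for this sketch
    std::string prompt = "describe this: <__image__>";

    // the PROJECTOR_TYPE_GLM_EDGE branch from the diff
    const std::string marker_modified = "<|begin_of_image|>" + image_marker + "<|end_of_image|>";
    replace_all(prompt, image_marker, marker_modified);

    printf("%s\n", prompt.c_str());
    // describe this: <|begin_of_image|><__image__><|end_of_image|>
    return 0;
}
```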

ggml/include/ggml-rpc.h

Lines changed: 1 addition & 1 deletion

```diff
@@ -7,7 +7,7 @@
 extern "C" {
 #endif

-#define RPC_PROTO_MAJOR_VERSION    1
+#define RPC_PROTO_MAJOR_VERSION    2
 #define RPC_PROTO_MINOR_VERSION    0
 #define RPC_PROTO_PATCH_VERSION    0
 #define GGML_RPC_MAX_SERVERS       16
```

ggml/src/ggml-rpc/ggml-rpc.cpp

Lines changed: 12 additions & 6 deletions

```diff
@@ -378,8 +378,8 @@ static bool parse_endpoint(const std::string & endpoint, std::string & host, int
 }

 // RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
-// RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
-static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
+// No response
+static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size) {
     uint8_t cmd_byte = cmd;
     if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) {
         return false;
@@ -390,6 +390,15 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
     if (!send_data(sock->fd, input, input_size)) {
         return false;
     }
+    return true;
+}
+
+// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
+// RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
+static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
+    if (!send_rpc_cmd(sock, cmd, input, input_size)) {
+        return false;
+    }
     // TODO: currently the output_size is always known, do we need support for commands with variable output size?
     // even if we do, we can skip sending output_size from the server for commands with known output size
     uint64_t out_size;
```
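Both overloads now share the framing shown in the comments; the one-way variant just stops after the payload. That is why RPC_PROTO_MAJOR_VERSION is bumped: a peer that still replies to RPC_CMD_SET_TENSOR would leave unread bytes on the socket. A sketch of the framing (illustration only; assumes a little-endian host, as the existing code does):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Build the byte stream for one request, following the framing comment:
// | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
static std::vector<uint8_t> frame_request(uint8_t cmd, const void * input, uint64_t input_size) {
    std::vector<uint8_t> msg(1 + sizeof(input_size) + input_size);
    msg[0] = cmd;
    memcpy(msg.data() + 1, &input_size, sizeof(input_size));
    memcpy(msg.data() + 1 + sizeof(input_size), input, (size_t)input_size);
    return msg;
}

int main() {
    const char payload[] = "tensor-bytes";
    std::vector<uint8_t> msg = frame_request(/*hypothetical cmd id*/ 6, payload, sizeof(payload));
    printf("framed %zu bytes\n", msg.size());
    // a one-way command stops here; a two-way command would then read back:
    // | response_size (8 bytes) | response_data (response_size bytes) |
    return 0;
}
```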
```diff
@@ -555,7 +564,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
     memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
     memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
     memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
-    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size(), nullptr, 0);
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size());
     GGML_ASSERT(status);
 }

@@ -1428,9 +1437,6 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
                 if (!server.set_tensor(input)) {
                     return;
                 }
-                if (!send_msg(sockfd, nullptr, 0)) {
-                    return;
-                }
                 break;
             }
             case RPC_CMD_SET_TENSOR_HASH: {
```

ggml/src/ggml-sycl/common.hpp

Lines changed: 0 additions & 1 deletion

```diff
@@ -313,7 +313,6 @@ struct ggml_backend_sycl_context {
     int device;
     std::string name;
     optimize_feature opt_feature;
-    bool optimized_graph=false;

     queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
```
