
Commit b67a168

improve debugging message

committed · 1 parent 29330dc

File tree

2 files changed: +40, -17 lines


ggml/src/ggml-metal/ggml-metal-device.m

Lines changed: 1 addition & 0 deletions
@@ -703,6 +703,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
         case GGML_OP_ARANGE:
             return true;
         case GGML_OP_FLASH_ATTN_EXT:
+            return false;
             // for new head sizes, add checks here
             if (op->src[0]->ne[0] != 32 &&
                 op->src[0]->ne[0] != 40 &&
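For context, ggml_metal_device_supports_op is the per-op capability hook that the Metal backend exposes through ggml_backend_supports_op; with this hunk, GGML_OP_FLASH_ATTN_EXT reports as unsupported on Metal, which is exactly the condition the new warning path in clip.cpp (below) detects, so it appears to be a switch for exercising that path. A minimal sketch of how a caller probes op support, assuming only the public ggml headers (the helper name warn_if_unsupported is hypothetical, not part of this commit):

    // Illustrative only: probe whether a graph node can run on the given backend.
    // ggml_backend_supports_op(), ggml_op_name() and ggml_backend_name() are the
    // ggml APIs used by clip.cpp below; warn_if_unsupported() itself is hypothetical.
    #include <cstdio>

    #include "ggml.h"
    #include "ggml-backend.h"

    static bool warn_if_unsupported(ggml_backend_t backend, const ggml_tensor * node) {
        if (ggml_backend_supports_op(backend, node)) {
            return true; // the op can stay on the accelerated backend
        }
        // the scheduler will fall back for this op (typically to the CPU backend)
        fprintf(stderr, "op %s is not supported by backend %s\n",
                ggml_op_name(node->op), ggml_backend_name(backend));
        return false;
    }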

tools/mtmd/clip.cpp

Lines changed: 39 additions & 17 deletions
@@ -3209,6 +3209,7 @@ struct clip_model_loader {
     struct support_info_graph {
         // whether the clip_ctx.backend supports flash attention
         bool fattn = true;
+        ggml_tensor * fattn_op = nullptr; // for debugging

         std::vector<support_info_op> ops;
     };
@@ -3220,9 +3221,23 @@ struct clip_model_loader {
             // try to enable flash attention to see if it's supported
             ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
             info = alloc_compute_meta(ctx_clip);
-            if (!info.fattn) {
-                LOG_WRN("%s: flash attention not supported, memory usage will increase\n", __func__);
-                // TODO: maybe log more details about why flash attention is not supported
+            if (!info.fattn && info.fattn_op) {
+                auto op = info.fattn_op;
+                LOG_WRN("%s: *****************************************************************\n", __func__);
+                LOG_WRN("%s: WARNING: flash attention not supported by %s, memory usage will increase\n", __func__, ggml_backend_name(ctx_clip.backend));
+                LOG_WRN("%s: op params: \n", __func__);
+                static auto print_shape = [](const char * fn, const char * name, ggml_tensor * t) {
+                    LOG_WRN("%s: %s: type = %s, ne = [%d %d %d %d], nb = [%d %d %d %d]\n", fn,
+                        name, ggml_type_name(t->type),
+                        t->ne[0], t->ne[1], t->ne[2], t->ne[3],
+                        t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
+                };
+                print_shape(__func__, " dst", op);
+                print_shape(__func__, "src0", op->src[0]);
+                print_shape(__func__, "src1", op->src[1]);
+                print_shape(__func__, "src2", op->src[2]);
+                LOG_WRN("%s: please report this on github as an issue\n", __func__);
+                LOG_WRN("%s: *****************************************************************\n", __func__);
                 ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
                 alloc_compute_meta(ctx_clip);
             }
@@ -3238,13 +3253,28 @@ struct clip_model_loader {

         // print ops that are not supported by the GPU backend (if there is one)
         if (ctx_clip.backend && ctx_clip.backend != ctx_clip.backend_cpu) {
+            std::vector<support_info_op> unsupported_ops;
             for (const auto & op : info.ops) {
                 if (!op.is_accel) {
-                    LOG_WRN("%s: op %16s is not supported by the CLIP backend: type = %s, ne = [%d %d %d %d]\n", __func__,
+                    unsupported_ops.push_back(op);
+                }
+            }
+            if (!unsupported_ops.empty()) {
+                LOG_WRN("%s: *****************************************************************\n", __func__);
+                LOG_WRN("%s: WARNING: the CLIP graph uses unsupported operators by the backend\n", __func__);
+                LOG_WRN("%s: the performance will be suboptimal \n", __func__);
+                LOG_WRN("%s: list of unsupported ops (backend=%s):\n", __func__, ggml_backend_name(ctx_clip.backend));
+                for (const auto & op : unsupported_ops) {
+                    LOG_WRN("%s: %16s: type = %s, ne = [%d %d %d %d]\n", __func__,
                         ggml_op_name(op.op->op),
                         ggml_type_name(op.op->type),
                         op.op->ne[0], op.op->ne[1], op.op->ne[2], op.op->ne[3]);
                 }
+                LOG_WRN("%s: flash attention is %s\n", __func__,
+                    (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
+                LOG_WRN("%s: please report this on github as an issue\n", __func__);
+                LOG_WRN("%s: ref: https://github.com/ggml-org/llama.cpp/pull/16837#issuecomment-3461676118\n", __func__);
+                LOG_WRN("%s: *****************************************************************\n", __func__);
             }
         }
     }
@@ -3287,8 +3317,9 @@ struct clip_model_loader {
         LOG_INF("%s: graph splits = %d, nodes = %d\n", __func__, n_splits, n_nodes);

         support_info_graph res {
-            /*.fattn = */ true,
-            /*.ops   = */ {},
+            /*.fattn    = */ true,
+            /*.fattn_op = */ nullptr,
+            /*.ops      = */ {},
         };

         // check op support
@@ -3298,7 +3329,8 @@ struct clip_model_loader {
             if (!ggml_backend_supports_op(ctx_clip.backend, node)) {
                 res.ops.back().is_accel = false;
                 if (node->op == GGML_OP_FLASH_ATTN_EXT) {
-                    res.fattn = false;
+                    res.fattn    = false;
+                    res.fattn_op = node;
                 }
             }
         }
@@ -4576,16 +4608,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         return false; // only support batch size of 1
     }

-    if (ggml_backend_sched_get_n_splits(ctx->sched.get()) > 1) {
-        LOG_WRN("%s: *****************************************************************\n", __func__);
-        LOG_WRN("%s: WARNING: the CLIP graph uses unsupported operators by the backend\n", __func__);
-        LOG_WRN("%s: use GGML_SCHED_DEBUG=2 to determine which ops \n", __func__);
-        LOG_WRN("%s: the performance will be suboptimal \n", __func__);
-        LOG_WRN("%s: \n", __func__);
-        LOG_WRN("%s: ref: https://github.com/ggml-org/llama.cpp/pull/16837#issuecomment-3461676118\n", __func__);
-        LOG_WRN("%s: *****************************************************************\n", __func__);
-    }
-
     // build the inference graph
     ctx->debug_print_tensors.clear();
     ggml_backend_sched_reset(ctx->sched.get());
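Side note on the print_shape helper added above: in ggml, ggml_tensor::ne is int64_t[4] and ggml_tensor::nb is size_t[4], so printing them with %d is not portable and can trigger -Wformat warnings; a standalone variant would typically use PRId64 and %zu. A minimal sketch along those lines (the function name debug_print_tensor is illustrative, not part of this commit):

    // Illustrative stand-alone variant of the print_shape lambda above, with
    // portable format specifiers (ggml_tensor::ne is int64_t[4], ::nb is size_t[4]).
    // The function name debug_print_tensor is hypothetical.
    #include <cinttypes> // PRId64
    #include <cstdio>

    #include "ggml.h"

    static void debug_print_tensor(const char * name, const ggml_tensor * t) {
        fprintf(stderr, "%s: type = %s, ne = [%" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "], nb = [%zu %zu %zu %zu]\n",
                name, ggml_type_name(t->type),
                t->ne[0], t->ne[1], t->ne[2], t->ne[3],
                t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
    }

Usage would mirror the commit, e.g. debug_print_tensor(" dst", op); followed by the same call for op->src[0], op->src[1] and op->src[2].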
