@@ -3209,6 +3209,7 @@ struct clip_model_loader {
     struct support_info_graph {
         // whether the clip_ctx.backend supports flash attention
         bool fattn = true;
+        ggml_tensor * fattn_op = nullptr; // for debugging
 
         std::vector<support_info_op> ops;
     };
@@ -3220,9 +3221,23 @@ struct clip_model_loader {
             // try to enable flash attention to see if it's supported
             ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
             info = alloc_compute_meta(ctx_clip);
-            if (!info.fattn) {
-                LOG_WRN("%s: flash attention not supported, memory usage will increase\n", __func__);
-                // TODO: maybe log more details about why flash attention is not supported
+            if (!info.fattn && info.fattn_op) {
+                auto op = info.fattn_op;
+                LOG_WRN("%s: *****************************************************************\n", __func__);
+                LOG_WRN("%s: WARNING: flash attention not supported by %s, memory usage will increase\n", __func__, ggml_backend_name(ctx_clip.backend));
+                LOG_WRN("%s: op params:\n", __func__);
+                static auto print_shape = [](const char * fn, const char * name, ggml_tensor * t) {
+                    LOG_WRN("%s:   %s: type = %s, ne = [%d %d %d %d], nb = [%d %d %d %d]\n", fn,
+                        name, ggml_type_name(t->type),
+                        (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3],
+                        (int) t->nb[0], (int) t->nb[1], (int) t->nb[2], (int) t->nb[3]);
+                };
+                print_shape(__func__, " dst", op);
+                print_shape(__func__, "src0", op->src[0]);
+                print_shape(__func__, "src1", op->src[1]);
+                print_shape(__func__, "src2", op->src[2]);
+                LOG_WRN("%s: please report this on github as an issue\n", __func__);
+                LOG_WRN("%s: *****************************************************************\n", __func__);
                 ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
                 alloc_compute_meta(ctx_clip);
             }
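Aside (not part of the patch): the `fattn_op` saved above is the FLASH_ATTN_EXT node that the backend rejected, and the same question can be asked in isolation with `ggml_backend_supports_op()`. A rough sketch, assuming the current `ggml_flash_attn_ext()` signature (q, k, v, optional mask, scale, max_bias, logit_softcap), the CPU backend as a stand-in, and invented CLIP-like dimensions:

```cpp
// hypothetical probe, not from the patch: construct a lone FLASH_ATTN_EXT node
// and ask a backend whether it can run it (metadata only, no data is allocated)
#include <cmath>
#include <cstdio>

#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"

int main() {
    ggml_init_params params = {
        /* .mem_size   = */ ggml_tensor_overhead() * 8,
        /* .mem_buffer = */ nullptr,
        /* .no_alloc   = */ true, // ggml_backend_supports_op() only looks at shapes/types
    };
    ggml_context * ctx = ggml_init(params);

    // invented CLIP-like dimensions: 64-dim heads, 16 heads, 1024 image tokens
    const int64_t d_head = 64, n_tokens = 1024, n_head = 16;

    ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_head, n_tokens, n_head, 1);
    ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, d_head, n_tokens, n_head, 1);
    ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, d_head, n_tokens, n_head, 1);

    // no mask, no ALiBi, no logit softcap
    ggml_tensor * fa = ggml_flash_attn_ext(ctx, q, k, v, /*mask=*/nullptr,
                                           1.0f / sqrtf((float) d_head), 0.0f, 0.0f);

    ggml_backend_t backend = ggml_backend_cpu_init();
    printf("FLASH_ATTN_EXT supported by %s: %s\n",
           ggml_backend_name(backend),
           ggml_backend_supports_op(backend, fa) ? "yes" : "no");

    ggml_backend_free(backend);
    ggml_free(ctx);
    return 0;
}
```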
@@ -3238,13 +3253,28 @@ struct clip_model_loader {
 
         // print ops that are not supported by the GPU backend (if there is one)
        if (ctx_clip.backend && ctx_clip.backend != ctx_clip.backend_cpu) {
+            std::vector<support_info_op> unsupported_ops;
             for (const auto & op : info.ops) {
                 if (!op.is_accel) {
-                    LOG_WRN("%s: op %16s is not supported by the CLIP backend: type = %s, ne = [%d %d %d %d]\n", __func__,
+                    unsupported_ops.push_back(op);
+                }
+            }
+            if (!unsupported_ops.empty()) {
+                LOG_WRN("%s: *****************************************************************\n", __func__);
+                LOG_WRN("%s: WARNING: the CLIP graph uses unsupported operators by the backend\n", __func__);
+                LOG_WRN("%s:          the performance will be suboptimal\n", __func__);
+                LOG_WRN("%s: list of unsupported ops (backend=%s):\n", __func__, ggml_backend_name(ctx_clip.backend));
+                for (const auto & op : unsupported_ops) {
+                    LOG_WRN("%s: %16s: type = %s, ne = [%d %d %d %d]\n", __func__,
                         ggml_op_name(op.op->op),
                         ggml_type_name(op.op->type),
                         op.op->ne[0], op.op->ne[1], op.op->ne[2], op.op->ne[3]);
                 }
+                LOG_WRN("%s: flash attention is %s\n", __func__,
+                    (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
+                LOG_WRN("%s: please report this on github as an issue\n", __func__);
+                LOG_WRN("%s: ref: https://github.com/ggml-org/llama.cpp/pull/16837#issuecomment-3461676118\n", __func__);
+                LOG_WRN("%s: *****************************************************************\n", __func__);
             }
         }
     }
@@ -3287,8 +3317,9 @@ struct clip_model_loader {
         LOG_INF("%s: graph splits = %d, nodes = %d\n", __func__, n_splits, n_nodes);
 
         support_info_graph res {
-            /* .fattn = */ true,
-            /* .ops   = */ {},
+            /* .fattn    = */ true,
+            /* .fattn_op = */ nullptr,
+            /* .ops      = */ {},
         };
 
         // check op support
@@ -3298,7 +3329,8 @@ struct clip_model_loader {
             if (!ggml_backend_supports_op(ctx_clip.backend, node)) {
                 res.ops.back().is_accel = false;
                 if (node->op == GGML_OP_FLASH_ATTN_EXT) {
-                    res.fattn = false;
+                    res.fattn    = false;
+                    res.fattn_op = node;
                 }
             }
         }
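For context (not part of the patch): the loop above relies on `ggml_backend_supports_op()`, which only inspects tensor metadata, so per-node support can be probed on a throwaway graph built with `no_alloc = true`, much like `alloc_compute_meta()` does for the real CLIP graph. A minimal standalone sketch of that pattern, using the CPU backend as a stand-in for `ctx_clip.backend` and a made-up two-node graph:

```cpp
// illustration only, not from the patch: probe per-node op support the same way
// the loader does, here against the CPU backend with a made-up graph
#include <cstdio>
#include <vector>

#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"

int main() {
    // no_alloc = true: only tensor metadata is created, which is all that
    // ggml_backend_supports_op() needs to answer the question
    ggml_init_params params = {
        /* .mem_size   = */ ggml_tensor_overhead() * 16 + ggml_graph_overhead(),
        /* .mem_buffer = */ nullptr,
        /* .no_alloc   = */ true,
    };
    ggml_context * ctx = ggml_init(params);

    ggml_tensor * a   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 8);
    ggml_tensor * b   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 8);
    ggml_tensor * out = ggml_soft_max(ctx, ggml_mul_mat(ctx, a, b));

    ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);

    ggml_backend_t backend = ggml_backend_cpu_init();

    // collect the nodes the backend cannot run, mirroring support_info_graph.ops
    std::vector<ggml_tensor *> unsupported;
    for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
        ggml_tensor * node = ggml_graph_node(gf, i);
        if (!ggml_backend_supports_op(backend, node)) {
            unsupported.push_back(node);
        }
    }

    printf("backend %s: %zu unsupported node(s)\n",
           ggml_backend_name(backend), unsupported.size());
    for (ggml_tensor * node : unsupported) {
        printf("  %s (type = %s)\n", ggml_op_name(node->op), ggml_type_name(node->type));
    }

    ggml_backend_free(backend);
    ggml_free(ctx);
    return 0;
}
```

In the loader the result of this check feeds `support_info_graph`, which drives both the flash-attention fallback and the unsupported-ops warning added in this commit.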
@@ -4576,16 +4608,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         return false; // only support batch size of 1
     }
 
-    if (ggml_backend_sched_get_n_splits(ctx->sched.get()) > 1) {
-        LOG_WRN("%s: *****************************************************************\n", __func__);
-        LOG_WRN("%s: WARNING: the CLIP graph uses unsupported operators by the backend\n", __func__);
-        LOG_WRN("%s:          use GGML_SCHED_DEBUG=2 to determine which ops\n", __func__);
-        LOG_WRN("%s:          the performance will be suboptimal\n", __func__);
-        LOG_WRN("%s: \n", __func__);
-        LOG_WRN("%s: ref: https://github.com/ggml-org/llama.cpp/pull/16837#issuecomment-3461676118\n", __func__);
-        LOG_WRN("%s: *****************************************************************\n", __func__);
-    }
-
     // build the inference graph
     ctx->debug_print_tensors.clear();
     ggml_backend_sched_reset(ctx->sched.get());