@@ -982,7 +982,7 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_
     }
 
     // alloc memory for graph
-    bool ok = ggml_backend_sched_alloc_graph(ctx.sched, gf);
+    bool ok = ggml_backend_sched_alloc_graph(ctx.sched.get(), gf);
     if (!ok) {
         LLAMA_LOG_ERROR("failed to alloc memory for graph\n");
         return -1;
@@ -1064,7 +1064,7 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_
     // compute
     LLAMA_LOG_DEBUG("%s: compute start\n", __func__);
     int64_t t_start = ggml_time_ms();
-    ggml_backend_sched_graph_compute(ctx.sched, gf);
+    ggml_backend_sched_graph_compute(ctx.sched.get(), gf);
 
     // the last node is the embedding tensor
     struct ggml_tensor * output_node = ggml_graph_node(gf, -1);
@@ -1091,6 +1091,92 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_
 ////////////////////////////////////////////////////////////////////////////////////////
 // public API
 
+struct llama_vision_context_params llama_vision_context_default_params() {
+    return {
+        /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
+    };
+}
+
+struct llama_vision_context * llama_vision_init_from_model(const struct llama_model * model, struct llama_vision_context_params params) {
+    if (!model->has_vision) {
+        return nullptr;
+    }
+
+    llama_vision_context * ctx = new llama_vision_context;
+    ctx->model = &model->vit;
+
+    // TODO: this looks ugly, mostly copied from llama.cpp, refactor it in the future
+
+    // init backends
+    {
+        // add CPU backend
+        ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+        if (ctx->backend_cpu == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
+            llama_vision_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.emplace_back(ctx->backend_cpu);
+
+        // create a list of the set_n_threads functions in the backends
+        for (auto & backend : ctx->backends) {
+            ggml_backend_dev_t dev = ggml_backend_get_device(backend.get());
+            ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
+            if (reg) {
+                auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+                ggml_backend_set_n_threads_fn(backend.get(), params.n_threads);
+            }
+        }
+    }
+
+    // scheduler and compute buffers
+    {
+        // buffer types used for the compute buffer of each backend
+        std::vector<ggml_backend_buffer_type_t> backend_buft;
+        std::vector<ggml_backend_t> backend_ptrs;
+        for (auto & backend : ctx->backends) {
+            auto * buft = ggml_backend_get_default_buffer_type(backend.get());
+            auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+            if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
+                // use the host buffer of the first device CPU for faster transfer of the intermediate state
+                auto * dev = model->devices[0];
+                auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
+                if (host_buft) {
+                    buft = host_buft;
+                }
+            }
+            backend_buft.push_back(buft);
+            backend_ptrs.push_back(backend.get());
+        }
+
+        const size_t max_nodes = model->max_nodes();
+
+        // buffer used to store the computation graph and the tensor meta data
+        ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
+
+        // TODO: support pipeline_parallel
+        const bool pipeline_parallel = false;
+
+        ctx->sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel));
+
+        if (pipeline_parallel) {
+            LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched.get()));
+        }
+    }
+
+    const size_t max_nodes = VISION_GRAPH_MAX_NODE; // TODO: make it dynamic
+    ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
+
+    return ctx;
+}
+
+void llama_vision_free(struct llama_vision_context * ctx) {
+    if (ctx->ctx_ggml) {
+        ggml_free(ctx->ctx_ggml);
+    }
+    delete ctx;
+}
+
 struct llama_vision_bitmap * llama_vision_bitmap_init(uint32_t nx, uint32_t ny) {
     llama_vision_bitmap * bmp = new llama_vision_bitmap;
     bmp->nx = nx;
@@ -1105,16 +1191,15 @@ void llama_vision_bitmap_free(llama_vision_bitmap * bmp) {
 }
 
 struct llama_vision_tokens * llama_vision_tokenize(
-        struct llama_context * ctx,
-        llama_vision_bitmap * bmp) {
-    llama_vision_context & vctx = ctx->vctx;
-    switch (vctx.model->hparams.arch) {
+        struct llama_vision_context * ctx,
+        struct llama_vision_bitmap * bmp) {
+    switch (ctx->model->hparams.arch) {
         case LLM_ARCH_VISION_LLAVA:
         case LLM_ARCH_VISION_MOBILEVLM:
         case LLM_ARCH_VISION_IDEFICS3:
-            return new llama_vision_tokens(llama_vision_processor_llava(vctx).tokenize(*bmp));
+            return new llama_vision_tokens(llama_vision_processor_llava(*ctx).tokenize(*bmp));
         case LLM_ARCH_VISION_MINICPMV:
-            return new llama_vision_tokens(llama_vision_processor_llava(vctx).tokenize(*bmp));
+            return new llama_vision_tokens(llama_vision_processor_llava(*ctx).tokenize(*bmp));
         default:
             GGML_ASSERT(false && "unsupported arch");
     }
@@ -1124,19 +1209,18 @@ void llama_vision_tokens_free(llama_vision_tokens * p) {
     delete p;
 }
 
-int32_t llama_vision_encode(struct llama_context * ctx, llama_vision_tokens * p) {
+int32_t llama_vision_encode(struct llama_vision_context * ctx, struct llama_vision_tokens * p) {
     if (p->buf.empty()) {
         LLAMA_LOG_ERROR("%s: nothing to encode\n", __func__);
         return -1;
     }
 
-    llama_vision_context & vctx = ctx->vctx;
-    auto & hparams = vctx.model->hparams;
+    auto & hparams = ctx->model->hparams;
     switch (hparams.mm_patch_merge_type) {
         case MM_PATCH_MERGE_FLAT:
             {
                 // flat / default llava-1.5 type embedding
-                int32_t encoded = llama_vision_encode_impl(vctx, *p);
+                int32_t encoded = llama_vision_encode_impl(*ctx, *p);
                 if (encoded != 0) {
                     LLAMA_LOG_ERROR("Unable to encode image\n");
                     return encoded;
@@ -1154,8 +1238,8 @@ int32_t llama_vision_encode(struct llama_context * ctx, llama_vision_tokens * p)
     return 0;
 }
 
-struct ggml_tensor * llama_vision_get_output_tensor(llama_context * ctx) {
-    return ctx->vctx.output;
+struct ggml_tensor * llama_vision_get_output_tensor(struct llama_vision_context * ctx) {
+    return ctx->output;
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////
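
Illustrative usage sketch (not part of the commit above): how the public vision API introduced in this diff is expected to fit together. It assumes the new declarations are exported through llama.h, that `model` is a vision-capable llama_model loaded elsewhere, and that filling the bitmap's pixel buffer happens outside the snippet; the 336x336 size and the n_threads value are arbitrary placeholders.

// Usage sketch only, not part of this commit. `model` is assumed to be loaded elsewhere.
#include "llama.h"

static bool encode_one_image(const struct llama_model * model) {
    struct llama_vision_context_params vparams = llama_vision_context_default_params();
    vparams.n_threads = 4; // placeholder; override the default as needed

    // returns nullptr if the model has no vision tower or backend init fails
    struct llama_vision_context * vctx = llama_vision_init_from_model(model, vparams);
    if (vctx == nullptr) {
        return false;
    }

    // allocate a 336x336 bitmap; filling its pixel data is omitted here
    struct llama_vision_bitmap * bmp = llama_vision_bitmap_init(336, 336);

    // preprocess the image into patches, then run the vision encoder
    struct llama_vision_tokens * img_tokens = llama_vision_tokenize(vctx, bmp);
    bool ok = img_tokens != nullptr && llama_vision_encode(vctx, img_tokens) == 0;
    if (ok) {
        struct ggml_tensor * embd = llama_vision_get_output_tensor(vctx);
        (void) embd; // hand the image embeddings to the language model
    }

    llama_vision_tokens_free(img_tokens);
    llama_vision_bitmap_free(bmp);
    llama_vision_free(vctx);
    return ok;
}

The sketch follows the pairing visible in the diff: every init/tokenize call has a matching free, and the output tensor is fetched from the vision context rather than allocated by the caller.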