@@ -63,7 +63,7 @@ static void sigint_handler(int signo) {
6363#endif
6464
6565struct mtmd_cli_context {
66- mtmd_context_ptr ctx_vision;
66+ mtmd::context_ptr ctx_vision;
6767 common_init_result llama_init;
6868
6969 llama_model * model;
@@ -72,7 +72,7 @@ struct mtmd_cli_context {
7272 llama_batch batch;
7373 int n_batch;
7474
75- std::vector<mtmd_bitmap> bitmaps;
75+ mtmd::bitmaps bitmaps;
7676
7777 // note: we know that gemma3 template is "linear", meaning each turn is completely separated to another
7878 // so here we don't need to keep track of chat history
@@ -115,12 +115,12 @@ struct mtmd_cli_context {
115115
116116 void init_vision_context (common_params & params) {
117117 const char * clip_path = params.mmproj .path .c_str ();
118- ctx_vision. reset ( mtmd_init_from_file (clip_path, model, mtmd_context_params{
119- /* use_gpu */ params.mmproj_use_gpu ,
120- /* timings */ true ,
121- /* n_threads */ params.cpuparams .n_threads ,
122- /* verbosity */ params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO,
123- } ));
118+ mtmd_context_params mparams = mtmd_context_params_default ();
119+ mparams. use_gpu = params.mmproj_use_gpu ;
120+ mparams. print_timings = true ;
121+ mparams. n_threads = params.cpuparams .n_threads ;
122+ mparams. verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
123+ ctx_vision. reset ( mtmd_init_from_file (clip_path, model, mparams ));
124124 if (!ctx_vision.get ()) {
125125 LOG_ERR (" Failed to load vision model from %s\n " , clip_path);
126126 exit (1 );
@@ -139,11 +139,11 @@ struct mtmd_cli_context {
139139 }
140140
141141 bool load_image (const std::string & fname) {
142- mtmd_bitmap bitmap;
143- if (mtmd_helper_bitmap_init_from_file (fname. c_str (), bitmap) ) {
142+ mtmd:: bitmap bmp ( mtmd_helper_bitmap_init_from_file (fname. c_str ())) ;
143+ if (!bmp. ptr ) {
144144 return false ;
145145 }
146- bitmaps.push_back (std::move (bitmap ));
146+ bitmaps.entries . push_back (std::move (bmp ));
147147 return true ;
148148 }
149149};
@@ -193,27 +193,40 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_
193193 LOG_DBG (" formatted_chat.prompt: %s\n " , formatted_chat.prompt .c_str ());
194194
195195 mtmd_input_text text;
196- text.text = formatted_chat.prompt ;
196+ text.text = formatted_chat.prompt . c_str () ;
197197 text.add_special = add_bos;
198198 text.parse_special = true ;
199- mtmd_input_chunks chunks;
200199
201200 if (g_is_interrupted) return 0 ;
202201
203- int32_t res = mtmd_tokenize (ctx.ctx_vision .get (), chunks, text, ctx.bitmaps );
202+ mtmd::input_chunks chunks (mtmd_input_chunks_init ());
203+ auto bitmaps_c_ptr = ctx.bitmaps .c_ptr ();
204+ int32_t res = mtmd_tokenize (ctx.ctx_vision .get (),
205+ chunks.ptr .get (), // output
206+ &text, // text
207+ bitmaps_c_ptr.data (),
208+ bitmaps_c_ptr.size ());
204209 if (res != 0 ) {
205210 LOG_ERR (" Unable to tokenize prompt, res = %d\n " , res);
206211 return 1 ;
207212 }
208213
209- ctx.bitmaps .clear ();
210-
211- if (mtmd_helper_eval (ctx.ctx_vision .get (), ctx.lctx , chunks, ctx.n_past , 0 , ctx.n_batch )) {
214+ ctx.bitmaps .entries .clear ();
215+
216+ llama_pos new_n_past;
217+ if (mtmd_helper_eval_chunks (ctx.ctx_vision .get (),
218+ ctx.lctx , // lctx
219+ chunks.ptr .get (), // chunks
220+ ctx.n_past , // n_past
221+ 0 , // seq_id
222+ ctx.n_batch , // n_batch
223+ true , // logits_last
224+ &new_n_past)) {
212225 LOG_ERR (" Unable to eval prompt\n " );
213226 return 1 ;
214227 }
215228
216- ctx.n_past += mtmd_helper_get_n_pos (chunks) ;
229+ ctx.n_past = new_n_past ;
217230
218231 LOG (" \n " );
219232
@@ -246,7 +259,7 @@ int main(int argc, char ** argv) {
246259 struct common_sampler * smpl = common_sampler_init (ctx.model , params.sampling );
247260 int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict ;
248261
249- // ctrl +C handling
262+ // Ctrl +C handling
250263 {
251264#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
252265 struct sigaction sigint_action;
0 commit comments