@@ -148,19 +148,34 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
148148 process_eval_image_embed (ctx_llava, embeds, params->n_batch , &n_past, idx++);
149149 eval_string (ctx_llava->ctx_llama , std::string (" </image>" ).c_str (), params->n_batch , &n_past, false );
150150 if (num_image_embeds > 1 ) {
151- size_t num_image_embeds_col = clip_uhd_num_image_embeds_col (ctx_llava->ctx_clip );
152- eval_string (ctx_llava->ctx_llama , std::string (" <slice>" ).c_str (), params->n_batch , &n_past, false );
153- for (size_t i = 0 ; i < (num_image_embeds-1 )/num_image_embeds_col; ++i) {
154- for (size_t j = 0 ; j < num_image_embeds_col; ++j) {
155- eval_string (ctx_llava->ctx_llama , std::string (" <image>" ).c_str (), params->n_batch , &n_past, false );
156- process_eval_image_embed (ctx_llava, embeds, params->n_batch , &n_past, idx++);
157- eval_string (ctx_llava->ctx_llama , std::string (" </image>" ).c_str (), params->n_batch , &n_past, false );
158- if (j == num_image_embeds_col - 1 ) {
159- eval_string (ctx_llava->ctx_llama , std::string (" \n " ).c_str (), params->n_batch , &n_past, false );
151+ if (has_minicpmv_projector == 2 ) {
152+ size_t num_image_embeds_col = clip_uhd_num_image_embeds_col (ctx_llava->ctx_clip );
153+ eval_string (ctx_llava->ctx_llama , std::string (" <slice>" ).c_str (), params->n_batch , &n_past, false );
154+ for (size_t i = 0 ; i < (num_image_embeds-1 )/num_image_embeds_col; ++i) {
155+ for (size_t j = 0 ; j < num_image_embeds_col; ++j) {
156+ eval_string (ctx_llava->ctx_llama , std::string (" <image>" ).c_str (), params->n_batch , &n_past, false );
157+ process_eval_image_embed (ctx_llava, embeds, params->n_batch , &n_past, idx++);
158+ eval_string (ctx_llava->ctx_llama , std::string (" </image>" ).c_str (), params->n_batch , &n_past, false );
159+ if (j == num_image_embeds_col - 1 ) {
160+ eval_string (ctx_llava->ctx_llama , std::string (" \n " ).c_str (), params->n_batch , &n_past, false );
161+ }
162+ }
163+ }
164+ eval_string (ctx_llava->ctx_llama , std::string (" </slice>" ).c_str (), params->n_batch , &n_past, false );
165+ }
166+ else if (has_minicpmv_projector == 3 || has_minicpmv_projector == 4 ) {
167+ size_t num_image_embeds_col = clip_uhd_num_image_embeds_col (ctx_llava->ctx_clip );
168+ for (size_t i = 0 ; i < (num_image_embeds-1 )/num_image_embeds_col; ++i) {
169+ for (size_t j = 0 ; j < num_image_embeds_col; ++j) {
170+ eval_string (ctx_llava->ctx_llama , std::string (" <slice>" ).c_str (), params->n_batch , &n_past, false );
171+ process_eval_image_embed (ctx_llava, embeds, params->n_batch , &n_past, idx++);
172+ eval_string (ctx_llava->ctx_llama , std::string (" </slice>" ).c_str (), params->n_batch , &n_past, false );
173+ if (j == num_image_embeds_col - 1 ) {
174+ eval_string (ctx_llava->ctx_llama , std::string (" \n " ).c_str (), params->n_batch , &n_past, false );
175+ }
160176 }
161177 }
162178 }
163- eval_string (ctx_llava->ctx_llama , std::string (" </slice>" ).c_str (), params->n_batch , &n_past, false );
164179 }
165180 LOG_INF (" %s: image token past: %d\n " , __func__, n_past);
166181}
0 commit comments