705 changes: 572 additions & 133 deletions examples/xgenmm/clip.cpp

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion examples/xgenmm/clip.h
@@ -85,8 +85,12 @@ CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_ima
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);

CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
CLIP_API bool clip_image_encode_vit (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
// CLIP_API bool clip_image_encode_tokenizer(struct clip_ctx * ctx, const int n_threads, float * image_embd_v_m, float * image_embd_v_m_mask, float * image_embd);
CLIP_API bool clip_image_encode_tokenizer(struct clip_ctx * ctx, int batch_size, ggml_tensor *img_embeddings, ggml_tensor *attn_bias_input, float * image_embd);
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);

CLIP_API bool clip_image_batch_encode_vit(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
CLIP_API bool clip_image_batch_encode_tokenizer(struct clip_ctx * ctx, const int n_threads, float * image_embd_v_m, float * image_embd_v_m_mask, float * image_embd);
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);

CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
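
The declarations above suggest a split encode path: run the SigLIP ViT first, then project its output through the perceiver resampler ("tokenizer") stage. A minimal sketch of how a caller might wire the batch variants together; this is an assumption drawn from the signatures only, and every buffer size below is an illustrative placeholder rather than a value from this PR:

    #include <vector>
    #include "clip.h"

    // Hypothetical driver for the two-stage encode declared in clip.h (sketch, not part of the diff).
    static void encode_split_path(clip_ctx * ctx_clip, const clip_image_f32_batch * imgs, int n_threads) {
        std::vector<float> vit_out(5 * 729 * 1152);     // placeholder: patches x ViT tokens x ViT hidden dim
        std::vector<float> attn_mask(5 * 729, 1.0f);    // placeholder attention mask over ViT tokens
        std::vector<float> image_embd(128 * 3072);      // placeholder: query tokens x LLM embedding dim

        // Stage 1: ViT features for the batch of image patches.
        clip_image_batch_encode_vit(ctx_clip, n_threads, imgs, vit_out.data());
        // Stage 2: perceiver/tokenizer projection into LLM-ready image embeddings.
        clip_image_batch_encode_tokenizer(ctx_clip, n_threads, vit_out.data(), attn_mask.data(), image_embd.data());
    }
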
Binary file added examples/xgenmm/imgs/5patches_embeddings.pt
Binary file not shown.
Binary file added examples/xgenmm/imgs/attention_mask_4patches.pt
Binary file not shown.
Binary file added examples/xgenmm/imgs/attention_mask_5patches.pt
Binary file not shown.
Binary file not shown.
22 changes: 18 additions & 4 deletions examples/xgenmm/run_cli.sh
@@ -2,8 +2,22 @@

make xgenmm-cli

./xgenmm-cli -m /export/share/llamacpp_models/MiniCPM-Llama3-V-2_5/ggml-model-Q4_K_M.gguf \
# ./xgenmm-cli -m /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
# --mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
# -c 4096 --temp 0.01 --repeat-penalty 1.05 \
# --image /export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9-1.jpg \
# -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. \nThe assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image>\nWhat is the color of this notebook?<|end|>\n<|assistant|>\n"


# ./xgenmm-cli -m /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
# --mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
# -c 4096 --temp 0 --num_beams 1 \
# --image /export/home/on-device-mm/notebooks/open-flamingo/imgs/receipt.jpg \
# -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. \nThe assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image> Describe this image.<|end|>\n<|assistant|>\n"


./xgenmm-cli -m /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
--mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
-c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 \
--image /export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9-1.jpg \
-p "What is in the image?"
-c 4096 --temp 0.01 --repeat-penalty 1.05 \
--image /export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9.jpg\
-p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. \nThe assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image>\n How many objects are there in this image?<|end|>\n<|assistant|>\n"
167 changes: 129 additions & 38 deletions examples/xgenmm/xgenmm-cli.cpp
@@ -181,41 +181,127 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
return ret.c_str();
}

// static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
// auto ctx_clip = clip_init_context(params);
// auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
// if (!embeds) {
// std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
// return NULL;
// }

// // process the prompt
// if (params->prompt.empty() && params->interactive == false) {
// LOG_TEE("prompt should be given or interactive mode should be on");
// return NULL;
// }

// auto model = llava_init(params);
// if (model == NULL) {
// fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
// return NULL;
// }
// const int64_t t_llava_init_start_us = ggml_time_us();
// auto ctx_llava = llava_init_context(params, model);
// ctx_llava->ctx_clip = ctx_clip;
// const int64_t t_llava_init_end_us = ggml_time_us();
// float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
// LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);

// const int64_t t_process_image_start_us = ggml_time_us();
// process_image(ctx_llava, embeds, params, n_past);
// const int64_t t_process_image_end_us = ggml_time_us();
// float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
// LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);

// llava_image_embed_free(embeds);
// return ctx_llava;
// }
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
auto ctx_clip = clip_init_context(params);
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
if (!embeds) {
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
return NULL;
}

// process the prompt
if (params->prompt.empty() && params->interactive == false) {
LOG_TEE("prompt should be given or interactive mode should be on");
return NULL;
}

auto model = llava_init(params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
return NULL;
}
const int64_t t_llava_init_start_us = ggml_time_us();
auto ctx_llava = llava_init_context(params, model);
ctx_llava->ctx_clip = ctx_clip;
const int64_t t_llava_init_end_us = ggml_time_us();
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);

const int64_t t_process_image_start_us = ggml_time_us();
process_image(ctx_llava, embeds, params, n_past);
const int64_t t_process_image_end_us = ggml_time_us();
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);

llava_image_embed_free(embeds);
return ctx_llava;
}

static void process_prompt(struct llava_context *ctx_llava, struct llava_image_embed *image_embed, gpt_params *params,
const std::string &prompt)
{
int n_past = 0;

const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;

std::string system_prompt, user_prompt;
size_t image_pos = prompt.find("<image>");
if (image_pos != std::string::npos)
{
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for
// the image
system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
if (params->verbose_prompt)
{
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
for (int i = 0; i < (int)tmp.size(); i++)
{
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt)
{
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int)tmp.size(); i++)
{
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
}
else
{
// llava-1.5 native mode
system_prompt =
"A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, "
"detailed, and polite answers to the human's questions.\nUSER:";
user_prompt = prompt + "\nASSISTANT:";
if (params->verbose_prompt)
{
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int)tmp.size(); i++)
{
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
}

eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);

// generate the response

LOG_TEE("\n");

struct llama_sampling_context *ctx_sampling = llama_sampling_init(params->sparams);
if (!ctx_sampling)
{
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
}

std::string response = "";
for (int i = 0; i < max_tgt_len; i++)
{
const char *tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>"))
break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6

fflush(stdout);
}

llama_sampling_free(ctx_sampling);
printf("\n");
}
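
For orientation (not part of the diff): the <image> templating branch above is what the updated run_cli.sh prompt exercises. A sketch of the call, assuming ctx_llava, embeds, and a gpt_params value named params are set up as in xgenmm_init below; the prompt string is abbreviated from run_cli.sh:

    // Everything before "<image>" is evaluated as the system prompt, the image
    // embedding is inserted at the placeholder, and the remainder becomes the user turn.
    const std::string prompt =
        "<|system|>\nA chat between a curious user and an artificial intelligence assistant.<|end|>\n"
        "<|user|>\n<image>\nWhat is in the image?<|end|>\n<|assistant|>\n";
    process_prompt(ctx_llava, embeds, &params, prompt);
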

static struct llava_context * xgenmm_init(gpt_params * params, const std::string & fname, int &n_past){
auto ctx_clip = clip_init_context(params);
@@ -226,8 +312,8 @@ static struct llava_context * xgenmm_init(gpt_params * params, const std::string
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
return NULL;
}
std::cout<< "Start Processing Prompt" << std::endl;
exit(1);
std::cout<< "Start Processing Prompt: " << std::endl;
// TODO:
// process the prompt
if (params->prompt.empty() && params->interactive == false) {
LOG_TEE("prompt should be given or interactive mode should be on");
Expand All @@ -247,7 +333,8 @@ static struct llava_context * xgenmm_init(gpt_params * params, const std::string
LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);

const int64_t t_process_image_start_us = ggml_time_us();
process_image(ctx_llava, embeds, params, n_past);
process_prompt(ctx_llava, embeds, params, params->prompt);
// process_image(ctx_llava, embeds, params, n_past);
const int64_t t_process_image_end_us = ggml_time_us();
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
@@ -291,6 +378,8 @@ static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sam
return tmp;
}



int main(int argc, char ** argv) {
ggml_time_init();

@@ -319,6 +408,8 @@ int main(int argc, char ** argv) {
// auto ctx_llava = minicpmv_init(&params, image, n_past);
auto ctx_llava = xgenmm_init(&params, image, n_past); // generate vision tokens
std::cout << "Start llava generation: " << std::endl;
llama_print_timings(ctx_llava->ctx_llama);

// // TODO: integrate base llm
// if (!params.prompt.empty()) {
// LOG_TEE("<user>%s\n", params.prompt.c_str());