705 changes: 572 additions & 133 deletions examples/xgenmm/clip.cpp

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion examples/xgenmm/clip.h
@@ -85,8 +85,12 @@ CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_ima
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);

CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
CLIP_API bool clip_image_encode_vit (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
// CLIP_API bool clip_image_encode_tokenizer(struct clip_ctx * ctx, const int n_threads, float * image_embd_v_m, float * image_embd_v_m_mask, float * image_embd);
CLIP_API bool clip_image_encode_tokenizer(struct clip_ctx * ctx, int batch_size, ggml_tensor *img_embeddings, ggml_tensor *attn_bias_input, float * image_embd);
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);

CLIP_API bool clip_image_batch_encode_vit(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
CLIP_API bool clip_image_batch_encode_tokenizer(struct clip_ctx * ctx, const int n_threads, float * image_embd_v_m, float * image_embd_v_m_mask, float * image_embd);
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);

CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
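
The declarations above suggest a split encode path: run the SigLIP ViT first, then project its output through the perceiver resampler ("tokenizer") stage. A minimal sketch of how a caller might wire the batch variants together; this is an assumption drawn from the signatures only, and every buffer size below is an illustrative placeholder rather than a value from this PR:

    #include <vector>
    #include "clip.h"

    // Hypothetical driver for the two-stage encode declared in clip.h (sketch, not part of the diff).
    static void encode_split_path(clip_ctx * ctx_clip, const clip_image_f32_batch * imgs, int n_threads) {
        std::vector<float> vit_out(5 * 729 * 1152);     // placeholder: patches x ViT tokens x ViT hidden dim
        std::vector<float> attn_mask(5 * 729, 1.0f);    // placeholder attention mask over ViT tokens
        std::vector<float> image_embd(128 * 3072);      // placeholder: query tokens x LLM embedding dim

        // Stage 1: ViT features for the batch of image patches.
        clip_image_batch_encode_vit(ctx_clip, n_threads, imgs, vit_out.data());
        // Stage 2: perceiver/tokenizer projection into LLM-ready image embeddings.
        clip_image_batch_encode_tokenizer(ctx_clip, n_threads, vit_out.data(), attn_mask.data(), image_embd.data());
    }
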
Binary file added examples/xgenmm/imgs/5patches_embeddings.pt
Binary file not shown.
Binary file added examples/xgenmm/imgs/attention_mask_4patches.pt
Binary file not shown.
Binary file added examples/xgenmm/imgs/attention_mask_5patches.pt
Binary file not shown.
Binary file not shown.
22 changes: 18 additions & 4 deletions examples/xgenmm/run_cli.sh
@@ -2,8 +2,22 @@

make xgenmm-cli

./xgenmm-cli -m /export/share/llamacpp_models/MiniCPM-Llama3-V-2_5/ggml-model-Q4_K_M.gguf \
# ./xgenmm-cli -m /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
# --mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
# -c 4096 --temp 0.01 --repeat-penalty 1.05 \
# --image /export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9-1.jpg \
# -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. \nThe assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image>\nWhat is the color of this notebook?<|end|>\n<|assistant|>\n"


# ./xgenmm-cli -m /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
# --mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
# -c 4096 --temp 0 --num_beams 1 \
# --image /export/home/on-device-mm/notebooks/open-flamingo/imgs/receipt.jpg \
# -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. \nThe assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image> Describe this image.<|end|>\n<|assistant|>\n"


./xgenmm-cli -m /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
--mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
-c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 \
--image /export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9-1.jpg \
-p "What is in the image?"
-c 4096 --temp 0.01 --repeat-penalty 1.05 \
--image /export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9.jpg\
-p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. \nThe assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image>\n How many objects are there in this image?<|end|>\n<|assistant|>\n"
167 changes: 129 additions & 38 deletions examples/xgenmm/xgenmm-cli.cpp
@@ -181,41 +181,127 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
return ret.c_str();
}

// static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
// auto ctx_clip = clip_init_context(params);
// auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
// if (!embeds) {
// std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
// return NULL;
// }

// // process the prompt
// if (params->prompt.empty() && params->interactive == false) {
// LOG_TEE("prompt should be given or interactive mode should be on");
// return NULL;
// }

// auto model = llava_init(params);
// if (model == NULL) {
// fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
// return NULL;
// }
// const int64_t t_llava_init_start_us = ggml_time_us();
// auto ctx_llava = llava_init_context(params, model);
// ctx_llava->ctx_clip = ctx_clip;
// const int64_t t_llava_init_end_us = ggml_time_us();
// float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
// LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);

// const int64_t t_process_image_start_us = ggml_time_us();
// process_image(ctx_llava, embeds, params, n_past);
// const int64_t t_process_image_end_us = ggml_time_us();
// float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
// LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);

// llava_image_embed_free(embeds);
// return ctx_llava;
// }
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
auto ctx_clip = clip_init_context(params);
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
if (!embeds) {
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
return NULL;
}

// process the prompt
if (params->prompt.empty() && params->interactive == false) {
LOG_TEE("prompt should be given or interactive mode should be on");
return NULL;
}

auto model = llava_init(params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
return NULL;
}
const int64_t t_llava_init_start_us = ggml_time_us();
auto ctx_llava = llava_init_context(params, model);
ctx_llava->ctx_clip = ctx_clip;
const int64_t t_llava_init_end_us = ggml_time_us();
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);

const int64_t t_process_image_start_us = ggml_time_us();
process_image(ctx_llava, embeds, params, n_past);
const int64_t t_process_image_end_us = ggml_time_us();
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);

llava_image_embed_free(embeds);
return ctx_llava;
}

static void process_prompt(struct llava_context *ctx_llava, struct llava_image_embed *image_embed, gpt_params *params,
const std::string &prompt)
{
int n_past = 0;

const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;

std::string system_prompt, user_prompt;
size_t image_pos = prompt.find("<image>");
if (image_pos != std::string::npos)
{
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for
// the image
system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
if (params->verbose_prompt)
{
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
for (int i = 0; i < (int)tmp.size(); i++)
{
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt)
{
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int)tmp.size(); i++)
{
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
}
else
{
// llava-1.5 native mode
system_prompt =
"A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, "
"detailed, and polite answers to the human's questions.\nUSER:";
user_prompt = prompt + "\nASSISTANT:";
if (params->verbose_prompt)
{
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int)tmp.size(); i++)
{
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
}

eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);

// generate the response

LOG_TEE("\n");

struct llama_sampling_context *ctx_sampling = llama_sampling_init(params->sparams);
if (!ctx_sampling)
{
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
}

std::string response = "";
for (int i = 0; i < max_tgt_len; i++)
{
const char *tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>"))
break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6

fflush(stdout);
}

llama_sampling_free(ctx_sampling);
printf("\n");
}
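
For orientation (not part of the diff): the <image> templating branch above is what the updated run_cli.sh prompt exercises. A sketch of the call, assuming ctx_llava, embeds, and a gpt_params value named params are set up as in xgenmm_init below; the prompt string is abbreviated from run_cli.sh:

    // Everything before "<image>" is evaluated as the system prompt, the image
    // embedding is inserted at the placeholder, and the remainder becomes the user turn.
    const std::string prompt =
        "<|system|>\nA chat between a curious user and an artificial intelligence assistant.<|end|>\n"
        "<|user|>\n<image>\nWhat is in the image?<|end|>\n<|assistant|>\n";
    process_prompt(ctx_llava, embeds, &params, prompt);
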

static struct llava_context * xgenmm_init(gpt_params * params, const std::string & fname, int &n_past){
auto ctx_clip = clip_init_context(params);
@@ -226,8 +312,8 @@ static struct llava_context * xgenmm_init(gpt_params * params, const std::string
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
return NULL;
}
std::cout<< "Start Processing Prompt" << std::endl;
exit(1);
std::cout<< "Start Processing Prompt: " << std::endl;
// TODO:
// process the prompt
if (params->prompt.empty() && params->interactive == false) {
LOG_TEE("prompt should be given or interactive mode should be on");
Expand All @@ -247,7 +333,8 @@ static struct llava_context * xgenmm_init(gpt_params * params, const std::string
LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);

const int64_t t_process_image_start_us = ggml_time_us();
process_image(ctx_llava, embeds, params, n_past);
process_prompt(ctx_llava, embeds, params, params->prompt);
// process_image(ctx_llava, embeds, params, n_past);
const int64_t t_process_image_end_us = ggml_time_us();
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
@@ -291,6 +378,8 @@ static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sam
return tmp;
}



int main(int argc, char ** argv) {
ggml_time_init();

@@ -319,6 +408,8 @@ int main(int argc, char ** argv) {
// auto ctx_llava = minicpmv_init(&params, image, n_past);
auto ctx_llava = xgenmm_init(&params, image, n_past); // generate vision tokens
std::cout << "Start llava generation: " << std::endl;
llama_print_timings(ctx_llava->ctx_llama);

// // TODO: integrate base llm
// if (!params.prompt.empty()) {
// LOG_TEE("<user>%s\n", params.prompt.c_str());