4 changes: 2 additions & 2 deletions common/arg.cpp
@@ -1406,14 +1406,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.mmproj = value;
}
).set_examples({LLAMA_EXAMPLE_LLAVA}));
).set_examples({LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_COGAGENT}));
add_opt(common_arg(
{"--image"}, "FILE",
"path to an image file. use with multimodal models. Specify multiple times for batching",
[](common_params & params, const std::string & value) {
params.image.emplace_back(value);
}
).set_examples({LLAMA_EXAMPLE_LLAVA}));
).set_examples({LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_VISION, LLAMA_EXAMPLE_COGAGENT}));
if (llama_supports_rpc()) {
add_opt(common_arg(
{"--rpc"}, "SERVERS",
2 changes: 2 additions & 0 deletions common/common.h
@@ -80,6 +80,8 @@ enum llama_example {
LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,
LLAMA_EXAMPLE_VISION,
LLAMA_EXAMPLE_COGAGENT,

LLAMA_EXAMPLE_COUNT,
};
383 changes: 371 additions & 12 deletions convert_hf_to_gguf.py

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions examples/CMakeLists.txt
@@ -18,6 +18,7 @@ if (EMSCRIPTEN)
else()
add_subdirectory(batched-bench)
add_subdirectory(batched)
add_subdirectory(cogagent)
add_subdirectory(embedding)
add_subdirectory(eval-callback)

@@ -53,6 +54,7 @@ else()
add_subdirectory(tokenize)
add_subdirectory(tts)
add_subdirectory(gen-docs)
add_subdirectory(vision)
if (NOT GGML_BACKEND_DL)
# these examples use the backends directly and cannot be built with dynamic loading
add_subdirectory(convert-llama2c-to-ggml)
18 changes: 18 additions & 0 deletions examples/cogagent/CMakeLists.txt
@@ -0,0 +1,18 @@
set(TARGET llama-cogagent-cli)
add_executable(${TARGET} cogagent-cli.cpp)
add_library(cogagent OBJECT
vision_encoder.cpp
vision_encoder.h
cross_vision.cpp
cross_vision.h
cogagent_util.cpp
cogagent_util.h
image_util.cpp
image_util.h)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-cogagent-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common cogagent ggml ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(cogagent PUBLIC ../../ggml/include)
target_include_directories(cogagent PUBLIC ../../include)
target_include_directories(cogagent PUBLIC ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
285 changes: 285 additions & 0 deletions examples/cogagent/cogagent-cli.cpp
@@ -0,0 +1,285 @@
#include "arg.h"
#include "base64.hpp"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "llama.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>

#include "cogagent.h"

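// Global state shared between the vision encoders and the language decoder.
// The cross-vision output tensor stored here is attached to every decode
// batch via this fork's batch.cross_embd field.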
cogagent_ctx cogagent_global;

// This function is mostly copied from the llava-cli example
static bool eval_string_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
int N = (int) tokens.size();

// Process the input tokens in batches
for (int i = 0; i < N; i += n_batch) {
int n_eval = N - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}

std::vector<llama_pos> pos(n_eval);
for (int j = 0; j < n_eval; j++) {
pos[j] = *n_past + j;
}

// llama_batch_get_one leaves pos unset, so supply explicit positions;
// cross_embd carries the cross-vision image features into every batch.
llama_batch batch = llama_batch_get_one(&tokens[i], n_eval);
batch.cross_embd = cogagent_global.cross_vision_image_tensor;
batch.pos = pos.data();
if (llama_decode(ctx_llama, batch)) {
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
}
return true;
}

static bool eval_image_tokens(llama_context * ctx_llama, std::vector<float> &img_data,
int n_batch, int * n_past) {
const int n_embd = 4096; // embedding size of the CogAgent decoder
const int num_tokens = 258; // <boi> + 256 image embeddings + <eoi>
int positions[258];

// The <boi> embedding sits at position n_past, all 256 image embeddings
// share the single position n_past + 1, and <eoi> sits at n_past + 2.
positions[0] = *n_past;
for (int i = 0; i < num_tokens - 2; i++) {
positions[i + 1] = *n_past + 1;
}
positions[num_tokens - 1] = *n_past + 2;

float * data_ptr = img_data.data();

for (int i = 0; i < num_tokens; i += n_batch) {
int n_eval = num_tokens - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
// Hand-built embedding batch: n_tokens, token, embd, pos, the rest unset
llama_batch batch = {int32_t(n_eval), nullptr, data_ptr, positions + i, nullptr, nullptr, nullptr, nullptr, nullptr, };
batch.cross_embd = cogagent_global.cross_vision_image_tensor;
if (llama_decode(ctx_llama, batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
data_ptr += n_eval * n_embd;
}
// The image block only occupies three distinct positions
*n_past += 3;
return true;
}

static void print_usage(int, char ** argv) {
LOG("\n example usage:\n");
LOG("\n %s -m <cogagent-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <cogagent-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}

static const char * sample(struct common_sampler * smpl,
struct llama_context * ctx_llama,
int * n_past) {
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
common_sampler_accept(smpl, id, true);

const llama_model * model = llama_get_model(ctx_llama);
const llama_vocab * vocab = llama_model_get_vocab(model);

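// ret is static so the C string returned below stays valid after this
// function returns; it is overwritten on the next call.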
static std::string ret;
if (llama_vocab_is_eog(vocab, id)) {
ret = "</s>";
} else {
ret = common_token_to_piece(ctx_llama, id);
}
// Feed the sampled token back to the decoder so its key/value pairs
// are added to the KV cache before the next sampling step.
std::vector<llama_token> tokens;
tokens.push_back(id);
eval_string_tokens(ctx_llama, tokens, 1, n_past);

return ret.c_str();
}

static bool run_vision_encoders(const char* vision_encoder_path, const char* image_path) {
// Load image and resize for the encoders
std::vector<float> small_image_data; // For vision encoder
std::vector<float> large_image_data; // For cross vision encoder
if (!load_and_stretch_image(image_path, cogagent_global.vision_encoder_img_size,
small_image_data, cogagent_global.norm_mean, cogagent_global.norm_deviation)) {
printf("Failed to load the specified image file.\n");
return false;
}
if (!load_and_stretch_image(image_path, cogagent_global.cross_vision_img_size,
large_image_data, cogagent_global.norm_mean, cogagent_global.norm_deviation)) {
printf("Failed to load the specified image file.\n");
return false;
}

// For debugging: dump the preprocessed inputs so they can be compared
// against reference tensors from the original implementation.
const char * vision_encoder_resized_image = "cogagent_encoders/llama_vision_encoder_input.gguf";
int dims[3] = {cogagent_global.vision_encoder_img_size,
cogagent_global.vision_encoder_img_size, 3};
save_tensor_from_data(small_image_data, dims, vision_encoder_resized_image);
const char * cross_vision_resized_image = "cogagent_encoders/llama_cross_vision_input.gguf";
dims[0] = cogagent_global.cross_vision_img_size;
dims[1] = cogagent_global.cross_vision_img_size;
save_tensor_from_data(large_image_data, dims, cross_vision_resized_image);

printf("Loaded and resized the specified image.\n");

// Load the vision encoder weights
if (!vision_encoder_init_load(vision_encoder_path)) {
printf("Failed to load vision encoder model file.\n");
return false;
}
printf("Vision encoder weights loaded.\n");

// Run the vision encoder
run_vision_encoder(small_image_data);
printf("Completed vision encoder run on image file.\n");

free_vision_encoder_ctx();

// Load and run the cross vision encoder
if (!cross_vision_init_load(vision_encoder_path)) {
printf("Failed to load cross vision encoder model file.\n");
return false;
}
printf("Cross vision encoder weights loaded.\n");

run_cross_vision(large_image_data);
printf("Completed cross vision encoder run on image file.\n");

free_cross_vision_ctx();
return true;
}

int main(int argc, char ** argv) {
ggml_time_init();
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COGAGENT, print_usage)) {
return 1;
}
common_init();

llama_backend_init();
llama_numa_init(params.numa);

// Initialize a GGML context to store the encoded image tensors
struct ggml_init_params token_ctx_params = {
/*.mem_size   =*/ size_t(40000000),
/*.mem_buffer =*/ NULL,
/*.no_alloc   =*/ false,
};
cogagent_global.token_ctx = ggml_init(token_ctx_params);
if (!cogagent_global.token_ctx) {
printf("Failed to initialize token storage context.\n");
return 1;
}
// Allocate the tensor for cross vision encoded image
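// The cross module output is 6400 positions x 1024 dims; with the 1120px
// input this corresponds to a 14px patch grid: (1120/14)^2 = 6400.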
cogagent_global.cross_vision_image_tensor = ggml_new_tensor_2d(
cogagent_global.token_ctx, GGML_TYPE_F32, 1024, 6400
);

// Load the images and the encoder models, then run the encoders
if (params.image.empty()) {
printf("No image supplied. Use --image <path/to/image>.\n");
return 1;
}
if (!run_vision_encoders(params.mmproj.c_str(), params.image[0].c_str())) {
return 1;
}

llama_model_params model_params = common_model_params_to_llama(params);
llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
if (model == nullptr) {
printf("Failed to load decoder model\n");
return 1;
}

llama_context_params ctx_params = common_context_params_to_llama(params);
printf("Context size is %d tokens\n", ctx_params.n_ctx);
llama_context * ctx_llama = llama_init_from_model(model, ctx_params);

if (ctx_llama == nullptr) {
printf("Failed to create the llama context\n");
return 1;
}

cogagent_global.ctx_llama = ctx_llama;
cogagent_global.cogvlm_model = model;

// Note on the KV cache: its tensors are allocated in dedicated backend
// buffers when the context is created. The compute graph that is rebuilt
// for each batch only references those tensors (a worst-case graph is
// reserved up front), so cached keys/values persist across llama_decode calls.

// TODO: Check if system prompt is compatible
std::vector<llama_token> begin_token;
const llama_vocab * vocab = llama_model_get_vocab(cogagent_global.cogvlm_model);
begin_token.push_back(llama_vocab_bos(vocab));

int n_past = 0;
printf("Run model with bos token.\n");
eval_string_tokens(cogagent_global.ctx_llama,
begin_token, params.n_batch, &n_past);
printf("Run model with image tokens.\n");
eval_image_tokens(cogagent_global.ctx_llama, cogagent_global.vision_encoder_image,
params.n_batch, &n_past);
// Tokenize the user prompt. The third argument (add_special) is false so
// the tokenizer does not add BOS/EOS tokens around the prompt.
std::vector<llama_token> user_prompt_tokens = common_tokenize(
cogagent_global.ctx_llama, params.prompt, false, true
);
printf("Run model with user entered text tokens.\n");
eval_string_tokens(cogagent_global.ctx_llama, user_prompt_tokens,
params.n_batch, &n_past);

printf("Parsed maximum sampling length %d.\n", params.n_predict);
int max_len = params.n_predict < 0 ? 256 : params.n_predict;

struct common_sampler * smpl = common_sampler_init(cogagent_global.cogvlm_model, params.sampling);
if (!smpl) {
printf("Failed to initialize sampler.\n");
return 1;
}
printf("\nReprinting entered prompt.\n %s \n", params.prompt.c_str());
printf("\n\n Beginning of response.\n");
std::string response = "";
for (int i=0; i<max_len; ++i) {
const char * tmp = sample(smpl, cogagent_global.ctx_llama, &n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0) {
// Tolerate a spurious end-of-sequence token during the first few
// steps; afterwards it ends the response.
if (i < 10) {
continue;
}
break;
}
printf("%s", tmp);
fflush(stdout);
}
common_sampler_free(smpl);

llama_free(ctx_llama);
llama_model_free(model);
ggml_free(cogagent_global.token_ctx);
llama_backend_free();
return 0;
}
36 changes: 36 additions & 0 deletions examples/cogagent/cogagent.h
@@ -0,0 +1,36 @@
#ifndef COGAGENT_H
#define COGAGENT_H

#include "vision_encoder.h"
#include "cross_vision.h"
#include "cogagent_util.h"
#include "image_util.h"
#include "ggml.h"
#include "gguf.h"

struct cogagent_ctx {
// Vision encoder and cross vision encoder models
vision_encoder_ctx vision_encoder;
cross_vision_ctx cross_vision;

struct llama_context * ctx_llama;
struct llama_model * cogvlm_model;

// Context for storing vision tokens and cross vision
// embedded picture tensor
ggml_context * token_ctx;

std::string user_prompt;
std::vector<float> vision_encoder_image; // Image encoded by the vision encoder
struct ggml_tensor * cross_vision_image_tensor; // Image encoded by the cross vision encoder

int vision_encoder_img_size = 224;
int cross_vision_img_size = 1120;

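// OpenAI CLIP image normalization constants (per-channel mean / std)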
float norm_mean[3] = {0.48145466, 0.4578275, 0.40821073};
float norm_deviation[3] = {0.26862954, 0.26130258, 0.27577711};
};

extern struct cogagent_ctx cogagent_global;

#endif