
Commit 1343d66

Fix issues with CogAgent cross attention
1 parent 5c19d77 commit 1343d66

File tree

14 files changed, +1684 -17 lines

examples/cogagent/CMakeLists.txt

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
set(TARGET llama-cogagent-cli)
add_executable(${TARGET} cogagent-cli.cpp)
add_library(cogagent OBJECT
    vision_encoder.cpp
    vision_encoder.h
    cross_vision.cpp
    cross_vision.h
    cogagent_util.cpp
    cogagent_util.h
    image_util.cpp
    image_util.h)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-cogagent-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common cogagent ggml ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(cogagent PUBLIC ../../ggml/include)
target_include_directories(cogagent PUBLIC ../../include)
target_include_directories(cogagent PUBLIC ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
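
Building the encoder sources as an OBJECT library compiles them once and links the resulting objects straight into the CLI binary, with no intermediate static or shared library; the PUBLIC include directories on cogagent also propagate to the executable that links against it.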

examples/cogagent/cogagent-cli.cpp

Lines changed: 285 additions & 0 deletions
@@ -0,0 +1,285 @@
#include "arg.h"
#include "base64.hpp"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "llama.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>

#include "cogagent.h"

cogagent_ctx cogagent_global;

// This function is mostly copied from the cogagent cli
static bool eval_string_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
    int N = (int) tokens.size();

    // Process the input tokens in batches
    for (int i = 0; i < N; i += n_batch) {
        int n_eval = (int) tokens.size() - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }

        // Positions are supplied explicitly: once an image has been decoded,
        // *n_past no longer matches the number of entries in the KV cache,
        // so positions cannot be inferred from the cache contents.
        std::vector<int> pos(n_eval);
        for (int j = 0; j < n_eval; j++) {
            pos[j] = *n_past + j;
        }

        llama_batch batch = llama_batch_get_one(&tokens[i], n_eval);
        batch.cross_embd = cogagent_global.cross_vision_image_tensor;
        batch.pos = pos.data();
        if (llama_decode(ctx_llama, batch)) {
            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
            return false;
        }
        *n_past += n_eval;
    }
    return true;
}

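// CogVLM-style position ids for image embeddings: the 258 entries
// (apparently 256 patch tokens plus begin/end boundary tokens) occupy
// only three position ids: one for each boundary token and a single
// shared id for every patch token in between. That is why the function
// below advances *n_past by 3 rather than by 258.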
static bool eval_image_tokens(llama_context * ctx_llama, std::vector<float> &img_data,
                              int n_batch, int * n_past) {
    int n_embd = 4096;     // CogAgent decoder embedding size
    int num_tokens = 258;  // image embeddings produced by the vision encoder
    int positions[258];

    positions[0] = *n_past;                    // begin-of-image boundary token
    for (int i = 0; i < num_tokens - 2; i++) {
        positions[i + 1] = *n_past + 1;        // all patch tokens share one position
    }
    positions[num_tokens - 1] = *n_past + 2;   // end-of-image boundary token

    float * data_ptr = img_data.data();

    for (int i = 0; i < num_tokens; i += n_batch) {
        int n_eval = num_tokens - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        // Embedding batch: the token-id field is null and raw embeddings
        // are passed instead
        llama_batch batch = {int32_t(n_eval), nullptr, data_ptr, positions + i, nullptr, nullptr, nullptr, nullptr, nullptr, };
        batch.cross_embd = cogagent_global.cross_vision_image_tensor;
        if (llama_decode(ctx_llama, batch)) {
            LOG_ERR("%s : failed to eval\n", __func__);
            return false;
        }
        data_ptr += n_eval * n_embd;  // advance past the embeddings just decoded
    }
    *n_past += 3;  // the whole image occupies only three position ids
    return true;
}
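
// Worked example: entering eval_image_tokens with *n_past == 1, the 258
// embeddings are decoded at positions [1, 2, 2, ..., 2, 3] and *n_past
// leaves as 4, so the first text token after the image sits at position 4.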

static void print_usage(int, char ** argv) {
    LOG("\n example usage:\n");
    LOG("\n %s -m <cogagent-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <cogagent-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
    LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}

static const char * sample(struct common_sampler * smpl,
                           struct llama_context * ctx_llama,
                           int * n_past) {
    const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
    common_sampler_accept(smpl, id, true);

    const llama_model * model = llama_get_model(ctx_llama);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    // static so the pointer returned via c_str() stays valid after return
    // (not thread-safe, but fine for this single-threaded CLI)
    static std::string ret;
    if (llama_vocab_is_eog(vocab, id)) {
        ret = "</s>";
    } else {
        ret = common_token_to_piece(ctx_llama, id);
    }
    // Feed the new token back through llama_decode so that its key/value
    // pairs are appended to the KV cache and later tokens can attend to it.
    std::vector<llama_token> tokens;
    tokens.push_back(id);
    eval_string_tokens(ctx_llama, tokens, 1, n_past);

    return ret.c_str();
}

static bool run_vision_encoders(const char* vision_encoder_path, const char* image_path) {
    // Load image and resize for the encoders
    std::vector<float> small_image_data; // For vision encoder
    std::vector<float> large_image_data; // For cross vision encoder
    if (!load_and_stretch_image(image_path, cogagent_global.vision_encoder_img_size,
                                small_image_data, cogagent_global.norm_mean, cogagent_global.norm_deviation)) {
        printf("Failed to load the specified image file.\n");
        return false;
    }
    if (!load_and_stretch_image(image_path, cogagent_global.cross_vision_img_size,
                                large_image_data, cogagent_global.norm_mean, cogagent_global.norm_deviation)) {
        printf("Failed to load the specified image file.\n");
        return false;
    }

    // For debugging purposes
    const char * vision_encoder_resized_image = "cogagent_encoders/llama_vision_encoder_input.gguf";
    int dims[3] = {cogagent_global.vision_encoder_img_size,
                   cogagent_global.vision_encoder_img_size, 3};
    save_tensor_from_data(small_image_data, dims, vision_encoder_resized_image);
    const char * cross_vision_resized_image = "cogagent_encoders/llama_cross_vision_input.gguf";
    dims[0] = cogagent_global.cross_vision_img_size;
    dims[1] = cogagent_global.cross_vision_img_size;
    save_tensor_from_data(large_image_data, dims, cross_vision_resized_image);

    // const char * reference_vision_encoder_input = "/home/tianyue/myworkspace"
    //     "/vlm_intermediate/vision_encoder_input.gguf";
    // const char * reference_cross_vision_input = "/home/tianyue/myworkspace"
    //     "/vlm_intermediate/cross_vision_input.gguf";
    // // Load the reference input
    // if (get_input(small_image_data, reference_vision_encoder_input) < 0) {
    //     printf("Failed to load small image input\n");
    //     return false;
    // }
    // if (get_input(large_image_data, reference_cross_vision_input) < 0) {
    //     printf("Failed to load big image input\n");
    //     return false;
    // }
    printf("Loaded and resized the specified image.\n");

    // Load the vision encoder weights
    if (!vision_encoder_init_load(vision_encoder_path)) {
        printf("Failed to load vision encoder model file.\n");
        return false;
    }
    printf("Vision encoder weights loaded.\n");

    // Run the vision encoder
    run_vision_encoder(small_image_data);
    printf("Completed vision encoder run on image file.\n");

    // Free the vision encoder before loading the cross vision encoder,
    // so that only one encoder's weights are resident at a time
    free_vision_encoder_ctx();

    // Load and run the cross vision encoder
    if (!cross_vision_init_load(vision_encoder_path)) {
        printf("Failed to load cross vision encoder model file.\n");
        return false;
    }
    printf("Cross vision encoder weights loaded.\n");

    run_cross_vision(large_image_data);
    printf("Completed cross vision encoder run on image file.\n");

    free_cross_vision_ctx();
    return true;
}

int main(int argc, char ** argv) {
    ggml_time_init();
    common_params params;
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COGAGENT, print_usage)) {
        return 1;
    }
    common_init();

    llama_backend_init();
    llama_numa_init(params.numa);

    // Initialize a GGML context to store the encoded image tensors
    struct ggml_init_params token_ctx_params = {
        size_t(40000000),   // ~40 MB pool for the encoded image tensors
        NULL,
        false,
    };
    cogagent_global.token_ctx = ggml_init(token_ctx_params);
    if (!cogagent_global.token_ctx) {
        printf("Failed to initialize token storage context.\n");
        return 1;
    }
    // Allocate the tensor for the cross vision encoded image
    cogagent_global.cross_vision_image_tensor = ggml_new_tensor_2d(
        cogagent_global.token_ctx, GGML_TYPE_F32, 1024, 6400
    );

    // Load the images and the encoder models,
    // then run the encoder models
    if (!run_vision_encoders(params.mmproj.c_str(), params.image[0].c_str())) {
        return 1;
    }

    llama_model_params model_params = common_model_params_to_llama(params);
    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
    if (model == nullptr) {
        printf("Failed to load decoder model\n");
        return 1;
    }

    llama_context_params ctx_params = common_context_params_to_llama(params);
    printf("Context size is %d tokens\n", ctx_params.n_ctx);
    llama_context * ctx_llama = llama_init_from_model(model, ctx_params);

    if (ctx_llama == nullptr) {
        printf("Failed to create the llama context\n");
        return 1;
    }

    cogagent_global.ctx_llama = ctx_llama;
    cogagent_global.cogvlm_model = model;

    // At the moment I can't figure out how the llama kv cache
    // keeps its information across runs.
    // It seems to me that the graph is allocated for each batch,
    // which would invalidate any tensors stored in the kv cache.
    // I don't spot logic for separately allocating the kv cache
    // tensors to avoid this, so it doesn't make sense.
    // Maybe the graph isn't actually allocated for each batch?
    // Perhaps that is why a worst case graph is allocated.
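    // Note: the KV cache is in fact not part of the per-batch compute graph.
    // Its tensors live in buffers owned by the llama_context and are
    // allocated once when the context is created; each llama_decode call
    // builds a fresh graph that only references those persistent tensors.
    // The worst case graph is reserved up front so that the compute buffers
    // never have to grow for later batches.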

    // TODO: Check if system prompt is compatible
    std::vector<llama_token> begin_token;
    const llama_vocab * vocab = llama_model_get_vocab(cogagent_global.cogvlm_model);
    begin_token.push_back(llama_vocab_bos(vocab));

    int n_past = 0;
    printf("Run model with bos token.\n");
    eval_string_tokens(cogagent_global.ctx_llama,
                       begin_token, params.n_batch, &n_past);
    printf("Run model with image tokens.\n");
    eval_image_tokens(cogagent_global.ctx_llama, cogagent_global.vision_encoder_image,
                      params.n_batch, &n_past);
    // Tokenize the user prompt.
    // The third argument is false so that the tokenizer doesn't add
    // beginning-of-sentence and end-of-sentence tokens.
    std::vector<llama_token> user_prompt_tokens = common_tokenize(
        cogagent_global.ctx_llama, params.prompt, false, true
    );
    printf("Run model with user entered text tokens.\n");
    eval_string_tokens(cogagent_global.ctx_llama, user_prompt_tokens,
                       params.n_batch, &n_past);

    printf("Parsed maximum sampling length %d.\n", params.n_predict);
    int max_len = params.n_predict < 0 ? 256 : params.n_predict;

    struct common_sampler * smpl = common_sampler_init(cogagent_global.cogvlm_model, params.sampling);
    if (!smpl) {
        printf("Failed to initialize sampler.\n");
        return 1;
    }
    printf("\nReprinting entered prompt.\n %s \n", params.prompt.c_str());
    printf("\n\n Beginning of response.\n");
    std::string response = "";
    for (int i = 0; i < max_len; ++i) {
        const char * tmp = sample(smpl, cogagent_global.ctx_llama, &n_past);
        response += tmp;
        if (strcmp(tmp, "</s>") == 0) {
            // Ignore an end-of-generation token that arrives implausibly early
            if (i < 10) {
                continue;
            }
            break;
        }
        printf("%s", tmp);
        fflush(stdout);
    }
    common_sampler_free(smpl);

    llama_model_free(model);
    ggml_free(cogagent_global.token_ctx);
    return 0;
}
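
The aggregate initializer used in eval_image_tokens depends silently on the member order of llama_batch. A field-by-field spelling is a less fragile alternative; this is a sketch only, assuming the members visible in this commit (n_tokens, token, embd, pos, and the new cross_embd):

    // Zero-initialize, then set only the members this batch relies on.
    llama_batch batch = {};
    batch.n_tokens   = n_eval;          // number of image embeddings in this batch
    batch.token      = nullptr;         // no token ids...
    batch.embd       = data_ptr;        // ...raw embeddings are fed instead
    batch.pos        = positions + i;   // explicit CogVLM-style positions
    batch.cross_embd = cogagent_global.cross_vision_image_tensor;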

examples/cogagent/cogagent.h

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
#ifndef COGAGENT_H
#define COGAGENT_H

#include "vision_encoder.h"
#include "cross_vision.h"
#include "cogagent_util.h"
#include "image_util.h"
#include "ggml.h"
#include "gguf.h"

#include <string>
#include <vector>

struct cogagent_ctx {
    // Vision encoder and cross vision encoder models
    vision_encoder_ctx vision_encoder;
    cross_vision_ctx cross_vision;

    struct llama_context * ctx_llama;
    struct llama_model * cogvlm_model;

    // Context for storing vision tokens and the cross vision
    // embedded picture tensor
    ggml_context * token_ctx;

    std::string user_prompt;
    std::vector<float> vision_encoder_image; // Image encoded by the vision encoder
    struct ggml_tensor * cross_vision_image_tensor; // Image encoded by the cross vision encoder

    int vision_encoder_img_size = 224;
    int cross_vision_img_size = 1120;

    float norm_mean[3] = {0.48145466, 0.4578275, 0.40821073};
    float norm_deviation[3] = {0.26862954, 0.26130258, 0.27577711};
};

extern struct cogagent_ctx cogagent_global;

#endif
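
The norm_mean and norm_deviation values above are the standard CLIP preprocessing constants. A minimal sketch of the per-channel transform they imply; the actual resizing and normalization live in image_util.cpp's load_and_stretch_image, which this page does not show, so treat the helper below as an assumption about its behavior:

    #include <cstdint>

    // Assumed preprocessing: scale an 8-bit channel value to [0, 1], then
    // normalize with the CLIP mean/deviation for that channel.
    static inline float normalize_channel(uint8_t v, float mean, float dev) {
        return (v / 255.0f - mean) / dev;
    }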
