#include "arg.h"
#include "base64.hpp"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "llama.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>

#include "cogagent.h"

// Shared state between the vision encoders and the language model decoder
// (declared in cogagent.h): encoder image sizes, normalization constants,
// and the contexts/tensors holding the encoded image.
cogagent_ctx cogagent_global;

// This function is mostly copied from the cogagent CLI code
static bool eval_string_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
    int N = (int) tokens.size();

    // Process the input tokens in batches
    for (int i = 0; i < N; i += n_batch) {
        int n_eval = (int) tokens.size() - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }

        // Positions for this chunk (use j so the outer loop variable i is not shadowed)
        std::vector<int> pos;
        pos.resize(n_eval);
        for (int j = 0; j < n_eval; j++) {
            pos[j] = *n_past + j;
        }

        llama_batch batch = llama_batch_get_one(&tokens[i], n_eval);
        batch.cross_embd = cogagent_global.cross_vision_image_tensor;
        batch.pos = pos.data();
        if (llama_decode(ctx_llama, batch)) {
            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
            return false;
        }
        *n_past += n_eval;
    }
    return true;
}

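// Feed the 258 vision-encoder output embeddings to the decoder. CogAgent packs
// the image tokens into just three positions: the first token at *n_past, the
// 256 middle tokens all at *n_past + 1, and the last token at *n_past + 2,
// which is why n_past only advances by 3 for the whole image.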
static bool eval_image_tokens(llama_context * ctx_llama, std::vector<float> & img_data,
                              int n_batch, int * n_past) {
    int n_embd     = 4096;
    int num_tokens = 258;
    int positions[258];

    positions[0] = *n_past;
    for (int i = 0; i < num_tokens - 2; i++) {
        positions[i + 1] = *n_past + 1;
    }
    positions[num_tokens - 1] = *n_past + 2;

    float * data_ptr = img_data.data();

    for (int i = 0; i < num_tokens; i += n_batch) {
        int n_eval = num_tokens - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        llama_batch batch = {int32_t(n_eval), nullptr, data_ptr, positions + i, nullptr, nullptr, nullptr, nullptr, nullptr, };
        batch.cross_embd = cogagent_global.cross_vision_image_tensor;
        if (llama_decode(ctx_llama, batch)) {
            LOG_ERR("%s : failed to eval\n", __func__);
            return false;
        }
        // Advance past the embeddings that were just evaluated
        data_ptr += n_eval * n_embd;
    }
    *n_past += 3;
    return true;
}

static void print_usage(int, char ** argv) {
    LOG("\nexample usage:\n");
    LOG("\n%s -m <cogagent-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <cogagent-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
    LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
}

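// Sample one token, feed it back through eval_string_tokens() so the KV cache
// stays in sync, and return the token's text. The returned pointer refers to a
// function-local static string, so it is only valid until the next call.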
static const char * sample(struct common_sampler * smpl,
                           struct llama_context * ctx_llama,
                           int * n_past) {
    const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
    common_sampler_accept(smpl, id, true);

    const llama_model * model = llama_get_model(ctx_llama);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    static std::string ret;
    if (llama_vocab_is_eog(vocab, id)) {
        ret = "</s>";
    } else {
        ret = common_token_to_piece(ctx_llama, id);
    }
    // Evaluate the new token so its key/value entries are added to the KV cache
    std::vector<llama_token> tokens;
    tokens.push_back(id);
    eval_string_tokens(ctx_llama, tokens, 1, n_past);

    return ret.c_str();
}

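// Load the image, resize it for each encoder, then run the vision encoder and
// the cross vision encoder. The encoder outputs are left in cogagent_global:
// main() later passes vision_encoder_image to eval_image_tokens(), and
// cross_vision_image_tensor is attached to every decode batch as cross_embd.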
static bool run_vision_encoders(const char * vision_encoder_path, const char * image_path) {
    // Load the image and resize it for the encoders
    std::vector<float> small_image_data;  // For the vision encoder
    std::vector<float> large_image_data;  // For the cross vision encoder
    if (!load_and_stretch_image(image_path, cogagent_global.vision_encoder_img_size,
            small_image_data, cogagent_global.norm_mean, cogagent_global.norm_deviation)) {
        printf("Failed to load the specified image file.\n");
        return false;
    }
    if (!load_and_stretch_image(image_path, cogagent_global.cross_vision_img_size,
            large_image_data, cogagent_global.norm_mean, cogagent_global.norm_deviation)) {
        printf("Failed to load the specified image file.\n");
        return false;
    }

    // For debugging purposes: dump the resized encoder inputs
    const char * vision_encoder_resized_image = "cogagent_encoders/llama_vision_encoder_input.gguf";
    int dims[3] = {cogagent_global.vision_encoder_img_size,
                   cogagent_global.vision_encoder_img_size, 3};
    save_tensor_from_data(small_image_data, dims, vision_encoder_resized_image);
    const char * cross_vision_resized_image = "cogagent_encoders/llama_cross_vision_input.gguf";
    dims[0] = cogagent_global.cross_vision_img_size;
    dims[1] = cogagent_global.cross_vision_img_size;
    save_tensor_from_data(large_image_data, dims, cross_vision_resized_image);

    // const char * reference_vision_encoder_input = "/home/tianyue/myworkspace"
    //     "/vlm_intermediate/vision_encoder_input.gguf";
    // const char * reference_cross_vision_input = "/home/tianyue/myworkspace"
    //     "/vlm_intermediate/cross_vision_input.gguf";
    // // Load the reference input
    // if (get_input(small_image_data, reference_vision_encoder_input) < 0) {
    //     printf("Failed to load small image input\n");
    //     return false;
    // }
    // if (get_input(large_image_data, reference_cross_vision_input) < 0) {
    //     printf("Failed to load big image input\n");
    //     return false;
    // }
    printf("Loaded and resized the specified image.\n");

    // Load the vision encoder weights
    if (!vision_encoder_init_load(vision_encoder_path)) {
        printf("Failed to load vision encoder model file.\n");
        return false;
    }
    printf("Vision encoder weights loaded.\n");

    // Run the vision encoder
    run_vision_encoder(small_image_data);
    printf("Completed vision encoder run on image file.\n");

    free_vision_encoder_ctx();

    // Load and run the cross vision encoder
    if (!cross_vision_init_load(vision_encoder_path)) {
        printf("Failed to load cross vision encoder model file.\n");
        return false;
    }
    printf("Cross vision encoder weights loaded.\n");

    run_cross_vision(large_image_data);
    printf("Completed cross vision encoder run on image file.\n");

    free_cross_vision_ctx();
    return true;
}

int main(int argc, char ** argv) {
    ggml_time_init();
    common_params params;
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COGAGENT, print_usage)) {
        return 1;
    }
    common_init();

    llama_backend_init();
    llama_numa_init(params.numa);

    // Initialize a GGML context to store the encoded image tensors
    struct ggml_init_params token_ctx_params = {
        size_t(40000000),  // mem_size
        NULL,              // mem_buffer
        false,             // no_alloc
    };
    cogagent_global.token_ctx = ggml_init(token_ctx_params);
    if (!cogagent_global.token_ctx) {
        printf("Failed to initialize token storage context.\n");
        return 1;
    }
    // Allocate the tensor for the cross-vision-encoded image
    cogagent_global.cross_vision_image_tensor = ggml_new_tensor_2d(
        cogagent_global.token_ctx, GGML_TYPE_F32, 1024, 6400
    );
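    // The fixed 1024 x 6400 shape must match what run_cross_vision() writes into
    // this tensor (presumably one 1024-dim embedding per cross-vision image patch).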

    // Load the images and the encoder models, then run the encoder models
    if (!run_vision_encoders(params.mmproj.c_str(), params.image[0].c_str())) {
        return 1;
    }

    llama_model_params model_params = common_model_params_to_llama(params);
    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
    if (model == nullptr) {
        printf("Failed to load decoder model\n");
        return 1;
    }

    llama_context_params ctx_params = common_context_params_to_llama(params);
    printf("Context size is %u tokens\n", ctx_params.n_ctx);
    llama_context * ctx_llama = llama_init_from_model(model, ctx_params);

    if (ctx_llama == nullptr) {
        printf("Failed to create the llama context\n");
        return 1;
    }

    cogagent_global.ctx_llama    = ctx_llama;
    cogagent_global.cogvlm_model = model;

    // At the moment I can't figure out how the llama KV cache keeps its
    // information across runs. It seems to me that the graph is allocated for
    // each batch, which would invalidate any tensors stored in the KV cache.
    // I don't spot logic for separately allocating the KV cache tensors to
    // avoid this, so it doesn't make sense. Maybe the graph isn't actually
    // allocated for each batch? Perhaps that is why a worst-case graph is
    // allocated.
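    // Note: as far as I can tell, upstream llama.cpp allocates the KV cache
    // buffers once when the context is created, separately from the per-batch
    // compute graph; each graph only takes views into those buffers, so cached
    // keys/values do persist across llama_decode() calls. The worst-case graph
    // built at context creation is only used to reserve compute buffer sizes.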

    // TODO: Check if system prompt is compatible
    std::vector<llama_token> begin_token;
    const llama_vocab * vocab = llama_model_get_vocab(cogagent_global.cogvlm_model);
    begin_token.push_back(llama_vocab_bos(vocab));

    int n_past = 0;
    printf("Run model with BOS token.\n");
    eval_string_tokens(cogagent_global.ctx_llama,
        begin_token, params.n_batch, &n_past);
    printf("Run model with image tokens.\n");
    eval_image_tokens(cogagent_global.ctx_llama, cogagent_global.vision_encoder_image,
        params.n_batch, &n_past);
    // Tokenize the user prompt.
    // The third argument is set to false so that the tokenizer doesn't add
    // special tokens (beginning/end of sentence) itself.
    std::vector<llama_token> user_prompt_tokens = common_tokenize(
        cogagent_global.ctx_llama, params.prompt, false, true
    );
    printf("Run model with user entered text tokens.\n");
    eval_string_tokens(cogagent_global.ctx_llama, user_prompt_tokens,
        params.n_batch, &n_past);

    printf("Parsed maximum sampling length %d.\n", params.n_predict);
    int max_len = params.n_predict < 0 ? 256 : params.n_predict;

    struct common_sampler * smpl = common_sampler_init(cogagent_global.cogvlm_model, params.sampling);
    if (!smpl) {
        printf("Failed to initialize sampler.\n");
        return 1;
    }
    printf("\nReprinting entered prompt.\n%s\n", params.prompt.c_str());
    printf("\n\nBeginning of response.\n");
    std::string response = "";
    for (int i = 0; i < max_len; ++i) {
        const char * tmp = sample(smpl, cogagent_global.ctx_llama, &n_past);
        response += tmp;
        if (strcmp(tmp, "</s>") == 0) {
            // Ignore an end-of-generation token in the first few positions
            if (i < 10) {
                continue;
            }
            break;
        }
        printf("%s", tmp);
        fflush(stdout);
    }
    common_sampler_free(smpl);

    llama_free(ctx_llama);
    llama_model_free(model);
    ggml_free(cogagent_global.token_ctx);
    llama_backend_free();
    return 0;
}