@@ -15,7 +15,9 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    params.prompt = "Hello my name is";
+    // params.prompt = "Hello my name is";
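+    // <img_placement> marks where the image embeddings will be inserted
+    // in the decoded sequence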
+    params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n"
+                    "USER:<img_placement>\nwhat did you see?\nASSISTANT:";
     params.n_predict = 32;
 
     if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
@@ -62,52 +64,10 @@ int main(int argc, char ** argv) {
 
     llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
 
-
-
-
-    // TODO: this is for testing; DELETE ME
-    int n_cur = 0;
-    params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
-    {
-        llama_img_batch ibatch;
-        ibatch.n_imgs = 1;
-        ibatch.imgs = (llama_img **) malloc(1024);
-        ibatch.imgs[0] = load_image_from_file("../models/eiffel-tower-3349075_1280.jpg");
-        llama_vision_encode(ctx, &ibatch);
-
-        auto tokens = ::llama_tokenize(ctx, params.prompt, true);
-        int n_imgs    = ibatch.n_imgs;
-        int n_embd    = llama_n_embd(model);
-        int n_patches = llama_vision_n_patches(ctx);
-        printf("n_embd = %d; n_patches = %d\n", n_embd, n_patches);
-        float * output_img = llama_vision_get_embeddings(ctx, 0);
-
-        n_cur += tokens.size();
-        llama_batch batch = llama_batch_init(512, 0, 1);
-        llama_batch_clear(batch);
-        for (auto t : tokens) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
-        if (llama_decode(ctx, batch) != 0) {
-            LOG("%s: llama_decode() failed\n", __func__);
-            return 1;
-        }
-
-        //for (int k = 0; k < 10; k++) printf("%f\n", output_img[k]);
-        llama_batch_clear(batch);
-        batch = {int32_t(n_patches*n_imgs), nullptr, output_img, nullptr, nullptr, nullptr, nullptr, n_cur, 1, 0, };
-        if (llama_decode(ctx, batch) != 0) {
-            LOG("%s: llama_decode() failed\n", __func__);
-            return 1;
-        }
-        n_cur += n_embd*n_imgs;
-    }
-    params.prompt = "\nwhat did you see?\nASSISTANT:";
-
-
-
     // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
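+    // llama_tokenize_with_img additionally maps the <img_placement> marker
+    // to the special TOKEN_IMG_PLACEMENT token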
+    tokens_list = ::llama_tokenize_with_img(ctx, params.prompt, true);
 
     const int n_ctx    = llama_n_ctx(ctx);
     const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
@@ -127,33 +87,82 @@ int main(int argc, char ** argv) {
     LOG("\n");
 
     for (auto id : tokens_list) {
-        LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        if (id == TOKEN_IMG_PLACEMENT) {
+            LOG("<img_placement>");
+        } else {
+            LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        }
     }
 
+    LOG("\n\n");
+
+    // load image
+    llama_batch_img img_batch = llama_batch_img_init(1);
+    img_batch.imgs[0] = load_image_from_file("../models/eiffel-tower-3349075_1280.jpg");
+
     // create a llama_batch with size 512
     // we use this object to submit token data for decoding
 
     llama_batch batch = llama_batch_init(512, 0, 1);
 
     // evaluate the initial prompt
-    for (size_t i = 0; i < tokens_list.size(); i++) {
-        //llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
-        if (i == 0) continue;
-        llama_batch_add(batch, tokens_list[i], n_cur, { 0 }, false);
-        n_cur++;
+    int n_cur = 0;
+    int i_img = 0;
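+    // walk the token list: text tokens go into the batch as usual, while an
+    // image placeholder only reserves a span of positions for the image
+    // embeddings (no token is added for it)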
+    for (auto id : tokens_list) {
+        if (id == TOKEN_IMG_PLACEMENT) {
+            img_batch.pos[i_img] = n_cur;
+            n_cur += llama_img_n_tokens(ctx, img_batch.imgs[i_img]);
+            i_img++;
+        } else {
+            llama_batch_add(batch, id, n_cur, { 0 }, false);
+            printf("pos %d tok %d --> %s\n", n_cur, id, llama_token_to_piece(ctx, id).c_str());
+            n_cur++;
+        }
     }
 
     // llama_decode will output logits only for the last token of the prompt
     batch.logits[batch.n_tokens - 1] = true;
 
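+    // run the vision encoder; the resulting image embeddings are injected
+    // at the positions recorded in img_batch.pos above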
+    if (llama_encode_vision(ctx, img_batch) != 0) {
+        LOG("%s: llama_encode_vision() failed\n", __func__);
+        return 1;
+    }
+
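+    // (testing) decode the prompt again manually, in pieces with explicit
+    // positions: a BOS token (id 1) at pos 0, then 576 image embeddings
+    // fetched through the _test_get_img_embd() hook at offset t1.size(),
+    // then the text before and after the image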
+    n_cur = 0;
+    {
+        auto t1 = ::llama_tokenize(ctx, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", false);
+        auto t2 = ::llama_tokenize(ctx, "\nwhat did you see?\nASSISTANT:", false);
+        t1.insert(t1.begin(), 1);
+
+        n_cur = 0;
+        llama_batch_clear(batch);
+        llama_batch_add(batch, 1, 0, { 0 }, false);
+        llama_decode(ctx, batch);
+
+        n_cur = t1.size();
+        llama_batch_clear(batch);
+        llama_batch batch0 = {int32_t(576), nullptr, _test_get_img_embd(ctx), nullptr, nullptr, nullptr, nullptr, n_cur, 1, 0, };
+        llama_decode(ctx, batch0);
+
+        n_cur = 0;
+        llama_batch_clear(batch);
+        for (auto t : t1) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
+        llama_decode(ctx, batch);
+
+        n_cur = t1.size() + 576;
+        llama_batch_clear(batch);
+        printf("pos %d\n", n_cur);
+        for (auto t : t2) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
+        batch.logits[batch.n_tokens - 1] = true;
+    }
+
     if (llama_decode(ctx, batch) != 0) {
         LOG("%s: llama_decode() failed\n", __func__);
         return 1;
     }
 
     // main loop
 
-    //int n_cur = batch.n_tokens;
     int n_decode = 0;
 
     const auto t_main_start = ggml_time_us();
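
For reference, the non-test flow above can be condensed into a single helper. This is only a sketch against the experimental vision API this patch introduces (llama_batch_img, llama_img_n_tokens, llama_encode_vision, TOKEN_IMG_PLACEMENT); the decode_prompt_with_image() wrapper itself is hypothetical and not part of the commit.

// sketch only: condenses the placeholder/position bookkeeping shown above
static int decode_prompt_with_image(llama_context * ctx, llama_img * img,
                                    const std::vector<llama_token> & tokens) {
    llama_batch     batch     = llama_batch_init(512, 0, 1);
    llama_batch_img img_batch = llama_batch_img_init(1);
    img_batch.imgs[0] = img;

    int n_cur = 0;
    int i_img = 0;
    for (auto id : tokens) {
        if (id == TOKEN_IMG_PLACEMENT) {
            // reserve positions [n_cur, n_cur + n_img) for the image embeddings
            img_batch.pos[i_img] = n_cur;
            n_cur += llama_img_n_tokens(ctx, img_batch.imgs[i_img]);
            i_img++;
        } else {
            llama_batch_add(batch, id, n_cur, { 0 }, false);
            n_cur++;
        }
    }
    batch.logits[batch.n_tokens - 1] = true;

    // encode the image(s) first so their embeddings land in the reserved
    // positions, then decode the surrounding text tokens
    if (llama_encode_vision(ctx, img_batch) != 0) {
        return 1;
    }
    return llama_decode(ctx, batch);
}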