@@ -37,6 +37,8 @@ int msB_log256(int x)
 const int block_header_size = 2;
 const int fixed_token_cost = 1;
 
+int total_pad = 0;
+
 std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gpt_sampler *smpl, int num_raw_tokens_header)
 {
 
@@ -62,7 +64,6 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
     for (int index = num_raw_tokens_header; index < inp.size(); index++)
     {
         auto &cur_p = smpl->cur_p; // initialized by set_logits
-        // llama_sampler_apply(smpl->grmr, &cur_p);
         llama_sampler_apply(smpl->chain, &cur_p);
 
         int match = -1;
@@ -121,12 +122,10 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
         int sample_id = sample_ids[i];
         uint8_t PAD = (8 - bit_offset % 8) % 8;
         uint8_t bytesize = (uint8_t)msB_log256(sample_id);
-        // LOG("pos: %d, bs: %d\n",sample_id, bytesize);
 
         // Big number, better save as token
         if (sample_id > PAD + (block_header_size + fixed_token_cost + bytesize) * 8)
         {
-            // LOG("End block\n");
             // Close current block (0b1010 is block marker)
             if (was_block)
             {
@@ -151,21 +150,18 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
                 }
             }
             bit_offset += PAD;
+            total_pad += PAD;
             if (bit_offset % 8)
             {
                 LOG_ERR("Unreachable");
                 exit(-1);
             }
-            // LOG("\n%d",bit_offset/8);
             // 0b0101 is token marker
-
             sample_ids_bitpacked.push_back(0b01010000 | bytesize);
             // put token bytes into sample_ids_bitpacked
-            // LOG("\n%d -> ",sample_id);
             for (int j = 0; j < bytesize; j++)
             {
                 sample_ids_bitpacked.push_back(sample_id & 0xff);
-                LOG(" %02x ", sample_id & 0xff);
                 sample_id >>= 8;
             }
             if (sample_id)
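
The escape path above writes a header byte with the 0b0101 token marker in its high nibble and the byte count in its low nibble, then the sample id least-significant byte first. The decoder reverses this; below is a hedged, illustrative sketch of that inverse step (not code from the patch, and the real decoder works on a bit offset into `sample_ids_bitpacked`, not a byte index):

```cpp
#include <cstdint>
#include <vector>

// Parse one raw-token record starting at byte `pos`; returns the id and advances `pos`.
// Illustrative only: the actual decoder tracks a bit offset and padding.
static int read_token_record(const std::vector<uint8_t> &buf, size_t &pos) {
    uint8_t header = buf[pos++];
    if ((header >> 4) != 0b0101) return -1;      // not a token record
    int bytesize  = header & 0x0f;
    int sample_id = 0;
    for (int j = 0; j < bytesize; j++) {
        sample_id |= (int)buf[pos++] << (8 * j); // little-endian reassembly
    }
    return sample_id;
}
```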
@@ -217,6 +213,7 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
         int block_size = (bit_offset + PAD) / 8 - block_start;
         // endianness: big endian
         sample_ids_bitpacked[block_start + 1] = block_size & 0xff;
+        total_pad += PAD;
     }
     llama_batch_free(batch);
     return sample_ids_bitpacked;
@@ -245,7 +242,6 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
         auto token_str = llama_token_to_piece(ctx, token);
         LOG("%s", token_str.c_str());
     }
-    LOG("\u001b[0m\u001b[37m");
     if (llama_decode(ctx, batch))
     {
         LOG_ERR("%s: llama_decode() failed\n", __func__);
@@ -275,6 +271,7 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
 
         auto &cur_p = smpl->cur_p; // initialized by set_logits
         llama_sampler_apply(smpl->chain, &cur_p);
+
         auto token_id = cur_p.data[sample_id].id;
 
         out.push_back(token_id);
@@ -288,12 +285,10 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
             // print in red
             LOG("\u001b[31m%s", llama_token_to_piece(ctx, token_id).c_str());
             LOG("\nExpected: %s", llama_token_to_piece(ctx, inp[num_raw_tokens_header + index]).c_str());
-            // LOG("\n%d", num_raw_tokens_header + index);
             LOG("\n, Id: %d != %d", token_id, inp[num_raw_tokens_header + index]);
             LOG("\nPos: %d, bs:%d", sample_id, bytesize);
 
             // print sample_id bytes in hex
-            // LOG("\n %02x %02x", sample_ids_bitpacked[bit_index / 8], sample_ids_bitpacked[bit_index / 8 + 1]);
             LOG("\n");
             for (int i = bytesize; i > 0; i--)
             {
@@ -335,8 +330,8 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
             int sample_id = id;
 
             auto &cur_p = smpl->cur_p; // initialized by set_logits
-            // llama_sampler_apply(smpl->grmr, &cur_p);
             llama_sampler_apply(smpl->chain, &cur_p);
+
             auto token_id = cur_p.data[sample_id].id;
             out.push_back(token_id);
             if (!inp.size() || token_id == inp[num_raw_tokens_header + index])
@@ -363,7 +358,6 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
                 id = 0;
             }
         }
-        // LOG("\n(%d+%d)/8= %d\n",bit_index,PAD,(bit_index+PAD)/8);
         bit_index += PAD;
     }
 }
@@ -554,10 +548,12 @@ int main(int argc, char **argv)
     if (!params.no_perf){
         LOG("\nInput: %d characters (%d tokens)", params.prompt.length(), inp.size());
 
-        float compressed_byte_per_token = (float)sample_ids_bitpacked.size() / (float)inp.size();
+        float compressed_bits_per_token = 8 * (float)sample_ids_bitpacked.size() / (float)inp.size();
         float compressed_bits_per_char = 8 * (float)sample_ids_bitpacked.size() / (float)params.prompt.length();
 
-        LOG("\n%d compressed bytes,(%04f bytes per token, %04f bits per character)\n", (int)sample_ids_bitpacked.size(), compressed_byte_per_token, compressed_bits_per_char);
+        LOG("\n%d compressed bytes,(%04f bits per token, %04f bits per character)\n", (int)sample_ids_bitpacked.size(), compressed_bits_per_token, compressed_bits_per_char);
+        LOG("\n%d padding bits, (%04f bits per character without padding)", total_pad, compressed_bits_per_char - total_pad / (float)params.prompt.length());
+        LOG("\nPPL (over)estimation: %04f (%04f with padding)", exp2(compressed_bits_per_token - total_pad / (float)inp.size()), exp2(compressed_bits_per_token));
     }
     // maybe this needs to be changed
     if (params.out_file != "imatrix.dat"){
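
The new perplexity readout follows from the fact that an ideal entropy coder spends about -log2 p(token) bits per token, so two raised to the average bits-per-token (over)estimates the model's perplexity on the prompt; subtracting the padding bits accumulated in `total_pad` gives the tighter figure. A minimal sketch of the same arithmetic, using hypothetical counts (1250 compressed bytes, 2000 tokens, 360 padding bits):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    // Hypothetical values standing in for a real run.
    int compressed_bytes = 1250;
    int n_tokens         = 2000;
    int total_pad        = 360;  // padding bits counted by the encoder

    float bits_per_token = 8.0f * compressed_bytes / n_tokens;                  // 5.0
    float ppl_with_pad   = exp2f(bits_per_token);                               // 32.0
    float ppl_estimate   = exp2f(bits_per_token - (float)total_pad / n_tokens); // ~28.3

    printf("PPL (over)estimation: %f (%f with padding)\n", ppl_estimate, ppl_with_pad);
    return 0;
}
```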
@@ -630,7 +626,7 @@ int main(int argc, char **argv)
         ofs.write((char *)&out_str[0], out_str.size());
         ofs.close();
     }
-
+
     llama_free(ctx);
     llama_free_model(model);
 
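
As for where the counted padding comes from: the encoder aligns the stream to a byte boundary with `PAD = (8 - bit_offset % 8) % 8` both around raw-token records and when the final block is closed, and `total_pad` now sums those bits so the statistics can report the overhead separately. A small sketch of that rule, with hypothetical bit offsets, just to show the arithmetic:

```cpp
#include <cstdio>

// Bits needed to reach the next byte boundary from a given bit offset.
static int pad_to_byte(int bit_offset) {
    return (8 - bit_offset % 8) % 8;
}

int main() {
    const int offsets[] = {0, 3, 8, 13};  // hypothetical stream positions
    int total_pad = 0;
    for (int bit_offset : offsets) {
        int PAD = pad_to_byte(bit_offset);
        total_pad += PAD;                 // mirrors the accounting added to encode()
        printf("bit_offset %2d -> PAD %d\n", bit_offset, PAD);
    }
    printf("total_pad = %d bits\n", total_pad); // 0 + 5 + 0 + 3 = 8
    return 0;
}
```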