#include <cstdio>
#include <cstring>
#include <ctime>
-#include <cinttypes>
#include <fstream>
#include <mutex>
#include <random>
@@ -104,7 +103,7 @@ static std::vector<float> softmax(const std::vector<float>& logits) {
    return probs;
}

-static results_log_softmax log_softmax(int64_t n_vocab, const float * logits, int tok) {
+static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
    float max_logit = logits[0];
    for (int i = 1; i < n_vocab; ++i) {
        max_logit = std::max(max_logit, logits[i]);
@@ -123,7 +122,7 @@ static inline int nearest_int(float fval) {
    return (i & 0x007fffff) - 0x00400000;
}

-static double log_softmax(int64_t n_vocab, const float * logits, uint16_t * log_prob, int tok) {
+static double log_softmax(int n_vocab, const float * logits, uint16_t * log_prob, int tok) {
    float max_logit = logits[0];
    float min_logit = logits[0];
    for (int i = 1; i < n_vocab; ++i) {
@@ -154,7 +153,7 @@ static double log_softmax(int64_t n_vocab, const float * logits, uint16_t * log_
}

static void process_logits(
-    int64_t n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
+    int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
    double & nll, double & nll2, float * logit_history, float * prob_history
) {
    std::mutex mutex;
@@ -170,7 +169,7 @@ static void process_logits(
                break;
            }
            lock.unlock();
-            const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
+            const results_log_softmax results = log_softmax(n_vocab, logits + size_t(i)*n_vocab, tokens[i+1]);
            const double v = -results.log_softmax;
            local_nll += v;
            local_nll2 += v*v;
@@ -188,7 +187,7 @@ static void process_logits(
    }
}

-static void process_logits(std::ostream& out, int64_t n_vocab, const float * logits, const int * tokens, int n_token,
+static void process_logits(std::ostream& out, int n_vocab, const float * logits, const int * tokens, int n_token,
        std::vector<std::thread> & workers, std::vector<uint16_t> & log_probs, double & nll, double & nll2) {
    std::mutex mutex;
    const int nv = 2*((n_vocab + 1)/2) + 4;
@@ -204,7 +203,7 @@ static void process_logits(std::ostream& out, int64_t n_vocab, const float * log
                break;
            }
            lock.unlock();
-            const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
+            const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
            local_nll += v;
            local_nll2 += v*v;
        }
@@ -235,7 +234,7 @@ struct kl_divergence_result {
    size_t count = 0.0;
};

-static std::pair<double, float> log_softmax(int64_t n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
+static std::pair<double, float> log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
    float max_logit = logits[0];
    int imax = 0;
    for (int i = 1; i < n_vocab; ++i) {
@@ -298,7 +297,7 @@ static std::pair<double, float> log_softmax(int64_t n_vocab, const float * logit
    return std::make_pair(sum, p_diff);
}

-static void process_logits(int64_t n_vocab, const float * logits, const int * tokens, int n_token,
+static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token,
        std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld,
        float * kld_values, float * p_diff_values) {
    std::mutex mutex;
@@ -326,7 +325,7 @@ static void process_logits(int64_t n_vocab, const float * logits, const int * to
                break;
            }
            lock.unlock();
-            std::pair<double, float> v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
+            std::pair<double, float> v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
            kld_values[i]    = (float)v.first;
            p_diff_values[i] = v.second;
        }
@@ -388,7 +387,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
    const int n_batch = params.n_batch;

-    const int64_t n_vocab = llama_n_vocab(llama_get_model(ctx));
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));

    int count = 0;
    double nll = 0.0;
@@ -428,8 +427,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
            }

-            const auto batch_logits = llama_get_logits(ctx);
-            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+            const auto * batch_logits = llama_get_logits(ctx);
+            logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);

            if (j == 0) {
                tokens[batch_start] = token_org;
@@ -451,11 +450,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &

        // LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
        for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
-
            // Calculate probability of next token, given the previous ones.
            const std::vector<float> tok_logits(
-                logits.begin() + (j + 0) * n_vocab,
-                logits.begin() + (j + 1) * n_vocab);
+                logits.begin() + size_t(j + 0) * n_vocab,
+                logits.begin() + size_t(j + 1) * n_vocab);

            const float prob = softmax(tok_logits)[tokens[start + j + 1]];
            logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]];
@@ -527,7 +525,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
    const int n_batch = params.n_batch;

-    const int64_t n_vocab = llama_n_vocab(llama_get_model(ctx));
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));

    int count = 0;
    double nll = 0.0;
@@ -543,7 +541,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par

    std::vector<float> logits;
    if (num_batches > 1) {
-        logits.reserve((size_t)n_ctx * n_vocab);
+        logits.reserve(size_t(n_ctx) * n_vocab);
    }

    LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
@@ -625,7 +623,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par

            if (num_batches > 1 && n_outputs > 0) {
                const auto * batch_logits = llama_get_logits(ctx);
-                logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab);
+                logits.insert(logits.end(), batch_logits, batch_logits + size_t(n_outputs) * n_vocab);
            }
        }

@@ -666,7 +664,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        } else {
            double av = nll/count;
            double av2 = nll2/count - av*av;
-            if (av2 > 0) av2 = sqrt(av2/(count-1));
+            if (av2 > 0) {
+                av2 = sqrt(av2/(count-1));
+            }
            LOG("%8d  %.4lf  %4lf  %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
        }
    }
@@ -691,10 +691,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    return {tokens, ppl, logit_history, prob_history};
}

-static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int32_t n_batch, int32_t n_vocab) {
+static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int n_batch, int n_vocab) {
    int prev_outputs = 0;
-    for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
-        const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+    for (int i = 0; i < (int) batch.n_tokens; i += n_batch) {
+        const int n_tokens = std::min<int>(n_batch, batch.n_tokens - i);

        llama_batch batch_view = {
            n_tokens,
@@ -718,7 +718,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
            n_outputs += batch_view.logits[i] != 0;
        }

-        memcpy(batch_logits.data() + prev_outputs*n_vocab, llama_get_logits(ctx), n_outputs*n_vocab*sizeof(float));
+        memcpy(batch_logits.data() + size_t(prev_outputs)*n_vocab, llama_get_logits(ctx), size_t(n_outputs)*n_vocab*sizeof(float));

        prev_outputs += n_outputs;
    }
@@ -728,24 +728,28 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<

#define K_TOKEN_CHUNK 4

-static void compute_logprobs(const float * batch_logits, int64_t n_vocab, std::vector<std::thread>& workers,
+static void compute_logprobs(const float * batch_logits, int n_vocab, std::vector<std::thread>& workers,
        const std::vector<std::pair<size_t, llama_token>>& eval_pairs, std::vector<float>& eval_results) {
    if (eval_results.size() != eval_pairs.size()) {
        eval_results.resize(eval_pairs.size());
    }
-    if (eval_pairs.empty()) return;
+    if (eval_pairs.empty()) {
+        return;
+    }

    size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size());

    std::atomic<int> counter(0);
    auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
        float local_logprobs[K_TOKEN_CHUNK];
        while (true) {
-            size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
-            if (first >= eval_results.size()) break;
-            size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
+            const size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
+            if (first >= eval_results.size()) {
+                break;
+            }
+            const size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
            for (size_t i = first; i < last; ++i) {
-                auto logits = batch_logits + eval_pairs[i].first * n_vocab;
+                const auto * logits = batch_logits + eval_pairs[i].first * n_vocab;
                float max_logit = logits[0];
                for (int j = 1; j < n_vocab; ++j) {
                    max_logit = std::max(max_logit, logits[j]);
@@ -885,7 +889,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    const int n_ctx = llama_n_ctx(ctx);
    const int n_batch = params.n_batch;

-    const int64_t n_vocab = llama_n_vocab(llama_get_model(ctx));
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));

    const int max_tasks_per_batch = 32;
    const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
@@ -894,7 +898,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {

    std::vector<float> tok_logits(n_vocab);
    // TODO: this could be made smaller; it's currently the worst-case size
-    std::vector<float> batch_logits(n_vocab*n_ctx);
+    std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);

    std::vector<std::pair<size_t, llama_token>> eval_pairs;
    std::vector<float> eval_results;
@@ -981,7 +985,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        auto & hs_cur = hs_data[i];

        // get the logits of the last token of the common prefix
-        std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*hs_cur.i_logits, n_vocab*sizeof(float));
+        std::memcpy(tok_logits.data(), batch_logits.data() + hs_cur.i_logits*n_vocab, n_vocab*sizeof(float));

        const auto first_probs = softmax(tok_logits);

@@ -1167,7 +1171,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
    const int n_ctx = llama_n_ctx(ctx);
    const int n_batch = params.n_batch;

-    const int64_t n_vocab = llama_n_vocab(llama_get_model(ctx));
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));

    const int max_tasks_per_batch = 128;
    const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
@@ -1176,7 +1180,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {

    std::vector<float> tok_logits(n_vocab);
    // TODO: this could be made smaller; it's currently the worst-case size
-    std::vector<float> batch_logits(n_vocab*n_ctx);
+    std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);

    std::vector<std::pair<size_t, llama_token>> eval_pairs;
    std::vector<float> eval_results;
@@ -1519,15 +1523,15 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
    const int n_ctx = llama_n_ctx(ctx);
    const int n_batch = params.n_batch;

-    const int64_t n_vocab = llama_n_vocab(llama_get_model(ctx));
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));

    const int max_tasks_per_batch = 32;
    const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));

    llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);

    std::vector<float> tok_logits(n_vocab);
-    std::vector<float> batch_logits(n_vocab*n_ctx);
+    std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);

    std::vector<std::pair<size_t, llama_token>> eval_pairs;
    std::vector<float> eval_results;
@@ -1635,7 +1639,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        // LOG("\n common_prefix: %zu\n", cur_task.common_prefix);

        // get the logits of the last token of the common prefix
-        std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
+        std::memcpy(tok_logits.data(), batch_logits.data() + cur_task.i_logits*n_vocab, n_vocab*sizeof(float));

        const auto first_probs = softmax(tok_logits);

@@ -1717,19 +1721,19 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
                __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
    }

-    int64_t n_vocab;
-    int64_t n_chunk;
+    int n_vocab;
+    int n_chunk;
    in.read((char *)&n_vocab, sizeof(n_vocab));
    in.read((char *)&n_chunk, sizeof(n_chunk));
    if (in.fail()) {
        LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
        return;
    }
    if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
-        LOG_ERR("%s: inconsistent vocabulary (%" PRId64 " vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
+        LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
    }

-    std::vector<llama_token> tokens(n_ctx * n_chunk);
+    std::vector<llama_token> tokens(size_t(n_ctx) * n_chunk);
    if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
        LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
        return;
@@ -1746,7 +1750,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    std::vector<float> p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
    std::vector<float> logits;
    if (num_batches > 1) {
-        logits.reserve(n_ctx * n_vocab);
+        logits.reserve(size_t(n_ctx) * n_vocab);
    }

    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
@@ -1810,7 +1814,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {

            if (num_batches > 1) {
                const auto * batch_logits = llama_get_logits(ctx);
-                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+                logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);
            }
        }

@@ -1831,7 +1835,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {

        const int first = n_ctx/2;
        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
-        process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+        process_logits(n_vocab, all_logits + size_t(first)*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
                workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
        p_diff_ptr += n_ctx - 1 - first;
        kld_ptr    += n_ctx - 1 - first;
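
// ---------------------------------------------------------------------------
// Side note (not part of the patch): a minimal standalone sketch of why the
// added size_t casts matter. With n_vocab kept as a plain int, an index
// expression such as i*n_vocab is evaluated in 32-bit arithmetic and can
// overflow once the flat offset into the logits buffer exceeds INT_MAX;
// promoting one operand to size_t first keeps the multiply in 64-bit range
// (assuming a 64-bit size_t). The vocabulary size and token position below
// are hypothetical values, chosen only to trigger the overflow.

#include <cstdint>
#include <cstdio>

int main() {
    const int n_vocab = 128000; // hypothetical vocabulary size
    const int i       = 20000;  // hypothetical token position

    const int64_t full    = int64_t(i) * n_vocab;       // 2'560'000'000 - the intended offset
    const int32_t wrapped = static_cast<int32_t>(full); // what 32-bit index math would produce
    const size_t  fixed   = size_t(i) * n_vocab;        // the pattern used by the + lines above

    std::printf("intended offset : %lld\n", (long long) full);
    std::printf("wrapped  offset : %d\n",   wrapped);
    std::printf("size_t   offset : %zu\n",  fixed);
    return 0;
}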