common : preallocate sampling token data vector (#8363)

kevmo314 · web-flow · commit 470939d483d1 · 2024-07-08T10:26:53.000+03:00
Calling `emplace_back` repeatedly is slower than preallocating the vector to the vocab size and assigning each element directly. Rudimentary profiling with `chrono` shows this block of code improving from ~500us/op to ~40us/op.

Overall, this slightly improves sampling performance, which has a more substantial impact on the `examples/lookahead` implementation -- I see a ~10% performance boost in lookahead inference.
diff --git a/common/sampling.cpp b/common/sampling.cpp
@@ -378,7 +378,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
     if (ctx_sampling->grammar != NULL && !apply_grammar) {
         GGML_ASSERT(original_logits != NULL);
         // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
-        *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
+        *original_logits = {logits, logits + n_vocab};
     }
 
     // apply params.logit_bias map
@@ -391,10 +391,10 @@ static llama_token_data_array llama_sampling_prepare_impl(
         llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
     }
 
-    cur.clear();
+    cur.resize(n_vocab);
 
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
     }
 
     llama_token_data_array cur_p = { cur.data(), cur.size(), false };

Original file line number	Diff line number	Diff line change
`@@ -378,7 +378,7 @@ static llama_token_data_array llama_sampling_prepare_impl(`
`378`	`378`	`if (ctx_sampling->grammar != NULL && !apply_grammar) {`
`379`	`379`	`GGML_ASSERT(original_logits != NULL);`
`380`	`380`	`// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.`
`381`		`- *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};`
	`381`	`+ *original_logits = {logits, logits + n_vocab};`
`382`	`382`	`}`
`383`	`383`
`384`	`384`	`// apply params.logit_bias map`
`@@ -391,10 +391,10 @@ static llama_token_data_array llama_sampling_prepare_impl(`
`391`	`391`	`llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);`
`392`	`392`	`}`
`393`	`393`
`394`		`- cur.clear();`
	`394`	`+ cur.resize(n_vocab);`
`395`	`395`
`396`	`396`	`for (llama_token token_id = 0; token_id < n_vocab; token_id++) {`
`397`		`- cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});`
	`397`	`+ cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};`
`398`	`398`	`}`
`399`	`399`
`400`	`400`	`llama_token_data_array cur_p = { cur.data(), cur.size(), false };`