
Commit f104678

common, examples, llama : optimize using reserve if possible
1 parent 0d41771 commit f104678
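
The change is mechanical: wherever a container is filled in a loop whose element count is known (or can be bounded) up front, a reserve() call is added so the backing storage is allocated once instead of being regrown by repeated push_back/emplace_back. A minimal standalone sketch of the pattern, not code from the commit (struct item and n_items are illustrative names only):

```cpp
#include <cstdio>
#include <vector>

// Illustrative stand-in for the per-element payloads used in the commit
// (e.g. llama_token_data); not an identifier from the llama.cpp sources.
struct item {
    int   id;
    float score;
};

int main() {
    const int n_items = 100000;

    std::vector<item> items;
    items.clear();
    items.reserve(n_items); // one allocation up front; size() is still 0

    for (int i = 0; i < n_items; i++) {
        // without the reserve() above, the vector would go through several
        // geometric reallocations (plus element moves) as it grows
        items.emplace_back(item{i, 0.0f});
    }

    std::printf("size = %zu, capacity = %zu\n", items.size(), items.capacity());
    return 0;
}
```

reserve() only raises capacity(); it neither changes size() nor constructs elements, so the loops that follow it in the diffs below are untouched.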


8 files changed, +25 -2 lines changed


common/sampling.cpp

Lines changed: 2 additions & 1 deletion
@@ -200,7 +200,8 @@ static llama_token llama_sampling_sample_impl(
     }
 
     cur.clear();
-
+    cur.reserve(n_vocab);
+
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
         cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
     }

common/train.cpp

Lines changed: 3 additions & 1 deletion
@@ -883,9 +883,11 @@ size_t tokenize_file(
 
         // generate sample starts at all token positions
         out_samples_begin.clear();
+        size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0;
+        out_samples_begin.reserve(end);
         out_samples_begin.push_back(0);
+        out_samples_size.reserve(end);
         out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size()));
-        size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0;
         for (size_t sample_begin = 1; sample_begin < end; ++sample_begin) {
             out_samples_begin.push_back(sample_begin);
             out_samples_size.push_back(context_length);

examples/llava/clip.cpp

Lines changed: 2 additions & 0 deletions
@@ -1473,6 +1473,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
     std::vector<clip_image_u8*> patches;
     int width = image.nx;
    int height = image.ny;
+    patches.reserve((height / patch_size) * (width / patch_size));
     for (int i = 0; i < height; i += patch_size) {
         for (int j = 0; j < width; j += patch_size) {
             clip_image_u8 *patch = clip_image_u8_init();
@@ -1542,6 +1543,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
     if (params.image_grid_pinpoints[0] != 0) {
         // "spatial_unpad" with "anyres" processing for llava-1.6
         std::vector<std::pair<int, int>> possible_resolutions;
+        possible_resolutions.reserve(16);
         for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
             possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
         }

examples/llava/llava.cpp

Lines changed: 1 addition & 0 deletions
@@ -262,6 +262,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         const int32_t * image_grid = clip_image_grid(ctx_clip);
 
         std::vector<std::pair<int, int>> grid_pinpoints;
+        grid_pinpoints.reserve(16);
         for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
             grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
         }

examples/lookup/lookup.cpp

Lines changed: 1 addition & 0 deletions
@@ -181,6 +181,7 @@ int main(int argc, char ** argv){
             const int startIdx = i + ngram_size;
             const int endIdx = startIdx + n_draft;
             if (endIdx < inp_size) {
+                draft.reserve(endIdx - startIdx);
                 for (int j = startIdx; j < endIdx; ++j) {
                     LOG(" - draft candidate %d: %d\n", j, inp[j]);
                     draft.push_back(inp[j]);

examples/perplexity/perplexity.cpp

Lines changed: 8 additions & 0 deletions
@@ -876,10 +876,12 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         // Compute log-probs in parallel
         // First we collect all tasks
         eval_pairs.clear();
+        eval_pairs.reserve((i1 - i0) * 4);
         for (size_t i = i0; i < i1; ++i) {
             auto & hs_cur = hs_data[i];
             size_t li = hs_cur.common_prefix;
             for (int s = 0; s < 4; ++s) {
+                eval_pairs.reserve((hs_cur.seq_tokens[s].size() - 1) - hs_cur.common_prefix);
                 for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
                     eval_pairs.emplace_back(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]);
                 }
@@ -1148,6 +1150,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
         }
 
         eval_pairs.clear();
+        eval_pairs.reserve((i1 - i0));
         for (size_t i = i0; i < i1; ++i) {
             auto & task = data[i];
 
@@ -1158,12 +1161,14 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix;
             const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
             size_t li = n_base1 - 1;
+            eval_pairs.reserve((task.seq_tokens[0].size() - 1 - last_1st) - (n_base1 - 1));
             for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
                 eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[0][j+1]);
             }
             const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
             const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
             li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1;
+            eval_pairs.reserve((task.seq_tokens[1].size() - 1 - last_2nd) - (n_base2 - 1));
             for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
                 eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[1][j+1]);
             }
@@ -1519,10 +1524,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         // Compute log-probs in parallel
         // First we collect all tasks
         eval_pairs.clear();
+        eval_pairs.reserve(i1 - i0);
         for (size_t i = i0; i < i1; ++i) {
             auto& cur_task = tasks[i];
             size_t li = cur_task.common_prefix;
+            eval_pairs.reserve(cur_task.seq_tokens.size());
             for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
+                eval_pairs.reserve((cur_task.seq_tokens[s].size() - 1) - cur_task.common_prefix);
                 for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
                     eval_pairs.emplace_back(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]);
                 }
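
One thing to keep in mind when reading the perplexity.cpp hunks above: std::vector::reserve() requests a total capacity, not extra headroom on top of the current contents, and a request smaller than the current capacity is a no-op that never discards already-appended elements. A small standalone check of that behaviour (plain C++, unrelated to the llama.cpp types):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
    std::vector<int> v;

    v.reserve(8); // capacity() is now at least 8
    const std::size_t cap = v.capacity();

    v.push_back(1);
    v.push_back(2);

    v.reserve(4); // no-op: the request is below the current capacity
    assert(v.capacity() == cap);
    assert(v.size() == 2); // reserve() never changes size() or the contents

    return 0;
}
```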

examples/quantize/quantize.cpp

Lines changed: 1 addition & 0 deletions
@@ -49,6 +49,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
     std::string ftype_str;
 
+    ftype_str.reserve(ftype_str_in.size());
     for (auto ch : ftype_str_in) {
         ftype_str.push_back(std::toupper(ch));
     }

llama.cpp

Lines changed: 7 additions & 0 deletions
@@ -1107,6 +1107,7 @@ struct llama_mmap {
 
         // update the list of mapped fragments to avoid unmapping the same range again in the destructor
         std::vector<std::pair<size_t, size_t>> new_mapped_fragments;
+        new_mapped_fragments.reserve(mapped_fragments.size());
         for (const auto & frag : mapped_fragments) {
             if (frag.first < first && frag.second > last) {
                 // the range is in the middle of the fragment, split it
@@ -7908,6 +7909,7 @@ struct llm_tokenizer_spm {
         // split string into utf8 chars
         int index = 0;
         size_t offs = 0;
+        symbols.reserve(text.size());
         while (offs < text.size()) {
             llm_symbol sym;
             size_t len = utf8_len(text[offs]);
@@ -8065,6 +8067,7 @@ struct llm_tokenizer_bpe {
             int index = 0;
             size_t offset = 0;
 
+            symbols.reserve(word.size());
             while (offset < word.size()) {
                 llm_symbol sym;
                 size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
@@ -8138,6 +8141,7 @@ struct llm_tokenizer_bpe {
             const auto token = vocab.token_to_id.find(str);
 
             if (token == vocab.token_to_id.end()) {
+                output.reserve(str.end() - str.begin());
                 for (auto j = str.begin(); j != str.end(); ++j) {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
@@ -8309,6 +8313,7 @@ struct llm_tokenizer_bpe {
             }
         }
 
+        bpe_encoded_words.reserve(bpe_words.size());
         for (std::string & word : bpe_words) {
             std::string encoded_token = "";
             for (char & c : word) {
@@ -10194,6 +10199,7 @@ static void llama_convert_tensor_internal(
     size_t in_buff_offs = 0;
     size_t out_buff_offs = 0;
 
+    workers.reserve(nthread);
     for (int tnum = 0; tnum < nthread; tnum++) {
         size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
         size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -10697,6 +10703,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                         first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
                 }
             };
+            workers.reserve(nthread_use - 1);
             for (int it = 0; it < nthread_use - 1; ++it) {
                 workers.emplace_back(compute);
             }
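
The two workers.reserve(...) calls in the conversion and quantization paths pre-size a std::vector<std::thread>; besides skipping reallocations, this keeps the already-launched std::thread handles from being moved while later workers are still being created. A minimal sketch of the same pattern, independent of the llama.cpp code (nthread and the lambda body are placeholders):

```cpp
#include <thread>
#include <vector>

int main() {
    const int nthread = 4;

    std::vector<std::thread> workers;
    workers.reserve(nthread); // allocate space for all thread handles before launching any

    for (int t = 0; t < nthread; t++) {
        workers.emplace_back([t]() {
            // per-thread work for worker t would go here
            (void) t; // silence unused-capture warnings in this stub
        });
    }

    for (auto & w : workers) {
        w.join();
    }
    return 0;
}
```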
