
Commit 8fbd7e5

refactor(embeddings): get embeddings at the last token only to capture whole input and save time
-- noticed that the embedding was the same for the full sequence, but we were looping through all tokens, which was inefficient
1 parent 80a2f0d commit 8fbd7e5

File tree

1 file changed: +6 −1 lines changed


code/ac/llama/InstanceEmbedding.cpp

Lines changed: 6 additions & 1 deletion
@@ -100,10 +100,15 @@ void batchAddSeq(llama_batch& batch, std::span<const Token> tokens, llama_seq_id
         batch.pos     [batch.n_tokens] = llama_pos(i);
         batch.n_seq_id[batch.n_tokens] = 1;
         batch.seq_id  [batch.n_tokens][0] = seq_id;
-        batch.logits  [batch.n_tokens] = true;
+        batch.logits  [batch.n_tokens] = false;
 
         batch.n_tokens++;
     }
+
+    // We want to extract the embedding
+    // for the last token in the sequence because
+    // it captures all the tokens in the sequence.
+    batch.logits[batch.n_tokens - 1] = true;
 }

0 commit comments
