@@ -119,7 +119,6 @@ class ModelRunner {
119119 size_t total_num_tokens = 0 , total_num_blocks = 0 ;
120120 size_t max_context_len_val = 0 ;
121121 size_t hidden_size = 0 ;
122- size_t num_generated_ids = 0 ;
123122 OPENVINO_ASSERT (sequence_groups.size () > 0 );
124123 auto sequence_group_type = sequence_groups[0 ]->get_sequence_group_type ();
125124 if (sequence_group_type == SequenceGroupType::EMBEDDINGS) {
@@ -135,9 +134,6 @@ class ModelRunner {
135134 total_num_tokens += sequence_group->get_num_scheduled_tokens () * num_sequences;
136135 total_num_blocks += sequence_group->get_num_blocks () * num_sequences;
137136 max_context_len_val = std::max (max_context_len_val, sequence_group->get_context_len ());
138- for (auto seq: sequence_group->get_running_sequences ()) {
139- num_generated_ids += seq->get_generated_len ();
140- }
141137 }
142138
143139 ov::Tensor
@@ -163,27 +159,6 @@ class ModelRunner {
163159 if (sequence_group_type == SequenceGroupType::EMBEDDINGS) {
164160 OPENVINO_ASSERT (m_embedding.get_request (), " Got sequence group with embeddings, but embeddings model wasn't set." );
165161 inputs_embeds_data = inputs_embeds.data <float >();
166-
167- ov::Tensor generated_ids = ov::Tensor (ov::element::i64 , {1 , num_generated_ids});
168- int64_t *generated_ids_data = generated_ids.data <int64_t >();
169- size_t pos = 0 ;
170- for (size_t i = 0 ; i < num_sequence_groups; ++i) {
171- size_t seq_group_id = scheduler_output.m_scheduled_sequence_groups_ids [i];
172- SequenceGroup::CPtr sequence_group = sequence_groups[seq_group_id];
173- for (auto seq: sequence_group->get_running_sequences ()) {
174- auto generated_ids = seq->get_generated_ids ();
175- for (size_t token_idx = 0 ; token_idx < generated_ids.size (); token_idx++) {
176- generated_ids_data[pos] = generated_ids[token_idx];
177- pos++;
178- }
179- }
180- }
181- if (pos > 0 ) {
182- // TODO: Compute embeddings only for last generated token, while previously generated embeddings save in SequenceGroup
183- generated_ids_embeds = m_embedding.infer (generated_ids);
184- generated_ids_embeds_data = generated_ids_embeds.data <float >();
185- }
186-
187162 } else if (sequence_group_type == SequenceGroupType::TOKENS) {
188163 input_ids_data = input_ids.data <int64_t >();
189164 }
@@ -234,8 +209,8 @@ class ModelRunner {
234209 sequence_group->get_prompt_ids ()[position_id] :
235210 sequence->get_generated_ids ()[position_id - prompt_len];
236211 } else if (sequence_group_type == SequenceGroupType::EMBEDDINGS) {
237- auto embeds_pos = position_id < prompt_len ? 0 : hidden_size * (position_id - prompt_len );
238- const float * src = position_id < prompt_len ? sequence_group->get_input_embeds ()[position_id].data () : generated_ids_embeds_data + embeds_pos ;
212+ const auto & generated_embeds = sequence-> get_generated_ids_embeds ( );
213+ const float * src = position_id < prompt_len ? sequence_group->get_input_embeds ()[position_id].data () : generated_embeds[position_id - prompt_len]. data () ;
239214 std::copy_n (src, hidden_size, inputs_embeds_data + token_id * hidden_size);
240215 } else {
241216 OPENVINO_THROW (" Unknown model inputs type." );
@@ -271,7 +246,6 @@ class ModelRunner {
271246 input_ids_data += num_scheduled_tokens;
272247 } else if (sequence_group_type == SequenceGroupType::EMBEDDINGS) {
273248 inputs_embeds_data += num_scheduled_tokens * hidden_size;
274- generated_ids_embeds_data += sequence->get_generated_len () * hidden_size;
275249 }
276250
277251 position_ids_data += num_scheduled_tokens;
@@ -337,6 +311,63 @@ class ModelRunner {
337311 return m_request.get_tensor (" logits" );
338312 }
339313
314+ void append_embeddings (const std::vector<SequenceGroup::Ptr> & sequence_groups, const Scheduler::Output& scheduler_output) {
315+ size_t num_sequence_groups = scheduler_output.m_scheduled_sequence_groups_ids .size ();
316+ size_t num_generated_ids_without_embeddings = 0 ;
317+ OPENVINO_ASSERT (sequence_groups.size () > 0 );
318+
319+ // compute aggregated values
320+ for (size_t i = 0 ; i < num_sequence_groups; ++i) {
321+ size_t seq_group_id = scheduler_output.m_scheduled_sequence_groups_ids [i];
322+ SequenceGroup::CPtr sequence_group = sequence_groups[seq_group_id];
323+ size_t num_sequences = sequence_group->num_running_seqs ();
324+ OPENVINO_ASSERT (sequence_group->get_sequence_group_type () == SequenceGroupType::EMBEDDINGS);
325+ for (auto seq: sequence_group->get_running_sequences ()) {
326+ num_generated_ids_without_embeddings += seq->get_generated_len () - seq->get_generated_ids_embeds ().size ();
327+ }
328+ }
329+ size_t hidden_size = sequence_groups[0 ]->get_hidden_size ();
330+
331+ ov::Tensor generated_ids_embeds;
332+ float *generated_ids_embeds_data = nullptr ;
333+
334+ OPENVINO_ASSERT (m_embedding.get_request (), " Got sequence group with embeddings, but embeddings model wasn't set." );
335+
336+ ov::Tensor generated_ids = ov::Tensor (ov::element::i64 , {1 , num_generated_ids_without_embeddings});
337+ int64_t *generated_ids_data = generated_ids.data <int64_t >();
338+ size_t pos = 0 ;
339+ for (size_t i = 0 ; i < num_sequence_groups; ++i) {
340+ size_t seq_group_id = scheduler_output.m_scheduled_sequence_groups_ids [i];
341+ SequenceGroup::CPtr sequence_group = sequence_groups[seq_group_id];
342+ for (auto seq: sequence_group->get_running_sequences ()) {
343+ const auto & generated_ids = seq->get_generated_ids ();
344+ for (size_t token_idx = seq->get_generated_ids_embeds ().size (); token_idx < generated_ids.size (); token_idx++) {
345+ generated_ids_data[pos] = generated_ids[token_idx];
346+ pos++;
347+ }
348+ }
349+ }
350+ if (pos > 0 ) {
351+ generated_ids_embeds = m_embedding.infer (generated_ids);
352+ generated_ids_embeds_data = generated_ids_embeds.data <float >();
353+
354+ for (size_t i = 0 ; i < num_sequence_groups; ++i) {
355+ size_t seq_group_id = scheduler_output.m_scheduled_sequence_groups_ids [i];
356+ size_t embeds_pos = 0 ;
357+ SequenceGroup::Ptr sequence_group = sequence_groups[seq_group_id];
358+ for (auto seq: sequence_group->get_running_sequences ()) {
359+ auto generated_ids = seq->get_generated_ids ();
360+ size_t new_embeds_count = seq->get_generated_len () - seq->get_generated_ids_embeds ().size ();
361+ ov::Coordinate start{0 , embeds_pos, 0 };
362+ ov::Coordinate end{1 , embeds_pos + new_embeds_count, hidden_size};
363+ ov::Tensor embedding (generated_ids_embeds, start, end);
364+ seq->append_generated_ids_embeds (embedding);
365+ embeds_pos += new_embeds_count;
366+ }
367+ }
368+ }
369+ }
370+
340371private:
341372 void _fill_indices_from_block_tables (
342373 const std::vector<std::string>& dst_tensor_names,
0 commit comments