
Commit ab351a8 (1 parent: 4d3866a)

Add duration metric for model forward pass in isolation

Signed-off-by: Nick Hill <[email protected]>

10 files changed (+69 lines, -39 lines)

proto/generate.proto

Lines changed: 4 additions & 0 deletions
@@ -157,8 +157,12 @@ message PrefillRequest {
 message GenerateResult {
     /// Next tokens
     repeated Token output_tokens = 1;
+    /// Request-specific errors
     repeated GenerateError errors = 2;
     uint64 batch_id = 3;
+
+    /// Time taken by model forward pass in nanoseconds
+    uint64 forward_time_ns = 4;
 }

 message PrefillResponse {
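
The new field is an ordinary proto3 scalar, so it rides along with the existing GenerateResult fields with no special handling. As a minimal sketch of the round trip, assuming the compiled protobuf module is importable as generate_pb2 (the real import path inside the server package may differ):

# Sketch only: the import path below is an assumption, not the package's actual layout.
from pb import generate_pb2  # hypothetical import path

result = generate_pb2.GenerateResult(
    batch_id=7,
    forward_time_ns=1_250_000,  # a 1.25 ms forward pass, expressed in nanoseconds
)

# The uint64 field serializes and parses like any other scalar.
decoded = generate_pb2.GenerateResult.FromString(result.SerializeToString())
assert decoded.forward_time_ns == 1_250_000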

router/client/src/client.rs

Lines changed: 19 additions & 6 deletions
@@ -15,6 +15,8 @@ pub struct Client {
     stub: TextGenerationServiceClient<Channel>,
 }

+pub type GenerateTokenResponse = (Vec<Token>, Vec<InputTokens>, Vec<GenerateError>, u64, Duration);
+
 impl Client {
     /// Returns a client connected to the given url
     pub async fn connect(uri: Uri) -> Result<Self> {
@@ -116,7 +118,7 @@ impl Client {
     #[instrument(skip(self))]
     pub async fn prefill(
         &mut self, batch: Batch, to_prune: Vec<CachedBatch>,
-    ) -> Result<(Vec<Token>, Vec<InputTokens>, Vec<GenerateError>, u64)> {
+    ) -> Result<GenerateTokenResponse> {
         let request = tonic::Request::new(PrefillRequest{
             batch: Some(batch), to_prune,
         });
@@ -129,17 +131,22 @@
         let result = response
             .result
             .ok_or_else(|| ClientError::Generation("Unexpected empty response".into()))?;
-        Ok((result.output_tokens, response.input_tokens, result.errors, result.batch_id))
+        Ok((
+            result.output_tokens,
+            response.input_tokens,
+            result.errors,
+            result.batch_id,
+            Duration::from_nanos(result.forward_time_ns),
+        ))
     }

     /// Generate one token for each request in the given cached batch(es)
     ///
     /// Returns next generated token of each request in the batches and id of the next cached batch
     #[instrument(skip(self))]
     pub async fn next_token(
-        &mut self,
-        batches: Vec<CachedBatch>,
-    ) -> Result<Option<(Vec<Token>, Vec<InputTokens>, Vec<GenerateError>, u64)>> {
+        &mut self, batches: Vec<CachedBatch>,
+    ) -> Result<Option<GenerateTokenResponse>> {
         let request = tonic::Request::new(
             NextTokenRequest { batches }
         );
@@ -149,6 +156,12 @@ impl Client {
             .instrument(info_span!("generate_with_cache"))
             .await?
             .into_inner();
-        Ok(response.result.map(|r| (r.output_tokens, vec![], r.errors, r.batch_id)))
+        Ok(response.result.map(|result| (
+            result.output_tokens,
+            vec![],
+            result.errors,
+            result.batch_id,
+            Duration::from_nanos(result.forward_time_ns),
+        )))
     }
 }

router/client/src/lib.rs

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ pub use pb::generate::v1::{
 };
 pub use pb::generate::v1::next_token_chooser_parameters::LengthPenalty;
 pub use sharded_client::ShardedClient;
+pub use client::GenerateTokenResponse;
 use thiserror::Error;
 use tonic::transport;
 use tonic::Status;

router/client/src/sharded_client.rs

Lines changed: 8 additions & 10 deletions
@@ -1,11 +1,12 @@
 /// Multi shard Client
-use crate::{ClientError, GenerateError, Result};
-use crate::{Batch, Client, HealthResponse, Token};
+use crate::{ClientError, Result};
+use crate::{Batch, Client, HealthResponse};
 use futures::future::join_all;
 use tokio::runtime::Handle;
 use tokio::sync::{broadcast, mpsc};
 use tonic::transport::Uri;
-use crate::pb::generate::v1::{CachedBatch, InputTokens};
+use crate::client::GenerateTokenResponse;
+use crate::pb::generate::v1::CachedBatch;
 use crate::pb::generate::v1::model_info_response::ModelType;
 use crate::sharded_client::Request::{NextToken, Prefill};

@@ -19,9 +20,7 @@ enum Request {
 #[derive(Debug)]
 pub struct ShardedClient {
     clients: Vec<Client>,
-    sender: broadcast::Sender<(Request, mpsc::Sender<
-        Result<Option<(Vec<Token>, Vec<InputTokens>, Vec<GenerateError>, u64)>>
-    >)>,
+    sender: broadcast::Sender<(Request, mpsc::Sender<Result<Option<GenerateTokenResponse>>>)>,
     handle: Handle,
 }

@@ -94,7 +93,7 @@
     /// Optionally prunes existing batches first to maximize available memory
     pub async fn prefill(
         &mut self, batch: Batch, to_prune: Vec<CachedBatch>,
-    ) -> Result<Option<(Vec<Token>, Vec<InputTokens>, Vec<GenerateError>, u64)>> {
+    ) -> Result<Option<GenerateTokenResponse>> {
         if batch.requests.is_empty() {
             return Ok(None);
         }
@@ -108,9 +107,8 @@
     ///
     /// Returns next generated token of each request in the batches and id of the next cached batch
     pub async fn next_token(
-        &mut self,
-        batches: Vec<CachedBatch>,
-    ) -> Result<Option<(Vec<Token>, Vec<InputTokens>, Vec<GenerateError>, u64)>> {
+        &mut self, batches: Vec<CachedBatch>,
+    ) -> Result<Option<GenerateTokenResponse>> {
         let (tx, mut rx) = mpsc::channel(1);
         self.sender.send((NextToken(batches), tx))
             .map_err(|e| ClientError::Generation(e.to_string()))?;

router/src/batcher.rs

Lines changed: 9 additions & 5 deletions
@@ -13,7 +13,7 @@ use std::task::{Context, Poll};
 use futures::{FutureExt, pin_mut, TryFutureExt};
 use futures::future::Map;
 use nohash_hasher::IntMap;
-use text_generation_client::{ClientError, Token, ShardedClient, CachedBatch, RequestsStatus, InputTokens, GenerateError, Batch};
+use text_generation_client::{ClientError, Token, ShardedClient, CachedBatch, RequestsStatus, InputTokens, GenerateError, Batch, GenerateTokenResponse};
 use thiserror::Error;
 use tokio::select;

@@ -547,9 +547,7 @@
     /// Wrap a future inside a match statement to handle errors and send the response to the Batcher
     async fn _wrap_future<B: BatchType>(
         &mut self,
-        future: impl Future<Output = Result<
-            Option<(Vec<Token>, Vec<InputTokens>, Vec<GenerateError>, u64)>, ClientError
-        >>,
+        future: impl Future<Output = Result<Option<GenerateTokenResponse>, ClientError>>,
         method: &'static str,
         start_time: Instant,
         // First request id in this batch if it doesn't comprise all current entries
@@ -573,7 +571,7 @@

         match result {
             Ok(
-                Some((generated_tokens, input_tokens, errors, next_batch_id))
+                Some((generated_tokens, input_tokens, errors, next_batch_id, forward_duration))
             ) => {
                 self.process_input_tokens(input_tokens);
                 let completed_request_ids = self.process_next_tokens(
@@ -587,6 +585,12 @@
                     "method" => method,
                     "makeup" => "single_only", // later will possibly be beam_only or mixed
                 );
+                metrics::histogram!(
+                    "tgi_batch_inference_forward_duration",
+                    forward_duration,
+                    "method" => method,
+                    "makeup" => "single_only", // later will possibly be beam_only or mixed
+                );
                 // Probably don't need this additional counter because the duration histogram
                 // records a total count
                 metrics::increment_counter!("tgi_batch_inference_success", "method" => method);
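
The forward-pass time is recorded as its own histogram, next to the existing inference-duration histogram emitted just above it, with the same method and makeup labels. The router does this in Rust with the metrics crate, as shown above; purely as an illustration of the same idea in Python, a prometheus_client equivalent might look like this (metric name and labels taken from the diff, everything else assumed):

from prometheus_client import Histogram

# Illustration only: the real metric is emitted by the Rust router, not by Python code.
FORWARD_DURATION = Histogram(
    "tgi_batch_inference_forward_duration",
    "Time spent in the model forward pass, in seconds",
    ["method", "makeup"],
)

def record_forward(method: str, forward_time_ns: int) -> None:
    # Convert the nanosecond figure reported by the model server to seconds.
    FORWARD_DURATION.labels(method=method, makeup="single_only").observe(forward_time_ns / 1e9)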

server/text_generation_server/models/causal_lm.py

Lines changed: 9 additions & 8 deletions
@@ -1,4 +1,5 @@
 import logging
+import time
 from operator import itemgetter

 import torch
@@ -467,7 +468,7 @@ def __init__(

         # Perform a forward pass to determine the ordering of past key attention tensor dimensions
         one_token = torch.tensor([[1]], device=inference_engine.get_device())
-        _, past_key_values = self.forward(input_ids=one_token, attention_mask=one_token)
+        _, past_key_values, _ = self.forward(input_ids=one_token, attention_mask=one_token)
         key_past, value_past = past_key_values[0]
         keys_head_dim_last = key_past.shape[-1] == value_past.shape[-1]
         self.batch_type = CausalLMBatch if keys_head_dim_last else KeysDimTransposedCausalLMBatch
@@ -487,7 +488,7 @@ def forward(
         position_ids: Optional[torch.Tensor] = None,
         past_key_values: Optional = None,
         inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]:
+    ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]], int]:
         model_inputs = self.model.prepare_inputs_for_generation(
             input_ids, past_key_values,
             attention_mask=attention_mask,
@@ -506,18 +507,18 @@
             model_inputs["inputs_embeds"] = inputs_embeds

         # Model Forward
+        start_time = time.time_ns()
         outputs = self.model.forward(**model_inputs)
-        return (
-            outputs.logits, outputs.past_key_values,
-        )
+        took_ns = time.time_ns() - start_time
+        return outputs.logits, outputs.past_key_values, took_ns

     def generate_token(
         self, batch: CausalLMBatch, first: bool = False, for_concat: bool = False,
-    ) -> Tuple[List[TokenInfo], Optional[List[InputTokens]], List[GenerateError]]:
+    ) -> Tuple[List[TokenInfo], Optional[List[InputTokens]], List[GenerateError], int]:
         # slice the attention mask to the correct shape
         attention_mask = batch.attention_mask[:, : -batch.padding_right_offset]

-        logits, past = self.forward(
+        logits, past, forward_time_ns = self.forward(
            batch.input_ids, attention_mask, batch.position_ids, batch.past_key_values, batch.inputs_embeds,
         )

@@ -605,7 +606,7 @@ def generate_token(
         batch.max_sequence_length += 1
         batch.padding_right_offset -= 1

-        return generated_tokens, input_token_infos, decode_errors
+        return generated_tokens, input_token_infos, decode_errors, forward_time_ns


 class KeysDimTransposedCausalLMBatch(CausalLMBatch):
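
The measurement deliberately wraps only the call into the underlying model, so input preparation, token selection, and batch bookkeeping are excluded, and the result is an integer nanosecond count that maps directly onto the forward_time_ns proto field. A minimal standalone sketch of the same pattern, with a hypothetical timed_forward helper standing in for the inlined timing above:

import time
from typing import Any, Callable, Tuple

def timed_forward(model_forward: Callable[..., Any], *args: Any, **kwargs: Any) -> Tuple[Any, int]:
    """Run a forward pass and return (output, wall-clock duration in nanoseconds)."""
    start = time.time_ns()
    output = model_forward(*args, **kwargs)
    # Integer nanoseconds, the same unit carried by forward_time_ns.
    return output, time.time_ns() - start

Plain wall-clock timing around the call keeps the overhead negligible; it captures whatever work completes (or synchronizes) inside the call itself.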

server/text_generation_server/models/flash_causal_lm.py

Lines changed: 5 additions & 2 deletions
@@ -1,4 +1,5 @@
 import logging
+import time
 from operator import itemgetter

 import torch
@@ -319,7 +320,7 @@ def batch_type(self) -> Type[FlashCausalLMBatch]:

     def generate_token(
         self, batch: FlashCausalLMBatch, first: bool = False, for_concat: bool = False,
-    ) -> Tuple[List[TokenInfo], Optional[List[InputTokens]], List[GenerateError]]:
+    ) -> Tuple[List[TokenInfo], Optional[List[InputTokens]], List[GenerateError], int]:

         batch_size = len(batch)
         past_key_values = batch.past_key_values if first or batch_size > 1 \
@@ -333,6 +334,7 @@
         else:
             prealloc_length = None

+        start_time = time.time_ns()
         out, present = self.model.forward(
             batch.input_ids,
             batch.position_ids,
@@ -342,6 +344,7 @@
             past_key_values,
             prealloc_length,
         )
+        forward_time_ns = time.time_ns() - start_time

         # Update present
         present_pad = self.present_pad
@@ -369,7 +372,7 @@
         batch.cu_seqlens.add_(batch.cu_seqlens_q)
         batch.max_seqlen += 1

-        return generated_tokens, input_token_infos, decode_errors
+        return generated_tokens, input_token_infos, decode_errors, forward_time_ns

     def _process_prefill(
         self, batch: FlashCausalLMBatch, out,

server/text_generation_server/models/model.py

Lines changed: 1 addition & 1 deletion
@@ -123,7 +123,7 @@ def batch_type(self) -> Type[B]:
     @abstractmethod
     def generate_token(
         self, batch: B, first: bool = False, for_concat: bool = False,
-    ) -> Tuple[List[TokenInfo], Optional[List[InputTokens]], List[GenerateError]]:
+    ) -> Tuple[List[TokenInfo], Optional[List[InputTokens]], List[GenerateError], int]:
         raise NotImplementedError

     @staticmethod

server/text_generation_server/models/seq2seq_lm.py

Lines changed: 9 additions & 5 deletions
@@ -1,4 +1,5 @@
 import logging
+import time
 from operator import itemgetter

 import torch
@@ -492,7 +493,7 @@ def __init__(

         # Perform a forward pass to determine the ordering of past key attention tensor dimensions
         one_token = torch.tensor([[bos_token_id]], device=inference_engine.get_device())
-        _, _, past_key_values = self.forward(
+        _, _, past_key_values, _ = self.forward(
             input_ids=one_token,
             attention_mask=torch.ones_like(one_token),
             decoder_input_ids=one_token,
@@ -523,12 +524,14 @@ def forward(
         torch.Tensor,
         torch.Tensor,
         List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]],
+        int,
     ]:
         if inputs_embeds is not None:
             input_ids = None
         if decoder_inputs_embeds is not None:
             decoder_input_ids = None

+        start_time = time.time_ns()
         outputs = self.model.forward(
             input_ids=input_ids,
             inputs_embeds=inputs_embeds,
@@ -541,21 +544,22 @@
             use_cache=True,
             return_dict=True,
         )
+        took_ns = time.time_ns() - start_time
         return (
-            outputs.logits, outputs.encoder_last_hidden_state, outputs.past_key_values,
+            outputs.logits, outputs.encoder_last_hidden_state, outputs.past_key_values, took_ns
         )

     def generate_token(
         self, batch: Seq2SeqLMBatch, first: bool = False, for_concat: bool = False,
-    ) -> Tuple[List[TokenInfo], Optional[List[InputTokens]], List[GenerateError]]:
+    ) -> Tuple[List[TokenInfo], Optional[List[InputTokens]], List[GenerateError], int]:
         # slice to the correct shape
         decoder_attention_mask = None if batch.decoder_attention_mask is None \
             else batch.decoder_attention_mask[:, : -batch.padding_right_offset]

         encoder_outputs = None if batch.encoder_last_hidden_state is None \
             else BaseModelOutput(last_hidden_state=batch.encoder_last_hidden_state)

-        logits, encoder_last_hidden_state, past = self.forward(
+        logits, encoder_last_hidden_state, past, forward_time_ns = self.forward(
             batch.input_ids,
             batch.attention_mask,
             batch.decoder_input_ids,
@@ -647,7 +651,7 @@ def generate_token(
         batch.max_decoder_input_length += 1
         batch.padding_right_offset -= 1

-        return generated_tokens, input_token_infos, decode_errors
+        return generated_tokens, input_token_infos, decode_errors, forward_time_ns


 class KeysDimTransposedSeq2SeqLMBatch(Seq2SeqLMBatch):

server/text_generation_server/server.py

Lines changed: 4 additions & 2 deletions
@@ -120,7 +120,7 @@ async def Prefill(self, request: generate_pb2.PrefillRequest, context) -> genera
         if batch is not None:
             for_concat = len(self.cache) > 0
             # Prefill and generate first token
-            output_tokens, input_token_info, decode_errors = self.model.generate_token(
+            output_tokens, input_token_info, decode_errors, forward_time_ns = self.model.generate_token(
                 batch, first=True, for_concat=for_concat,
             )
             if not is_healthcheck:
@@ -140,6 +140,7 @@ async def Prefill(self, request: generate_pb2.PrefillRequest, context) -> genera
                 ],
                 errors=[err.to_pb() for err in errors] if errors else None,
                 batch_id=batch_id,
+                forward_time_ns=forward_time_ns,
             ),
             input_tokens=[
                 input_tokens.to_pb() for input_tokens in input_token_info
@@ -175,7 +176,7 @@ async def NextToken(self, request: generate_pb2.NextTokenRequest, context) -> ge
         # Ensure batches are garbage-collected post-concatenation
         del batches

-        output_tokens, _, errors = self.model.generate_token(batch)
+        output_tokens, _, errors, forward_time_ns = self.model.generate_token(batch)
         self.cache.set(batch)

         return generate_pb2.NextTokenResponse(
@@ -185,6 +186,7 @@ async def NextToken(self, request: generate_pb2.NextTokenRequest, context) -> ge
                 ],
                 errors=[err.to_pb() for err in errors] if errors else None,
                 batch_id=batch.get_id(),
+                forward_time_ns=forward_time_ns,
             )
         )
