
Commit 62c5000

maxdebayser, joerunde, and njhill committed
Add missing grpc metrics
* Add grpc interceptor to catch OOM exceptions and set the grpc status code to RESOURCE_EXHAUSTED (see the sketch below)
* test
* test 2
* 🔊 add batch concat metric
  Signed-off-by: Joe Runde <[email protected]>
* Make code more idiomatic
* remove some lines of code
* Restore original shape of the code
* Remove remnant of an obsolete metric
* 🎨 record OOMs the NickHill way
  Signed-off-by: Joe Runde <[email protected]>
* ♻️ revert all changes to client.rs
  Signed-off-by: Joe Runde <[email protected]>
* ♻️ move context.abort to decorator
  Signed-off-by: Joe Runde <[email protected]>
* 👷 put python-tests in CI
  Signed-off-by: Joe Runde <[email protected]>
* Revert "👷 put python-tests in CI"
  This reverts commit a4fec4357e565282e080840d2f5a2cf02fdaa5c0.
* 🐛 fix batch error metrics
  Signed-off-by: Joe Runde <[email protected]>
* ✨ map unavailable to connection error
  Signed-off-by: Joe Runde <[email protected]>
* 📝 Update metrics in README
  Signed-off-by: Joe Runde <[email protected]>
* 🔥 remove context aborts on Abort or generic Exception
  Signed-off-by: Joe Runde <[email protected]>
* 🦺 more robust indexing
  Co-authored-by: Nick Hill <[email protected]>

Co-authored-by: Joe Runde <[email protected]>
Co-authored-by: Nick Hill <[email protected]>
1 parent 14d0ebd commit 62c5000
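The gRPC-side change described in the first bullet (catching GPU OOM and reporting it as RESOURCE_EXHAUSTED) lives in the Python server, which is not among the diffs shown below. The following is only a minimal sketch of that approach, assuming a grpcio servicer whose handler methods receive a `ServicerContext` and a PyTorch backend that raises `torch.cuda.OutOfMemoryError`; the decorator name and wiring are illustrative, not the code from this commit.

```python
# Hypothetical sketch, not the file changed by this commit.
import functools

import grpc
import torch


def map_oom_to_resource_exhausted(handler):
    """Abort the RPC with RESOURCE_EXHAUSTED when the model runs out of GPU memory."""
    @functools.wraps(handler)
    def wrapper(self, request, context):
        try:
            return handler(self, request, context)
        except torch.cuda.OutOfMemoryError as err:
            # context.abort raises, terminating the RPC with the given status code.
            context.abort(grpc.StatusCode.RESOURCE_EXHAUSTED, str(err))

    return wrapper
```

On the router side, the `router/client/src/lib.rs` diff below picks this up by mapping `Code::ResourceExhausted` to the new `ClientError::OutOfMemory()` variant.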

File tree

5 files changed: +66 −50 lines changed


README.md

Lines changed: 38 additions & 37 deletions

@@ -119,40 +119,41 @@ Prometheus metrics are exposed on the same port as the health probe endpoint (de

They are all prefixed with `tgi_`. Descriptions will be added to the table below soon.

The 37 removed lines were the previous version of this table; it differed only in column widths, in lacking the `tgi_batch_concatenation_count` row, and in the `reason` label now present on `tgi_batch_inference_failure`. Updated table:

| Metric | Kind | Labels | Description |
|---|---|---|---|
| `tgi_request_count` | `counter` | kind = "single" or "batch" or "stream" | Count of generate requests (batch of n counts as 1) |
| `tgi_request_input_count` | `counter` | | Count of generate request inputs (batch of n counts as n) |
| `tgi_request_failure` | `counter` | err | Count of failed requests, segmented by error type |
| `tgi_request_success` | `counter` | stop_reason, kind = "single" or "batch" or "stream" | Count of successful requests |
| `tgi_request_max_new_tokens` | `histogram` | | Value of `max_new_tokens` request parameter |
| `tgi_request_input_length` | `histogram` | | Request input length in tokens |
| `tgi_request_raw_input_length` | `histogram` | | Raw request input length in tokens (including "too long" validation failures) |
| `tgi_request_mean_time_per_token_duration` | `histogram` | | Mean time per token, per request (in seconds) |
| `tgi_request_validation_duration` | `histogram` | | Request validation time (in seconds) |
| `tgi_request_queue_duration` | `histogram` | | Request time spent in queue (in seconds) |
| `tgi_request_generated_tokens` | `histogram` | | Number of tokens generated for request |
| `tgi_request_total_tokens` | `histogram` | | Total sequence length of request (input tokens + generated tokens) |
| `tgi_request_duration` | `histogram` | | End-to-end generate request duration (in seconds) |
| `tgi_request_inference_duration` | `histogram` | | Duration of inferencing portion of request (in seconds) |
| `tgi_batch_concatenation_count` | `counter` | | How many times the continuous batcher combined a new batch into the running batch |
| `tgi_batch_inference_count` | `counter` | method = "prefill" or "next_token" | Count of model forward-pass iterations |
| `tgi_batch_inference_success` | `counter` | method = "prefill" or "next_token" | Count of successful model forward-pass iterations |
| `tgi_batch_inference_failure` | `counter` | method = "prefill" or "next_token", reason = "oom", "connection", or "error" | Count of failed model forward-pass iterations |
| `tgi_batch_inference_batch_size` | `histogram` | method = "prefill" or "next_token" | Batch size for each forward-pass iteration |
| `tgi_batch_inference_duration` | `histogram` | method = "prefill" or "next_token", makeup | Time taken for each forward-pass iteration (in seconds) |
| `tgi_batch_inference_forward_duration` | `histogram` | method = "prefill" or "next_token", makeup | Time taken for each model `forward()` method invocation (in seconds) |
| `tgi_batch_inference_tokproc_duration` | `histogram` | method = "prefill" or "next_token", makeup | Rust-side token-processing time per model forward-pass iteration (in secs) |
| `tgi_batch_next_tokens` | `histogram` | | Total number of tokens included in prefill batch (including padding) |
| `tgi_batch_current_size` | `gauge` | | Current batch size |
| `tgi_batch_input_tokens` | `gauge` | | Total number of input tokens in current batch, including padding tokens |
| `tgi_batch_max_remaining_tokens` | `gauge` | | Maximum number of to-be-generated tokens of requests in current batch |
| `tgi_queue_size` | `gauge` | | Current number of queued requests |
| `tgi_queue_jump` | `counter` | | Count of queue-jumps when batch filling |
| `tgi_granular_batch_addition` | `counter` | | Count of batch additions due to granular analysis that would not otherwise fit |
| `tgi_prefill_weight_limit_exceeded` | `counter` | | Count of times the max prefill weight is reached during new batch construction |
| `tgi_prompt_load_failure` | `counter` | | Count of failed tuned soft-prompt loads |
| `tgi_prompt_load_duration` | `histogram` | | Time taken to JIT-load tuned soft-prompt in seconds (includes count of such loads) |
| `tgi_tokenize_request_count` | `counter` | | Count of tokenize requests (batch of n counts as 1) |
| `tgi_tokenize_request_input_count` | `counter` | | Count of tokenize request inputs (batch of n counts as n) |
| `tgi_tokenize_request_tokens` | `histogram` | | Count of tokenized tokens per tokenize request |
| `tgi_tokenize_request_duration` | `histogram` | | Tokenize request duration (in seconds) |
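Since the README only documents the metric names, a quick way to confirm the two metrics touched by this commit are being exported is to scrape the Prometheus endpoint mentioned above the table. This is a minimal sketch; the URL (host, port, and path) is an assumption, not something stated in this diff.

```python
# Illustrative only: fetch the Prometheus text exposition and print the metrics
# added or relabelled in this commit. Adjust METRICS_URL for your deployment.
from urllib.request import urlopen

METRICS_URL = "http://localhost:3000/metrics"  # assumed; same port as the health probe


def new_grpc_metric_lines(url: str = METRICS_URL) -> list[str]:
    """Return exposition lines for the batch concatenation and failure-reason metrics."""
    body = urlopen(url).read().decode("utf-8")
    wanted = ("tgi_batch_concatenation_count", "tgi_batch_inference_failure")
    return [line for line in body.splitlines() if line.startswith(wanted)]


if __name__ == "__main__":
    for line in new_grpc_metric_lines():
        print(line)
```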

router/client/src/lib.rs

Lines changed: 8 additions & 2 deletions

@@ -15,7 +15,7 @@ pub use pb::generate::v1::next_token_chooser_parameters::LengthPenalty;
 pub use sharded_client::ShardedClient;
 pub use client::GenerateTokenResponse;
 use thiserror::Error;
-use tonic::transport;
+use tonic::{Code, transport};
 use tonic::Status;
 
 #[derive(Error, Debug, Clone)]
@@ -24,11 +24,17 @@ pub enum ClientError {
     Connection(String),
     #[error("{0}")]
     Generation(String),
+    #[error("GPU out of memory")]
+    OutOfMemory(),
 }
 
 impl From<Status> for ClientError {
     fn from(err: Status) -> Self {
-        Self::Generation(err.message().to_string())
+        match err.code() {
+            Code::ResourceExhausted => Self::OutOfMemory(),
+            Code::Unavailable => Self::Connection(err.message().to_string()),
+            _ => Self::Generation(err.message().to_string())
+        }
     }
 }

router/src/batcher.rs

Lines changed: 7 additions & 1 deletion

@@ -436,6 +436,7 @@ async fn batching_task<B: BatchType>(
             if added_batch_size > 0 {
                 info!("Extending batch #{} of {} with additional batch #{} of {}",
                     batch_id, batch_size, new_batch_id, added_batch_size);
+                metrics::increment_counter!("tgi_batch_concatenation_count");
             }
         } else {
             combined_batch_id = new_batch_id;
@@ -616,8 +617,13 @@ impl<'a> TokenProcessor<'a> {
             Err(err) => {
                 // Update health
                 self.generation_health.store(false, Ordering::SeqCst);
+                let reason = match err {
+                    ClientError::OutOfMemory() => "oom",
+                    ClientError::Connection(_) => "connection",
+                    _ => "error"
+                };
+                metrics::increment_counter!("tgi_batch_inference_failure", "method" => method, "reason" => reason);
                 self.send_errors(err, start_id);
-                metrics::increment_counter!("tgi_batch_inference_failure", "method" => method);
                 None
             },
         }

router/src/server.rs

Lines changed: 0 additions & 2 deletions

@@ -365,7 +365,6 @@ async fn do_run<B: BatchType>(
     // Total tokens buckets
     let total_tokens_matcher = Matcher::Full(String::from("tgi_request_total_tokens"));
     // Batch size buckets
-    let batch_size_matcher = Matcher::Full(String::from("tgi_batch_next_size"));
     let batch_inference_size_matcher = Matcher::Full(String::from("tgi_batch_inference_batch_size"));
     let batch_size_buckets: Vec<f64> = (0..args.max_batch_size).map(|x| (x + 1) as f64).collect();
 
@@ -377,7 +376,6 @@ async fn do_run<B: BatchType>(
         .set_buckets_for_metric(generated_tokens_matcher, &max_new_tokens_buckets).unwrap()
         .set_buckets_for_metric(max_new_tokens_matcher, &max_new_tokens_buckets).unwrap()
         .set_buckets_for_metric(total_tokens_matcher, &max_sequence_length_buckets).unwrap()
-        .set_buckets_for_metric(batch_size_matcher, &batch_size_buckets).unwrap()
         .set_buckets_for_metric(batch_inference_size_matcher, &batch_size_buckets).unwrap();
     let prom_handle = builder
         .install_recorder()
