@@ -328,8 +328,11 @@ pub struct ResponseMetricCollector {
328328 osl : usize ,
329329 // we track if cached_tokens has been observed to ensure we only increment once per request
330330 cached_tokens_observed : bool ,
331- // we track if tokenizer latency has been observed to ensure we only increment once per request
332- tokenizer_latency_observed : bool ,
331+ // we track if tokenize latency has been observed to ensure we only increment once per request
332+ tokenize_latency_observed : bool ,
333+ // latest accumulated detokenize latency and sample count reported by tracker
334+ detokenize_latency_total : Duration ,
335+ detokenize_count_total : u64 ,
333336 // Prefill worker info for TTFT attribution (set from LLMMetricAnnotation)
334337 prefill_worker_id : Option < u64 > ,
335338 prefill_dp_rank : Option < u32 > ,
@@ -987,7 +990,9 @@ impl ResponseMetricCollector {
987990 start_time : Instant :: now ( ) ,
988991 osl : 0 ,
989992 cached_tokens_observed : false ,
990- tokenizer_latency_observed : false ,
993+ tokenize_latency_observed : false ,
994+ detokenize_latency_total : Duration :: ZERO ,
995+ detokenize_count_total : 0 ,
991996 prefill_worker_id : None ,
992997 prefill_dp_rank : None ,
993998 prefill_worker_type : None ,
@@ -1052,17 +1057,30 @@ impl ResponseMetricCollector {
10521057 }
10531058 }
10541059
1055- /// Observe tokenizer latency in milliseconds, once per request.
1056- pub fn observe_tokenizer_latency ( & mut self , tokenizer_latency : Option < Duration > ) {
1057- if let Some ( latency) = tokenizer_latency
1058- && !self . tokenizer_latency_observed
1060+ /// Observe the tokenize latency (in milliseconds) and record the latest detokenize totals.
1061+ /// Tokenize is observed at most once per request; the detokenize total latency and sample count are overwritten with the latest reported values (not summed here), and their average is observed when the collector is dropped.
1062+ pub fn observe_tokenize_latencies (
1063+ & mut self ,
1064+ tokenize_latency : Option < Duration > ,
1065+ detokenize_latency : Option < Duration > ,
1066+ detokenize_count : Option < u64 > ,
1067+ ) {
1068+ if let Some ( latency) = tokenize_latency
1069+ && !self . tokenize_latency_observed
10591070 {
1060- self . tokenizer_latency_observed = true ;
1071+ self . tokenize_latency_observed = true ;
10611072 self . metrics
10621073 . tokenizer_latency
10631074 . with_label_values ( & [ frontend_service:: operation:: TOKENIZE ] )
10641075 . observe ( latency. as_secs_f64 ( ) * 1000.0 ) ;
10651076 }
1077+
1078+ if let Some ( latency) = detokenize_latency {
1079+ self . detokenize_latency_total = latency;
1080+ }
1081+ if let Some ( count) = detokenize_count {
1082+ self . detokenize_count_total = count;
1083+ }
10661084 }
10671085
10681086 /// Observe a response with input sequence length and number of new tokens
@@ -1155,6 +1173,15 @@ impl ResponseMetricCollector {
11551173
11561174impl Drop for ResponseMetricCollector {
11571175 fn drop ( & mut self ) {
1176+ if !self . detokenize_latency_total . is_zero ( ) && self . detokenize_count_total > 0 {
1177+ let avg_detokenize_latency_ms = ( self . detokenize_latency_total . as_secs_f64 ( ) * 1000.0 )
1178+ / self . detokenize_count_total as f64 ;
1179+ self . metrics
1180+ . tokenizer_latency
1181+ . with_label_values ( & [ frontend_service:: operation:: DETOKENIZE ] )
1182+ . observe ( avg_detokenize_latency_ms) ;
1183+ }
1184+
11581185 // Publish final OSL when the collector is dropped
11591186 self . metrics
11601187 . output_sequence_length
@@ -1179,7 +1206,11 @@ pub fn process_response_and_observe_metrics<T>(
11791206 if let Ok ( Some ( metrics) ) = LLMMetricAnnotation :: from_annotation ( annotated) {
11801207 response_collector. observe_current_osl ( metrics. output_tokens ) ;
11811208 response_collector. observe_cached_tokens ( metrics. cached_tokens ) ;
1182- response_collector. observe_tokenizer_latency ( metrics. tokenizer_latency ) ;
1209+ response_collector. observe_tokenize_latencies (
1210+ metrics. tokenize_latency ,
1211+ metrics. detokenize_total_latency ,
1212+ metrics. detokenize_count ,
1213+ ) ;
11831214 response_collector. set_worker_info (
11841215 metrics. prefill_worker_id ,
11851216 metrics. prefill_dp_rank ,
@@ -1229,7 +1260,11 @@ pub fn process_response_using_event_converter_and_observe_metrics<T: Serialize>(
12291260 if let Ok ( Some ( metrics) ) = LLMMetricAnnotation :: from_annotation ( & annotated) {
12301261 response_collector. observe_current_osl ( metrics. output_tokens ) ;
12311262 response_collector. observe_cached_tokens ( metrics. cached_tokens ) ;
1232- response_collector. observe_tokenizer_latency ( metrics. tokenizer_latency ) ;
1263+ response_collector. observe_tokenize_latencies (
1264+ metrics. tokenize_latency ,
1265+ metrics. detokenize_total_latency ,
1266+ metrics. detokenize_count ,
1267+ ) ;
12331268 response_collector. set_worker_info (
12341269 metrics. prefill_worker_id ,
12351270 metrics. prefill_dp_rank ,
@@ -1735,7 +1770,9 @@ mod tests {
17351770 decode_worker_id : None ,
17361771 decode_dp_rank : None ,
17371772 decode_worker_type : None ,
1738- tokenizer_latency : Some ( Duration :: from_millis ( 8 ) ) ,
1773+ tokenize_latency : Some ( Duration :: from_millis ( 8 ) ) ,
1774+ detokenize_total_latency : Some ( Duration :: from_micros ( 100 ) ) ,
1775+ detokenize_count : Some ( 2 ) ,
17391776 } ;
17401777
17411778 let annotation = llm_metrics. to_annotation :: < ( ) > ( ) . unwrap ( ) ;
@@ -1753,6 +1790,9 @@ mod tests {
17531790 // Should return Ok(None) for metrics annotation events
17541791 assert ! ( matches!( result, Ok ( None ) ) ) ;
17551792
1793+ // Drop collector so the detokenize observation fires in Drop
1794+ drop ( collector) ;
1795+
17561796 // Should have observed the cached tokens from the metrics annotation event
17571797 let metric_families = registry. gather ( ) ;
17581798 let histogram_family = metric_families
@@ -1770,11 +1810,31 @@ mod tests {
17701810 . iter ( )
17711811 . find ( |mf| mf. name ( ) == expected_tokenizer_metric_name)
17721812 . expect ( "histogram should be registered" ) ;
1773- assert_eq ! (
1774- histogram_family. get_metric( ) [ 0 ]
1775- . get_histogram( )
1776- . get_sample_count( ) ,
1777- 1
1813+
1814+ // Find the tokenize and detokenize observations by label
1815+ let tokenize_metric = histogram_family
1816+ . get_metric ( )
1817+ . iter ( )
1818+ . find ( |m| m. get_label ( ) . iter ( ) . any ( |l| l. value ( ) == "tokenize" ) )
1819+ . expect ( "tokenize metric should exist" ) ;
1820+ assert_eq ! ( tokenize_metric. get_histogram( ) . get_sample_count( ) , 1 ) ;
1821+ // 8ms
1822+ assert ! (
1823+ ( tokenize_metric. get_histogram( ) . get_sample_sum( ) - 8.0 ) . abs( ) < 0.001 ,
1824+ "tokenize latency should be 8.0ms"
1825+ ) ;
1826+
1827+ let detokenize_metric = histogram_family
1828+ . get_metric ( )
1829+ . iter ( )
1830+ . find ( |m| m. get_label ( ) . iter ( ) . any ( |l| l. value ( ) == "detokenize" ) )
1831+ . expect ( "detokenize metric should exist" ) ;
1832+ assert_eq ! ( detokenize_metric. get_histogram( ) . get_sample_count( ) , 1 ) ;
1833+ // Average: 100us total / 2 samples = 50us = 0.05ms
1834+ assert ! (
1835+ ( detokenize_metric. get_histogram( ) . get_sample_sum( ) - 0.05 ) . abs( ) < 0.001 ,
1836+ "detokenize average latency should be 0.05ms, got {}" ,
1837+ detokenize_metric. get_histogram( ) . get_sample_sum( )
17781838 ) ;
17791839 }
17801840
@@ -1813,7 +1873,9 @@ mod tests {
18131873 decode_worker_id : None ,
18141874 decode_dp_rank : None ,
18151875 decode_worker_type : None ,
1816- tokenizer_latency : Some ( Duration :: from_millis ( 8 ) ) ,
1876+ tokenize_latency : Some ( Duration :: from_millis ( 8 ) ) ,
1877+ detokenize_total_latency : Some ( Duration :: from_micros ( 100 ) ) ,
1878+ detokenize_count : Some ( 2 ) ,
18171879 } ;
18181880
18191881 let annotation = llm_metrics. to_annotation :: < ( ) > ( ) . unwrap ( ) ;
@@ -1824,6 +1886,9 @@ mod tests {
18241886 let mut http_queue_guard = None ;
18251887 process_response_and_observe_metrics ( & annotated, & mut collector, & mut http_queue_guard) ;
18261888
1889+ // Drop collector so the detokenize observation fires in Drop
1890+ drop ( collector) ;
1891+
18271892 // Should have observed the cached tokens from the metrics annotation event
18281893 let metric_families = registry. gather ( ) ;
18291894 let histogram_family = metric_families
@@ -1841,11 +1906,26 @@ mod tests {
18411906 . iter ( )
18421907 . find ( |mf| mf. name ( ) == expected_tokenizer_metric_name)
18431908 . expect ( "histogram should be registered" ) ;
1844- assert_eq ! (
1845- histogram_family. get_metric( ) [ 0 ]
1846- . get_histogram( )
1847- . get_sample_count( ) ,
1848- 1
1909+
1910+ // Find the tokenize and detokenize observations by label
1911+ let tokenize_metric = histogram_family
1912+ . get_metric ( )
1913+ . iter ( )
1914+ . find ( |m| m. get_label ( ) . iter ( ) . any ( |l| l. value ( ) == "tokenize" ) )
1915+ . expect ( "tokenize metric should exist" ) ;
1916+ assert_eq ! ( tokenize_metric. get_histogram( ) . get_sample_count( ) , 1 ) ;
1917+
1918+ let detokenize_metric = histogram_family
1919+ . get_metric ( )
1920+ . iter ( )
1921+ . find ( |m| m. get_label ( ) . iter ( ) . any ( |l| l. value ( ) == "detokenize" ) )
1922+ . expect ( "detokenize metric should exist" ) ;
1923+ assert_eq ! ( detokenize_metric. get_histogram( ) . get_sample_count( ) , 1 ) ;
1924+ // Average: 100us total / 2 samples = 50us = 0.05ms
1925+ assert ! (
1926+ ( detokenize_metric. get_histogram( ) . get_sample_sum( ) - 0.05 ) . abs( ) < 0.001 ,
1927+ "detokenize average latency should be 0.05ms, got {}" ,
1928+ detokenize_metric. get_histogram( ) . get_sample_sum( )
18491929 ) ;
18501930 }
18511931}
0 commit comments