forked from NVIDIA/TensorRT-LLM
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstatistics.py
More file actions
212 lines (171 loc) · 6.83 KB
/
statistics.py
File metadata and controls
212 lines (171 loc) · 6.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
from __future__ import annotations
from typing import Any, List, Optional
from pydantic import BaseModel, computed_field
class RequestRecord(BaseModel):
id: int = -1
num_input_tokens: int = -1
tokens: List[int] = []
error_tokens: int = 0
start_timestamp: int = -1
first_token_timestamp: int = -1
end_timestamp: int = -1
decode_iteration: int = 0
def register_event(self,
is_error: bool,
is_final: bool,
timestamp: int,
decoding_iter: int,
tokens: List[int],
first_token_timestamp: int = None) -> None:
if is_final:
self.end_timestamp = timestamp
elif self.first_token_timestamp == -1:
self.first_token_timestamp = timestamp
if first_token_timestamp is not None and is_final:
self.first_token_timestamp = first_token_timestamp
if is_error:
self.error_tokens += 1
self.tokens += tokens
self.decode_iteration = decoding_iter
@computed_field
def num_total_output_tokens(self) -> int:
"""
Returns the total number of output tokens generated by the request.
"""
return len(self.tokens)
@computed_field
def num_generated_tokens(self) -> int:
"""
Returns the number of generated (OSL - 1) tokens by the request.
"""
return self.num_total_output_tokens - 1
@computed_field
def generation_time(self) -> int:
"""
Returns the generation time of the request (E2E Latency - TTFT).
"""
return self.end_to_end_latency - self.time_to_first_token
@computed_field
def time_to_first_token(self) -> int:
return (self.first_token_timestamp -
self.start_timestamp if self.first_token_timestamp > 0 else 0.0)
@computed_field
def intertoken_latency(self) -> float:
"""
Returns the time-per-output-token latency of the request [(OSL - 1) / (E2E Latency - TTFT)].
"""
return ((self.end_timestamp - self.first_token_timestamp) /
self.num_generated_tokens
if self.num_generated_tokens > 0 else 0.0)
@computed_field
def end_to_end_latency(self) -> int:
"""
Returns the end-to-end latency of the request (end time - start time).
"""
return self.end_timestamp - self.start_timestamp
@computed_field
def output_token_throughput(self) -> float:
"""
Returns the total token throughput of the request (Total output tokens / E2E Latency).
"""
return float(self.num_total_output_tokens) / self.end_to_end_latency
@computed_field
def generation_token_throughput(self) -> float:
return (self.num_generated_tokens / self.generation_time)
class PercentileStats(BaseModel):
p50: float
p90: float
p95: float
p99: float
minimum: float
maximum: float
average: float
@classmethod
def from_iterable(cls, values: List[Any]) -> PercentileStats:
length = len(values)
sorted_values = sorted(values)
return cls(
p50=sorted_values[int(length * 0.50)],
p90=sorted_values[int(length * 0.90)],
p95=sorted_values[int(length * 0.95)],
p99=sorted_values[int(length * 0.99)],
average=float(sum(values)) / length,
minimum=min(values),
maximum=max(values),
)
class BenchmarkStatistics(BaseModel):
# Time-related Properties
total_latency_ns: float
# Token-related Properties
total_output_tokens: int
total_input_tokens: int
# General Information
num_requests: int
issue_rate_ns: float
# Speculative Information
acceptance_length: float
# Energy Monitoring
total_energy: Optional[float] = None
# Percentile-related Statistics
request_latency_percentiles: Optional[PercentileStats] = None
output_throughput_percentiles: Optional[PercentileStats] = None
token_percentiles: Optional[PercentileStats] = None
tpot_percentiles: Optional[PercentileStats] = None
ttft_percentiles: Optional[PercentileStats] = None
generation_tp_percentiles: Optional[PercentileStats] = None
generation_latency_percentiles: Optional[PercentileStats] = None
# Percentile-related Speculative Statistics
num_draft_tokens_percentiles: Optional[PercentileStats] = None
num_accepted_draft_tokens_percentiles: Optional[PercentileStats] = None
draft_acceptance_rate_percentiles: Optional[PercentileStats] = None
acceptance_length_percentiles: Optional[PercentileStats] = None
@computed_field
def sum_per_request_latencies_ns(self) -> float:
return self.request_latency_percentiles.average * self.num_requests
@computed_field
def avg_concurrent_requests(self) -> int:
return self.sum_per_request_latencies_ns / self.total_latency_ns
@computed_field
def generation_tokens(self) -> int:
return int(self.total_output_tokens - self.num_requests)
@computed_field
def total_generation_time_ns(self) -> float:
return self.generation_latency_percentiles.average * self.num_requests
@computed_field
def per_user_time_per_output_token_ns(self) -> float:
return self.tpot_percentiles.average
@computed_field
def per_user_time_to_first_token_ns(self) -> float:
return self.ttft_percentiles.average
@computed_field
def per_user_generation_token_throughput_ns(self) -> float:
return self.generation_tp_percentiles.average
@computed_field
def request_throughput_ns(self) -> float:
return float(self.num_requests) / self.total_latency_ns
@computed_field
def average_input_length(self) -> float:
return float(self.total_input_tokens) / self.num_requests
@computed_field
def average_output_length(self) -> float:
return float(self.total_output_tokens) / self.num_requests
@computed_field
def output_throughput_tok_ns(self) -> float:
return float(self.total_output_tokens) / self.total_latency_ns
@computed_field
def total_token_throughput_tok_ns(self) -> float:
return float(self.total_input_tokens +
self.total_output_tokens) / self.total_latency_ns
@computed_field
def output_throughput_tok_ns_per_user(self) -> float:
return self.output_throughput_percentiles.average
@computed_field
def output_tps_per_w(self) -> Optional[float]:
if not self.total_energy:
return None
return float(self.total_output_tokens / self.total_energy)
@computed_field
def total_gpu_power(self) -> Optional[float]:
if not self.total_energy or not self.total_latency_ns:
return None
return float((self.total_energy * 1e9) / self.total_latency_ns)