diff --git a/src/llmperf/ray_clients/openai_chat_completions_client.py b/src/llmperf/ray_clients/openai_chat_completions_client.py
index f2e0a91..afd934b 100644
--- a/src/llmperf/ray_clients/openai_chat_completions_client.py
+++ b/src/llmperf/ray_clients/openai_chat_completions_client.py
@@ -109,7 +109,7 @@ def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]:
             print(f"Warning Or Error: {e}")
             print(error_response_code)
 
-        metrics[common_metrics.INTER_TOKEN_LAT] = sum(time_to_next_token) #This should be same as metrics[common_metrics.E2E_LAT]. Leave it here for now
+        metrics[common_metrics.INTER_TOKEN_LAT] = sum(time_to_next_token) - ttft
         metrics[common_metrics.TTFT] = ttft
         metrics[common_metrics.E2E_LAT] = total_request_time
         metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput
diff --git a/token_benchmark_ray.py b/token_benchmark_ray.py
index 63216b1..e3528d3 100644
--- a/token_benchmark_ray.py
+++ b/token_benchmark_ray.py
@@ -117,8 +117,8 @@ def launch_request(thread_index):
             num_output_tokens = get_token_length(gen_text)
             with completed_requests_lock:
                 if num_completed_requests < max_num_completed_requests:
-                    if num_output_tokens:
-                        request_metrics[common_metrics.INTER_TOKEN_LAT] /= request_metrics[common_metrics.NUM_OUTPUT_TOKENS]
+                    if num_output_tokens > 1:
+                        request_metrics[common_metrics.INTER_TOKEN_LAT] /= (num_output_tokens - 1)
                     else:
                         request_metrics[common_metrics.INTER_TOKEN_LAT] = 0
                     request_metrics[common_metrics.NUM_OUTPUT_TOKENS] = num_output_tokens
@@ -154,8 +154,8 @@ def launch_request(thread_index):
             num_output_tokens = get_token_length(gen_text)
             with completed_requests_lock:
                 if num_completed_requests < max_num_completed_requests:
-                    if num_output_tokens:
-                        request_metrics[common_metrics.INTER_TOKEN_LAT] /= num_output_tokens
+                    if num_output_tokens > 1:
+                        request_metrics[common_metrics.INTER_TOKEN_LAT] /= (num_output_tokens - 1)
                     else:
                         request_metrics[common_metrics.INTER_TOKEN_LAT] = 0
                     request_metrics[common_metrics.NUM_OUTPUT_TOKENS] = num_output_tokens
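
The patch changes the inter-token latency (ITL) so that the time-to-first-token is excluded from the sum and the average is taken over the gaps between tokens (n - 1 for n tokens) rather than over the token count. Below is a minimal standalone sketch of that corrected calculation; the function name and the sample delta values are hypothetical, only the variable names (time_to_next_token, ttft, num_output_tokens) come from the diff.

# Sketch of the corrected ITL calculation implied by this diff.
def inter_token_latency(time_to_next_token, ttft, num_output_tokens):
    """Mean delay between consecutive output tokens, excluding the first token.

    time_to_next_token: per-chunk wall-clock deltas collected while streaming;
                        the first entry is the time-to-first-token (TTFT).
    ttft:               time to first token, in seconds.
    num_output_tokens:  tokens actually generated for the request.
    """
    # Drop the TTFT component so only the generation (decode) phase remains.
    decode_time = sum(time_to_next_token) - ttft
    # n tokens have n - 1 gaps between them; with <= 1 token there is no gap.
    if num_output_tokens > 1:
        return decode_time / (num_output_tokens - 1)
    return 0.0


if __name__ == "__main__":
    # Hypothetical streamed deltas: 0.30 s TTFT, then roughly 0.02 s per token.
    deltas = [0.30, 0.02, 0.021, 0.019, 0.02]
    print(inter_token_latency(deltas, ttft=0.30, num_output_tokens=5))  # ~0.02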