@@ -27,7 +27,7 @@
 )
 from vllm.attention.layer import Attention, MLAAttention
 from vllm.compilation.counter import compilation_counter
-from vllm.compilation.cuda_graph import CUDAGraphWrapper
+from vllm.compilation.cuda_graph import CUDAGraphStat, CUDAGraphWrapper
 from vllm.compilation.monitor import set_cudagraph_capturing_enabled
 from vllm.config import (
     CompilationMode,
@@ -257,6 +257,7 @@ class ExecuteModelState(NamedTuple):
     sample_hidden_states: torch.Tensor
     aux_hidden_states: list[torch.Tensor] | None
     ec_connector_output: ECConnectorOutput | None
+    cudagraph_stats: CUDAGraphStat | None


 class GPUModelRunner(
@@ -2755,7 +2756,11 @@ def _determine_batch_execution_and_padding(
         force_uniform_decode: bool | None = None,
         force_has_lora: bool | None = None,
     ) -> tuple[
-        CUDAGraphMode, BatchDescriptor, UBatchSlices | None, torch.Tensor | None
+        CUDAGraphMode,
+        BatchDescriptor,
+        UBatchSlices | None,
+        torch.Tensor | None,
+        CUDAGraphStat | None,
     ]:
         num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens)
         uniform_decode = (
@@ -2820,7 +2825,22 @@ def _determine_batch_execution_and_padding(
         # num_tokens_across_dp will no-longer be valid
         assert batch_descriptor.num_tokens == num_tokens_padded

-        return cudagraph_mode, batch_descriptor, ubatch_slices, num_tokens_across_dp
+        cudagraph_stats = None
+        if self.vllm_config.observability_config.cudagraph_metrics:
+            cudagraph_stats = CUDAGraphStat(
+                num_unpadded_tokens=num_tokens,
+                num_padded_tokens=batch_descriptor.num_tokens,
+                num_paddings=batch_descriptor.num_tokens - num_tokens,
+                runtime_mode=str(cudagraph_mode),
+            )
+
+        return (
+            cudagraph_mode,
+            batch_descriptor,
+            ubatch_slices,
+            num_tokens_across_dp,
+            cudagraph_stats,
+        )

     @torch.inference_mode()
     def execute_model(
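
CUDAGraphStat is imported from vllm.compilation.cuda_graph above, but its definition is outside this diff. Judging purely from the constructor call in this hunk, it is presumably a small record type along these lines (a hypothetical sketch, not the actual definition):

    # Sketch inferred from the call site above; the real definition lives in
    # vllm/compilation/cuda_graph.py and may differ in form and defaults.
    from dataclasses import dataclass

    @dataclass
    class CUDAGraphStat:
        num_unpadded_tokens: int  # tokens actually scheduled this step
        num_padded_tokens: int    # tokens after padding to a captured graph size
        num_paddings: int         # padded minus unpadded, i.e. wasted token slots
        runtime_mode: str         # stringified CUDAGraphMode used for this step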
@@ -2918,6 +2938,7 @@ def execute_model(
             batch_desc,
             ubatch_slices,
             num_tokens_across_dp,
+            cudagraph_stats,
         ) = self._determine_batch_execution_and_padding(
             num_tokens=num_tokens_unpadded,
             num_reqs=num_reqs,
@@ -3067,6 +3088,7 @@ def execute_model(
                 sample_hidden_states,
                 aux_hidden_states,
                 ec_connector_output,
+                cudagraph_stats,
             )
             self.kv_connector_output = kv_connector_output
             return None
@@ -3102,6 +3124,7 @@ def sample_tokens(
             sample_hidden_states,
             aux_hidden_states,
             ec_connector_output,
+            cudagraph_stats,
         ) = self.execute_model_state
         # Clear ephemeral state.
         self.execute_model_state = None
@@ -3217,6 +3240,7 @@ def propose_draft_token_ids(sampled_token_ids):
             if self.supports_mm_inputs
             else None,
             num_nans_in_logits=num_nans_in_logits,
+            cudagraph_stats=cudagraph_stats,
         )

         if not self.use_async_scheduling:
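
With the stats attached to the runner output here (keyword arguments such as num_nans_in_logits suggest this is the ModelRunnerOutput construction, though the enclosing call is not shown in the hunk), a consumer can aggregate padding overhead across steps. A hypothetical consumer-side sketch, not part of this diff:

    # Hypothetical helper: fraction of token slots wasted on cudagraph
    # padding, computed from the per-step stats introduced by this diff.
    def padding_overhead(stats: list["CUDAGraphStat"]) -> float:
        padded = sum(s.num_padded_tokens for s in stats)
        unpadded = sum(s.num_unpadded_tokens for s in stats)
        return (padded - unpadded) / padded if padded else 0.0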
@@ -3937,7 +3961,7 @@ def _dummy_run(

         num_sampled_tokens = np.ones(num_reqs, dtype=np.int32)

-        _cudagraph_mode, batch_desc, ubatch_slices, num_tokens_across_dp = (
+        _cudagraph_mode, batch_desc, ubatch_slices, num_tokens_across_dp, _ = (
             self._determine_batch_execution_and_padding(
                 num_tokens=num_tokens_unpadded,
                 num_reqs=num_reqs,
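
The stats are gated on observability_config.cudagraph_metrics, so the default path only carries a None, and _dummy_run discards its slot with the trailing underscore above. A hedged sketch of flipping the gate (only the attribute access itself is attested by this diff; how the flag is surfaced on the engine or CLI is not shown):

    # Hypothetical: enable per-step cudagraph padding stats on an existing
    # vllm_config. The attribute name matches the gate in
    # _determine_batch_execution_and_padding.
    vllm_config.observability_config.cudagraph_metrics = True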