Skip to content

Commit 11764c6

Browse files
[perf] feat: Add MFU for Qwen3-VL dense (verl-project#4753)
### What does this PR do? Add the _estimate_qwen3_vit_flop and _estimate_qwen3_vl_flops function to calculate the FLOPs of Qwen3-VL dense models. Update the test cases to verify the calculation accuracy of Qwen3-VL models. ### Checklist Before Starting - [ ] Search for similar PRs. Paste at least one query link here: ... - [ ] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI) - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`, `cfg`, `reward` - If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]` - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test` - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title. - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching` ### Test The following is the output result of running the test file. <img width="1271" height="152" alt="image" src="https://github.com/user-attachments/assets/2a3d426c-bd32-4369-9c07-c8a17c60e98b" /> > For changes that can not be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results like training curve plots, evaluation results, etc. ### API and Usage Example > Demonstrate how the API changes if any, and provide usage example(s) if possible. ```python # Add code snippet or script demonstrating how to use this ``` ### Design & Code Changes > Demonstrate the high-level design if this PR is complex, and list the specific changes. ### Checklist Before Submitting > [!IMPORTANT] > Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review. - [ ] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md). 
- [ ] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always` - [ ] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs). - [ ] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ... - [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). (If not accessible, please try [the Feishu group (飞书群)](https://applink.larkoffice.com/client/chat/chatter/add_by_link?link_token=772jd4f1-cd91-441e-a820-498c6614126a).)
1 parent 8f41b05 commit 11764c6

File tree

6 files changed

+211
-22
lines changed

6 files changed

+211
-22
lines changed

tests/utils/test_flops_counter.py

Lines changed: 91 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,16 @@
1616

1717
import pytest
1818

19-
from verl.utils.flops_counter import _DEVICE_FLOPS, FlopsCounter, get_device_flops
19+
from verl.utils.flops_counter import FlopsCounter
2020

2121
VALID_CONFIG_TYPE = {"llama", "qwen2", "qwen3", "qwen3_moe", "deepseek_v3", "mistral", "gemma3_text", "apertus"}
2222

2323

2424
class Config:
    """Attribute-access wrapper around a (possibly nested) config dict.

    Nested dicts (e.g. ``text_config`` / ``vision_config``) are recursively
    promoted to ``Config`` instances so tests can use dotted access just like
    a HuggingFace ``PretrainedConfig``.
    """

    def __init__(self, config_dict):
        for name, entry in config_dict.items():
            setattr(self, name, Config(entry) if isinstance(entry, dict) else entry)
2830

2931

@@ -300,28 +302,101 @@ def __init__(self, config_dict):
300302
# S*(2*V*H + L*(4*H**2 + k_mlp*H*I + k_qkn*H)) * (SUM[seqlen]) + 12*SUM[seqlen**2]*L*H
301303
"expected_flops_tuple": (199154680725504 / 1e12, 732294071451648 / 1e12),
302304
},
305+
"qwen3_vl": {
306+
"config": { # Qwen/Qwen3-VL-8B
307+
"model_type": "qwen3_vl",
308+
# -------- Text config --------
309+
"text_config": {
310+
"vocab_size": 151936,
311+
"hidden_size": 4096,
312+
"intermediate_size": 12288,
313+
"num_hidden_layers": 36,
314+
"num_attention_heads": 32,
315+
"num_key_value_heads": 8,
316+
"head_dim": 128,
317+
},
318+
# -------- Vision config (ViT) --------
319+
"vision_config": {
320+
"deepstack_visual_indexes": [8, 16, 24],
321+
"num_heads": 16,
322+
"depth": 27,
323+
"hidden_size": 1152,
324+
"intermediate_size": 4304,
325+
"out_hidden_size": 4096,
326+
"spatial_merge_size": 2,
327+
"temporal_patch_size": 2,
328+
"in_channels": 3,
329+
"patch_size": 16,
330+
},
331+
},
332+
"batch_seqlens_tuple": (
333+
[512, 1024, 2048],
334+
[4096, 4096, 4096],
335+
),
336+
"images_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
337+
# -----Text-----
338+
# 6*(vocab*hidden*2
339+
# + layer*(hidden*(q+k+v+o) + hidden*inter*3)
340+
# )*token_sum
341+
# + 12*sum(seqlen^2)*layer*hidden
342+
#
343+
# -----ViT-----
344+
# patch_embed_N =hidden*temporal_patch_size*in_channels* patch_size^2
345+
# attn_linear_N =hidden*(4*hidden)
346+
# mlp_N =hidden*inter*2
347+
# merger_N =((o+hidden*spatial_merge_size^2) * (hidden*spatial_merge_size^2))
348+
# deepstack_merger_N =merger_N * 3
349+
# dense_N =patch_embed_N + (attn_linear_N + mlp_N) * 27 + deepstack_merger_N + merger_N
350+
#
351+
# 6*(151936*4096*2
352+
# + 36*(4096*(4096+1024+1024+4096) + 4096*12288*3)
353+
# )*(512+1024+2048)
354+
# + 12*(512*512+1024*1024+2048*2048)*36*4096
355+
# + 6 * dense_N * (512 + 1024 + 2048)
356+
# + 12 * (512**2 + 1024**2 + 2048**2) * 27 * 16 * 72
357+
#
358+
# 6*(151936*4096*2
359+
# + 36*(4096*(4096+1024+1024+4096) + 4096*12288*3)
360+
# )*(4096+4096+4096)
361+
# + 12*(4096*4096+4096*4096+4096*4096)*36*4096
362+
# + 6 * dense_N * (4096 + 4096 + 4096)
363+
# + 12 * (4096**2 + 4096**2 + 4096**2) * 27 * 16 * 72
364+
"expected_flops_tuple": (
365+
200250312622080 / 1e12,
366+
753976643420160 / 1e12,
367+
),
368+
},
303369
}
304370

305371

306372
@pytest.mark.parametrize(
    "config_type",
    ["llama", "qwen2", "qwen3", "qwen3_moe", "deepseek_v3", "mistral", "gemma3_text", "apertus", "gpt_oss", "qwen3_vl"],
)
def test_flops_counter(config_type: str):
    """Check FlopsCounter against hand-derived FLOPs for each model config.

    VL configs additionally carry ``images_seqlens_tuple`` and exercise the
    ``images_seqlens`` keyword of ``estimate_flops``; text-only configs call
    it without that keyword, exactly as production callers do.
    """
    test_config = CONFIG[config_type]
    flops_counter = FlopsCounter(Config(test_config["config"]))

    has_images = "images_seqlens_tuple" in test_config
    # Pad with None so both variants share one loop; None means "text-only call".
    images_tuple = (
        test_config["images_seqlens_tuple"] if has_images else (None,) * len(test_config["expected_flops_tuple"])
    )
    for batch_seqlens, images_seqlens, expected_flops in zip(
        test_config["batch_seqlens_tuple"], images_tuple, test_config["expected_flops_tuple"], strict=True
    ):
        # set delta time to 1 to get the flops
        if images_seqlens is None:
            counted_flops, _ = flops_counter.estimate_flops(batch_seqlens, 1)
        else:
            counted_flops, _ = flops_counter.estimate_flops(batch_seqlens, 1, images_seqlens=images_seqlens)
        message = f"Expect flops for {test_config['config']} is {expected_flops}, but get {counted_flops}"
        print(message)
        assert math.isclose(counted_flops, expected_flops), message

verl/experimental/agent_loop/agent_loop.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -660,6 +660,10 @@ def _compute_multi_modal_inputs(self, output, input_ids) -> dict[str, torch.Tens
660660
# We must use dict(multi_modal_inputs) to convert BatchFeature values to a new dict
661661
# because np.array() only keeps the keys for BatchFeature.
662662
multi_modal_inputs = dict(multi_modal_inputs.convert_to_tensors("pt"))
663+
image_grid_thw = multi_modal_inputs.get("image_grid_thw")
664+
if image_grid_thw is not None:
665+
images_seqlens = torch.repeat_interleave(image_grid_thw[:, 1] * image_grid_thw[:, 2], image_grid_thw[:, 0])
666+
multi_modal_inputs["images_seqlens"] = images_seqlens
663667
return multi_modal_inputs
664668

665669
def _compute_position_ids(self, input_ids, attention_mask, multi_modal_inputs) -> torch.Tensor:

verl/trainer/ppo/ray_trainer.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1449,7 +1449,13 @@ def fit(self):
14491449

14501450
# compute global_valid tokens
14511451
batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
1452-
1452+
# get images_seqlens
1453+
images_seqlens_all = []
1454+
for multi_modal_input in batch.non_tensor_batch["multi_modal_inputs"]:
1455+
if "image_grid_thw" not in multi_modal_input.keys():
1456+
continue
1457+
images_seqlens_all.extend(multi_modal_input["images_seqlens"].tolist())
1458+
batch.meta_info["images_seqlens"] = images_seqlens_all
14531459
with marked_timer("reward", timing_raw, color="yellow"):
14541460
# compute reward model score
14551461
if self.use_rm and "rm_scores" not in batch.batch.keys():

verl/utils/flops_counter.py

Lines changed: 101 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,99 @@ def _estimate_qwen2_flops(config, tokens_sum, batch_seqlens, delta_time):
118118
return flops_achieved
119119

120120

121+
def _estimate_qwen3_vl_flops(config, tokens_sum, batch_seqlens, delta_time, **kargs):
122+
# qwen3_vl uses text_config and vision_config to distinguish configs of different parts.
123+
hidden_size = config.text_config.hidden_size
124+
vocab_size = config.text_config.vocab_size
125+
num_hidden_layers = config.text_config.num_hidden_layers
126+
num_key_value_heads = config.text_config.num_key_value_heads
127+
num_attention_heads = config.text_config.num_attention_heads
128+
intermediate_size = config.text_config.intermediate_size
129+
130+
head_dim = hidden_size // num_attention_heads
131+
q_size = num_attention_heads * head_dim
132+
k_size = num_key_value_heads * head_dim
133+
v_size = num_key_value_heads * head_dim
134+
135+
# non-attn per layer parm
136+
mlp_N = hidden_size * intermediate_size * 3
137+
attn_linear_N = hidden_size * (q_size + k_size + v_size + num_attention_heads * head_dim)
138+
emd_and_lm_head_N = vocab_size * hidden_size * 2
139+
# non-attn all_layer parm
140+
dense_N = (mlp_N + attn_linear_N) * num_hidden_layers + emd_and_lm_head_N
141+
# non-attn all_layer & all_token fwd & bwd flops
142+
dense_N_flops = 6 * dense_N * tokens_sum
143+
144+
# qwen3_vl uses deepstack to merge visual embeds and text embeds, but it has no tensor operation.
145+
146+
# attn all_layer & all_token fwd & bwd flops
147+
seqlen_square_sum = 0
148+
for seqlen in batch_seqlens:
149+
seqlen_square_sum += seqlen * seqlen
150+
attn_qkv_flops = 12 * seqlen_square_sum * head_dim * num_attention_heads * num_hidden_layers
151+
152+
# vit flops
153+
images_seqlens = kargs.get("images_seqlens", None)
154+
if images_seqlens is not None:
155+
vit_flops = _estimate_qwen3_vit_flop(images_seqlens, config.vision_config)
156+
else:
157+
vit_flops = 0
158+
159+
# all_layer & all_token fwd & bwd flops
160+
flops_all_token = dense_N_flops + attn_qkv_flops + vit_flops
161+
flops_achieved = flops_all_token * (1.0 / delta_time) / 1e12
162+
return flops_achieved
163+
164+
165+
def _estimate_qwen3_vit_flop(images_seqlens, config):
166+
"""
167+
Estimate the FLOPS of the vision encoder for Qwen3-VL
168+
"""
169+
170+
if config is None:
171+
return 0
172+
tokens_sum = sum(images_seqlens)
173+
174+
num_heads = config.num_heads
175+
depth = config.depth
176+
177+
dim = config.hidden_size
178+
mlp_hidden_dim = config.intermediate_size
179+
out_hidden_size = config.out_hidden_size
180+
181+
spatial_merge_size = config.spatial_merge_size
182+
183+
head_dim = dim // num_heads
184+
185+
# every vision token's patch_embed comes from a conv of (C, T, H, W) -> (dim,)
186+
patch_embed_N = dim * config.in_channels * config.temporal_patch_size * config.patch_size * config.patch_size
187+
# Qwen3 VL vision mlp does not use GLU, thus 2.
188+
mlp_N = dim * mlp_hidden_dim * 2
189+
attn_linear_N = dim * (4 * dim) # qkv and output proj
190+
merger_N = (out_hidden_size + (dim * (spatial_merge_size**2))) * (dim * (spatial_merge_size**2))
191+
192+
# Qwen3 VL uses deep stack, one merger for every deepstack layer
193+
deepstack_merger_N = merger_N * len(config.deepstack_visual_indexes)
194+
# non-attn all_layer parm
195+
dense_N = patch_embed_N + (mlp_N + attn_linear_N) * depth + deepstack_merger_N + merger_N
196+
197+
# non-attn all_layer & all_token fwd & bwd flops
198+
dense_N_flops = 6 * dense_N * tokens_sum
199+
200+
# In Qwen3 VL, full attention is used in all vision layers.
201+
full_attn_layer_num = depth
202+
203+
# full attn layer & all_token fwd & bwd flops
204+
seqlen_square_sum = 0
205+
for seqlen in images_seqlens:
206+
seqlen_square_sum += seqlen * seqlen
207+
attn_qkv_flops = 12 * seqlen_square_sum * head_dim * num_heads * full_attn_layer_num
208+
209+
vit_flops = dense_N_flops + attn_qkv_flops
210+
211+
return vit_flops
212+
213+
121214
def _estimate_deepseek_v3_flops(config, tokens_sum, batch_seqlens, delta_time):
122215
hidden_size = config.hidden_size
123216
vocab_size = config.vocab_size
@@ -397,7 +490,7 @@ def _estimate_unknown_flops(config, tokens_sum, batch_seqlens, delta_time):
397490
"qwen2_5_vl": _estimate_qwen2_flops,
398491
"qwen3": _estimate_qwen2_flops,
399492
"qwen3_moe": _estimate_qwen2_moe_flops,
400-
"qwen3_vl": _estimate_qwen2_flops,
493+
"qwen3_vl": _estimate_qwen3_vl_flops,
401494
"qwen3_vl_moe": _estimate_qwen2_moe_flops,
402495
"deepseek_v3": _estimate_deepseek_v3_flops,
403496
"minicpmv": _estimate_qwen2_flops,
@@ -429,10 +522,10 @@ def __init__(self, config: PretrainedConfig):
429522
f"zero."
430523
)
431524

432-
self.config = getattr(config, "text_config", config)
525+
self.config = config
433526

434527
# TODO: actually we can make this a static method
435-
def estimate_flops(self, batch_seqlens, delta_time):
528+
def estimate_flops(self, batch_seqlens, delta_time, **kargs):
436529
"""
437530
Estimate the FLOPS based on the number of valid tokens in the current batch and the time taken.
438531
@@ -447,6 +540,10 @@ def estimate_flops(self, batch_seqlens, delta_time):
447540
"""
448541
tokens_sum = sum(batch_seqlens)
449542
func = ESTIMATE_FUNC.get(self.config.model_type, _estimate_unknown_flops)
450-
estimated_flops = func(self.config, tokens_sum, batch_seqlens, delta_time)
543+
images_seqlens = kargs.get("images_seqlens", None)
544+
if images_seqlens is not None and "vl" in func.__name__:
545+
estimated_flops = func(self.config, tokens_sum, batch_seqlens, delta_time, **kargs)
546+
else:
547+
estimated_flops = func(self.config, tokens_sum, batch_seqlens, delta_time)
451548
promised_flops = get_device_flops()
452549
return estimated_flops, promised_flops

verl/workers/fsdp_workers.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -926,7 +926,10 @@ def update_actor(self, data: DataProto):
926926
metrics = self.actor.update_policy(data=data)
927927
delta_time = timer.last
928928
global_num_tokens = data.meta_info["global_token_num"]
929-
estimated_flops, promised_flops = self.flops_counter.estimate_flops(global_num_tokens, delta_time)
929+
images_seqlens = data.meta_info.get("images_seqlens", None)
930+
estimated_flops, promised_flops = self.flops_counter.estimate_flops(
931+
global_num_tokens, delta_time, images_seqlens=images_seqlens
932+
)
930933
metrics["perf/mfu/actor"] = (
931934
estimated_flops * self.config.actor.ppo_epochs / promised_flops / self.world_size
932935
)

verl/workers/megatron_workers.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -738,6 +738,10 @@ def update_actor(self, data: DataProto):
738738
metrics = self.actor.update_policy(dataloader=dataloader)
739739
delta_time = timer.last
740740
global_num_tokens = data.meta_info["global_token_num"]
741+
images_seqlens = data.meta_info.get("images_seqlens", None)
742+
estimated_flops, promised_flops = self.flops_counter.estimate_flops(
743+
global_num_tokens, delta_time, images_seqlens=images_seqlens
744+
)
741745
estimated_flops, promised_flops = self.flops_counter.estimate_flops(global_num_tokens, delta_time)  # BUG(review): this pre-existing call was left in place — it runs after the new images_seqlens-aware call added above and overwrites its result, so the ViT FLOPs never reach the megatron MFU metric (and estimate_flops is computed twice); this line should be deleted in this PR.
742746
metrics["perf/mfu/actor"] = estimated_flops * self.config.actor.ppo_epochs / promised_flops / self.world_size
743747
metrics["perf/max_memory_allocated_gb"] = get_torch_device().max_memory_allocated() / (1024**3)

0 commit comments

Comments
 (0)