Skip to content

Commit 815309e

Browse files
authored
fix(cloud.infer): reduce Qwen3-MoE export OOM risk (#821)
Summary - Keep `use_onnx_subfunctions` disabled by default in `QEfficient.cloud.infer` - Provide explicit opt-in via `--use-onnx-subfunctions` only - Remove `--no-use-onnx-subfunctions` - Update infer unit tests for explicit-enable and default-disabled behavior - Update quick-start and text-generation docs to reflect explicit opt-in behavior Why - Align infer behavior with reviewer feedback to keep defaults unchanged and avoid model-specific auto-enable behavior. Fixes - Fixes #702 Validation - `python -m py_compile QEfficient/cloud/infer.py tests/cloud/test_infer.py` - `ruff check QEfficient/cloud/infer.py tests/cloud/test_infer.py` - `pytest -q tests/cloud/test_infer.py -m "not on_qaic"` (2 passed, 5 deselected) --------- Signed-off-by: jd316 <jd316biswas@gmail.com>
1 parent 3d0d663 commit 815309e

File tree

4 files changed

+72
-8
lines changed

4 files changed

+72
-8
lines changed

QEfficient/cloud/infer.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ def main(
139139
qnn_config: Optional[str] = None,
140140
trust_remote_code: Optional[bool] = False,
141141
ccl_enabled: Optional[bool] = False,
142+
use_onnx_subfunctions: bool = False,
142143
**kwargs,
143144
) -> None:
144145
"""
@@ -205,6 +206,8 @@ def main(
205206
Path of the QNN Config parameters file. Default is None.
206207
trust_remote_code : bool, optional
207208
If True, trusts remote code when loading models from HuggingFace. Default is False.
209+
use_onnx_subfunctions : bool, optional
210+
Enables ONNX subfunctions during export and compile. Default is False.
208211
**kwargs :
209212
Additional compiler options passed directly to `qaic-compile`. Any flag supported by
210213
`qaic-compile` can be passed. Parameters are converted to flags as follows:
@@ -231,12 +234,10 @@ def main(
231234
"""
232235
cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir)
233236

234-
if "--mxfp6" in sys.argv:
235-
if args.mxfp6:
236-
logger.warning("mxfp6 is going to be deprecated in a future release, use -mxfp6_matmul instead.")
237-
if "--mxint8" in sys.argv:
238-
if args.mxint8:
239-
logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.")
237+
if "--mxfp6" in sys.argv and mxfp6:
238+
logger.warning("mxfp6 is going to be deprecated in a future release, use -mxfp6_matmul instead.")
239+
if "--mxint8" in sys.argv and mxint8:
240+
logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.")
240241

241242
qaic_config = {"ccl_enabled": True} if ccl_enabled else None
242243

@@ -280,6 +281,7 @@ def main(
280281
allow_mxint8_mdp_io=allow_mxint8_mdp_io,
281282
enable_qnn=enable_qnn,
282283
qnn_config=qnn_config,
284+
use_onnx_subfunctions=use_onnx_subfunctions,
283285
**kwargs,
284286
)
285287

@@ -382,6 +384,14 @@ def main(
382384
action="store_true",
383385
help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False",
384386
)
387+
parser.add_argument(
388+
"--use-onnx-subfunctions",
389+
"--use_onnx_subfunctions",
390+
dest="use_onnx_subfunctions",
391+
action="store_true",
392+
default=False,
393+
help="Enable ONNX subfunctions during export/compile.",
394+
)
385395
parser.add_argument(
386396
"--num_cores", "--num-cores", type=int, required=True, help="Number of cores to compile on Cloud AI 100"
387397
)

docs/source/quick_start.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,13 +111,19 @@ This is the single e2e CLI API, which takes `model_card` name as input along wit
111111

112112
* HuggingFace model files Download → Optimize for Cloud AI 100 → Export to `ONNX` → Compile on Cloud AI 100 → [Execute](#execute_api)
113113
* It skips the export/compile stage if `ONNX` or `qpc` files are found. If you use infer a second time with different compilation arguments, it will automatically skip `ONNX` model creation and directly jump to the compile stage.
114+
* ONNX subfunctions can be enabled explicitly using `--use-onnx-subfunctions`.
114115

115116

116117
```bash
117118
# Check out the options using the help
118119
python -m QEfficient.cloud.infer --help
119120
python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first
120121
```
122+
123+
```bash
124+
# Optional: explicitly control ONNX subfunction usage
125+
python -m QEfficient.cloud.infer --model_name Qwen/Qwen3-30B-A3B-Instruct-2507 --batch_size 1 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is" --use-onnx-subfunctions
126+
```
121127
If executing for batch size>1,
122128
You can pass input prompts in a single string, separating them with the pipe (|) symbol. Example below
123129

examples/text_generation/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ This example:
115115
- Demonstrates MoE model inference
116116
- Uses sparse expert activation for efficiency
117117
- Works with Qwen, Mixtral, and other MoE models
118+
- Supports explicit ONNX subfunction enablement with `--use-onnx-subfunctions`
118119

119120

120121
## CLI Workflow
@@ -216,6 +217,7 @@ This uses the pre-compiled QPC for fast inference. You can run this multiple tim
216217
| `--device_group` | Device IDs to use | `[0]` | `[0]` or `[0,1,2,3]` |
217218
| `--mxfp6` | Enable MXFP6 quantization | False | Add flag to enable |
218219
| `--mxint8_kv_cache` | Enable MXINT8 KV cache | False | Add flag to enable |
220+
| `--use-onnx-subfunctions` | Enable ONNX subfunctions for export/compile | False | Add flag to enable |
219221
| `--mos` | Memory optimization strategy | 1 | `1` or `2` |
220222
| `--aic_enable_depth_first` | Enable depth-first execution | False | Add flag to enable |
221223

@@ -312,4 +314,3 @@ This script demonstrates:
312314
By default, exported models and QPC files are stored in `~/.cache/qeff_cache`. Customize this with:
313315
- `QEFF_HOME`: Primary cache directory
314316
- `XDG_CACHE_HOME`: Alternative cache location
315-

tests/cloud/test_infer.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,22 @@
55
#
66
# -----------------------------------------------------------------------------
77

8+
from types import SimpleNamespace
9+
810
import pytest
911

1012
import QEfficient
1113
from QEfficient.cloud.infer import main as infer
1214

1315

1416
def check_infer(
15-
mocker, model_name, prompt="My name is", full_batch_size=None, enable_qnn=False, image_url=None, generation_len=20
17+
mocker,
18+
model_name,
19+
prompt="My name is",
20+
full_batch_size=None,
21+
enable_qnn=False,
22+
image_url=None,
23+
generation_len=20,
1624
):
1725
check_and_assign_cache_dir_spy = mocker.spy(QEfficient.cloud.infer, "check_and_assign_cache_dir")
1826
qeff_model_load_spy = mocker.spy(QEfficient.cloud.infer.QEFFCommonLoader, "from_pretrained")
@@ -99,3 +107,42 @@ def test_infer_vlm(mocker):
99107
prompt="Describe the image.",
100108
image_url="https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg",
101109
)
110+
111+
112+
class _DummyQEFFModel:
113+
def __init__(self, architecture):
114+
self.model = SimpleNamespace(config=SimpleNamespace(architectures=[architecture]))
115+
self.compile_kwargs = None
116+
117+
def compile(self, **kwargs):
118+
self.compile_kwargs = kwargs
119+
return "/tmp/qpc"
120+
121+
def generate(self, *args, **kwargs):
122+
return {}
123+
124+
125+
def _run_infer_with_dummy_model(mocker, architecture, **infer_kwargs):
126+
dummy_model = _DummyQEFFModel(architecture=architecture)
127+
mocker.patch.object(QEfficient.cloud.infer, "check_and_assign_cache_dir", return_value="/tmp/cache")
128+
mocker.patch.object(QEfficient.cloud.infer.QEFFCommonLoader, "from_pretrained", return_value=dummy_model)
129+
mocker.patch.object(QEfficient.cloud.infer, "load_hf_tokenizer", return_value=object())
130+
131+
infer(
132+
model_name="dummy/model",
133+
num_cores=16,
134+
prompt=["hello"],
135+
generation_len=1,
136+
**infer_kwargs,
137+
)
138+
return dummy_model
139+
140+
141+
def test_infer_enables_onnx_subfunctions_when_explicitly_set(mocker):
142+
dummy_model = _run_infer_with_dummy_model(mocker, architecture="Qwen3MoeForCausalLM", use_onnx_subfunctions=True)
143+
assert dummy_model.compile_kwargs["use_onnx_subfunctions"] is True
144+
145+
146+
def test_infer_keeps_onnx_subfunctions_disabled_by_default(mocker):
147+
dummy_model = _run_infer_with_dummy_model(mocker, architecture="LlamaForCausalLM")
148+
assert dummy_model.compile_kwargs["use_onnx_subfunctions"] is False

0 commit comments

Comments
 (0)