sdpython
diff --git a/‎_unittests/ut_helpers/test_model_builder_helper.py‎
Lines changed: 46 additions & 6 deletions b/‎_unittests/ut_helpers/test_model_builder_helper.py‎
Lines changed: 46 additions & 6 deletions
diff --git a/‎onnx_diagnostic/helpers/model_builder_helper.py‎
Lines changed: 254 additions & 12 deletions b/‎onnx_diagnostic/helpers/model_builder_helper.py‎
Lines changed: 254 additions & 12 deletions
@@ -4,15 +4,18 @@
     ExtTestCase,
     requires_torch,
     requires_transformers,
+    hide_stdout,
 )
 from onnx_diagnostic.helpers.model_builder_helper import (
     download_model_builder_to_cache,
     import_model_builder,
-    create_model,
+    create_model_builder,
+    save_model_builder,
 )
 from onnx_diagnostic.torch_models.hghub import (
     get_untrained_model_with_inputs,
 )
+from onnx_diagnostic.helpers.rt_helper import make_feeds
 
 
 class TestModelBuilderHelper(ExtTestCase):
@@ -28,15 +31,52 @@ def test_download_model_builder(self):
     # This is to limit impact on CI.
     @requires_transformers("4.52")
     @requires_torch("2.7.99")
+    @hide_stdout()
     def test_model_builder_id(self):
+        # Builds Tiny-LLM with the model builder, checks the in-memory proto
+        # loads in onnxruntime, then saves it to disk and compares its outputs
+        # with the torch model.
+        # Equivalent command line:
+        # clear&&python ~/.cache/onnx-diagnostic/builder.py
+        # --model arnir0/Tiny-LLM -p fp16 -c dump_cache -e cpu -o dump_model
         folder = self.get_dump_folder("test_model_builder_id")
         data = get_untrained_model_with_inputs("arnir0/Tiny-LLM")
-        model = create_model(
-            data["configuration"], precision="fp32", execution_provider="cpu", cache_dir=folder
+        onnx_model = create_model_builder(
+            data["configuration"],
+            data["model"],
+            precision="fp32",
+            execution_provider="cpu",
+            cache_dir=folder,
+            verbose=1,
         )
-        self.assertGreater(len(model.nodes), 5)
-        model.save_model(folder)
-        self.assertExists(os.path.join(folder, "model.onnx"))
+        self.assertGreater(len(onnx_model.nodes), 5)
+
+        # First pass: no output folder, the proto is returned in memory.
+        proto = save_model_builder(onnx_model, verbose=1)
+        import onnxruntime
+
+        onnxruntime.InferenceSession(
+            proto.SerializeToString(), providers=["CPUExecutionProvider"]
+        )
+
+        # We need to start again.
+        onnx_model = create_model_builder(
+            data["configuration"],
+            data["model"],
+            precision="fp32",
+            execution_provider="cpu",
+            cache_dir=folder,
+            verbose=1,
+        )
+        save_model_builder(onnx_model, folder, verbose=1)
+        model_name = os.path.join(folder, "model.onnx")
+        self.assertExists(model_name)
+
+        feeds = make_feeds(proto, data["inputs"], use_numpy=True)
+        expected = data["model"](**data["inputs"])
+
+        sess = onnxruntime.InferenceSession(model_name, providers=["CPUExecutionProvider"])
+        try:
+            got = sess.run(None, feeds)
+        except onnxruntime.capi.onnxruntime_pybind11_state.InvalidArgument as e:
+            if "batch_size must be 1 when sequence_length > 1" in str(e):
+                raise unittest.SkipTest("batch_size must be 1 when sequence_length > 1")
+            # Any other InvalidArgument is a real failure: propagate it instead
+            # of falling through to the assertion with ``got`` undefined.
+            raise
+        self.assertEqualAny(expected, got)
 
 
 if __name__ == "__main__":
 
@@ -5,8 +5,9 @@
 from pathlib import Path
 from typing import Any, Optional
 from urllib.parse import urlparse
+from onnx import helper, save_model, external_data_helper, ModelProto
 
-CACHE_SUBDIR = "onnx_diagnostic_cache"
+CACHE_SUBDIR = "onnx-diagnostic"
 
 
 def download_model_builder_to_cache(
@@ -53,35 +54,276 @@ def import_model_builder(module_name: str = "builder") -> object:
     return module
 
 
-def create_model(
+def _make_model(self, model, verbose: int = 0):
+    """
+    Walks the submodules of a torch model and asks the builder to emit
+    the matching ONNX pieces.
+
+    :param self: builder instance (this is a plain function, the builder
+        is passed explicitly as the first argument)
+    :param model: torch module, visited through ``model.modules()``
+    :param verbose: a truthy value prints one line per handled submodule
+    """
+    import torch
+
+    # Graph inputs/outputs first, then the pre-processing nodes.
+    self.make_inputs_and_outputs()
+    self.make_preprocessing_nodes()
+
+    decoder_suffixes = ("DecoderLayer", "GLMBlock")
+
+    # Map every submodule to ONNX/ORT ops, the first matching rule wins.
+    self.layer_id = 0
+    for submodule in model.modules():
+        # Embedding: (Hugging Face logic) or (GGUF logic)
+        if (
+            isinstance(submodule, torch.nn.Embedding)
+            and submodule.weight.shape[0] == self.vocab_size
+        ) or (hasattr(model, "embedding") and submodule == model.embedding):
+            if self.exclude_embeds:
+                # The embedding layer is left out of the model.
+                self.layernorm_attrs["root_input"] = "inputs_embeds"
+                self.layernorm_attrs["skip_input"] = "inputs_embeds"
+            else:
+                if verbose:
+                    print("[_make_model] Reading embedding layer")
+                self.make_embedding(submodule.weight.detach().cpu())
+            continue
+
+        # One decoder layer, as long as fewer than num_layers were emitted.
+        class_name = submodule.__class__.__name__
+        if class_name.endswith(decoder_suffixes) and self.layer_id < self.num_layers:
+            if verbose:
+                print(f"[_make_model] Reading decoder layer {self.layer_id}")
+            self.make_layer(self.layer_id, submodule)
+            self.layer_id += 1
+            continue
+
+        # SkipLayerNorm after the last decoder layer (MatMul --> SkipLayerNorm)
+        if self.layer_id == self.num_layers and self.has_final_norm(submodule, model):
+            if verbose:
+                print("[_make_model] Reading final norm")
+            self.make_layernorm(
+                self.layer_id,
+                submodule,
+                skip=True,
+                simple=self.layernorm_attrs["simple"],
+                location="final_norm",
+            )
+            continue
+
+        # Language modeling head: (Hugging Face logic) or (GGUF logic)
+        is_lm_head = (
+            isinstance(submodule, torch.nn.Linear)
+            and submodule.out_features == self.vocab_size
+        ) or (hasattr(model, "lm_head") and submodule == model.lm_head)
+        if is_lm_head and not self.exclude_lm_head:
+            # SkipLayerNorm --> logits
+            if verbose:
+                print("[_make_model] Reading LM head")
+            self.make_lm_head(submodule)
+
+
+def save_model_builder(
+    self, out_dir: Optional[str] = "", verbose: int = 0
+) -> Optional[ModelProto]:
+    """
+    Saves a model created by function :func:`create_model_builder`.
+    If out_dir is empty or not specified, nothing is written and the
+    function returns the generated model.
+
+    :param self: builder instance returned by :func:`create_model_builder`
+        (this is a plain function, the builder is passed explicitly)
+    :param out_dir: destination folder, the model is written to
+        ``<out_dir>/<self.filename>`` with its weights in a single
+        ``<self.filename>.data`` file next to it
+    :param verbose: a truthy value prints progress messages
+    :return: the in-memory model when *out_dir* is empty or None,
+        None when the model was written to *out_dir*
+
+    The ``.bin`` files found in ``self.cache_dir`` are loaded into the model
+    and then deleted from disk.
+    """
+    if verbose:
+        print(f"[save_model_builder] Saving ONNX model in {out_dir}")
+
+    # Create ONNX model
+    model = helper.make_model(
+        opset_imports=[
+            self.clear_field(
+                helper.make_operatorsetid("", 21 if self.quant_attrs["use_qdq"] else 14),
+                "domain",
+            ),
+            helper.make_operatorsetid("com.microsoft", 1),
+        ],
+        ir_version=7,
+        producer_name="onnxruntime-genai",
+        producer_version="0.0.0",
+        graph=self.make_graph(
+            name="main_graph",
+            inputs=self.inputs,
+            outputs=self.outputs,
+            initializer=self.initializers,
+            value_info=self.value_infos,
+            nodes=self.nodes,
+        ),
+    )
+
+    # Load external data into ONNX model
+    external_data_helper.load_external_data_for_model(model, self.cache_dir)
+
+    # Delete external data files on disk before re-saving.
+    # The cache directory itself is kept, even when it ends up empty.
+    for path in os.listdir(self.cache_dir):
+        if path.endswith(".bin"):
+            os.remove(os.path.join(self.cache_dir, path))
+
+    # Quantize ONNX model to desired precision
+    already_quantized_in_qdq_format = (
+        self.quant_type is not None and self.quant_attrs["use_qdq"]
+    )  # Skip quantizing `MatMul` in `DequantizeLinear --> Transpose --> MatMul` path
+    if self.onnx_dtype == "int4" and not already_quantized_in_qdq_format:
+        model = self.to_int4(model)
+
+    if not out_dir:
+        # Nothing to write, hand the in-memory model back to the caller.
+        return model
+
+    # Save ONNX model with only one external data file and delete any existing duplicate copies
+    out_path = os.path.join(out_dir, self.filename)
+    data_path = os.path.join(out_dir, os.path.basename(out_path) + ".data")
+    for stale in (out_path, data_path):
+        if os.path.exists(stale):
+            if verbose:
+                print(f"[save_model_builder] Overwriting {stale!r}")
+            os.remove(stale)
+
+    # ``location`` is only the file name recorded inside the model. It must not
+    # be used as a path for cleanup: it would resolve against the current
+    # working directory and could delete an unrelated file there.
+    save_model(
+        model,
+        out_path,
+        save_as_external_data=True,
+        all_tensors_to_one_file=True,
+        location=os.path.basename(data_path),
+        size_threshold=1024,
+        convert_attribute=False,
+    )
+    return None
+
+
+def create_model_builder(
     config: Any,
-    cache_dir: Optional[str] = None,
+    model: "torch.nn.Module",  # noqa: F821
+    cache_dir: str,
     precision: str = "fp32",
     execution_provider: str = "cpu",
+    verbose: int = 0,
     **extra_options,
 ) -> "Model":  # noqa: F821
     """
     Creates a model based on a configuration.
+    The onnx model is returned by function :func:`save_model_builder`.
 
     :param config: configuration
     :param cache_dir: cache directory
     :param precision: precision
     :param execution_provider: execution provider
+    :param verbose: verbosity
     :param extra_options: extra options
     :return: model
     """
+    assert cache_dir, "create_model_builder does not work without cache_dir."
+    assert os.path.exists(cache_dir), f"cache_dir={cache_dir!r} does not exists"
     download_model_builder_to_cache()
     builder = import_model_builder()
-    extra_kwargs = {}
     io_dtype = builder.set_io_dtype(precision, execution_provider, extra_options)
-    onnx_model = builder.Model(
-        config,
-        io_dtype,
-        precision,
-        execution_provider,
-        cache_dir,
-        extra_options,
-        **extra_kwargs,
+
+    arch_map = {
+        "ChatGLMForConditionalGeneration": builder.ChatGLMModel,
+        "ChatGLMModel": builder.ChatGLMModel,
+        "GemmaForCausalLM": builder.Gemma2Model,
+        "Gemma3ForCausalLM": builder.Gemma3Model,
+        "Gemma3ForConditionalGeneration": builder.Gemma3Model,
+        "GraniteForCausalLM": builder.GraniteModel,
+        "LlamaForCausalLM": builder.LlamaModel,
+        "MistralForCausalLM": builder.MistralModel,
+        "NemotronForCausalLM": builder.NemotronModel,
+        "OlmoForCausalLM": builder.OLMoModel,
+        "PhiForCausalLM": builder.PhiModel,
+        "Phi3ForCausalLM": (
+            lambda config, *_: (
+                builder.Phi3MiniModel
+                if config.max_position_embeddings == config.original_max_position_embeddings
+                else builder.Phi3MiniLongRoPEModel
+            )
+        ),
+        "PhiMoEForCausalLM": builder.Phi3MoELongRoPEModel,
+        "Phi3SmallForCausalLM": (
+            lambda config, *_: (
+                builder.Phi3SmallModel
+                if config.max_position_embeddings == config.original_max_position_embeddings
+                else builder.Phi3SmallLongRoPEModel
+            )
+        ),
+        "Phi3VForCausalLM": builder.Phi3VModel,
+        "Phi4MMForCausalLM": builder.Phi4MMModel,
+        "Qwen2ForCausalLM": builder.QwenModel,
+        "Qwen3ForCausalLM": builder.Qwen3Model,
+    }
+
+    assert config.architectures[0] in arch_map, (
+        f"Unable find {config.architectures[0]!r} in the supported list "
+        f"of architectures: {sorted(arch_map)}"
+    )
+
+    # Additional validations.
+    post = None
+    if config.architectures[0] in ("ChatGLMForConditionalGeneration", "ChatGLMModel"):
+        # Quantized ChatGLM model has ChatGLMForConditionalGeneration
+        # as architecture whereas HF model as the latter
+        config.hidden_act = "swiglu"
+    elif config.architectures[0] == "Gemma2ForCausalLM":
+        assert precision == "bfp16", (
+            f"architecture {config.architectures[0]!r} loses accuracy "
+            f"with float16 precision, use bfp16."
+        )
+    elif config.architectures[0] == "Gemma3ForCausalLM":
+        assert precision == "bfp16", (
+            f"architecture {config.architectures[0]!r} loses accuracy "
+            f"with float16 precision, use bfp16."
+        )
+
+        def _post(onnx_model):
+            onnx_model.model_type = "gemma3_text"
+
+        post = _post
+    elif config.architectures[0] == "Gemma3ForConditionalGeneration":
+        assert extra_options.get("exclude_embeds", False), (
+            f"This is only generating the text component of architecture "
+            f"{config.architectures[0]!r}. Set extra_options exclude_embeds=true."
+        )
+        assert precision == "bfp16", (
+            f"architecture {config.architectures[0]!r} loses accuracy "
+            f"with float16 precision, use bfp16."
+        )
+        text_config = config.text_config
+        for key in text_config:
+            if not hasattr(config, key):
+                setattr(config, key, getattr(text_config, key))
+    elif (
+        config.architectures[0] == "PhiMoEForCausalLM"
+        and config.max_position_embeddings != config.original_max_position_embeddings
+    ):
+        assert execution_provider == "cuda", (
+            f"architecture {config.architectures[0]!r} works on 'cuda' "
+            f"because `MoE` is only supported for CUDA in ONNX Runtime."
+        )
+        assert precision == "int4", f"architecture {config.architectures[0]!r} supports int4."
+    elif config.architectures[0] == "Phi3VForCausalLM":
+        assert extra_options.get("exclude_embeds", False), (
+            f"This is only generating the text component of architecture "
+            f"{config.architectures[0]!r}. Set extra_options exclude_embeds=true."
+        )
+    elif config.architectures[0] == "Phi4MMForCausalLM":
+        assert extra_options.get("exclude_embeds", False), (
+            f"This is only generating the text component of architecture "
+            f"{config.architectures[0]!r}. Set extra_options exclude_embeds=true."
+        )
+
+    cls = arch_map[config.architectures[0]]
+    onnx_model = cls(config, io_dtype, precision, execution_provider, cache_dir, extra_options)
+    if post:
+        post(onnx_model)
+    _make_model(onnx_model, model, verbose=verbose)
+
+    assert onnx_model.nodes, (
+        f"No node in the model, io_dtype={io_dtype!r}, "
+        f"precision={precision!r}, execution_provider={execution_provider!r}, "
+        f"extra_options={extra_options!r}, cache_dir={cache_dir!r}, "
+        f"\n-- config --\n{config}"
     )
     # onnx_model.make_genai_config(hf_name, extra_kwargs, output_dir)
     # onnx_model.save_processing(hf_name, extra_kwargs, output_dir)