
Commit 2f0b482

mb
1 parent 77dc839 commit 2f0b482

File tree

4 files changed: +395 -30 lines changed

_unittests/ut_helpers/test_model_builder_helper.py

Lines changed: 46 additions & 6 deletions
@@ -4,15 +4,18 @@
     ExtTestCase,
     requires_torch,
     requires_transformers,
+    hide_stdout,
 )
 from onnx_diagnostic.helpers.model_builder_helper import (
     download_model_builder_to_cache,
     import_model_builder,
-    create_model,
+    create_model_builder,
+    save_model_builder,
 )
 from onnx_diagnostic.torch_models.hghub import (
     get_untrained_model_with_inputs,
 )
+from onnx_diagnostic.helpers.rt_helper import make_feeds


 class TestModelBuilderHelper(ExtTestCase):
@@ -28,15 +31,52 @@ def test_download_model_builder(self):
     # This is to limit impact on CI.
     @requires_transformers("4.52")
     @requires_torch("2.7.99")
+    @hide_stdout()
     def test_model_builder_id(self):
+        # clear&&python ~/.cache/onnx-diagnostic/builder.py
+        # --model arnir0/Tiny-LLM -p fp16 -c dump_cache -e cpu -o dump_model
         folder = self.get_dump_folder("test_model_builder_id")
         data = get_untrained_model_with_inputs("arnir0/Tiny-LLM")
-        model = create_model(
-            data["configuration"], precision="fp32", execution_provider="cpu", cache_dir=folder
+        onnx_model = create_model_builder(
+            data["configuration"],
+            data["model"],
+            precision="fp32",
+            execution_provider="cpu",
+            cache_dir=folder,
+            verbose=1,
         )
-        self.assertGreater(len(model.nodes), 5)
-        model.save_model(folder)
-        self.assertExists(os.path.join(folder, "model.onnx"))
+        self.assertGreater(len(onnx_model.nodes), 5)
+
+        proto = save_model_builder(onnx_model, verbose=1)
+        import onnxruntime
+
+        onnxruntime.InferenceSession(
+            proto.SerializeToString(), providers=["CPUExecutionProvider"]
+        )
+
+        # We need to start again.
+        onnx_model = create_model_builder(
+            data["configuration"],
+            data["model"],
+            precision="fp32",
+            execution_provider="cpu",
+            cache_dir=folder,
+            verbose=1,
+        )
+        save_model_builder(onnx_model, folder, verbose=1)
+        model_name = os.path.join(folder, "model.onnx")
+        self.assertExists(model_name)
+
+        feeds = make_feeds(proto, data["inputs"], use_numpy=True)
+        expected = data["model"](**data["inputs"])
+
+        sess = onnxruntime.InferenceSession(model_name, providers=["CPUExecutionProvider"])
+        try:
+            got = sess.run(None, feeds)
+        except onnxruntime.capi.onnxruntime_pybind11_state.InvalidArgument as e:
+            assert "batch_size must be 1 when sequence_length > 1" in str(e), str(e)
+            raise unittest.SkipTest("batch_size must be 1 when sequence_length > 1")
+        self.assertEqualAny(expected, got)


 if __name__ == "__main__":
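
For readers skimming the diff, the new helpers exercised by this test compose as follows. This is a minimal sketch based on the test above: the model id arnir0/Tiny-LLM and every imported helper come from this commit, while the dump_cache folder name is illustrative (create_model_builder asserts that the cache directory exists).

import os
import onnxruntime
from onnx_diagnostic.helpers.model_builder_helper import (
    create_model_builder,
    save_model_builder,
)
from onnx_diagnostic.helpers.rt_helper import make_feeds
from onnx_diagnostic.torch_models.hghub import get_untrained_model_with_inputs

data = get_untrained_model_with_inputs("arnir0/Tiny-LLM")
os.makedirs("dump_cache", exist_ok=True)  # the helper asserts cache_dir exists

# Build the ModelBuilder graph from the configuration and the torch model.
onnx_model = create_model_builder(
    data["configuration"],
    data["model"],
    precision="fp32",
    execution_provider="cpu",
    cache_dir="dump_cache",
)

# Without an output folder, save_model_builder returns the ModelProto in memory.
proto = save_model_builder(onnx_model)

sess = onnxruntime.InferenceSession(
    proto.SerializeToString(), providers=["CPUExecutionProvider"]
)
feeds = make_feeds(proto, data["inputs"], use_numpy=True)
print([o.shape for o in sess.run(None, feeds)])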

onnx_diagnostic/helpers/model_builder_helper.py

Lines changed: 254 additions & 12 deletions
@@ -5,8 +5,9 @@
 from pathlib import Path
 from typing import Any, Optional
 from urllib.parse import urlparse
+from onnx import helper, save_model, external_data_helper, ModelProto

-CACHE_SUBDIR = "onnx_diagnostic_cache"
+CACHE_SUBDIR = "onnx-diagnostic"


 def download_model_builder_to_cache(
@@ -53,35 +54,276 @@ def import_model_builder(module_name: str = "builder") -> object:
     return module


-def create_model(
+def _make_model(self, model, verbose: int = 0):
+    # Make inputs and outputs to ONNX model
+    import torch
+
+    self.make_inputs_and_outputs()
+
+    # Make pre-processing nodes
+    self.make_preprocessing_nodes()
+
+    # Loop through model and map each module to ONNX/ORT ops
+    self.layer_id = 0
+    for module in model.modules():
+        if (
+            isinstance(module, torch.nn.Embedding)
+            and module.weight.shape[0] == self.vocab_size
+        ) or (hasattr(model, "embedding") and module == model.embedding):
+            # Checks (Hugging Face logic) or (GGUF logic)
+            if not self.exclude_embeds:
+                # Embedding layer
+                if verbose:
+                    print("[_make_model] Reading embedding layer")
+                self.make_embedding(module.weight.detach().cpu())
+            else:
+                # Exclude embedding layer from model
+                self.layernorm_attrs["root_input"] = "inputs_embeds"
+                self.layernorm_attrs["skip_input"] = "inputs_embeds"
+
+        elif (
+            module.__class__.__name__.endswith("DecoderLayer")
+            or module.__class__.__name__.endswith("GLMBlock")
+        ) and self.layer_id < self.num_layers:
+            # Each decoder layer of model
+            if verbose:
+                print(f"[_make_model] Reading decoder layer {self.layer_id}")
+            self.make_layer(self.layer_id, module)
+            self.layer_id += 1
+
+        elif self.layer_id == self.num_layers and self.has_final_norm(module, model):
+            # SkipLayerNorm after last decoder layer (MatMul --> SkipLayerNorm)
+            if verbose:
+                print("[_make_model] Reading final norm")
+            self.make_layernorm(
+                self.layer_id,
+                module,
+                skip=True,
+                simple=self.layernorm_attrs["simple"],
+                location="final_norm",
+            )
+
+        elif (
+            isinstance(module, torch.nn.Linear) and module.out_features == self.vocab_size
+        ) or (hasattr(model, "lm_head") and module == model.lm_head):
+            # Checks (Hugging Face logic) or (GGUF logic)
+            if not self.exclude_lm_head:
+                # Language modeling head (SkipLayerNorm --> logits)
+                if verbose:
+                    print("[_make_model] Reading LM head")
+                self.make_lm_head(module)
+
+
+def save_model_builder(self, out_dir: Optional[str] = "", verbose: int = 0) -> ModelProto:
+    """
+    Saves a model created by function :func:`create_model_builder`.
+    If out_dir is empty or not specified, the function still returns the
+    generated model.
+    """
+    if verbose:
+        print(f"[save_model_builder] Saving ONNX model in {out_dir}")
+
+    # Create ONNX model
+    model = helper.make_model(
+        opset_imports=[
+            self.clear_field(
+                helper.make_operatorsetid("", 21 if self.quant_attrs["use_qdq"] else 14),
+                "domain",
+            ),
+            helper.make_operatorsetid("com.microsoft", 1),
+        ],
+        ir_version=7,
+        producer_name="onnxruntime-genai",
+        producer_version="0.0.0",
+        graph=self.make_graph(
+            name="main_graph",
+            inputs=self.inputs,
+            outputs=self.outputs,
+            initializer=self.initializers,
+            value_info=self.value_infos,
+            nodes=self.nodes,
+        ),
+    )
+
+    # Load external data into ONNX model
+    external_data_helper.load_external_data_for_model(model, self.cache_dir)
+
+    # Delete external data files on disk before re-saving
+    for path in os.listdir(self.cache_dir):
+        if path.endswith(".bin"):
+            os.remove(os.path.join(self.cache_dir, path))
+
+    # Delete temporary cache dir if empty
+    # if len(os.listdir(self.cache_dir)) == 0:
+    #     os.rmdir(self.cache_dir)
+
+    # Quantize ONNX model to desired precision
+    already_quantized_in_qdq_format = (
+        self.quant_type is not None and self.quant_attrs["use_qdq"]
+    )  # Skip quantizing `MatMul` in `DequantizeLinear --> Transpose --> MatMul` path
+    if self.onnx_dtype == "int4" and not already_quantized_in_qdq_format:
+        model = self.to_int4(model)
+
+    # Save ONNX model with only one external data file and delete any existing duplicate copies
+    if out_dir:
+        out_path = os.path.join(out_dir, self.filename)
+        data_path = os.path.join(out_dir, os.path.basename(out_path) + ".data")
+        if os.path.exists(out_path):
+            if verbose:
+                print(f"[save_model_builder] Overwriting {out_path!r}")
+            os.remove(out_path)
+        if os.path.exists(data_path):
+            if verbose:
+                print(f"[save_model_builder] Overwriting {data_path!r}")
+            os.remove(data_path)
+
+    if out_dir:
+        location = os.path.basename(data_path)
+        if os.path.exists(location):
+            os.remove(location)
+        save_model(
+            model,
+            out_path,
+            save_as_external_data=True,
+            all_tensors_to_one_file=True,
+            location=location,
+            size_threshold=1024,
+            convert_attribute=False,
+        )
+        return None
+    return model
+
+
+def create_model_builder(
     config: Any,
-    cache_dir: Optional[str] = None,
+    model: "torch.nn.Module",  # noqa: F821
+    cache_dir: str,
     precision: str = "fp32",
     execution_provider: str = "cpu",
+    verbose: int = 0,
     **extra_options,
 ) -> "Model":  # noqa: F821
     """
     Creates a model based on a configuration.
+    The onnx model is returned by function :func:`save_model_builder`.

     :param config: configuration
     :param cache_dir: cache directory
     :param precision: precision
     :param execution_provider: execution provider
+    :param verbose: verbosity
     :param extra_options: extra options
     :return: model
     """
+    assert cache_dir, "create_model_builder does not work without cache_dir."
+    assert os.path.exists(cache_dir), f"cache_dir={cache_dir!r} does not exist"
     download_model_builder_to_cache()
     builder = import_model_builder()
-    extra_kwargs = {}
     io_dtype = builder.set_io_dtype(precision, execution_provider, extra_options)
-    onnx_model = builder.Model(
-        config,
-        io_dtype,
-        precision,
-        execution_provider,
-        cache_dir,
-        extra_options,
-        **extra_kwargs,
+
+    arch_map = {
+        "ChatGLMForConditionalGeneration": builder.ChatGLMModel,
+        "ChatGLMModel": builder.ChatGLMModel,
+        "GemmaForCausalLM": builder.Gemma2Model,
+        "Gemma3ForCausalLM": builder.Gemma3Model,
+        "Gemma3ForConditionalGeneration": builder.Gemma3Model,
+        "GraniteForCausalLM": builder.GraniteModel,
+        "LlamaForCausalLM": builder.LlamaModel,
+        "MistralForCausalLM": builder.MistralModel,
+        "NemotronForCausalLM": builder.NemotronModel,
+        "OlmoForCausalLM": builder.OLMoModel,
+        "PhiForCausalLM": builder.PhiModel,
+        "Phi3ForCausalLM": (
+            lambda config, *_: (
+                builder.Phi3MiniModel
+                if config.max_position_embeddings == config.original_max_position_embeddings
+                else builder.Phi3MiniLongRoPEModel
+            )
+        ),
+        "PhiMoEForCausalLM": builder.Phi3MoELongRoPEModel,
+        "Phi3SmallForCausalLM": (
+            lambda config, *_: (
+                builder.Phi3SmallModel
+                if config.max_position_embeddings == config.original_max_position_embeddings
+                else builder.Phi3SmallLongRoPEModel
+            )
+        ),
+        "Phi3VForCausalLM": builder.Phi3VModel,
+        "Phi4MMForCausalLM": builder.Phi4MMModel,
+        "Qwen2ForCausalLM": builder.QwenModel,
+        "Qwen3ForCausalLM": builder.Qwen3Model,
+    }
+
+    assert config.architectures[0] in arch_map, (
+        f"Unable to find {config.architectures[0]!r} in the supported list "
+        f"of architectures: {sorted(arch_map)}"
+    )
+
+    # Additional validations.
+    post = None
+    if config.architectures[0] in ("ChatGLMForConditionalGeneration", "ChatGLMModel"):
+        # A quantized ChatGLM model reports ChatGLMForConditionalGeneration
+        # as its architecture whereas the HF model reports ChatGLMModel.
+        config.hidden_act = "swiglu"
+    elif config.architectures[0] == "Gemma2ForCausalLM":
+        assert precision == "bfp16", (
+            f"architecture {config.architectures[0]!r} loses accuracy "
+            f"with float16 precision, use bfp16."
+        )
+    elif config.architectures[0] == "Gemma3ForCausalLM":
+        assert precision == "bfp16", (
+            f"architecture {config.architectures[0]!r} loses accuracy "
+            f"with float16 precision, use bfp16."
+        )

+        def _post(onnx_model):
+            onnx_model.model_type = "gemma3_text"

+        post = _post
+    elif config.architectures[0] == "Gemma3ForConditionalGeneration":
+        assert extra_options.get("exclude_embeds", False), (
+            f"This is only generating the text component of architecture "
+            f"{config.architectures[0]!r}. Set extra_options exclude_embeds=true."
+        )
+        assert precision == "bfp16", (
+            f"architecture {config.architectures[0]!r} loses accuracy "
+            f"with float16 precision, use bfp16."
+        )
+        text_config = config.text_config
+        for key in text_config:
+            if not hasattr(config, key):
+                setattr(config, key, getattr(text_config, key))
+    elif (
+        config.architectures[0] == "PhiMoEForCausalLM"
+        and config.max_position_embeddings != config.original_max_position_embeddings
+    ):
+        assert execution_provider == "cuda", (
+            f"architecture {config.architectures[0]!r} works on 'cuda' "
+            f"because `MoE` is only supported for CUDA in ONNX Runtime."
+        )
+        assert precision == "int4", f"architecture {config.architectures[0]!r} supports int4."
+    elif config.architectures[0] == "Phi3VForCausalLM":
+        assert extra_options.get("exclude_embeds", False), (
+            f"This is only generating the text component of architecture "
+            f"{config.architectures[0]!r}. Set extra_options exclude_embeds=true."
+        )
+    elif config.architectures[0] == "Phi4MMForCausalLM":
+        assert extra_options.get("exclude_embeds", False), (
+            f"This is only generating the text component of architecture "
+            f"{config.architectures[0]!r}. Set extra_options exclude_embeds=true."
+        )
+
+    cls = arch_map[config.architectures[0]]
+    onnx_model = cls(config, io_dtype, precision, execution_provider, cache_dir, extra_options)
+    if post:
+        post(onnx_model)
+    _make_model(onnx_model, model, verbose=verbose)
+
+    assert onnx_model.nodes, (
+        f"No node in the model, io_dtype={io_dtype!r}, "
+        f"precision={precision!r}, execution_provider={execution_provider!r}, "
+        f"extra_options={extra_options!r}, cache_dir={cache_dir!r}, "
+        f"\n-- config --\n{config}"
     )
     # onnx_model.make_genai_config(hf_name, extra_kwargs, output_dir)
     # onnx_model.save_processing(hf_name, extra_kwargs, output_dir)
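
_make_model above recognizes the building blocks of the torch model in a single pass over model.modules(), dispatching on module type and class name. Below is a stripped-down, runnable sketch of that traversal pattern; the toy classes are hypothetical and only the dispatch logic mirrors the function in the diff.

import torch


class ToyDecoderLayer(torch.nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.mlp = torch.nn.Linear(dim, dim)


class ToyModel(torch.nn.Module):
    def __init__(self, vocab_size: int = 32, dim: int = 8):
        super().__init__()
        self.embed = torch.nn.Embedding(vocab_size, dim)
        self.layers = torch.nn.ModuleList(ToyDecoderLayer(dim) for _ in range(2))
        self.lm_head = torch.nn.Linear(dim, vocab_size)


def walk_modules(model: torch.nn.Module, vocab_size: int) -> list:
    """Classifies submodules the way _make_model does: the embedding is the
    Embedding sized to the vocabulary, decoder layers are matched by class
    name, and the LM head is a Linear projecting onto the vocabulary."""
    found = []
    for module in model.modules():
        if isinstance(module, torch.nn.Embedding) and module.weight.shape[0] == vocab_size:
            found.append("embedding")
        elif module.__class__.__name__.endswith("DecoderLayer"):
            found.append("decoder_layer")
        elif isinstance(module, torch.nn.Linear) and module.out_features == vocab_size:
            found.append("lm_head")
    return found


print(walk_modules(ToyModel(), vocab_size=32))
# ['embedding', 'decoder_layer', 'decoder_layer', 'lm_head']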

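save_model_builder also relies on a standard onnx idiom worth calling out: re-attach externalized weights to the proto, then write everything back as one graph file plus a single external-data file. A self-contained sketch of just that idiom, with hypothetical paths (the output directory must already exist):

from onnx import external_data_helper, load_model, save_model

# Load the graph without weights, then pull the external tensors
# stored next to it back into the proto.
model = load_model("model_dir/model.onnx", load_external_data=False)
external_data_helper.load_external_data_for_model(model, "model_dir")

# Re-save with every tensor above 1 KB consolidated into one sidecar file.
save_model(
    model,
    "out/model.onnx",
    save_as_external_data=True,
    all_tensors_to_one_file=True,
    location="model.onnx.data",  # written relative to out/model.onnx
    size_threshold=1024,
    convert_attribute=False,
)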