Commits (26 total; the diff below shows changes from 9 of them)
d494c82  Initial commit (qti-kromero, Aug 13, 2025)
ddf3ea8  Add README and start config (qti-kromero, Aug 13, 2025)
1f54074  QuaRot passing, working on GptqQuantizer (qti-kromero, Aug 14, 2025)
6cae95f  Work on dataset integration (qti-kromero, Aug 15, 2025)
2d0872e  Data processing works (qti-kromero, Aug 15, 2025)
6a6f67d  Fix lint issues and cleanup (qti-kromero, Aug 15, 2025)
cd24ddf  Adding vision resources (qti-kromero, Aug 18, 2025)
636e982  Add Gemma3 vision configurations (qti-kromero, Aug 19, 2025)
b4ea7a3  Fix linting error (qti-kromero, Aug 19, 2025)
1f69af3  Vision model onnx conversion working (qti-kromero, Aug 19, 2025)
aed20ec  Enable quant on text model (qti-kromero, Aug 20, 2025)
ba0633c  Improve README (qti-kromero, Aug 26, 2025)
5ad910d  Merge remote-tracking branch 'origin/main' into dev/qti-kromero/gemma3 (qti-kromero, Aug 28, 2025)
acbdfdc  Add files from Prudvhi (qti-kromero, Aug 28, 2025)
f7178ae  Updates (qti-kromero, Sep 2, 2025)
bd70ff4  Updates (qti-kromero, Sep 3, 2025)
c962cee  Add olive requirements file (prudhvi-qti, Sep 4, 2025)
360d9c2  update (qti-kromero, Sep 4, 2025)
5fcda5c  Update Olive scripts for gemma3 (prudhvi-qti, Sep 4, 2025)
14018ee  Update few python packages (prudhvi-qti, Sep 5, 2025)
1f89241  Use the same llava dataset for text model as well (prudhvi-qti, Sep 8, 2025)
7d4ced8  Minor cleanup (qti-kromero, Sep 9, 2025)
a0bd703  Add system requirements (prudhvi-qti, Sep 9, 2025)
f712bdc  Merge remote-tracking branch 'origin/main' into dev/qti-kromero/gemma3 (qti-kromero, Sep 18, 2025)
f685073  Remove examples (qti-kromero, Sep 18, 2025)
5dff155  Fix review comments (qti-kromero, Sep 18, 2025)
29 changes: 29 additions & 0 deletions examples/gemma3/qnn/README.md
@@ -0,0 +1,29 @@
# Gemma-3-4B Model Optimization

This repository demonstrates optimization of the [Google Gemma-3-4B](https://huggingface.co/google/gemma-3-4b-it) model using **post-training quantization (PTQ)** techniques. The optimization process uses an environment based closely on the [PTQ tutorial for Phi-3.5](https://github.com/CodeLinaro/Olive/blob/main/examples/phi3_5/README.md).

## Automated Setup (Linux Only)

Requirements:
* Python 3.10
* uv - used throughout the setup scripts; follow the [publicly available installation instructions](https://docs.astral.sh/uv/getting-started/installation/#installation-methods)

This repository contains a setup script for Linux that automates many of the steps from the tutorial linked above:

```bash
source env_setup.sh
```
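
Before sourcing the script, a quick pre-flight check can confirm the prerequisites above are in place (the expected versions follow the requirements list):

```bash
python3 --version   # expect 3.10.x
uv --version        # confirms uv is on the PATH
```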

## Optimization Process

Since Gemma-3-4B is a multi-modal model composed of both vision and text components, the strategy for optimizing it through Olive is to operate on the constituent models separately before configuring them to work in concert at the onnxruntime-genai stage.

Thus, run the following commands to produce context binaries for the text and vision portions of the model, respectively:

```bash
olive run --config gemma3-4b-text-qnn-config.json
```

```bash
olive run --config gemma3-4b-vision-qnn-config.json
```
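
If both runs complete successfully, the optimized models should appear under the `output_dir` paths set in each config; a quick way to confirm (assuming the commands were run from this directory):

```bash
ls models/gemma-3-4b-it-text
ls models/gemma-3-4b-it-vision
```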
20 changes: 20 additions & 0 deletions examples/gemma3/qnn/custom_gemma3_4b_it_vision.py
@@ -0,0 +1,20 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------


import torch
from transformers import AutoModel


def load_gemma3_model(model_path):
    # Olive calls this loader with the input model path, but the HF model id
    # is pinned here and model_path goes unused.
    return AutoModel.from_pretrained("google/gemma-3-4b-it")


def get_dummy_inputs(model_handler):
    return {
        "input_ids": torch.full((1, 256), 262144, dtype=torch.long),  # image token ID at every position
        "pixel_values": torch.randn(1, 3, 896, 896, dtype=torch.float32),  # Gemma 3 vision input resolution
        "attention_mask": torch.ones((1, 256), dtype=torch.long),
    }
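
A quick way to sanity-check that this loader and its dummy inputs line up with the `io_config` declared in the vision config below is a sketch like the following (an illustrative check only, not part of the example; it loads the full multimodal checkpoint, so it needs sufficient host memory):

```python
import torch

# Assumes the two functions above are in scope (e.g. run from custom_gemma3_4b_it_vision.py).
model = load_gemma3_model(None)  # model_path is unused by the loader
model.eval()
with torch.no_grad():
    out = model(**get_dummy_inputs(None))
print(out.last_hidden_state.shape)  # expected: torch.Size([1, 256, 2560])
```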
23 changes: 23 additions & 0 deletions examples/gemma3/qnn/env_setup.sh
@@ -0,0 +1,23 @@
#!/bin/bash

# Install setuptools, which is needed to build Olive from source
uv pip install setuptools

# Install the example requirements (requires uv; see the README)
uv pip install -r ../requirements.txt

# Install Olive's dependencies from the repository root
uv pip install -r ../../../requirements.txt

# Disable CUDA extension build
export BUILD_CUDA_EXT=0

# Install AutoGPTQ from source
uv pip install --no-build-isolation git+https://github.com/PanQiWei/AutoGPTQ.git

# Install GptqModel from source
uv pip install --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git@5d2911a4b2a709afb0941d53c3882d0cd80b9649

# Install onnxruntime-qnn without installing onnxruntime
# Note: Installing both at the same time may cause conflicts
uv pip install -r https://raw.githubusercontent.com/microsoft/onnxruntime/refs/heads/main/requirements.txt
uv pip install -U --pre --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple onnxruntime-qnn --no-deps
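
Once the script finishes, a one-liner can verify that the QNN-enabled wheel was the one installed (the provider list should include QNNExecutionProvider):

```bash
python -c "import onnxruntime as ort; print(ort.__version__, ort.get_available_providers())"
```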
80 changes: 80 additions & 0 deletions examples/gemma3/qnn/gemma3-4b-text-qnn-config.json
@@ -0,0 +1,80 @@
{
"input_model": { "type": "HfModel", "model_path": "google/gemma-3-4b-it" },
"systems": {
"qnn_system": {
"type": "PythonEnvironment",
"python_environment_path": "/local/mnt2/workspace/kromero/olive/olive-venv/bin",
"accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ]
}
},
"data_configs": [
{
"name": "gemma_text_data_config",
"user_script": "user_script.py",
"load_dataset_config": { "type": "gemma_text_dataset", "model_id": "google/gemma-3-4b-it" }
}
],
"passes": {
"q": { "type": "QuaRot" },
"g": {
"type": "GptqModel",
"bits": 4,
"sym": true,
"group_size": -1,
"lm_head": false,
"device": "cuda",
"data_config": "gemma_text_data_config"
},
"cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true },
"mb": {
"type": "ModelBuilder",
"precision": "int4",
"int4_block_size": 32,
"int4_accuracy_level": 4,
"int4_op_types_to_quantize": [ "MatMul", "Gather" ]
},
"mq": {
"type": "MatMulNBitsToQDQ",
"use_int4": true,
"add_zero_point": true,
"nodes_to_exclude": [ "/lm_head/MatMul_Q4" ],
"save_as_external_data": true
},
"gs": {
"type": "GraphSurgeries",
"surgeries": [
{ "surgeon": "RemoveRopeMultiCache" },
{ "surgeon": "AttentionMaskToSequenceLengths" },
{ "surgeon": "SimplifiedLayerNormToL2Norm" }
],
"save_as_external_data": true
},
"sq": {
"type": "OnnxStaticQuantization",
"data_config": "gemma_text_data_config",
"activation_type": "uint16",
"precision": "uint8",
"calibration_providers": [ "CUDAExecutionProvider" ],
"quant_preprocess": true,
"op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ],
"save_as_external_data": true
},
"sp": { "type": "SplitModel" },
"st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 },
"cb": {
"type": "EPContextBinaryGenerator",
"provider_options": {
"htp_performance_mode": "burst",
"htp_graph_finalization_optimization_mode": "3",
"soc_model": "60"
},
"weight_sharing": true
},
"cp": { "type": "ComposeOnnxModels" }
},
"target": "qnn_system",
"log_severity_level": 1,
"output_dir": "models/gemma-3-4b-it-text",
"cache_dir": "cache",
"no_artifacts": true
}
47 changes: 47 additions & 0 deletions examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json
@@ -0,0 +1,47 @@
{
"input_model": {
"type": "PyTorchModel",
"model_script": "custom_gemma3_4b_it_vision.py",
"model_loader": "load_gemma3_model",
"dummy_inputs_func": "get_dummy_inputs",
"io_config": {
"input_names": [ "input_ids", "pixel_values", "attention_mask" ],
"input_shapes": [ [ 1, 256 ], [ 1, 3, 896, 896 ], [ 1, 256 ] ],
"input_types": [ "int64", "float32", "int64" ],
"output_names": [ "last_hidden_state" ],
"output_shapes": [ [ 1, 256, 2560 ] ]
}
},
"systems": {
"qnn_system": {
"type": "PythonEnvironment",
"python_environment_path": "/local/mnt2/workspace/kromero/olive/olive-venv/bin",
"accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ]
}
},
"data_configs": [
{
"name": "gemma_vision_data_config",
"user_script": "user_script.py",
"load_dataset_config": { "type": "gemma_vision_dataset", "model_id": "google/gemma-3-4b-it" }
}
],
"passes": {
"conversion": { "type": "OnnxConversion", "target_opset": 17 },
"quantization": {
"type": "OnnxStaticQuantization",
"quant_preprocess": true,
"data_config": "gemma_vision_data_config",
"op_types_to_quantize": [ "MatMul", "LayerNormalization", "Gemm", "Sigmoid", "Gelu" ],
"activation_type": "uint16",
"precision": "uint8",
"calibrate_method": "MinMax"
},
"add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" }
},
"target": "qnn_system",
"log_severity_level": 1,
"output_dir": "models/gemma-3-4b-it-vision",
"cache_dir": "cache",
"no_artifacts": true
}
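
Both configs point at a `user_script.py` that registers the `gemma_text_dataset` and `gemma_vision_dataset` loaders, but that file is not included among the files shown here. For orientation, a registration with Olive's data registry could take roughly the following shape (the function body and parameters are placeholders, not the PR's actual implementation):

```python
# Hypothetical sketch of user_script.py; the real file is not shown in this diff.
from olive.data.registry import Registry


@Registry.register_dataset()
def gemma_text_dataset(model_id, **kwargs):
    # Placeholder: the real loader builds a calibration dataset
    # (per the commit history, a llava-based dataset) for quantization.
    raise NotImplementedError("see the actual user_script.py in the PR")
```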