microsoft
diff --git a/‎google-gemma-Gemma3-4B/qnn/gemma-3-4b.ipynb‎
Lines changed: 16 additions & 0 deletions b/‎google-gemma-Gemma3-4B/qnn/gemma-3-4b.ipynb‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎google-gemma-Gemma3-4B/qnn/gemma3-4b-embedding-qnn-config.json‎
Lines changed: 1 addition & 1 deletion b/‎google-gemma-Gemma3-4B/qnn/gemma3-4b-embedding-qnn-config.json‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎google-gemma-Gemma3-4B/qnn/gemma3-4b-text-qnn-config.json‎
Lines changed: 114 additions & 21 deletions b/‎google-gemma-Gemma3-4B/qnn/gemma3-4b-text-qnn-config.json‎
Lines changed: 114 additions & 21 deletions
diff --git a/‎google-gemma-Gemma3-4B/qnn/gemma3-4b-vision-qnn-config.json‎
Lines changed: 55 additions & 12 deletions b/‎google-gemma-Gemma3-4B/qnn/gemma3-4b-vision-qnn-config.json‎
Lines changed: 55 additions & 12 deletions
@@ -155,6 +155,15 @@
     "This is needed for running the Olive recipes for this model"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!patch ./olive_venv/lib/python3.10/site-packages/gptqmodel/utils/model.py < gptqmodel_int8.patch"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -177,6 +186,13 @@
     "    print(f\"Downloaded and replaced: {dest}\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
 
@@ -7,7 +7,7 @@
             "input_names": [ "input_ids", "image_features" ],
             "input_shapes": [ [ 1, 64 ], [ 1, 256, 2560 ] ],
             "input_types": [ "int64", "float32" ],
-            "output_names": [ "/model/embed_tokens/Mul/output_0" ],
+            "output_names": [ "/model/embed_tokens/Mul_output_cast_0" ],
             "output_shapes": [ [ 1, 64, 2560 ] ],
             "dynamic_axes": {
                 "input_ids": { "0": "batch_size", "1": "seq_length" },
 
@@ -9,62 +9,150 @@
         "qnn_system": {
             "type": "PythonEnvironment",
             "python_environment_path": "",
-            "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ]
+            "accelerators": [
+                {
+                    "execution_providers": [
+                        "QNNExecutionProvider"
+                    ]
+                }
+            ]
         }
     },
     "data_configs": [
         {
-            "name": "gemma_data_config",
-            "user_script": "custom_gemma3_4b_datasets.py",
-            "load_dataset_config": { "type": "gemma_dataset", "model_id": "google/gemma-3-4b-it" }
+            "name": "wikitext2_train_joined",
+            "type": "HuggingfaceContainer",
+            "load_dataset_config": {
+                "data_name": "wikitext",
+                "subset": "wikitext-2-raw-v1",
+                "split": "train"
+            },
+            "pre_process_data_config": {
+                "strategy": "join",
+                "add_special_tokens": false,
+                "max_seq_len": 4096,
+                "max_samples": 128
+            }
+        },
+        {
+            "name": "wikitext2_train_act",
+            "type": "HuggingfaceContainer",
+            "load_dataset_config": {
+                "data_name": "wikitext",
+                "subset": "wikitext-2-raw-v1",
+                "split": "train"
+            },
+            "pre_process_data_config": {
+                "strategy": "line-by-line",
+                "add_special_tokens": true,
+                "max_samples": 200,
+                "max_seq_len": 4096
+            }
         }
     ],
     "passes": {
+        "cs": {
+            "type": "CaptureSplitInfo",
+            "num_splits": 2,
+            "unique_embeds_lm_head_splits": true
+        },
         "g": {
             "type": "GptqModel",
             "bits": 4,
             "sym": true,
             "group_size": -1,
             "lm_head": false,
             "device": "cuda",
-            "data_config": "gemma_data_config"
+            "data_config": "wikitext2_train_joined",
+            "dynamic": {
+                "+:.*v_proj*": {
+                    "bits": 8,
+                    "sym": true,
+                    "group_size": -1,
+                    "desc_act": true
+                },
+                "+:.*k_proj*": {
+                    "bits": 8,
+                    "sym": true,
+                    "group_size": -1,
+                    "desc_act": true
+                },
+                "+:.*q_proj*": {
+                    "bits": 8,
+                    "sym": true,
+                    "group_size": -1,
+                    "desc_act": true
+                }
+            }
         },
-        "cs": { "type": "CaptureSplitInfo", "num_splits": 2, "unique_embeds_lm_head_splits": true },
         "mb": {
             "type": "ModelBuilder",
             "precision": "int4",
-            "int4_block_size": 32,
+            "int4_block_size": 16,
             "int4_accuracy_level": 4,
-            "int4_op_types_to_quantize": [ "MatMul", "Gather" ]
+            "int4_op_types_to_quantize": ["Gather", "MatMul"]
         },
         "mq": {
             "type": "MatMulNBitsToQDQ",
             "use_int4": true,
             "add_zero_point": true,
-            "nodes_to_exclude": [ "/lm_head/MatMul_Q4" ],
+            "nodes_to_exclude": [
+                "/lm_head/MatMul_Q4"
+            ],
             "save_as_external_data": true
         },
         "gs": {
             "type": "GraphSurgeries",
             "surgeries": [
-                { "surgeon": "RemoveRopeMultiCache" },
-                { "surgeon": "AttentionMaskToSequenceLengths" },
-                { "surgeon": "SimplifiedLayerNormToL2Norm" }
+                {
+                    "surgeon": "RemoveRopeMultiCache"
+                },
+                {
+                    "surgeon": "AttentionMaskToSequenceLengths"
+                },
+                {
+                    "surgeon": "SimplifiedLayerNormToL2Norm"
+                }
+            ],
+            "save_as_external_data": true
+        },
+        "f16": {
+            "type": "OnnxFloatToFloat16",
+            "op_include_list": [
+                "GroupQueryAttention"
+            ],
+            "keep_io_types": [
+                "logits"
             ],
             "save_as_external_data": true
         },
         "sq": {
             "type": "OnnxStaticQuantization",
-            "data_config": "gemma_data_config",
+            "data_config": "wikitext2_train_act",
             "activation_type": "uint16",
             "precision": "uint8",
-            "calibration_providers": [ "CUDAExecutionProvider" ],
+            "calibration_providers": [
+                "CUDAExecutionProvider"
+            ],
             "quant_preprocess": true,
-            "op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ],
-            "save_as_external_data": true
+            "op_types_to_exclude": [
+                "GatherBlockQuantized",
+                "GroupQueryAttention",
+                "MatMulNBits"
+            ],
+            "save_as_external_data": true,
+            "extra_option": {
+                "CalibStridedMinMax": 4
+            }
+        },
+        "sp": {
+            "type": "SplitModel"
+        },
+        "st": {
+            "type": "StaticLLM",
+            "batch_size": 1,
+            "context_length": 64
         },
-        "sp": { "type": "SplitModel" },
-        "st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 },
         "cb": {
             "type": "EPContextBinaryGenerator",
             "provider_options": {
@@ -74,14 +162,19 @@
                 "htp_arch": "v73",
                 "soc_model": "60"
             },
-            "session_options": { "intra_op_num_threads": 2, "inter_op_num_threads": 1 },
+            "session_options": {
+                "intra_op_num_threads": 2,
+                "inter_op_num_threads": 1
+            },
             "weight_sharing": true
         },
-        "cp": { "type": "ComposeOnnxModels" }
+        "cp": {
+            "type": "ComposeOnnxModels"
+        }
     },
     "target": "qnn_system",
     "log_severity_level": 0,
     "output_dir": "models/gemma3_qnn",
     "cache_dir": "cache",
     "no_artifacts": true
-}
+}
@@ -4,38 +4,78 @@
         "model_script": "custom_gemma3_4b_vision.py",
         "model_loader": "load_gemma3_vision_model",
         "io_config": {
-            "input_names": [ "pixel_values" ],
-            "input_shapes": [ [ 1, 3, 896, 896 ] ],
-            "input_types": [ "float32" ],
-            "output_names": [ "image_features" ],
-            "output_shapes": [ [ 1, 256, 2560 ] ]
+            "input_names": [
+                "pixel_values"
+            ],
+            "input_shapes": [
+                [
+                    1,
+                    3,
+                    896,
+                    896
+                ]
+            ],
+            "input_types": [
+                "float32"
+            ],
+            "output_names": [
+                "image_features"
+            ],
+            "output_shapes": [
+                [
+                    1,
+                    256,
+                    2560
+                ]
+            ]
         }
     },
     "systems": {
         "qnn_system": {
             "type": "PythonEnvironment",
             "python_environment_path": "",
-            "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ]
+            "accelerators": [
+                {
+                    "execution_providers": [
+                        "QNNExecutionProvider"
+                    ]
+                }
+            ]
         }
     },
     "data_configs": [
         {
             "name": "gemma_vision_data_config",
             "user_script": "custom_gemma3_4b_datasets.py",
-            "load_dataset_config": { "type": "gemma_image_dataset", "model_id": "google/gemma-3-4b-it" }
+            "load_dataset_config": {
+                "type": "gemma_image_dataset",
+                "model_id": "google/gemma-3-4b-it"
+            }
         }
     ],
     "passes": {
-        "conversion": { "type": "OnnxConversion", "target_opset": 20 },
-        "surgery": { "type": "GraphSurgeries", "surgeries": [ { "surgeon": "MatMulAddToGemm" } ] },
+        "conversion": {
+            "type": "OnnxConversion",
+            "target_opset": 20
+        },
+        "surgery": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {
+                    "surgeon": "MatMulAddToGemm"
+                }
+            ]
+        },
         "quantization": {
             "type": "OnnxStaticQuantization",
             "quant_preprocess": true,
             "data_config": "gemma_vision_data_config",
             "activation_type": "uint16",
             "precision": "uint8",
             "calibrate_method": "MinMax",
-            "calibration_providers": [ "CUDAExecutionProvider" ],
+            "calibration_providers": [
+                "CUDAExecutionProvider"
+            ],
             "per_channel": true,
             "weight_symmetric": true
         },
@@ -49,11 +89,14 @@
                 "soc_model": "60"
             }
         },
-        "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" }
+        "add_metadata": {
+            "type": "AddOliveMetadata",
+            "graph_name": "gemma-3-4b-it-vision"
+        }
     },
     "target": "qnn_system",
     "log_severity_level": 1,
     "output_dir": "models/gemma-3-4b-it-vision",
     "cache_dir": "cache-vision",
     "no_artifacts": true
-}
+}