Commit b6da372

[ET-VK][Llama] Apply XNNPACK partitioner as well when lowering to Vulkan
Differential Revision: D65899827
Pull Request resolved: #6830
1 parent: 59da214

2 files changed: +15 −1 lines

examples/models/llama/export_llama_lib.py

Lines changed: 4 additions & 0 deletions
@@ -682,6 +682,10 @@ def _export_llama(args) -> LLMEdgeManager:  # noqa: C901
                 args.enable_dynamic_shape,
             )
         )
+        # Apply XNNPACK after Vulkan so that undelegated ops can be accelerated by XNNPACK
+        partitioners.append(
+            get_xnnpack_partitioner(dynamic_quant_only_partitioner=False)
+        )
         modelname = f"vulkan_{modelname}"

     if args.mps:
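
Note: the partitioners list is consumed later in the export flow, so list order matters here; the Vulkan partitioner claims ops first and XNNPACK only receives what is left undelegated. A minimal sketch of that ordering, assuming an exir-style edge manager whose to_backend() accepts one partitioner at a time (everything besides get_vulkan_partitioner and get_xnnpack_partitioner is illustrative):

    partitioners = []
    if args.vulkan:
        # Vulkan is appended first, so it gets first pick of the graph.
        partitioners.append(
            get_vulkan_partitioner(args.dtype_override, args.enable_dynamic_shape)
        )
        # XNNPACK runs second and only sees ops Vulkan left undelegated.
        partitioners.append(
            get_xnnpack_partitioner(dynamic_quant_only_partitioner=False)
        )

    for partitioner in partitioners:
        edge_manager = edge_manager.to_backend(partitioner)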

examples/models/llama/source_transformation/quantize.py

Lines changed: 11 additions & 1 deletion
@@ -157,7 +157,17 @@ def quantize(  # noqa C901
         model = gptq_quantizer.quantize(model, inputs)
         return model
     elif qmode == "vulkan_4w":
-        model = VkInt4WeightOnlyQuantizer().quantize(model)
+        q_group_size = 256 if group_size is None else group_size
+        model = VkInt4WeightOnlyQuantizer(groupsize=q_group_size).quantize(model)
+
+        # Apply additional quantizer for linear layers that aren't lowered to Vulkan
+        # at the moment
+        from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer
+
+        model = Int8DynActInt4WeightQuantizer(
+            precision=torch_dtype, groupsize=q_group_size
+        ).quantize(model)
+
         return model
     else:
         raise Exception(f"Unrecognized quantize mode: {qmode}")
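
With this change, qmode "vulkan_4w" performs two passes: 4-bit weight-only quantization via VkInt4WeightOnlyQuantizer for the layers Vulkan will execute, then torchao's Int8DynActInt4WeightQuantizer as a catch-all for linear layers not yet lowered to Vulkan. A hedged usage sketch; quantize()'s full signature is not shown in this diff, so the keyword arguments below (qmode, group_size, torch_dtype) are inferred from the body:

    import torch

    # group_size=None falls back to the default of 256 set in the diff.
    model = quantize(
        model,
        qmode="vulkan_4w",
        group_size=None,
        torch_dtype=torch.float32,
    )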
