Commit b6da372

[ET-VK][Llama] Apply XNNPACK partitioner as well when lowering to Vulkan
Differential Revision: D65899827
Pull Request resolved: #6830
1 parent: 59da214

2 files changed: +15 −1 lines

examples/models/llama/export_llama_lib.py

Lines changed: 4 additions & 0 deletions
@@ -682,6 +682,10 @@ def _export_llama(args) -> LLMEdgeManager:  # noqa: C901
                 args.enable_dynamic_shape,
             )
         )
+        # Apply XNNPACK after Vulkan so that undelegated ops can be accelerated by XNNPACK
+        partitioners.append(
+            get_xnnpack_partitioner(dynamic_quant_only_partitioner=False)
+        )
         modelname = f"vulkan_{modelname}"

     if args.mps:
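
Note: the partitioners list is consumed later in the export flow, so list order matters here; the Vulkan partitioner claims ops first and XNNPACK only receives what is left undelegated. A minimal sketch of that ordering, assuming an exir-style edge manager whose to_backend() accepts one partitioner at a time (everything besides get_vulkan_partitioner and get_xnnpack_partitioner is illustrative):

    partitioners = []
    if args.vulkan:
        # Vulkan is appended first, so it gets first pick of the graph.
        partitioners.append(
            get_vulkan_partitioner(args.dtype_override, args.enable_dynamic_shape)
        )
        # XNNPACK runs second and only sees ops Vulkan left undelegated.
        partitioners.append(
            get_xnnpack_partitioner(dynamic_quant_only_partitioner=False)
        )

    for partitioner in partitioners:
        edge_manager = edge_manager.to_backend(partitioner)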

examples/models/llama/source_transformation/quantize.py

Lines changed: 11 additions & 1 deletion
@@ -157,7 +157,17 @@ def quantize(  # noqa C901
         model = gptq_quantizer.quantize(model, inputs)
         return model
     elif qmode == "vulkan_4w":
-        model = VkInt4WeightOnlyQuantizer().quantize(model)
+        q_group_size = 256 if group_size is None else group_size
+        model = VkInt4WeightOnlyQuantizer(groupsize=q_group_size).quantize(model)
+
+        # Apply additional quantizer for linear layers that aren't lowered to Vulkan
+        # at the moment
+        from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer
+
+        model = Int8DynActInt4WeightQuantizer(
+            precision=torch_dtype, groupsize=q_group_size
+        ).quantize(model)
+
         return model
     else:
         raise Exception(f"Unrecognized quantize mode: {qmode}")
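
With this change, qmode "vulkan_4w" performs two passes: 4-bit weight-only quantization via VkInt4WeightOnlyQuantizer for the layers Vulkan will execute, then torchao's Int8DynActInt4WeightQuantizer as a catch-all for linear layers not yet lowered to Vulkan. A hedged usage sketch; quantize()'s full signature is not shown in this diff, so the keyword arguments below (qmode, group_size, torch_dtype) are inferred from the body:

    import torch

    # group_size=None falls back to the default of 256 set in the diff.
    model = quantize(
        model,
        qmode="vulkan_4w",
        group_size=None,
        torch_dtype=torch.float32,
    )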
