examples/models/llama/export_llama_lib.py (4 additions, 0 deletions)

@@ -682,6 +682,10 @@ def _export_llama(args) -> LLMEdgeManager:  # noqa: C901
                 args.enable_dynamic_shape,
             )
         )
+        # Apply XNNPACK after Vulkan so that undelegated ops can be accelerated by XNNPACK
+        partitioners.append(
+            get_xnnpack_partitioner(dynamic_quant_only_partitioner=False)
+        )
         modelname = f"vulkan_{modelname}"

     if args.mps:
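A note on why the append order matters here: partitioners are applied sequentially, and each one can only delegate nodes that earlier partitioners left unclaimed, so appending XNNPACK after Vulkan makes it a fallback for ops Vulkan does not support. The snippet below is a self-contained toy model of that first-match behavior, not the ExecuTorch partitioner API; the op sets and the partition helper are invented for illustration.

# Toy model of partitioner ordering (not the ExecuTorch API): each backend,
# in list order, claims the ops it supports from whatever is still unclaimed,
# so a backend appended later only sees what earlier backends left behind.

VULKAN_SUPPORTED = {"linear", "rms_norm", "silu"}   # illustrative op sets,
XNNPACK_SUPPORTED = {"linear", "embedding", "add"}  # not real coverage lists


def partition(ops, backends):
    """Assign each op to the first backend (in order) that supports it."""
    assignment = {}
    for op in ops:
        for name, supported in backends:
            if op in supported:
                assignment[op] = name
                break
        else:
            assignment[op] = "portable (undelegated)"
    return assignment


ops = ["embedding", "linear", "rms_norm", "silu", "add", "topk"]
print(partition(ops, [("vulkan", VULKAN_SUPPORTED), ("xnnpack", XNNPACK_SUPPORTED)]))
# -> embedding and add fall through to xnnpack; topk stays on the portable ops.

Flipping the list order would instead send every op both backends support (here, linear) to XNNPACK, which is why the diff appends XNNPACK strictly after Vulkan.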
examples/models/llama/source_transformation/quantize.py (10 additions, 1 deletion)

@@ -157,7 +157,16 @@ def quantize(  # noqa C901
         model = gptq_quantizer.quantize(model, inputs)
         return model
     elif qmode == "vulkan_4w":
-        model = VkInt4WeightOnlyQuantizer().quantize(model)
+        q_group_size = 256 if group_size is None else group_size
+        model = VkInt4WeightOnlyQuantizer(groupsize=q_group_size).quantize(model)
+
+        # Apply additional quantizer for linear layers that aren't lowered to Vulkan
+        # at the moment
+        from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer
+        model = Int8DynActInt4WeightQuantizer(
+            precision=torch_dtype, groupsize=q_group_size
+        ).quantize(model)
+
         return model
     else:
         raise Exception(f"Unrecognized quantize mode: {qmode}")
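For intuition about what groupsize controls in both quantizers: weights get one scale per contiguous group of groupsize elements along the input-channel dimension, so smaller groups mean more scale metadata but lower quantization error. Below is a minimal, self-contained sketch of symmetric group-wise 4-bit weight-only quantization in plain PyTorch; it illustrates the scheme only and is not the VkInt4WeightOnlyQuantizer or torchao implementation (quantize_4w_groupwise and dequantize_4w_groupwise are made-up helpers).

# Toy sketch of symmetric group-wise 4-bit weight-only quantization,
# showing what `groupsize` controls; not the repo's quantizer code.
import torch


def quantize_4w_groupwise(w: torch.Tensor, groupsize: int = 256):
    out_ch, in_ch = w.shape
    assert in_ch % groupsize == 0, "input channels must divide into groups"
    g = w.reshape(out_ch, in_ch // groupsize, groupsize)
    # One scale per group, chosen so the max magnitude maps to 7 (int4 range is [-8, 7]).
    scale = g.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 7.0
    q = torch.clamp(torch.round(g / scale), -8, 7).to(torch.int8)
    return q, scale


def dequantize_4w_groupwise(q: torch.Tensor, scale: torch.Tensor, shape):
    return (q.float() * scale).reshape(shape)


w = torch.randn(32, 512)                       # a small linear weight
q, s = quantize_4w_groupwise(w, groupsize=256)
w_hat = dequantize_4w_groupwise(q, s, w.shape)
print("max abs reconstruction error:", (w - w_hat).abs().max().item())

With groupsize=256, each row of the 512-wide weight carries two scales; halving the group size doubles the metadata but tightens each group's range. Note that the fallback Int8DynActInt4WeightQuantizer in the diff also quantizes activations dynamically to int8, which this weight-only sketch does not model.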