2 files changed: +15 −1
```diff
@@ -682,6 +682,10 @@ def _export_llama(args) -> LLMEdgeManager:  # noqa: C901
                 args.enable_dynamic_shape,
             )
         )
+        # Apply XNNPACK after Vulkan so that undelegated ops can be accelerated by XNNPACK
+        partitioners.append(
+            get_xnnpack_partitioner(dynamic_quant_only_partitioner=False)
+        )
         modelname = f"vulkan_{modelname}"
 
     if args.mps:
```
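The ordering of the two `partitioners.append(...)` calls is the point of this change: partitioners are applied in list order, so XNNPACK only sees the ops that Vulkan declined to delegate. Below is a minimal, self-contained sketch of that first-match-wins behavior; `partition`, `supports`, and the toy node names are hypothetical stand-ins for illustration, not the ExecuTorch API.

```python
# Hypothetical sketch of ordered partitioning (not the ExecuTorch API):
# each partitioner claims only the nodes that earlier partitioners skipped.
from typing import Callable

def partition(
    nodes: list[str],
    partitioners: list[tuple[str, Callable[[str], bool]]],
) -> dict[str, str]:
    assignment: dict[str, str] = {}
    for backend, supports in partitioners:
        for node in nodes:
            if node not in assignment and supports(node):
                assignment[node] = backend  # first matching backend wins
    return assignment

graph = ["conv2d", "linear", "custom_op"]
print(partition(graph, [
    ("vulkan", lambda n: n == "conv2d"),               # Vulkan claims what it supports
    ("xnnpack", lambda n: n in ("conv2d", "linear")),  # XNNPACK picks up the rest
]))
# {'conv2d': 'vulkan', 'linear': 'xnnpack'} -- 'custom_op' stays on the portable CPU path
```

Reversing the order would let XNNPACK claim ops Vulkan could have taken, so appending the XNNPACK fallback last preserves Vulkan's priority while still accelerating the undelegated ops.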
```diff
@@ -157,7 +157,17 @@ def quantize(  # noqa C901
         model = gptq_quantizer.quantize(model, inputs)
         return model
     elif qmode == "vulkan_4w":
-        model = VkInt4WeightOnlyQuantizer().quantize(model)
+        q_group_size = 256 if group_size is None else group_size
+        model = VkInt4WeightOnlyQuantizer(groupsize=q_group_size).quantize(model)
+
+        # Apply additional quantizer for linear layers that aren't lowered to Vulkan
+        # at the moment
+        from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer
+
+        model = Int8DynActInt4WeightQuantizer(
+            precision=torch_dtype, groupsize=q_group_size
+        ).quantize(model)
+
         return model
     else:
         raise Exception(f"Unrecognized quantize mode: {qmode}")
```
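The second hunk defaults the Vulkan 4-bit group size to 256 and then runs torchao's `Int8DynActInt4WeightQuantizer` with the same group size, so linear layers that aren't lowered to Vulkan yet are still quantized rather than left in full precision. As background, here is a minimal PyTorch-only sketch of what a `groupsize` means in groupwise symmetric int4 weight quantization; it is an illustration under simplified assumptions, not the `VkInt4WeightOnlyQuantizer` implementation.

```python
# Minimal sketch of groupwise symmetric int4 weight quantization (illustrative
# only, NOT the VkInt4WeightOnlyQuantizer implementation).
import torch

def quantize_int4_groupwise(weight: torch.Tensor, group_size: int = 256):
    out_features, in_features = weight.shape
    # Each row is split into groups of `group_size` values sharing one scale,
    # so the group size must divide the inner dimension.
    assert in_features % group_size == 0, "group_size must divide in_features"
    w = weight.reshape(out_features, in_features // group_size, group_size)
    # Symmetric per-group scale: map the largest magnitude in each group to 7,
    # the top of the signed int4 range [-8, 7].
    scales = w.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 7.0
    q = torch.clamp(torch.round(w / scales), -8, 7).to(torch.int8)
    return q.reshape(out_features, in_features), scales.squeeze(-1)

w = torch.randn(32, 512)
q, scales = quantize_int4_groupwise(w, group_size=256)
print(q.shape, scales.shape)  # torch.Size([32, 512]) torch.Size([32, 2])
```

The constraint the sketch makes explicit is that the group size must divide the weight's inner dimension, since each group of `group_size` values along a row shares a single scale.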