@@ -128,11 +128,19 @@ def _validate_ios_version() -> None:
128128 "block_size" : 32 ,
129129 "weight_threshold" : 512 ,
130130 }
131+
132+ assert ios == 18
133+ print ("OVERRIDING CONFIG TO BE 4B PER_CHANNEL" )
134+ op_linear_quantizer_config = {
135+ "mode" : "linear_symmetric" ,
136+ "dtype" : "int4" ,
137+ "granularity" : "per_channel" ,
138+ }
131139 compile_specs = CoreMLBackend .generate_compile_specs ( # pyre-fixme[16]
132140 minimum_deployment_target = minimum_deployment_target ,
133141 compute_precision = ct .precision (ct .precision .FLOAT16 .value ),
134142 # using `ComputeUnit.ALL` can increase the model load time, default to `ComputeUnit.CPU_AND_GPU`
135- compute_unit = ct .ComputeUnit [ct .ComputeUnit .CPU_AND_GPU .name .upper ()],
143+ compute_unit = ct .ComputeUnit [ct .ComputeUnit .CPU_AND_NE .name .upper ()],
136144 model_type = CoreMLBackend .MODEL_TYPE .MODEL , # pyre-fixme[16]
137145 op_linear_quantizer_config = op_linear_quantizer_config ,
138146 )
@@ -142,6 +150,10 @@ def _validate_ios_version() -> None:
142150 return CoreMLPartitioner ( # pyre-fixme[16]
143151 compile_specs = compile_specs ,
144152 take_over_mutable_buffer = take_over_mutable_buffer ,
153+ skip_ops_for_coreml_delegation = [
154+ "quantized_decomposed.embedding_4bit.dtype" ,
155+ "aten.embedding.default" ,
156+ ],
145157 )
146158
147159
0 commit comments