Skip to content

Commit 11adccb

Browse files
committed
perf-repro
1 parent dc2e02a commit 11adccb

File tree

1 file changed

+13 -1 lines changed

1 file changed

+13 -1 lines changed

extension/llm/export/partitioner_lib.py

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -128,11 +128,19 @@ def _validate_ios_version() -> None:
128128
"block_size": 32,
129129
"weight_threshold": 512,
130130
}
131+
132+
assert ios == 18
133+
print("OVERRIDING CONFIG TO BE 4B PER_CHANNEL")
134+
op_linear_quantizer_config = {
135+
"mode": "linear_symmetric",
136+
"dtype": "int4",
137+
"granularity": "per_channel",
138+
}
131139
compile_specs = CoreMLBackend.generate_compile_specs( # pyre-fixme[16]
132140
minimum_deployment_target=minimum_deployment_target,
133141
compute_precision=ct.precision(ct.precision.FLOAT16.value),
134142
# using `ComputeUnit.ALL` can increase the model load time, default to `ComputeUnit.CPU_AND_GPU`
135-
compute_unit=ct.ComputeUnit[ct.ComputeUnit.CPU_AND_GPU.name.upper()],
143+
compute_unit=ct.ComputeUnit[ct.ComputeUnit.CPU_AND_NE.name.upper()],
136144
model_type=CoreMLBackend.MODEL_TYPE.MODEL, # pyre-fixme[16]
137145
op_linear_quantizer_config=op_linear_quantizer_config,
138146
)
@@ -142,6 +150,10 @@ def _validate_ios_version() -> None:
142150
return CoreMLPartitioner( # pyre-fixme[16]
143151
compile_specs=compile_specs,
144152
take_over_mutable_buffer=take_over_mutable_buffer,
153+
skip_ops_for_coreml_delegation=[
154+
"quantized_decomposed.embedding_4bit.dtype",
155+
"aten.embedding.default",
156+
],
145157
)
146158

147159

0 commit comments

Comments
 (0)