@@ -115,7 +115,7 @@ def load_pretrained_quantization_parameters(
 
 def apply_quantization_config(
     model: Module, config: Union[QuantizationConfig, None], run_compressed: bool = False
-) -> Dict[str, QuantizationScheme]:
+):
     """
     Initializes the model for quantization in-place based on the given config.
     Optionally converts quantizable modules to compressed_linear modules
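
Reviewer note on the signature change above: dropping the `Dict[str, QuantizationScheme]` return annotation makes the in-place contract explicit. A minimal caller sketch; the import path and the wrapper function are assumptions for illustration, not part of this diff:

```python
from torch.nn import Module

from compressed_tensors.quantization import apply_quantization_config


def quantize_in_place(model: Module, config) -> None:
    # before this PR: names_to_scheme = apply_quantization_config(model, config)
    # after this PR: nothing useful is returned; each matched submodule
    # carries its scheme as `submodule.quantization_scheme` instead
    apply_quantization_config(model, config, run_compressed=False)
```
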
@@ -125,26 +125,22 @@ def apply_quantization_config(
     :param run_compressed: Whether the model will be run in compressed mode or
         decompressed fully on load
     """
-    # Workaround for when HF Quantizer passes None, see PR #180
-    if config is None:
-        return dict()
+    from compressed_tensors.linear.compressed_linear import CompressedLinear
 
-    # remove reference to the original `config`
-    # argument. This function can mutate it, and we'd
-    # like to keep the original `config` as it is.
     config = deepcopy(config)
+    if config is None:  # see PR #180
+        return dict()
+
+    # preprocess to support kv cache scheme
+    config = process_quantization_config(config)
+
     # build mapping of targets to schemes for easier matching
     # use ordered dict to preserve target ordering in config
     target_to_scheme = OrderedDict()
-    config = process_quantization_config(config)
-    names_to_scheme = dict()
     for scheme in config.config_groups.values():
         for target in scheme.targets:
             target_to_scheme[target] = scheme
 
-    if run_compressed:
-        from compressed_tensors.linear.compressed_linear import CompressedLinear
-
     # mark appropriate layers for quantization by setting their quantization schemes
     for name, submodule in match_named_modules(
         model, target_to_scheme, config.ignore, warn_on_fail=True
@@ -153,7 +149,12 @@ def apply_quantization_config(
         # quant scheme to the matching layers
         matched_targets = match_targets(name, submodule, target_to_scheme)
         scheme = _scheme_from_targets(target_to_scheme, matched_targets, name)
-        if run_compressed:
+        # target matched - add layer and scheme to target list
+        submodule.quantization_scheme = scheme
+
+        # replace with run compressed if applicable
+        # FUTURE: move this to model compressor
+        if isinstance(submodule, torch.nn.Linear) and run_compressed:
             format = config.format
             if format != CompressionFormat.dense.value:
                 if isinstance(submodule, torch.nn.Linear):
@@ -165,14 +166,8 @@ def apply_quantization_config(
                     )
                     replace_module(model, name, compressed_linear)
 
-        # target matched - add layer and scheme to target list
-        submodule.quantization_scheme = scheme
-
-        names_to_scheme[name] = submodule.quantization_scheme
-
     # apply current quantization status across all targeted layers
     apply_quantization_status(model, config.quantization_status)
-    return names_to_scheme
 
 
 def process_quantization_config(config: QuantizationConfig) -> QuantizationConfig:
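
With `names_to_scheme` gone, a downstream consumer that still wants the mapping can rebuild it from the attributes this function now sets on matched modules. A minimal sketch assuming only what the diff shows (schemes stored as a `quantization_scheme` attribute); the helper name is hypothetical:

```python
from typing import Dict

import torch


def collect_names_to_scheme(model: torch.nn.Module) -> Dict[str, object]:
    """Rebuild the name -> quantization_scheme mapping from module attributes."""
    return {
        name: submodule.quantization_scheme
        for name, submodule in model.named_modules()
        if hasattr(submodule, "quantization_scheme")
    }
```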