refactor: extract _process_tensors_microscale to reduce duplication

dzhengAP · dzhengAP · commit 7230bb184132 · 2026-03-20T14:35:14.000-07:00
Signed-off-by: David Zheng <dqzheng1996@gmail.com>
diff --git a/src/llmcompressor/entrypoints/model_free/process.py b/src/llmcompressor/entrypoints/model_free/process.py
@@ -122,7 +122,7 @@ def process_file_microscale_scheme(
 
     :param file_path: safetensors file to process
     :param save_path: save path of file with quantized weights
-    :param scheme: quantization scheme to apply to tensors
+    :param scheme: microscale quantization scheme (NVFP4, MXFP4)
     :param ignore: modules to ignore. Modules ending with "norm" are automatically
         ignored
     :param device: device used to quantize and compress weights
@@ -135,64 +135,10 @@ def process_file_microscale_scheme(
     if converter is not None:
         converter.process(tensors)
 
-    fused_sets, unmatched_sets = get_fused_names(tensors)
+    fused_sets, unmatched_sets = get_fused_names(list(tensors.keys()))
     assert len(unmatched_sets) <= 0  # should be caught by validate_safetensors_index
 
-    fused_name_to_fused_index: dict[str, int]  # fused_name -> fused_index
-    fused_modules: dict[int, dict[str, Module]]  # fused_index -> named_modules
-
-    fused_name_to_fused_index = {
-        name: index
-        for index, matched_set in enumerate(fused_sets)
-        for name in matched_set.values()
-    }
-    fused_modules = defaultdict(dict)
-
-    for module_name, name in match_quantizable_tensors(tensors, ignore, scheme.targets):
-        validate_weight_for_quantization(tensors[name], scheme, name)
-
-        # 1. initialize module with qparams (on device)
-        module = initialize_quantized_linear(tensors[name], scheme, device)
-
-        # 2. calibrate weight qparams. Delay scale/zp calibration for fused modules
-        calibrate_global_scale(module)
-        if name in fused_name_to_fused_index:
-            fused_index = fused_name_to_fused_index[name]
-            fused_modules[fused_index][name] = module
-            continue
-
-        calibrate_scale_zp(module)
-
-        # 3. compress module using qparams
-        compress_module(module)
-
-        # 4. save compressed data (on cpu)
-        del tensors[name]
-        prefix = module_name + "."
-        for key, value in module.state_dict(prefix=prefix).items():
-            tensors[key] = value.to("cpu")
-
-    # compress and save microscale fused modules
-    for named_modules in fused_modules.values():
-        # 2.1. fuse global scales
-        global_scales = [m.weight_global_scale for m in named_modules.values()]
-        fused_global_scale = torch.min(torch.cat(global_scales, dim=0))
-
-        for name, module in named_modules.items():
-            module_name, _ = name.rsplit(".", 1)
-            module.weight_global_scale.data.copy_(fused_global_scale)
-
-            # 2.2. finish calibration with fused global scales
-            calibrate_scale_zp(module)
-
-            # 3. compress module using microscale qparams
-            compress_module(module)
-
-            # 4. save compressed data (on cpu)
-            del tensors[name]
-            prefix = module_name + "."
-            for key, value in module.state_dict(prefix=prefix).items():
-                tensors[key] = value.to("cpu")
+    tensors, _ = _process_tensors_microscale(tensors, scheme, ignore, device)
 
     save_file(tensors, save_path)
     total_size = sum(tensor.nbytes for tensor in tensors.values())
@@ -254,6 +200,54 @@ def process_file_group_microscale_scheme(
         "This is a bug in group_files_by_fused_weights."
     )
 
+    tensors, tensor_to_shard = _process_tensors_microscale(
+        tensors, scheme, ignore, device, tensor_to_shard
+    )
+
+    # Re-shard: write each tensor back to its original output file
+    output_shards: dict[str, dict[str, torch.Tensor]] = defaultdict(dict)
+    for name, tensor in tensors.items():
+        output_shards[tensor_to_shard[name]][name] = tensor
+
+    total_size = 0
+    weight_map: dict[str, str] = {}
+    for save_path in save_paths:
+        shard_name = os.path.basename(save_path)
+        shard_tensors = output_shards.get(shard_name, {})
+        os.makedirs(os.path.dirname(os.path.abspath(save_path)), exist_ok=True)
+        save_file(shard_tensors, save_path)
+        total_size += sum(t.nbytes for t in shard_tensors.values())
+        weight_map.update({k: shard_name for k in shard_tensors})
+
+    return total_size, weight_map
+
+
+def _process_tensors_microscale(
+    tensors: dict[str, torch.Tensor],
+    scheme: QuantizationScheme,
+    ignore: Iterable[str],
+    device: str | torch.device,
+    tensor_to_shard: dict[str, str] | None = None,
+) -> tuple[dict[str, torch.Tensor], dict[str, str] | None]:
+    """
+    Core microscale quantization logic shared by process_file_microscale_scheme
+    and process_file_group_microscale_scheme.
+
+    Processes all quantizable tensors in the given dict in-place, handling
+    global scale fusion for fused weight sets (q/k/v, gate/up). When
+    tensor_to_shard is provided, shard assignments are updated to follow
+    compressed tensor keys.
+
+    :param tensors: dict of tensor name -> tensor, modified in-place
+    :param scheme: microscale quantization scheme (NVFP4, MXFP4)
+    :param ignore: modules to ignore
+    :param device: device used to quantize and compress weights
+    :param tensor_to_shard: optional mapping of tensor name -> shard filename,
+        updated in-place when compressed tensors produce new keys
+    :return: updated (tensors, tensor_to_shard); the latter is None if not provided
+    """
+    fused_sets, _ = get_fused_names(list(tensors.keys()))
+
     fused_name_to_fused_index: dict[str, int] = {
         name: index
         for index, matched_set in enumerate(fused_sets)
@@ -280,13 +274,14 @@ def process_file_group_microscale_scheme(
         # 3. compress module using qparams
         compress_module(module)
 
-        # 4. save compressed data back to cpu, preserving shard assignment
-        original_shard = tensor_to_shard[name]
+        # 4. save compressed data back to cpu
+        original_shard = tensor_to_shard[name] if tensor_to_shard else None
         del tensors[name]
         prefix = module_name + "."
         for key, value in module.state_dict(prefix=prefix).items():
             tensors[key] = value.to("cpu")
-            tensor_to_shard[key] = original_shard
+            if tensor_to_shard is not None:
+                tensor_to_shard[key] = original_shard
 
     # compress and save microscale fused modules (with fused global scales)
     for named_modules in fused_modules.values():
@@ -304,27 +299,13 @@ def process_file_group_microscale_scheme(
             # 3. compress module using microscale qparams
             compress_module(module)
 
-            # 4. save compressed data back to cpu, preserving shard assignment
-            original_shard = tensor_to_shard[name]
+            # 4. save compressed data back to cpu
+            original_shard = tensor_to_shard[name] if tensor_to_shard else None
             del tensors[name]
             prefix = module_name + "."
             for key, value in module.state_dict(prefix=prefix).items():
                 tensors[key] = value.to("cpu")
-                tensor_to_shard[key] = original_shard
-
-    # Re-shard: write each tensor back to its original output file
-    output_shards: dict[str, dict[str, torch.Tensor]] = defaultdict(dict)
-    for name, tensor in tensors.items():
-        output_shards[tensor_to_shard[name]][name] = tensor
-
-    total_size = 0
-    weight_map: dict[str, str] = {}
-    for save_path in save_paths:
-        shard_name = os.path.basename(save_path)
-        shard_tensors = output_shards.get(shard_name, {})
-        os.makedirs(os.path.dirname(os.path.abspath(save_path)), exist_ok=True)
-        save_file(shard_tensors, save_path)
-        total_size += sum(t.nbytes for t in shard_tensors.values())
-        weight_map.update({k: shard_name for k in shard_tensors})
+                if tensor_to_shard is not None:
+                    tensor_to_shard[key] = original_shard
 
-    return total_size, weight_map
+    return tensors, tensor_to_shard