Skip to content

Commit 11240f1

Browse files
committed
refactor: extract _process_tensors_microscale to reduce duplication
Shared microscale processing logic now lives in _process_tensors_microscale, called by both process_file_microscale_scheme and process_file_group_microscale_scheme. Signed-off-by: David Zheng <dqzheng1996@gmail.com>
1 parent 651a6d3 commit 11240f1

File tree

3 files changed

+74
-92
lines changed

3 files changed

+74
-92
lines changed

src/llmcompressor/entrypoints/model_free/process.py

Lines changed: 62 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ def process_file_microscale_scheme(
122122
123123
:param file_path: safetensors file to process
124124
:param save_path: save path of file with quantized weights
125-
:param scheme: quantization scheme to apply to tensors
125+
:param scheme: microscale quantization scheme (NVFP4, MXFP4)
126126
:param ignore: modules to ignore. Modules ending with "norm" are automatically
127127
ignored
128128
:param device: device used to quantize and compress weights
@@ -138,61 +138,7 @@ def process_file_microscale_scheme(
138138
fused_sets, unmatched_sets = get_fused_names(tensors)
139139
assert len(unmatched_sets) <= 0 # should be caught by validate_safetensors_index
140140

141-
fused_name_to_fused_index: dict[str, int] # fused_name -> fused_index
142-
fused_modules: dict[int, dict[str, Module]] # fused_index -> named_modules
143-
144-
fused_name_to_fused_index = {
145-
name: index
146-
for index, matched_set in enumerate(fused_sets)
147-
for name in matched_set.values()
148-
}
149-
fused_modules = defaultdict(dict)
150-
151-
for module_name, name in match_quantizable_tensors(tensors, ignore, scheme.targets):
152-
validate_weight_for_quantization(tensors[name], scheme, name)
153-
154-
# 1. initialize module with qparams (on device)
155-
module = initialize_quantized_linear(tensors[name], scheme, device)
156-
157-
# 2. calibrate weight qparams. Delay scale/zp calibration for fused modules
158-
calibrate_global_scale(module)
159-
if name in fused_name_to_fused_index:
160-
fused_index = fused_name_to_fused_index[name]
161-
fused_modules[fused_index][name] = module
162-
continue
163-
164-
calibrate_scale_zp(module)
165-
166-
# 3. compress module using qparams
167-
compress_module(module)
168-
169-
# 4. save compressed data (on cpu)
170-
del tensors[name]
171-
prefix = module_name + "."
172-
for key, value in module.state_dict(prefix=prefix).items():
173-
tensors[key] = value.to("cpu")
174-
175-
# compress and save microscale fused modules
176-
for named_modules in fused_modules.values():
177-
# 2.1. fuse global scales
178-
global_scales = [m.weight_global_scale for m in named_modules.values()]
179-
fused_global_scale = torch.min(torch.cat(global_scales, dim=0))
180-
181-
for name, module in named_modules.items():
182-
module_name, _ = name.rsplit(".", 1)
183-
module.weight_global_scale.data.copy_(fused_global_scale)
184-
185-
# 2.2. finish calibration with fused global scales
186-
calibrate_scale_zp(module)
187-
188-
# 3. compress module using microscale qparams
189-
compress_module(module)
190-
191-
# 4. save compressed data (on cpu)
192-
del tensors[name]
193-
prefix = module_name + "."
194-
for key, value in module.state_dict(prefix=prefix).items():
195-
tensors[key] = value.to("cpu")
141+
tensors, _ = _process_tensors_microscale(tensors, scheme, ignore, device)
196142

197143
save_file(tensors, save_path)
198144
total_size = sum(tensor.nbytes for tensor in tensors.values())
@@ -231,9 +177,9 @@ def process_file_group_microscale_scheme(
231177
"Use `process_file` or `process_file_microscale_scheme` for "
232178
"non-microscale schemes"
233179
)
234-
assert len(file_paths) == len(
235-
save_paths
236-
), "file_paths and save_paths must have the same length"
180+
assert len(file_paths) == len(save_paths), (
181+
"file_paths and save_paths must have the same length"
182+
)
237183

238184
# Load all tensors from the group, tracking which output shard each belongs to
239185
tensor_to_shard: dict[str, str] = {}
@@ -254,6 +200,54 @@ def process_file_group_microscale_scheme(
254200
"This is a bug in group_files_by_fused_weights."
255201
)
256202

203+
tensors, tensor_to_shard = _process_tensors_microscale(
204+
tensors, scheme, ignore, device, tensor_to_shard
205+
)
206+
207+
# Re-shard: write each tensor back to its original output file
208+
output_shards: dict[str, dict[str, torch.Tensor]] = defaultdict(dict)
209+
for name, tensor in tensors.items():
210+
output_shards[tensor_to_shard[name]][name] = tensor
211+
212+
total_size = 0
213+
weight_map: dict[str, str] = {}
214+
for save_path in save_paths:
215+
shard_name = os.path.basename(save_path)
216+
shard_tensors = output_shards.get(shard_name, {})
217+
os.makedirs(os.path.dirname(os.path.abspath(save_path)), exist_ok=True)
218+
save_file(shard_tensors, save_path)
219+
total_size += sum(t.nbytes for t in shard_tensors.values())
220+
weight_map.update({k: shard_name for k in shard_tensors})
221+
222+
return total_size, weight_map
223+
224+
225+
def _process_tensors_microscale(
226+
tensors: dict[str, torch.Tensor],
227+
scheme: QuantizationScheme,
228+
ignore: Iterable[str],
229+
device: str | torch.device,
230+
tensor_to_shard: dict[str, str] | None = None,
231+
) -> tuple[dict[str, torch.Tensor], dict[str, str] | None]:
232+
"""
233+
Core microscale quantization logic shared by process_file_microscale_scheme
234+
and process_file_group_microscale_scheme.
235+
236+
Processes all quantizable tensors in the given dict in-place, handling
237+
global scale fusion for fused weight sets (q/k/v, gate/up). When
238+
tensor_to_shard is provided, shard assignments are updated to follow
239+
compressed tensor keys.
240+
241+
:param tensors: dict of tensor name -> tensor, modified in-place
242+
:param scheme: microscale quantization scheme (NVFP4, MXFP4)
243+
:param ignore: modules to ignore
244+
:param device: device used to quantize and compress weights
245+
:param tensor_to_shard: optional mapping of tensor name -> shard filename,
246+
updated in-place when compressed tensors produce new keys
247+
:return: (tensors, tensor_to_shard) tuple with updated contents
248+
"""
249+
fused_sets, _ = get_fused_names(list(tensors.keys()))
250+
257251
fused_name_to_fused_index: dict[str, int] = {
258252
name: index
259253
for index, matched_set in enumerate(fused_sets)
@@ -280,13 +274,14 @@ def process_file_group_microscale_scheme(
280274
# 3. compress module using qparams
281275
compress_module(module)
282276

283-
# 4. save compressed data back to cpu, preserving shard assignment
284-
original_shard = tensor_to_shard[name]
277+
# 4. save compressed data back to cpu
278+
original_shard = tensor_to_shard[name] if tensor_to_shard else None
285279
del tensors[name]
286280
prefix = module_name + "."
287281
for key, value in module.state_dict(prefix=prefix).items():
288282
tensors[key] = value.to("cpu")
289-
tensor_to_shard[key] = original_shard
283+
if tensor_to_shard is not None:
284+
tensor_to_shard[key] = original_shard
290285

291286
# compress and save microscale fused modules (with fused global scales)
292287
for named_modules in fused_modules.values():
@@ -304,27 +299,13 @@ def process_file_group_microscale_scheme(
304299
# 3. compress module using microscale qparams
305300
compress_module(module)
306301

307-
# 4. save compressed data back to cpu, preserving shard assignment
308-
original_shard = tensor_to_shard[name]
302+
# 4. save compressed data back to cpu
303+
original_shard = tensor_to_shard[name] if tensor_to_shard else None
309304
del tensors[name]
310305
prefix = module_name + "."
311306
for key, value in module.state_dict(prefix=prefix).items():
312307
tensors[key] = value.to("cpu")
313-
tensor_to_shard[key] = original_shard
314-
315-
# Re-shard: write each tensor back to its original output file
316-
output_shards: dict[str, dict[str, torch.Tensor]] = defaultdict(dict)
317-
for name, tensor in tensors.items():
318-
output_shards[tensor_to_shard[name]][name] = tensor
319-
320-
total_size = 0
321-
weight_map: dict[str, str] = {}
322-
for save_path in save_paths:
323-
shard_name = os.path.basename(save_path)
324-
shard_tensors = output_shards.get(shard_name, {})
325-
os.makedirs(os.path.dirname(os.path.abspath(save_path)), exist_ok=True)
326-
save_file(shard_tensors, save_path)
327-
total_size += sum(t.nbytes for t in shard_tensors.values())
328-
weight_map.update({k: shard_name for k in shard_tensors})
308+
if tensor_to_shard is not None:
309+
tensor_to_shard[key] = original_shard
329310

330-
return total_size, weight_map
311+
return tensors, tensor_to_shard

tests/llmcompressor/entrypoints/model_free/test_reindexing_elimination.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -122,10 +122,16 @@ def qkv_tensors(self):
122122
}
123123

124124
def _save_split_shards(self, tmp_path, tensors):
125-
shard1 = {"model.layers.0.self_attn.q_proj.weight":
126-
tensors["model.layers.0.self_attn.q_proj.weight"]}
127-
shard2 = {k: v for k, v in tensors.items()
128-
if k != "model.layers.0.self_attn.q_proj.weight"}
125+
shard1 = {
126+
"model.layers.0.self_attn.q_proj.weight": tensors[
127+
"model.layers.0.self_attn.q_proj.weight"
128+
]
129+
}
130+
shard2 = {
131+
k: v
132+
for k, v in tensors.items()
133+
if k != "model.layers.0.self_attn.q_proj.weight"
134+
}
129135
shard1_path = tmp_path / "shard-00001.safetensors"
130136
shard2_path = tmp_path / "shard-00002.safetensors"
131137
save_file(shard1, shard1_path)
@@ -175,9 +181,7 @@ def test_group_processing_produces_same_keys_as_single_shard(
175181

176182
assert set(weight_map_group.keys()) == set(weight_map_merged.keys())
177183

178-
def test_group_processing_preserves_original_sharding(
179-
self, qkv_tensors, tmp_path
180-
):
184+
def test_group_processing_preserves_original_sharding(self, qkv_tensors, tmp_path):
181185
scheme = _make_nvfp4_scheme()
182186
split_dir = tmp_path / "split"
183187
split_dir.mkdir()
@@ -201,9 +205,7 @@ def test_group_processing_preserves_original_sharding(
201205
assert save_path.exists()
202206
assert save_path.stat().st_size > 0
203207

204-
def test_group_processing_total_size_matches_merged(
205-
self, qkv_tensors, tmp_path
206-
):
208+
def test_group_processing_total_size_matches_merged(self, qkv_tensors, tmp_path):
207209
scheme = _make_nvfp4_scheme()
208210
split_dir = tmp_path / "split"
209211
split_dir.mkdir()

tests/llmcompressor/modifiers/awq/test_base.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -709,4 +709,3 @@ def test_search_observer_invalid_rejected():
709709

710710
with pytest.raises(ValidationError, match="search_observer must be one of"):
711711
AWQModifier(scheme="W4A16_ASYM", search_observer="invalid_observer")
712-

0 commit comments

Comments (0)