
Commit 5be6f79

reindex_fused_weights.py script

Signed-off-by: Kyle Sayers <[email protected]>
1 parent: 0677428

7 files changed: 317 additions, 106 deletions


src/llmcompressor/entrypoints/model_free/__init__.py (7 additions, 7 deletions)

@@ -12,11 +12,7 @@
 from loguru import logger
 from safetensors.torch import load_file, save_file

-from llmcompressor.entrypoints.model_free.helpers import (
-    gpu_if_available,
-    validate_safetensors_index,
-    validate_scheme,
-)
+from llmcompressor.entrypoints.model_free.helpers import gpu_if_available
 from llmcompressor.entrypoints.model_free.lifecycle import (
     calibrate_global_scale,
     calibrate_scale_zp,
@@ -35,6 +31,10 @@
     update_config,
     update_safetensors_index,
 )
+from llmcompressor.entrypoints.model_free.validate import (
+    validate_safetensors_index,
+    validate_scheme,
+)

 __all__ = ["model_free_ptq"]

@@ -71,15 +71,15 @@ def model_free_ptq(
         if not is_microscale_scheme(scheme)
         else _process_file_microscale_scheme
     )
-    for file_path, resolved_path in model_files:
+    for file_path, resolved_path in model_files.items():
         save_path = Path(save_directory) / file_path

         if file_path.endswith("safetensors"):
             jobs.append((job_fn, resolved_path, save_path, scheme, ignore, device))

         else:
             if is_weights_file(file_path):
-                logger.warning(f"Skipping weights file {file_path}")
+                logger.warning(f"Skip processing for weights file {file_path}")
             save_path.parent.mkdir(parents=True, exist_ok=True)
             logger.info(f"Copying {file_path} {save_path}")
             shutil.copyfile(resolved_path, save_path)
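
For orientation, a minimal sketch of the loop's new contract (file names below are hypothetical): model_files is now the dict[str, str] returned by get_checkpoint_files, mapping repo-relative paths to resolved local paths, hence the switch to .items():

# hypothetical checkpoint listing, keyed by repo-relative path
model_files = {
    "model-00001-of-00002.safetensors": "/cache/model-00001-of-00002.safetensors",
    "config.json": "/cache/config.json",
}

for file_path, resolved_path in model_files.items():
    if file_path.endswith("safetensors"):
        ...  # queued as a quantization job
    else:
        ...  # copied through unchanged (non-safetensors weights files also emit a warning)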
src/llmcompressor/entrypoints/model_free/helpers.py (50 additions, 76 deletions)

@@ -1,83 +1,18 @@
-import json
+import os
+from collections import defaultdict
+from typing import Mapping, TypeVar

 import torch
-from compressed_tensors.quantization import (
-    QuantizationScheme,
-    preset_name_to_scheme,
-)
-from compressed_tensors.utils import getattr_chain
 from loguru import logger
+from transformers.file_utils import CONFIG_NAME

-from .microscale import get_fused_names, is_microscale_scheme
-
-__all__ = ["validate_scheme", "gpu_if_available"]
-
-
-def validate_scheme(scheme: QuantizationScheme) -> tuple[str, QuantizationScheme]:
-    # treat strings as preset schemes
-    if isinstance(scheme, str):
-        scheme_name, scheme = scheme, preset_name_to_scheme(scheme, [])
-    else:
-        scheme_name = "config_group_0"
-
-    # weight quantization must be provided
-    if scheme.weights is None:
-        raise ValueError(
-            "Must provide a weights quanitization scheme to perform weights-only PTQ"
-        )
-
-    # activation quantization must be dynamic
-    input_dynamic = getattr_chain(scheme, "input_activations.dynamic", True)
-    output_dynamic = getattr_chain(scheme, "output_activations.dynamic", True)
-    if input_dynamic is not True or output_dynamic is not True:
-        raise ValueError(
-            "Model Free PTQ cannot calibrate activations. "
-            "Please use `oneshot` instead."
-        )
-
-    # override with static observers
-    # Remove after https://github.com/vllm-project/compressed-tensors/pull/489
-    if scheme.weights.observer in ("minmax", "mse"):
-        new_observer = f"static_{scheme.weights.observer}"
-        logger.warning(
-            f"Scheme uses {scheme.weights.observer} weight observer. "
-            f"Using {new_observer} instead"
-        )
-        scheme.weights.observer = new_observer
-
-    # target all modules; filter by ignore list
-    # technically this should be "re:.*", but vllm's
-    # ct moe layer has a hard coded check for "Linear"
-    scheme.targets = ["Linear"]
-    return scheme_name, scheme
-
-
-def validate_safetensors_index(
-    model_files: list[tuple[str, str]], scheme: QuantizationScheme
-):
-    resolved_paths = [
-        resolved_path
-        for file_path, resolved_path in model_files
-        if file_path.endswith("safetensors.index.json")
-    ]
-    if len(resolved_paths) <= 0:
-        return
-    resolved_path = resolved_paths[0]
-
-    if is_microscale_scheme(scheme):
-        with open(resolved_path, "r") as file:
-            weight_map: dict[str, str] = json.load(file)["weight_map"]
-
-        fused_names = get_fused_names(weight_map)
-        for submodule_names in fused_names.values():
-            file_names = [weight_map[name] for name in submodule_names]
-            if not all(file_name == file_names[0] for file_name in file_names):
-                raise NotImplementedError(
-                    "When using a microscale scheme (NVFP4, MXFP4), global scales "
-                    "will be fused. Current implmentation requires that all fused "
-                    "modules (attention and non-moe mlp) be stored in the same file. "
-                    f"Instead, got {submodule_names}\n\n {file_names}"
-                )
+__all__ = [
+    "gpu_if_available",
+    "find_safetensors_index_path",
+    "find_config_path",
+    "find_safetensors_index_file",
+    "invert_mapping",
+]


 def gpu_if_available(device: torch.device | str | None) -> torch.device:
@@ -93,3 +28,42 @@ def gpu_if_available(device: torch.device | str | None) -> torch.device:
     else:
         logger.warning("CUDA/XPU is not available! Compressing model on CPU instead")
         return torch.device("cpu")
+
+
+def find_safetensors_index_path(save_directory: str | os.PathLike) -> str | None:
+    for file_name in os.listdir(save_directory):
+        if file_name.endswith("safetensors.index.json"):
+            return os.path.join(save_directory, file_name)
+
+    return None
+
+
+def find_config_path(save_directory: str | os.PathLike) -> str | None:
+    for file_name in os.listdir(save_directory):
+        if file_name in (CONFIG_NAME, "params.json"):
+            return os.path.join(save_directory, file_name)
+
+    return None
+
+
+def find_safetensors_index_file(model_files: dict[str, str]) -> str | None:
+    for file_path, resolved_path in model_files.items():
+        if file_path.endswith("safetensors.index.json"):
+            return resolved_path
+
+    return None
+
+
+KeyType = TypeVar("K")
+ValueType = TypeVar("V")
+
+
+def invert_mapping(
+    mapping: Mapping[KeyType, ValueType],
+) -> dict[ValueType, list[KeyType]]:
+    inverse = defaultdict(list)
+
+    for key, value in mapping.items():
+        inverse[value].append(key)
+
+    return inverse
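
As a usage sketch (tensor and shard names are hypothetical), invert_mapping turns a safetensors weight_map, which maps tensor name to shard file, into a shard-to-tensors grouping; the reindexing script below relies on exactly this inversion:

from llmcompressor.entrypoints.model_free.helpers import invert_mapping

weight_map = {
    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
}

file_map = invert_mapping(weight_map)
# file_map["model-00001-of-00002.safetensors"]
#   == ["model.layers.0.self_attn.q_proj.weight",
#       "model.layers.0.self_attn.k_proj.weight"]
# file_map["model-00002-of-00002.safetensors"]
#   == ["model.layers.1.mlp.gate_proj.weight"]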

src/llmcompressor/entrypoints/model_free/microscale.py (39 additions, 1 deletion)

@@ -1,14 +1,52 @@
 import torch
 from compressed_tensors.quantization import QuantizationScheme, QuantizationStrategy
+from compressed_tensors.utils.match import _match_name

-__all__ = ["get_fused_names", "is_microscale_scheme"]
+__all__ = ["get_fused_names", "is_microscale_scheme", "match_names_set_eager"]
+
+
+MatchedNamesSet = dict[str, str | None]


 def is_microscale_scheme(scheme: QuantizationScheme) -> bool:
     assert scheme.weights is not None
     return scheme.weights.strategy == QuantizationStrategy.TENSOR_GROUP


+def match_names_set_eager(
+    tensor_names: set[str] | list[str],
+    targets: set[str] | list[str],
+    return_unmatched: bool = True,
+) -> list[MatchedNamesSet] | tuple[list[MatchedNamesSet], MatchedNamesSet]:
+    matched_sets = []
+    matches = dict.fromkeys(targets, None)
+
+    for name in tensor_names:
+        # match until we get a full set
+        for target in targets:
+            if _match_name(name, target):
+                if matches[target] is None:
+                    matches[target] = name
+                else:
+                    # matched target twice without completing a set
+                    raise ValueError(
+                        f"Matched a {target} twice before "
+                        f"completing set ({matches[target]}, {name})"
+                    )
+
+        # once we have a full set, yield and reset
+        if all((matches[target] is not None for target in targets)):
+            matched_sets.append(matches)
+            matches = dict.fromkeys(targets, None)
+
+    unmatched_set = matches if any((v is not None for v in matches.values())) else None
+
+    if return_unmatched:
+        return matched_sets, unmatched_set
+    else:
+        return matched_sets
+
+
 def get_fused_names(tensors: dict[str, torch.Tensor]) -> dict[str, list[str]]:
     fused_names = {}
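
A usage sketch of match_names_set_eager, with hypothetical tensor names (the re:-prefixed targets follow the same convention _match_name already uses): it eagerly fills one slot per target, emits each completed set, and hands back any partially filled set so the caller can carry it into the next shard:

from llmcompressor.entrypoints.model_free.microscale import match_names_set_eager

tensor_names = [
    "model.layers.0.self_attn.k_proj.weight",
    "model.layers.0.self_attn.q_proj.weight",
    "model.layers.0.self_attn.v_proj.weight",
    "model.layers.1.self_attn.q_proj.weight",  # k/v live in the next shard
]
targets = [
    r"re:.*attn\.q_proj\.weight$",
    r"re:.*attn\.k_proj\.weight$",
    r"re:.*attn\.v_proj\.weight$",
]

matched_sets, unmatched = match_names_set_eager(tensor_names, targets)
# matched_sets: one complete q/k/v set for layer 0
# unmatched: layer 1's q_proj paired with None for the k/v targets,
#            which the reindexing script carries over to the next shard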

src/llmcompressor/entrypoints/model_free/model_utils.py (2 additions, 2 deletions)

@@ -18,15 +18,15 @@ def is_weights_file(file_name: str) -> bool:
     return any(file_name.endswith(suffix) for suffix in weights_files)


-def get_checkpoint_files(model_stub: str | os.PathLike) -> list[tuple[str, str]]:
+def get_checkpoint_files(model_stub: str | os.PathLike) -> dict[str, str]:
     # In the future, this function can accept and pass download kwargs to cached_file

     if os.path.exists(model_stub):
         file_paths = walk_file_paths(model_stub, ignore=".cache")
     else:
         file_paths = list_repo_files(model_stub)

-    return [(file_path, cached_file(model_stub, file_path)) for file_path in file_paths]
+    return {file_path: cached_file(model_stub, file_path) for file_path in file_paths}


 def walk_file_paths(root_dir: str, ignore: str | None = None) -> list[str]:
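
A before/after sketch of the calling convention (the stub and index key are hypothetical); the dict form also allows direct lookups, which the reindexing script uses to resolve shard names taken from the safetensors index:

from llmcompressor.entrypoints.model_free.model_utils import get_checkpoint_files

model_stub = "org/model"  # hypothetical stub

# before: a list of (repo-relative path, resolved local path) tuples
# for file_path, resolved_path in get_checkpoint_files(model_stub): ...

# after: a dict keyed by repo-relative path
model_files = get_checkpoint_files(model_stub)
for file_path, resolved_path in model_files.items():
    ...
resolved = model_files["model-00001-of-00002.safetensors"]  # direct lookup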
reindex_fused_weights.py (new file, 139 additions)

import json
import os
import shutil
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import torch
import tqdm
from loguru import logger
from safetensors.torch import load_file, save_file

from llmcompressor.entrypoints.model_free.helpers import (
    find_safetensors_index_file,
    invert_mapping,
)
from llmcompressor.entrypoints.model_free.microscale import match_names_set_eager
from llmcompressor.entrypoints.model_free.model_utils import (
    get_checkpoint_files,
    is_weights_file,
)
from llmcompressor.entrypoints.model_free.save_utils import update_safetensors_index

# very naive script
# assumes weight locality, meaning that if a set of fused weights is not fully
# contained in one file:
# 1. the incomplete set is the last set of weights (sorted alphabetically)
# 2. the remainder of the incomplete set is in the next file (sorted alphabetically)

model_stub = ""
fused_mappings: list[list[str]] = []

DEFAULT_FUSED_MAPPINGS = [
    [
        r"re:.*(attn|attention)\.q_proj\.weight$",
        r"re:.*(attn|attention)\.k_proj\.weight$",
        r"re:.*(attn|attention)\.v_proj\.weight$",
    ],
    [
        r"re:.*(attn|attention)\.wq_a\.weight$",
        r"re:.*(attn|attention)\.wkv_a_with_mqa\.weight$",
    ],
    [r"re:.*mlp\.gate_proj\.weight$", r"re:.*mlp\.up_proj\.weight$"],
    [r"re:.*w1\.weight$", r"re:.*w3\.weight$"],
]


def main(
    model_stub: str,
    save_directory: str,
    fused_mappings: list[list[str]] = DEFAULT_FUSED_MAPPINGS,
):
    # read files
    model_files = get_checkpoint_files(model_stub)
    index_file = find_safetensors_index_file(model_files)
    if index_file is None:
        raise ValueError(
            "This script is used to modify safetensors file shards, "
            "but was unable to find a safetensors index file"
        )

    # copy non-weight files
    for file_path, resolved_path in model_files.items():
        save_path = Path(save_directory) / file_path

        if file_path.endswith("safetensors"):
            continue
        else:
            if is_weights_file(file_path):
                logger.warning(f"Skip processing for weights file {file_path}")
            save_path.parent.mkdir(parents=True, exist_ok=True)
            logger.info(f"Copying {file_path} {save_path}")
            shutil.copyfile(resolved_path, save_path)

    # read index file
    with open(index_file, "r") as file:
        index_file_data = json.load(file)

    weight_map: dict[str, str] = index_file_data["weight_map"]
    final_weight_map: dict[str, str] = {}

    # set up save executor and carry-over buffer
    executor = ThreadPoolExecutor(max_workers=10)
    carry_over_tensors: dict[str, torch.Tensor] = {}

    # iterate in alphabetical order on the assumption of weight-file locality
    file_map = invert_mapping(weight_map)
    file_names = sorted(file_map)
    progress = tqdm.tqdm(total=len(file_names))
    for file_name in file_names:
        file_path = model_files[file_name]
        save_path = os.path.join(save_directory, file_name)
        tensors = load_file(file_path)

        if len(carry_over_tensors) > 0:
            # add carry-over from the previous file
            tensors.update(carry_over_tensors)
            carry_over_tensors = {}

        tensor_names = sorted(tensors.keys())
        for mapping in fused_mappings:
            _matches, unmatched = match_names_set_eager(tensor_names, mapping)

            if unmatched is not None:
                # move the incomplete set to the carry-over buffer
                unmatched_tensors = {
                    key: tensors[key] for key in unmatched.values() if key is not None
                }
                carry_over_tensors.update(unmatched_tensors)

                # delete from the current file
                for key in unmatched_tensors:
                    tensor_names.remove(key)
                    del tensors[key]

        # save tensors after modification
        executor.submit(with_progress, save_file, tensors, save_path, progress=progress)
        final_weight_map.update({name: file_name for name in tensor_names})

    update_safetensors_index(
        save_directory, index_file_data["metadata"]["total_size"], final_weight_map
    )

    executor.shutdown()


def with_progress(fn: callable, *args, progress: tqdm.tqdm):
    ret = fn(*args)
    progress.update(1)
    return ret


if __name__ == "__main__":
    main(
        # "mistralai/mistral-large-3",
        "/raid/engine/hub_cache/mistral-fp8-block",
        "temp",
    )
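
To make the locality assumption concrete, a hypothetical two-shard layout before and after reindexing: layer 1's q_proj sits alone at the end of shard 1, so the script deletes it there, carries it into shard 2, and the rewritten index places the whole q/k/v set in one file:

# before reindexing (weight_map, abbreviated):
before = {
    "layers.0.attn.q_proj.weight": "model-00001.safetensors",
    "layers.0.attn.k_proj.weight": "model-00001.safetensors",
    "layers.0.attn.v_proj.weight": "model-00001.safetensors",
    "layers.1.attn.q_proj.weight": "model-00001.safetensors",  # incomplete set
    "layers.1.attn.k_proj.weight": "model-00002.safetensors",
    "layers.1.attn.v_proj.weight": "model-00002.safetensors",
}

# after reindexing, every fused set is complete within a single shard:
after = {
    "layers.0.attn.q_proj.weight": "model-00001.safetensors",
    "layers.0.attn.k_proj.weight": "model-00001.safetensors",
    "layers.0.attn.v_proj.weight": "model-00001.safetensors",
    "layers.1.attn.q_proj.weight": "model-00002.safetensors",  # carried over
    "layers.1.attn.k_proj.weight": "model-00002.safetensors",
    "layers.1.attn.v_proj.weight": "model-00002.safetensors",
}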
