Skip to content

Commit 97948b6

Browse files
committed
feat: eliminate reindexing step via fusion-aware file grouping
model_free_ptq now automatically groups shards that contain cross-file fused weights (q/k/v, gate/up) for joint microscale processing, removing the requirement to run reindex_fused_weights as a preprocessing step. Closes #2497 Signed-off-by: David Zheng <dqzheng1996@gmail.com>
1 parent 026c917 commit 97948b6

File tree

4 files changed

+379
-37
lines changed

4 files changed

+379
-37
lines changed

src/llmcompressor/entrypoints/model_free/__init__.py

Lines changed: 175 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
import os
23
import shutil
34
from pathlib import Path
@@ -14,12 +15,17 @@
1415
from compressed_tensors.quantization import QuantizationScheme
1516
from loguru import logger
1617

17-
from llmcompressor.entrypoints.model_free.helpers import gpu_if_available
18+
from llmcompressor.entrypoints.model_free.helpers import (
19+
find_safetensors_index_file,
20+
gpu_if_available,
21+
group_files_by_fused_weights,
22+
)
1823
from llmcompressor.entrypoints.model_free.microscale import (
1924
is_microscale_scheme,
2025
)
2126
from llmcompressor.entrypoints.model_free.process import (
2227
process_file,
28+
process_file_group_microscale_scheme,
2329
process_file_microscale_scheme,
2430
validate_file,
2531
)
@@ -45,9 +51,14 @@ def model_free_ptq(
4551
):
4652
"""
4753
Quantize a model without the need for a model definition. This function operates on
48-
a model stub or folder containing weights saved in safetensors files
54+
a model stub or folder containing weights saved in safetensors files.
55+
56+
For microscale schemes (NVFP4, MXFP4), fused weight sets (q/k/v, gate/up) are
57+
automatically grouped for joint processing even when split across shards, removing
58+
the need to run reindex_fused_weights as a preprocessing step.
4959
5060
:param model_stub: huggingface model hub or path to local weights files
61+
:param save_directory: directory to save quantized weights to
5162
:param scheme: weight quantization scheme or preset scheme name
5263
:param ignore: modules to ignore. Modules ending with "norm" are automatically
5364
ignored
@@ -64,32 +75,31 @@ def model_free_ptq(
6475
device = gpu_if_available(device)
6576
validate_safetensors_index(model_files, scheme)
6677

67-
# 0. collect safetensors files, copy files
68-
jobs = []
69-
job_fn = (
70-
process_file
71-
if not is_microscale_scheme(scheme)
72-
else process_file_microscale_scheme
73-
)
78+
# copy non-safetensors files (configs, tokenizers, etc.)
7479
for file_path, resolved_path in model_files.items():
75-
save_path = Path(save_directory) / file_path
76-
77-
if file_path.endswith("safetensors"):
78-
jobs.append(
79-
(job_fn, resolved_path, save_path, scheme, ignore, device, converter)
80-
)
81-
82-
else:
80+
if not file_path.endswith("safetensors"):
81+
save_path = Path(save_directory) / file_path
8382
if is_weights_file(file_path):
8483
logger.warning(f"Skip processing for weights file {file_path}")
8584
save_path.parent.mkdir(parents=True, exist_ok=True)
86-
logger.info(f"Copying {file_path} {save_path}")
85+
logger.info(f"Copying {file_path} -> {save_path}")
8786
shutil.copyfile(resolved_path, save_path)
8887

89-
# 1. validate quantizable tensors fail fast before long-running quantization
90-
exec_jobs(
91-
[(validate_file, *job[1:]) for job in jobs], max_workers, desc="Validating"
88+
# build quantization jobs
89+
if is_microscale_scheme(scheme):
90+
jobs = _build_microscale_jobs(
91+
model_files, save_directory, scheme, ignore, device, converter
92+
)
93+
else:
94+
jobs = _build_standard_jobs(
95+
model_files, save_directory, scheme, ignore, device, converter
96+
)
97+
98+
# 1. validate quantizable tensors — fail fast before long-running quantization
99+
validate_jobs = _make_validate_jobs(
100+
jobs, model_files, scheme, ignore, device, converter
92101
)
102+
exec_jobs(validate_jobs, max_workers, desc="Validating")
93103

94104
# 2-5. quantize and compress weights
95105
total_size = 0
@@ -99,6 +109,149 @@ def model_free_ptq(
99109
total_size += _total_size
100110
weight_map.update(_weight_map)
101111

102-
# 5. update config and safetensors index
112+
# 6. update config and safetensors index
103113
update_config(save_directory, scheme_name, scheme, ignore, converter)
104114
update_safetensors_index(save_directory, total_size, weight_map)
115+
116+
117+
def _build_standard_jobs(
    model_files: dict[str, str],
    save_directory: str | os.PathLike,
    scheme: QuantizationScheme,
    ignore: Iterable[str],
    device: torch.device,
    converter: Converter | None,
) -> list[tuple]:
    """
    Create one ``process_file`` job per safetensors shard.

    Used for non-microscale schemes, where every shard can be quantized
    independently of the others.

    :param model_files: mapping of relative file name -> resolved local path
    :param save_directory: directory the quantized shard will be written to
    :param scheme: weight quantization scheme applied to each shard
    :param ignore: modules to exclude from quantization
    :param device: device on which quantization runs
    :param converter: optional weight-name converter, or None
    :return: list of job tuples consumable by exec_jobs
    """
    return [
        (
            process_file,
            resolved_path,
            Path(save_directory) / file_path,
            scheme,
            ignore,
            device,
            converter,
        )
        for file_path, resolved_path in model_files.items()
        if file_path.endswith("safetensors")
    ]
142+
143+
144+
def _single_shard_microscale_job(
    resolved_path: str,
    save_path: Path,
    scheme: QuantizationScheme,
    ignore: Iterable[str],
    device: torch.device,
    converter: Converter | None,
) -> tuple:
    """Build the job tuple for one independently-processable microscale shard."""
    return (
        process_file_microscale_scheme,
        resolved_path,
        save_path,
        scheme,
        ignore,
        device,
        converter,
    )


def _build_microscale_jobs(
    model_files: dict[str, str],
    save_directory: str | os.PathLike,
    scheme: QuantizationScheme,
    ignore: Iterable[str],
    device: torch.device,
    converter: Converter | None,
) -> list[tuple]:
    """
    Build jobs for microscale schemes, grouping files that share fused weight sets
    so that global scale fusion works correctly across shard boundaries.

    For models where all fused weights are already co-located in single shards,
    each group will be a singleton and process_file_microscale_scheme is used.
    For models with cross-shard fused weights, multi-file groups are formed and
    process_file_group_microscale_scheme is used, eliminating the need for
    reindex_fused_weights preprocessing.

    :param model_files: mapping of relative file name -> resolved local path
    :param save_directory: directory quantized shards are written to
    :param scheme: microscale quantization scheme (e.g. NVFP4, MXFP4)
    :param ignore: modules to exclude from quantization
    :param device: device on which quantization runs
    :param converter: optional weight-name converter, or None
    :return: list of job tuples consumable by exec_jobs
    """
    index_file = find_safetensors_index_file(model_files)

    if index_file is None:
        # Single-file model (no index.json): nothing can span shards, so every
        # safetensors file takes the standard single-file microscale path.
        return [
            _single_shard_microscale_job(
                resolved_path,
                Path(save_directory) / file_path,
                scheme,
                ignore,
                device,
                converter,
            )
            for file_path, resolved_path in model_files.items()
            if file_path.endswith("safetensors")
        ]

    # Read weight map to determine cross-shard fused weight groupings.
    # NOTE(review): shards present in model_files but absent from the index's
    # weight_map receive no job — confirm the index always covers all shards.
    with open(index_file, "r") as f:
        weight_map: dict[str, str] = json.load(f)["weight_map"]

    jobs: list[tuple] = []
    for group in group_files_by_fused_weights(weight_map):
        if len(group) == 1:
            # No cross-shard fused weights — use the standard single-file path
            shard_name = group[0]
            jobs.append(
                _single_shard_microscale_job(
                    model_files[shard_name],
                    Path(save_directory) / shard_name,
                    scheme,
                    ignore,
                    device,
                    converter,
                )
            )
        else:
            # Cross-shard fused weights — load the whole group jointly
            logger.info(
                f"Grouping {len(group)} shards for joint microscale processing "
                f"(fused weights span multiple files): {group}"
            )
            jobs.append(
                (
                    process_file_group_microscale_scheme,
                    [model_files[shard] for shard in group],
                    [Path(save_directory) / shard for shard in group],
                    scheme,
                    ignore,
                    device,
                    converter,
                )
            )

    return jobs
228+
229+
230+
def _make_validate_jobs(
    jobs: list[tuple],
    model_files: dict[str, str],
    scheme: QuantizationScheme,
    ignore: Iterable[str],
    device: torch.device,
    converter: Converter | None,
) -> list[tuple]:
    """
    Derive validate_file jobs mirroring the quantization jobs.

    Group jobs (which carry parallel lists of input and output paths) are
    expanded into one validate_file call per member file; single-file jobs
    map one-to-one.

    :param jobs: quantization job tuples produced by the job builders
    :param model_files: mapping of relative file name -> resolved local path
    :param scheme: quantization scheme forwarded to validate_file
    :param ignore: modules to exclude, forwarded to validate_file
    :param device: device forwarded to validate_file
    :param converter: optional converter forwarded to validate_file
    :return: list of validate_file job tuples consumable by exec_jobs
    """
    validate_jobs: list[tuple] = []
    for fn, src, dst, *_rest in jobs:
        if fn is process_file_group_microscale_scheme:
            # src and dst are parallel lists covering the whole shard group
            pairs = zip(src, dst)
        else:
            pairs = [(src, dst)]
        validate_jobs.extend(
            (validate_file, file_path, save_path, scheme, ignore, device, converter)
            for file_path, save_path in pairs
        )
    return validate_jobs

src/llmcompressor/entrypoints/model_free/helpers.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
"match_names_set_eager",
1313
"MatchedNamesSet",
1414
"invert_mapping",
15+
"group_files_by_fused_weights",
1516
]
1617

1718
KeyType = TypeVar("K")
@@ -96,3 +97,58 @@ def invert_mapping(
9697
inverse[value].append(key)
9798

9899
return inverse
100+
101+
102+
def group_files_by_fused_weights(
    weight_map: dict[str, str],
) -> list[list[str]]:
    """
    Partition shard file names into groups that must be processed jointly.

    Two shards land in the same group whenever a fused weight set (e.g.
    q/k/v_proj or gate/up_proj split across shards) has members stored in
    both of them; shards with no cross-file fused dependencies become
    singleton groups.

    This allows model_free_ptq to handle microscale schemes (NVFP4, MXFP4)
    without a reindexing preprocessing step, by loading all tensors in a
    fused set together at processing time.

    :param weight_map: mapping of weight name -> file name (from index.json)
    :return: list of file groups; each group is a sorted list of shard file names
    """
    # Imported lazily to avoid a circular dependency with the microscale module
    from llmcompressor.entrypoints.model_free.microscale import get_fused_names

    fused_sets, _ = get_fused_names(list(weight_map))

    # Disjoint-set forest over shard file names
    shards = sorted(set(weight_map.values()))
    parent: dict[str, str] = {shard: shard for shard in shards}

    def find(node: str) -> str:
        # locate the root, then fully compress the path behind it
        root = node
        while parent[root] != root:
            root = parent[root]
        while parent[node] != root:
            parent[node], node = root, parent[node]
        return root

    # Merge every shard that holds a member of the same fused set
    for fused_set in fused_sets:
        member_shards = {
            weight_map[name]
            for name in fused_set.values()
            if name is not None and name in weight_map
        }
        anchor = None
        for shard in member_shards:
            if anchor is None:
                anchor = shard
            else:
                parent[find(shard)] = find(anchor)

    # Bucket shards by their representative root
    clusters: dict[str, list[str]] = defaultdict(list)
    for shard in shards:
        clusters[find(shard)].append(shard)

    return [sorted(cluster) for cluster in clusters.values()]

0 commit comments

Comments
 (0)