
Commit 7541421

convert.py: Add new parallel mode
1 parent 7f45c2a commit 7541421

File tree

3 files changed (+194 additions, -80 deletions):
  exllamav3/conversion/convert_model.py
  exllamav3/modules/linear.py
  exllamav3/modules/quant/exl3_lib/quantize.py

exllamav3/conversion/convert_model.py

Lines changed: 137 additions & 40 deletions
@@ -16,6 +16,7 @@
 from safetensors import safe_open
 import os, shutil
 import json
+import threading

 col_default = "\u001b[0m"
 col_red = "\u001b[31;1m"
@@ -40,6 +41,7 @@
 parser.add_argument("-img", "--image_dump", action = "store_true", help = "Save model tensors as images (saved to working directory)")
 parser.add_argument("-cb", "--codebook", type = str, default = "mcg", help = "Codebook: mcg (default), mul1 or 3inst")
 parser.add_argument("-strat", "--strategy", type = str, default = None, help = "Modifiers for quantization strategy - EXPERIMENTAL")
+parser.add_argument("-pm", "--parallel_mode", action = "store_true", help = "When possible, use new parallel mode for small tensors (MoE layers especially)")

 group = parser.add_mutually_exclusive_group()
 group.add_argument("--out_scales", dest = "out_scales_", action = "store_true", help = "Always enable out channel scales (for debug purposes)")
@@ -50,6 +52,10 @@

 num_ref_states = 5

+progress_lock = threading.Lock()
+curr_progress = 0
+max_progress = 0
+
 def check_system():
     if os.environ.get("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") is not None:
         print(
@@ -167,6 +173,7 @@ def override(arg, can_override, default):
     ("device_ratios", True, None),
     ("codebook", True, "mcg"),
     ("strategy", False, ""),
+    ("parallel_mode", True, False),
 ]:
     override(arg_, can_override if not args.override_anyway else True, default)

@@ -268,6 +275,7 @@ def mod_strategy(args, module, strategy, idx):

 @torch.inference_mode()
 def main(args, job_state):
+    global max_progress, curr_progress

     torch.set_printoptions(precision = 5, sci_mode = False, linewidth = 200)

@@ -399,47 +407,136 @@ def main(args, job_state):
     for linear in linears:
         linear.inner.swap_cpu()

-    # Quantize module
-    for linear in linears:
-        quant_args = {
-            "seed": idx,
-            "K": strategy[linear.key],
-            "devices": devices,
-            "device_ratios": device_ratios,
-            "apply_out_scales": args["apply_out_scales"],
-        }
-        if args["codebook"] == "mcg":
-            quant_args.update({
-                "mcg": True
-            })
-        elif args["codebook"] == "mul1":
-            quant_args.update({
-                "mul1": True
-            })
-
-        with Timer() as t:
-            sr = os.path.join(args["work_dir"], f"images/{linear.key}.reg.jpg") \
-                if args["image_dump"] else None
-            proxy_err = linear.convert_exl3(
-                capture_H[linear.qmap],
-                quant_args = quant_args,
-                progress_str = f" -- <step>: {linear.key}",
-                verbose = args["verbose"],
-                save_reg = sr
+    # Decide mode
+    # TODO: Might be useful to compare no. h-tiles per tensor, no. layers and no. SMs across GPUs
+    use_parallel_mode = False
+    if args["parallel_mode"] and len(linears) >= len(devices):
+        use_parallel_mode = True
+
+    # Quantize module, layer parallel
+    if use_parallel_mode:
+        assert not args["image_dump"], "Parallel mode is incompatible with --image_dump"
+
+        # Split workload
+        all_dev_linears = [[] for _ in devices]
+
+        tot_numel = sum(linear.weights_numel() for linear in linears)
+        if device_ratios is None:
+            dev_numel = [tot_numel // len(devices) for _ in devices]
+        else:
+            tot_split = sum(device_ratios)
+            dev_numel = [tot_numel * r // tot_split for _, r in zip(devices, device_ratios)]
+
+        for linear in linears:
+            l_numel = linear.weights_numel()
+            fit = [d_numel - l_numel for d_numel in dev_numel]
+            bestfit = max(range(len(fit)), key = lambda x: fit[x])
+            dev_numel[bestfit] -= l_numel
+            all_dev_linears[bestfit].append(linear)
+
+        with progress_lock:
+            curr_progress = 0
+            max_progress = len(linears)
+
+        # Worker thread
+        def work_thread(device_idx, dev_linears):
+            global curr_progress
+
+            for linear in dev_linears:
+                quant_args_local = {
+                    "seed": idx,
+                    "K": strategy[linear.key],
+                    "devices": [device_idx],
+                    "apply_out_scales": args["apply_out_scales"],
+                }
+                if args["codebook"] == "mcg": quant_args_local.update({ "mcg": True })
+                elif args["codebook"] == "mul1": quant_args_local.update({ "mul1": True })
+
+                proxy_err = linear.convert_exl3(
+                    capture_H[linear.qmap],
+                    quant_args = quant_args_local,
+                    verbose = args["verbose"],
+                    save_reg = False,
+                    override_swap_device = device_idx
+                )
+                assert isinstance(linear.inner, LinearEXL3)
+                linear.inner.swap_cpu()
+
+                flags = "o" if quant_args_local["apply_out_scales"] else "."
+                proxy_err_str = f"{proxy_err:8.6f}" if proxy_err >= 0.0 else "(OoM) "
+                print(
+                    f" -- Quantized: {linear.key:{config.stc.max_key_len() + 8}}"
+                    f" bpw: {quant_args_local['K']:5.2f}"
+                    f" proxy_err: {proxy_err_str}"
+                    f" {flags}"
+                    f" g_sc: {quant_args_local['g_scale']:.6f}"
+                )
+                with progress_lock:
+                    curr_progress += 1
+
+        # Launch
+        threads = []
+        for i, device_idx in enumerate(devices):
+            if len(all_dev_linears[i]):
+                t = threading.Thread(target = work_thread, args = (device_idx, all_dev_linears[i]))
+                t.daemon = True
+                threads.append(t)
+        for t in threads:
+            t.start()
+
+        try:
+            with ProgressBar(" -- Quantizing (parallel)", max_progress, transient = True) as progress:
+                while any(t.is_alive() for t in threads):
+                    progress.update(curr_progress)
+                    time.sleep(0.1)
+        except KeyboardInterrupt as e:
+            # TODO: This is too hacky
+            from signal import pthread_kill, SIGTSTP, SIGKILL
+            for t in threads:
+                pthread_kill(t.ident, SIGTSTP)
+                pthread_kill(t.ident, SIGKILL)
+            print("Aborted.")
+            sys.exit()
+
+        for t in threads:
+            t.join(timeout = 0.1)
+
+    # Quantize module, single GPU or tensor split
+    else:
+        for linear in linears:
+            quant_args = {
+                "seed": idx,
+                "K": strategy[linear.key],
+                "devices": devices,
+                "device_ratios": device_ratios,
+                "apply_out_scales": args["apply_out_scales"],
+            }
+            if args["codebook"] == "mcg": quant_args.update({ "mcg": True })
+            elif args["codebook"] == "mul1": quant_args.update({ "mul1": True })
+
+            with Timer() as t:
+                sr = os.path.join(args["work_dir"], f"images/{linear.key}.reg.jpg") \
+                    if args["image_dump"] else None
+                proxy_err = linear.convert_exl3(
+                    capture_H[linear.qmap],
+                    quant_args = quant_args,
+                    progress_str = f" -- <step>: {linear.key}",
+                    verbose = args["verbose"],
+                    save_reg = sr,
+                )
+            assert isinstance(linear.inner, LinearEXL3)
+            linear.inner.swap_cpu()
+            flags = "o" if quant_args["apply_out_scales"] else "."
+            proxy_err_str = f"{proxy_err:8.6f}" if proxy_err >= 0.0 else "(OoM) "
+            print(
+                f" -- Quantized: {linear.key:{config.stc.max_key_len() + 8}}"
+                f" bpw: {quant_args['K']:5.2f}"
+                f" proxy_err: {proxy_err_str}"
+                f" {flags}"
+                f" g_sc: {quant_args['g_scale']:.6f}"
+                f" [{t.interval:4.2f} s]"
             )
-        assert isinstance(linear.inner, LinearEXL3)
-        linear.inner.swap_cpu()
-        flags = "o" if quant_args["apply_out_scales"] else "."
-        proxy_err_str = f"{proxy_err:8.6f}" if proxy_err >= 0.0 else "(OoM) "
-        print(
-            f" -- Quantized: {linear.key:{config.stc.max_key_len() + 8}}"
-            f" bpw: {quant_args['K']:5.2f}"
-            f" proxy_err: {proxy_err_str}"
-            f" {flags}"
-            f" g_sc: {quant_args['g_scale']:.6f}"
-            f" [{t.interval:4.2f} s]"
-        )
-        sys.stdout.flush()
+            sys.stdout.flush()

     # Collect converted module tensors
     for m in module:
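
Note on the new mode (commentary, not part of the diff): the parallel path is opted into with the new -pm / --parallel_mode flag and only activates when a module has at least as many linear layers as there are devices. The workload split is a greedy best-fit assignment: each device starts with a budget proportional to its share of the total parameter count (equal shares, or weighted by --device_ratios), and each tensor goes to whichever device has the most remaining budget. A minimal standalone sketch of the same idea, with hypothetical names (assign_to_devices, sizes):

    # Illustrative sketch only, mirroring the split logic in the hunk above.
    # "sizes" stands in for linear.weights_numel() per tensor; names are hypothetical.
    def assign_to_devices(sizes, n_devices, ratios = None):
        total = sum(sizes)
        if ratios is None:
            budget = [total / n_devices] * n_devices
        else:
            budget = [total * r / sum(ratios) for r in ratios]
        buckets = [[] for _ in range(n_devices)]
        for i, s in enumerate(sizes):
            # Pick the device with the most remaining budget (greedy best fit)
            d = max(range(n_devices), key = lambda j: budget[j] - s)
            budget[d] -= s
            buckets[d].append(i)
        return buckets

    # Example: five tensors over two equal GPUs
    # assign_to_devices([10, 8, 6, 4, 2], 2) -> [[0, 3, 4], [1, 2]]  (16 vs. 14 weights)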

exllamav3/modules/linear.py

Lines changed: 5 additions & 1 deletion
@@ -240,13 +240,17 @@ def convert_exl3(
         progress_str: str | None = None,
         return_weight_q: bool = False,
         verbose: bool = False,
-        save_reg: str = None
+        save_reg: str = None,
+        override_swap_device: torch.device | None = None
     ):
         assert isinstance(self.inner, LinearFP16), \
             "Inner layer is already quant type"

         # Destroy original layer here to save VRAM, we only need weights
         swap_to_device = self.inner.swap_device # in case weights are swapped to CPU
+        if swap_to_device is not None and override_swap_device is not None:
+            swap_to_device = override_swap_device
+
         orig_weight = self.inner.get_weight_tensor().float()
         orig_bias = self.inner.get_bias_tensor()
         self.inner = None
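
Side note (illustration, not part of the diff): override_swap_device only takes effect when the weights were previously swapped to CPU; in that case a parallel worker redirects the swap target to its own GPU instead of the device recorded earlier. A rough sketch of that resolution with a hypothetical helper name:

    # Hypothetical helper; the real logic is the two added lines above.
    def resolve_swap_device(recorded_swap_device, override_swap_device):
        # Weights swapped to CPU remember where they should be moved back to.
        # In parallel mode a worker thread overrides that with its own device,
        # so each layer is quantized on the GPU that owns it.
        if recorded_swap_device is not None and override_swap_device is not None:
            return override_swap_device
        return recorded_swap_device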

exllamav3/modules/quant/exl3_lib/quantize.py

Lines changed: 52 additions & 39 deletions
@@ -8,6 +8,7 @@
 from ....util import cuda_sync_active
 from ....util.tensor import save_tensor_image
 from functools import lru_cache
+import threading

 # Constant
 had_k, had_n = 128, 128
@@ -442,56 +443,60 @@ def ldlq(
     return weight_q, encoded


+finalize_capture_H_mutex = threading.Lock()
+
 def finalize_capture_H(H_data: dict, quant_args: dict, verbose: bool):
-    # Unswap H
-    if "H_swap_device" in H_data:
-        H_data["H"] = H_data["H"].to(H_data["H_swap_device"])
-        del H_data["H_swap_device"]
+    with finalize_capture_H_mutex:

-    H = H_data["H"]
-    if H_data["finalized"]:
-        return H, H_data["L"], H_data["su"], H_data["diag"]
+        # Unswap H
+        if "H_swap_device" in H_data:
+            H_data["H"] = H_data["H"].to(H_data["H_swap_device"])
+            del H_data["H_swap_device"]

-    # Mean of samples summed up during forward pass
-    H /= H_data["count"]
+        H = H_data["H"]
+        if H_data["finalized"]:
+            return H, H_data["L"], H_data["su"], H_data["diag"]

-    # Regularize diagonal
-    diag_mean = torch.diag(H).mean()
-    idx = torch.arange(H.shape[0])
-    H[idx, idx] += quant_args.get("sigma_reg", 0.025) * diag_mean
+        # Mean of samples summed up during forward pass
+        H /= H_data["count"]

-    # Some tests
-    diag = H[idx, idx].clone()
+        # Regularize diagonal
+        diag_mean = torch.diag(H).mean()
+        idx = torch.arange(H.shape[0])
+        H[idx, idx] += quant_args.get("sigma_reg", 0.025) * diag_mean

-    if verbose:
-        print(f" - H min/max: {H.min().item():.6f} {H.max().item():.6f}")
-        print(f" - H mean/std: {H.mean().item():.6f} {H.std().item():.6f}")
-        print(f" - H diag min/max: {diag.min():.6f} {diag.max():.6f} ")
+        # Some tests
+        diag = H[idx, idx].clone()
+
+        if verbose:
+            print(f" - H min/max: {H.min().item():.6f} {H.max().item():.6f}")
+            print(f" - H mean/std: {H.mean().item():.6f} {H.std().item():.6f}")
+            print(f" - H diag min/max: {diag.min():.6f} {diag.max():.6f} ")

-    # Random sign flips for input channel, fixed for the first linear layer to quantize with this H
-    k = H.shape[0]
-    su = (torch.randn(k, device = H.device).sign() + 1e-5).sign().to(torch.float).unsqueeze(1)
-    H_data["su"] = su
+        # Random sign flips for input channel, fixed for the first linear layer to quantize with this H
+        k = H.shape[0]
+        su = (torch.randn(k, device = H.device).sign() + 1e-5).sign().to(torch.float).unsqueeze(1)
+        H_data["su"] = su

-    # Input had
-    H *= su.T
-    blockwise_preapply_had_r_(H, had_k)
-    H *= su
-    blockwise_preapply_had_l_(H, had_k)
+        # Input had
+        H *= su.T
+        blockwise_preapply_had_r_(H, had_k)
+        H *= su
+        blockwise_preapply_had_l_(H, had_k)

-    # Get block LDL decomposition of H, zero diagonal
-    L, H = block_ldl(H, 16, verbose)
-    dr = torch.arange(k)
-    L[dr, dr] = 0
-    H_data["L"] = L
+        # Get block LDL decomposition of H, zero diagonal
+        L, H = block_ldl(H, 16, verbose)
+        dr = torch.arange(k)
+        L[dr, dr] = 0
+        H_data["L"] = L

-    # H is no longer needed except to compute proxy error so move to CPU
-    H = H.cpu()
-    H_data["H"] = H.cpu()
+        # H is no longer needed except to compute proxy error so move to CPU
+        H = H.cpu()
+        H_data["H"] = H.cpu()

-    H_data["finalized"] = True
-    H_data["diag"] = diag
-    return H, L, su, diag
+        H_data["finalized"] = True
+        H_data["diag"] = diag
+        return H, L, su, diag


 def pack_trellis(encoded: torch.Tensor, quant_args: dict) -> torch.Tensor:
@@ -777,11 +782,19 @@ def quantize_exl3(
     if "seed" in quant_args:
        torch.manual_seed(quant_args["seed"])

+    devices = quant_args["devices"]
+    if weight.device != torch.device(devices[0]):
+        weight = weight.to(devices[0])
+
     device = weight.device if swap_to_device is None else swap_to_device
     k, n = weight.shape

     # Get H, LDL decomp. and input/output sign flips
     H, L, su, H_diag = finalize_capture_H(H_data, quant_args, verbose)
+    if H.is_cuda: H = H.to(device)
+    if L.is_cuda: L = L.to(device)
+    if su.is_cuda: su = su.to(device)
+    if H_diag.is_cuda: H_diag = H_diag.to(device)
     sv = (torch.randn(n, device = device).sign() + 1e-5).sign().to(torch.float).unsqueeze(0)

     # Move stored L to CPU (if not already), move working L to device
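
Why the locking change above matters (commentary, not part of the diff): in parallel mode several worker threads can share one captured H and call finalize_capture_H concurrently, and the finalization mutates H in place. Running the whole body under a mutex, with the "finalized" check re-done inside the lock, guarantees the expensive work happens exactly once and later callers just get the cached result; the added lines in quantize_exl3 then copy the shared tensors to each worker's own device before use. A minimal sketch of that lazy, lock-guarded finalization pattern, with hypothetical names (lazy_finalize, state):

    import threading

    _finalize_lock = threading.Lock()

    def lazy_finalize(state: dict):
        # Serialize the whole section: the first caller performs the (in-place,
        # destructive) finalization, later callers return the cached result.
        with _finalize_lock:
            if state.get("finalized"):
                return state["result"]
            state["result"] = sum(state["samples"]) / len(state["samples"])  # stand-in for the real work
            state["finalized"] = True
            return state["result"]

Without the re-check inside the lock, two threads arriving at the same time could both run the finalization, and the second run would operate on data the first had already transformed.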
