Commit 1e8fca4

fix dequant + minor refactor (#572)
Parent: 61669b2

9 files changed: +92, -86 lines

mlx_lm/benchmark.py

Lines changed: 2 additions & 3 deletions
@@ -68,9 +68,8 @@ def main():
     prompt_tokens = args.prompt_tokens
     generation_tokens = args.generation_tokens
     batch_size = args.batch_size
-    prompts = mx.random.randint(
-        0, config["vocab_size"], (batch_size, prompt_tokens)
-    ).tolist()
+    vocab_size = config.get("vocab_size") or config["text_config"]["vocab_size"]
+    prompts = mx.random.randint(0, vocab_size, (batch_size, prompt_tokens)).tolist()
     prompt = prompts[0]

     def single_bench():
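
The benchmark now resolves the vocabulary size with a fallback to the nested text config used by multimodal models. A small illustration of the lookup order; the config dicts below are made up for the example and are not from the repo:

    # Made-up configs for illustration; real ones come from the model's config.json.
    text_only = {"vocab_size": 32000}
    multimodal = {"text_config": {"vocab_size": 152064}}

    def vocab_size(config):
        # Top-level key first, then the nested text_config fallback.
        return config.get("vocab_size") or config["text_config"]["vocab_size"]

    print(vocab_size(text_only))   # 32000
    print(vocab_size(multimodal))  # 152064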

mlx_lm/fuse.py

Lines changed: 6 additions & 6 deletions
@@ -4,8 +4,8 @@
 from mlx.utils import tree_flatten, tree_unflatten

 from .gguf import convert_to_gguf
-from .tuner.utils import dequantize, load_adapters
 from .utils import (
+    dequantize_model,
     load,
     save,
     upload_to_hub,
@@ -39,8 +39,8 @@ def parse_arguments() -> argparse.Namespace:
         default=None,
     )
     parser.add_argument(
-        "--de-quantize",
-        help="Generate a de-quantized model.",
+        "--dequantize",
+        help="Generate a dequantized model.",
         action="store_true",
     )
     parser.add_argument(
@@ -66,16 +66,16 @@ def main() -> None:
     )

     fused_linears = [
-        (n, m.fuse(de_quantize=args.de_quantize))
+        (n, m.fuse(dequantize=args.dequantize))
         for n, m in model.named_modules()
         if hasattr(m, "fuse")
     ]

     if fused_linears:
         model.update_modules(tree_unflatten(fused_linears))

-    if args.de_quantize:
-        print("De-quantizing model")
+    if args.dequantize:
+        print("Dequantizing model")
         model = dequantize(model)
         config.pop("quantization", None)

mlx_lm/models/afm7.py

Lines changed: 2 additions & 2 deletions
@@ -50,7 +50,7 @@ def __init__(
         ]
         self.lora_b = [mx.zeros((r, od)) for od in output_dims]

-    def fuse(self, de_quantize: bool = False):
+    def fuse(self, dequantize: bool = False):
         linear = self.linear
         weight = linear.weight
         is_quantized = isinstance(linear, FusedQuantizedLinear)
@@ -79,7 +79,7 @@ def fuse(self, de_quantize: bool = False):
         delta = mx.concatenate(deltas, axis=0)
         fused_linear.weight = weight + delta

-        if is_quantized and not de_quantize:
+        if is_quantized and not dequantize:
             fused_linear = fused_linear.to_quantized(linear.group_size, linear.bits)

         return fused_linear

mlx_lm/perplexity.py

Lines changed: 1 addition & 2 deletions
@@ -13,8 +13,7 @@
 import numpy as np

 from mlx_lm.tuner.datasets import load_dataset
-from mlx_lm.tuner.utils import get_total_parameters
-from mlx_lm.utils import load
+from mlx_lm.utils import get_total_parameters, load


 def load_data(

mlx_lm/tuner/dora.py

Lines changed: 3 additions & 3 deletions
@@ -29,7 +29,7 @@ def from_base(
         dora_lin.set_linear(linear)
         return dora_lin

-    def fuse(self, de_quantize: bool = False):
+    def fuse(self, dequantize: bool = False):
         linear = self.linear
         bias = "bias" in linear
         weight = self._dequantized_weight()
@@ -49,7 +49,7 @@ def fuse(self, de_quantize: bool = False):
         if bias:
             fused_linear.bias = linear.bias

-        if self._is_quantized() and not de_quantize:
+        if self._is_quantized() and not dequantize:
             fused_linear = nn.QuantizedLinear.from_linear(
                 fused_linear,
                 linear.group_size,
@@ -151,7 +151,7 @@ def from_base(
         dora_embedding.set_embedding(embedding)
         return dora_embedding

-    def fuse(self, de_quantize: bool = False):
+    def fuse(self, dequantize: bool = False):
         embedding = self.embedding
         weight = embedding.weight

mlx_lm/tuner/lora.py

Lines changed: 6 additions & 6 deletions
@@ -31,7 +31,7 @@ def from_base(
         lora_lin.linear = linear
         return lora_lin

-    def fuse(self, de_quantize: bool = False):
+    def fuse(self, dequantize: bool = False):
         linear = self.linear
         bias = "bias" in linear
         weight = linear.weight
@@ -57,7 +57,7 @@ def fuse(self, de_quantize: bool = False):
         if bias:
             fused_linear.bias = linear.bias

-        if is_quantized and not de_quantize:
+        if is_quantized and not dequantize:
             fused_linear = nn.QuantizedLinear.from_linear(
                 fused_linear,
                 linear.group_size,
@@ -119,7 +119,7 @@ def from_base(
         lora_lin.linear = linear
         return lora_lin

-    def fuse(self, de_quantize: bool = False):
+    def fuse(self, dequantize: bool = False):
         linear = self.linear
         bias = "bias" in linear
         weight = linear.weight
@@ -146,7 +146,7 @@ def fuse(self, de_quantize: bool = False):
         if bias:
             fused_linear.bias = linear.bias

-        if is_quantized and not de_quantize:
+        if is_quantized and not dequantize:
             fused_linear = fused_linear.to_quantized(linear.group_size, linear.bits)

         return fused_linear
@@ -219,7 +219,7 @@ def from_base(
         lora_embedding.embedding = embedding
         return lora_embedding

-    def fuse(self, de_quantize: bool = False):
+    def fuse(self, dequantize: bool = False):
         embedding = self.embedding
         weight = embedding.weight
         is_quantized = isinstance(embedding, nn.QuantizedEmbedding)
@@ -243,7 +243,7 @@ def fuse(self, de_quantize: bool = False):
         lora_b = self.lora_b.astype(dtype)
         fused_embedding.weight = weight + lora_a @ lora_b

-        if is_quantized and not de_quantize:
+        if is_quantized and not dequantize:
             fused_embedding = nn.QuantizedEmbedding.from_embedding(
                 fused_embedding,
                 embedding.group_size,
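
The keyword rename applies uniformly across the LoRA and DoRA fuse() implementations. A minimal sketch of fusing a quantized LoRA linear back into its base layer; the dimensions and rank are arbitrary and chosen only for illustration:

    import mlx.nn as nn

    from mlx_lm.tuner.lora import LoRALinear

    # Arbitrary sizes and rank for illustration.
    base = nn.QuantizedLinear(512, 512, bias=True)
    lora = LoRALinear.from_base(base, r=8)

    # The default keeps the fused result quantized; dequantize=True returns a
    # plain nn.Linear with the LoRA update folded into the weights.
    fused_quantized = lora.fuse()
    fused_float = lora.fuse(dequantize=True)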

mlx_lm/tuner/utils.py

Lines changed: 2 additions & 58 deletions
@@ -7,9 +7,10 @@
 import mlx.core as mx
 import mlx.nn as nn
 import mlx.optimizers as opt
-from mlx.utils import tree_flatten, tree_map_with_path, tree_unflatten
+from mlx.utils import tree_flatten, tree_unflatten

 from ..models.switch_layers import QuantizedSwitchLinear, SwitchLinear
+from ..utils import get_total_parameters
 from .dora import DoRAEmbedding, DoRALinear
 from .lora import LoRAEmbedding, LoRALinear, LoRASwitchLinear

@@ -137,49 +138,6 @@ def load_adapters(model: nn.Module, adapter_path: str) -> nn.Module:
     return model


-def dequantize(model: nn.Module) -> nn.Module:
-    """
-    Dequantize the quantized linear layers in the model.
-
-    Args:
-        model (nn.Module): The model with quantized linear layers.
-
-    Returns:
-        nn.Module: The model with dequantized layers.
-    """
-    dequantize_layers = []
-    for name, module in model.named_modules():
-        bias = "bias" in module
-        if isinstance(module, nn.QuantizedLinear):
-            cls = nn.Linear
-            kwargs = {"bias": bias}
-        elif isinstance(module, nn.QuantizedEmbedding):
-            kwargs = {}
-            cls = nn.Embedding
-        elif isinstance(module, QuantizedSwitchLinear):
-            kwargs = {"bias": bias}
-            cls = SwitchLinear
-        else:
-            continue
-        weight = mx.dequantize(
-            module.weight,
-            module.scales,
-            module.biases,
-            module.group_size,
-            module.bits,
-        )
-        args = weight.shape[::-1]
-        m = cls(*args, **kwargs)
-        if bias:
-            m.bias = module.bias
-        m.weight = weight
-        dequantize_layers.append((name, m))
-
-    if len(dequantize_layers) > 0:
-        model.update_modules(tree_unflatten(dequantize_layers))
-    return model
-
-
 def remove_lora_layers(model: nn.Module) -> nn.Module:
     """
     Remove the LoRA layers from the model.
@@ -199,20 +157,6 @@ def remove_lora_layers(model: nn.Module) -> nn.Module:
     return model


-def get_total_parameters(model):
-    leaf_modules = tree_flatten(
-        model.leaf_modules(), is_leaf=lambda m: isinstance(m, nn.Module)
-    )
-
-    def nparams(m):
-        if hasattr(m, "bits"):
-            n = 0 if not hasattr(m, "bias") else m.bias.size
-            return n + m.weight.size * 32 // m.bits
-        return sum(v.size for _, v in tree_flatten(m.parameters()))
-
-    return sum(nparams(m) for _, m in leaf_modules)
-
-
 def print_trainable_parameters(model):
     total_p = get_total_parameters(model) / 1e6
     trainable_p = (

mlx_lm/utils.py

Lines changed: 67 additions & 3 deletions
@@ -31,13 +31,11 @@
 else:
     from huggingface_hub import snapshot_download

-from mlx.utils import tree_flatten, tree_map, tree_reduce
+from mlx.utils import tree_flatten, tree_map, tree_reduce, tree_unflatten
 from transformers import PreTrainedTokenizer

 # Local imports
 from .tokenizer_utils import TokenizerWrapper, load_tokenizer
-from .tuner.utils import dequantize as dequantize_model
-from .tuner.utils import get_total_parameters, load_adapters

 # Constants
 MODEL_REMAPPING = {
@@ -74,6 +72,20 @@ def _get_classes(config: dict):
     return arch.Model, arch.ModelArgs


+def get_total_parameters(model):
+    leaf_modules = tree_flatten(
+        model.leaf_modules(), is_leaf=lambda m: isinstance(m, nn.Module)
+    )
+
+    def nparams(m):
+        if hasattr(m, "bits"):
+            n = 0 if not hasattr(m, "bias") else m.bias.size
+            return n + m.weight.size * 32 // m.bits
+        return sum(v.size for _, v in tree_flatten(m.parameters()))
+
+    return sum(nparams(m) for _, m in leaf_modules)
+
+
 def compute_bits_per_weight(model):
     model_bytes = tree_reduce(
         lambda acc, x: acc + x.nbytes if isinstance(x, mx.array) else acc, model, 0
@@ -225,6 +237,12 @@ def class_predicate(p, m):
     return model, config


+def load_adapters(model: nn.Module, adapter_path: str) -> nn.Module:
+    from .tuner.utils import load_adapters as _load_adapters
+
+    return _load_adapters(model, adapter_path)
+
+
 def load(
     path_or_hf_repo: str,
     tokenizer_config={},
@@ -520,6 +538,52 @@ def wrapped_predicate(path, module):
     return model, quantized_config


+def dequantize_model(model: nn.Module) -> nn.Module:
+    """
+    Dequantize the quantized layers in the model.
+
+    Args:
+        model (nn.Module): The model with quantized layers.
+
+    Returns:
+        nn.Module: The model with dequantized layers.
+    """
+    from .models.switch_layers import QuantizedSwitchLinear, SwitchLinear
+
+    dequantize_layers = []
+    for name, module in model.named_modules():
+        bias = "bias" in module
+        if isinstance(module, nn.QuantizedLinear):
+            cls = nn.Linear
+            kwargs = {"bias": bias}
+        elif isinstance(module, nn.QuantizedEmbedding):
+            kwargs = {}
+            cls = nn.Embedding
+        elif isinstance(module, QuantizedSwitchLinear):
+            kwargs = {"bias": bias}
+            cls = SwitchLinear
+        else:
+            continue
+        weight = mx.dequantize(
+            module.weight,
+            module.scales,
+            module.biases,
+            module.group_size,
+            module.bits,
+            module.mode,
+        )
+        args = weight.shape[::-1]
+        m = cls(*args, **kwargs)
+        if bias:
+            m.bias = module.bias
+        m.weight = weight
+        dequantize_layers.append((name, m))
+
+    if len(dequantize_layers) > 0:
+        model.update_modules(tree_unflatten(dequantize_layers))
+    return model
+
+
 def save_config(
     config: dict,
     config_path: Union[str, Path],
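
With both helpers now exported from mlx_lm.utils, dequantization no longer needs to reach into mlx_lm.tuner.utils. A short usage sketch; the repo id is a placeholder for any quantized MLX model:

    from mlx_lm.utils import dequantize_model, get_total_parameters, load

    # Placeholder repo id; substitute any quantized MLX model.
    model, tokenizer = load("mlx-community/<quantized-model>")

    print(f"total parameters: {get_total_parameters(model) / 1e6:.1f}M")

    # Replace QuantizedLinear/QuantizedEmbedding/QuantizedSwitchLinear layers
    # with their float equivalents.
    model = dequantize_model(model)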

tests/test_finetune.py

Lines changed: 3 additions & 3 deletions
@@ -123,7 +123,7 @@ def test_lora_embedding(self):
             embedding.bits,
         )
         lora_emb = LoRAEmbedding.from_base(embedding, r=8, dropout=0, scale=10)
-        new_embedding = lora_emb.fuse(de_quantize=True)
+        new_embedding = lora_emb.fuse(dequantize=True)
         self.assertTrue(mx.array_equal(dequantized_weight, new_embedding.weight))
         self.assertTrue(mx.array_equal(embedding(tokens), lora_emb(tokens)))

@@ -137,7 +137,7 @@ def test_lora_embedding(self):

         # change the value of lora_b and the embeddings will no longer be equal
         lora_emb.lora_b = mx.random.uniform(shape=lora_emb.lora_b.shape)
-        new_embedding = lora_emb.fuse(de_quantize=True)
+        new_embedding = lora_emb.fuse(dequantize=True)
         self.assertFalse(mx.array_equal(dequantized_weight, new_embedding.weight))
         self.assertFalse(mx.array_equal(embedding(tokens), lora_emb(tokens)))

@@ -300,7 +300,7 @@ def dequantize_weight(quantized_linear):
         quantized_linear = nn.QuantizedLinear(in_dims, out_dims, bias=True)
         dora_quantized_linear = DoRALinear.from_base(quantized_linear, r)
         # Dequantize
-        to_linear_from_quantized = dora_quantized_linear.fuse(de_quantize=True)
+        to_linear_from_quantized = dora_quantized_linear.fuse(dequantize=True)
         self.assertTrue(
             mx.allclose(quantized_linear.bias, to_linear_from_quantized.bias)
         )
