Commit 103877e

some cleanup + tests towards batching (ml-explore#430)
1 parent 64574e1 commit 103877e


74 files changed (+939, -546 lines)

mlx_lm/evaluate.py
Lines changed: 1 addition & 3 deletions

@@ -107,9 +107,7 @@ def _score_fn(self, inputs, cache: Optional[Any] = None, step_size: int = 2048):
         T = inp.shape[1]

         offset = cache[0].offset
-        mask = create_causal_mask(T, offset, lengths=lengths)
-
-        logits = self._model(inp, cache=cache, mask=mask)
+        logits = self._model(inp, cache=cache)
         log_probs = nn.log_softmax(logits.astype(mx.float32))

         score = mx.take_along_axis(

mlx_lm/generate.py
Lines changed: 0 additions & 7 deletions

@@ -686,7 +686,6 @@ def stream_generate(
             prompt, model, draft_model, **kwargs
         )
     with wired_limit(model, [generation_stream]):
-        detokenizer.reset()
         tic = time.perf_counter()
         for n, (token, logprobs, from_draft) in enumerate(token_generator):
             if n == 0:

@@ -731,7 +730,6 @@ def generate(
     tokenizer: Union[PreTrainedTokenizer, TokenizerWrapper],
     prompt: Union[str, List[int]],
     verbose: bool = False,
-    formatter: Optional[Callable] = None,
     **kwargs,
 ) -> str:
     """

@@ -746,11 +744,6 @@ def generate(
        kwargs: The remaining options get passed to :func:`stream_generate`.
          See :func:`stream_generate` for more details.
     """
-    if formatter is not None:
-        print(
-            "[Warning] Text formatting is deprecated and no longer used. "
-            "The argument will be removed in a future version."
-        )
     if verbose:
         print("=" * 10)
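With the deprecated formatter argument removed, generate() takes just the model, tokenizer, prompt, the verbose flag, and keyword options that are forwarded to stream_generate. A minimal usage sketch; the model id and the max_tokens value are illustrative, not taken from this commit:

```python
from mlx_lm import generate, load

# Hypothetical model repo; any supported mlx-community model id works the same way.
model, tokenizer = load("mlx-community/SmolLM-135M-Instruct-4bit")

# No `formatter=` keyword anymore; extra kwargs flow through to stream_generate.
text = generate(
    model, tokenizer, prompt="Write a haiku about GPUs.", max_tokens=64, verbose=True
)
print(text)
```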

mlx_lm/models/afm7.py
Lines changed: 3 additions & 6 deletions

@@ -350,18 +350,16 @@ def __init__(self, args: ModelArgs):
     def __call__(
         self,
         inputs: mx.array,
-        mask: mx.array = None,
         cache=None,
     ):
         h = self.embedding(inputs)

-        if mask is None:
-            mask = create_attention_mask(h, cache)
-
         if cache is None:
             cache = [None] * len(self.layers)
             cache[-1] = ConcatenateKVCache()

+        mask = create_attention_mask(h, cache[0])
+
         for layer, c in zip(self.layers, cache):
             h = layer(h, mask, cache=c)

@@ -382,10 +380,9 @@ def __init__(self, args: ModelArgs):
     def __call__(
         self,
         inputs: mx.array,
-        mask: mx.array = None,
         cache=None,
     ):
-        out = self.model(inputs, mask, cache)
+        out = self.model(inputs, cache)
         out = self.model.embedding.as_linear(out)
         return out
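The same change, dropping the mask parameter and deriving the mask from cache[0] inside the model, is applied to the remaining model files below (apertus, bailing_moe, bitnet, and the other models in the full commit). From the caller's side a model is now invoked with only (inputs, cache); a hedged sketch of that convention, with an illustrative model id:

```python
import mlx.core as mx

from mlx_lm import load
from mlx_lm.models.cache import make_prompt_cache

# Illustrative model id; any supported model follows the same calling convention.
model, tokenizer = load("mlx-community/SmolLM-135M-Instruct-4bit")

prompt = mx.array(tokenizer.encode("The capital of France is"))[None]
cache = make_prompt_cache(model)

# No mask argument: the model builds its own mask via
# create_attention_mask(h, cache[0]) internally.
logits = model(prompt, cache=cache)
print(logits.shape)  # (1, prompt_length, vocab_size)
```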

mlx_lm/models/apertus.py
Lines changed: 3 additions & 6 deletions

@@ -177,17 +177,15 @@ def __init__(self, args: ModelArgs):
     def __call__(
         self,
         inputs: mx.array,
-        mask: Optional[mx.array] = None,
         cache: Optional[Any] = None,
     ) -> mx.array:
         h = self.embed_tokens(inputs)

-        if mask is None:
-            mask = create_attention_mask(h, cache)
-
         if cache is None:
             cache = [None] * len(self.layers)

+        mask = create_attention_mask(h, cache[0])
+
         for layer, c in zip(self.layers, cache):
             h = layer(h, mask=mask, cache=c)

@@ -205,10 +203,9 @@ def __init__(self, args: ModelArgs):
     def __call__(
         self,
         inputs: mx.array,
-        mask: Optional[mx.array] = None,
         cache: Optional[Any] = None,
     ) -> mx.array:
-        out = self.model(inputs, mask, cache)
+        out = self.model(inputs, cache)
         return self.lm_head(out)

     def sanitize(self, weights):

mlx_lm/models/baichuan_m1.py
Lines changed: 39 additions & 15 deletions

@@ -96,7 +96,10 @@ def __call__(
         k = k.reshape(B, L, self.num_kv_heads, self.head_dim).transpose(0, 2, 1, 3)
         v = v.reshape(B, L, self.num_kv_heads, self.head_dim).transpose(0, 2, 1, 3)

-        if cache is not None:
+        if cache is None:
+            cache = (None, None)
+
+        if cache[0] is not None:
             offset = cache[1].offset
             last_k, last_v = cache[0][0], cache[0][1]
         else:

@@ -110,7 +113,7 @@ def __call__(
         q = self.rope(q, offset=offset)
         k = self.rope(k, offset=offset)

-        if cache is not None:
+        if cache[0] is not None:
             k, v = cache[1].update_and_fetch(k, v)
             if L > 0:
                 cache[0][0] = k_init[:, :, -1:, :]

@@ -167,17 +170,40 @@ def __init__(self, config: ModelArgs):
         self.layers = [DecoderLayer(config, i) for i in range(config.num_hidden_layers)]
         self.norm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

-    def __call__(
-        self, inputs: mx.array, mask: mx.array = None, cache: Any = None
-    ) -> mx.array:
+        self.sliding_window = config.sliding_window
+        self.first_swa_idx = None
+        if config.sliding_window_layers:
+            self.first_swa_idx = config.sliding_window_layers[0]
+
+        self.first_global_idx = None
+        self.swa_layers = set(config.sliding_window_layers)
+        for i in range(config.num_hidden_layers):
+            if i in self.swa_layers:
+                continue
+            self.first_global_idx = i
+            break
+
+    def __call__(self, inputs: mx.array, cache: Any = None) -> mx.array:
         x = self.embed_tokens(inputs)
-        if mask is None:
-            if cache is not None:
-                c = [cache[0][1]]
-            mask = create_attention_mask(x, c)
+
         if cache is None:
-            cache = [None] * len(self.layers)
-        for layer, c in zip(self.layers, cache):
+            cache = [(None, None)] * len(self.layers)
+
+        if self.first_global_idx is None:
+            c_global = None
+        else:
+            c_global = cache[self.first_global_idx][1]
+
+        if self.first_swa_idx is None:
+            c_swa = None
+        else:
+            c_swa = cache[self.first_swa_idx][1]
+
+        global_mask = create_attention_mask(x, c_global)
+        swa_mask = create_attention_mask(x, c_swa, window_size=self.sliding_window)
+
+        for l, (layer, c) in enumerate(zip(self.layers, cache)):
+            mask = swa_mask if l in self.swa_layers else global_mask
             x = layer(x, mask, c)
         return self.norm(x)

@@ -215,10 +241,8 @@ def sanitize(self, weights: dict) -> dict:
             weights["lm_head.weight"] = w
         return weights

-    def __call__(
-        self, inputs: mx.array, mask: mx.array = None, cache: Any = None
-    ) -> mx.array:
-        outputs = self.model(inputs, mask, cache)
+    def __call__(self, inputs: mx.array, cache: Any = None) -> mx.array:
+        outputs = self.model(inputs, cache)
         return self.lm_head(outputs)

     @property
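baichuan_m1 now builds one unrestricted mask and one sliding-window mask per forward pass and selects between them per layer. A small sketch of the two mask kinds using the updated create_attention_mask; the prompt length and window size here are made up for illustration:

```python
import mlx.core as mx

from mlx_lm.models.base import create_attention_mask

# Hypothetical hidden states for a 6-token prompt and a 4-token window.
h = mx.zeros((1, 6, 32))
sliding_window = 4

# Global layers: a plain causal mask, returned as the string "causal".
global_mask = create_attention_mask(h)

# Sliding-window layers: once the prompt is longer than the window, an
# explicit array mask is returned instead of the "causal" shorthand.
swa_mask = create_attention_mask(h, window_size=sliding_window)

print(global_mask)     # "causal"
print(swa_mask.shape)  # (6, 6)
```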

mlx_lm/models/bailing_moe.py
Lines changed: 3 additions & 6 deletions

@@ -239,17 +239,15 @@ def __init__(self, args: ModelArgs):
     def __call__(
         self,
         inputs: mx.array,
-        mask: Optional[mx.array] = None,
         cache: Optional[Any] = None,
     ):
         h = self.word_embeddings(inputs)

-        if mask is None:
-            mask = create_attention_mask(h, cache)
-
         if cache is None:
             cache = [None] * len(self.layers)

+        mask = create_attention_mask(h, cache[0])
+
         for layer, c in zip(self.layers, cache):
             h = layer(h, mask, c)

@@ -268,10 +266,9 @@ def __init__(self, args: ModelArgs):
     def __call__(
         self,
         inputs: mx.array,
-        mask: mx.array = None,
         cache=None,
     ):
-        h = self.model(inputs, mask, cache)
+        h = self.model(inputs, cache)
         return self.lm_head(h)

     def sanitize(self, weights):

mlx_lm/models/base.py
Lines changed: 10 additions & 22 deletions

@@ -7,8 +7,6 @@
 import mlx.core as mx
 from mlx.utils import tree_map

-from .cache import QuantizedKVCache
-

 @dataclass
 class BaseModelArgs:

@@ -43,26 +41,16 @@ def create_causal_mask(


 def create_attention_mask(
-    h: mx.array, cache: Optional[Any] = None, return_array: bool = False
+    h, cache=None, window_size: Optional[int] = None, return_array: bool = False
 ):
-    T = h.shape[1]
-    if T > 1:
-        offset = 0
-        window_size = None
-        if cache is not None and cache[0] is not None:
-            c = cache[0]
-            offset = c.offset
-            if hasattr(c, "max_size"):
-                window_size = c.max_size
-                offset = min(window_size, offset)
-                return_array = return_array or offset + T > window_size
-        if return_array:
-            return create_causal_mask(T, offset, window_size=window_size)
-        else:
-            return "causal"
-    else:
-        mask = None
-    return mask
+    N = h.shape[1]
+    if cache and hasattr(cache, "make_mask"):
+        return cache.make_mask(N, return_array=return_array, window_size=window_size)
+    if N == 1:
+        return None
+    if return_array or (window_size and N > window_size):
+        return create_causal_mask(N, window_size=window_size)
+    return "causal"


 def quantized_scaled_dot_product_attention(

@@ -117,7 +105,7 @@ def scaled_dot_product_attention(
     scale: float,
     mask: Optional[mx.array],
 ) -> mx.array:
-    if isinstance(cache, QuantizedKVCache):
+    if hasattr(cache, "bits"):
         return quantized_scaled_dot_product_attention(
             queries,
             keys,
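The rewritten create_attention_mask takes a single cache object instead of the per-layer cache list and defers to the cache's make_mask() when it has one. A minimal sketch of the resulting paths, assuming the KVCache shown in cache.py below:

```python
import mlx.core as mx

from mlx_lm.models.base import create_attention_mask
from mlx_lm.models.cache import KVCache

# Hypothetical hidden states for a 5-token prompt.
h = mx.zeros((1, 5, 32))

# No cache: a multi-token input falls back to the "causal" string, which
# scaled_dot_product_attention understands directly.
print(create_attention_mask(h))             # "causal"

# With a cache that implements make_mask(), the decision is delegated to the
# cache, which knows its own offset (0 here, so still "causal").
print(create_attention_mask(h, KVCache()))  # "causal"

# Single-token decoding needs no mask at all.
print(create_attention_mask(h[:, :1], KVCache()))  # None
```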

mlx_lm/models/bitnet.py
Lines changed: 3 additions & 6 deletions

@@ -163,17 +163,15 @@ def __init__(self, args: ModelArgs):
     def __call__(
         self,
         inputs: mx.array,
-        mask: mx.array = None,
         cache=None,
     ):
         h = self.embed_tokens(inputs)

-        if mask is None:
-            mask = create_attention_mask(h, cache)
-
         if cache is None:
             cache = [None] * len(self.layers)

+        mask = create_attention_mask(h, cache[0])
+
         for layer, c in zip(self.layers, cache):
             h = layer(h, mask, cache=c)

@@ -192,10 +190,9 @@ def __init__(self, args: ModelArgs):
     def __call__(
         self,
         inputs: mx.array,
-        mask: mx.array = None,
         cache=None,
     ):
-        out = self.model(inputs, mask, cache)
+        out = self.model(inputs, cache)
         if self.args.tie_word_embeddings:
             out = self.model.embed_tokens.as_linear(out)
         else:

mlx_lm/models/cache.py
Lines changed: 45 additions & 0 deletions

@@ -6,6 +6,8 @@
 import mlx.nn as nn
 from mlx.utils import tree_flatten, tree_map, tree_unflatten

+from .base import create_causal_mask
+

 def make_prompt_cache(
     model: nn.Module,

@@ -106,6 +108,17 @@ def trim_prompt_cache(cache: List[Any], num_tokens: int) -> List[Any]:
     return [c.trim(num_tokens) for c in cache][0]


+def create_attention_mask(
+    N: int, offset: int, return_array: bool, window_size: Optional[int]
+):
+    if N == 1:
+        return None
+    if return_array:
+        return create_causal_mask(N, offset, window_size=window_size)
+    else:
+        return "causal"
+
+
 class _BaseCache:
     @property
     def state(self):

@@ -170,6 +183,9 @@ def trim(self, n):
         self.offset -= n
         return n

+    def make_mask(self, *args, **kwargs):
+        return create_attention_mask(*args, offset=self.offset, **kwargs)
+

 class QuantizedKVCache(_BaseCache):
     def __init__(self, group_size: int = 64, bits: int = 8):

@@ -252,6 +268,9 @@ def trim(self, n):
         self.offset -= n
         return n

+    def make_mask(self, *args, **kwargs):
+        return create_attention_mask(*args, offset=self.offset, **kwargs)
+

 class KVCache(_BaseCache):
     def __init__(self):

@@ -317,6 +336,9 @@ def to_quantized(self, group_size: int = 64, bits: int = 4) -> QuantizedKVCache:
         )
         return quant_cache

+    def make_mask(self, *args, **kwargs):
+        return create_attention_mask(*args, offset=self.offset, **kwargs)
+

 class RotatingKVCache(_BaseCache):


@@ -460,6 +482,29 @@
     def to_quantized(self, group_size: int = 64, bits: int = 4) -> QuantizedKVCache:
         raise NotImplementedError("RotatingKVCache Quantization NYI")

+    def make_mask(
+        self, N: int, window_size: Optional[int] = None, return_array: bool = False
+    ):
+        if N > 1:
+            window_size = window_size or self.max_size
+            offset = min(self.max_size, self.offset)
+            if offset + N > window_size or return_array:
+                return create_causal_mask(N, offset, window_size=window_size)
+            else:
+                return "causal"
+        else:
+            if window_size is None:
+                return None
+            # May need a mask for when window_size < max_size
+            if self.offset >= window_size and self.max_size > window_size:
+                idx = self._idx
+                if idx >= self.max_size:
+                    idx = 0
+                mask_size = min(self.max_size, self.offset)
+                mask = mx.arange(mask_size) >= (mask_size - window_size)
+                mask = mx.roll(mask, shift=idx + 1)
+                return mask[:, None]
+

 class ArraysCache(_BaseCache):
     def __init__(self, size):
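The only non-trivial make_mask is the rotating cache's single-token path, which builds a boolean mask over the rotated buffer when a layer's window is narrower than the cache's max_size. A sketch under the assumption that RotatingKVCache accepts max_size and exposes the usual update_and_fetch(keys, values) API; the shapes and step count are made up:

```python
import mlx.core as mx

from mlx_lm.models.cache import RotatingKVCache

cache = RotatingKVCache(max_size=8)

# Simulate 12 single-token decode steps with dummy keys/values of shape
# (batch, heads, seq, head_dim) so the 8-entry buffer wraps around.
for _ in range(12):
    k = mx.zeros((1, 4, 1, 16))
    v = mx.zeros((1, 4, 1, 16))
    cache.update_and_fetch(k, v)

# A per-layer window narrower than max_size needs an explicit boolean mask
# over the rotated buffer during single-token decoding.
mask = cache.make_mask(1, window_size=4)
print(mask.shape)  # (8, 1): one flag per cached position

# Without a window (or with one at least as large as max_size), no mask is needed.
print(cache.make_mask(1))  # None
```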
