
Commit 7ec0c05

Merge pull request #48 from genlm/mlx-lm
Add MLX-LM backend support for faster inference on Apple silicon. Parallel inference batching coming soon.
2 parents a457b68 + 1a97691 commit 7ec0c05

12 files changed: +586 additions, -5 deletions

.github/workflows/coverage.yml

Lines changed: 36 additions & 1 deletion
@@ -32,7 +32,42 @@ jobs:
       - name: Run tests
         run: |
           source venv/bin/activate
-          coverage run --source=genlm/backend -m pytest --benchmark-disable
+          coverage run --source=genlm/backend -m pytest --benchmark-disable --ignore=tests/test_mlx.py
+          coverage json --omit "*/test*"
+          coverage report --omit "*/test*"
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v5
+        with:
+          fail_ci_if_error: false
+          token: ${{ secrets.CODECOV_TOKEN }}
+          files: ./coverage.json
+          slug: genlm/genlm-backend
+
+  test_mlx_coverage:
+    runs-on: macos-14
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: 3.11.5
+          cache: 'pip'
+
+      - name: Install dependencies
+        run: |
+          python -m venv venv
+          source venv/bin/activate
+          pip install -e .[mlx]
+          pip install -r requirements-dev.txt
+
+      - name: Run MLX tests
+        run: |
+          source venv/bin/activate
+          coverage run --source=genlm/backend -m pytest tests/test_mlx.py --benchmark-disable
           coverage json --omit "*/test*"
           coverage report --omit "*/test*"

.github/workflows/pytest.yml

Lines changed: 22 additions & 1 deletion
@@ -29,4 +29,25 @@ jobs:
           source venv/bin/activate
           pip install -e .[test]
           pip install -r requirements-dev.txt
-          python -m pytest tests
+          python -m pytest tests --ignore=tests/test_mlx.py
+
+  test-mlx:
+    runs-on: macos-14
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: 3.11.5
+          cache: 'pip'
+
+      - name: Run MLX Tests
+        run: |
+          python -m venv venv
+          source venv/bin/activate
+          pip install -e .[mlx]
+          pip install -r requirements-dev.txt
+          python -m pytest tests/test_mlx.py

DEVELOPING.md

Lines changed: 5 additions & 0 deletions
@@ -27,6 +27,11 @@ uv pip install -e ".[docs]"
 uv pip install -r requirements-dev.txt
 ```
 
+To build with MLX support, run:
+```bash
+uv pip install -e ".[mlx]"
+```
+
 ## Testing
 
 When test dependencies are installed, the test suite can be run via:

README.md

Lines changed: 8 additions & 0 deletions
@@ -18,6 +18,7 @@ See our [documentation](https://genlm.github.io/genlm-backend/).
 - Automatic batching of concurrent log-probability requests, enabling efficient large-scale inference without having to write batching logic yourself
 - Byte-level decoding of transformers tokenizers, enabling advanced token-level control
 - Support for arbitrary Hugging Face models (e.g., LLaMA, DeepSeek, etc.) with fast inference and automatic KV caching using vllm
+- NEW: Support for the MLX-LM library, allowing faster inference on Apple silicon devices
 
 
 ## ⚡ Quick Start
@@ -28,6 +29,13 @@ This library supports installation via pip:
 pip install genlm-backend
 ```
 
+Or, to install with MLX support, run:
+
+```bash
+pip install genlm-backend[mlx]
+```
+
+
 ## 🧪 Example: Autobatched Sequential Importance Sampling with LLMs
 
 This example demonstrates how `genlm-backend` enables concise, scalable probabilistic inference with language models. It implements a Sequential Importance Sampling (SIS) algorithm that makes asynchronous log-probability requests which get automatically batched by the language model.
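With the `[mlx]` extra installed, the new backend can be used like the other `AsyncLM` implementations. A minimal sketch, assuming an Apple-silicon machine; the awaitable `next_token_logprobs(token_ids)` call is inferred from the benchmark helpers added in this PR rather than from the README itself:

```python
# Minimal sketch (assumes `pip install genlm-backend[mlx]` on Apple silicon).
# The awaitable next_token_logprobs(token_ids) API is an assumption inferred
# from the benchmark helpers in this PR, not an official README example.
import asyncio

from genlm.backend.llm import AsyncMlxLM


async def main():
    llm = AsyncMlxLM.from_name("gpt2")  # same model name used in benchmark_mlx.py
    token_ids = llm.tokenizer.encode("The capital of France is")
    logprobs = await llm.next_token_logprobs(token_ids)  # assumed method name
    print(logprobs.shape)  # one log-probability per vocabulary entry


asyncio.run(main())
```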

benchmark/benchmark_mlx.py

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+"""
+Evaluates performance differences between AsyncMlxLM (MLX-based) and AsyncTransformer
+(HuggingFace-based) implementations using pytest-benchmark.
+
+pytest benchmark/benchmark_mlx.py --benchmark-only --benchmark-group-by=func
+"""
+
+import pytest
+from .util import (
+    get_wikitext,
+    token_prefixes,
+    token_prefix_batches,
+    run_await_next_token_logprobs,
+    run_await_batch_next_token_logprobs,
+)
+
+from genlm.backend.llm import AsyncMlxLM, AsyncTransformer
+
+text = get_wikitext()
+
+
+def load_model(model, batch_size=None):
+    model_name = "gpt2"
+    if model == "mlx":
+        return AsyncMlxLM.from_name(model_name)
+    else:
+        return AsyncTransformer.from_name(model_name, batch_size=batch_size)
+
+
+@pytest.mark.parametrize("model", ["mlx", "transformer"])
+def test_await_next_token_logprobs(benchmark, model):
+    llm = load_model(model, batch_size=1)
+    sequences = token_prefixes(text, tokenizer=llm.tokenizer)
+    run_await_next_token_logprobs(benchmark=benchmark, llm=llm, sequences=sequences)
+
+
+@pytest.mark.parametrize("model", ["mlx", "transformer"])
+def test_await_batch_next_token_logprobs(benchmark, model, batch_size=5):
+    llm = load_model(model, batch_size=batch_size)
+    batches = token_prefix_batches(text, tokenizer=llm.tokenizer, batch_size=batch_size)
+    run_await_batch_next_token_logprobs(
+        benchmark=benchmark, llm=llm, batches=batches, rounds=50, warmup_rounds=10
+    )
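For readers who want a rough sense of what the pytest-benchmark tests above measure, here is a standalone sketch of the same comparison. The `.util` helpers are not part of this diff, so the prompt handling and the awaitable `next_token_logprobs(token_ids)` call below are assumptions based on the helper names, not the actual utilities:

```python
# Rough standalone sketch of the comparison that benchmark_mlx.py automates.
# Assumes both backends expose an awaitable next_token_logprobs(token_ids)
# method, as suggested by the run_await_next_token_logprobs helper name.
import asyncio
import time

from genlm.backend.llm import AsyncMlxLM, AsyncTransformer


async def time_backend(llm, prompts, label):
    start = time.perf_counter()
    for prompt in prompts:
        token_ids = llm.tokenizer.encode(prompt)
        await llm.next_token_logprobs(token_ids)  # assumed API
    elapsed = time.perf_counter() - start
    print(f"{label}: {elapsed:.3f}s for {len(prompts)} prompts")


async def main():
    prompts = ["The quick brown fox", "MLX runs on", "Sequential importance sampling"]
    await time_backend(AsyncMlxLM.from_name("gpt2"), prompts, "mlx")
    await time_backend(AsyncTransformer.from_name("gpt2", batch_size=1), prompts, "transformer")


asyncio.run(main())
```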

genlm/backend/cache.py

Lines changed: 27 additions & 0 deletions
@@ -43,6 +43,33 @@ def clear(self):
         self.cache.clear()
 
 
+class OutputMLXCache(OutputCache):
+    """A cache for storing tensor outputs with MLX.
+
+    Since MLX uses unified memory, we don't need to move tensors between CPU and GPU.
+
+    Args:
+        maxsize (int): Maximum number of items to store in the cache
+    """
+
+    def __init__(self, maxsize):
+        super().__init__(maxsize, move_to_cpu=False)
+
+    def __getitem__(self, key):
+        if key in self.cache:
+            value = self.cache.pop(key)
+            self.cache[key] = value
+            return value
+        raise KeyError(key)
+
+    def __setitem__(self, key, value):
+        if len(self.cache) >= self.maxsize:
+            _, old_tensor = self.cache.popitem(last=False)
+            del old_tensor
+
+        self.cache[key] = value
+
+
 class TokenTrie:
     """Class used internally to cache language model results.
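The new class is an LRU cache: `__getitem__` re-inserts a hit so it becomes the most recently used entry, and `__setitem__` evicts the oldest entry once `maxsize` is reached, with no device transfers since MLX memory is unified. A small illustrative sketch of that behavior, using plain strings as stand-ins for the MLX tensors the cache normally holds and assuming `OutputCache` keeps entries in an insertion-ordered `self.cache` mapping (implied by `popitem(last=False)`):

```python
# Illustrative sketch of OutputMLXCache's LRU eviction; plain strings stand in
# for the MLX tensors the cache normally stores.
from genlm.backend.cache import OutputMLXCache

cache = OutputMLXCache(maxsize=2)
cache[("the", "cat")] = "logprobs_1"
cache[("the", "dog")] = "logprobs_2"

_ = cache[("the", "cat")]            # hit: re-inserted, now most recently used
cache[("a", "bird")] = "logprobs_3"  # at capacity: evicts ("the", "dog"), the LRU entry

assert ("the", "cat") in cache.cache
assert ("the", "dog") not in cache.cache
```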

genlm/backend/llm/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -1,6 +1,7 @@
 from genlm.backend.llm.vllm import AsyncVirtualLM
 from genlm.backend.llm.hf import AsyncTransformer
 from genlm.backend.llm.base import AsyncLM, MockAsyncLM
+from genlm.backend.llm.mlx import AsyncMlxLM
 
 import torch
 
@@ -33,6 +34,8 @@ def load_model_by_name(name, backend=None, llm_opts=None):
         return AsyncTransformer.from_name(name, **llm_opts)
     elif backend == "mock":
         return MockAsyncLM.from_name(name, **llm_opts)
+    elif backend == "mlx":
+        return AsyncMlxLM.from_name(name, **llm_opts)
     else:
         raise ValueError(f"Invalid backend: {backend}")
 
@@ -42,5 +45,6 @@ def load_model_by_name(name, backend=None, llm_opts=None):
     "AsyncLM",
     "AsyncVirtualLM",
     "AsyncTransformer",
+    "AsyncMlxLM",
     "MockAsyncLM",
 ]
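With this registration, the MLX backend is selectable through the existing `load_model_by_name` dispatcher. A minimal sketch; the default behavior when `backend` is omitted is unchanged by this diff and not shown:

```python
# Selecting the newly registered backend via the dispatcher shown above.
from genlm.backend.llm import load_model_by_name

# Explicitly request the MLX backend (requires the [mlx] extra and Apple silicon).
llm = load_model_by_name("gpt2", backend="mlx")

# Other registered backends remain available through the same entry point.
mock = load_model_by_name("gpt2", backend="mock")
```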
