Skip to content

Commit df09cb3

Browse files
committed
Change kv_cache as init parameter, fix KV cache dtype issues, and add MPS support
1. KV cache as init parameter - Move `kv_cache` from `fit()` to `__init__()` in both `TabICLClassifier` and `TabICLRegressor`, following scikit-learn convention that all configuration belongs in the constructor. 2. Fix KV cache dtype mismatch - When AMP is enabled, KV projections are computed in float16 and stored in the cache. Loading such a cache on CPU/MPS (or on CUDA without AMP) causes errors - Auto-upcasts float16/bfloat16 cache to float32 when loading on CPU, MPS, or CUDA without AMP, with a `UserWarning` 3. MPS (Apple Silicon) support - Skip auto-batching on MPS in `InferenceManager` - Fix `DiskTensor.__setitem__` to correctly move MPS tensors to CPU before disk write. - Auto-upcast KV cache to float32 on MPS (same as CPU). 4. Update README accordingly
1 parent e3a5ed8 commit df09cb3

File tree

7 files changed

+155
-87
lines changed

7 files changed

+155
-87
lines changed

README.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,11 @@ reg.fit(X_train, y_train)
5959
reg.predict(X_test)
6060
```
6161

62-
To speed up repeated inference on the same training data, enable KV caching during `fit`. Note that this consumes additional memory to store the cached projections, so consider the trade-off
63-
for your use case:
62+
To speed up repeated inference on the same training data, enable KV caching. The cache is built during `fit` and reused across `predict` calls. Note that this consumes additional memory to store the cached projections, so consider the trade-off for your use case:
6463

6564
```python
66-
clf.fit(X_train, y_train, kv_cache=True) # caches key-value projections for training data
65+
clf = TabICLClassifier(kv_cache=True)
66+
clf.fit(X_train, y_train) # caches key-value projections for training data
6767
clf.predict(X_test) # fast: only processes test data by reusing the cached context
6868
```
6969

@@ -99,10 +99,11 @@ clf = TabICLClassifier(
9999
average_logits=True, # average logits (True) or probabilities (False)
100100
support_many_classes=True, # handle >10 classes automatically
101101
batch_size=8, # ensemble members processed together, lower to save memory
102+
kv_cache=False, # cache training data KV projections for faster repeated inference
102103
model_path=None, # path to checkpoint, None downloads from Hugging Face
103104
allow_auto_download=True, # auto-download checkpoint if not found locally
104105
checkpoint_version="tabicl-classifier-v2-20260212.ckpt", # pretrained checkpoint version
105-
device=None, # inference device, None auto-selects CUDA or CPU
106+
device=None, # inference device, None auto-selects CUDA or CPU; specify "mps" for Apple Silicon
106107
use_amp="auto", # automatic mixed precision for faster inference
107108
use_fa3="auto", # Flash Attention 3 for Hopper GPUs (e.g. H100)
108109
offload_mode="auto", # automatically decide when to use cpu/disk offloading

src/tabicl/model/inference.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,7 @@ def __getitem__(self, indices) -> Tensor:
369369

370370
def __setitem__(self, indices, value: Tensor) -> None:
371371
"""Write to the tensor (automatically persists to disk)."""
372-
if value.device.type != "cpu":
372+
if not value.is_cpu:
373373
value = value.cpu()
374374
self._tensor[indices] = value
375375

@@ -1128,8 +1128,8 @@ def __call__(
11281128
if not auto_batch:
11291129
return self._run_forward(forward_fn, self._prepare_inputs(inputs))
11301130

1131-
# CPU execution: batching not supported currently
1132-
if self.exe_device.type == "cpu":
1131+
# CPU/MPS execution: batching not supported (requires CUDA memory APIs)
1132+
if self.exe_device.type in ("cpu", "mps"):
11331133
return forward_fn(**inputs)
11341134

11351135
# Extract shape/dtype info

src/tabicl/model/kv_cache.py

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,17 @@ def __setitem__(self, indices, other: KVCacheEntry):
5252
self.key[indices] = other.key
5353
self.value[indices] = other.value
5454

55-
def to(self, device) -> KVCacheEntry:
56-
"""Move this entry to the given device. Returns a new KVCacheEntry."""
55+
def to(self, device, dtype=None) -> KVCacheEntry:
56+
"""Move this entry to the given device and optionally cast dtype.
57+
58+
Returns a new KVCacheEntry.
59+
"""
5760
if not self.is_valid():
5861
return KVCacheEntry()
59-
return KVCacheEntry(key=self.key.to(device), value=self.value.to(device))
62+
return KVCacheEntry(
63+
key=self.key.to(device=device, dtype=dtype),
64+
value=self.value.to(device=device, dtype=dtype),
65+
)
6066

6167
@staticmethod
6268
def concat(entries: List[KVCacheEntry], dim: int = 0) -> KVCacheEntry:
@@ -117,16 +123,16 @@ def __setitem__(self, indices, other: KVCache):
117123
"""Write batch-sliced entries into this pre-allocated cache."""
118124
for idx, other_entry in other.kv.items():
119125
if idx in self.kv:
120-
assert self.kv[idx].is_valid(), f"Cannot write to cache index {idx} because it is not valid."
121-
device = self.kv[idx].key.device
122-
self.kv[idx][indices] = other_entry.to(device)
126+
target = self.kv[idx]
127+
assert target.is_valid(), f"Cannot write to cache index {idx} because it is not valid."
128+
self.kv[idx][indices] = other_entry.to(target.key.device, dtype=target.key.dtype)
123129

124-
def to(self, device) -> KVCache:
125-
"""Move all entries to the given device.
130+
def to(self, device, dtype=None) -> KVCache:
131+
"""Move all entries to the given device and optionally cast dtype.
126132
127133
Returns a new cache of the same subclass type.
128134
"""
129-
moved_kv = {idx: entry.to(device) for idx, entry in self.kv.items()}
135+
moved_kv = {idx: entry.to(device, dtype=dtype) for idx, entry in self.kv.items()}
130136
return self.__class__(kv=moved_kv)
131137

132138
@staticmethod
@@ -155,7 +161,7 @@ def concat(caches: List[KVCache], dim: int = 0) -> KVCache:
155161
merged_kv[idx] = KVCacheEntry.concat(entries, dim=dim)
156162
return KVCache(kv=merged_kv)
157163

158-
def preallocate(self, reference: KVCache, batch_shape: tuple, device="cpu"):
164+
def preallocate(self, reference: KVCache, batch_shape: tuple, device="cpu", dtype=None):
159165
"""Pre-allocate entries in this cache based on shapes from a reference.
160166
161167
K/V tensors always have shape ``(*batch, num_heads, seq_len, head_dim)``.
@@ -173,14 +179,19 @@ def preallocate(self, reference: KVCache, batch_shape: tuple, device="cpu"):
173179
174180
device : str or torch.device
175181
Device on which to allocate the tensors.
182+
183+
dtype : torch.dtype or None
184+
Data type for the allocated tensors. If None, uses the reference
185+
entry's dtype.
176186
"""
177187
for idx, ref_entry in reference.kv.items():
178188
if ref_entry.is_valid():
189+
target_dtype = dtype if dtype is not None else ref_entry.key.dtype
179190
key_shape = batch_shape + ref_entry.key.shape[-3:]
180191
value_shape = batch_shape + ref_entry.value.shape[-3:]
181192
self.kv[idx] = KVCacheEntry(
182-
key=torch.zeros(key_shape, dtype=ref_entry.key.dtype, device=device),
183-
value=torch.zeros(value_shape, dtype=ref_entry.value.dtype, device=device),
193+
key=torch.zeros(key_shape, dtype=target_dtype, device=device),
194+
value=torch.zeros(value_shape, dtype=target_dtype, device=device),
184195
)
185196

186197

@@ -293,23 +304,26 @@ def slice_batch(self, start: int, end: int) -> TabICLCache:
293304
num_classes=self.num_classes,
294305
)
295306

296-
def to(self, device) -> TabICLCache:
297-
"""Move all cached tensors to the given device.
307+
def to(self, device, dtype=None) -> TabICLCache:
308+
"""Move all cached tensors to the given device and optionally cast dtype.
298309
299310
Parameters
300311
----------
301312
device : str or torch.device
302313
Target device (e.g. ``'cpu'``, ``'cuda:0'``).
303314
315+
dtype : torch.dtype or None
316+
Target dtype. If None, preserves the existing dtype.
317+
304318
Returns
305319
-------
306320
TabICLCache
307321
New cache with all tensors on the target device.
308322
"""
309323
return TabICLCache(
310-
col_cache=self.col_cache.to(device) if self.col_cache else KVCache(),
311-
row_repr=self.row_repr.to(device) if self.row_repr is not None else None,
312-
icl_cache=self.icl_cache.to(device) if self.icl_cache else KVCache(),
324+
col_cache=self.col_cache.to(device, dtype=dtype) if self.col_cache else KVCache(),
325+
row_repr=self.row_repr.to(device=device, dtype=dtype) if self.row_repr is not None else None,
326+
icl_cache=self.icl_cache.to(device, dtype=dtype) if self.icl_cache else KVCache(),
313327
train_shape=self.train_shape,
314328
num_classes=self.num_classes,
315329
)

src/tabicl/sklearn/base.py

Lines changed: 41 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,43 @@ def _build_inference_config(self) -> None:
158158
else:
159159
self.inference_config_ = self.inference_config
160160

161+
def _move_cache_to_device(self) -> None:
162+
"""Move KV cache to the current device, auto-upcasting if needed.
163+
164+
When the cache contains reduced-precision tensors (float16/bfloat16)
165+
and the target environment cannot use them directly (CPU, MPS, or
166+
CUDA without AMP), the tensors are upcast to float32 and a warning
167+
is emitted.
168+
"""
169+
if not (hasattr(self, "model_kv_cache_") and self.model_kv_cache_ is not None):
170+
return
171+
172+
use_amp, _ = self._resolve_amp_fa3()
173+
# CPU and MPS do not support float16 attention; CUDA needs AMP on
174+
needs_upcast = self.device_.type in ("cpu", "mps") or not use_amp
175+
upcast_dtype = torch.float32 if needs_upcast else None
176+
177+
# Warn once if we are actually upcasting reduced-precision tensors
178+
if upcast_dtype is not None:
179+
first_cache = next(iter(self.model_kv_cache_.values()))
180+
cache_dtype = next(iter(first_cache.col_cache.kv.values())).key.dtype
181+
if cache_dtype != torch.float32:
182+
if self.device_.type in ("cpu", "mps"):
183+
reason = f"{self.device_.type.upper()} does not support float16/bfloat16 attention"
184+
else:
185+
reason = "AMP is not enabled"
186+
warnings.warn(
187+
f"KV cache contains {cache_dtype} tensors (typically from AMP). "
188+
f"Automatically upcasting to float32 because {reason}.",
189+
UserWarning,
190+
stacklevel=3,
191+
)
192+
193+
device_cache = OrderedDict()
194+
for method, cache in self.model_kv_cache_.items():
195+
device_cache[method] = cache.to(self.device_, dtype=upcast_dtype)
196+
self.model_kv_cache_ = device_cache
197+
161198
def __getstate__(self):
162199
"""Customize pickle serialization.
163200
@@ -277,12 +314,8 @@ def __setstate__(self, state):
277314
# Reconstruct inference config
278315
self._build_inference_config()
279316

280-
# Move KV cache to device
281-
if hasattr(self, "model_kv_cache_") and self.model_kv_cache_ is not None:
282-
device_cache = OrderedDict()
283-
for method, cache in self.model_kv_cache_.items():
284-
device_cache[method] = cache.to(self.device_)
285-
self.model_kv_cache_ = device_cache
317+
# Move KV cache to device, auto-upcasting if needed
318+
self._move_cache_to_device()
286319

287320
def save(
288321
self,
@@ -332,7 +365,7 @@ def save(
332365
if not save_training_data and not (save_kv_cache and has_kv_cache):
333366
raise ValueError(
334367
"Cannot exclude training data when KV cache is not available or not being saved. "
335-
"Either set save_training_data=True, or fit with kv_cache=True and set save_kv_cache=True."
368+
"Either set save_training_data=True, or set kv_cache=True during init and save_kv_cache=True."
336369
)
337370

338371
# Set temporary flags for __getstate__
@@ -376,11 +409,7 @@ def load(cls, path: str | Path, device: Optional[str | torch.device] = None) ->
376409
obj._resolve_device()
377410
obj.model_.to(obj.device_)
378411
obj._build_inference_config()
379-
if hasattr(obj, "model_kv_cache_") and obj.model_kv_cache_ is not None:
380-
device_cache = OrderedDict()
381-
for method, cache in obj.model_kv_cache_.items():
382-
device_cache[method] = cache.to(obj.device_)
383-
obj.model_kv_cache_ = device_cache
412+
obj._move_cache_to_device()
384413

385414
return obj
386415

src/tabicl/sklearn/classifier.py

Lines changed: 35 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,23 @@ class TabICLClassifier(ClassifierMixin, TabICLBaseEstimator):
8484
Adjust this parameter based on available memory. Lower values use less memory but may
8585
be slower.
8686
87+
kv_cache : bool or str, default=False
88+
Controls caching of training data computations to speed up subsequent
89+
``predict_proba``/``predict`` calls. The cache is built during ``fit()``.
90+
91+
- False: No caching.
92+
- True or "kv": Cache key-value projections from both column embedding
93+
and ICL transformer layers. Fast inference but memory-heavy for large
94+
training sets.
95+
- "repr": Cache column embedding KV projections and row interaction outputs
96+
(representations). Uses ~24x less memory than "kv" for the ICL part,
97+
at the cost of re-running the ICL transformer at predict time.
98+
99+
The cache retains whatever dtype the model produced during ``fit()``
100+
(float16 when AMP is active, float32 otherwise). If the cache is later
101+
loaded on CPU or on CUDA without AMP, the tensors are automatically
102+
upcast to float32 to avoid dtype-mismatch errors.
103+
87104
model_path : Optional[str | Path] = None
88105
Path to the pre-trained model checkpoint file.
89106
- If provided and the file exists, it's loaded directly.
@@ -108,8 +125,10 @@ class TabICLClassifier(ClassifierMixin, TabICLBaseEstimator):
108125
- `'tabicl-classifier-v1-20250208.ckpt'`: The version used in our TabICLv1 paper.
109126
110127
device : Optional[str or torch.device], default=None
111-
Device to use for inference. If None, defaults to CUDA if available, else CPU.
112-
Can be specified as a string ('cuda', 'cpu') or a torch.device object.
128+
Device to use for inference. If None, automatically selects CUDA if
129+
available, otherwise CPU. Can be specified as a string (``'cuda'``,
130+
``'cpu'``, ``'mps'``) or a ``torch.device`` object. MPS (Apple Silicon
131+
GPU) is supported but must be explicitly requested.
113132
114133
use_amp : bool or "auto", default="auto"
115134
Controls automatic mixed precision (AMP) for inference.
@@ -237,14 +256,14 @@ class TabICLClassifier(ClassifierMixin, TabICLBaseEstimator):
237256
The inference configuration.
238257
239258
cache_mode_ : str or None
240-
The caching mode used when ``fit()`` was called with ``kv_cache``.
241-
One of ``"kv"``, ``"repr"``, or ``None`` (when no caching is used).
259+
The resolved caching mode, set during ``fit()`` based on the ``kv_cache``
260+
init parameter. One of ``"kv"``, ``"repr"``, or ``None`` (no caching).
242261
243262
model_kv_cache_ : OrderedDict[str, TabICLCache] or None
244263
Pre-computed KV caches for training data, keyed by normalization method.
245-
Created when ``fit()`` is called with ``kv_cache=True``. When set, ``predict_proba()``
246-
reuses the cached key-value projections instead of re-processing training data,
247-
enabling faster inference on multiple test sets.
264+
Created during ``fit()`` when ``kv_cache`` is enabled. When set,
265+
``predict_proba()`` reuses the cached key-value projections instead of
266+
re-processing training data, enabling faster inference on multiple test sets.
248267
"""
249268

250269
def __init__(
@@ -258,6 +277,7 @@ def __init__(
258277
average_logits: bool = True,
259278
support_many_classes: bool = True,
260279
batch_size: Optional[int] = 8,
280+
kv_cache: bool | str = False,
261281
model_path: Optional[str | Path] = None,
262282
allow_auto_download: bool = True,
263283
checkpoint_version: str = "tabicl-classifier-v2-20260212.ckpt",
@@ -280,6 +300,7 @@ def __init__(
280300
self.average_logits = average_logits
281301
self.support_many_classes = support_many_classes
282302
self.batch_size = batch_size
303+
self.kv_cache = kv_cache
283304
self.model_path = model_path
284305
self.allow_auto_download = allow_auto_download
285306
self.checkpoint_version = checkpoint_version
@@ -386,7 +407,7 @@ def _load_model(self) -> None:
386407
self.model_.load_state_dict(checkpoint["state_dict"])
387408
self.model_.eval()
388409

389-
def fit(self, X: np.ndarray, y: np.ndarray, kv_cache: bool | str = False) -> TabICLClassifier:
410+
def fit(self, X: np.ndarray, y: np.ndarray) -> TabICLClassifier:
390411
"""Fit the classifier to training data.
391412
392413
Prepares the model for prediction by:
@@ -395,6 +416,7 @@ def fit(self, X: np.ndarray, y: np.ndarray, kv_cache: bool | str = False) -> Tab
395416
3. Fitting the ensemble generator to create transformed dataset views
396417
4. Loading the pre-trained TabICL model
397418
5. Optionally pre-computing KV caches for training data to speed up inference
419+
(controlled by the ``kv_cache`` init parameter)
398420
399421
The model itself is not trained on the data; it uses in-context learning
400422
at inference time.
@@ -407,17 +429,6 @@ def fit(self, X: np.ndarray, y: np.ndarray, kv_cache: bool | str = False) -> Tab
407429
y : array-like of shape (n_samples,)
408430
Training target labels.
409431
410-
kv_cache : bool or str, default=False
411-
Controls caching of training data computations to speed up subsequent
412-
``predict_proba``/``predict`` calls.
413-
- False: No caching.
414-
- True or "kv": Cache key-value projections from both column embedding
415-
and ICL transformer layers. Fast inference but memory-heavy for large
416-
training sets.
417-
- "repr": Cache column embedding KV projections and row interaction outputs
418-
(representations). Uses ~24x less memory than "kv" for the ICL part,
419-
at the cost of re-running the ICL transformer at predict time.
420-
421432
Returns
422433
-------
423434
self : TabICLClassifier
@@ -454,7 +465,7 @@ def fit(self, X: np.ndarray, y: np.ndarray, kv_cache: bool | str = False) -> Tab
454465
self.n_classes_ = len(self.y_encoder_.classes_)
455466

456467
if self.n_classes_ > self.model_.max_classes:
457-
if kv_cache:
468+
if self.kv_cache:
458469
raise ValueError(
459470
f"KV caching is not supported when the number of classes ({self.n_classes_}) exceeds the max number "
460471
f"of classes ({self.model_.max_classes}) natively supported by the model."
@@ -491,13 +502,13 @@ def fit(self, X: np.ndarray, y: np.ndarray, kv_cache: bool | str = False) -> Tab
491502
self.ensemble_generator_.fit(X, y)
492503

493504
self.model_kv_cache_ = None
494-
if kv_cache:
495-
if kv_cache is True or kv_cache == "kv":
505+
if self.kv_cache:
506+
if self.kv_cache is True or self.kv_cache == "kv":
496507
self.cache_mode_ = "kv"
497-
elif kv_cache == "repr":
508+
elif self.kv_cache == "repr":
498509
self.cache_mode_ = "repr"
499510
else:
500-
raise ValueError(f"Invalid kv_cache value '{kv_cache}'. Expected False, True, 'kv', or 'repr'.")
511+
raise ValueError(f"Invalid kv_cache value '{self.kv_cache}'. Expected False, True, 'kv', or 'repr'.")
501512
self._build_kv_cache()
502513

503514
return self

0 commit comments

Comments
 (0)