From b877fb27604be66c9ca87de11a88524e6cc5d7f9 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier
Date: Wed, 11 Jun 2025 09:03:59 -0400
Subject: [PATCH 1/7] stuff

---
 Dockerfile |  9 ++++++++-
 setup.cfg  | 11 +++++------
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 8c2efa85e..983d785ea 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
 # syntax=docker/dockerfile:1.7-labs
-FROM nvcr.io/nvidia/pytorch:24.11-py3
+FROM nvcr.io/nvidia/pytorch:25.05-py3

 # Install dependencies.
 RUN apt-get update \
@@ -24,6 +24,13 @@ RUN mkdir -m 777 /app/Megatron-LM /app/examples /app/fast_llm /app/tests /app/to
     /usr/local/lib/python3.12/dist-packages \
     /usr/local/lib/python3.12/dist-packages/__pycache__

+# The base image enforces versions for things like pytest for no good reason.
+ENV PIP_CONSTRAINT=""
+# There is no pre-built mamba wheel for pytorch 2.8, so we build it before the rest to avoid rebuilds.
+# We need to compile from the repo because of https://github.com/state-spaces/mamba/issues/720
+# We set the number of workers to avoid OOM when compiling on a laptop. (TODO: Can we make it configurable?)
+RUN MAX_JOBS=4 pip install --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
+
 # Copy dependency files with universal write permissions for all users.
 COPY --chmod=777 setup.py setup.cfg pyproject.toml ./
 COPY --chmod=777 ./fast_llm/__init__.py fast_llm/
diff --git a/setup.cfg b/setup.cfg
index 381225bf8..fac372eb8 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -17,13 +17,13 @@ install_requires =
 # FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE pip install -e ".[CORE]" --no-build-isolation
 CORE =
     # Available through the nvidia base image
-    torch>=2.5.0
+    torch>=2.6.0
     # Numpy major needs to match torch
-    numpy>=1.24.4,<2.0.0
+    numpy>=1.26.4,<2.0.0
     # Used for checkpoints
     safetensors>=0.4.4
     # Update the base image (version fixed to ensure there is a wheel for the base image), may need --no-build-isolation
-    flash-attn==2.7.2.post1
+    flash-attn==2.7.3
     mamba_ssm[causal-conv1d]==2.2.4
@@ -41,17 +41,16 @@ OPTIONAL =
     omegaconf>=2.3.0
     # Miscellaneous
     requests>=2.32.3
-    tqdm>=4.66.3
+    tqdm>=4.67.1

 DEV =
     # Pre-commit git hook
     pre-commit>=4.0.1
     # Required for testing
     pytest>=8.3.2
-    pytest-depends>=1.0.1
     pytest-xdist>=3.6.1
     # Somehow needed for Megatron to work with base image 24.11
-    setuptools>=75.6.0
+    setuptools>=78.1.1

 # Required for building the documentation
 DOCS =
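
Note: patch 1 is mostly version pins bumped for the 25.05 base image. To check whether an existing environment already satisfies the new pins, a minimal sketch using packaging (already in install_requires); the pin list is an illustrative subset, not the full extras:

    from importlib.metadata import PackageNotFoundError, version

    from packaging.specifiers import SpecifierSet

    # Illustrative subset of the pins bumped in this patch.
    PINS = {
        "torch": ">=2.6.0",
        "numpy": ">=1.26.4,<2.0.0",
        "flash-attn": "==2.7.3",
        "setuptools": ">=78.1.1",
    }

    for name, spec in PINS.items():
        try:
            installed = version(name)
        except PackageNotFoundError:
            print(f"{name}: not installed")
            continue
        # prereleases=True matters for NGC builds such as torch 2.8.0a0.
        ok = SpecifierSet(spec).contains(installed, prereleases=True)
        print(f"{name} {installed} satisfies {spec!r}: {ok}")
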
From 907aef09ad944a3741ff184f36923c7cd7bb84af Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier
Date: Wed, 11 Jun 2025 09:45:29 -0400
Subject: [PATCH 2/7] attempt

---
 Dockerfile | 2 +-
 setup.cfg  | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 983d785ea..ae6625d07 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,7 +29,7 @@ ENV PIP_CONSTRAINT=""
 # There is no pre-built mamba wheel for pytorch 2.8, so we build it before the rest to avoid rebuilds.
 # We need to compile from the repo because of https://github.com/state-spaces/mamba/issues/720
 # We set the number of workers to avoid OOM when compiling on a laptop. (TODO: Can we make it configurable?)
-RUN MAX_JOBS=4 pip install --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
+RUN MAX_JOBS=4 pip install --no-build-isolation "mamba_ssm[causal-conv1d]@git+https://github.com/state-spaces/mamba@v2.2.4"

 # Copy dependency files with universal write permissions for all users.
 COPY --chmod=777 setup.py setup.cfg pyproject.toml ./
diff --git a/setup.cfg b/setup.cfg
index fac372eb8..c0a7d57b6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -17,14 +17,15 @@ install_requires =
 # FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE pip install -e ".[CORE]" --no-build-isolation
 CORE =
     # Available through the nvidia base image
-    torch>=2.6.0
+    torch>=2.7.0
     # Numpy major needs to match torch
     numpy>=1.26.4,<2.0.0
     # Used for checkpoints
     safetensors>=0.4.4
     # Update the base image (version fixed to ensure there is a wheel for the base image), may need --no-build-isolation
     flash-attn==2.7.3
-    mamba_ssm[causal-conv1d]==2.2.4
+    # mamba_ssm[causal-conv1d]==2.2.4  # Removed here because we need to compile from GitHub.
+    mamba_ssm[causal-conv1d]@git+https://github.com/state-spaces/mamba@v2.2.4


 # Required for some optional features and tools.
@@ -48,6 +49,7 @@ DEV =
     pre-commit>=4.0.1
     # Required for testing
     pytest>=8.3.2
+    pytest-depends>=1.0.1
     pytest-xdist>=3.6.1
     # Somehow needed for Megatron to work with base image 24.11
     setuptools>=78.1.1

From 1340903d5b31c8f1fc0c6afb9171b6f119f3c7a4 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier
Date: Wed, 11 Jun 2025 11:56:45 -0400
Subject: [PATCH 3/7] attempt

---
 Dockerfile | 4 ++--
 setup.cfg  | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index ae6625d07..05c3870c5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -27,10 +27,10 @@ RUN mkdir -m 777 /app/Megatron-LM /app/examples /app/fast_llm /app/tests /app/to
 # The base image enforces versions for things like pytest for no good reason.
 ENV PIP_CONSTRAINT=""
 # There is no pre-built mamba wheel for pytorch 2.8, so we build it before the rest to avoid rebuilds.
-# We need to compile from the repo because of https://github.com/state-spaces/mamba/issues/720
+# We need to compile from the repo because of https://github.com/state-spaces/mamba/issues/720 (same for causal-conv1d)
 # We set the number of workers to avoid OOM when compiling on a laptop. (TODO: Can we make it configurable?)
+RUN MAX_JOBS=4 pip install --no-build-isolation "causal-conv1d@git+https://github.com/Dao-AILab/causal-conv1d.git@v1.5.0.post8"
 RUN MAX_JOBS=4 pip install --no-build-isolation "mamba_ssm[causal-conv1d]@git+https://github.com/state-spaces/mamba@v2.2.4"
-
 # Copy dependency files with universal write permissions for all users.
 COPY --chmod=777 setup.py setup.cfg pyproject.toml ./
 COPY --chmod=777 ./fast_llm/__init__.py fast_llm/
diff --git a/setup.cfg b/setup.cfg
index c0a7d57b6..3345ff73a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -24,8 +24,7 @@ CORE =
     safetensors>=0.4.4
     # Update the base image (version fixed to ensure there is a wheel for the base image), may need --no-build-isolation
     flash-attn==2.7.3
-    # mamba_ssm[causal-conv1d]==2.2.4  # Removed here because we need to compile from GitHub.
-    mamba_ssm[causal-conv1d]@git+https://github.com/state-spaces/mamba@v2.2.4
+    mamba_ssm[causal-conv1d]==2.2.4


 # Required for some optional features and tools.
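
Note: patches 2 and 3 go back and forth on PEP 508 direct references (name[extra]@git+URL): patch 2 moves the git pin into setup.cfg, patch 3 reverts to the released pin and keeps the git builds in the Dockerfile only. For reference, a small sketch of how such a requirement string decomposes, using packaging (already a dependency; illustration only):

    from packaging.requirements import Requirement

    r = Requirement("mamba_ssm[causal-conv1d]@git+https://github.com/state-spaces/mamba@v2.2.4")
    print(r.name)    # mamba_ssm
    print(r.extras)  # {'causal-conv1d'}
    print(r.url)     # git+https://github.com/state-spaces/mamba@v2.2.4
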
From 13e1da5c9d91658ba9941a2d03d91d21e668143b Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier
Date: Thu, 12 Jun 2025 10:41:22 -0400
Subject: [PATCH 4/7] fix

---
 fast_llm/functional/triton/mlp.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/fast_llm/functional/triton/mlp.py b/fast_llm/functional/triton/mlp.py
index ee3ba304c..ab408368f 100644
--- a/fast_llm/functional/triton/mlp.py
+++ b/fast_llm/functional/triton/mlp.py
@@ -25,9 +25,6 @@ from fast_llm.functional.triton.sparse_linear import output_sparse_matmul
 from fast_llm.tensor import param_get_and_unset_is_zero

-# Triton requires global variables to be annotated with `constexpr`.
-_TritonActivationType: tl_constexpr = ActivationType
-

 @triton_jit()
 def triton_mlp_activation_forward_kernel(
@@ -50,18 +47,19 @@
     input_ = tl.load(input_ptr, mask=mask).to(tl.float32)

-    if activation_type == _TritonActivationType.gelu:
+    # Triton doesn't like enums, so we use str instead of ActivationType.
+    if activation_type == "gelu":
         tanh_input = 0.79788456 * input_ * (1 + 0.044715 * input_ * input_)
         tanh = 1 - 2 / (1 + tl.exp(2 * tanh_input))
         out = input_ * 0.5 * (1.0 + tanh)
-    elif activation_type == _TritonActivationType.silu:
+    elif activation_type == "silu":
         out = input_ / (1 + tl.exp(-input_))
-    elif activation_type == _TritonActivationType.relu:
+    elif activation_type == "relu":
         out = tl.where(input_ > 0, input_, 0)
-    elif activation_type == _TritonActivationType.squared_relu:
+    elif activation_type == "squared_relu":
         relu_out = tl.where(input_ > 0, input_, 0)
         out = relu_out * relu_out
-    elif activation_type == _TritonActivationType.identity:
+    elif activation_type == "identity":
         out = input_
     else:
         tl.static_assert(False, activation_type)
@@ -100,28 +98,29 @@
     input_ = tl.load(input_ptr, mask=mask).to(tl.float32)
     output_grad = tl.load(grad_output_ptr + output_offsets, mask=mask).to(tl.float32)

-    if activation_type == _TritonActivationType.gelu:
+    # Triton doesn't like enums, so we use str instead of ActivationType.
+    if activation_type == "gelu":
         tanh_input = 0.79788456 * input_ * (1 + 0.044715 * input_ * input_)
         tanh = 1 - 2 / (1 + tl.exp(2 * tanh_input))
         grad = 0.5 * input_ * ((1 - tanh * tanh) * (0.79788456 + 0.1070322243 * input_ * input_)) + 0.5 * (1 + tanh)
         if gated or recompute:
             out = input_ * 0.5 * (1.0 + tanh)
-    elif activation_type == _TritonActivationType.silu:
+    elif activation_type == "silu":
         exp = tl.exp(-input_)
         sigma = 1 / (1 + exp)
         grad = sigma * sigma + (1 + input_) / (2 + exp + 1 / exp)
         if gated or recompute:
             out = input_ * sigma
-    elif activation_type == _TritonActivationType.relu:
+    elif activation_type == "relu":
         grad = tl.where(input_ > 0, 1, 0)
         if gated or recompute:
             out = tl.where(input_ > 0, input_, 0)
-    elif activation_type == _TritonActivationType.squared_relu:
+    elif activation_type == "squared_relu":
         relu_out = tl.where(input_ > 0, input_, 0)
         grad = 2 * relu_out
         if gated or recompute:
             out = relu_out * relu_out
-    elif activation_type == _TritonActivationType.identity:
+    elif activation_type == "identity":
         grad = 1
         if gated or recompute:
             out = input_
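
Note: the constants in the gelu branches are the standard tanh approximation (0.79788456 is roughly sqrt(2/pi)), and the kernels rewrite tanh(t) as 1 - 2 / (1 + exp(2t)). Both the forward value and the hand-derived gradient can be checked against torch outside of Triton; a standalone sketch:

    import torch

    x = torch.randn(1000, dtype=torch.float64, requires_grad=True)

    # Forward formula from the kernel.
    tanh_input = 0.79788456 * x * (1 + 0.044715 * x * x)
    tanh = 1 - 2 / (1 + torch.exp(2 * tanh_input))
    out = x * 0.5 * (1.0 + tanh)
    ref = torch.nn.functional.gelu(x, approximate="tanh")
    print(torch.allclose(out, ref, atol=1e-6))  # True

    # Backward formula from the kernel, checked against autograd.
    grad = 0.5 * x * ((1 - tanh * tanh) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh)
    (ref_grad,) = torch.autograd.grad(ref.sum(), x)
    print(torch.allclose(grad, ref_grad, atol=1e-6))  # True
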
From 0dffe5c46ca31e0b8b1b13dfcbec6d0e712ab2d6 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier
Date: Thu, 12 Jun 2025 14:27:13 -0400
Subject: [PATCH 5/7] fixes

---
 fast_llm/layers/ssm/discrete_mamba2.py | 41 ++++++++++++++++----------
 fast_llm/layers/ssm/mamba_layer.py     | 11 +++++--
 setup.cfg                              | 29 +++++++++---------
 tests/test_ssms.py                     |  2 +-
 4 files changed, 50 insertions(+), 33 deletions(-)

diff --git a/fast_llm/layers/ssm/discrete_mamba2.py b/fast_llm/layers/ssm/discrete_mamba2.py
index 85916244e..ecf0b29d7 100644
--- a/fast_llm/layers/ssm/discrete_mamba2.py
+++ b/fast_llm/layers/ssm/discrete_mamba2.py
@@ -2,7 +2,6 @@
 import math

 import einops
-import mamba_ssm.ops.triton.ssd_combined
 import torch

 from fast_llm.engine.config_utils.tensor_space import TensorDim, TensorSpace
@@ -13,12 +12,22 @@
 logger = logging.getLogger(__name__)

+
 try:
-    import causal_conv1d
+    from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined as _mamba_chunk_scan_combined  # noqa

+    _mamba_available = True
 except ImportError:
-    # this is needed since we cannot use causal_conv1d on B200 GPUs for now
-    logger.warning("Note, causal_conv1d not found, will use torch.nn.functional.conv1d instead")
-    causal_conv1d = None
+    _mamba_available = False
+
+
+try:
+    from causal_conv1d import causal_conv1d_fn as _causal_conv1d_fn  # noqa
+
+    _causal_conv1d_available = True
+except ImportError:
+    _causal_conv1d_available = False
+

 """
 This code is adapted from https://github.com/cartesia-ai/edge/blob/main/cartesia-pytorch/cartesia_pytorch/Llamba/mixers/discrete_mamba2.py
@@ -148,6 +157,8 @@ def forward(self, hidden_states, kwargs):
             outputs["hidden_states"]: (B, L, D).
             outputs["state"]: inference cache.
         """
+
+        assert _mamba_available
         input_ = hidden_states
         outputs = {}
         # assert state is None
@@ -201,7 +212,7 @@
         C = einops.rearrange(C, "b l (h n) -> b l h n", h=self.n_qk_heads)

         # SSM forward
-        result = mamba_ssm.ops.triton.ssd_combined.mamba_chunk_scan_combined(
+        result = _mamba_chunk_scan_combined(
             x=x / torch.nn.functional.softplus(A_log).to(x.dtype).unsqueeze(-1),
             dt=A_log,
             dt_softplus=True,
@@ -234,11 +245,18 @@

     def convolutional_forward(self, xBC, padded_len):
         """Convolutional layer forward pass for the full sequence."""
-        if causal_conv1d is None or self.activation_name not in [
+        if _causal_conv1d_available and self.activation_name in (
             "silu",
             "swish",
             "identity",
-        ]:
+        ):
+            xBC = _causal_conv1d_fn(
+                xBC.transpose(1, 2),
+                einops.rearrange(self.conv1d_weight, "d 1 w -> d w"),
+                self.conv1d_bias,
+                activation=None if self.activation_name == "identity" else self.activation_name,
+            ).transpose(1, 2)
+        else:
             xBC = self.act(
                 torch.nn.functional.conv1d(
                     xBC.transpose(1, 2),
@@ -248,11 +266,4 @@
                     padding=self.conv_kernel_size - 1,
                 )[..., :padded_len].transpose(1, 2)
             )
-        else:
-            xBC = causal_conv1d.causal_conv1d_fn(
-                xBC.transpose(1, 2),
-                einops.rearrange(self.conv1d_weight, "d 1 w -> d w"),
-                self.conv1d_bias,
-                activation=None if self.activation_name == "identity" else self.activation_name,
-            ).transpose(1, 2)
         return xBC
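
Note: the else branch above is the pure-torch fallback used when causal-conv1d is unavailable (e.g. on B200): a depthwise conv1d that is left-padded by conv_kernel_size - 1 and trimmed back to the original length, which keeps it causal. A standalone sketch of that property (shapes are illustrative, not the module's actual dimensions):

    import torch

    d, k, length = 8, 4, 32
    x = torch.randn(1, d, length)  # (batch, channels, time)
    weight = torch.randn(d, 1, k)  # depthwise: one filter per channel
    bias = torch.randn(d)

    y = torch.nn.functional.conv1d(x, weight, bias=bias, groups=d, padding=k - 1)[..., :length]

    # Causality: zeroing future inputs must not change past outputs.
    x2 = x.clone()
    x2[..., 16:] = 0.0
    y2 = torch.nn.functional.conv1d(x2, weight, bias=bias, groups=d, padding=k - 1)[..., :length]
    print(torch.equal(y[..., :16], y2[..., :16]))  # True
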
""" + + assert _mamba_available input_ = hidden_states outputs = {} # assert state is None @@ -201,7 +212,7 @@ def forward(self, hidden_states, kwargs): C = einops.rearrange(C, "b l (h n) -> b l h n", h=self.n_qk_heads) # SSM forward - result = mamba_ssm.ops.triton.ssd_combined.mamba_chunk_scan_combined( + result = _mamba_chunk_scan_combined( x=x / torch.nn.functional.softplus(A_log).to(x.dtype).unsqueeze(-1), dt=A_log, dt_softplus=True, @@ -234,11 +245,18 @@ def forward(self, hidden_states, kwargs): def convolutional_forward(self, xBC, padded_len): """Convolutional layer forward pass for the full sequence.""" - if causal_conv1d is None or self.activation_name not in [ + if _causal_conv1d_available and self.activation_name in ( "silu", "swish", "identity", - ]: + ): + xBC = _causal_conv1d_fn( + xBC.transpose(1, 2), + einops.rearrange(self.conv1d_weight, "d 1 w -> d w"), + self.conv1d_bias, + activation=None if self.activation_name == "identity" else self.activation_name, + ).transpose(1, 2) + else: xBC = self.act( torch.nn.functional.conv1d( xBC.transpose(1, 2), @@ -248,11 +266,4 @@ def convolutional_forward(self, xBC, padded_len): padding=self.conv_kernel_size - 1, )[..., :padded_len].transpose(1, 2) ) - else: - xBC = causal_conv1d.causal_conv1d_fn( - xBC.transpose(1, 2), - einops.rearrange(self.conv1d_weight, "d 1 w -> d w"), - self.conv1d_bias, - activation=None if self.activation_name == "identity" else self.activation_name, - ).transpose(1, 2) return xBC diff --git a/fast_llm/layers/ssm/mamba_layer.py b/fast_llm/layers/ssm/mamba_layer.py index 7d0ee48a4..7fd437894 100644 --- a/fast_llm/layers/ssm/mamba_layer.py +++ b/fast_llm/layers/ssm/mamba_layer.py @@ -2,7 +2,6 @@ from typing import Callable import einops -import mamba_ssm.ops.selective_scan_interface import torch from fast_llm.engine.config_utils.tensor_space import TensorDim, TensorSpace @@ -11,6 +10,13 @@ from fast_llm.tensor import ParameterMeta, init_ones_, kaiming_init_ from fast_llm.utils import get_lr_scale +try: + from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn as _mamba_inner_fn # noqa + + _mamba_available = True +except ImportError: + _mamba_available = False + """ Note: this is mostly adapted from https://github.com/Zyphra/Zamba2, similar code is also in https://github.com/state-spaces/mamba. For now it only supports training and not inference. 
diff --git a/setup.cfg b/setup.cfg
index 3345ff73a..bc0de459d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -6,10 +6,10 @@ packages = find_namespace:
 include_package_data = True
 python_requires = >=3.12
 install_requires =
-    requests>=2.32.3
-    PyYAML>=6.0.1
-    pybind11>=2.5.0
-    packaging>=24.1
+    requests>=2.32.4
+    PyYAML>=6.0.2
+    pybind11>=2.13.6
+    packaging>=25.0

 [options.extras_require]
 # Required to use the main functionality of Fast-LLM
@@ -21,7 +21,7 @@ CORE =
     # Numpy major needs to match torch
     numpy>=1.26.4,<2.0.0
     # Used for checkpoints
-    safetensors>=0.4.4
+    safetensors>=0.5.3
     # Update the base image (version fixed to ensure there is a wheel for the base image), may need --no-build-isolation
     flash-attn==2.7.3
     mamba_ssm[causal-conv1d]==2.2.4
@@ -30,28 +30,27 @@ CORE =
 # Required for some optional features and tools.
 OPTIONAL =
     # Huggingface tools
-    transformers>=4.44.2
-    hf-transfer>=0.1.8
-    datasets>=3.1.0
-    huggingface-hub>=0.28.1
+    transformers>=4.52.4
+    hf-transfer>=0.1.9
+    datasets>=3.6.0
+    huggingface-hub>=0.32.6
     # Weights and biases
-    wandb>=0.17.7
+    wandb>=0.20.1
     # Hydra
     hydra-core>=1.3.2
     omegaconf>=2.3.0
     # Miscellaneous
-    requests>=2.32.3
     tqdm>=4.67.1

 DEV =
     # Pre-commit git hook
-    pre-commit>=4.0.1
+    pre-commit>=4.2.0
     # Required for testing
-    pytest>=8.3.2
+    pytest>=8.4.0
     pytest-depends>=1.0.1
-    pytest-xdist>=3.6.1
+    pytest-xdist>=3.7.0
     # Somehow needed for Megatron to work with base image 24.11
-    setuptools>=78.1.1
+    setuptools>=80.9.0

 # Required for building the documentation
 DOCS =
diff --git a/tests/test_ssms.py b/tests/test_ssms.py
index f3eb92617..ef5193b67 100644
--- a/tests/test_ssms.py
+++ b/tests/test_ssms.py
@@ -14,6 +14,7 @@
 from fast_llm.engine.schedule.schedule import Schedule
 from fast_llm.layers.language_model.config import LanguageModelKwargs, LanguageModelLossNames
 from fast_llm.layers.ssm.config import SSMBlockType
+from fast_llm.layers.ssm.llamba_block import LlambaBlock
 from fast_llm.layers.transformer.config import TransformerKwargs
 from fast_llm.models.gpt.config import GPTBatchConfig, LlamaGPTHuggingfaceCheckpointFormat
 from fast_llm.models.ssm.config import AprielSSMHHybridHuggingfaceCheckpointFormat, LLambaHuggingfaceCheckpointFormat
@@ -21,7 +22,6 @@

 try:
     from fast_llm.layers.ssm.discrete_mamba2 import DiscreteMamba2
-    from fast_llm.layers.ssm.llamba_block import LlambaBlock
     from fast_llm.layers.ssm.mamba_layer import MambaLayer
     from fast_llm.models.ssm.model import HybridSSMBaseModel, HybridSSMModel
 except Exception:
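
Note: both SSM layers now gate their mamba imports behind module-level availability flags and a bare assert _mamba_available at forward time. If a friendlier failure is ever wanted, one possible variant (a sketch, not the patch's actual code; the install hint assumes the SSM extra introduced in patch 7):

    try:
        from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn as _mamba_inner_fn
    except ImportError:
        _mamba_inner_fn = None


    def _require_mamba() -> None:
        # Checked at forward time so that importing the module stays cheap.
        if _mamba_inner_fn is None:
            raise RuntimeError(
                "mamba_ssm is not installed; try: pip install -e '.[CORE,SSM]' --no-build-isolation"
            )
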
From dcc506464d175407c3d8711e73d05ae3b88c6c41 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier
Date: Thu, 12 Jun 2025 14:30:29 -0400
Subject: [PATCH 6/7] fixes

---
 tests/test_ssms.py | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/tests/test_ssms.py b/tests/test_ssms.py
index ef5193b67..36c7b6229 100644
--- a/tests/test_ssms.py
+++ b/tests/test_ssms.py
@@ -14,24 +14,15 @@
 from fast_llm.engine.schedule.schedule import Schedule
 from fast_llm.layers.language_model.config import LanguageModelKwargs, LanguageModelLossNames
 from fast_llm.layers.ssm.config import SSMBlockType
+from fast_llm.layers.ssm.discrete_mamba2 import DiscreteMamba2
 from fast_llm.layers.ssm.llamba_block import LlambaBlock
+from fast_llm.layers.ssm.mamba_layer import MambaLayer
 from fast_llm.layers.transformer.config import TransformerKwargs
 from fast_llm.models.gpt.config import GPTBatchConfig, LlamaGPTHuggingfaceCheckpointFormat
 from fast_llm.models.ssm.config import AprielSSMHHybridHuggingfaceCheckpointFormat, LLambaHuggingfaceCheckpointFormat
+from fast_llm.models.ssm.model import HybridSSMBaseModel, HybridSSMModel
 from tests.common import get_hybrid_config, materialize_meta_tensors

-try:
-    from fast_llm.layers.ssm.discrete_mamba2 import DiscreteMamba2
-    from fast_llm.layers.ssm.mamba_layer import MambaLayer
-    from fast_llm.models.ssm.model import HybridSSMBaseModel, HybridSSMModel
-except Exception:
-    MambaLayer, LlambaBlock, HybridSSMBaseModel, DiscreteMamba2 = (
-        None,
-        None,
-        None,
-        None,
-    )
-
 try:
     from cartesia_pytorch.Llamba.llamba import LlambaLMHeadModel as LMHeadModel
 except ImportError:
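
Note: with the try/except removed, collecting tests/test_ssms.py in an environment without the mamba stack now fails at import time instead of silently aliasing the classes to None. If a soft skip is ever needed again, pytest's importorskip is the idiomatic module-level guard (a sketch, not part of the patch):

    import pytest

    pytest.importorskip("mamba_ssm", reason="SSM dependencies not installed")

    # Imports below only run when mamba_ssm is importable.
    from fast_llm.layers.ssm.mamba_layer import MambaLayer  # noqa: E402
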
From 9d415bc6f29a083e326d856fcfcc949bdad3b638 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier
Date: Thu, 12 Jun 2025 14:37:21 -0400
Subject: [PATCH 7/7] fixes

---
 .github/workflows/docs.yaml |  2 +-
 Dockerfile                  |  2 +-
 setup.cfg                   | 21 ++++++++++++++-------
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
index 93191972e..b755993ce 100644
--- a/.github/workflows/docs.yaml
+++ b/.github/workflows/docs.yaml
@@ -33,7 +33,7 @@ jobs:
           pip install pybind11
           FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE MAMBA_SKIP_CUDA_BUILD=TRUE \
           MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE \
-          pip install --no-build-isolation -e ".[CORE,OPTIONAL,DEV,DOCS]"
+          pip install --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,DEV,DOCS]"
       - name: Build the documentation
         run: mkdocs build
diff --git a/Dockerfile b/Dockerfile
index 05c3870c5..50810ed1e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -37,7 +37,7 @@ COPY --chmod=777 ./fast_llm/__init__.py fast_llm/
 COPY --chmod=777 ./fast_llm/csrc/ fast_llm/csrc/

 # Install dependencies within the virtual environment.
-RUN pip install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,DEV]"
+RUN pip install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,DEV]"

 # Copy the remaining source code with universal write permissions.
 COPY --chmod=777 ./Megatron-LM Megatron-LM
diff --git a/setup.cfg b/setup.cfg
index bc0de459d..8a446064d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -24,16 +24,10 @@ CORE =
     safetensors>=0.5.3
     # Update the base image (version fixed to ensure there is a wheel for the base image), may need --no-build-isolation
     flash-attn==2.7.3
-    mamba_ssm[causal-conv1d]==2.2.4

-# Required for some optional features and tools.
+# Small packages required for some optional features and tools.
 OPTIONAL =
-    # Huggingface tools
-    transformers>=4.52.4
-    hf-transfer>=0.1.9
-    datasets>=3.6.0
-    huggingface-hub>=0.32.6
     # Weights and biases
     wandb>=0.20.1
     # Hydra
@@ -42,6 +36,19 @@ OPTIONAL =
     # Miscellaneous
    tqdm>=4.67.1

+# Huggingface tools
+HUGGINGFACE =
+    transformers>=4.52.4
+    hf-transfer>=0.1.9
+    datasets>=3.6.0
+    huggingface-hub>=0.32.6
+
+# Required to run SSMs
+# To install in a cpu environment (e.g. for IDE support):
+# MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE pip install -e ".[CORE,SSM]" --no-build-isolation
+SSM =
+    mamba_ssm[causal-conv1d]==2.2.4
+
 DEV =
     # Pre-commit git hook
     pre-commit>=4.2.0
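
Note: after this split, feature sets install individually, e.g. pip install -e ".[CORE,HUGGINGFACE,SSM,DEV]". A sketch of runtime detection for the optional stacks (extras names come from this patch; the probing logic is illustrative):

    import importlib.util

    FEATURES = {
        "HUGGINGFACE": ("transformers", "datasets", "huggingface_hub"),
        "SSM": ("mamba_ssm", "causal_conv1d"),
    }

    for extra, modules in FEATURES.items():
        missing = [m for m in modules if importlib.util.find_spec(m) is None]
        status = "ok" if not missing else f"missing: {missing}"
        print(f"{extra}: {status}")
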