From b877fb27604be66c9ca87de11a88524e6cc5d7f9 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier
Date: Wed, 11 Jun 2025 09:03:59 -0400
Subject: [PATCH 1/7] stuff

---
 Dockerfile |  9 ++++++++-
 setup.cfg  | 11 +++++------
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 8c2efa85e..983d785ea 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
 # syntax=docker/dockerfile:1.7-labs
-FROM nvcr.io/nvidia/pytorch:24.11-py3
+FROM nvcr.io/nvidia/pytorch:25.05-py3

 # Install dependencies.
 RUN apt-get update \
@@ -24,6 +24,13 @@ RUN mkdir -m 777 /app/Megatron-LM /app/examples /app/fast_llm /app/tests /app/to
     /usr/local/lib/python3.12/dist-packages \
     /usr/local/lib/python3.12/dist-packages/__pycache__

+# The base image enforces versions for things like pytest for no good reason.
+ENV PIP_CONSTRAINT=""
+# There is no pre-built mamba wheel for pytorch 2.8, so we build it before the rest to avoid rebuilds.
+# We need to compile from the repo because of https://github.com/state-spaces/mamba/issues/720
+# We set the number of workers to avoid OOM when compiling on a laptop. (TODO: Can we make it configurable?)
+RUN MAX_JOBS=4 pip install --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
+
 # Copy dependency files with universal write permissions for all users.
 COPY --chmod=777 setup.py setup.cfg pyproject.toml ./
 COPY --chmod=777 ./fast_llm/__init__.py fast_llm/
diff --git a/setup.cfg b/setup.cfg
index 381225bf8..fac372eb8 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -17,13 +17,13 @@ install_requires =
 # FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE pip install -e ".[CORE]" --no-build-isolation
 CORE =
     # Available through the nvidia base image
-    torch>=2.5.0
+    torch>=2.6.0
     # Numpy major needs to match torch
-    numpy>=1.24.4,<2.0.0
+    numpy>=1.26.4,<2.0.0
     # Used for checkpoints
     safetensors>=0.4.4
     # Update the base image (version fixed to ensure there is a wheel for the base image), may need --no-build-isolation
-    flash-attn==2.7.2.post1
+    flash-attn==2.7.3
     mamba_ssm[causal-conv1d]==2.2.4
@@ -41,17 +41,16 @@ OPTIONAL =
     omegaconf>=2.3.0
     # Miscellaneous
     requests>=2.32.3
-    tqdm>=4.66.3
+    tqdm>=4.67.1

 DEV =
     # Pre-commit git hook
     pre-commit>=4.0.1
     # Required for testing
     pytest>=8.3.2
-    pytest-depends>=1.0.1
     pytest-xdist>=3.6.1
     # Somehow needed for Megatron to work with base image 24.11
-    setuptools>=75.6.0
+    setuptools>=78.1.1

 # Required for building the documentation
 DOCS =
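
Note: patch 1 is mostly version pins bumped for the 25.05 base image. To check whether an existing environment already satisfies the new pins, a minimal sketch using packaging (already in install_requires); the pin list is an illustrative subset, not the full extras:

    from importlib.metadata import PackageNotFoundError, version

    from packaging.specifiers import SpecifierSet

    # Illustrative subset of the pins bumped in this patch.
    PINS = {
        "torch": ">=2.6.0",
        "numpy": ">=1.26.4,<2.0.0",
        "flash-attn": "==2.7.3",
        "setuptools": ">=78.1.1",
    }

    for name, spec in PINS.items():
        try:
            installed = version(name)
        except PackageNotFoundError:
            print(f"{name}: not installed")
            continue
        # prereleases=True matters for NGC builds such as torch 2.8.0a0.
        ok = SpecifierSet(spec).contains(installed, prereleases=True)
        print(f"{name} {installed} satisfies {spec!r}: {ok}")
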
From 907aef09ad944a3741ff184f36923c7cd7bb84af Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier
Date: Wed, 11 Jun 2025 09:45:29 -0400
Subject: [PATCH 2/7] attempt

---
 Dockerfile | 2 +-
 setup.cfg  | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 983d785ea..ae6625d07 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,7 +29,7 @@ ENV PIP_CONSTRAINT=""
 # There is no pre-built mamba wheel for pytorch 2.8, so we build it before the rest to avoid rebuilds.
 # We need to compile from the repo because of https://github.com/state-spaces/mamba/issues/720
 # We set the number of workers to avoid OOM when compiling on a laptop. (TODO: Can we make it configurable?)
-RUN MAX_JOBS=4 pip install --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
+RUN MAX_JOBS=4 pip install --no-build-isolation "mamba_ssm[causal-conv1d]@git+https://github.com/state-spaces/mamba@v2.2.4"

 # Copy dependency files with universal write permissions for all users.
 COPY --chmod=777 setup.py setup.cfg pyproject.toml ./
diff --git a/setup.cfg b/setup.cfg
index fac372eb8..c0a7d57b6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -17,14 +17,15 @@ install_requires =
 # FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE pip install -e ".[CORE]" --no-build-isolation
 CORE =
     # Available through the nvidia base image
-    torch>=2.6.0
+    torch>=2.7.0
     # Numpy major needs to match torch
     numpy>=1.26.4,<2.0.0
     # Used for checkpoints
     safetensors>=0.4.4
     # Update the base image (version fixed to ensure there is a wheel for the base image), may need --no-build-isolation
     flash-attn==2.7.3
-    mamba_ssm[causal-conv1d]==2.2.4
+    # mamba_ssm[causal-conv1d]==2.2.4  # Removed here because we need to compile from GitHub.
+    mamba_ssm[causal-conv1d]@git+https://github.com/state-spaces/mamba@v2.2.4


 # Required for some optional features and tools.
@@ -48,6 +49,7 @@ DEV =
     pre-commit>=4.0.1
     # Required for testing
     pytest>=8.3.2
+    pytest-depends>=1.0.1
     pytest-xdist>=3.6.1
     # Somehow needed for Megatron to work with base image 24.11
     setuptools>=78.1.1

From 1340903d5b31c8f1fc0c6afb9171b6f119f3c7a4 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier
Date: Wed, 11 Jun 2025 11:56:45 -0400
Subject: [PATCH 3/7] attempt

---
 Dockerfile | 4 ++--
 setup.cfg  | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index ae6625d07..05c3870c5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -27,10 +27,10 @@ RUN mkdir -m 777 /app/Megatron-LM /app/examples /app/fast_llm /app/tests /app/to
 # The base image enforces versions for things like pytest for no good reason.
 ENV PIP_CONSTRAINT=""
 # There is no pre-built mamba wheel for pytorch 2.8, so we build it before the rest to avoid rebuilds.
-# We need to compile from the repo because of https://github.com/state-spaces/mamba/issues/720
+# We need to compile from the repo because of https://github.com/state-spaces/mamba/issues/720 (same for causal-conv1d)
 # We set the number of workers to avoid OOM when compiling on a laptop. (TODO: Can we make it configurable?)
+RUN MAX_JOBS=4 pip install --no-build-isolation "causal-conv1d@git+https://github.com/Dao-AILab/causal-conv1d.git@v1.5.0.post8"
 RUN MAX_JOBS=4 pip install --no-build-isolation "mamba_ssm[causal-conv1d]@git+https://github.com/state-spaces/mamba@v2.2.4"
-
 # Copy dependency files with universal write permissions for all users.
 COPY --chmod=777 setup.py setup.cfg pyproject.toml ./
 COPY --chmod=777 ./fast_llm/__init__.py fast_llm/
diff --git a/setup.cfg b/setup.cfg
index c0a7d57b6..3345ff73a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -24,8 +24,7 @@ CORE =
     safetensors>=0.4.4
     # Update the base image (version fixed to ensure there is a wheel for the base image), may need --no-build-isolation
     flash-attn==2.7.3
-    # mamba_ssm[causal-conv1d]==2.2.4  # Removed here because we need to compile from GitHub.
-    mamba_ssm[causal-conv1d]@git+https://github.com/state-spaces/mamba@v2.2.4
+    mamba_ssm[causal-conv1d]==2.2.4


 # Required for some optional features and tools.
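
Note: patches 2 and 3 go back and forth on PEP 508 direct references (name[extra]@git+URL): patch 2 moves the git pin into setup.cfg, patch 3 reverts to the released pin and keeps the git builds in the Dockerfile only. For reference, a small sketch of how such a requirement string decomposes, using packaging (already a dependency; illustration only):

    from packaging.requirements import Requirement

    r = Requirement("mamba_ssm[causal-conv1d]@git+https://github.com/state-spaces/mamba@v2.2.4")
    print(r.name)    # mamba_ssm
    print(r.extras)  # {'causal-conv1d'}
    print(r.url)     # git+https://github.com/state-spaces/mamba@v2.2.4
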
From 13e1da5c9d91658ba9941a2d03d91d21e668143b Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier
Date: Thu, 12 Jun 2025 10:41:22 -0400
Subject: [PATCH 4/7] fix

---
 fast_llm/functional/triton/mlp.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/fast_llm/functional/triton/mlp.py b/fast_llm/functional/triton/mlp.py
index ee3ba304c..ab408368f 100644
--- a/fast_llm/functional/triton/mlp.py
+++ b/fast_llm/functional/triton/mlp.py
@@ -25,9 +25,6 @@ from fast_llm.functional.triton.sparse_linear import output_sparse_matmul
 from fast_llm.tensor import param_get_and_unset_is_zero

-# Triton requires global variables to be annotated with `constexpr`.
-_TritonActivationType: tl_constexpr = ActivationType
-

 @triton_jit()
 def triton_mlp_activation_forward_kernel(
@@ -50,18 +47,19 @@
     input_ = tl.load(input_ptr, mask=mask).to(tl.float32)

-    if activation_type == _TritonActivationType.gelu:
+    # Triton doesn't like enums, so we use str instead of ActivationType.
+    if activation_type == "gelu":
         tanh_input = 0.79788456 * input_ * (1 + 0.044715 * input_ * input_)
         tanh = 1 - 2 / (1 + tl.exp(2 * tanh_input))
         out = input_ * 0.5 * (1.0 + tanh)
-    elif activation_type == _TritonActivationType.silu:
+    elif activation_type == "silu":
         out = input_ / (1 + tl.exp(-input_))
-    elif activation_type == _TritonActivationType.relu:
+    elif activation_type == "relu":
         out = tl.where(input_ > 0, input_, 0)
-    elif activation_type == _TritonActivationType.squared_relu:
+    elif activation_type == "squared_relu":
         relu_out = tl.where(input_ > 0, input_, 0)
         out = relu_out * relu_out
-    elif activation_type == _TritonActivationType.identity:
+    elif activation_type == "identity":
         out = input_
     else:
         tl.static_assert(False, activation_type)
@@ -100,28 +98,29 @@
     input_ = tl.load(input_ptr, mask=mask).to(tl.float32)
     output_grad = tl.load(grad_output_ptr + output_offsets, mask=mask).to(tl.float32)

-    if activation_type == _TritonActivationType.gelu:
+    # Triton doesn't like enums, so we use str instead of ActivationType.
+    if activation_type == "gelu":
         tanh_input = 0.79788456 * input_ * (1 + 0.044715 * input_ * input_)
         tanh = 1 - 2 / (1 + tl.exp(2 * tanh_input))
         grad = 0.5 * input_ * ((1 - tanh * tanh) * (0.79788456 + 0.1070322243 * input_ * input_)) + 0.5 * (1 + tanh)
         if gated or recompute:
             out = input_ * 0.5 * (1.0 + tanh)
-    elif activation_type == _TritonActivationType.silu:
+    elif activation_type == "silu":
         exp = tl.exp(-input_)
         sigma = 1 / (1 + exp)
         grad = sigma * sigma + (1 + input_) / (2 + exp + 1 / exp)
         if gated or recompute:
             out = input_ * sigma
-    elif activation_type == _TritonActivationType.relu:
+    elif activation_type == "relu":
         grad = tl.where(input_ > 0, 1, 0)
         if gated or recompute:
             out = tl.where(input_ > 0, input_, 0)
-    elif activation_type == _TritonActivationType.squared_relu:
+    elif activation_type == "squared_relu":
         relu_out = tl.where(input_ > 0, input_, 0)
         grad = 2 * relu_out
         if gated or recompute:
             out = relu_out * relu_out
-    elif activation_type == _TritonActivationType.identity:
+    elif activation_type == "identity":
         grad = 1
         if gated or recompute:
             out = input_
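
Note: the constants in the gelu branches are the standard tanh approximation (0.79788456 is roughly sqrt(2/pi)), and the kernels rewrite tanh(t) as 1 - 2 / (1 + exp(2t)). Both the forward value and the hand-derived gradient can be checked against torch outside of Triton; a standalone sketch:

    import torch

    x = torch.randn(1000, dtype=torch.float64, requires_grad=True)

    # Forward formula from the kernel.
    tanh_input = 0.79788456 * x * (1 + 0.044715 * x * x)
    tanh = 1 - 2 / (1 + torch.exp(2 * tanh_input))
    out = x * 0.5 * (1.0 + tanh)
    ref = torch.nn.functional.gelu(x, approximate="tanh")
    print(torch.allclose(out, ref, atol=1e-6))  # True

    # Backward formula from the kernel, checked against autograd.
    grad = 0.5 * x * ((1 - tanh * tanh) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh)
    (ref_grad,) = torch.autograd.grad(ref.sum(), x)
    print(torch.allclose(grad, ref_grad, atol=1e-6))  # True
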
From 0dffe5c46ca31e0b8b1b13dfcbec6d0e712ab2d6 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier
Date: Thu, 12 Jun 2025 14:27:13 -0400
Subject: [PATCH 5/7] fixes

---
 fast_llm/layers/ssm/discrete_mamba2.py | 41 ++++++++++++++++----------
 fast_llm/layers/ssm/mamba_layer.py     | 11 +++++--
 setup.cfg                              | 29 +++++++++---------
 tests/test_ssms.py                     |  2 +-
 4 files changed, 50 insertions(+), 33 deletions(-)

diff --git a/fast_llm/layers/ssm/discrete_mamba2.py b/fast_llm/layers/ssm/discrete_mamba2.py
index 85916244e..ecf0b29d7 100644
--- a/fast_llm/layers/ssm/discrete_mamba2.py
+++ b/fast_llm/layers/ssm/discrete_mamba2.py
@@ -2,7 +2,6 @@
 import math

 import einops
-import mamba_ssm.ops.triton.ssd_combined
 import torch

 from fast_llm.engine.config_utils.tensor_space import TensorDim, TensorSpace
@@ -13,12 +12,22 @@
 logger = logging.getLogger(__name__)

+
 try:
-    import causal_conv1d
+    from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined as _mamba_chunk_scan_combined  # noqa

+    _mamba_available = True
 except ImportError:
-    # this is needed since we cannot use causal_conv1d on B200 GPUs for now
-    logger.warning("Note, causal_conv1d not found, will use torch.nn.functional.conv1d instead")
-    causal_conv1d = None
+    _mamba_available = False
+
+
+try:
+    from causal_conv1d import causal_conv1d_fn as _causal_conv1d_fn  # noqa
+
+    _causal_conv1d_available = True
+except ImportError:
+    _causal_conv1d_available = False
+

 """
 This code is adapted from https://github.com/cartesia-ai/edge/blob/main/cartesia-pytorch/cartesia_pytorch/Llamba/mixers/discrete_mamba2.py
@@ -148,6 +157,8 @@ def forward(self, hidden_states, kwargs):
             outputs["hidden_states"]: (B, L, D).
             outputs["state"]: inference cache.
         """
+
+        assert _mamba_available
         input_ = hidden_states
         outputs = {}
         # assert state is None
@@ -201,7 +212,7 @@
         C = einops.rearrange(C, "b l (h n) -> b l h n", h=self.n_qk_heads)

         # SSM forward
-        result = mamba_ssm.ops.triton.ssd_combined.mamba_chunk_scan_combined(
+        result = _mamba_chunk_scan_combined(
             x=x / torch.nn.functional.softplus(A_log).to(x.dtype).unsqueeze(-1),
             dt=A_log,
             dt_softplus=True,
@@ -234,11 +245,18 @@

     def convolutional_forward(self, xBC, padded_len):
         """Convolutional layer forward pass for the full sequence."""
-        if causal_conv1d is None or self.activation_name not in [
+        if _causal_conv1d_available and self.activation_name in (
             "silu",
             "swish",
             "identity",
-        ]:
+        ):
+            xBC = _causal_conv1d_fn(
+                xBC.transpose(1, 2),
+                einops.rearrange(self.conv1d_weight, "d 1 w -> d w"),
+                self.conv1d_bias,
+                activation=None if self.activation_name == "identity" else self.activation_name,
+            ).transpose(1, 2)
+        else:
             xBC = self.act(
                 torch.nn.functional.conv1d(
                     xBC.transpose(1, 2),
@@ -248,11 +266,4 @@
                     padding=self.conv_kernel_size - 1,
                 )[..., :padded_len].transpose(1, 2)
             )
-        else:
-            xBC = causal_conv1d.causal_conv1d_fn(
-                xBC.transpose(1, 2),
-                einops.rearrange(self.conv1d_weight, "d 1 w -> d w"),
-                self.conv1d_bias,
-                activation=None if self.activation_name == "identity" else self.activation_name,
-            ).transpose(1, 2)
         return xBC
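
Note: the else branch above is the pure-torch fallback used when causal-conv1d is unavailable (e.g. on B200): a depthwise conv1d that is left-padded by conv_kernel_size - 1 and trimmed back to the original length, which keeps it causal. A standalone sketch of that property (shapes are illustrative, not the module's actual dimensions):

    import torch

    d, k, length = 8, 4, 32
    x = torch.randn(1, d, length)  # (batch, channels, time)
    weight = torch.randn(d, 1, k)  # depthwise: one filter per channel
    bias = torch.randn(d)

    y = torch.nn.functional.conv1d(x, weight, bias=bias, groups=d, padding=k - 1)[..., :length]

    # Causality: zeroing future inputs must not change past outputs.
    x2 = x.clone()
    x2[..., 16:] = 0.0
    y2 = torch.nn.functional.conv1d(x2, weight, bias=bias, groups=d, padding=k - 1)[..., :length]
    print(torch.equal(y[..., :16], y2[..., :16]))  # True
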
""" + + assert _mamba_available input_ = hidden_states outputs = {} # assert state is None @@ -201,7 +212,7 @@ def forward(self, hidden_states, kwargs): C = einops.rearrange(C, "b l (h n) -> b l h n", h=self.n_qk_heads) # SSM forward - result = mamba_ssm.ops.triton.ssd_combined.mamba_chunk_scan_combined( + result = _mamba_chunk_scan_combined( x=x / torch.nn.functional.softplus(A_log).to(x.dtype).unsqueeze(-1), dt=A_log, dt_softplus=True, @@ -234,11 +245,18 @@ def forward(self, hidden_states, kwargs): def convolutional_forward(self, xBC, padded_len): """Convolutional layer forward pass for the full sequence.""" - if causal_conv1d is None or self.activation_name not in [ + if _causal_conv1d_available and self.activation_name in ( "silu", "swish", "identity", - ]: + ): + xBC = _causal_conv1d_fn( + xBC.transpose(1, 2), + einops.rearrange(self.conv1d_weight, "d 1 w -> d w"), + self.conv1d_bias, + activation=None if self.activation_name == "identity" else self.activation_name, + ).transpose(1, 2) + else: xBC = self.act( torch.nn.functional.conv1d( xBC.transpose(1, 2), @@ -248,11 +266,4 @@ def convolutional_forward(self, xBC, padded_len): padding=self.conv_kernel_size - 1, )[..., :padded_len].transpose(1, 2) ) - else: - xBC = causal_conv1d.causal_conv1d_fn( - xBC.transpose(1, 2), - einops.rearrange(self.conv1d_weight, "d 1 w -> d w"), - self.conv1d_bias, - activation=None if self.activation_name == "identity" else self.activation_name, - ).transpose(1, 2) return xBC diff --git a/fast_llm/layers/ssm/mamba_layer.py b/fast_llm/layers/ssm/mamba_layer.py index 7d0ee48a4..7fd437894 100644 --- a/fast_llm/layers/ssm/mamba_layer.py +++ b/fast_llm/layers/ssm/mamba_layer.py @@ -2,7 +2,6 @@ from typing import Callable import einops -import mamba_ssm.ops.selective_scan_interface import torch from fast_llm.engine.config_utils.tensor_space import TensorDim, TensorSpace @@ -11,6 +10,13 @@ from fast_llm.tensor import ParameterMeta, init_ones_, kaiming_init_ from fast_llm.utils import get_lr_scale +try: + from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn as _mamba_inner_fn # noqa + + _mamba_available = True +except ImportError: + _mamba_available = False + """ Note: this is mostly adapted from https://github.com/Zyphra/Zamba2, similar code is also in https://github.com/state-spaces/mamba. For now it only supports training and not inference. 
diff --git a/setup.cfg b/setup.cfg
index 3345ff73a..bc0de459d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -6,10 +6,10 @@ packages = find_namespace:
 include_package_data = True
 python_requires = >=3.12
 install_requires =
-    requests>=2.32.3
-    PyYAML>=6.0.1
-    pybind11>=2.5.0
-    packaging>=24.1
+    requests>=2.32.4
+    PyYAML>=6.0.2
+    pybind11>=2.13.6
+    packaging>=25.0

 [options.extras_require]
 # Required to use the main functionality of Fast-LLM
@@ -21,7 +21,7 @@ CORE =
     # Numpy major needs to match torch
     numpy>=1.26.4,<2.0.0
     # Used for checkpoints
-    safetensors>=0.4.4
+    safetensors>=0.5.3
     # Update the base image (version fixed to ensure there is a wheel for the base image), may need --no-build-isolation
     flash-attn==2.7.3
     mamba_ssm[causal-conv1d]==2.2.4
@@ -30,28 +30,27 @@ CORE =
 # Required for some optional features and tools.
 OPTIONAL =
     # Huggingface tools
-    transformers>=4.44.2
-    hf-transfer>=0.1.8
-    datasets>=3.1.0
-    huggingface-hub>=0.28.1
+    transformers>=4.52.4
+    hf-transfer>=0.1.9
+    datasets>=3.6.0
+    huggingface-hub>=0.32.6
     # Weights and biases
-    wandb>=0.17.7
+    wandb>=0.20.1
     # Hydra
     hydra-core>=1.3.2
     omegaconf>=2.3.0
     # Miscellaneous
-    requests>=2.32.3
     tqdm>=4.67.1

 DEV =
     # Pre-commit git hook
-    pre-commit>=4.0.1
+    pre-commit>=4.2.0
     # Required for testing
-    pytest>=8.3.2
+    pytest>=8.4.0
     pytest-depends>=1.0.1
-    pytest-xdist>=3.6.1
+    pytest-xdist>=3.7.0
     # Somehow needed for Megatron to work with base image 24.11
-    setuptools>=78.1.1
+    setuptools>=80.9.0

 # Required for building the documentation
 DOCS =
diff --git a/tests/test_ssms.py b/tests/test_ssms.py
index f3eb92617..ef5193b67 100644
--- a/tests/test_ssms.py
+++ b/tests/test_ssms.py
@@ -14,6 +14,7 @@
 from fast_llm.engine.schedule.schedule import Schedule
 from fast_llm.layers.language_model.config import LanguageModelKwargs, LanguageModelLossNames
 from fast_llm.layers.ssm.config import SSMBlockType
+from fast_llm.layers.ssm.llamba_block import LlambaBlock
 from fast_llm.layers.transformer.config import TransformerKwargs
 from fast_llm.models.gpt.config import GPTBatchConfig, LlamaGPTHuggingfaceCheckpointFormat
 from fast_llm.models.ssm.config import AprielSSMHHybridHuggingfaceCheckpointFormat, LLambaHuggingfaceCheckpointFormat
@@ -21,7 +22,6 @@

 try:
     from fast_llm.layers.ssm.discrete_mamba2 import DiscreteMamba2
-    from fast_llm.layers.ssm.llamba_block import LlambaBlock
     from fast_llm.layers.ssm.mamba_layer import MambaLayer
     from fast_llm.models.ssm.model import HybridSSMBaseModel, HybridSSMModel
 except Exception:
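
Note: both SSM layers now gate their mamba imports behind module-level availability flags and a bare assert _mamba_available at forward time. If a friendlier failure is ever wanted, one possible variant (a sketch, not the patch's actual code; the install hint assumes the SSM extra introduced in patch 7):

    try:
        from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn as _mamba_inner_fn
    except ImportError:
        _mamba_inner_fn = None


    def _require_mamba() -> None:
        # Checked at forward time so that importing the module stays cheap.
        if _mamba_inner_fn is None:
            raise RuntimeError(
                "mamba_ssm is not installed; try: pip install -e '.[CORE,SSM]' --no-build-isolation"
            )
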
From dcc506464d175407c3d8711e73d05ae3b88c6c41 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier
Date: Thu, 12 Jun 2025 14:30:29 -0400
Subject: [PATCH 6/7] fixes

---
 tests/test_ssms.py | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/tests/test_ssms.py b/tests/test_ssms.py
index ef5193b67..36c7b6229 100644
--- a/tests/test_ssms.py
+++ b/tests/test_ssms.py
@@ -14,24 +14,15 @@
 from fast_llm.engine.schedule.schedule import Schedule
 from fast_llm.layers.language_model.config import LanguageModelKwargs, LanguageModelLossNames
 from fast_llm.layers.ssm.config import SSMBlockType
+from fast_llm.layers.ssm.discrete_mamba2 import DiscreteMamba2
 from fast_llm.layers.ssm.llamba_block import LlambaBlock
+from fast_llm.layers.ssm.mamba_layer import MambaLayer
 from fast_llm.layers.transformer.config import TransformerKwargs
 from fast_llm.models.gpt.config import GPTBatchConfig, LlamaGPTHuggingfaceCheckpointFormat
 from fast_llm.models.ssm.config import AprielSSMHHybridHuggingfaceCheckpointFormat, LLambaHuggingfaceCheckpointFormat
+from fast_llm.models.ssm.model import HybridSSMBaseModel, HybridSSMModel
 from tests.common import get_hybrid_config, materialize_meta_tensors

-try:
-    from fast_llm.layers.ssm.discrete_mamba2 import DiscreteMamba2
-    from fast_llm.layers.ssm.mamba_layer import MambaLayer
-    from fast_llm.models.ssm.model import HybridSSMBaseModel, HybridSSMModel
-except Exception:
-    MambaLayer, LlambaBlock, HybridSSMBaseModel, DiscreteMamba2 = (
-        None,
-        None,
-        None,
-        None,
-    )
-
 try:
     from cartesia_pytorch.Llamba.llamba import LlambaLMHeadModel as LMHeadModel
 except ImportError:
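
Note: with the try/except removed, collecting tests/test_ssms.py in an environment without the mamba stack now fails at import time instead of silently aliasing the classes to None. If a soft skip is ever needed again, pytest's importorskip is the idiomatic module-level guard (a sketch, not part of the patch):

    import pytest

    pytest.importorskip("mamba_ssm", reason="SSM dependencies not installed")

    # Imports below only run when mamba_ssm is importable.
    from fast_llm.layers.ssm.mamba_layer import MambaLayer  # noqa: E402
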
From 9d415bc6f29a083e326d856fcfcc949bdad3b638 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier
Date: Thu, 12 Jun 2025 14:37:21 -0400
Subject: [PATCH 7/7] fixes

---
 .github/workflows/docs.yaml |  2 +-
 Dockerfile                  |  2 +-
 setup.cfg                   | 21 ++++++++++++++-------
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
index 93191972e..b755993ce 100644
--- a/.github/workflows/docs.yaml
+++ b/.github/workflows/docs.yaml
@@ -33,7 +33,7 @@ jobs:
           pip install pybind11
           FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE MAMBA_SKIP_CUDA_BUILD=TRUE \
           MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE \
-          pip install --no-build-isolation -e ".[CORE,OPTIONAL,DEV,DOCS]"
+          pip install --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,DEV,DOCS]"
       - name: Build the documentation
         run: mkdocs build
diff --git a/Dockerfile b/Dockerfile
index 05c3870c5..50810ed1e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -37,7 +37,7 @@ COPY --chmod=777 ./fast_llm/__init__.py fast_llm/
 COPY --chmod=777 ./fast_llm/csrc/ fast_llm/csrc/

 # Install dependencies within the virtual environment.
-RUN pip install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,DEV]"
+RUN pip install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,DEV]"

 # Copy the remaining source code with universal write permissions.
 COPY --chmod=777 ./Megatron-LM Megatron-LM
diff --git a/setup.cfg b/setup.cfg
index bc0de459d..8a446064d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -24,16 +24,10 @@ CORE =
     safetensors>=0.5.3
     # Update the base image (version fixed to ensure there is a wheel for the base image), may need --no-build-isolation
     flash-attn==2.7.3
-    mamba_ssm[causal-conv1d]==2.2.4

-# Required for some optional features and tools.
+# Small packages required for some optional features and tools.
 OPTIONAL =
-    # Huggingface tools
-    transformers>=4.52.4
-    hf-transfer>=0.1.9
-    datasets>=3.6.0
-    huggingface-hub>=0.32.6
     # Weights and biases
     wandb>=0.20.1
     # Hydra
@@ -42,6 +36,19 @@ OPTIONAL =
     # Miscellaneous
    tqdm>=4.67.1

+# Huggingface tools
+HUGGINGFACE =
+    transformers>=4.52.4
+    hf-transfer>=0.1.9
+    datasets>=3.6.0
+    huggingface-hub>=0.32.6
+
+# Required to run SSMs
+# To install in a cpu environment (e.g. for IDE support):
+# MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE pip install -e ".[CORE,SSM]" --no-build-isolation
+SSM =
+    mamba_ssm[causal-conv1d]==2.2.4
+
 DEV =
     # Pre-commit git hook
     pre-commit>=4.2.0
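
Note: after this split, feature sets install individually, e.g. pip install -e ".[CORE,HUGGINGFACE,SSM,DEV]". A sketch of runtime detection for the optional stacks (extras names come from this patch; the probing logic is illustrative):

    import importlib.util

    FEATURES = {
        "HUGGINGFACE": ("transformers", "datasets", "huggingface_hub"),
        "SSM": ("mamba_ssm", "causal_conv1d"),
    }

    for extra, modules in FEATURES.items():
        missing = [m for m in modules if importlib.util.find_spec(m) is None]
        status = "ok" if not missing else f"missing: {missing}"
        print(f"{extra}: {status}")
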