@@ -5,106 +5,23 @@
 
 from bayesflow.networks.summary_network import SummaryNetwork
 from bayesflow.types import Tensor
-from bayesflow.utils import logging
+from bayesflow.utils import keras_kwargs
 from bayesflow.utils.decorators import sanitize_input_shape
 
-try:
-    from mamba_ssm import Mamba
-except ImportError:
-    logging.error("Mamba class is not available. Please, install the mamba-ssm library via `pip install mamba-ssm`.")
+from .mamba_block import MambaBlock
 
 
 @serializable("bayesflow.wrappers")
-class MambaBlock(keras.Layer):
+class Mamba(SummaryNetwork):
     """
-    Wraps the original Mamba module from, with added functionality for bidirectional processing:
+    Wraps a sequence of Mamba modules using the simple Mamba module from:
     https://github.com/state-spaces/mamba/blob/main/mamba_ssm/modules/mamba_simple.py
 
     Copyright (c) 2023, Tri Dao, Albert Gu.
-    """
-
-    def __init__(
-        self,
-        state_dim: int,
-        conv_dim: int,
-        feature_dim: int = 16,
-        expand: int = 2,
-        bidirectional: bool = True,
-        dt_min: float = 0.001,
-        dt_max: float = 0.1,
-        device: str = "cuda",
-        **kwargs,
-    ):
-        """
-        A Keras layer implementing a Mamba-based sequence processing block.
-
-        This layer applies a Mamba model for sequence modeling, preceded by a
-        convolutional projection and followed by layer normalization.
-
-        Parameters
-        ----------
-        state_dim : int
-            The dimension of the state space in the Mamba model.
-        conv_dim : int
-            The dimension of the convolutional layer used in Mamba.
-        feature_dim : int, optional
-            The feature dimension for input projection and Mamba processing (default is 16).
-        expand : int, optional
-            Expansion factor for Mamba's internal dimension (default is 1).
-        dt_min : float, optional
-            Minimum delta time for Mamba (default is 0.001).
-        dt_max : float, optional
-            Maximum delta time for Mamba (default is 0.1).
-        device : str, optional
-            The device to which the Mamba model is moved, typically "cuda" or "cpu" (default is "cuda").
-        **kwargs : dict
-            Additional keyword arguments passed to the `keras.layers.Layer` initializer.
-        """
-
-        super().__init__(**kwargs)
-
-        if keras.backend.backend() != "torch":
-            raise EnvironmentError("Mamba is only available using torch backend.")
-
-        self.bidirectional = bidirectional
-
-        self.mamba = Mamba(
-            d_model=feature_dim, d_state=state_dim, d_conv=conv_dim, expand=expand, dt_min=dt_min, dt_max=dt_max
-        ).to(device)
-
-        self.input_projector = keras.layers.Conv1D(
-            feature_dim,
-            kernel_size=1,
-            strides=1,
-        )
-        self.layer_norm = keras.layers.LayerNormalization()
-
-    def call(self, x: Tensor, training: bool = False, **kwargs) -> Tensor:
-        out_forward = self._call(x, training=training, **kwargs)
-        if self.bidirectional:
-            out_backward = self._call(keras.ops.flip(x, axis=1), training=training, **kwargs)
-            return keras.ops.concatenate((out_forward, out_backward), axis=-1)
-        return out_forward
-
-    def _call(self, x: Tensor, training: bool = False, **kwargs) -> Tensor:
-        x = self.input_projector(x)
-        h = self.mamba(x)
-        out = self.layer_norm(h + x, training=training, **kwargs)
-        return out
-
-    @sanitize_input_shape
-    def build(self, input_shape):
-        super().build(input_shape)
-        self.call(keras.ops.zeros(input_shape))
 
+    Example usage in a BayesFlow workflow as a summary network:
 
-@serializable("bayesflow.wrappers")
-class MambaSSM(SummaryNetwork):
-    """
-    Wraps a sequence of Mamba modules using the simple Mamba module from:
-    https://github.com/state-spaces/mamba/blob/main/mamba_ssm/modules/mamba_simple.py
-
-    Copyright (c) 2023, Tri Dao, Albert Gu.
+    `summary_net = bayesflow.wrappers.Mamba(summary_dim=32)`
     """
 
     def __init__(
@@ -150,11 +67,11 @@ def __init__(
             Dropout probability; dropout is applied to the pooled summary vector.
         device : str, optional
            The computing device. Currently, only "cuda" is supported (default is "cuda").
-        **kwargs : dict
+        **kwargs :
             Additional keyword arguments passed to the `SummaryNetwork` parent class.
         """
 
-        super().__init__(**kwargs)
+        super().__init__(**keras_kwargs(kwargs))
 
         if device != "cuda":
            raise NotImplementedError("MambaSSM only supports cuda as `device`.")
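For readers skimming the diff, here is the new docstring example expanded into a minimal runnable sketch. The constructor call is taken verbatim from the docstring added above; everything else (backend selection before import, downstream use) is an assumption not shown in this diff, hedged in the comments:

```python
import os

# Assumption: Keras 3 reads KERAS_BACKEND at import time, so it must be set
# before importing bayesflow. The removed code checked for the torch backend,
# and the remaining wrapper only supports device="cuda".
os.environ["KERAS_BACKEND"] = "torch"

import bayesflow

# Constructor call as given in the docstring example added by this commit.
summary_net = bayesflow.wrappers.Mamba(summary_dim=32)

# Hypothetical downstream use (not part of this diff): pass `summary_net`
# wherever BayesFlow expects a summary network, e.g. as the summary_network
# of an approximator.
```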