diff --git a/neural_network/sliding_window_attention.py b/neural_network/sliding_window_attention.py
new file mode 100644
index 000000000000..b329e4fce910
--- /dev/null
+++ b/neural_network/sliding_window_attention.py
@@ -0,0 +1,101 @@
+"""
+ - - - - - -- - - - - - - - - - - - - - - - - - - - - - -
+Name - - sliding_window_attention.py
+Goal - - Implement a neural network architecture using sliding window attention for sequence
+    modeling tasks.
+Detail: Total 4 layers neural network
+    * Input layer
+    * Sliding Window Attention Layer
+    * Feedforward Layer
+    * Output Layer
+Author: Stephen Lee
+Github: 245885195@qq.com
+Date: 2024.10.20
+References:
+    1. Sutskever, I., et al. (2013). "On the Importance of Initialization and Momentum in
+       Deep Learning." *Proceedings of the 30th International Conference on Machine Learning*.
+    2. Katharopoulos, A., et al. (2020). "Transformers are RNNs: Fast Autoregressive
+       Transformers with Linear Attention." *arXiv preprint arXiv:2006.16236*.
+    3. [Attention Mechanisms in Neural Networks](https://en.wikipedia.org/wiki/Attention_(machine_learning))
+ - - - - - -- - - - - - - - - - - - - - - - - - - - - - -
+"""
+
+import numpy as np
+
+
+class SlidingWindowAttention:
+    """Sliding Window Attention Module.
+
+    This class implements a sliding window attention mechanism where the model
+    attends to a fixed-size window of context around each token.
+
+    Attributes:
+        window_size (int): The size of the attention window.
+        embed_dim (int): The dimensionality of the input embeddings.
+    """
+
+    def __init__(self, embed_dim: int, window_size: int) -> None:
+        """
+        Initialize the SlidingWindowAttention module.
+
+        Args:
+            embed_dim (int): The dimensionality of the input embeddings.
+            window_size (int): The size of the attention window.
+        """
+        self.window_size = window_size
+        self.embed_dim = embed_dim
+        rng = np.random.default_rng()
+        self.attention_weights = rng.standard_normal((embed_dim, embed_dim))
+
+    def forward(self, input_tensor: np.ndarray) -> np.ndarray:
+        """
+        Forward pass for the sliding window attention.
+
+        Args:
+            input_tensor (np.ndarray): Input tensor of shape (batch_size, seq_length,
+                embed_dim).
+
+        Returns:
+            np.ndarray: Output tensor of shape (batch_size, seq_length, embed_dim).
+
+        >>> x = np.random.default_rng().standard_normal((2, 10, 4))
+        >>> attention = SlidingWindowAttention(embed_dim=4, window_size=3)
+        >>> output = attention.forward(x)
+        >>> output.shape
+        (2, 10, 4)
+        >>> (output.sum() != 0).item()  # check that the output is non-zero
+        True
+        """
+        _, seq_length, _ = input_tensor.shape
+        output = np.zeros_like(input_tensor)
+
+        for i in range(seq_length):
+            # Define the window boundaries around position i
+            start = max(0, i - self.window_size // 2)
+            end = min(seq_length, i + self.window_size // 2 + 1)
+
+            # Extract the local window of tokens
+            local_window = input_tensor[:, start:end, :]
+
+            # Project the local window with the learned weight matrix
+            attention_scores = np.matmul(local_window, self.attention_weights)
+
+            # Aggregate by averaging the projections over the window positions
+            output[:, i, :] = np.mean(attention_scores, axis=1)
+
+        return output
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()
+
+    # Example usage
+    rng = np.random.default_rng()
+    x = rng.standard_normal(
+        (2, 10, 4)
+    )  # Batch size 2, sequence length 10, embedding dimension 4
+    attention = SlidingWindowAttention(embed_dim=4, window_size=3)
+    output = attention.forward(x)
+    print(output)
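
Because the forward pass averages the projected vectors over each token's local window, the output at a given position should depend only on tokens inside that window. A minimal sanity-check sketch, assuming the file above is importable as `sliding_window_attention` (the module name is inferred from the path and is not part of the diff):

```python
import numpy as np

from sliding_window_attention import SlidingWindowAttention  # assumed module name

rng = np.random.default_rng(0)
attention = SlidingWindowAttention(embed_dim=4, window_size=3)

x = rng.standard_normal((1, 10, 4))
baseline = attention.forward(x)

# Perturb a token far outside the window of position 0
# (with window_size=3, position 0 only sees positions 0 and 1).
perturbed_input = x.copy()
perturbed_input[:, 5, :] += 10.0
perturbed = attention.forward(perturbed_input)

# Position 0 is unaffected by the perturbation; position 5 is not,
# since its own window was modified.
assert np.allclose(baseline[:, 0, :], perturbed[:, 0, :])
assert not np.allclose(baseline[:, 5, :], perturbed[:, 5, :])
print("locality check passed")
```

Note that because the averaging assigns equal weight to every token in the window, this is a fixed, uniform attention pattern rather than a learned softmax over scores; only the per-token projection is learned.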