Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Python bytecode
__pycache__/
*.py[cod]
*$py.class

# Packaging / build artifacts
*.egg-info/
dist/
build/
*.egg

# Test / coverage
.pytest_cache/
.coverage
htmlcov/

# Virtual environments
venv/
.venv/
env/

# Editor
.vscode/
.idea/
*.swp
109 changes: 108 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,109 @@
# HaloBlocks
A Python library where all model components are treated as blocks — such as heads, attention layers, and new custom layers.

A Python library where all model components are treated as composable **blocks** — including heads, attention layers, feed-forward layers, encoders, and decoders.

Every block is a `torch.nn.Module` that can be mixed and matched to build custom neural network architectures.

## Installation

```bash
pip install haloblocks
```

Or from source:

```bash
git clone https://github.com/basaanithanaveenkumar/HaloBlocks.git
cd HaloBlocks
pip install -e .
```

**Requirements:** Python ≥ 3.8, PyTorch ≥ 1.13

## Quick Start

```python
import torch
import haloblocks as hb

# --- Layers ---
linear = hb.LinearBlock(128, 256)
norm = hb.LayerNormBlock(256)
ff = hb.FeedForwardBlock(d_model=256, d_ff=1024)
emb = hb.EmbeddingBlock(vocab_size=30000, d_model=256)

# --- Attention ---
mha = hb.MultiHeadAttentionBlock(d_model=256, num_heads=8)
self_a = hb.SelfAttentionBlock(d_model=256, num_heads=8, causal=True)
cross_a = hb.CrossAttentionBlock(d_model=256, num_heads=8)

# --- Encoders ---
encoder = hb.TransformerEncoderBlock(d_model=256, num_heads=8, d_ff=1024, num_layers=6)

# --- Decoders ---
decoder = hb.TransformerDecoderBlock(d_model=256, num_heads=8, d_ff=1024, num_layers=6)

# --- Heads ---
cls_head = hb.ClassificationHead(d_model=256, num_classes=10)
lm_head = hb.LanguageModelHead(d_model=256, vocab_size=30000)
tok_head = hb.TokenClassificationHead(d_model=256, num_classes=5)
```

## Building a Transformer from Blocks

```python
import torch
import torch.nn as nn
import haloblocks as hb


class SimpleTransformer(nn.Module):
def __init__(self, vocab_size, d_model=256, num_heads=8, d_ff=1024,
num_enc_layers=3, num_dec_layers=3, num_classes=10):
super().__init__()
self.src_emb = hb.EmbeddingBlock(vocab_size, d_model)
self.tgt_emb = hb.EmbeddingBlock(vocab_size, d_model)
self.encoder = hb.TransformerEncoderBlock(d_model, num_heads, d_ff, num_enc_layers)
self.decoder = hb.TransformerDecoderBlock(d_model, num_heads, d_ff, num_dec_layers)
self.head = hb.ClassificationHead(d_model, num_classes, pooling="mean")

def forward(self, src, tgt):
memory = self.encoder(self.src_emb(src))
decoded = self.decoder(self.tgt_emb(tgt), memory)
return self.head(decoded)


model = SimpleTransformer(vocab_size=1000)
src = torch.randint(0, 1000, (2, 10))
tgt = torch.randint(0, 1000, (2, 8))
logits = model(src, tgt) # shape: (2, 10)
```

## Available Blocks

| Category | Block | Description |
|-----------|--------------------------------|----------------------------------------------|
| Base | `Block` | Base class for all blocks |
| Layers | `LinearBlock` | Linear (fully-connected) layer |
| Layers | `LayerNormBlock` | Layer normalisation |
| Layers | `DropoutBlock` | Dropout |
| Layers | `FeedForwardBlock` | Two-layer feed-forward with GELU/ReLU |
| Layers | `EmbeddingBlock` | Token embedding |
| Attention | `MultiHeadAttentionBlock` | Scaled dot-product multi-head attention |
| Attention | `SelfAttentionBlock` | Self-attention (optionally causal) |
| Attention | `CrossAttentionBlock` | Cross-attention between two sequences |
| Encoders | `TransformerEncoderLayerBlock` | Single Transformer encoder layer |
| Encoders | `TransformerEncoderBlock` | Stack of Transformer encoder layers |
| Decoders | `TransformerDecoderLayerBlock` | Single Transformer decoder layer |
| Decoders | `TransformerDecoderBlock` | Stack of Transformer decoder layers |
| Heads | `ClassificationHead` | Sequence-level classification |
| Heads | `LanguageModelHead` | Token-level vocabulary projection |
| Heads | `TokenClassificationHead` | Per-token classification (NER, POS, etc.) |

## Running Tests

```bash
pip install pytest
pytest
```

57 changes: 57 additions & 0 deletions haloblocks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""HaloBlocks: a Python library where all model components are treated as blocks.

Blocks are composable PyTorch modules covering:
- Base: Block
- Layers: LinearBlock, LayerNormBlock, DropoutBlock, FeedForwardBlock, EmbeddingBlock
- Attention: MultiHeadAttentionBlock, SelfAttentionBlock, CrossAttentionBlock
- Heads: ClassificationHead, LanguageModelHead, TokenClassificationHead
- Encoders: TransformerEncoderLayerBlock, TransformerEncoderBlock
- Decoders: TransformerDecoderLayerBlock, TransformerDecoderBlock
"""

from .base import Block
from .layers import (
LinearBlock,
LayerNormBlock,
DropoutBlock,
FeedForwardBlock,
EmbeddingBlock,
)
from .attention import (
MultiHeadAttentionBlock,
SelfAttentionBlock,
CrossAttentionBlock,
)
from .heads import (
ClassificationHead,
LanguageModelHead,
TokenClassificationHead,
)
from .encoders import TransformerEncoderLayerBlock, TransformerEncoderBlock
from .decoders import TransformerDecoderLayerBlock, TransformerDecoderBlock

# Public API: the names exported by `from haloblocks import *` and the
# canonical list of supported blocks, grouped by category.
__all__ = [
    "Block",
    # Layers
    "LinearBlock",
    "LayerNormBlock",
    "DropoutBlock",
    "FeedForwardBlock",
    "EmbeddingBlock",
    # Attention
    "MultiHeadAttentionBlock",
    "SelfAttentionBlock",
    "CrossAttentionBlock",
    # Heads
    "ClassificationHead",
    "LanguageModelHead",
    "TokenClassificationHead",
    # Encoders
    "TransformerEncoderLayerBlock",
    "TransformerEncoderBlock",
    # Decoders
    "TransformerDecoderLayerBlock",
    "TransformerDecoderBlock",
]

# Package version (PEP 440); keep in sync with the packaging metadata.
__version__ = "0.1.0"
195 changes: 195 additions & 0 deletions haloblocks/attention.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
"""Attention blocks for HaloBlocks."""

import math
from typing import Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F

from .base import Block


class MultiHeadAttentionBlock(Block):
    """Multi-head attention block.

    Computes multi-head scaled dot-product attention as described in
    "Attention Is All You Need" (Vaswani et al., 2017).

    Args:
        d_model: Total dimension of the model.
        num_heads: Number of parallel attention heads. Must be positive and
            evenly divide d_model.
        dropout: Dropout probability on attention weights. Default: 0.0.
        bias: If True, adds bias to query/key/value projections. Default: True.

    Raises:
        ValueError: If num_heads is not positive or does not divide d_model.
    """

    def __init__(
        self,
        d_model: int,
        num_heads: int,
        dropout: float = 0.0,
        bias: bool = True,
    ):
        super().__init__()
        if num_heads <= 0:
            # Previously num_heads == 0 surfaced as a confusing
            # ZeroDivisionError and negative values slipped past the modulo
            # check, producing an invalid negative head_dim later.
            raise ValueError(f"num_heads must be positive, got {num_heads}.")
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})."
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        # Scores are divided by sqrt(head_dim), per Vaswani et al. (2017).
        self.scale_factor = math.sqrt(self.head_dim)

        self.q_proj = nn.Linear(d_model, d_model, bias=bias)
        self.k_proj = nn.Linear(d_model, d_model, bias=bias)
        self.v_proj = nn.Linear(d_model, d_model, bias=bias)
        self.out_proj = nn.Linear(d_model, d_model, bias=bias)
        self.attn_dropout = nn.Dropout(p=dropout)

    def _split_heads(self, x: torch.Tensor) -> torch.Tensor:
        """Split last dimension into (num_heads, head_dim) and transpose."""
        batch, seq_len, _ = x.size()
        x = x.view(batch, seq_len, self.num_heads, self.head_dim)
        return x.transpose(1, 2)  # (batch, heads, seq, head_dim)

    def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
        """Merge heads back into d_model dimension."""
        batch, _, seq_len, _ = x.size()
        x = x.transpose(1, 2).contiguous()
        return x.view(batch, seq_len, self.d_model)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attn_mask: Optional[torch.Tensor] = None,
        key_padding_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute multi-head attention.

        Args:
            query: Query tensor of shape (batch, tgt_len, d_model).
            key: Key tensor of shape (batch, src_len, d_model).
            value: Value tensor of shape (batch, src_len, d_model).
            attn_mask: Optional additive attention mask of shape
                (tgt_len, src_len), (batch * num_heads, tgt_len, src_len), or
                (batch, num_heads, tgt_len, src_len).
            key_padding_mask: Optional boolean mask of shape (batch, src_len).
                Positions set to True are ignored.

        Returns:
            Tuple of (output, attention_weights) where output has shape
            (batch, tgt_len, d_model) and attention_weights has shape
            (batch, num_heads, tgt_len, src_len).
        """
        q = self._split_heads(self.q_proj(query))
        k = self._split_heads(self.k_proj(key))
        v = self._split_heads(self.v_proj(value))

        # (batch, heads, tgt_len, src_len)
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / self.scale_factor

        if attn_mask is not None:
            if attn_mask.dim() == 3:
                # Fix: a (batch * num_heads, tgt, src) mask broadcasts
                # incorrectly against (batch, heads, tgt, src) — view it into
                # the 4-D per-head layout before adding. 2-D masks broadcast
                # correctly as before.
                attn_mask = attn_mask.view(
                    -1, self.num_heads, attn_scores.size(-2), attn_scores.size(-1)
                )
            attn_scores = attn_scores + attn_mask

        if key_padding_mask is not None:
            # Broadcast over heads and query positions: (batch, 1, 1, src_len).
            mask = key_padding_mask.unsqueeze(1).unsqueeze(2)
            attn_scores = attn_scores.masked_fill(mask, float("-inf"))

        attn_weights = F.softmax(attn_scores, dim=-1)
        attn_weights = self.attn_dropout(attn_weights)

        context = torch.matmul(attn_weights, v)
        output = self.out_proj(self._merge_heads(context))
        return output, attn_weights


class SelfAttentionBlock(Block):
    """Self-attention: queries, keys, and values all come from one sequence.

    Thin convenience wrapper around MultiHeadAttentionBlock that can
    additionally apply a causal (autoregressive) mask.

    Args:
        d_model: Total dimension of the model.
        num_heads: Number of parallel attention heads.
        dropout: Dropout probability on attention weights. Default: 0.0.
        causal: If True, apply a causal (autoregressive) mask. Default: False.
        bias: If True, adds bias to projections. Default: True.
    """

    def __init__(
        self,
        d_model: int,
        num_heads: int,
        dropout: float = 0.0,
        causal: bool = False,
        bias: bool = True,
    ):
        super().__init__()
        self.causal = causal
        self.attn = MultiHeadAttentionBlock(d_model, num_heads, dropout, bias)

    def _causal_mask(self, seq_len: int, device: torch.device) -> torch.Tensor:
        """Additive mask: -inf strictly above the diagonal, 0 elsewhere."""
        future = torch.ones(seq_len, seq_len, device=device, dtype=torch.bool).triu(1)
        zeros = torch.zeros(seq_len, seq_len, device=device)
        return zeros.masked_fill(future, float("-inf"))

    def forward(
        self,
        x: torch.Tensor,
        key_padding_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply (optionally causal) self-attention to x.

        Args:
            x: Input tensor of shape (batch, seq_len, d_model).
            key_padding_mask: Optional boolean mask of shape (batch, seq_len).

        Returns:
            Tuple of (output, attention_weights).
        """
        mask = self._causal_mask(x.size(1), x.device) if self.causal else None
        return self.attn(x, x, x, attn_mask=mask, key_padding_mask=key_padding_mask)


class CrossAttentionBlock(Block):
    """Cross-attention: queries attend over a separate context sequence.

    Thin convenience wrapper around MultiHeadAttentionBlock in which both
    keys and values are drawn from the context sequence.

    Args:
        d_model: Total dimension of the model.
        num_heads: Number of parallel attention heads.
        dropout: Dropout probability on attention weights. Default: 0.0.
        bias: If True, adds bias to projections. Default: True.
    """

    def __init__(
        self,
        d_model: int,
        num_heads: int,
        dropout: float = 0.0,
        bias: bool = True,
    ):
        super().__init__()
        self.attn = MultiHeadAttentionBlock(d_model, num_heads, dropout, bias)

    def forward(
        self,
        query: torch.Tensor,
        context: torch.Tensor,
        key_padding_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Attend from query positions over the context sequence.

        Args:
            query: Query tensor of shape (batch, tgt_len, d_model).
            context: Context tensor of shape (batch, src_len, d_model).
            key_padding_mask: Optional boolean mask of shape (batch, src_len).

        Returns:
            Tuple of (output, attention_weights).
        """
        output, weights = self.attn(
            query,
            context,
            context,
            key_padding_mask=key_padding_mask,
        )
        return output, weights
Loading