Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Python bytecode
__pycache__/
*.py[cod]
*$py.class

# Packaging / build artifacts
*.egg-info/
dist/
build/
*.egg

# Test / coverage
.pytest_cache/
.coverage
htmlcov/

# Virtual environments
venv/
.venv/
env/

# Editor
.vscode/
.idea/
*.swp
109 changes: 108 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,109 @@
# HaloBlocks
A Python library where all model components are treated as blocks — such as heads, attention layers, and new custom layers.

A Python library where all model components are treated as composable **blocks** — including heads, attention layers, feed-forward layers, encoders, and decoders.

Every block is a `torch.nn.Module` that can be mixed and matched to build custom neural network architectures.

## Installation

```bash
pip install haloblocks
```

Or from source:

```bash
git clone https://github.com/basaanithanaveenkumar/HaloBlocks.git
cd HaloBlocks
pip install -e .
```

**Requirements:** Python ≥ 3.8, PyTorch ≥ 1.13

## Quick Start

```python
import torch
import haloblocks as hb

# --- Layers ---
linear = hb.LinearBlock(128, 256)
norm = hb.LayerNormBlock(256)
ff = hb.FeedForwardBlock(d_model=256, d_ff=1024)
emb = hb.EmbeddingBlock(vocab_size=30000, d_model=256)

# --- Attention ---
mha = hb.MultiHeadAttentionBlock(d_model=256, num_heads=8)
self_a = hb.SelfAttentionBlock(d_model=256, num_heads=8, causal=True)
cross_a = hb.CrossAttentionBlock(d_model=256, num_heads=8)

# --- Encoders ---
encoder = hb.TransformerEncoderBlock(d_model=256, num_heads=8, d_ff=1024, num_layers=6)

# --- Decoders ---
decoder = hb.TransformerDecoderBlock(d_model=256, num_heads=8, d_ff=1024, num_layers=6)

# --- Heads ---
cls_head = hb.ClassificationHead(d_model=256, num_classes=10)
lm_head = hb.LanguageModelHead(d_model=256, vocab_size=30000)
tok_head = hb.TokenClassificationHead(d_model=256, num_classes=5)
```

## Building a Transformer from Blocks

```python
import torch
import torch.nn as nn
import haloblocks as hb


class SimpleTransformer(nn.Module):
def __init__(self, vocab_size, d_model=256, num_heads=8, d_ff=1024,
num_enc_layers=3, num_dec_layers=3, num_classes=10):
super().__init__()
self.src_emb = hb.EmbeddingBlock(vocab_size, d_model)
self.tgt_emb = hb.EmbeddingBlock(vocab_size, d_model)
self.encoder = hb.TransformerEncoderBlock(d_model, num_heads, d_ff, num_enc_layers)
self.decoder = hb.TransformerDecoderBlock(d_model, num_heads, d_ff, num_dec_layers)
self.head = hb.ClassificationHead(d_model, num_classes, pooling="mean")

def forward(self, src, tgt):
memory = self.encoder(self.src_emb(src))
decoded = self.decoder(self.tgt_emb(tgt), memory)
return self.head(decoded)


model = SimpleTransformer(vocab_size=1000)
src = torch.randint(0, 1000, (2, 10))
tgt = torch.randint(0, 1000, (2, 8))
logits = model(src, tgt) # shape: (2, 10)
```

## Available Blocks

| Category | Block | Description |
|-----------|--------------------------------|----------------------------------------------|
| Base | `Block` | Base class for all blocks |
| Layers | `LinearBlock` | Linear (fully-connected) layer |
| Layers | `LayerNormBlock` | Layer normalisation |
| Layers | `DropoutBlock` | Dropout |
| Layers | `FeedForwardBlock` | Two-layer feed-forward with GELU/ReLU |
| Layers | `EmbeddingBlock` | Token embedding |
| Attention | `MultiHeadAttentionBlock` | Scaled dot-product multi-head attention |
| Attention | `SelfAttentionBlock` | Self-attention (optionally causal) |
| Attention | `CrossAttentionBlock` | Cross-attention between two sequences |
| Encoders | `TransformerEncoderLayerBlock` | Single Transformer encoder layer |
| Encoders | `TransformerEncoderBlock` | Stack of Transformer encoder layers |
| Decoders | `TransformerDecoderLayerBlock` | Single Transformer decoder layer |
| Decoders | `TransformerDecoderBlock` | Stack of Transformer decoder layers |
| Heads | `ClassificationHead` | Sequence-level classification |
| Heads | `LanguageModelHead` | Token-level vocabulary projection |
| Heads | `TokenClassificationHead` | Per-token classification (NER, POS, etc.) |

## Running Tests

```bash
pip install pytest
pytest
```

57 changes: 57 additions & 0 deletions haloblocks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""HaloBlocks: a Python library where all model components are treated as blocks.

Blocks are composable PyTorch modules covering:
- Base: Block
- Layers: LinearBlock, LayerNormBlock, DropoutBlock, FeedForwardBlock, EmbeddingBlock
- Attention: MultiHeadAttentionBlock, SelfAttentionBlock, CrossAttentionBlock
- Heads: ClassificationHead, LanguageModelHead, TokenClassificationHead
- Encoders: TransformerEncoderLayerBlock, TransformerEncoderBlock
- Decoders: TransformerDecoderLayerBlock, TransformerDecoderBlock
"""

from .base import Block
from .layers import (
LinearBlock,
LayerNormBlock,
DropoutBlock,
FeedForwardBlock,
EmbeddingBlock,
)
from .attention import (
MultiHeadAttentionBlock,
SelfAttentionBlock,
CrossAttentionBlock,
)
from .heads import (
ClassificationHead,
LanguageModelHead,
TokenClassificationHead,
)
from .encoders import TransformerEncoderLayerBlock, TransformerEncoderBlock
from .decoders import TransformerDecoderLayerBlock, TransformerDecoderBlock

# Public API: the names exported by `from haloblocks import *` and the
# canonical list of supported blocks, grouped by category.
__all__ = [
    "Block",
    # Layers
    "LinearBlock",
    "LayerNormBlock",
    "DropoutBlock",
    "FeedForwardBlock",
    "EmbeddingBlock",
    # Attention
    "MultiHeadAttentionBlock",
    "SelfAttentionBlock",
    "CrossAttentionBlock",
    # Heads
    "ClassificationHead",
    "LanguageModelHead",
    "TokenClassificationHead",
    # Encoders
    "TransformerEncoderLayerBlock",
    "TransformerEncoderBlock",
    # Decoders
    "TransformerDecoderLayerBlock",
    "TransformerDecoderBlock",
]

# Package version (PEP 440); keep in sync with the packaging metadata.
__version__ = "0.1.0"
195 changes: 195 additions & 0 deletions haloblocks/attention.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
"""Attention blocks for HaloBlocks."""

import math
from typing import Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F

from .base import Block


class MultiHeadAttentionBlock(Block):
    """Multi-head attention block.

    Computes multi-head scaled dot-product attention as described in
    "Attention Is All You Need" (Vaswani et al., 2017).

    Args:
        d_model: Total dimension of the model.
        num_heads: Number of parallel attention heads. Must be positive and
            evenly divide d_model.
        dropout: Dropout probability on attention weights. Default: 0.0.
        bias: If True, adds bias to query/key/value projections. Default: True.

    Raises:
        ValueError: If num_heads is not positive or does not divide d_model.
    """

    def __init__(
        self,
        d_model: int,
        num_heads: int,
        dropout: float = 0.0,
        bias: bool = True,
    ):
        super().__init__()
        if num_heads <= 0:
            # Previously num_heads == 0 surfaced as a confusing
            # ZeroDivisionError and negative values slipped past the modulo
            # check, producing an invalid negative head_dim later.
            raise ValueError(f"num_heads must be positive, got {num_heads}.")
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})."
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        # Scores are divided by sqrt(head_dim), per Vaswani et al. (2017).
        self.scale_factor = math.sqrt(self.head_dim)

        self.q_proj = nn.Linear(d_model, d_model, bias=bias)
        self.k_proj = nn.Linear(d_model, d_model, bias=bias)
        self.v_proj = nn.Linear(d_model, d_model, bias=bias)
        self.out_proj = nn.Linear(d_model, d_model, bias=bias)
        self.attn_dropout = nn.Dropout(p=dropout)

    def _split_heads(self, x: torch.Tensor) -> torch.Tensor:
        """Split last dimension into (num_heads, head_dim) and transpose."""
        batch, seq_len, _ = x.size()
        x = x.view(batch, seq_len, self.num_heads, self.head_dim)
        return x.transpose(1, 2)  # (batch, heads, seq, head_dim)

    def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
        """Merge heads back into d_model dimension."""
        batch, _, seq_len, _ = x.size()
        x = x.transpose(1, 2).contiguous()
        return x.view(batch, seq_len, self.d_model)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attn_mask: Optional[torch.Tensor] = None,
        key_padding_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute multi-head attention.

        Args:
            query: Query tensor of shape (batch, tgt_len, d_model).
            key: Key tensor of shape (batch, src_len, d_model).
            value: Value tensor of shape (batch, src_len, d_model).
            attn_mask: Optional additive attention mask of shape
                (tgt_len, src_len), (batch * num_heads, tgt_len, src_len), or
                (batch, num_heads, tgt_len, src_len).
            key_padding_mask: Optional boolean mask of shape (batch, src_len).
                Positions set to True are ignored.

        Returns:
            Tuple of (output, attention_weights) where output has shape
            (batch, tgt_len, d_model) and attention_weights has shape
            (batch, num_heads, tgt_len, src_len).
        """
        q = self._split_heads(self.q_proj(query))
        k = self._split_heads(self.k_proj(key))
        v = self._split_heads(self.v_proj(value))

        # (batch, heads, tgt_len, src_len)
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / self.scale_factor

        if attn_mask is not None:
            if attn_mask.dim() == 3:
                # Fix: a (batch * num_heads, tgt, src) mask broadcasts
                # incorrectly against (batch, heads, tgt, src) — view it into
                # the 4-D per-head layout before adding. 2-D masks broadcast
                # correctly as before.
                attn_mask = attn_mask.view(
                    -1, self.num_heads, attn_scores.size(-2), attn_scores.size(-1)
                )
            attn_scores = attn_scores + attn_mask

        if key_padding_mask is not None:
            # Broadcast over heads and query positions: (batch, 1, 1, src_len).
            mask = key_padding_mask.unsqueeze(1).unsqueeze(2)
            attn_scores = attn_scores.masked_fill(mask, float("-inf"))

        attn_weights = F.softmax(attn_scores, dim=-1)
        attn_weights = self.attn_dropout(attn_weights)

        context = torch.matmul(attn_weights, v)
        output = self.out_proj(self._merge_heads(context))
        return output, attn_weights


class SelfAttentionBlock(Block):
    """Self-attention: queries, keys, and values all come from one sequence.

    Thin convenience wrapper around MultiHeadAttentionBlock that can
    additionally apply a causal (autoregressive) mask.

    Args:
        d_model: Total dimension of the model.
        num_heads: Number of parallel attention heads.
        dropout: Dropout probability on attention weights. Default: 0.0.
        causal: If True, apply a causal (autoregressive) mask. Default: False.
        bias: If True, adds bias to projections. Default: True.
    """

    def __init__(
        self,
        d_model: int,
        num_heads: int,
        dropout: float = 0.0,
        causal: bool = False,
        bias: bool = True,
    ):
        super().__init__()
        self.causal = causal
        self.attn = MultiHeadAttentionBlock(d_model, num_heads, dropout, bias)

    def _causal_mask(self, seq_len: int, device: torch.device) -> torch.Tensor:
        """Additive mask: -inf strictly above the diagonal, 0 elsewhere."""
        future = torch.ones(seq_len, seq_len, device=device, dtype=torch.bool).triu(1)
        zeros = torch.zeros(seq_len, seq_len, device=device)
        return zeros.masked_fill(future, float("-inf"))

    def forward(
        self,
        x: torch.Tensor,
        key_padding_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply (optionally causal) self-attention to x.

        Args:
            x: Input tensor of shape (batch, seq_len, d_model).
            key_padding_mask: Optional boolean mask of shape (batch, seq_len).

        Returns:
            Tuple of (output, attention_weights).
        """
        mask = self._causal_mask(x.size(1), x.device) if self.causal else None
        return self.attn(x, x, x, attn_mask=mask, key_padding_mask=key_padding_mask)


class CrossAttentionBlock(Block):
    """Cross-attention: queries attend over a separate context sequence.

    Thin convenience wrapper around MultiHeadAttentionBlock in which both
    keys and values are drawn from the context sequence.

    Args:
        d_model: Total dimension of the model.
        num_heads: Number of parallel attention heads.
        dropout: Dropout probability on attention weights. Default: 0.0.
        bias: If True, adds bias to projections. Default: True.
    """

    def __init__(
        self,
        d_model: int,
        num_heads: int,
        dropout: float = 0.0,
        bias: bool = True,
    ):
        super().__init__()
        self.attn = MultiHeadAttentionBlock(d_model, num_heads, dropout, bias)

    def forward(
        self,
        query: torch.Tensor,
        context: torch.Tensor,
        key_padding_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Attend from query positions over the context sequence.

        Args:
            query: Query tensor of shape (batch, tgt_len, d_model).
            context: Context tensor of shape (batch, src_len, d_model).
            key_padding_mask: Optional boolean mask of shape (batch, src_len).

        Returns:
            Tuple of (output, attention_weights).
        """
        output, weights = self.attn(
            query,
            context,
            context,
            key_padding_mask=key_padding_mask,
        )
        return output, weights
Loading