Commit 521517c

HFooladi and claude committed
feat: implement Phase 3.2 Graph Attention Network (GAT)
Add GAT model with multi-head attention for adaptive neighbor weighting. Key features include optional edge feature incorporation in attention, attention dropout for regularization, and API compatibility with UncertaintyGCN/UncertaintyMPNN for drop-in replacement.

- GATConfig with n_heads, edge_features, attention_dropout_rate params
- GATAttention for single-head concat-based attention mechanism
- GATLayer for multi-head attention with head concatenation/averaging
- UncertaintyGAT with dual heads (mean/variance) and log_var clipping
- Training utilities: create_gat_optimizer, train_gat_step, eval_gat_step
- MC Dropout uncertainty via get_gat_uncertainties()
- 36 comprehensive tests covering all components
- Demo example comparing GAT vs GCN vs MPNN on ESOL dataset

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 46d37c0 commit 521517c
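The "adaptive neighbor weighting" the commit message describes is the core GAT idea: attention logits are computed per edge and softmax-normalized over each node's incoming edges before aggregation. The following is a toy numpy sketch of that mechanism, not the repository's actual implementation (which uses `jraph.segment_softmax`); the additive scoring of logits here is a stand-in for the learned `LeakyReLU(a^T [Wh_i || Wh_j])` scoring in a real GAT.

```python
import numpy as np

def segment_softmax(logits, segment_ids, num_segments):
    """Numerically stable softmax over edges grouped by receiver node."""
    maxes = np.full(num_segments, -np.inf)
    np.maximum.at(maxes, segment_ids, logits)
    exp = np.exp(logits - maxes[segment_ids])
    sums = np.zeros(num_segments)
    np.add.at(sums, segment_ids, exp)
    return exp / sums[segment_ids]

# Toy graph: node 2 receives edges from nodes 0 and 1
senders = np.array([0, 1])
receivers = np.array([2, 2])
nodes = np.array([[1.0], [3.0], [0.0]])  # one scalar feature per node

# Stand-in attention logits (a real GAT learns these from node pairs)
logits = (nodes[senders] + nodes[receivers]).squeeze(-1)   # [1.0, 3.0]
alpha = segment_softmax(logits, receivers, num_segments=3)  # sums to 1 per receiver

# Attention-weighted aggregation of sender features into receivers
out = np.zeros_like(nodes)
np.add.at(out, receivers, alpha[:, None] * nodes[senders])
# out[2, 0] ≈ 2.7616: node 1's larger feature dominates the weighted sum
```

Node 2's aggregate leans toward the higher-scoring neighbor rather than a uniform mean, which is exactly what distinguishes GAT from the fixed averaging of a GCN.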

File tree

7 files changed: +1472 −25 lines

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
```diff
@@ -8,6 +8,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 ### Added
+- **Graph Attention Network (GAT)** (`molax/models/gat.py`)
+  - `UncertaintyGAT` model with multi-head attention for adaptive neighbor weighting
+  - `GATConfig` with configurable n_heads, edge_features, attention_dropout_rate
+  - `GATAttention` and `GATLayer` components
+  - Training utilities: `train_gat_step`, `eval_gat_step`, `get_gat_uncertainties`
+  - Same API as `UncertaintyGCN`/`UncertaintyMPNN` for drop-in replacement
+  - Optional edge feature incorporation in attention computation
+  - Comprehensive tests and demo example
+
 - **Message Passing Neural Network (MPNN)** (`molax/models/mpnn.py`)
   - `UncertaintyMPNN` model that leverages edge features (bond information)
   - `MPNNConfig` with configurable aggregation (sum, mean, max)
```

CLAUDE.md

Lines changed: 25 additions & 1 deletion
````diff
@@ -27,6 +27,7 @@ mkdocs serve
 python examples/simple_active_learning.py
 python examples/active_learning_benchmark.py
 python examples/mpnn_demo.py
+python examples/gat_demo.py
 python examples/ensemble_demo.py
 python examples/evidential_demo.py
 ```
@@ -66,7 +67,7 @@ SMILES string
 jraph.GraphsTuple (single molecule)
   ↓ jraph.batch()
 jraph.GraphsTuple (batched - all molecules as one graph)
-  ↓ UncertaintyGCN / UncertaintyMPNN / DeepEnsemble / EvidentialGCN
+  ↓ UncertaintyGCN / UncertaintyMPNN / UncertaintyGAT / DeepEnsemble / EvidentialGCN
 (mean, variance) predictions
 ```
 
@@ -76,6 +77,7 @@ jraph.GraphsTuple (batched - all molecules as one graph)
 |------|---------|
 | `molax/models/gcn.py` | `GCNConfig`, `UncertaintyGCN`, `MolecularGCN`, `train_step`, `eval_step` |
 | `molax/models/mpnn.py` | `MPNNConfig`, `UncertaintyMPNN` for edge-aware message passing |
+| `molax/models/gat.py` | `GATConfig`, `UncertaintyGAT` for attention-based message passing |
 | `molax/models/ensemble.py` | `EnsembleConfig`, `DeepEnsemble` for ensemble uncertainty |
 | `molax/models/evidential.py` | `EvidentialConfig`, `EvidentialGCN` for evidential uncertainty |
 | `molax/utils/data.py` | `MolecularDataset`, `smiles_to_jraph`, `batch_graphs` |
@@ -153,6 +155,27 @@ model = UncertaintyMPNN(config, rngs=nnx.Rngs(0))
 mean, variance = model(batched_graphs, training=False)
 ```
 
+### GAT API
+
+```python
+from molax.models.gat import GATConfig, UncertaintyGAT
+
+config = GATConfig(
+    node_features=6,
+    edge_features=1,  # Optional: include edge features in attention
+    hidden_features=[64, 64],
+    out_features=1,
+    n_heads=4,  # Multi-head attention
+    dropout_rate=0.1,
+    attention_dropout_rate=0.1,
+    negative_slope=0.2,  # LeakyReLU slope
+)
+model = UncertaintyGAT(config, rngs=nnx.Rngs(0))
+
+# Same API as UncertaintyGCN/UncertaintyMPNN - uses attention for aggregation
+mean, variance = model(batched_graphs, training=False)
+```
+
 ### Calibration Metrics
 
 ```python
@@ -186,6 +209,7 @@ optimizer.update(model, grads)
 pytest tests/ -v                    # All tests
 pytest tests/test_gcn.py -v         # GCN model tests
 pytest tests/test_mpnn.py -v        # MPNN model tests
+pytest tests/test_gat.py -v         # GAT model tests
 pytest tests/test_ensemble.py -v    # Ensemble tests
 pytest tests/test_evidential.py -v  # Evidential tests
 pytest tests/test_acquisition.py -v # Acquisition tests
````

docs/roadmap.md

Lines changed: 23 additions & 24 deletions
````diff
@@ -423,7 +423,9 @@ embeddings = model.extract_embeddings(batched_graphs)
 
 ---
 
-### 3.2 Graph Attention Network (GAT)
+### 3.2 Graph Attention Network (GAT) ✅
+
+**Status:** Implemented in `molax/models/gat.py`
 
 **What:** Learn edge importance dynamically via attention mechanism.
 
@@ -433,35 +435,32 @@ embeddings = model.extract_embeddings(batched_graphs)
 
 ```python
 # molax/models/gat.py
+from molax.models.gat import GATConfig, UncertaintyGAT
 
-class GATLayer(nnx.Module):
-    def __init__(self, in_features, out_features, n_heads=4, rngs=None):
-        self.n_heads = n_heads
-        self.head_dim = out_features // n_heads
-
-        self.W = nnx.Linear(in_features, out_features, rngs=rngs)
-        self.attention = nnx.Linear(2 * self.head_dim, 1, rngs=rngs)
-
-    def __call__(self, graphs):
-        nodes = self.W(graphs.nodes)  # (N, out_features)
-        nodes = nodes.reshape(-1, self.n_heads, self.head_dim)
+config = GATConfig(
+    node_features=6,
+    edge_features=1,  # Optional: include edge features in attention
+    hidden_features=[64, 64],
+    out_features=1,
+    n_heads=4,
+    dropout_rate=0.1,
+    attention_dropout_rate=0.1,
+    negative_slope=0.2,
+)
+model = UncertaintyGAT(config, rngs=nnx.Rngs(0))
 
-        # Attention coefficients
-        src = nodes[graphs.senders]
-        dst = nodes[graphs.receivers]
-        e = self.attention(jnp.concatenate([src, dst], axis=-1))
-        alpha = jraph.segment_softmax(e, graphs.receivers, len(nodes))
+# Same API as UncertaintyGCN/UncertaintyMPNN
+mean, variance = model(batched_graphs, training=False)
 
-        # Aggregate with attention
-        messages = alpha * src
-        out = jraph.segment_sum(messages, graphs.receivers, len(nodes))
-        return out.reshape(-1, self.n_heads * self.head_dim)
+# Extract embeddings for Core-Set selection
+embeddings = model.extract_embeddings(batched_graphs)
 ```
 
 **Acceptance Criteria:**
-- [ ] Multi-head attention implementation
-- [ ] Edge feature incorporation option
-- [ ] Dropout on attention weights
+- [x] Multi-head attention implementation
+- [x] Edge feature incorporation option
+- [x] Dropout on attention weights
+- [x] Same API as UncertaintyGCN/UncertaintyMPNN for acquisition function compatibility
 
 ---
````
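The commit message also mentions "MC Dropout uncertainty via get_gat_uncertainties()". The general technique behind that utility can be sketched with a toy stochastic model in numpy (the model and function names here are illustrative stand-ins, not molax code): keep dropout active at inference, run several forward passes, and read the spread of the predictions as uncertainty.

```python
import numpy as np

rng = np.random.default_rng(0)

def noisy_forward(x, drop_rate=0.1):
    """Toy forward pass with dropout kept active (inverted-dropout scaling)."""
    mask = rng.random(x.shape) >= drop_rate
    return (x * mask / (1.0 - drop_rate)).sum(axis=-1)

def mc_dropout(x, n_samples=200):
    """Predictive mean and variance from repeated stochastic forward passes."""
    samples = np.stack([noisy_forward(x) for _ in range(n_samples)])
    return samples.mean(axis=0), samples.var(axis=0)

x = np.ones((4, 8))  # 4 "molecules", 8 features each
mean, var = mc_dropout(x)
```

The sample mean converges to the deterministic prediction while the sample variance stays strictly positive, reflecting the model's epistemic spread under dropout noise.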

examples/gat_demo.py

Lines changed: 190 additions & 0 deletions
```python
"""GAT Demo: Attention-based molecular property prediction.

This script demonstrates how to use the Graph Attention Network (GAT)
for molecular property prediction on the ESOL dataset. GAT uses learned
attention weights to dynamically weight neighbor contributions.
"""

from pathlib import Path

import flax.nnx as nnx
import jax.numpy as jnp
import jraph

from molax.models.gat import (
    GATConfig,
    UncertaintyGAT,
    create_gat_optimizer,
    eval_gat_step,
    train_gat_step,
)
from molax.models.gcn import GCNConfig, UncertaintyGCN, create_train_state, train_step
from molax.models.mpnn import (
    MPNNConfig,
    UncertaintyMPNN,
    create_mpnn_optimizer,
    train_mpnn_step,
)
from molax.utils.data import MolecularDataset

# Configuration
DATASET_PATH = Path(__file__).parent.parent / "datasets" / "esol.csv"
N_EPOCHS = 100
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-4
MAX_GRAD_NORM = 1.0

print("=" * 60)
print("GAT Demo: Attention-Based Molecular Property Prediction")
print("=" * 60)

# Load dataset
print("\nLoading ESOL dataset...")
dataset = MolecularDataset(DATASET_PATH)
train_data, test_data = dataset.split(test_size=0.2, seed=42)
print(f"Train: {len(train_data)} molecules, Test: {len(test_data)} molecules")

# Show feature info
sample_graph = train_data.graphs[0]
print("\nGraph features:")
print(f"  Node features: {sample_graph.nodes.shape[1]} (atom properties)")
print(f"  Edge features: {sample_graph.edges.shape[1]} (bond type)")

# Batch all data
print("\nBatching data...")
all_train_graphs = jraph.batch(train_data.graphs)
all_train_labels = train_data.labels
all_test_graphs = jraph.batch(test_data.graphs)
all_test_labels = test_data.labels

n_train = len(train_data)
n_test = len(test_data)
train_mask = jnp.ones(n_train, dtype=bool)
test_mask = jnp.ones(n_test, dtype=bool)

# Create GAT model
print("\nCreating GAT model...")
config = GATConfig(
    node_features=train_data.n_node_features,
    edge_features=1,  # Use bond type in attention
    hidden_features=[64, 64],
    out_features=1,
    n_heads=4,
    dropout_rate=0.1,
    attention_dropout_rate=0.1,
    negative_slope=0.2,
)
model = UncertaintyGAT(config, rngs=nnx.Rngs(0))
optimizer = create_gat_optimizer(
    model,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    max_grad_norm=MAX_GRAD_NORM,
)

print(f"  Hidden layers: {config.hidden_features}")
print(f"  Attention heads: {config.n_heads}")
print(f"  Edge features in attention: {config.edge_features > 0}")
print(f"  Dropout rate: {config.dropout_rate}")
print(f"  Attention dropout: {config.attention_dropout_rate}")
print(f"  Weight decay: {WEIGHT_DECAY}")

# Training loop
print("\nTraining GAT...")
print("-" * 40)

for epoch in range(N_EPOCHS):
    # Training step
    train_loss = train_gat_step(
        model, optimizer, all_train_graphs, all_train_labels, train_mask
    )

    # Evaluation every 20 epochs
    if (epoch + 1) % 20 == 0:
        test_mse, _ = eval_gat_step(model, all_test_graphs, all_test_labels, test_mask)
        test_rmse = jnp.sqrt(test_mse)
        print(
            f"Epoch {epoch + 1:3d}: Train Loss = {float(train_loss):.4f}, "
            f"Test RMSE = {float(test_rmse):.4f}"
        )

# Final evaluation
print("-" * 40)
test_mse, predictions = eval_gat_step(
    model, all_test_graphs, all_test_labels, test_mask
)
test_rmse = jnp.sqrt(test_mse)

# Get predictions with uncertainty
mean, variance = model(all_test_graphs, training=False)
mean = mean.squeeze(-1)
variance = variance.squeeze(-1)

print("\nFinal Results:")
print(f"  Test RMSE: {float(test_rmse):.4f}")
print(f"  Mean predicted variance: {float(jnp.mean(variance[:n_test])):.4f}")

# Show some predictions
print("\nSample predictions (first 5 test molecules):")
print(f"{'Actual':>10} {'Predicted':>10} {'Std Dev':>10}")
for i in range(min(5, n_test)):
    actual = float(all_test_labels[i])
    pred = float(mean[i])
    std = float(jnp.sqrt(variance[i]))
    print(f"{actual:>10.3f} {pred:>10.3f} {std:>10.3f}")

# Compare with GCN and MPNN
print("\n" + "=" * 60)
print("Comparing GAT with GCN and MPNN...")
print("=" * 60)

# Train GCN
gcn_config = GCNConfig(
    node_features=train_data.n_node_features,
    hidden_features=[64, 64],
    out_features=1,
    dropout_rate=0.1,
)
gcn = UncertaintyGCN(gcn_config, rngs=nnx.Rngs(0))
gcn_optimizer = create_train_state(gcn, learning_rate=LEARNING_RATE)

print("\nTraining GCN for comparison...")
for epoch in range(N_EPOCHS):
    train_step(gcn, gcn_optimizer, all_train_graphs, all_train_labels, train_mask)

gcn_mean, _ = gcn(all_test_graphs, training=False)
gcn_mse = jnp.mean((gcn_mean.squeeze(-1)[:n_test] - all_test_labels[:n_test]) ** 2)
gcn_rmse = jnp.sqrt(gcn_mse)

# Train MPNN
mpnn_config = MPNNConfig(
    node_features=train_data.n_node_features,
    edge_features=1,
    hidden_features=[64, 64],
    out_features=1,
    aggregation="sum",
    dropout_rate=0.1,
)
mpnn = UncertaintyMPNN(mpnn_config, rngs=nnx.Rngs(0))
mpnn_optimizer = create_mpnn_optimizer(mpnn, learning_rate=LEARNING_RATE)

print("Training MPNN for comparison...")
for epoch in range(N_EPOCHS):
    train_mpnn_step(
        mpnn, mpnn_optimizer, all_train_graphs, all_train_labels, train_mask
    )

mpnn_mean, _ = mpnn(all_test_graphs, training=False)
mpnn_mse = jnp.mean((mpnn_mean.squeeze(-1)[:n_test] - all_test_labels[:n_test]) ** 2)
mpnn_rmse = jnp.sqrt(mpnn_mse)

print("\n" + "-" * 40)
print("Model Comparison (Test RMSE):")
print("-" * 40)
print(f"  GCN:  {float(gcn_rmse):.4f}")
print(f"  MPNN: {float(mpnn_rmse):.4f}")
print(f"  GAT:  {float(test_rmse):.4f}")

print("\n" + "=" * 60)
print("GAT demo completed successfully!")
print("=" * 60)
```

molax/models/__init__.py

Lines changed: 18 additions & 0 deletions
```diff
@@ -16,6 +16,16 @@
     get_evidential_uncertainties,
     train_evidential_step,
 )
+from .gat import (
+    GATAttention,
+    GATConfig,
+    GATLayer,
+    UncertaintyGAT,
+    create_gat_optimizer,
+    eval_gat_step,
+    get_gat_uncertainties,
+    train_gat_step,
+)
 from .gcn import MolecularGCN, UncertaintyGCN
 from .mpnn import (
     MessageFunction,
@@ -45,6 +55,14 @@
     "train_evidential_step",
     "eval_evidential_step",
     "get_evidential_uncertainties",
+    "GATConfig",
+    "GATAttention",
+    "GATLayer",
+    "UncertaintyGAT",
+    "create_gat_optimizer",
+    "train_gat_step",
+    "eval_gat_step",
+    "get_gat_uncertainties",
     "MPNNConfig",
     "MessageFunction",
     "MessagePassingLayer",
```
