Integrate 39 attention mechanisms from ruvector-attention into PostgreSQL, enabling attention-weighted vector search, transformer-style queries, and neural reranking directly in SQL.
┌──────────────────────────────────────────────────────────────────┐
│                       PostgreSQL Extension                       │
├──────────────────────────────────────────────────────────────────┤
│  ┌──────────────────────────────────────────────────────────┐    │
│  │                    Attention Registry                    │    │
│  │  ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────────────┐ │    │
│  │  │  Flash  │ │ Linear  │ │   MoE   │ │   Hyperbolic    │ │    │
│  │  └────┬────┘ └────┬────┘ └────┬────┘ └────────┬────────┘ │    │
│  └───────┼───────────┼───────────┼───────────────┼──────────┘    │
│          └───────────┴───────────┴───────────────┘               │
│                              ▼                                   │
│                ┌───────────────────────────┐                     │
│                │   SIMD-Accelerated Core   │                     │
│                │    (AVX-512/AVX2/NEON)    │                     │
│                └───────────────────────────┘                     │
└──────────────────────────────────────────────────────────────────┘
src/
├── attention/
│   ├── mod.rs                # Module exports & registry
│   ├── core/
│   │   ├── scaled_dot.rs     # Scaled dot-product attention
│   │   ├── multi_head.rs     # Multi-head attention
│   │   ├── flash.rs          # Flash Attention v2
│   │   └── linear.rs         # Linear attention O(n)
│   ├── graph/
│   │   ├── gat.rs            # Graph Attention
│   │   ├── gatv2.rs          # GATv2 (dynamic)
│   │   └── sparse.rs         # Sparse attention patterns
│   ├── specialized/
│   │   ├── moe.rs            # Mixture of Experts
│   │   ├── cross.rs          # Cross-attention
│   │   └── sliding.rs        # Sliding window
│   ├── hyperbolic/
│   │   ├── poincare.rs       # Poincaré attention
│   │   └── lorentz.rs        # Lorentzian attention
│   └── operators.rs          # PostgreSQL operators
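
Every mechanism is dispatched through a common trait object held by the registry (see `get_attention` in `mod.rs` below). The crate's exact trait isn't reproduced in this section; a plausible minimal shape, consistent with how `score` and `attention_scores` are invoked later, would be:

// Hypothetical minimal trait behind `get_attention` / `get_attention_impl`;
// the real ruvector-attention trait may carry more methods.
pub trait Attention: Send + Sync {
    /// Pairwise similarity, as used by `ruvector_attention_score`
    fn score(&self, query: &[f32], key: &[f32]) -> f32;

    /// Softmax-normalized scores of one query against a set of keys
    fn attention_scores(&self, query: &[f32], keys: &[&[f32]]) -> Vec<f32>;
}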
-- Create attention-weighted index
CREATE INDEX ON documents USING ruvector_attention (
embedding vector(768)
) WITH (
attention_type = 'flash',
num_heads = 8,
head_dim = 96
);
-- Attention-weighted search
SELECT id, content,
ruvector_attention_score(embedding, query_vec, 'scaled_dot') AS score
FROM documents
ORDER BY score DESC
LIMIT 10;
-- Multi-head attention search
SELECT * FROM ruvector_mha_search(
table_name := 'documents',
query := query_embedding,
num_heads := 8,
k := 10
);

-- Cross-attention between two tables (Q from queries, K/V from documents)
SELECT q.id AS query_id, d.id AS doc_id, score
FROM ruvector_cross_attention(
query_table := 'queries',
query_column := 'embedding',
document_table := 'documents',
document_column := 'embedding',
attention_type := 'scaled_dot'
) AS (query_id int, doc_id int, score float);
-- Mixture of Experts routing
SELECT id,
ruvector_moe_route(embedding, num_experts := 8, top_k := 2) AS expert_weights
FROM documents;
-- Sliding window attention for long sequences
SELECT * FROM ruvector_sliding_attention(
embeddings := embedding_array,
window_size := 256,
stride := 128
);

-- List available attention mechanisms
SELECT * FROM ruvector_attention_types();
-- Result:
-- | name              | complexity | best_for                    |
-- |-------------------|------------|-----------------------------|
-- | scaled_dot        | O(n²)      | Small sequences (<512)      |
-- | flash_v2          | O(n²)      | GPU, memory-efficient       |
-- | linear            | O(n)       | Long sequences (>4K)        |
-- | sparse            | O(n√n)     | Very long sequences         |
-- | gat               | O(E)       | Graph-structured data       |
-- | moe               | O(n*k)     | Conditional computation     |
-- | hyperbolic        | O(n²)      | Hierarchical data           |

// src/attention/core/scaled_dot.rs
use simsimd::SpatialSimilarity;
pub struct ScaledDotAttention {
scale: f32,
dropout: Option<f32>,
}
impl ScaledDotAttention {
pub fn new(head_dim: usize) -> Self {
Self {
scale: 1.0 / (head_dim as f32).sqrt(),
dropout: None,
}
}
/// Compute attention scores between query and keys
/// Returns softmax(Q·K^T / √d_k)
#[inline]
pub fn attention_scores(&self, query: &[f32], keys: &[&[f32]]) -> Vec<f32> {
let mut scores: Vec<f32> = keys.iter()
.map(|k| self.dot_product(query, k) * self.scale)
.collect();
softmax_inplace(&mut scores);
scores
}
/// SIMD-accelerated dot product
#[inline]
fn dot_product(&self, a: &[f32], b: &[f32]) -> f32 {
        // simsimd returns an f64 distance; cast it back, falling back to a
        // scalar loop if the lengths mismatch
        f32::dot(a, b)
            .map(|d| d as f32)
            .unwrap_or_else(|| a.iter().zip(b.iter()).map(|(x, y)| x * y).sum())
}
}
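
// `attention_scores` above relies on a softmax helper that is not shown in
// this excerpt. A minimal sketch of the assumed pair (the crate's real
// versions may differ): subtracting the max keeps exp() from overflowing.
#[inline]
fn softmax_inplace(scores: &mut [f32]) {
    let max = scores.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    let mut sum = 0.0;
    for s in scores.iter_mut() {
        *s = (*s - max).exp();
        sum += *s;
    }
    for s in scores.iter_mut() {
        *s /= sum;
    }
}

/// Allocating variant; the hyperbolic module further down assumes the same
#[inline]
fn softmax(scores: &[f32]) -> Vec<f32> {
    let mut out = scores.to_vec();
    softmax_inplace(&mut out);
    out
}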
// PostgreSQL function
#[pg_extern(immutable, parallel_safe)]
fn ruvector_attention_score(
query: Vec<f32>,
key: Vec<f32>,
attention_type: default!(&str, "'scaled_dot'"),
) -> f32 {
let attention = get_attention_impl(attention_type);
attention.score(&query, &key)
}

// src/attention/core/multi_head.rs
use rayon::prelude::*;
pub struct MultiHeadAttention {
num_heads: usize,
head_dim: usize,
    // Projection weights; `Matrix` is assumed to be a dense 2-D type
    // (e.g. ndarray's Array2<f32> behind a type alias)
    w_q: Matrix,
w_k: Matrix,
w_v: Matrix,
w_o: Matrix,
}
impl MultiHeadAttention {
pub fn forward(&self, query: &[f32], keys: &[&[f32]], values: &[&[f32]]) -> Vec<f32> {
// Project to heads
let q_heads = self.split_heads(&self.project(query, &self.w_q));
let k_heads: Vec<_> = keys.iter()
.map(|k| self.split_heads(&self.project(k, &self.w_k)))
.collect();
let v_heads: Vec<_> = values.iter()
.map(|v| self.split_heads(&self.project(v, &self.w_v)))
.collect();
// Attention per head (parallelizable)
let head_outputs: Vec<Vec<f32>> = (0..self.num_heads)
.into_par_iter()
.map(|h| {
let scores = self.attention_scores(&q_heads[h], &k_heads, h);
self.weighted_sum(&scores, &v_heads, h)
})
.collect();
// Concatenate and project
let concat = self.concat_heads(&head_outputs);
self.project(&concat, &self.w_o)
}
}
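
// The reshaping helpers used by `forward` are not shown in this excerpt.
// A sketch, assuming each head is a contiguous `head_dim`-sized slice of
// the projected vector:
impl MultiHeadAttention {
    fn split_heads(&self, x: &[f32]) -> Vec<Vec<f32>> {
        x.chunks(self.head_dim).map(|head| head.to_vec()).collect()
    }

    fn concat_heads(&self, heads: &[Vec<f32>]) -> Vec<f32> {
        heads.iter().flatten().copied().collect()
    }
}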
// PostgreSQL aggregate for batch attention
#[pg_extern]
fn ruvector_mha_search(
table_name: &str,
query: Vec<f32>,
num_heads: default!(i32, 8),
k: default!(i32, 10),
) -> TableIterator<'static, (name!(id, i64), name!(score, f32))> {
    // Implementation using SPI elided in this excerpt
    unimplemented!()
}

// src/attention/core/flash.rs
/// Flash Attention v2 - memory-efficient attention
/// Processes attention in blocks to minimize memory bandwidth
pub struct FlashAttention {
    head_dim: usize,
    block_size_q: usize,
    block_size_kv: usize,
    scale: f32,
}
impl FlashAttention {
    /// Tiled attention computation using an online softmax
    /// Extra memory: O(n) running statistics instead of the full O(n²) score matrix
pub fn forward(
&self,
q: &[f32], // [seq_len, head_dim]
k: &[f32], // [seq_len, head_dim]
v: &[f32], // [seq_len, head_dim]
) -> Vec<f32> {
let seq_len = q.len() / self.head_dim;
let mut output = vec![0.0; q.len()];
let mut row_max = vec![f32::NEG_INFINITY; seq_len];
let mut row_sum = vec![0.0; seq_len];
// Process in blocks
for q_block in (0..seq_len).step_by(self.block_size_q) {
for kv_block in (0..seq_len).step_by(self.block_size_kv) {
self.process_block(
q, k, v,
q_block, kv_block,
&mut output, &mut row_max, &mut row_sum
);
}
}
        // Normalize each row by its accumulated softmax denominator
        for (i, &sum) in row_sum.iter().enumerate() {
            for o in &mut output[i * self.head_dim..(i + 1) * self.head_dim] {
                *o /= sum;
            }
        }
        output
}
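
    /// Sketch of the per-block online-softmax update assumed by `forward`
    /// (hypothetical helper; a production kernel would use the SIMD dot
    /// product). Tracks a running max per query row and rescales what has
    /// already been accumulated whenever the max grows.
    fn process_block(
        &self,
        q: &[f32], k: &[f32], v: &[f32],
        q_start: usize, kv_start: usize,
        output: &mut [f32], row_max: &mut [f32], row_sum: &mut [f32],
    ) {
        let d = self.head_dim;
        let seq_len = row_max.len();
        let kv_end = (kv_start + self.block_size_kv).min(seq_len);
        for i in q_start..(q_start + self.block_size_q).min(seq_len) {
            let qi = &q[i * d..(i + 1) * d];
            // Raw scaled scores of query row i against this key block
            let scores: Vec<f32> = (kv_start..kv_end)
                .map(|j| {
                    let kj = &k[j * d..(j + 1) * d];
                    qi.iter().zip(kj).map(|(a, b)| a * b).sum::<f32>() * self.scale
                })
                .collect();
            let block_max = scores.iter().fold(row_max[i], |m, &s| m.max(s));
            // Rescale everything accumulated under the previous max
            let correction = (row_max[i] - block_max).exp();
            row_sum[i] *= correction;
            for o in &mut output[i * d..(i + 1) * d] {
                *o *= correction;
            }
            row_max[i] = block_max;
            // Accumulate this block's exp-weighted values
            for (j, &s) in (kv_start..kv_end).zip(scores.iter()) {
                let w = (s - block_max).exp();
                row_sum[i] += w;
                let vj = &v[j * d..(j + 1) * d];
                for (o, x) in output[i * d..(i + 1) * d].iter_mut().zip(vj) {
                    *o += w * x;
                }
            }
        }
    }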
}

// src/attention/graph/gat.rs
use rayon::prelude::*;
/// Graph Attention Network layer
pub struct GATLayer {
num_heads: usize,
in_features: usize,
out_features: usize,
attention_weights: Vec<Vec<f32>>, // [num_heads, 2 * out_features]
leaky_relu_slope: f32,
}
impl GATLayer {
/// Compute attention coefficients for graph edges
pub fn forward(
&self,
node_features: &[Vec<f32>], // [num_nodes, in_features]
edge_index: &[(usize, usize)], // [(src, dst), ...]
) -> Vec<Vec<f32>> {
// Transform features
let h = self.linear_transform(node_features);
// Compute attention for each edge
let edge_attention: Vec<Vec<f32>> = edge_index.par_iter()
.map(|(src, dst)| {
(0..self.num_heads)
.map(|head| self.edge_attention(head, &h[*src], &h[*dst]))
.collect()
})
.collect();
// Aggregate with attention weights
self.aggregate(&h, edge_index, &edge_attention)
}
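
    /// Per-edge coefficient as in GAT, e_ij = LeakyReLU(a · [h_i ‖ h_j]);
    /// a sketch of the helper assumed by `forward` above (names are ours)
    fn edge_attention(&self, head: usize, h_src: &[f32], h_dst: &[f32]) -> f32 {
        let a = &self.attention_weights[head]; // length 2 * out_features
        let (a_src, a_dst) = a.split_at(self.out_features);
        let e: f32 = a_src.iter().zip(h_src).map(|(w, x)| w * x).sum::<f32>()
            + a_dst.iter().zip(h_dst).map(|(w, x)| w * x).sum::<f32>();
        // LeakyReLU
        if e > 0.0 { e } else { self.leaky_relu_slope * e }
    }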
}
// PostgreSQL function for graph-based search
#[pg_extern]
fn ruvector_gat_search(
node_table: &str,
edge_table: &str,
query_node_id: i64,
num_heads: default!(i32, 4),
k: default!(i32, 10),
) -> TableIterator<'static, (name!(node_id, i64), name!(attention_score, f32))> {
    // Implementation elided in this excerpt
    unimplemented!()
}

// src/attention/hyperbolic/poincare.rs
/// Poincaré ball attention for hierarchical data
pub struct PoincareAttention {
    curvature: f32, // negative curvature K; the formulas below use c = -K > 0
head_dim: usize,
}
impl PoincareAttention {
/// Möbius addition in Poincaré ball
fn mobius_add(&self, x: &[f32], y: &[f32]) -> Vec<f32> {
let x_norm_sq = self.norm_sq(x);
let y_norm_sq = self.norm_sq(y);
let xy_dot = self.dot(x, y);
let c = -self.curvature;
let num_coef = 1.0 + 2.0 * c * xy_dot + c * y_norm_sq;
let denom = 1.0 + 2.0 * c * xy_dot + c * c * x_norm_sq * y_norm_sq;
x.iter().zip(y.iter())
.map(|(xi, yi)| (num_coef * xi + (1.0 - c * x_norm_sq) * yi) / denom)
.collect()
}
/// Hyperbolic distance
fn distance(&self, x: &[f32], y: &[f32]) -> f32 {
let diff = self.mobius_add(x, &self.negate(y));
let c = -self.curvature;
let norm = self.norm(&diff);
(2.0 / c.sqrt()) * (c.sqrt() * norm).atanh()
}
/// Attention in hyperbolic space
pub fn attention_scores(&self, query: &[f32], keys: &[&[f32]]) -> Vec<f32> {
let distances: Vec<f32> = keys.iter()
.map(|k| -self.distance(query, k)) // Negative distance as similarity
.collect();
softmax(&distances)
}
}
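
// Small Euclidean helpers assumed by the methods above (our sketch)
impl PoincareAttention {
    fn dot(&self, a: &[f32], b: &[f32]) -> f32 {
        a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
    }

    fn norm_sq(&self, x: &[f32]) -> f32 {
        self.dot(x, x)
    }

    fn norm(&self, x: &[f32]) -> f32 {
        self.norm_sq(x).sqrt()
    }

    fn negate(&self, x: &[f32]) -> Vec<f32> {
        x.iter().map(|v| -v).collect()
    }
}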
#[pg_extern(immutable, parallel_safe)]
fn ruvector_hyperbolic_distance(
a: Vec<f32>,
b: Vec<f32>,
curvature: default!(f32, 1.0),
) -> f32 {
    // SQL passes c > 0; the struct stores the (negative) curvature K = -c
    let attention = PoincareAttention::new(-curvature, a.len());
attention.distance(&a, &b)
}

// src/attention/specialized/moe.rs
/// Mixture of Experts with learned routing
pub struct MixtureOfExperts {
num_experts: usize,
top_k: usize,
gate: GatingNetwork,
experts: Vec<Expert>,
}
impl MixtureOfExperts {
/// Route input to top-k experts
pub fn forward(&self, input: &[f32]) -> Vec<f32> {
// Get routing weights
let gate_logits = self.gate.forward(input);
let (top_k_indices, top_k_weights) = self.top_k_gating(&gate_logits);
// Aggregate expert outputs
let mut output = vec![0.0; self.experts[0].output_dim()];
for (idx, weight) in top_k_indices.iter().zip(top_k_weights.iter()) {
let expert_output = self.experts[*idx].forward(input);
for (o, e) in output.iter_mut().zip(expert_output.iter()) {
*o += weight * e;
}
}
output
}
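
    /// Sketch of noiseless top-k gating assumed by `forward` (the crate may
    /// use noisy gating instead): keep the k largest logits and renormalize
    /// them with a numerically stable softmax.
    fn top_k_gating(&self, logits: &[f32]) -> (Vec<usize>, Vec<f32>) {
        let mut indices: Vec<usize> = (0..logits.len()).collect();
        // Sort expert indices by logit, descending, and keep the top k
        indices.sort_unstable_by(|&a, &b| logits[b].total_cmp(&logits[a]));
        indices.truncate(self.top_k);
        let max = logits[indices[0]];
        let exps: Vec<f32> = indices.iter().map(|&i| (logits[i] - max).exp()).collect();
        let sum: f32 = exps.iter().sum();
        let weights = exps.into_iter().map(|e| e / sum).collect();
        (indices, weights)
    }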
}
#[pg_extern]
fn ruvector_moe_route(
embedding: Vec<f32>,
num_experts: default!(i32, 8),
top_k: default!(i32, 2),
) -> pgrx::JsonB {
let moe = get_moe_model(num_experts as usize, top_k as usize);
let (indices, weights) = moe.route(&embedding);
pgrx::JsonB(serde_json::json!({
"expert_indices": indices,
"expert_weights": weights,
}))
}

// src/attention/mod.rs
pub enum AttentionType {
// Core
ScaledDot,
MultiHead { num_heads: usize },
FlashV2 { block_size: usize },
Linear,
// Graph
GAT { num_heads: usize },
GATv2 { num_heads: usize },
Sparse { pattern: SparsePattern },
// Specialized
MoE { num_experts: usize, top_k: usize },
Cross,
SlidingWindow { size: usize },
// Hyperbolic
Poincare { curvature: f32 },
Lorentz { curvature: f32 },
}
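
// Sketch of how the SQL-facing names ('flash', 'moe', ...) could map onto
// the enum; the defaults chosen here are assumptions, not the crate's API.
impl AttentionType {
    pub fn from_name(name: &str) -> Option<Self> {
        match name {
            "scaled_dot" => Some(Self::ScaledDot),
            "flash" | "flash_v2" => Some(Self::FlashV2 { block_size: 64 }),
            "linear" => Some(Self::Linear),
            "gat" => Some(Self::GAT { num_heads: 4 }),
            "moe" => Some(Self::MoE { num_experts: 8, top_k: 2 }),
            "cross" => Some(Self::Cross),
            "poincare" => Some(Self::Poincare { curvature: -1.0 }),
            _ => None,
        }
    }
}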
pub fn get_attention(attention_type: AttentionType) -> Box<dyn Attention> {
match attention_type {
        AttentionType::ScaledDot => Box::new(ScaledDotAttention::new(64)), // 64 = assumed default head_dim
AttentionType::FlashV2 { block_size } => Box::new(FlashAttention::new(block_size)),
        // ... remaining variants elided
        _ => unimplemented!(),
    }
}

// Use simsimd for all vector operations
use simsimd::{SpatialSimilarity, BinarySimilarity};
#[inline]
fn batched_dot_products(query: &[f32], keys: &[&[f32]]) -> Vec<f32> {
keys.iter()
        .map(|k| f32::dot(query, k).expect("dimension mismatch") as f32)
.collect()
}

// Contiguous memory for cache efficiency
pub struct AttentionCache {
    // Keys packed contiguously ([num_keys * head_dim]) for cache-friendly scans
keys: Vec<f32>, // [num_keys * head_dim]
values: Vec<f32>, // [num_keys * head_dim]
num_keys: usize,
head_dim: usize,
}

// Parallel attention across heads
let head_outputs: Vec<_> = (0..num_heads)
.into_par_iter()
.map(|h| compute_head_attention(h, query, keys, values))
    .collect();

| Operation       | Sequence Length | Heads | Time (μs) | Memory |
|-----------------|-----------------|-------|-----------|--------|
| ScaledDot       | 512             | 8     | 45        | 2 MB   |
| Flash           | 512             | 8     | 38        | 0.5 MB |
| Linear          | 4096            | 8     | 120       | 4 MB   |
| GAT             | 1000 nodes      | 4     | 85        | 1 MB   |
| MoE (8 experts) | 512             | 8     | 95        | 3 MB   |
[dependencies]
# Link to ruvector-attention for implementations
ruvector-attention = { path = "../ruvector-attention", optional = true }
# SIMD
simsimd = "5.9"
# Parallel processing
rayon = "1.10"
# Matrix operations (optional, for weight matrices)
ndarray = { version = "0.15", optional = true }

[features]
attention = []
attention-flash = ["attention"]
attention-graph = ["attention"]
attention-hyperbolic = ["attention"]
attention-moe = ["attention"]
attention-all = ["attention-flash", "attention-graph", "attention-hyperbolic", "attention-moe"]
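
Since every group unifies onto the base `attention` feature, enabling `attention-all` pulls in all four mechanism families; a typical build (assuming the standard cargo-pgrx workflow) is `cargo pgrx install --features attention-all`.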