The ruvector-attention SDK provides high-level, ergonomic APIs for building attention mechanisms. It includes three main components:
- Builder API - Fluent interface for configuring attention
- Pipeline API - Composable operations with normalization and residuals
- Presets - Ready-to-use configurations for common models
use ruvector_attention::sdk::*;
// Create a simple multi-head attention
let attention = multi_head(768, 12)
.dropout(0.1)
.causal(true)
.build()?;
// Use it
let query = vec![0.5; 768];
let keys = vec![&query[..]; 10];
let values = vec![&query[..]; 10];
let output = attention.compute(&query, &keys, &values)?;

use ruvector_attention::sdk::presets::*;
// BERT-style attention
let bert = AttentionPreset::Bert.builder(768).build()?;
// GPT-style causal attention
let gpt = AttentionPreset::Gpt.builder(768).build()?;
// Flash attention for long sequences
let flash = AttentionPreset::FlashOptimized.builder(1024).build()?;
// Automatic selection based on sequence length
let auto = for_sequences(512, 8192).build()?;

use ruvector_attention::sdk::*;
// Create a transformer block
let attention = multi_head(768, 12).build()?;
let pipeline = AttentionPipeline::new()
.add_attention(attention)
.add_dropout(0.1)
.add_residual()
.add_norm(NormType::LayerNorm);
// Run the pipeline
let output = pipeline.run(&query, &keys, &values)?;

The fundamental attention mechanism: softmax(QK^T / √d)V
let attention = scaled_dot(512).build()?;

Parallel attention heads for diverse representation learning:
let attention = multi_head(768, 12)
.dropout(0.1)
.build()?;

Memory-efficient attention (O(n) memory) using tiled computation:
let attention = flash(1024, 128) // dim, block_size
.causal(true)
.build()?;

O(n) complexity using kernel feature maps:
let attention = linear(512, 256) // dim, num_features
.build()?;

Sliding window + global tokens (Longformer-style):
let attention = local_global(512, 256) // dim, window_size
.build()?;

Attention in hyperbolic space for hierarchical data:
let attention = hyperbolic(512, -1.0) // dim, curvature
.build()?;

Learned routing to specialized experts:
let attention = moe(512, 8, 2) // dim, num_experts, top_k
.expert_capacity(1.25)
.jitter_noise(0.01)
.build()?;

All builders support these common options:
let attention = AttentionBuilder::new(512)
.multi_head(8) // Number of heads
.dropout(0.1) // Dropout probability
.causal(true) // Causal masking
.expert_capacity(1.25) // MoE capacity factor
.jitter_noise(0.01) // MoE routing noise
.build()?;

let pipeline = AttentionPipeline::new()
.add_attention(attention)
.add_norm(NormType::LayerNorm)
.add_dropout(0.1)
.add_residual()
.add_custom(|x| {
// Custom transformation
x.iter().map(|v| v.max(0.0)).collect()
});

// Layer Normalization (standard)
.add_norm(NormType::LayerNorm)
// RMS Normalization (simpler)
.add_norm(NormType::RMSNorm)
// Batch Normalization
.add_norm(NormType::BatchNorm)

// Standard post-norm transformer block
let block = transformer_block(attention, 0.1);
// Pre-norm transformer block (more stable)
let block = prenorm_transformer_block(attention, 0.1);

// BERT (bidirectional, 12 heads, 0.1 dropout)
AttentionPreset::Bert.builder(768)
// GPT (causal, 12 heads, 0.1 dropout)
AttentionPreset::Gpt.builder(768)
// Longformer (512 window, local-global)
AttentionPreset::Longformer.builder(512)
// Performer (linear attention, O(n))
AttentionPreset::Performer.builder(512)
// Flash (memory-efficient, 128 block)
AttentionPreset::FlashOptimized.builder(1024)
// Switch Transformer (8 experts, top-2)
AttentionPreset::SwitchTransformer.builder(512)
// Hyperbolic (hierarchical data)
AttentionPreset::HyperbolicTree.builder(512)
// T5 (encoder-decoder)
AttentionPreset::T5.builder(768)
// Vision Transformer
AttentionPreset::ViT.builder(768)
// Sparse Transformer
AttentionPreset::SparseTransformer.builder(512)

The SDK provides intelligent preset selection:
// Automatic based on sequence length
let attention = for_sequences(512, max_len).build()?;
// ≤512: BERT
// ≤4096: Longformer
// >4096: Performer
// Graph attention
let attention = for_graphs(256, hierarchical).build()?;
// hierarchical=true: Hyperbolic
// hierarchical=false: Multi-head
// Large-scale processing
let attention = for_large_scale(1024).build()?;
// Uses Flash attention
// Vision tasks
let attention = for_vision(768, patch_size).build()?;
// Uses ViT configuration
// Autoregressive generation
let attention = for_generation(768, context_len).build()?;
// ≤2048: GPT
// >2048: Flash with causal
// MoE with custom routing
let attention = for_moe(512, num_experts, top_k).build()?;

// By model name (case-insensitive)
let bert = from_model_name("bert", 768)?;
let gpt = from_model_name("gpt2", 768)?;
let longformer = from_model_name("longformer", 512)?;
let t5 = from_model_name("t5", 768)?;
let vit = from_model_name("vit", 768)?;

use ruvector_attention::sdk::*;
fn create_transformer_layer(dim: usize, num_heads: usize) -> AttentionResult<AttentionPipeline> {
let attention = multi_head(dim, num_heads)
.dropout(0.1)
.build()?;
Ok(AttentionPipeline::new()
.add_norm(NormType::LayerNorm) // Pre-norm
.add_attention(attention)
.add_dropout(0.1)
.add_residual()
.add_norm(NormType::LayerNorm)) // Post-norm
}

use ruvector_attention::sdk::*;
fn create_long_context_attention(dim: usize, max_len: usize) -> AttentionResult<Box<dyn Attention>> {
if max_len <= 2048 {
// Standard attention for short sequences
multi_head(dim, 12).build()
} else if max_len <= 16384 {
// Local-global for medium sequences
local_global(dim, 512).build()
} else {
// Linear attention for very long sequences
linear(dim, dim / 4).build()
}
}

use ruvector_attention::sdk::*;
fn create_graph_attention(dim: usize, is_tree: bool) -> AttentionResult<Box<dyn Attention>> {
if is_tree {
// Use hyperbolic space for tree-like structures
hyperbolic(dim, -1.0).build()
} else {
// Standard attention for general graphs
multi_head(dim, 8).build()
}
}

use ruvector_attention::sdk::*;
fn create_hybrid_pipeline(dim: usize) -> AttentionResult<AttentionPipeline> {
// Local attention
let local = flash(dim, 128).build()?;
// Global attention (can be added in sequence)
let global = multi_head(dim, 8).build()?;
Ok(AttentionPipeline::new()
.add_attention(local)
.add_norm(NormType::LayerNorm)
.add_residual())
}

use ruvector_attention::sdk::*;
fn create_moe_attention(dim: usize) -> AttentionResult<Box<dyn Attention>> {
moe(dim, 16, 2) // 16 experts, route to top-2
.expert_capacity(1.5) // Higher capacity for load balancing
.jitter_noise(0.1) // Exploration during training
.build()
}

- Choose the right attention type:
- Short sequences (<512): Standard multi-head
- Medium sequences (512-4096): Local-global or Flash
- Long sequences (>4096): Linear or Performer
- Hierarchical data: Hyperbolic
- Specialized patterns: MoE
- Use Flash attention for:
- Long sequences
- Memory-constrained environments
- Training with limited GPU memory
- Use Linear attention for:
- Very long sequences (>16k tokens)
- Inference-only scenarios
- Real-time applications
- Use MoE for:
- Multi-task learning
- Specialized domain processing
- Scaling model capacity
- Pipeline optimization:
- Pre-norm is more stable for deep models
- RMSNorm is faster than LayerNorm
- Dropout during training only
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_attention_pipeline() {
let attention = multi_head(512, 8).build().unwrap();
let pipeline = AttentionPipeline::new()
.add_attention(attention)
.add_norm(NormType::LayerNorm);
let query = vec![0.5; 512];
let keys = vec![&query[..]; 10];
let values = vec![&query[..]; 10];
let output = pipeline.run(&query, &keys, &values).unwrap();
assert_eq!(output.len(), 512);
}
}

- See the examples/ directory for complete working examples
- Check the API documentation for detailed parameter descriptions
- Review benchmarks in benches/ for performance comparisons