diff --git a/blackwell/IMPLEMENTATION_PLAN.md b/blackwell/IMPLEMENTATION_PLAN.md
new file mode 100644
index 0000000000000..69e20e2e0dfa8
--- /dev/null
+++ b/blackwell/IMPLEMENTATION_PLAN.md
@@ -0,0 +1,392 @@
+# NVIDIA Blackwell GPU Architecture Support Implementation Plan
+
+## Overview
+
+This document outlines the implementation plan for adding comprehensive NVIDIA Blackwell GPU architecture support to llama.cpp. The plan is structured in phases to ensure systematic development, testing, and validation of Blackwell-specific optimizations.
+
+## Current State Analysis
+
+- **Compute Capability**: Currently supports up to Ada Lovelace (8.9)
+- **Blackwell Support**: [PR #13360](https://github.com/ggml-org/llama.cpp/pull/13360) adds CUDA 12.8 + sm120 build support
+- **Missing Features**: Thread Block Clusters, L2 cache management, HBM3/HBM3e optimizations
+- **Flash Attention**: Multiple kernel variants but no Blackwell-specific optimizations
+- **Compatibility**: Basic functionality works via backward compatibility, but performance is sub-optimal
+
+## Architecture Constants Update
+
+**Critical Finding**: Blackwell GPUs use compute capability **12.0** (sm120), not 10.0 as initially assumed.
+
+## Flash Attention Analysis
+
+llama.cpp implements multiple Flash Attention kernel variants:
+- **MMA-based kernels** (`fattn-mma-f16.cuh`): Modern implementation for Turing+ 
+- **Vector kernels** (`fattn-vec-f32/f16.cuh`): For smaller batches/specific dimensions
+- **WMMA kernels** (`fattn-wmma-f16.cu`): Legacy implementation for Volta
+- **Tile kernels** (`fattn-tile-f16/f32.cu`): For architectures without tensor cores
+
+Selection logic in `ggml_cuda_flash_attn_ext()` considers compute capability, batch size, head dimensions, and data types.
+
+## Phase 1: Foundation and Architecture Detection ⚡ **ACCELERATED**
+
+### 1.1 Add Blackwell Constants and Detection **✅ FOUNDATION COMPLETE**
+
+**Status**: Foundation provided by [PR #13360](https://github.com/ggml-org/llama.cpp/pull/13360)
+- ✅ CUDA 12.8 toolkit support
+- ✅ sm120 compilation target
+- ✅ Build system integration
+
+**Files to modify:**
+- `ggml/src/ggml-cuda/common.cuh`
+- `ggml/src/ggml-cuda/ggml-cuda.cu`
+
+**Updated Implementation:**
+```cpp
+// Add to common.cuh - CORRECTED for actual Blackwell compute capability
+#define GGML_CUDA_CC_BLACKWELL       1200  // B100/B200/RTX50 (12.0) - CORRECTED
+#define GGML_CUDA_CC_BLACKWELL_FUTURE 1300  // Future Blackwell variants
+
+#define GGML_CUDA_CC_IS_BLACKWELL(cc) (cc >= GGML_CUDA_CC_BLACKWELL && cc < GGML_CUDA_CC_BLACKWELL_FUTURE)
+#define GGML_CUDA_CC_SUPPORTS_CLUSTERS(cc) (cc >= GGML_CUDA_CC_BLACKWELL)
+```
+
+**Timeline:** ~~Week 1-2~~ **COMPLETE** ✅
+
+### 1.2 Enhanced Device Information Structure
+
+**Files to modify:**
+- `ggml/src/ggml-cuda/ggml-cuda.cu` (cuda_device_info)
+
+**New fields:**
+- `max_clusters_per_multiprocessor`
+- `max_blocks_per_cluster` 
+- `l2_cache_size`
+- `hbm_bandwidth`
+
+**Updated Timeline:** Week 1 ⚡ (accelerated due to build foundation)
+
+### 1.3 Blackwell Feature Detection **NEW**
+
+**Files to create:**
+- `ggml/src/ggml-cuda/blackwell-detect.cu`
+
+**Implementation:**
+```cpp
+bool ggml_cuda_supports_blackwell_features(int device_id) {
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, device_id);
+    
+    // Verify compute capability 12.0+
+    int cc = 100 * prop.major + 10 * prop.minor;
+    if (!GGML_CUDA_CC_IS_BLACKWELL(cc)) return false;
+    
+    // Verify cluster support
+    int max_cluster_size;
+    cudaDeviceGetAttribute(&max_cluster_size, 
+        cudaDevAttrClusterLaunch, device_id);
+    
+    return max_cluster_size > 0;
+}
+```
+
+**Timeline:** Week 1-2
+
+## Phase 2: Thread Block Clusters Foundation
+
+### 2.1 Cluster Detection and Support Infrastructure
+
+**Files to create:**
+- `ggml/src/ggml-cuda/clusters.cuh`
+- `ggml/src/ggml-cuda/clusters.cu`
+
+**Key functions:**
+- `ggml_cuda_cluster_occupancy()`
+- `ggml_cuda_launch_kernel_clusters()`
+- `ggml_cuda_cluster_sync_init()`
+
+**Updated Timeline:** Week 2-3 ⚡ (accelerated)
+
+### 2.2 L2 Cache Management
+
+**Files to modify:**
+- `ggml/src/ggml-cuda/ggml-cuda.cu`
+
+**Implementation:**
+- L2 cache persistence API wrappers
+- Cache allocation strategy for KV cache data
+- Stream-based cache management
+
+**Updated Timeline:** Week 3-4 ⚡ (accelerated)
+
+## Phase 3: Flash Attention Blackwell Optimizations
+
+### 3.1 MMA Kernel Enhancements for Blackwell
+
+**Files to modify:**
+- `ggml/src/ggml-cuda/fattn-mma-f16.cuh`
+- `ggml/src/ggml-cuda/fattn-common.cuh`
+
+**Key optimizations:**
+
+#### 3.1.1 Enhanced Shared Memory Usage
+```cpp
+// Leverage 228 KB shared memory per SM vs 164 KB on Ada Lovelace
+template<int DKQ, int DV>
+struct fattn_blackwell_config : fattn_mma_f16_config<DKQ, DV> {
+    static constexpr int cc_target = GGML_CUDA_CC_BLACKWELL; // 1200
+    static constexpr int smpb_blackwell = 228 * 1024; // 228 KB
+    static constexpr int enhanced_batch_size = smpb_blackwell / (DKQ * sizeof(half));
+    
+    // Increase tile sizes for better cache utilization
+    static constexpr int nbatch_fa_blackwell = std::min(enhanced_batch_size, 128);
+};
+```
+
+#### 3.1.2 Thread Block Cluster Integration
+```cpp
+template<int DKQ, int DV, int ncols1, int ncols2, int cluster_size>
+__cluster_dims__(cluster_size, 1, 1)
+__global__ void flash_attn_ext_f16_clustered(/* parameters */) {
+    // Distributed shared memory across cluster
+    extern __shared__ half2 cluster_shared_memory[];
+    
+    // Cluster-wide synchronization
+    cluster.sync();
+    
+    // Enhanced memory access patterns
+    // ...
+}
+```
+
+#### 3.1.3 L2 Cache-Aware KV Access
+```cpp
+// Optimize KV cache access patterns for L2 persistence
+template<typename T>
+__device__ void prefetch_kv_to_l2(const T* kv_data, size_t size) {
+    // Use cache hints for Blackwell L2 (126 MB vs 40 MB)
+    __builtin_nontemporal_store(); // Blackwell-specific hints
+}
+```
+
+**Updated Timeline:** Week 4-7 ⚡ (accelerated)
+
+### 3.2 Kernel Selection Logic Updates
+
+**Files to modify:**
+- `ggml/src/ggml-cuda/fattn.cu`
+
+**Enhanced selection for Blackwell:**
+```cpp
+void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    
+    if (GGML_CUDA_CC_IS_BLACKWELL(cc)) {
+        // Prefer cluster-based kernels for larger problems
+        if (can_use_clusters && problem_size_threshold_met) {
+            ggml_cuda_flash_attn_ext_mma_f16_clusters(ctx, dst);
+            return;
+        }
+        
+        // Use enhanced MMA kernels with Blackwell optimizations
+        ggml_cuda_flash_attn_ext_mma_f16_blackwell(ctx, dst);
+        return;
+    }
+    
+    // ... existing fallback logic ...
+}
+```
+
+**Updated Timeline:** Week 6-7 ⚡ (accelerated)
+
+### 3.3 Advanced Memory Access Optimizations
+
+#### 3.3.1 HBM3/HBM3e Bandwidth Optimization
+- Implement wider memory transactions (512-bit vs 256-bit)
+- Optimize memory coalescing patterns for higher bandwidth
+- Implement memory prefetching strategies
+
+#### 3.3.2 Async Copy Enhancements
+```cpp
+// Enhanced async copy for Blackwell
+template<int cluster_size>
+__device__ void async_copy_cluster_aware(
+    void* dst, const void* src, size_t bytes, 
+    cuda::barrier<cuda::thread_scope_cluster>& barrier) {
+    // Blackwell-optimized async copy with cluster coordination
+}
+```
+
+**Updated Timeline:** Week 8-9 ⚡ (accelerated)
+
+## Phase 4: Advanced Blackwell Features
+
+### 4.1 Distributed Shared Memory Implementation
+
+**Files to create:**
+- `ggml/src/ggml-cuda/distributed-shared-memory.cuh`
+
+**Key features:**
+- Cross-block shared memory access
+- Cluster-wide data sharing for attention heads
+- Optimized memory layout for distributed access
+
+**Updated Timeline:** Week 10-11 ⚡ (accelerated)
+
+### 4.2 Advanced Occupancy Management
+
+**Files to modify:**
+- `ggml/src/ggml-cuda/fattn-common.cuh`
+
+**Implementation:**
+- `cudaOccupancyMaxActiveClusters` integration
+- Dynamic cluster size selection
+- Load balancing across SMs
+
+**Updated Timeline:** Week 11-12 ⚡ (accelerated)
+
+### 4.3 Multi-Head Attention Cluster Optimization
+
+**New kernel variants:**
+- Cluster-aware multi-head processing
+- Cross-head data sharing via distributed shared memory
+- Optimized attention head grouping strategies
+
+**Updated Timeline:** Week 12-13 ⚡ (accelerated)
+
+## Phase 5: General CUDA Kernel Optimizations
+
+### 5.1 Matrix Operations Enhancement
+
+**Files to modify:**
+- `ggml/src/ggml-cuda/gemm.cu`
+- `ggml/src/ggml-cuda/mul-mat.cu`
+
+**Optimizations:**
+- Leverage 255 registers per thread with improved scheduling
+- Enhanced warp-level primitives for Blackwell
+- L2 cache persistence for weight matrices
+
+**Updated Timeline:** Week 14-15 ⚡ (accelerated)
+
+### 5.2 Attention-Adjacent Operations
+
+**Files to modify:**
+- `ggml/src/ggml-cuda/rope.cu` (Rotary Position Embedding)
+- `ggml/src/ggml-cuda/norm.cu` (Layer Normalization)
+
+**Optimizations:**
+- Thread block cluster integration where beneficial
+- Enhanced shared memory usage
+- Optimized memory access patterns
+
+**Updated Timeline:** Week 15-16 ⚡ (accelerated)
+
+## Phase 6: Performance Validation and Optimization
+
+### 6.1 Benchmarking Infrastructure
+
+**Files to create:**
+- `tools/blackwell-bench/`
+- Comprehensive benchmarking suite
+- Performance regression detection
+- A/B testing framework
+
+**Updated Timeline:** Week 17-18 ⚡ (accelerated)
+
+### 6.2 Performance Tuning
+
+**Focus areas:**
+- Kernel parameter auto-tuning
+- Dynamic optimization based on problem size
+- Memory allocation strategy optimization
+- Cache management tuning
+
+**Updated Timeline:** Week 18-20 ⚡ (accelerated)
+
+### 6.3 Integration Testing
+
+**Test scenarios:**
+- Various model architectures (Llama, Mistral, etc.)
+- Different sequence lengths and batch sizes
+- Mixed precision scenarios
+- Multi-GPU configurations
+
+**Updated Timeline:** Week 20-21 ⚡ (accelerated)
+
+## Phase 7: Documentation and Integration
+
+### 7.1 Documentation Updates
+
+**Files to create/modify:**
+- `docs/backend/BLACKWELL.md`
+- Update existing CUDA documentation
+- Code documentation and examples
+
+**Updated Timeline:** Week 22 ⚡ (accelerated)
+
+### 7.2 Build System Integration **⚡ FOUNDATION COMPLETE**
+
+**Status**: Core build system complete via [PR #13360](https://github.com/ggml-org/llama.cpp/pull/13360)
+
+**Remaining tasks:**
+- ✅ CUDA version detection (complete)
+- ✅ Blackwell-specific compilation flags (complete)
+- 🔄 Optional feature toggles for Blackwell optimizations
+
+**Updated Timeline:** Week 22 ⚡ (accelerated)
+
+## Updated Success Metrics
+
+### Performance Targets
+- **Flash Attention**: 20-40% improvement over Ada Lovelace
+- **Overall Inference**: 15-30% improvement in tokens/second
+- **Memory Efficiency**: Better utilization of 126 MB L2 cache
+- **Scalability**: Improved performance on larger context lengths
+
+### Validation Criteria
+- All existing tests pass
+- No performance regression on older architectures
+- Blackwell-specific optimizations activate correctly for compute capability 12.0+
+- Proper fallback behavior on non-Blackwell hardware
+
+## Updated Risk Mitigation
+
+### Technical Risks - REDUCED ⚡
+- ✅ **Build Infrastructure**: Resolved by PR #13360
+- ✅ **Compute Capability Detection**: Corrected to 12.0
+- 🔄 **Hardware Availability**: Still limited but build foundation ready
+- 🔄 **API Changes**: Version detection in place
+- 🔄 **Complexity**: Incremental implementation continues
+
+### Timeline Risks - MITIGATED ⚡
+- ✅ **Foundation Delays**: Eliminated by PR #13360
+- 🔄 **Scope Creep**: Strict phase gating maintained
+- 🔄 **Dependencies**: CUDA 12.8 foundation complete
+
+## Updated Timeline Summary
+
+**Original Timeline**: 24 weeks
+**Accelerated Timeline**: 22 weeks ⚡ (2-week acceleration)
+
+**Key Accelerations**:
+- Phase 1: Complete → Immediate start on Phase 2
+- Phase 2-7: 1-2 week acceleration per phase
+- Build system risks eliminated
+
+## Immediate Next Steps (Week 1)
+
+1. **Implement Phase 1.2**: Enhanced device information structure
+2. **Begin Phase 1.3**: Blackwell feature detection
+3. **Start Phase 2.1**: Cluster infrastructure development
+4. **Update all compute capability constants**: 1000 → 1200
+
+## Conclusion
+
+[PR #13360](https://github.com/ggml-org/llama.cpp/pull/13360) provides crucial foundation acceleration for our Blackwell implementation. The corrected compute capability (12.0) and completed build infrastructure allow us to begin advanced optimizations immediately.
+
+**Key Benefits**:
+- ⚡ **2-week timeline acceleration**
+- ✅ **Build foundation complete**
+- 🎯 **Accurate architecture targeting** (cc 12.0)
+- 🚀 **Immediate development start** capability
+
+The plan now reflects actual Blackwell specifications and leverages the completed foundation to achieve aggressive performance improvements while maintaining our systematic, phased approach. 
\ No newline at end of file
diff --git a/blackwell/README.md b/blackwell/README.md
new file mode 100644
index 0000000000000..0dc6545467c29
--- /dev/null
+++ b/blackwell/README.md
@@ -0,0 +1,185 @@
+# NVIDIA Blackwell GPU Architecture Support
+
+This folder contains the implementation plan and development roadmap for adding comprehensive NVIDIA Blackwell GPU architecture support to llama.cpp.
+
+## Contents
+
+- `IMPLEMENTATION_PLAN.md` - Detailed implementation plan with phases, timelines, and technical specifications
+- `TECHNICAL_SPECS.md` - Comprehensive technical specifications with code examples
+- `README.md` - This overview document
+
+## Overview
+
+NVIDIA Blackwell architecture introduces several key features that can significantly improve AI inference performance:
+
+- **Thread Block Clusters**: Enable cooperation between multiple thread blocks with distributed shared memory
+- **Enhanced L2 Cache**: 126 MB L2 cache (vs 40 MB on Ada Lovelace) with persistence control
+- **HBM3/HBM3e Memory**: Higher bandwidth memory subsystem
+- **Increased Shared Memory**: Up to 228 KB per SM (vs 164 KB on previous architectures)
+
+## Current Status
+
+### Build Foundation: ✅ **COMPLETE** (via [PR #13360](https://github.com/ggml-org/llama.cpp/pull/13360))
+
+- ✅ **CUDA 12.8 Support**: Required toolkit for Blackwell compilation
+- ✅ **sm120 Architecture Target**: Compute capability 12.0 (CORRECTED from initial 10.0 assumption)
+- ✅ **Build System Integration**: CMake and CI/CD ready for Blackwell
+
+### Architecture Implementation: 🔄 **IN PROGRESS**
+
+- 🔄 **Thread Block Clusters**: Advanced multi-block cooperation
+- 🔄 **Flash Attention Optimizations**: Blackwell-specific kernel variants
+- 🔄 **L2 Cache Management**: 126 MB cache utilization strategies
+- 🔄 **Memory Optimizations**: HBM3/HBM3e bandwidth improvements
+
+## Architecture Specifications
+
+### Compute Capability: **12.0** (CORRECTED)
+
+**Critical Update**: Blackwell GPUs use compute capability **12.0** (sm120), not 10.0 as initially assumed.
+
+**Supported Hardware**:
+- NVIDIA B100/B200 (Data Center)
+- NVIDIA RTX 50 Series (Consumer)
+- NVIDIA RTX 6000 Ada Generation successor
+
+### Key Features
+
+| Feature | Ada Lovelace | Blackwell | Improvement |
+|---------|-------------|-----------|------------|
+| **Compute Capability** | 8.9 | **12.0** | New Architecture |
+| **L2 Cache** | 40 MB | 126 MB | **3.1x larger** |
+| **Shared Memory/SM** | 164 KB | 228 KB | **39% increase** |
+| **Memory Type** | GDDR6X | HBM3/HBM3e | **Higher bandwidth** |
+| **Thread Block Clusters** | No | Yes | **New feature** |
+| **Max Cluster Size** | N/A | 8 (portable), 16 (non-portable) | **New capability** |
+
+## Implementation Roadmap
+
+### ⚡ **Accelerated Timeline** (22 weeks, reduced from 24)
+
+**Phase 1: Foundation** ✅ **ACCELERATED** (Week 1-2)
+- Build system complete via [PR #13360](https://github.com/ggml-org/llama.cpp/pull/13360)
+- Architecture detection (compute capability 12.0)
+- Device capability enumeration
+
+**Phase 2: Thread Block Clusters** (Week 2-4) ⚡
+- Cluster launch framework
+- Distributed shared memory
+- L2 cache management APIs
+
+**Phase 3: Flash Attention Optimizations** (Week 4-9) ⚡
+- Enhanced MMA kernels for Blackwell
+- Cluster-based attention computation
+- L2 cache-aware KV access patterns
+
+**Phase 4-7: Advanced Features & Validation** (Week 10-22) ⚡
+- Multi-head attention clustering
+- General kernel optimizations
+- Performance validation and tuning
+- Documentation and integration
+
+## Performance Targets
+
+Based on architectural improvements:
+
+- **Flash Attention**: 20-40% improvement over Ada Lovelace
+- **Overall Inference**: 15-30% improvement in tokens/second
+- **Memory Efficiency**: Better utilization of 126 MB L2 cache
+- **Scalability**: Improved performance on larger context lengths (8K+ tokens)
+
+## Development Status
+
+### Foundation Layer: ✅ **COMPLETE**
+
+```cpp
+// Blackwell detection (compute capability 12.0)
+#define GGML_CUDA_CC_BLACKWELL 1200
+#define GGML_CUDA_CC_IS_BLACKWELL(cc) (cc >= 1200 && cc < 1300)
+```
+
+### Current Development Priorities
+
+1. **Enhanced Device Information** (Week 1)
+   - Cluster capability detection
+   - L2 cache size enumeration
+   - HBM3 bandwidth detection
+
+2. **Thread Block Clusters Framework** (Week 2-3)
+   - Cluster launch utilities
+   - Occupancy calculation
+   - Distributed shared memory management
+
+3. **Flash Attention Blackwell Kernels** (Week 4-7)
+   - Enhanced tile sizes (228 KB shared memory)
+   - Cluster-aware attention computation
+   - L2 cache persistence for KV data
+
+## Hardware Requirements
+
+### Minimum Requirements
+- NVIDIA Blackwell GPU (B100/B200/RTX50 series)
+- CUDA Toolkit 12.8+
+- CUDA Driver supporting compute capability 12.0
+
+### Recommended Development Environment
+- Multiple Blackwell GPUs for cluster testing
+- High-memory configuration for large context validation
+- CUDA Toolkit 12.8 or newer
+
+## Quick Start
+
+### Building with Blackwell Support
+
+```bash
+# Ensure CUDA 12.8+ is installed
+cmake -B build -DGGML_CUDA=ON
+cmake --build build
+```
+
+The build system automatically detects Blackwell capabilities and includes sm120 architecture if CUDA 12.8+ is available.
+
+### Runtime Detection
+
+```cpp
+// Check for Blackwell support
+const int cc = ggml_cuda_info().devices[0].cc;
+if (GGML_CUDA_CC_IS_BLACKWELL(cc)) {
+    // Blackwell optimizations available
+    printf("Blackwell GPU detected (compute capability %.1f)\n", cc / 100.0);
+}
+```
+
+## Contributing
+
+### Development Focus Areas
+
+1. **Thread Block Clusters**: Implementing cooperative multi-block kernels
+2. **Flash Attention**: Optimizing attention computation for Blackwell
+3. **Memory Management**: Leveraging 126 MB L2 cache effectively
+4. **Performance Analysis**: Benchmarking and validation frameworks
+
+### Testing Requirements
+
+- Access to Blackwell hardware for validation
+- Performance regression testing on older architectures
+- Memory usage analysis for large context scenarios
+- Cluster efficiency measurement tools
+
+## Documentation
+
+- **[IMPLEMENTATION_PLAN.md](IMPLEMENTATION_PLAN.md)**: Comprehensive 22-week development roadmap
+- **[TECHNICAL_SPECS.md](TECHNICAL_SPECS.md)**: Detailed technical specifications with code examples
+
+## References
+
+- [PR #13360](https://github.com/ggml-org/llama.cpp/pull/13360): CUDA 12.8 + sm120 build foundation
+- [NVIDIA Blackwell Architecture Whitepaper](https://developer.nvidia.com/blackwell-architecture)
+- [CUDA Programming Guide - Thread Block Clusters](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#thread-block-clusters)
+- [CUDA Toolkit 12.8 Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/)
+
+---
+
+**Status**: Foundation complete via [PR #13360](https://github.com/ggml-org/llama.cpp/pull/13360) ✅  
+**Timeline**: 22 weeks (accelerated from 24) ⚡  
+**Architecture**: Compute Capability 12.0 (corrected) 🎯 
\ No newline at end of file
diff --git a/blackwell/TECHNICAL_SPECS.md b/blackwell/TECHNICAL_SPECS.md
new file mode 100644
index 0000000000000..31216cedfaf88
--- /dev/null
+++ b/blackwell/TECHNICAL_SPECS.md
@@ -0,0 +1,416 @@
+# Blackwell GPU Architecture Technical Specifications
+
+## Architecture Constants and Detection
+
+### Compute Capability Constants
+
+**Status**: Build foundation provided by [PR #13360](https://github.com/ggml-org/llama.cpp/pull/13360)
+
+```cpp
+// ggml/src/ggml-cuda/common.cuh - CORRECTED IMPLEMENTATION
+#define GGML_CUDA_CC_BLACKWELL       1200  // B100/B200/RTX50 (12.0) - CORRECTED
+#define GGML_CUDA_CC_BLACKWELL_FUTURE 1300  // Future Blackwell variants
+
+#define GGML_CUDA_CC_IS_BLACKWELL(cc) (cc >= GGML_CUDA_CC_BLACKWELL && cc < GGML_CUDA_CC_BLACKWELL_FUTURE)
+#define GGML_CUDA_CC_SUPPORTS_CLUSTERS(cc) (cc >= GGML_CUDA_CC_BLACKWELL)
+
+// Backward compatibility check
+#define GGML_CUDA_CC_BLACKWELL_MIN    1200  // Minimum Blackwell compute capability
+```
+
+### Enhanced Device Information Structure
+
+```cpp
+// Enhanced cuda_device_info structure
+struct cuda_device_info {
+    int     cc;                     // compute capability (1200+ for Blackwell)
+    int     nsm;                    // number of streaming multiprocessors
+    size_t  smpb;                   // max. shared memory per block (228 KB)
+    size_t  smpbo;                  // max. shared memory per block with opt-in (227 KB)
+    
+    // Blackwell-specific fields
+    bool    supports_clusters;      // Thread Block Cluster support
+    int     max_cluster_size;       // Maximum portable cluster size (8)
+    int     max_cluster_size_np;    // Maximum non-portable cluster size (16)
+    size_t  l2_cache_size;         // L2 cache capacity (126 MB for GB200)
+    size_t  hbm_bandwidth;         // Memory bandwidth (HBM3/HBM3e)
+    bool    hbm3_support;          // HBM3/HBM3e memory type
+    
+    // Enhanced capabilities
+    int     max_registers_per_thread; // 255 registers per thread
+    size_t  max_shmem_cluster;     // Max shared memory per cluster
+    bool    distributed_shmem;     // Distributed shared memory support
+};
+```
+
+### Blackwell Feature Detection
+
+```cpp
+// ggml/src/ggml-cuda/blackwell-detect.cu
+bool ggml_cuda_supports_blackwell_features(int device_id) {
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, device_id));
+    
+    // Verify compute capability 12.0+
+    int cc = 100 * prop.major + 10 * prop.minor;
+    if (!GGML_CUDA_CC_IS_BLACKWELL(cc)) {
+        return false;
+    }
+    
+    // Verify cluster support
+    int max_cluster_size = 0;
+    cudaError_t err = cudaDeviceGetAttribute(&max_cluster_size, 
+        cudaDevAttrClusterLaunch, device_id);
+    
+    if (err != cudaSuccess || max_cluster_size == 0) {
+        return false;
+    }
+    
+    return true;
+}
+
+void ggml_cuda_init_blackwell_info(cuda_device_info* info, int device_id) {
+    if (!ggml_cuda_supports_blackwell_features(device_id)) {
+        info->supports_clusters = false;
+        return;
+    }
+    
+    info->supports_clusters = true;
+    
+    // Get cluster capabilities
+    CUDA_CHECK(cudaDeviceGetAttribute(&info->max_cluster_size, 
+        cudaDevAttrClusterLaunch, device_id));
+    
+    // Get L2 cache size
+    CUDA_CHECK(cudaDeviceGetAttribute((int*)&info->l2_cache_size, 
+        cudaDevAttrL2CacheSize, device_id));
+    
+    // Set Blackwell-specific defaults
+    info->max_cluster_size_np = 16;  // Non-portable limit
+    info->distributed_shmem = true;
+    info->hbm3_support = (info->cc >= GGML_CUDA_CC_BLACKWELL);
+}
+```
+
+## Thread Block Clusters Implementation
+
+### Cluster Launch Framework
+
+```cpp
+// ggml/src/ggml-cuda/clusters.cuh
+template<typename KernelFunc, typename... Args>
+cudaError_t ggml_cuda_launch_kernel_clusters(
+    KernelFunc kernel,
+    dim3 grid_dim,
+    dim3 block_dim, 
+    int cluster_size,
+    size_t shared_mem_bytes,
+    cudaStream_t stream,
+    Args... args) {
+    
+    // Verify cluster support
+    int device_id;
+    CUDA_CHECK(cudaGetDevice(&device_id));
+    
+    if (!ggml_cuda_supports_blackwell_features(device_id)) {
+        // Fallback to regular kernel launch
+        kernel<<<grid_dim, block_dim, shared_mem_bytes, stream>>>(args...);
+        return cudaGetLastError();
+    }
+    
+    // Configure cluster launch
+    cudaLaunchConfig_t config = {0};
+    config.gridDim = grid_dim;
+    config.blockDim = block_dim;
+    config.dynamicSmemBytes = shared_mem_bytes;
+    config.stream = stream;
+    
+    cudaLaunchAttribute attrs[1];
+    attrs[0].id = cudaLaunchAttributeClusterDimension;
+    attrs[0].val.clusterDim.x = cluster_size;
+    attrs[0].val.clusterDim.y = 1;
+    attrs[0].val.clusterDim.z = 1;
+    
+    config.attrs = attrs;
+    config.numAttrs = 1;
+    
+    return cudaLaunchKernelEx(&config, (void*)kernel, args...);
+}
+```
+
+### Cluster Occupancy Calculation
+
+```cpp
+// Enhanced occupancy calculation for clusters
+struct ggml_cuda_cluster_occupancy {
+    int blocks_per_cluster;
+    int clusters_per_sm;
+    int max_active_clusters;
+    int effective_occupancy;
+    size_t shared_mem_per_cluster;
+};
+
+ggml_cuda_cluster_occupancy ggml_cuda_calculate_cluster_occupancy(
+    const void* kernel_func,
+    int cluster_size,
+    int block_size,
+    size_t shared_mem_per_block) {
+    
+    ggml_cuda_cluster_occupancy result = {0};
+    
+    int device_id;
+    CUDA_CHECK(cudaGetDevice(&device_id));
+    
+    // Use CUDA's cluster occupancy API
+    CUDA_CHECK(cudaOccupancyMaxActiveClusters(
+        &result.max_active_clusters,
+        kernel_func,
+        block_size,
+        shared_mem_per_block,
+        cluster_size));
+    
+    result.blocks_per_cluster = cluster_size;
+    result.shared_mem_per_cluster = shared_mem_per_block * cluster_size;
+    
+    // Calculate effective occupancy
+    const cuda_device_info& info = ggml_cuda_info().devices[device_id];
+    result.clusters_per_sm = result.max_active_clusters / info.nsm;
+    result.effective_occupancy = result.clusters_per_sm * cluster_size;
+    
+    return result;
+}
+```
+
+## Flash Attention Blackwell Optimizations
+
+### Enhanced Configuration Structure
+
+```cpp
+// ggml/src/ggml-cuda/fattn-mma-f16.cuh
+template<int DKQ, int DV>
+struct fattn_blackwell_config : fattn_mma_f16_config<DKQ, DV> {
+    static constexpr int cc_target = GGML_CUDA_CC_BLACKWELL; // 1200
+    
+    // Enhanced shared memory (228 KB vs 164 KB on Ada Lovelace)
+    static constexpr int smpb_blackwell = 228 * 1024;
+    static constexpr int enhanced_batch_size = smpb_blackwell / (DKQ * sizeof(half));
+    
+    // Increased tile dimensions for better cache utilization  
+    static constexpr int nbatch_fa_blackwell = std::min(enhanced_batch_size, 128);
+    static constexpr int tile_size_multiplier = 2; // Leverage larger shared memory
+    
+    // Cluster-specific parameters
+    static constexpr int preferred_cluster_size = 4;  // Optimal for attention workloads
+    static constexpr int distributed_shmem_size = smpb_blackwell * preferred_cluster_size;
+    
+    // L2 cache optimization parameters
+    static constexpr size_t l2_cache_target = 126 * 1024 * 1024; // 126 MB
+    static constexpr float kv_cache_persistence_ratio = 0.8f;     // 80% persistence for KV data
+    
+    // HBM3 bandwidth optimization
+    static constexpr int memory_transaction_width = 512; // bits
+    static constexpr int coalescing_factor = 16;         // Enhanced coalescing
+};
+```
+
+### Cluster-Based Flash Attention Kernel
+
+```cpp
+template<int DKQ, int DV, int ncols1, int ncols2, int cluster_size, bool use_logit_softcap>
+__cluster_dims__(cluster_size, 1, 1)
+__launch_bounds__((DKQ/32) * cluster_size, 1)  // Account for cluster warps
+__global__ void flash_attn_ext_f16_blackwell_clusters(
+    const char * __restrict__ Q,
+    const char * __restrict__ K, 
+    const char * __restrict__ V,
+    const char * __restrict__ mask,
+    float * __restrict__ dst,
+    float2 * __restrict__ dst_meta,
+    // ... other parameters
+) {
+#if defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE) && (__CUDA_ARCH__ >= 1200)
+    
+    namespace cg = cooperative_groups;
+    
+    // Get cluster and block information
+    cg::cluster_group cluster = cg::this_cluster();
+    cg::thread_block block = cg::this_thread_block();
+    
+    const int cluster_rank = cluster.block_rank();
+    const int blocks_per_cluster = cluster.size();
+    
+    // Distributed shared memory allocation
+    extern __shared__ half2 cluster_shared_memory[];
+    half2* local_shmem = cluster_shared_memory;
+    half2* distributed_shmem = cluster.map_shared_rank(local_shmem, cluster_rank);
+    
+    // Enhanced attention computation with cluster coordination
+    typedef fattn_blackwell_config<DKQ, DV> config;
+    
+    // Load Q, K, V with cluster-aware distribution
+    load_qkv_cluster_distributed<config>(Q, K, V, distributed_shmem, cluster, block);
+    
+    // Cluster-wide synchronization
+    cluster.sync();
+    
+    // Compute attention with enhanced tile sizes
+    compute_attention_enhanced_tiles<config>(
+        distributed_shmem, mask, cluster, block);
+    
+    // Write results with coordinated memory access
+    write_attention_results<config>(dst, dst_meta, distributed_shmem, cluster, block);
+    
+#else
+    // Fallback for non-Blackwell architectures
+    NO_DEVICE_CODE;
+#endif
+}
+```
+
+### L2 Cache Management Integration
+
+```cpp
+// ggml/src/ggml-cuda/cache-mgmt.cuh
+class BlackwellL2Manager {
+private:
+    static constexpr size_t L2_SIZE = 126 * 1024 * 1024; // 126 MB
+    
+public:
+    static void set_kv_cache_persistence(void* kv_ptr, size_t kv_size) {
+        if (!ggml_cuda_supports_blackwell_features(ggml_cuda_get_device())) {
+            return; // Graceful fallback
+        }
+        
+        // Set high persistence ratio for KV cache data
+        constexpr float persist_ratio = 0.8f;
+        
+        cudaAccessProperty prop = {};
+        prop.location = cudaLocationTypeGlobal;
+        prop.range.base = kv_ptr;
+        prop.range.size = kv_size;
+        prop.ratio = persist_ratio;
+        
+        cudaStreamAttrValue streamAttr = {};
+        streamAttr.accessPolicyWindow = prop;
+        
+        cudaStream_t stream;
+        CUDA_CHECK(cudaStreamCreate(&stream));
+        CUDA_CHECK(cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &streamAttr));
+    }
+    
+    static void prefetch_to_l2(const void* data, size_t size, cudaStream_t stream) {
+        if (size > L2_SIZE / 4) {
+            // Too large for effective L2 caching
+            return;
+        }
+        
+        // Use memory advise for L2 prefetching
+        CUDA_CHECK(cudaMemAdvise(data, size, cudaMemAdviseSetPreferredLocation, 
+                                ggml_cuda_get_device()));
+    }
+};
+```
+
+### Enhanced Kernel Selection Logic
+
+```cpp
+// ggml/src/ggml-cuda/fattn.cu - Updated selection for Blackwell
+void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * KQV = dst;
+    const ggml_tensor * Q = dst->src[0];
+    const int device_id = ggml_cuda_get_device();
+    const int cc = ggml_cuda_info().devices[device_id].cc;
+    
+    // Blackwell-specific optimizations
+    if (GGML_CUDA_CC_IS_BLACKWELL(cc)) {
+        const size_t problem_size = Q->ne[0] * Q->ne[1] * Q->ne[2];
+        const bool large_context = Q->ne[1] > 2048;
+        const bool can_use_clusters = ggml_cuda_supports_blackwell_features(device_id);
+        
+        // Use cluster-based kernels for large problems
+        if (can_use_clusters && (large_context || problem_size > (64 * 64 * 32))) {
+            ggml_cuda_flash_attn_ext_mma_f16_clusters(ctx, dst);
+            return;
+        }
+        
+        // Use enhanced MMA kernels with Blackwell optimizations  
+        ggml_cuda_flash_attn_ext_mma_f16_blackwell(ctx, dst);
+        return;
+    }
+    
+    // ... existing fallback logic for older architectures ...
+}
+```
+
+## Build System Integration
+
+### CMake Configuration
+
+```cmake
+# ggml/src/ggml-cuda/CMakeLists.txt - COMPLETED IN PR #13360
+if (GGML_CUDA_CTK_VERSION VERSION_GREATER_EQUAL "12.8")
+    # Blackwell architecture support (compute capability 12.0)
+    list(APPEND GGML_CUDA_ARCHITECTURES "120-real")
+    
+    # Optional: Add virtual architecture for forward compatibility
+    list(APPEND GGML_CUDA_ARCHITECTURES "120-virtual")
+endif()
+```
+
+### Compile-Time Feature Detection
+
+```cpp
+// Automatic Blackwell feature detection at compile time
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1200)
+    #define BLACKWELL_AVAILABLE 1
+    #define CLUSTER_SUPPORT_AVAILABLE 1
+    #define ENHANCED_SHMEM_AVAILABLE 1
+#else
+    #define BLACKWELL_AVAILABLE 0
+    #define CLUSTER_SUPPORT_AVAILABLE 0  
+    #define ENHANCED_SHMEM_AVAILABLE 0
+#endif
+
+// Runtime feature toggles
+#define GGML_CUDA_BLACKWELL_CLUSTERS_ENABLED 1
+#define GGML_CUDA_BLACKWELL_L2_MGMT_ENABLED 1
+#define GGML_CUDA_BLACKWELL_ENHANCED_SHMEM_ENABLED 1
+```
+
+## Performance Monitoring and Validation
+
+### Blackwell-Specific Benchmarking
+
+```cpp
+// tools/blackwell-bench/blackwell-bench.cpp
+struct BlackwellBenchmarkResults {
+    float flash_attention_speedup;    // vs Ada Lovelace baseline
+    float l2_cache_hit_rate;         // L2 cache effectiveness
+    float cluster_efficiency;        // Cluster utilization
+    float memory_bandwidth_util;     // HBM3 bandwidth utilization
+    float register_efficiency;       // Register file utilization
+};
+
+BlackwellBenchmarkResults benchmark_blackwell_optimizations(
+    const ggml_tensor* Q, const ggml_tensor* K, const ggml_tensor* V) {
+    
+    BlackwellBenchmarkResults results = {};
+    
+    if (!GGML_CUDA_CC_IS_BLACKWELL(ggml_cuda_info().devices[0].cc)) {
+        return results; // Skip on non-Blackwell hardware
+    }
+    
+    // Benchmark cluster-based vs non-cluster kernels
+    results.flash_attention_speedup = benchmark_cluster_vs_standard();
+    
+    // Measure L2 cache effectiveness
+    results.l2_cache_hit_rate = measure_l2_cache_performance();
+    
+    // Evaluate cluster utilization
+    results.cluster_efficiency = measure_cluster_efficiency();
+    
+    return results;
+}
+```
+
+This technical specification provides the detailed implementation foundation for Blackwell support, building on the accelerated timeline enabled by [PR #13360](https://github.com/ggml-org/llama.cpp/pull/13360) and correcting the compute capability to the actual 12.0 specification. 
\ No newline at end of file
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index e59d61f195675..0d2be9abc3dab 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -247,6 +247,7 @@ struct cmd_params {
     std::vector<ggml_type>           type_v;
     std::vector<float>               defrag_thold;
     std::vector<int>                 n_threads;
+    std::vector<int>                 n_threads_batch;
     std::vector<std::string>         cpu_mask;
     std::vector<bool>                cpu_strict;
     std::vector<int>                 poll;
@@ -283,6 +284,7 @@ static const cmd_params cmd_params_defaults = {
     /* type_v               */ { GGML_TYPE_F16 },
     /* defrag_thold         */ { -1.0f },
     /* n_threads            */ { cpu_get_num_math() },
+    /* n_threads_batch      */ { cpu_get_num_math() },
     /* cpu_mask             */ { "0x0" },
     /* cpu_strict           */ { false },
     /* poll                 */ { 50 },
@@ -347,6 +349,8 @@ static void print_usage(int /* argc */, char ** argv) {
            join(cmd_params_defaults.defrag_thold, ",").c_str());
     printf("  -t, --threads <n>                         (default: %s)\n",
            join(cmd_params_defaults.n_threads, ",").c_str());
+    printf("  --n-threads-batch <n>                     (default: %s)\n",
+           join(cmd_params_defaults.n_threads_batch, ",").c_str());
     printf("  -C, --cpu-mask <hex,hex>                  (default: %s)\n",
            join(cmd_params_defaults.cpu_mask, ",").c_str());
     printf("  --cpu-strict <0|1>                        (default: %s)\n",
@@ -543,6 +547,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 }
                 auto p = parse_int_range(argv[i]);
                 params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
+            } else if (arg == "--n-threads-batch") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = parse_int_range(argv[i]);
+                params.n_threads_batch.insert(params.n_threads_batch.end(), p.begin(), p.end());
             } else if (arg == "-C" || arg == "--cpu-mask") {
                 if (++i >= argc) {
                     invalid_param = true;
@@ -882,6 +893,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.n_threads.empty()) {
         params.n_threads = cmd_params_defaults.n_threads;
     }
+    if (params.n_threads_batch.empty()) {
+        params.n_threads_batch = cmd_params_defaults.n_threads_batch;
+    }
     if (params.cpu_mask.empty()) {
         params.cpu_mask = cmd_params_defaults.cpu_mask;
     }
@@ -906,6 +920,7 @@ struct cmd_params_instance {
     ggml_type          type_v;
     float              defrag_thold;
     int                n_threads;
+    int                n_threads_batch;
     std::string        cpu_mask;
     bool               cpu_strict;
     int                poll;
@@ -1020,6 +1035,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & nkvo : params.no_kv_offload)
     for (const auto & fa : params.flash_attn)
     for (const auto & nt : params.n_threads)
+    for (const auto & ntb : params.n_threads_batch)
     for (const auto & cm : params.cpu_mask)
     for (const auto & cs : params.cpu_strict)
     for (const auto & nd : params.n_depth)
@@ -1039,6 +1055,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_v       = */ tv,
                 /* .defrag_thold = */ defrag_thold,
                 /* .n_threads    = */ nt,
+                /* .n_threads_batch = */ ntb,
                 /* .cpu_mask     = */ cm,
                 /* .cpu_strict   = */ cs,
                 /* .poll         = */ pl,
@@ -1072,6 +1089,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_v       = */ tv,
                 /* .defrag_thold = */ defrag_thold,
                 /* .n_threads    = */ nt,
+                /* .n_threads_batch = */ ntb,
                 /* .cpu_mask     = */ cm,
                 /* .cpu_strict   = */ cs,
                 /* .poll         = */ pl,
@@ -1105,6 +1123,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_v       = */ tv,
                 /* .defrag_thold = */ defrag_thold,
                 /* .n_threads    = */ nt,
+                /* .n_threads_batch = */ ntb,
                 /* .cpu_mask     = */ cm,
                 /* .cpu_strict   = */ cs,
                 /* .poll         = */ pl,
@@ -1140,6 +1159,7 @@ struct test {
     int                      n_batch;
     int                      n_ubatch;
     int                      n_threads;
+    int                      n_threads_batch;
     std::string              cpu_mask;
     bool                     cpu_strict;
     int                      poll;
@@ -1161,6 +1181,8 @@ struct test {
     int                      n_depth;
     std::string              test_time;
     std::vector<uint64_t>    samples_ns;
+    std::vector<uint64_t>    samples_prompt_ns;
+    std::vector<uint64_t>    samples_gen_ns;
 
     test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) :
         cpu_info(get_cpu_info()),
@@ -1175,6 +1197,7 @@ struct test {
         n_batch        = inst.n_batch;
         n_ubatch       = inst.n_ubatch;
         n_threads      = inst.n_threads;
+        n_threads_batch = inst.n_threads_batch;
         cpu_mask       = inst.cpu_mask;
         cpu_strict     = inst.cpu_strict;
         poll           = inst.poll;
@@ -1206,6 +1229,14 @@ struct test {
 
     uint64_t stdev_ns() const { return ::stdev(samples_ns); }
 
+    uint64_t avg_prompt_ns() const { return samples_prompt_ns.empty() ? 0 : ::avg(samples_prompt_ns); }
+
+    uint64_t stdev_prompt_ns() const { return samples_prompt_ns.empty() ? 0 : ::stdev(samples_prompt_ns); }
+
+    uint64_t avg_gen_ns() const { return samples_gen_ns.empty() ? 0 : ::avg(samples_gen_ns); }
+
+    uint64_t stdev_gen_ns() const { return samples_gen_ns.empty() ? 0 : ::stdev(samples_gen_ns); }
+
     std::vector<double> get_ts() const {
         int                 n_tokens = n_prompt + n_gen;
         std::vector<double> ts;
@@ -1214,10 +1245,46 @@ struct test {
         return ts;
     }
 
+    std::vector<double> get_prompt_ts() const {
+        if (samples_prompt_ns.empty() || n_prompt == 0) return {};
+        std::vector<double> ts;
+        std::transform(samples_prompt_ns.begin(), samples_prompt_ns.end(), std::back_inserter(ts),
+                       [this](uint64_t t) { return 1e9 * n_prompt / t; });
+        return ts;
+    }
+
+    std::vector<double> get_gen_ts() const {
+        if (samples_gen_ns.empty() || n_gen == 0) return {};
+        std::vector<double> ts;
+        std::transform(samples_gen_ns.begin(), samples_gen_ns.end(), std::back_inserter(ts),
+                       [this](uint64_t t) { return 1e9 * n_gen / t; });
+        return ts;
+    }
+
     double avg_ts() const { return ::avg(get_ts()); }
 
     double stdev_ts() const { return ::stdev(get_ts()); }
 
+    double avg_prompt_ts() const {
+        auto ts = get_prompt_ts();
+        return ts.empty() ? 0.0 : ::avg(ts);
+    }
+
+    double stdev_prompt_ts() const {
+        auto ts = get_prompt_ts();
+        return ts.empty() ? 0.0 : ::stdev(ts);
+    }
+
+    double avg_gen_ts() const {
+        auto ts = get_gen_ts();
+        return ts.empty() ? 0.0 : ::avg(ts);
+    }
+
+    double stdev_gen_ts() const {
+        auto ts = get_gen_ts();
+        return ts.empty() ? 0.0 : ::stdev(ts);
+    }
+
     static std::string get_backend() {
         std::vector<std::string> backends;
         for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
@@ -1234,11 +1301,13 @@ struct test {
         static const std::vector<std::string> fields = {
             "build_commit", "build_number", "cpu_info",       "gpu_info",   "backends",     "model_filename",
             "model_type",   "model_size",   "model_n_params", "n_batch",    "n_ubatch",     "n_threads",
-            "cpu_mask",     "cpu_strict",   "poll",           "type_k",     "type_v",       "n_gpu_layers",
+            "n_threads_batch", "cpu_mask",     "cpu_strict",   "poll",           "type_k",     "type_v",       "n_gpu_layers",
             "split_mode",   "main_gpu",     "no_kv_offload",  "flash_attn", "tensor_split", "tensor_buft_overrides",
             "defrag_thold",
             "use_mmap",     "embeddings",   "no_op_offload",   "n_prompt",       "n_gen",      "n_depth",      "test_time",
             "avg_ns",       "stddev_ns",    "avg_ts",         "stddev_ts",
+            "avg_prompt_ns", "stddev_prompt_ns", "avg_prompt_ts", "stddev_prompt_ts",
+            "avg_gen_ns",    "stddev_gen_ns",    "avg_gen_ts",    "stddev_gen_ts",
         };
         return fields;
     }
@@ -1247,16 +1316,18 @@ struct test {
 
     static field_type get_field_type(const std::string & field) {
         if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
-            field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
+            field == "n_threads_batch" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
             field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" ||
-            field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") {
+            field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload" ||
+            field == "avg_prompt_ns" || field == "stddev_prompt_ns" || field == "avg_gen_ns" || field == "stddev_gen_ns") {
             return INT;
         }
         if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
             field == "use_mmap" || field == "embeddings") {
             return BOOL;
         }
-        if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") {
+        if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold" ||
+            field == "avg_prompt_ts" || field == "stddev_prompt_ts" || field == "avg_gen_ts" || field == "stddev_gen_ts") {
             return FLOAT;
         }
         return STRING;
@@ -1311,6 +1382,7 @@ struct test {
                                             std::to_string(n_batch),
                                             std::to_string(n_ubatch),
                                             std::to_string(n_threads),
+                                            std::to_string(n_threads_batch),
                                             cpu_mask,
                                             std::to_string(cpu_strict),
                                             std::to_string(poll),
@@ -1334,7 +1406,15 @@ struct test {
                                             std::to_string(avg_ns()),
                                             std::to_string(stdev_ns()),
                                             std::to_string(avg_ts()),
-                                            std::to_string(stdev_ts()) };
+                                            std::to_string(stdev_ts()),
+                                            std::to_string(avg_prompt_ns()),
+                                            std::to_string(stdev_prompt_ns()),
+                                            std::to_string(avg_prompt_ts()),
+                                            std::to_string(stdev_prompt_ts()),
+                                            std::to_string(avg_gen_ns()),
+                                            std::to_string(stdev_gen_ns()),
+                                            std::to_string(avg_gen_ts()),
+                                            std::to_string(stdev_gen_ts()) };
         return values;
     }
 
@@ -1476,7 +1556,7 @@ struct markdown_printer : public printer {
         if (field == "model") {
             return -30;
         }
-        if (field == "t/s") {
+        if (field == "t/s" || field == "pp t/s" || field == "tg t/s") {
             return 20;
         }
         if (field == "size" || field == "params") {
@@ -1488,6 +1568,9 @@ struct markdown_printer : public printer {
         if (field == "n_threads") {
             return 7;
         }
+        if (field == "n_threads_batch") {
+            return 8;
+        }
         if (field == "n_batch") {
             return 7;
         }
@@ -1531,6 +1614,9 @@ struct markdown_printer : public printer {
         if (field == "n_threads") {
             return "threads";
         }
+        if (field == "n_threads_batch") {
+            return "th_batch";
+        }
         if (field == "no_kv_offload") {
             return "nkvo";
         }
@@ -1569,6 +1655,9 @@ struct markdown_printer : public printer {
         if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
             fields.emplace_back("n_threads");
         }
+        if (params.n_threads_batch.size() > 1 || params.n_threads_batch != cmd_params_defaults.n_threads_batch || is_cpu_backend) {
+            fields.emplace_back("n_threads_batch");
+        }
         if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
             fields.emplace_back("cpu_mask");
         }
@@ -1622,6 +1711,8 @@ struct markdown_printer : public printer {
         }
         fields.emplace_back("test");
         fields.emplace_back("t/s");
+        fields.emplace_back("pp t/s");
+        fields.emplace_back("tg t/s");
 
         fprintf(fout, "|");
         for (const auto & field : fields) {
@@ -1677,6 +1768,20 @@ struct markdown_printer : public printer {
             } else if (field == "t/s") {
                 snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
                 value = buf;
+            } else if (field == "pp t/s") {
+                if (t.n_prompt > 0) {
+                    snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_prompt_ts(), t.stdev_prompt_ts());
+                } else {
+                    snprintf(buf, sizeof(buf), "N/A");
+                }
+                value = buf;
+            } else if (field == "tg t/s") {
+                if (t.n_gen > 0) {
+                    snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_gen_ts(), t.stdev_gen_ts());
+                } else {
+                    snprintf(buf, sizeof(buf), "N/A");
+                }
+                value = buf;
             } else if (vmap.find(field) != vmap.end()) {
                 value = vmap.at(field);
             } else {
@@ -1685,7 +1790,7 @@ struct markdown_printer : public printer {
             }
 
             int width = get_field_width(field);
-            if (field == "t/s") {
+            if (field == "t/s" || field == "pp t/s" || field == "tg t/s") {
                 // HACK: the utf-8 character is 2 bytes
                 width += 1;
             }
@@ -1738,8 +1843,8 @@ struct sql_printer : public printer {
     }
 };
 
-static bool test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
-    llama_set_n_threads(ctx, n_threads, n_threads);
+static bool test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads, int n_threads_batch) {
+    llama_set_n_threads(ctx, n_threads, n_threads_batch);
 
     const llama_model * model   = llama_get_model(ctx);
     const llama_vocab * vocab   = llama_model_get_vocab(model);
@@ -1767,8 +1872,8 @@ static bool test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th
     return true;
 }
 
-static bool test_gen(llama_context * ctx, int n_gen, int n_threads) {
-    llama_set_n_threads(ctx, n_threads, n_threads);
+static bool test_gen(llama_context * ctx, int n_gen, int n_threads, int n_threads_batch) {
+    llama_set_n_threads(ctx, n_threads, n_threads_batch);
 
     const llama_model * model   = llama_get_model(ctx);
     const llama_vocab * vocab   = llama_model_get_vocab(model);
@@ -1930,7 +2035,7 @@ int main(int argc, char ** argv) {
                 fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
             }
             //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
-            bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
+            bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads, t.n_threads_batch);
             if (!res) {
                 fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__);
                 exit(1);
@@ -1940,7 +2045,7 @@ int main(int argc, char ** argv) {
             if (params.progress) {
                 fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
             }
-            bool res = test_gen(ctx, 1, t.n_threads);
+            bool res = test_gen(ctx, 1, t.n_threads, t.n_threads_batch);
             if (!res) {
                 fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__);
                 exit(1);
@@ -1955,7 +2060,7 @@ int main(int argc, char ** argv) {
                     fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
                             i + 1, params.reps);
                 }
-                bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
+                bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads, t.n_threads_batch);
                 if (!res) {
                     fprintf(stderr, "%s: error: failed to run depth\n", __func__);
                     exit(1);
@@ -1963,32 +2068,46 @@ int main(int argc, char ** argv) {
             }
 
             uint64_t t_start = get_time_ns();
+            uint64_t t_prompt_ns = 0;
+            uint64_t t_gen_ns = 0;
 
             if (t.n_prompt > 0) {
                 if (params.progress) {
                     fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
                             i + 1, params.reps);
                 }
-                bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
+                uint64_t t_prompt_start = get_time_ns();
+                bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads, t.n_threads_batch);
                 if (!res) {
                     fprintf(stderr, "%s: error: failed to run prompt\n", __func__);
                     exit(1);
                 }
+                t_prompt_ns = get_time_ns() - t_prompt_start;
             }
             if (t.n_gen > 0) {
                 if (params.progress) {
                     fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
                             i + 1, params.reps);
                 }
-                bool res = test_gen(ctx, t.n_gen, t.n_threads);
+                uint64_t t_gen_start = get_time_ns();
+                bool res = test_gen(ctx, t.n_gen, t.n_threads, t.n_threads_batch);
                 if (!res) {
                     fprintf(stderr, "%s: error: failed to run gen\n", __func__);
                     exit(1);
                 }
+                t_gen_ns = get_time_ns() - t_gen_start;
             }
 
             uint64_t t_ns = get_time_ns() - t_start;
             t.samples_ns.push_back(t_ns);
+
+            // Store separate timings
+            if (t.n_prompt > 0) {
+                t.samples_prompt_ns.push_back(t_prompt_ns);
+            }
+            if (t.n_gen > 0) {
+                t.samples_gen_ns.push_back(t_gen_ns);
+            }
         }
 
         if (p) {
@@ -2022,3 +2141,4 @@ int main(int argc, char ** argv) {
 
     return 0;
 }
+