Skip to content

Commit 1f02571

Browse files
committed
feature: add mps kernel
1 parent 7a99b3b commit 1f02571

File tree

3 files changed

+157
-0
lines changed

3 files changed

+157
-0
lines changed
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#include <metal_stdlib>
using namespace metal;

// Backward recursion computing VTrace-style clipped-importance advantages
// (the PuffeRL variant of GAE). Each thread owns one trajectory row of
// length `horizon`; rows are independent, so the grid is 1-D over rows.
kernel void puff_advantage_kernel(
    device const float* values [[buffer(0)]],
    device const float* rewards [[buffer(1)]],
    device const float* dones [[buffer(2)]],
    device const float* importance [[buffer(3)]],
    device float* advantages [[buffer(4)]],
    constant float& gamma [[buffer(5)]],
    constant float& lambda [[buffer(6)]],
    constant float& rho_clip [[buffer(7)]],
    constant float& c_clip [[buffer(8)]],
    constant int& horizon [[buffer(9)]],
    uint row [[thread_position_in_grid]])
{
    // Base offset of this thread's row inside the flat (rows, horizon) buffers.
    int base = row * horizon;
    device const float* v = values + base;
    device const float* r = rewards + base;
    device const float* d = dones + base;
    device const float* w = importance + base;
    device float* adv = advantages + base;

    float gamma_lambda = gamma * lambda;

    // Running accumulator of the lambda-discounted TD residuals.
    float carry = 0.0f;

    // Sweep backward from the second-to-last step.
    // NOTE(review): adv[horizon - 1] is never written here -- presumably the
    // caller pre-initializes the last column; confirm against the host code.
    for (int t = horizon - 2; t >= 0; t--) {
        int t_next = t + 1;

        // Clipped importance weights (VTrace rho / c truncation).
        float rho_t = fmin(w[t], rho_clip);
        float c_t = fmin(w[t], c_clip);

        // Zero out bootstrapping and the carried sum across episode ends.
        float live = 1.0f - d[t_next];

        float delta = rho_t * (r[t_next] + gamma * v[t_next] * live - v[t]);
        carry = delta + gamma_lambda * c_t * carry * live;
        adv[t] = carry;
    }
}
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
#import <Metal/Metal.h>
2+
#import <Foundation/Foundation.h>
3+
#include <torch/extension.h>
4+
5+
namespace pufferlib {
6+
7+
// Reinterprets a PyTorch MPS tensor's raw storage pointer as the id<MTLBuffer>
// that backs it, so it can be bound to a compute encoder. bit_cast avoids an
// ARC ownership transfer that a plain (__bridge) C-style cast would require.
// NOTE(review): assumes the tensor lives on the MPS device (storage data is an
// MTLBuffer there) -- callers must TORCH_CHECK that before calling.
static inline id<MTLBuffer> getMTLBufferStorage(const torch::Tensor& tensor) {
    return __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
}
10+
11+
// Launches the Metal `puff_advantage_kernel` over every trajectory row.
//
// All five tensors must be (num_steps, horizon) float32, contiguous, and
// resident on the MPS device; `advantages` is written in place.
// gamma/lambda are the discount and GAE smoothing factors; rho_clip/c_clip
// are the VTrace-style importance-weight clips.
void compute_puff_advantage_mps(torch::Tensor values, torch::Tensor rewards,
        torch::Tensor dones, torch::Tensor importance, torch::Tensor advantages,
        double gamma, double lambda, double rho_clip, double c_clip) {

    @autoreleasepool {
        TORCH_CHECK(values.device().is_mps(), "All tensors must be on MPS device");
        TORCH_CHECK(values.is_contiguous(), "values must be contiguous");
        TORCH_CHECK(rewards.is_contiguous(), "rewards must be contiguous");
        TORCH_CHECK(dones.is_contiguous(), "dones must be contiguous");
        TORCH_CHECK(importance.is_contiguous(), "importance must be contiguous");
        TORCH_CHECK(advantages.is_contiguous(), "advantages must be contiguous");
        TORCH_CHECK(values.scalar_type() == torch::kFloat32, "All tensors must be float32");
        // The kernel indexes every buffer with the same (row, t) layout, so a
        // shape mismatch would silently read or write out of bounds.
        TORCH_CHECK(rewards.sizes() == values.sizes()
            && dones.sizes() == values.sizes()
            && importance.sizes() == values.sizes()
            && advantages.sizes() == values.sizes(),
            "All tensors must have the same shape");

        int num_steps = values.size(0);
        int horizon = values.size(1);
        if (num_steps == 0) {
            return;  // nothing to do; zero-sized dispatches are invalid in Metal
        }

        // Compile the shader once and cache the pipeline; saves ~0.1ms per call.
        static id<MTLComputePipelineState> pipelineState = nil;

        if (pipelineState == nil) {
            // The device is only needed to compile the kernel, so create it
            // inside the one-time branch instead of on every invocation.
            id<MTLDevice> device = MTLCreateSystemDefaultDevice();
            TORCH_CHECK(device, "Failed to create Metal device");
            NSError* error = nil;

            // read the file & compile the shader
            NSString* sourcePath = [[@(__FILE__) stringByDeletingLastPathComponent]
                stringByAppendingPathComponent:@"pufferlib.metal"];
            NSString* source = [NSString stringWithContentsOfFile:sourcePath
                encoding:NSUTF8StringEncoding error:&error];
            TORCH_CHECK(source, "Failed to read Metal source file: ",
                error ? [[error localizedDescription] UTF8String] : "unknown error");

            id<MTLLibrary> library = [device newLibraryWithSource:source options:nil error:&error];
            TORCH_CHECK(library, "Failed to compile Metal library: ",
                [[error localizedDescription] UTF8String]);

            id<MTLFunction> function = [library newFunctionWithName:@"puff_advantage_kernel"];
            TORCH_CHECK(function, "Failed to find puff_advantage_kernel function");

            pipelineState = [device newComputePipelineStateWithFunction:function error:&error];
            TORCH_CHECK(pipelineState, "Failed to create compute pipeline: ",
                [[error localizedDescription] UTF8String]);
        }

        id<MTLCommandBuffer> commandBuffer = torch::mps::get_command_buffer();
        TORCH_CHECK(commandBuffer, "Failed to retrieve command buffer reference");

        // Encode on PyTorch's serial MPS queue so our work interleaves safely
        // with the ops PyTorch has already queued on this command buffer.
        dispatch_queue_t serialQueue = torch::mps::get_dispatch_queue();

        dispatch_sync(serialQueue, ^{
            id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
            TORCH_CHECK(encoder, "Failed to create compute command encoder");

            [encoder setComputePipelineState:pipelineState];
            // storage_offset handles tensors viewing into a larger allocation.
            [encoder setBuffer:getMTLBufferStorage(values)
                offset:values.storage_offset() * values.element_size() atIndex:0];
            [encoder setBuffer:getMTLBufferStorage(rewards)
                offset:rewards.storage_offset() * rewards.element_size() atIndex:1];
            [encoder setBuffer:getMTLBufferStorage(dones)
                offset:dones.storage_offset() * dones.element_size() atIndex:2];
            [encoder setBuffer:getMTLBufferStorage(importance)
                offset:importance.storage_offset() * importance.element_size() atIndex:3];
            [encoder setBuffer:getMTLBufferStorage(advantages)
                offset:advantages.storage_offset() * advantages.element_size() atIndex:4];

            // The dispatcher hands us doubles; the shader takes float/int,
            // so narrow before binding as inline constants.
            float gamma_f = gamma, lambda_f = lambda, rho_clip_f = rho_clip, c_clip_f = c_clip;
            int horizon_i = horizon;

            [encoder setBytes:&gamma_f length:sizeof(float) atIndex:5];
            [encoder setBytes:&lambda_f length:sizeof(float) atIndex:6];
            [encoder setBytes:&rho_clip_f length:sizeof(float) atIndex:7];
            [encoder setBytes:&c_clip_f length:sizeof(float) atIndex:8];
            [encoder setBytes:&horizon_i length:sizeof(int) atIndex:9];

            // One thread per trajectory row.
            MTLSize gridSize = MTLSizeMake(num_steps, 1, 1);

            NSUInteger threadGroupSize = pipelineState.maxTotalThreadsPerThreadgroup;
            // Explicit cast avoids a signed/unsigned comparison between
            // NSUInteger and int.
            if (threadGroupSize > (NSUInteger)num_steps) {
                threadGroupSize = (NSUInteger)num_steps;
            }
            MTLSize threadgroupSize = MTLSizeMake(threadGroupSize, 1, 1);

            [encoder dispatchThreads:gridSize threadsPerThreadgroup:threadgroupSize];
            [encoder endEncoding];

            torch::mps::commit();
        });
    }
}
101+
102+
// Register the MPS backend implementation for the pufferlib namespace:
// calls to pufferlib::compute_puff_advantage on MPS tensors dispatch here.
// NOTE(review): the op schema itself is presumably declared (TORCH_LIBRARY)
// in another translation unit -- not visible in this file.
TORCH_LIBRARY_IMPL(pufferlib, MPS, m) {
    m.impl("compute_puff_advantage", &compute_puff_advantage_mps);
}
105+
106+
}

setup.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,15 @@
2121
CUDA_HOME,
2222
ROCM_HOME
2323
)
24+
from torch.backends import mps
2425

2526
# build cuda extension if torch can find CUDA or HIP/ROCM in the system
2627
# may require `uv pip install --no-build-isolation` or `python setup.py build_ext --inplace`
2728
BUID_CUDA_EXT = bool(CUDA_HOME or ROCM_HOME)
2829

30+
# build mps extension if torch can find MPS in the system
31+
BUILD_MPS_EXT = bool(mps.is_available())
32+
2933
# Build with DEBUG=1 to enable debug symbols
3034
DEBUG = os.getenv("DEBUG", "0") == "1"
3135
NO_OCEAN = os.getenv("NO_OCEAN", "0") == "1"
@@ -243,6 +247,9 @@ def run(self):
243247
if BUID_CUDA_EXT:
244248
extension = CUDAExtension
245249
torch_sources.append("pufferlib/extensions/cuda/pufferlib.cu")
250+
elif BUILD_MPS_EXT:
251+
extension = CppExtension
252+
torch_sources.append("pufferlib/extensions/mps/pufferlib.mm")
246253
else:
247254
extension = CppExtension
248255

0 commit comments

Comments
 (0)