|
| 1 | +# TouchDesigner CUDA Integration Architecture |
| 2 | + |
| 3 | +## Overview |
| 4 | + |
| 5 | +This document outlines the architecture for adding CUDA support to the td-rs Rust framework for TouchDesigner plugins, based on analysis of the existing C++ CudaTOP sample and the cudarc Rust CUDA library. |
| 6 | + |
| 7 | +## TouchDesigner CUDA Integration Pattern |
| 8 | + |
| 9 | +### Core Execution Flow |
| 10 | + |
| 11 | +1. **Plugin Declaration**: Plugin must declare `TOP_ExecuteMode::CUDA` in plugin info |
| 12 | +2. **Resource Acquisition**: Get `OP_CUDAArrayInfo*` from TouchDesigner before CUDA operations |
| 13 | +3. **Critical Sequence**: |
| 14 | + ``` |
| 15 | + createCUDAArray() → beginCUDAOperations() → kernel work → endCUDAOperations() |
| 16 | + ``` |
| 17 | +4. **Memory Management**: TouchDesigner owns the `cudaArray*`; the plugin only creates surface objects over it and must never free the array itself |
| 18 | + |
| 19 | +### Key TouchDesigner Types |
| 20 | + |
| 21 | +```cpp |
| 22 | +// Resource info - initially cudaArray is nullptr |
| 23 | +class OP_CUDAArrayInfo { |
| 24 | + OP_TextureDesc textureDesc; // Resolution, format, dimension |
| 25 | + cudaArray* cudaArray = nullptr; // Filled by beginCUDAOperations() |
| 26 | +}; |
| 27 | + |
| 28 | +// Stream specification for async operations |
| 29 | +class OP_CUDAAcquireInfo { |
| 30 | + cudaStream_t stream; |
| 31 | +}; |
| 32 | + |
| 33 | +// Output specification |
| 34 | +class TOP_CUDAOutputInfo { |
| 35 | + cudaStream_t stream; |
| 36 | + OP_TextureDesc textureDesc; |
| 37 | + uint32_t colorBufferIndex = 0; // Multi-output support |
| 38 | +}; |
| 39 | +``` |
| 40 | + |
| 41 | +### Surface Object Pattern |
| 42 | + |
| 43 | +TouchDesigner uses surface objects for efficient texture access in kernels: |
| 44 | + |
| 45 | +```cpp |
| 46 | +// Dynamic surface management - reuse if array matches |
| 47 | +static void setupCudaSurface(cudaSurfaceObject_t* surface, cudaArray_t array) { |
| 48 | + if (*surface) { |
| 49 | + cudaResourceDesc desc; |
| 50 | + cudaGetSurfaceObjectResourceDesc(&desc, *surface); |
| 51 | + if (desc.res.array.array != array) { |
| 52 | + cudaDestroySurfaceObject(*surface); |
| 53 | + *surface = 0; |
| 54 | + } |
| 55 | + } |
| 56 | + |
| 57 | + if (!*surface) { |
| 58 | + cudaResourceDesc desc; |
| 59 | + desc.resType = cudaResourceTypeArray; |
| 60 | + desc.res.array.array = array; |
| 61 | + cudaCreateSurfaceObject(surface, &desc); |
| 62 | + } |
| 63 | +} |
| 64 | +``` |
| 65 | + |
| 66 | +## cudarc API Analysis |
| 67 | + |
| 68 | +### Available APIs ✅ |
| 69 | + |
| 70 | +cudarc provides complete FFI bindings for surface objects: |
| 71 | + |
| 72 | +```rust |
| 73 | +// Core types |
| 74 | +pub type cudaSurfaceObject_t = ::core::ffi::c_ulonglong; |
| 75 | + |
| 76 | +// Functions |
| 77 | +pub unsafe fn cudaCreateSurfaceObject( |
| 78 | + pSurfObject: *mut cudaSurfaceObject_t, |
| 79 | + pResDesc: *const cudaResourceDesc, |
| 80 | +) -> cudaError_t; |
| 81 | + |
| 82 | +pub unsafe fn cudaDestroySurfaceObject( |
| 83 | + surfObject: cudaSurfaceObject_t |
| 84 | +) -> cudaError_t; |
| 85 | + |
| 86 | +// Resource management |
| 87 | +pub struct cudaResourceDesc { /* ... */ }; |
| 88 | +``` |
| 89 | + |
| 90 | +### Missing Safe Abstractions ❌ |
| 91 | + |
| 92 | +- No high-level surface-object wrappers in the `driver::safe` module |
| 93 | +- No integration of surface objects with `CudaStream` or `CudaContext` |
| 94 | +- Memory and lifecycle management of surface objects must be done manually |
| 95 | + |
| 96 | +## Rust Implementation Strategy |
| 97 | + |
| 98 | +### 1. Safe Surface Object Wrapper |
| 99 | + |
| 100 | +Create a safe abstraction over surface objects: |
| 101 | + |
| 102 | +```rust |
| 103 | +use cudarc::runtime::sys; |
| 104 | + |
| 105 | +pub struct CudaSurface { |
| 106 | + surface: sys::cudaSurfaceObject_t, |
| 107 | + _ctx: Arc<CudaContext>, |
| 108 | +} |
| 109 | + |
| 110 | +impl CudaSurface { |
| 111 | + /// Create surface from external cudaArray* (TouchDesigner-owned) |
| 112 | + pub unsafe fn from_external_array( |
| 113 | + ctx: Arc<CudaContext>, |
| 114 | + array: *mut sys::cudaArray |
| 115 | + ) -> Result<Self, CudaError> { |
| 116 | + let mut surface = 0; |
| 117 | + let mut desc = sys::cudaResourceDesc { |
| 118 | + resType: sys::cudaResourceType::cudaResourceTypeArray, |
| 119 | + res: sys::cudaResourceDesc__bindgen_ty_1 { |
| 120 | + array: sys::cudaResourceDesc__bindgen_ty_1__bindgen_ty_1 { |
| 121 | + array, |
| 122 | + } |
| 123 | + }, |
| 124 | + }; |
| 125 | + |
| 126 | + sys::cudaCreateSurfaceObject(&mut surface, &desc)?; |
| 127 | + Ok(CudaSurface { surface, _ctx: ctx }) |
| 128 | + } |
| 129 | + |
| 130 | + pub fn handle(&self) -> sys::cudaSurfaceObject_t { |
| 131 | + self.surface |
| 132 | + } |
| 133 | +} |
| 134 | + |
| 135 | +impl Drop for CudaSurface { |
| 136 | + fn drop(&mut self) { |
| 137 | + unsafe { sys::cudaDestroySurfaceObject(self.surface); } |
| 138 | + } |
| 139 | +} |
| 140 | +``` |
| 141 | + |
| 142 | +### 2. Surface Cache for Performance |
| 143 | + |
| 144 | +Implement dynamic surface reuse pattern: |
| 145 | + |
| 146 | +```rust |
| 147 | +pub struct SurfaceCache { |
| 148 | + surfaces: HashMap<*mut sys::cudaArray, CudaSurface>, |
| 149 | + ctx: Arc<CudaContext>, |
| 150 | +} |
| 151 | + |
| 152 | +impl SurfaceCache { |
| 153 | + pub fn get_or_create(&mut self, array: *mut sys::cudaArray) -> Result<&CudaSurface, CudaError> { |
| 154 | + if !self.surfaces.contains_key(&array) { |
| 155 | + let surface = unsafe { CudaSurface::from_external_array(self.ctx.clone(), array)? }; |
| 156 | + self.surfaces.insert(array, surface); |
| 157 | + } |
| 158 | + Ok(self.surfaces.get(&array).unwrap()) |
| 159 | + } |
| 160 | + |
| 161 | + pub fn cleanup_invalid(&mut self, valid_arrays: &[*mut sys::cudaArray]) { |
| 162 | + self.surfaces.retain(|&k, _| valid_arrays.contains(&k)); |
| 163 | + } |
| 164 | +} |
| 165 | +``` |
| 166 | + |
| 167 | +### 3. CUDA TOP Trait Extension |
| 168 | + |
| 169 | +Extend the TOP trait to support CUDA execution: |
| 170 | + |
| 171 | +```rust |
| 172 | +pub trait CudaTop: Top { |
| 173 | + /// Execute CUDA kernel operations |
| 174 | + fn execute_cuda( |
| 175 | + &mut self, |
| 176 | + output: &CudaTopOutput, |
| 177 | + inputs: &CudaTopInputs, |
| 178 | + params: &Self::Params, |
| 179 | + ) -> Result<(), CudaError>; |
| 180 | + |
| 181 | + /// CUDA stream this plugin requires; the default `None` means the context's default stream is used |
| 182 | + fn cuda_stream(&self) -> Option<&CudaStream> { None } |
| 183 | +} |
| 184 | + |
| 185 | +pub struct CudaTopOutput { |
| 186 | + pub primary: CudaSurface, |
| 187 | + pub auxiliary: Vec<CudaSurface>, |
| 188 | + pub stream: Arc<CudaStream>, |
| 189 | +} |
| 190 | + |
| 191 | +pub struct CudaTopInputs { |
| 192 | + pub inputs: Vec<Option<CudaSurface>>, |
| 193 | + pub stream: Arc<CudaStream>, |
| 194 | +} |
| 195 | +``` |
| 196 | + |
| 197 | +### 4. Integration with td-rs Framework |
| 198 | + |
| 199 | +Modify the existing TOP infrastructure: |
| 200 | + |
| 201 | +```rust |
| 202 | +// In td-rs-top/src/lib.rs |
| 203 | +impl OpInfo for MyCudaPlugin { |
| 204 | + fn op_type(&self) -> &'static str { "Mycudasample" } |
| 205 | + fn execute_mode(&self) -> TopExecuteMode { |
| 206 | + TopExecuteMode::CUDA // ← Declares CUDA capability |
| 207 | + } |
| 208 | +} |
| 209 | + |
| 210 | +impl Top for MyCudaPlugin { |
| 211 | + fn execute(&mut self, output: &TopOutput, inputs: &OpInputs) -> Result<(), OpError> { |
| 212 | + // Bridge to CUDA execution |
| 213 | + let cuda_output = self.setup_cuda_output(output)?; |
| 214 | + let cuda_inputs = self.setup_cuda_inputs(inputs)?; |
| 215 | + self.execute_cuda(&cuda_output, &cuda_inputs, &self.params) |
| 216 | + } |
| 217 | +} |
| 218 | + |
| 219 | +impl CudaTop for MyCudaPlugin { |
| 220 | + fn execute_cuda( |
| 221 | + &mut self, |
| 222 | + output: &CudaTopOutput, |
| 223 | + inputs: &CudaTopInputs, |
| 224 | + params: &Self::Params, |
| 225 | + ) -> Result<(), CudaError> { |
| 226 | + // Launch kernels using cudarc |
| 227 | + let config = LaunchConfig::for_num_elems(width * height); |
| 228 | + unsafe { |
| 229 | + my_kernel_launch( |
| 230 | + output.primary.handle(), |
| 231 | + inputs.inputs[0].as_ref().map(|s| s.handle()).unwrap_or(0), |
| 232 | + width, height, |
| 233 | + &output.stream, |
| 234 | + config |
| 235 | + )?; |
| 236 | + } |
| 237 | + Ok(()) |
| 238 | + } |
| 239 | +} |
| 240 | +``` |
| 241 | + |
| 242 | +### 5. Build System Integration |
| 243 | + |
| 244 | +Add CUDA compilation support to td-rs build system: |
| 245 | + |
| 246 | +```toml |
| 247 | +# Plugin Cargo.toml |
| 248 | +[dependencies] |
| 249 | +cudarc = { path = "../../vendor/cudarc", features = ["runtime"] } |
| 250 | +td-rs-top = { path = "../../td-rs-top", features = ["cuda"] } |
| 251 | + |
| 252 | +[package.metadata.td-rs] |
| 253 | +type = "top" |
| 254 | +cuda = true # Enable CUDA compilation |
| 255 | + |
| 256 | +[[package.metadata.td-rs.kernels]] |
| 257 | +source = "src/kernels.cu" |
| 258 | +include_dirs = ["src/include"] |
| 259 | +``` |
| 260 | + |
| 261 | +## Implementation Challenges & Solutions |
| 262 | + |
| 263 | +### 1. External Memory Safety |
| 264 | +**Challenge**: TouchDesigner owns `cudaArray*`, Rust must not drop it |
| 265 | +**Solution**: Wrap the array with `CudaSurface::from_external_array()`; dropping a `CudaSurface` destroys only the surface object, never the TouchDesigner-owned array |
| 266 | + |
| 267 | +### 2. Synchronization |
| 268 | +**Challenge**: Coordinate beginCUDAOperations/endCUDAOperations lifecycle |
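| 269 | +**Solution**: Make the bridge layer responsible for the `beginCUDAOperations()`/`endCUDAOperations()` pair, calling it around `execute_cuda()` as part of the TOP trait execution flow |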
| 270 | + |
| 271 | +### 3. Multi-Stream Coordination |
| 272 | +**Challenge**: TouchDesigner provides specific streams for async operations |
| 273 | +**Solution**: Accept external streams in `CudaTopOutput`, use cudarc stream sync primitives |
| 274 | + |
| 275 | +### 4. Error Propagation |
| 276 | +**Challenge**: Map CUDA errors to TouchDesigner error system |
| 277 | +**Solution**: Convert `CudaError` to `OpError` in bridge layer |
| 278 | + |
| 279 | +### 5. Dimension Support |
| 280 | +**Challenge**: Support 2D/3D/Cube/Array texture types |
| 281 | +**Solution**: Pass `OP_TexDim` to kernels, dispatch appropriate kernel variants |
| 282 | + |
| 283 | +## Performance Considerations |
| 284 | + |
| 285 | +1. **Surface Reuse**: Cache surface objects to avoid recreation overhead |
| 286 | +2. **Stream Management**: Use TouchDesigner-provided streams for optimal sync |
| 287 | +3. **Kernel Compilation**: Pre-compile kernels at build time when possible |
| 288 | +4. **Memory Access**: Prefer surface objects over direct array access for texture ops |
| 289 | + |
| 290 | +## Future Extensions |
| 291 | + |
| 292 | +1. **Graph API**: Support CUDA graphs for complex operation chains |
| 293 | +2. **Texture Arrays**: Enhanced support for layered textures |
| 294 | +3. **Interop**: OpenGL-CUDA interop for mixed rendering |
| 295 | +4. **Compute Shaders**: Alternative compute backends for portability |
| 296 | + |
| 297 | +This architecture provides a safe, efficient foundation for CUDA integration while maintaining compatibility with TouchDesigner's execution model and the existing td-rs framework patterns. |
0 commit comments