diff --git a/slangpy/tests/slangpy_tests/test_neural_bindless.py b/slangpy/tests/slangpy_tests/test_neural_bindless.py new file mode 100644 index 00000000..a6b5a29d --- /dev/null +++ b/slangpy/tests/slangpy_tests/test_neural_bindless.py @@ -0,0 +1,157 @@ +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +""" +Neural integration tests for bindless resource types. + +Reviewer-requested coverage: +- Bindless "pointer type" (raw pointer parameters passed via Buffer.device_address) +- Bindless DescriptorHandle resources (StructuredBuffer.Handle / RWStructuredBuffer.Handle) +""" + +from __future__ import annotations + +from pathlib import Path + +import numpy as np +import pytest + +import slangpy as spy +from slangpy.core.calldata import SLANG_PATH +from slangpy.testing import helpers + + +def _get_device_with_native_neural(device_type: spy.DeviceType) -> spy.Device: + if helpers.should_skip_test_for_device(device_type): + pytest.skip(f"Device type {device_type.name} not selected for this test run") + + test_dir = Path(__file__).resolve().parent + compiler_options = spy.SlangCompilerOptions( + { + "include_paths": [test_dir, SLANG_PATH], + "debug_info": spy.SlangDebugInfoLevel.standard, + "enable_experimental_features": True, + } + ) + + return spy.Device( + type=device_type, + enable_debug_layers=True, + compiler_options=compiler_options, + label=f"uncached-slangpy-neural-bindless-{device_type.name}", + ) + + +# Pointer-style bindless params are supported on Vulkan. Keep this test on Vulkan only +# to avoid backend-specific CUDA toolchain requirements for this integration test. +POINTER_DEVICE_TYPES: list[spy.DeviceType] = [ + x for x in helpers.DEFAULT_DEVICE_TYPES if x in [spy.DeviceType.vulkan] +] + + +@pytest.mark.parametrize("device_type", POINTER_DEVICE_TYPES) +def test_neural_bindless_pointer_type(device_type: spy.DeviceType) -> None: + device = _get_device_with_native_neural(device_type) + try: + module = spy.Module(device.load_module("test_neural_bindless_pointer.slang")) + + buf = device.create_buffer( + size=4, + usage=spy.BufferUsage.shader_resource, + data=np.array([42], dtype=np.int32), + ) + + res = int(module.read_int_ptr(buf.device_address)) + assert res == 42 + finally: + device.close() + + +@pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES) +def test_neural_bindless_descriptor_handle_type(device_type: spy.DeviceType) -> None: + if device_type == spy.DeviceType.cuda: + pytest.skip("Bindless DescriptorHandle resources not supported with CUDA yet.") + + device = _get_device_with_native_neural(device_type) + try: + if not device.has_feature(spy.Feature.bindless): + pytest.skip("Bindless not supported on this device.") + + module = device.load_module("test_neural_bindless_descriptor_handle.slang") + program = device.link_program( + modules=[module], entry_points=[module.entry_point("compute_main")] + ) + kernel = device.create_compute_kernel(program) + + buffer_count = 6 + + ro_buffers: list[spy.Buffer] = [] + rw_buffers: list[spy.Buffer] = [] + for i in range(buffer_count): + ro_buffers.append( + device.create_buffer( + size=4 * 4, + usage=spy.BufferUsage.shader_resource, + data=np.array([i * 10, i * 10 + 1, i * 10 + 2, i * 10 + 3], dtype=np.float32), + ) + ) + rw_buffers.append( + device.create_buffer( + size=4 * 4, + usage=spy.BufferUsage.shader_resource | spy.BufferUsage.unordered_access, + data=np.zeros(4, dtype=np.float32), + ) + ) + + buffer_info_layout = module.layout.get_type_layout( + module.layout.find_type_by_name("StructuredBuffer") + ).element_type_layout + + buffer_infos_buffer = device.create_buffer( + size=buffer_count * buffer_info_layout.stride, + usage=spy.BufferUsage.shader_resource, + ) + results_buffer = device.create_buffer( + size=buffer_count * 4, + usage=spy.BufferUsage.unordered_access, + ) + + c = spy.BufferCursor(buffer_info_layout, buffer_infos_buffer, load_before_write=False) + for i in range(buffer_count): + c[i].ro_buffer = ro_buffers[i].descriptor_handle_ro + c[i].rw_buffer = rw_buffers[i].descriptor_handle_rw + c[i].offset = i % 4 + c.apply() + + kernel.dispatch( + thread_count=[buffer_count, 1, 1], + buffer_infos=buffer_infos_buffer, + results=results_buffer, + ) + + results = results_buffer.to_numpy().view(np.float32) + expected_results = np.array( + [ + 0, # buffer 0, offset 0 + 11, # buffer 1, offset 1 + 22, # buffer 2, offset 2 + 33, # buffer 3, offset 3 + 40, # buffer 4, offset 0 + 51, # buffer 5, offset 1 + ], + dtype=np.float32, + ) + assert np.allclose(results, expected_results) + + # Verify RW buffers were written. + for i in range(buffer_count): + rw_data = rw_buffers[i].to_numpy().view(np.float32) + offset = i % 4 + expected_value = (i * 10 + offset) + 100.0 + assert np.isclose(rw_data[offset], expected_value) + finally: + device.close() + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s"]) + diff --git a/slangpy/tests/slangpy_tests/test_neural_bindless_descriptor_handle.slang b/slangpy/tests/slangpy_tests/test_neural_bindless_descriptor_handle.slang new file mode 100644 index 00000000..f89b39de --- /dev/null +++ b/slangpy/tests/slangpy_tests/test_neural_bindless_descriptor_handle.slang @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Neural integration smoke test for bindless DescriptorHandle style resources. +// We import `neural` to ensure experimental module compilation works alongside bindless. + +import neural; + +struct BufferInfo +{ + StructuredBuffer.Handle ro_buffer; + RWStructuredBuffer.Handle rw_buffer; + uint offset; +}; + +[shader("compute")] +[numthreads(1, 1, 1)] +void compute_main( + uint3 tid : SV_DispatchThreadID, + StructuredBuffer buffer_infos, + RWStructuredBuffer results) +{ + uint index = tid.x; + BufferInfo info = buffer_infos[index]; + + float value = info.ro_buffer[info.offset]; + info.rw_buffer[info.offset] = value + 100.0; + results[index] = value; +} + diff --git a/slangpy/tests/slangpy_tests/test_neural_bindless_pointer.slang b/slangpy/tests/slangpy_tests/test_neural_bindless_pointer.slang new file mode 100644 index 00000000..02147c50 --- /dev/null +++ b/slangpy/tests/slangpy_tests/test_neural_bindless_pointer.slang @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Neural integration smoke test for "bindless pointer" style parameters. +// This uses raw pointer parameters (passed from Python via Buffer.device_address). + +import slangpy; +import neural; + +int read_int_ptr(int* ptr) +{ + return ptr[0]; +} + diff --git a/slangpy/tests/slangpy_tests/test_neural_bwd_diff_smoke.py b/slangpy/tests/slangpy_tests/test_neural_bwd_diff_smoke.py new file mode 100644 index 00000000..604fa3ac --- /dev/null +++ b/slangpy/tests/slangpy_tests/test_neural_bwd_diff_smoke.py @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +""" +Neural smoke test that actually exercises Slang autodiff (`bwd_diff(...)`). + +Important constraints: +- No dependency on sample apps under `samples/`. +- No dependency on external assets (e.g. image files). + +This uses the test-local Slang module `fflayer-bug-repro.slang` which imports the +experimental `neural` module and calls `bwd_diff(loss)(DifferentialPtrPair(...), ...)`. +""" + +from __future__ import annotations + +from pathlib import Path + +import numpy as np +import pytest + +import slangpy as spy +from slangpy.core.calldata import SLANG_PATH +from slangpy.testing import helpers + + +def _get_device_with_native_neural(device_type: spy.DeviceType) -> spy.Device: + if helpers.should_skip_test_for_device(device_type): + pytest.skip(f"Device type {device_type.name} not selected for this test run") + + test_dir = Path(__file__).resolve().parent + compiler_options = spy.SlangCompilerOptions( + { + "include_paths": [test_dir, SLANG_PATH], + "debug_info": spy.SlangDebugInfoLevel.standard, + "enable_experimental_features": True, + } + ) + + return spy.Device( + type=device_type, + enable_debug_layers=True, + compiler_options=compiler_options, + label=f"uncached-slangpy-neural-bwd-diff-{device_type.name}", + ) + + +@pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES) +def test_neural_bwd_diff_writes_param_grads(device_type: spy.DeviceType) -> None: + device = _get_device_with_native_neural(device_type) + try: + module = spy.Module(device.load_module("fflayer-bug-repro.slang")) + + # 2*2 weights + 2 biases = 6 floats (matches `fflayer-bug-repo.py`) + params = device.create_buffer( + data=np.ones((6,), dtype=np.float32), + usage=spy.BufferUsage.shader_resource | spy.BufferUsage.unordered_access, + ) + dparams = device.create_buffer( + data=np.zeros((6,), dtype=np.float32), + usage=spy.BufferUsage.shader_resource | spy.BufferUsage.unordered_access, + ) + + module.calculate_grad(input=spy.float2(1, 1), params=params, dparams=dparams) + + dparams_np = dparams.to_numpy().view(np.float32) + assert np.any(dparams_np != 0.0) + finally: + device.close() + diff --git a/slangpy/tests/slangpy_tests/test_neural_coopmat_frontend_training.py b/slangpy/tests/slangpy_tests/test_neural_coopmat_frontend_training.py new file mode 100644 index 00000000..698233db --- /dev/null +++ b/slangpy/tests/slangpy_tests/test_neural_coopmat_frontend_training.py @@ -0,0 +1,215 @@ +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +""" +SlangPy integration test for neural module FFLayer with CoopMat (WaveTangledVector) backend. + +This extends test_neural_frontend_training.py to cover the cooperative matrix backend. +CoopMat requires: +- Vulkan with cooperative matrix extension support +- Explicit compute shaders with [numthreads(32, 1, 1)] +- Types defined inside shader functions + +Tests training convergence for a simple quadratic regression task using: +- FFLayer with WaveTangledVector backend +- Manual gradient computation (analytic gradients) +- Simple SGD optimization + +We fit a quadratic polynomial y = 2*x^2 - 0.5*x + 0.25 and verify convergence. +""" + +from __future__ import annotations + +from pathlib import Path + +import numpy as np +import pytest + +import slangpy as spy +from slangpy.core.calldata import SLANG_PATH +from slangpy.testing import helpers + + +def _get_device_with_native_neural(device_type: spy.DeviceType) -> spy.Device: + if helpers.should_skip_test_for_device(device_type): + pytest.skip(f"Device type {device_type.name} not selected for this test run") + + test_dir = Path(__file__).resolve().parent + + # Enable experimental features since neural is an experimental module + compiler_options = spy.SlangCompilerOptions( + { + "include_paths": [test_dir, SLANG_PATH], + "debug_info": spy.SlangDebugInfoLevel.standard, + "enable_experimental_features": True, + } + ) + + return spy.Device( + type=device_type, + enable_debug_layers=True, + compiler_options=compiler_options, + label=f"uncached-slangpy-neural-coopmat-frontend-{device_type.name}", + ) + + +# CoopMat only supported on Vulkan (and CUDA with SM 7.0+, but we focus on Vulkan here) +COOPMAT_DEVICE_TYPES: list[spy.DeviceType] = [ + x for x in helpers.DEFAULT_DEVICE_TYPES if x in [spy.DeviceType.vulkan] +] + + +@pytest.mark.parametrize("device_type", COOPMAT_DEVICE_TYPES) +def test_neural_coopmat_frontend_training_converges(device_type: spy.DeviceType) -> None: + """ + Test that training converges for a simple quadratic regression task using CoopMat backend. + + Uses FFLayer with WaveTangledVector (cooperative matrix backend). + """ + device = _get_device_with_native_neural(device_type) + try: + # Check for cooperative matrix support + if not device.has_feature(spy.Feature.cooperative_matrix): + pytest.skip("Cooperative matrix not supported on this device.") + + module = device.load_module("test_neural_coopmat_frontend_training.slang") + + # Get param count via simple function (doesn't need CoopMat) + param_count = int(spy.Module(module).get_param_count()) + assert param_count == 3 + + # Create compute kernels for CoopMat operations + eval_loss_program = device.link_program( + modules=[module], + entry_points=[module.entry_point("compute_eval_loss")] + ) + eval_loss_kernel = device.create_compute_kernel(eval_loss_program) + + train_step_program = device.link_program( + modules=[module], + entry_points=[module.entry_point("compute_train_step")] + ) + train_step_kernel = device.create_compute_kernel(train_step_program) + + # Fit: y = 2*x^2 - 0.5*x + 0.25 + sample_count = 256 + xs = np.linspace(-1.0, 1.0, sample_count, dtype=np.float32) + ys = (2.0 * xs * xs - 0.5 * xs + 0.25).astype(np.float32) + + xs_buf = device.create_buffer(data=xs, usage=spy.BufferUsage.shader_resource) + ys_buf = device.create_buffer(data=ys, usage=spy.BufferUsage.shader_resource) + + rng = np.random.default_rng(0) + params_init = (0.01 * rng.standard_normal(size=(param_count,))).astype(np.float32) + + params = device.create_buffer( + data=params_init, + usage=spy.BufferUsage.shader_resource | spy.BufferUsage.unordered_access, + ) + grads = device.create_buffer( + data=np.zeros((param_count,), dtype=np.float32), + usage=spy.BufferUsage.shader_resource | spy.BufferUsage.unordered_access, + ) + loss_buf = device.create_buffer( + data=np.zeros(1, dtype=np.float32), + usage=spy.BufferUsage.unordered_access, + ) + + # Dispatch with 1 group of 32 threads (warp-sized for CoopMat) + eval_loss_kernel.dispatch( + thread_count=[32, 1, 1], + params=params, + xs=xs_buf, + ys=ys_buf, + loss_out=loss_buf, + count=sample_count, + ) + initial_loss = float(loss_buf.to_numpy().view(np.float32)[0]) + + learning_rate = 0.1 + steps = 200 + for _ in range(steps): + train_step_kernel.dispatch( + thread_count=[32, 1, 1], + params=params, + grads=grads, + xs=xs_buf, + ys=ys_buf, + loss_out=loss_buf, + count=sample_count, + learningRate=learning_rate, + ) + + eval_loss_kernel.dispatch( + thread_count=[32, 1, 1], + params=params, + xs=xs_buf, + ys=ys_buf, + loss_out=loss_buf, + count=sample_count, + ) + final_loss = float(loss_buf.to_numpy().view(np.float32)[0]) + + # Convergence: should significantly reduce MSE and reach a small absolute error. + assert final_loss < initial_loss * 1e-2, f"Final loss {final_loss} not < initial*0.01 {initial_loss * 1e-2}" + assert final_loss < 1e-3, f"Final loss {final_loss} not < 1e-3" + + # Parameter packing: [w0, w1, bias] for y = w0*x + w1*x^2 + bias + learned = params.to_numpy().view(np.float32)[:param_count] + expected = np.array([-0.5, 2.0, 0.25], dtype=np.float32) + assert np.allclose(learned, expected, rtol=0.1, atol=0.1), f"Learned {learned} != expected {expected}" + + finally: + device.close() + + +@pytest.mark.parametrize("device_type", COOPMAT_DEVICE_TYPES) +def test_neural_coopmat_fflayer_forward_pass(device_type: spy.DeviceType) -> None: + """ + Test FFLayer forward pass with WaveTangledVector produces correct output. + """ + device = _get_device_with_native_neural(device_type) + try: + if not device.has_feature(spy.Feature.cooperative_matrix): + pytest.skip("Cooperative matrix not supported on this device.") + + module = device.load_module("test_neural_coopmat_frontend_training.slang") + + forward_program = device.link_program( + modules=[module], + entry_points=[module.entry_point("compute_forward_pass")] + ) + forward_kernel = device.create_compute_kernel(forward_program) + + # Set up known weights for verification + # Layer: 2 inputs -> 1 output with bias + # y = w0*x0 + w1*x1 + b + # With w0=1, w1=2, b=0.5: y = 1*1 + 2*2 + 0.5 = 5.5 + params = device.create_buffer( + data=np.array([1.0, 2.0, 0.5], dtype=np.float32), + usage=spy.BufferUsage.shader_resource | spy.BufferUsage.unordered_access, + ) + result = device.create_buffer( + data=np.zeros(1, dtype=np.float32), + usage=spy.BufferUsage.unordered_access, + ) + + # Input: [1.0, 2.0] + # Dispatch with 1 group of 32 threads + forward_kernel.dispatch( + thread_count=[32, 1, 1], + params=params, + result=result, + x0=1.0, + x1=2.0, + ) + + output = result.to_numpy().view(np.float32)[0] + expected = 1.0 * 1.0 + 2.0 * 2.0 + 0.5 # = 5.5 + assert np.isclose(output, expected, rtol=0.1), f"Output {output} != expected {expected}" + + finally: + device.close() + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s"]) diff --git a/slangpy/tests/slangpy_tests/test_neural_coopmat_frontend_training.slang b/slangpy/tests/slangpy_tests/test_neural_coopmat_frontend_training.slang new file mode 100644 index 00000000..6907d758 --- /dev/null +++ b/slangpy/tests/slangpy_tests/test_neural_coopmat_frontend_training.slang @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// SlangPy test for FFLayer with CoopMat (WaveTangledVector) backend. +// This extends test_neural_frontend_training.slang to cover the cooperative matrix backend. +// +// Key requirements for CoopMat: +// - Must use explicit compute shader entry points with [numthreads(32, 1, 1)] +// - Types must be defined inside shader functions +// - Requires cooperative matrix capabilities + +import neural; + +typealias Storage = StructuredBufferStorage; + +// Shared memory configuration +static const int InputSize = 2; +static const int OutputSize = 1; +static const int SubgroupSize = 32; +static const int BatchSize = 32; + +typealias ShMemSize = SharedMemorySize; +typealias ShMemSizeLayer = ShMemSize.OfLayer1; + +// Linear layer: Input=2 (x, x^2), Output=1 (y), with bias +// Parameters: weights (1x2) + bias (1) = 3 params +static const int PARAM_COUNT = 3; + +// Simple function to get param count (doesn't need CoopMat) +int get_param_count() +{ + return PARAM_COUNT; +} + +// Compute shader entry point for forward pass test +// Dispatched with [1, 1, 1] groups, 32 threads per group +[shader("compute")] +[numthreads(32, 1, 1)] +void compute_forward_pass( + uint3 tid : SV_DispatchThreadID, + uint gtid : SV_GroupIndex, + RWStructuredBuffer params, + RWStructuredBuffer result, + uniform float x0, + uniform float x1) +{ + // Define CoopMat types inside shader function + typealias ShMemPool = SharedMemoryPool; + typealias V2 = WaveTangledVector; + typealias V1 = WaveTangledVector; + typealias Act = IdentityActivation; + typealias LinearLayer = FFLayer; + + let storage = Storage(params); + let layer = LinearLayer(0, 2); // weights at 0, bias at 2 + + float featsArr[InputSize] = { x0, x1 }; + let feats = V2(featsArr); + + let predV = layer.eval(storage, NoParam(), feats); + + // Only first thread writes result + if (gtid == 0) + { + result[0] = predV[0]; + } +} + +// Compute shader entry point for evaluating loss +[shader("compute")] +[numthreads(32, 1, 1)] +void compute_eval_loss( + uint3 tid : SV_DispatchThreadID, + uint gtid : SV_GroupIndex, + RWStructuredBuffer params, + StructuredBuffer xs, + StructuredBuffer ys, + RWStructuredBuffer loss_out, + uniform int count) +{ + typealias ShMemPool = SharedMemoryPool; + typealias V2 = WaveTangledVector; + typealias V1 = WaveTangledVector; + typealias Act = IdentityActivation; + typealias LinearLayer = FFLayer; + + let storage = Storage(params); + let layer = LinearLayer(0, 2); + + float sum = 0.0; + for (int i = 0; i < count; i++) + { + let x = xs[i]; + float featsArr[InputSize] = { x, x * x }; + let feats = V2(featsArr); + + let predV = layer.eval(storage, NoParam(), feats); + let pred = predV[0]; + let target = ys[i]; + + let err = pred - target; + sum += err * err; + } + + if (gtid == 0) + { + loss_out[0] = sum / float(count); + } +} + +// Compute shader entry point for training step +[shader("compute")] +[numthreads(32, 1, 1)] +void compute_train_step( + uint3 tid : SV_DispatchThreadID, + uint gtid : SV_GroupIndex, + RWStructuredBuffer params, + RWStructuredBuffer grads, + StructuredBuffer xs, + StructuredBuffer ys, + RWStructuredBuffer loss_out, + uniform int count, + uniform float learningRate) +{ + typealias ShMemPool = SharedMemoryPool; + typealias V2 = WaveTangledVector; + typealias V1 = WaveTangledVector; + typealias Act = IdentityActivation; + typealias LinearLayer = FFLayer; + + let storage = Storage(params); + let layer = LinearLayer(0, 2); + + // Clear grads (only first thread) + if (gtid == 0) + { + for (int i = 0; i < PARAM_COUNT; i++) + grads[i] = 0.0; + } + GroupMemoryBarrierWithGroupSync(); + + // Accumulate analytic grads for y = w0*x + w1*x^2 + b + float g0 = 0.0; + float g1 = 0.0; + float gb = 0.0; + float lossSum = 0.0; + + for (int i = 0; i < count; i++) + { + let x = xs[i]; + let t = ys[i]; + + float featsArr[InputSize] = { x, x * x }; + let feats = V2(featsArr); + + let predV = layer.eval(storage, NoParam(), feats); + let pred = predV[0]; + + let err = pred - t; + lossSum += err * err; + + g0 += 2.0 * err * x; + g1 += 2.0 * err * (x * x); + gb += 2.0 * err; + } + + // Only first thread writes and updates + if (gtid == 0) + { + let invN = 1.0 / float(count); + grads[0] = g0 * invN; + grads[1] = g1 * invN; + grads[2] = gb * invN; + + // SGD update + for (int i = 0; i < PARAM_COUNT; i++) + { + params[i] = params[i] - learningRate * grads[i]; + } + + loss_out[0] = lossSum * invN; + } +} diff --git a/slangpy/tests/slangpy_tests/test_neural_frontend_training.py b/slangpy/tests/slangpy_tests/test_neural_frontend_training.py new file mode 100644 index 00000000..75b8675d --- /dev/null +++ b/slangpy/tests/slangpy_tests/test_neural_frontend_training.py @@ -0,0 +1,108 @@ +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +""" +SlangPy integration test for neural module FFLayer (Option 2 design). + +Tests training convergence for a simple quadratic regression task using: +- FFLayer with storage passed as parameter to eval() +- Manual gradient computation (analytic gradients) +- Simple SGD optimization + +We fit a quadratic polynomial y = 2*x^2 - 0.5*x + 0.25 and verify convergence. +""" + +from __future__ import annotations + +from pathlib import Path + +import numpy as np +import pytest + +import slangpy as spy +from slangpy.core.calldata import SLANG_PATH +from slangpy.testing import helpers + + +def _get_device_with_native_neural(device_type: spy.DeviceType) -> spy.Device: + if helpers.should_skip_test_for_device(device_type): + pytest.skip(f"Device type {device_type.name} not selected for this test run") + + test_dir = Path(__file__).resolve().parent + + # Use pre-built neural module from slang (not compiled from source) + # The neural module is built as part of slang-neural-module target + # Enable experimental features since neural is an experimental module + compiler_options = spy.SlangCompilerOptions( + { + "include_paths": [test_dir, SLANG_PATH], + "debug_info": spy.SlangDebugInfoLevel.standard, + "enable_experimental_features": True, + } + ) + + return spy.Device( + type=device_type, + enable_debug_layers=True, + compiler_options=compiler_options, + label=f"uncached-slangpy-neural-frontend-{device_type.name}", + ) + + +@pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES) +def test_neural_frontend_training_converges(device_type: spy.DeviceType) -> None: + """ + Test that training converges for a simple quadratic regression task. + + Uses FFLayer with Option 2 design (storage as parameter to eval). + """ + device = _get_device_with_native_neural(device_type) + try: + module = spy.Module(device.load_module("test_neural_frontend_training.slang")) + + param_count = int(module.get_param_count()) + assert param_count == 3 + + # Fit: y = 2*x^2 - 0.5*x + 0.25 + sample_count = 256 + xs = np.linspace(-1.0, 1.0, sample_count, dtype=np.float32) + ys = (2.0 * xs * xs - 0.5 * xs + 0.25).astype(np.float32) + + xs_buf = device.create_buffer(data=xs, usage=spy.BufferUsage.shader_resource) + ys_buf = device.create_buffer(data=ys, usage=spy.BufferUsage.shader_resource) + + rng = np.random.default_rng(0) + params_init = (0.01 * rng.standard_normal(size=(param_count,))).astype(np.float32) + + params = device.create_buffer( + data=params_init, + usage=spy.BufferUsage.shader_resource | spy.BufferUsage.unordered_access, + ) + grads = device.create_buffer( + data=np.zeros((param_count,), dtype=np.float32), + usage=spy.BufferUsage.shader_resource | spy.BufferUsage.unordered_access, + ) + + initial_loss = float(module.eval_loss(params, xs_buf, ys_buf, sample_count)) + + learning_rate = 0.1 + steps = 200 + for _ in range(steps): + module.train_step(params, grads, xs_buf, ys_buf, sample_count, learning_rate) + + final_loss = float(module.eval_loss(params, xs_buf, ys_buf, sample_count)) + + # Convergence: should significantly reduce MSE and reach a small absolute error. + assert final_loss < initial_loss * 1e-2 + assert final_loss < 1e-3 + + # Parameter packing: [w0, w1, bias] for y = w0*x + w1*x^2 + bias + learned = params.to_numpy().view(np.float32)[:param_count] + expected = np.array([-0.5, 2.0, 0.25], dtype=np.float32) + assert np.allclose(learned, expected, rtol=0.1, atol=0.1) + + finally: + device.close() + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s"]) diff --git a/slangpy/tests/slangpy_tests/test_neural_frontend_training.slang b/slangpy/tests/slangpy_tests/test_neural_frontend_training.slang new file mode 100644 index 00000000..204d6477 --- /dev/null +++ b/slangpy/tests/slangpy_tests/test_neural_frontend_training.slang @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// SlangPy test for FFLayer autodiff backward pass (Option 2 design). +// Verifies that autodiff correctly computes gradients through eval(). +// +// We fit a quadratic polynomial y = 2*x^2 - 0.5*x + 0.25 using a single linear layer over +// features [x, x^2], and verify training converges. + +import slangpy; +import neural; + +typealias Storage = StructuredBufferStorage; +typealias V2 = InlineVector; +typealias V1 = InlineVector; +typealias Act = IdentityActivation; + +// Linear layer: Input=2 (x, x^2), Output=1 (y), with bias +// Parameters: weights (1x2) + bias (1) = 3 params +typealias LinearLayer = FFLayer; + +static const int PARAM_COUNT = LinearLayer.ParameterCount; + +int get_param_count() +{ + return PARAM_COUNT; +} + +float eval_loss( + RWStructuredBuffer params, + StructuredBuffer xs, + StructuredBuffer ys, + int count) +{ + let storage = Storage(params); + // Option 2: only addresses in constructor + // weights at 0 (2 floats), bias at 2 (1 float) + let layer = LinearLayer(0, 2); + + float sum = 0.0; + [MaxIters(1024)] + for (int i = 0; i < count; i++) + { + let x = xs[i]; + + float featsArr[2] = { x, x * x }; + let feats = V2(featsArr); + + // Option 2: storage passed to eval() + let predV = layer.eval(storage, NoParam(), feats); + let pred = predV[0]; + let target = ys[i]; + + let err = pred - target; + sum += err * err; + } + + return sum / float(count); +} + +float train_step( + RWStructuredBuffer params, + RWStructuredBuffer grads, + no_diff StructuredBuffer xs, + no_diff StructuredBuffer ys, + no_diff int count, + no_diff float learningRate) +{ + let pStorage = Storage(params); + let gStorage = Storage(grads); + + // Clear gradient buffer + [MaxIters(1024)] + for (int i = 0; i < PARAM_COUNT; i++) + grads[i] = 0.0; + + // Option 2: only addresses in constructor + let layer = LinearLayer(0, 2); + + // Accumulate analytic grads for y = w0*x + w1*x^2 + b, loss = mean((y - t)^2) + float g0 = 0.0; + float g1 = 0.0; + float gb = 0.0; + + float lossSum = 0.0; + + [MaxIters(1024)] + for (int i = 0; i < count; i++) + { + let x = xs[i]; + let t = ys[i]; + + float featsArr[2] = { x, x * x }; + let feats = V2(featsArr); + + // Option 2: storage passed to eval() + let predV = layer.eval(pStorage, NoParam(), feats); + let pred = predV[0]; + + let err = pred - t; + lossSum += err * err; + + g0 += 2.0 * err * x; + g1 += 2.0 * err * (x * x); + gb += 2.0 * err; + } + + let invN = 1.0 / float(count); + grads[0] = g0 * invN; + grads[1] = g1 * invN; + grads[2] = gb * invN; + + // Simple SGD update: params -= lr * grads + [MaxIters(1024)] + for (int i = 0; i < PARAM_COUNT; i++) + { + params[i] = params[i] - learningRate * grads[i]; + } + + return lossSum * invN; +} diff --git a/src/sgl/device/shader.cpp b/src/sgl/device/shader.cpp index c733e45e..48aa28c1 100644 --- a/src/sgl/device/shader.cpp +++ b/src/sgl/device/shader.cpp @@ -333,6 +333,10 @@ void SlangSession::create_session(SlangSessionBuild& build) session_options.add(slang::CompilerOptionName::DumpIntermediates, options.dump_intermediates); session_options.add(slang::CompilerOptionName::DumpIntermediatePrefix, options.dump_intermediates_prefix); + // Enable experimental features (e.g., experimental modules like neural). + if (options.enable_experimental_features) + session_options.add(slang::CompilerOptionName::ExperimentalFeature, true); + // Add hlsl_nvapi capability. session_options.add( slang::CompilerOptionName::Capability, diff --git a/src/sgl/device/shader.h b/src/sgl/device/shader.h index 538b58f8..1b6052e2 100644 --- a/src/sgl/device/shader.h +++ b/src/sgl/device/shader.h @@ -183,6 +183,9 @@ struct SlangCompilerOptions { /// Specifies a list of additional arguments to be passed to the downstream compiler. std::vector downstream_args; + /// Enable experimental features (e.g., experimental modules like neural). + bool enable_experimental_features{false}; + /// When set will dump the intermediate source output. bool dump_intermediates{false}; diff --git a/src/slangpy_ext/device/shader.cpp b/src/slangpy_ext/device/shader.cpp index 84bf61c6..39acec85 100644 --- a/src/slangpy_ext/device/shader.cpp +++ b/src/slangpy_ext/device/shader.cpp @@ -24,6 +24,7 @@ SGL_DICT_TO_DESC_FIELD(floating_point_mode, SlangFloatingPointMode) SGL_DICT_TO_DESC_FIELD(debug_info, SlangDebugInfoLevel) SGL_DICT_TO_DESC_FIELD(optimization, SlangOptimizationLevel) SGL_DICT_TO_DESC_FIELD_LIST(downstream_args, std::string) +SGL_DICT_TO_DESC_FIELD(enable_experimental_features, bool) SGL_DICT_TO_DESC_FIELD(dump_intermediates, bool) SGL_DICT_TO_DESC_FIELD(dump_intermediates_prefix, std::string) SGL_DICT_TO_DESC_END() @@ -135,6 +136,11 @@ SGL_PY_EXPORT(device_shader) .def_rw("debug_info", &SlangCompilerOptions::debug_info, D(SlangCompilerOptions, debug_info)) .def_rw("optimization", &SlangCompilerOptions::optimization, D(SlangCompilerOptions, optimization)) .def_rw("downstream_args", &SlangCompilerOptions::downstream_args, D(SlangCompilerOptions, downstream_args)) + .def_rw( + "enable_experimental_features", + &SlangCompilerOptions::enable_experimental_features, + "Enable experimental features (e.g., experimental modules like neural)." + ) .def_rw( "dump_intermediates", &SlangCompilerOptions::dump_intermediates,