diff --git a/slangpy/tests/slangpy_tests/test_neural_bindless.py b/slangpy/tests/slangpy_tests/test_neural_bindless.py
new file mode 100644
index 00000000..a6b5a29d
--- /dev/null
+++ b/slangpy/tests/slangpy_tests/test_neural_bindless.py
@@ -0,0 +1,157 @@
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""
+Neural integration tests for bindless resource types.
+
+Reviewer-requested coverage:
+- Bindless "pointer type" (raw pointer parameters passed via Buffer.device_address)
+- Bindless DescriptorHandle resources (StructuredBuffer<T>.Handle / RWStructuredBuffer<T>.Handle)
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+import slangpy as spy
+from slangpy.core.calldata import SLANG_PATH
+from slangpy.testing import helpers
+
+
+def _get_device_with_native_neural(device_type: spy.DeviceType) -> spy.Device:
+    """Create a debug-layer device whose compiler has experimental features enabled."""
+    if helpers.should_skip_test_for_device(device_type):
+        pytest.skip(f"Device type {device_type.name} not selected for this test run")
+
+    # Shaders are looked up next to this test file and in SLANG_PATH.
+    search_paths = [Path(__file__).resolve().parent, SLANG_PATH]
+    options = {
+        "include_paths": search_paths,
+        "debug_info": spy.SlangDebugInfoLevel.standard,
+        "enable_experimental_features": True,
+    }
+
+    return spy.Device(
+        type=device_type,
+        enable_debug_layers=True,
+        compiler_options=spy.SlangCompilerOptions(options),
+        label=f"uncached-slangpy-neural-bindless-{device_type.name}",
+    )
+
+
+# Raw-pointer bindless parameters are supported on Vulkan. The test stays Vulkan-only so
+# that this integration test has no backend-specific CUDA toolchain requirements.
+POINTER_DEVICE_TYPES: list[spy.DeviceType] = [
+    dt for dt in helpers.DEFAULT_DEVICE_TYPES if dt == spy.DeviceType.vulkan
+]
+
+
+@pytest.mark.parametrize("device_type", POINTER_DEVICE_TYPES)
+def test_neural_bindless_pointer_type(device_type: spy.DeviceType) -> None:
+    """Read one int32 through a raw pointer built from `Buffer.device_address`."""
+    device = _get_device_with_native_neural(device_type)
+    try:
+        module = spy.Module(device.load_module("test_neural_bindless_pointer.slang"))
+
+        # A single int32 (4 bytes) holding the value the shader must return.
+        payload = np.array([42], dtype=np.int32)
+        buf = device.create_buffer(
+            size=4, usage=spy.BufferUsage.shader_resource, data=payload
+        )
+
+        assert int(module.read_int_ptr(buf.device_address)) == 42
+    finally:
+        device.close()
+
+
+@pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES)
+def test_neural_bindless_descriptor_handle_type(device_type: spy.DeviceType) -> None:
+    if device_type == spy.DeviceType.cuda:
+        pytest.skip("Bindless DescriptorHandle resources not supported with CUDA yet.")
+
+    device = _get_device_with_native_neural(device_type)
+    try:
+        if not device.has_feature(spy.Feature.bindless):
+            pytest.skip("Bindless not supported on this device.")
+        # Link the shader's `compute_main` entry point into a compute kernel.
+        module = device.load_module("test_neural_bindless_descriptor_handle.slang")
+        program = device.link_program(
+            modules=[module], entry_points=[module.entry_point("compute_main")]
+        )
+        kernel = device.create_compute_kernel(program)
+
+        buffer_count = 6  # also the dispatch thread count below
+        # Read-only buffer i holds [10*i, 10*i+1, 10*i+2, 10*i+3]; its RW twin starts zeroed.
+        ro_buffers: list[spy.Buffer] = []
+        rw_buffers: list[spy.Buffer] = []
+        for i in range(buffer_count):
+            ro_buffers.append(
+                device.create_buffer(
+                    size=4 * 4,
+                    usage=spy.BufferUsage.shader_resource,
+                    data=np.array([i * 10, i * 10 + 1, i * 10 + 2, i * 10 + 3], dtype=np.float32),
+                )
+            )
+            rw_buffers.append(
+                device.create_buffer(
+                    size=4 * 4,
+                    usage=spy.BufferUsage.shader_resource | spy.BufferUsage.unordered_access,
+                    data=np.zeros(4, dtype=np.float32),
+                )
+            )
+        # Element layout of `BufferInfo`, taken from shader reflection.
+        buffer_info_layout = module.layout.get_type_layout(
+            module.layout.find_type_by_name("StructuredBuffer<BufferInfo>")
+        ).element_type_layout
+
+        buffer_infos_buffer = device.create_buffer(
+            size=buffer_count * buffer_info_layout.stride,
+            usage=spy.BufferUsage.shader_resource,
+        )
+        results_buffer = device.create_buffer(
+            size=buffer_count * 4,
+            usage=spy.BufferUsage.unordered_access,
+        )
+        # Fill one BufferInfo per buffer pair: both descriptor handles plus an element offset.
+        c = spy.BufferCursor(buffer_info_layout, buffer_infos_buffer, load_before_write=False)
+        for i in range(buffer_count):
+            c[i].ro_buffer = ro_buffers[i].descriptor_handle_ro
+            c[i].rw_buffer = rw_buffers[i].descriptor_handle_rw
+            c[i].offset = i % 4
+        c.apply()
+
+        kernel.dispatch(
+            thread_count=[buffer_count, 1, 1],
+            buffer_infos=buffer_infos_buffer,
+            results=results_buffer,
+        )
+        # The shader stores ro_buffer[offset] into results[i], with offset = i % 4.
+        results = results_buffer.to_numpy().view(np.float32)
+        expected_results = np.array(
+            [
+                0,  # buffer 0, offset 0
+                11,  # buffer 1, offset 1
+                22,  # buffer 2, offset 2
+                33,  # buffer 3, offset 3
+                40,  # buffer 4, offset 0
+                51,  # buffer 5, offset 1
+            ],
+            dtype=np.float32,
+        )
+        assert np.allclose(results, expected_results)
+
+        # Each RW buffer must hold (value read + 100) at its offset, as written by the shader.
+        for i in range(buffer_count):
+            rw_data = rw_buffers[i].to_numpy().view(np.float32)
+            offset = i % 4
+            expected_value = (i * 10 + offset) + 100.0
+            assert np.isclose(rw_data[offset], expected_value)
+    finally:
+        device.close()
+
+
+if __name__ == "__main__":  # lets this file be run directly instead of through pytest
+    pytest.main([__file__, "-v", "-s"])
+
diff --git a/slangpy/tests/slangpy_tests/test_neural_bindless_descriptor_handle.slang b/slangpy/tests/slangpy_tests/test_neural_bindless_descriptor_handle.slang
new file mode 100644
index 00000000..f89b39de
--- /dev/null
+++ b/slangpy/tests/slangpy_tests/test_neural_bindless_descriptor_handle.slang
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Neural integration smoke test for bindless DescriptorHandle style resources.
+// We import `neural` to ensure experimental module compilation works alongside bindless.
+
+import neural;
+
+struct BufferInfo
+{
+    StructuredBuffer<float>.Handle ro_buffer;   // bindless handle, read by compute_main
+    RWStructuredBuffer<float>.Handle rw_buffer; // bindless handle, written by compute_main
+    uint offset;                                // element index used for both buffers
+};
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(
+    uint3 tid : SV_DispatchThreadID,
+    StructuredBuffer<BufferInfo> buffer_infos,
+    RWStructuredBuffer<float> results)
+{
+    // One thread per BufferInfo entry, selected by the dispatch thread's x coordinate.
+    BufferInfo entry = buffer_infos[tid.x];
+    // Read through the read-only handle, store value + 100 via the RW handle, echo the value.
+    float sample = entry.ro_buffer[entry.offset];
+    entry.rw_buffer[entry.offset] = sample + 100.0;
+    results[tid.x] = sample;
+}
+
diff --git a/slangpy/tests/slangpy_tests/test_neural_bindless_pointer.slang b/slangpy/tests/slangpy_tests/test_neural_bindless_pointer.slang
new file mode 100644
index 00000000..02147c50
--- /dev/null
+++ b/slangpy/tests/slangpy_tests/test_neural_bindless_pointer.slang
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Neural integration smoke test for "bindless pointer" style parameters.
+// This uses raw pointer parameters (passed from Python via Buffer.device_address).
+
+import slangpy;
+import neural;
+
+int read_int_ptr(int* ptr)
+{
+    return ptr[0]; // the first int stored at the address supplied by the caller
+}
+
diff --git a/slangpy/tests/slangpy_tests/test_neural_bwd_diff_smoke.py b/slangpy/tests/slangpy_tests/test_neural_bwd_diff_smoke.py
new file mode 100644
index 00000000..604fa3ac
--- /dev/null
+++ b/slangpy/tests/slangpy_tests/test_neural_bwd_diff_smoke.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""
+Neural smoke test that actually exercises Slang autodiff (`bwd_diff(...)`).
+
+Important constraints:
+- No dependency on sample apps under `samples/`.
+- No dependency on external assets (e.g. image files).
+
+This uses the test-local Slang module `fflayer-bug-repro.slang` which imports the
+experimental `neural` module and calls `bwd_diff(loss)(DifferentialPtrPair<Storage>(...), ...)`.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+import slangpy as spy
+from slangpy.core.calldata import SLANG_PATH
+from slangpy.testing import helpers
+
+
+def _get_device_with_native_neural(device_type: spy.DeviceType) -> spy.Device:
+    """Build a debug-layer device that can compile the experimental `neural` module."""
+    if helpers.should_skip_test_for_device(device_type):
+        pytest.skip(f"Device type {device_type.name} not selected for this test run")
+
+    # Include paths: this test directory first, then SLANG_PATH.
+    here = Path(__file__).resolve().parent
+    settings = {
+        "include_paths": [here, SLANG_PATH],
+        "debug_info": spy.SlangDebugInfoLevel.standard,
+        "enable_experimental_features": True,
+    }
+
+    return spy.Device(
+        type=device_type,
+        enable_debug_layers=True,
+        compiler_options=spy.SlangCompilerOptions(settings),
+        label=f"uncached-slangpy-neural-bwd-diff-{device_type.name}",
+    )
+
+
+@pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES)
+def test_neural_bwd_diff_writes_param_grads(device_type: spy.DeviceType) -> None:
+    device = _get_device_with_native_neural(device_type)
+    try:
+        module = spy.Module(device.load_module("fflayer-bug-repro.slang"))
+
+        # 2*2 weights + 2 biases = 6 floats (matches `fflayer-bug-repo.py`)
+        params = device.create_buffer(
+            data=np.ones((6,), dtype=np.float32),
+            usage=spy.BufferUsage.shader_resource | spy.BufferUsage.unordered_access,
+        )
+        dparams = device.create_buffer(
+            data=np.zeros((6,), dtype=np.float32),
+            usage=spy.BufferUsage.shader_resource | spy.BufferUsage.unordered_access,
+        )
+
+        module.calculate_grad(input=spy.float2(1, 1), params=params, dparams=dparams)
+        # dparams started as all zeros, so any non-zero entry was written by the call above.
+        dparams_np = dparams.to_numpy().view(np.float32)
+        assert np.any(dparams_np != 0.0)
+    finally:
+        device.close()
+
diff --git a/slangpy/tests/slangpy_tests/test_neural_coopmat_frontend_training.py b/slangpy/tests/slangpy_tests/test_neural_coopmat_frontend_training.py
new file mode 100644
index 00000000..698233db
--- /dev/null
+++ b/slangpy/tests/slangpy_tests/test_neural_coopmat_frontend_training.py
@@ -0,0 +1,215 @@
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""
+SlangPy integration test for neural module FFLayer with CoopMat (WaveTangledVector) backend.
+
+This extends test_neural_frontend_training.py to cover the cooperative matrix backend.
+CoopMat requires:
+- Vulkan with cooperative matrix extension support
+- Explicit compute shaders with [numthreads(32, 1, 1)]
+- Types defined inside shader functions
+
+Tests training convergence for a simple quadratic regression task using:
+- FFLayer with WaveTangledVector backend
+- Manual gradient computation (analytic gradients)
+- Simple SGD optimization
+
+We fit a quadratic polynomial y = 2*x^2 - 0.5*x + 0.25 and verify convergence.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+import slangpy as spy
+from slangpy.core.calldata import SLANG_PATH
+from slangpy.testing import helpers
+
+
+def _get_device_with_native_neural(device_type: spy.DeviceType) -> spy.Device:
+    """Create a debug-layer device whose compiler has experimental features enabled."""
+    if helpers.should_skip_test_for_device(device_type):
+        pytest.skip(f"Device type {device_type.name} not selected for this test run")
+
+    # Shaders are looked up next to this test file and in SLANG_PATH.
+    search_paths = [Path(__file__).resolve().parent, SLANG_PATH]
+
+    # `neural` is an experimental module, so experimental features must be switched on.
+    options = {
+        "include_paths": search_paths,
+        "debug_info": spy.SlangDebugInfoLevel.standard,
+        "enable_experimental_features": True,
+    }
+
+    return spy.Device(
+        type=device_type,
+        enable_debug_layers=True,
+        compiler_options=spy.SlangCompilerOptions(options),
+        label=f"uncached-slangpy-neural-coopmat-frontend-{device_type.name}",
+    )
+
+
+# CoopMat runs on Vulkan (and on CUDA with SM 7.0+); this test only targets Vulkan.
+COOPMAT_DEVICE_TYPES: list[spy.DeviceType] = [
+    dt for dt in helpers.DEFAULT_DEVICE_TYPES if dt == spy.DeviceType.vulkan
+]
+
+
+@pytest.mark.parametrize("device_type", COOPMAT_DEVICE_TYPES)
+def test_neural_coopmat_frontend_training_converges(device_type: spy.DeviceType) -> None:
+    """
+    Test that training converges for a simple quadratic regression task using CoopMat backend.
+
+    Uses FFLayer with WaveTangledVector (cooperative matrix backend); skipped without the feature.
+    """
+    device = _get_device_with_native_neural(device_type)
+    try:
+        # Check for cooperative matrix support
+        if not device.has_feature(spy.Feature.cooperative_matrix):
+            pytest.skip("Cooperative matrix not supported on this device.")
+
+        module = device.load_module("test_neural_coopmat_frontend_training.slang")
+
+        # Get param count via simple function (doesn't need CoopMat)
+        param_count = int(spy.Module(module).get_param_count())
+        assert param_count == 3
+
+        # Create compute kernels for CoopMat operations
+        eval_loss_program = device.link_program(
+            modules=[module],
+            entry_points=[module.entry_point("compute_eval_loss")]
+        )
+        eval_loss_kernel = device.create_compute_kernel(eval_loss_program)
+
+        train_step_program = device.link_program(
+            modules=[module],
+            entry_points=[module.entry_point("compute_train_step")]
+        )
+        train_step_kernel = device.create_compute_kernel(train_step_program)
+
+        # Fit: y = 2*x^2 - 0.5*x + 0.25
+        sample_count = 256
+        xs = np.linspace(-1.0, 1.0, sample_count, dtype=np.float32)
+        ys = (2.0 * xs * xs - 0.5 * xs + 0.25).astype(np.float32)
+
+        xs_buf = device.create_buffer(data=xs, usage=spy.BufferUsage.shader_resource)
+        ys_buf = device.create_buffer(data=ys, usage=spy.BufferUsage.shader_resource)
+        # Small random initial parameters from a fixed seed, so the run is reproducible.
+        rng = np.random.default_rng(0)
+        params_init = (0.01 * rng.standard_normal(size=(param_count,))).astype(np.float32)
+
+        params = device.create_buffer(
+            data=params_init,
+            usage=spy.BufferUsage.shader_resource | spy.BufferUsage.unordered_access,
+        )
+        grads = device.create_buffer(
+            data=np.zeros((param_count,), dtype=np.float32),
+            usage=spy.BufferUsage.shader_resource | spy.BufferUsage.unordered_access,
+        )
+        loss_buf = device.create_buffer(
+            data=np.zeros(1, dtype=np.float32),
+            usage=spy.BufferUsage.unordered_access,
+        )
+
+        # Dispatch with 1 group of 32 threads (warp-sized for CoopMat)
+        eval_loss_kernel.dispatch(
+            thread_count=[32, 1, 1],
+            params=params,
+            xs=xs_buf,
+            ys=ys_buf,
+            loss_out=loss_buf,
+            count=sample_count,
+        )
+        initial_loss = float(loss_buf.to_numpy().view(np.float32)[0])
+        # Run a fixed number of SGD steps; the kernel updates `params` in place.
+        learning_rate = 0.1
+        steps = 200
+        for _ in range(steps):
+            train_step_kernel.dispatch(
+                thread_count=[32, 1, 1],
+                params=params,
+                grads=grads,
+                xs=xs_buf,
+                ys=ys_buf,
+                loss_out=loss_buf,
+                count=sample_count,
+                learningRate=learning_rate,
+            )
+        # Re-evaluate the loss with the trained parameters.
+        eval_loss_kernel.dispatch(
+            thread_count=[32, 1, 1],
+            params=params,
+            xs=xs_buf,
+            ys=ys_buf,
+            loss_out=loss_buf,
+            count=sample_count,
+        )
+        final_loss = float(loss_buf.to_numpy().view(np.float32)[0])
+
+        # Convergence: should significantly reduce MSE and reach a small absolute error.
+        assert final_loss < initial_loss * 1e-2, f"Final loss {final_loss} not < initial*0.01 {initial_loss * 1e-2}"
+        assert final_loss < 1e-3, f"Final loss {final_loss} not < 1e-3"
+
+        # Parameter packing: [w0, w1, bias] for y = w0*x + w1*x^2 + bias
+        learned = params.to_numpy().view(np.float32)[:param_count]
+        expected = np.array([-0.5, 2.0, 0.25], dtype=np.float32)
+        assert np.allclose(learned, expected, rtol=0.1, atol=0.1), f"Learned {learned} != expected {expected}"
+
+    finally:
+        device.close()
+
+
+@pytest.mark.parametrize("device_type", COOPMAT_DEVICE_TYPES)
+def test_neural_coopmat_fflayer_forward_pass(device_type: spy.DeviceType) -> None:
+    """
+    Test FFLayer forward pass with WaveTangledVector against a hand-computed expected output.
+    """
+    device = _get_device_with_native_neural(device_type)
+    try:
+        if not device.has_feature(spy.Feature.cooperative_matrix):
+            pytest.skip("Cooperative matrix not supported on this device.")
+
+        module = device.load_module("test_neural_coopmat_frontend_training.slang")
+        # Only the `compute_forward_pass` entry point is linked for this test.
+        forward_program = device.link_program(
+            modules=[module],
+            entry_points=[module.entry_point("compute_forward_pass")]
+        )
+        forward_kernel = device.create_compute_kernel(forward_program)
+
+        # Set up known weights for verification
+        # Layer: 2 inputs -> 1 output with bias
+        # y = w0*x0 + w1*x1 + b
+        # With w0=1, w1=2, b=0.5: y = 1*1 + 2*2 + 0.5 = 5.5
+        params = device.create_buffer(
+            data=np.array([1.0, 2.0, 0.5], dtype=np.float32),
+            usage=spy.BufferUsage.shader_resource | spy.BufferUsage.unordered_access,
+        )
+        result = device.create_buffer(
+            data=np.zeros(1, dtype=np.float32),
+            usage=spy.BufferUsage.unordered_access,
+        )
+
+        # Input: [1.0, 2.0]
+        # Dispatch with 1 group of 32 threads
+        forward_kernel.dispatch(
+            thread_count=[32, 1, 1],
+            params=params,
+            result=result,
+            x0=1.0,
+            x1=2.0,
+        )
+        # The kernel writes its single output to result[0].
+        output = result.to_numpy().view(np.float32)[0]
+        expected = 1.0 * 1.0 + 2.0 * 2.0 + 0.5  # = 5.5
+        assert np.isclose(output, expected, rtol=0.1), f"Output {output} != expected {expected}"
+
+    finally:
+        device.close()
+
+
+if __name__ == "__main__":  # lets this file be run directly instead of through pytest
+    pytest.main([__file__, "-v", "-s"])
diff --git a/slangpy/tests/slangpy_tests/test_neural_coopmat_frontend_training.slang b/slangpy/tests/slangpy_tests/test_neural_coopmat_frontend_training.slang
new file mode 100644
index 00000000..6907d758
--- /dev/null
+++ b/slangpy/tests/slangpy_tests/test_neural_coopmat_frontend_training.slang
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// SlangPy test for FFLayer with CoopMat (WaveTangledVector) backend.
+// This extends test_neural_frontend_training.slang to cover the cooperative matrix backend.
+//
+// Key requirements for CoopMat:
+// - Must use explicit compute shader entry points with [numthreads(32, 1, 1)]
+// - Types must be defined inside shader functions
+// - Requires cooperative matrix capabilities
+
+import neural;
+
+typealias Storage = StructuredBufferStorage<float>;
+
+// Shared memory configuration. NOTE(review): sized with TargetEnum.CUDA, yet the test runs on Vulkan - confirm.
+static const int InputSize = 2;
+static const int OutputSize = 1;
+static const int SubgroupSize = 32;
+static const int BatchSize = 32;
+
+typealias ShMemSize = SharedMemorySize<float, TargetEnum.CUDA, ExecutionMode.Training, SubgroupSize, BatchSize / SubgroupSize>;
+typealias ShMemSizeLayer = ShMemSize.OfLayer1<InputSize, OutputSize>;
+
+// Linear layer: Input=2 (x, x^2), Output=1 (y), with bias
+// Parameters: weights (1x2) + bias (1) = 3 params (hard-coded; the layer type is local to each kernel)
+static const int PARAM_COUNT = 3;
+
+// Returns PARAM_COUNT so the Python test can read it without dispatching a CoopMat kernel.
+int get_param_count()
+{
+    return PARAM_COUNT;
+}
+
+// Compute shader entry point for forward pass test: evaluates the layer on (x0, x1) -> result[0].
+// Dispatched with [1, 1, 1] groups, 32 threads per group
+[shader("compute")]
+[numthreads(32, 1, 1)]
+void compute_forward_pass(
+    uint3 tid : SV_DispatchThreadID,
+    uint gtid : SV_GroupIndex,
+    RWStructuredBuffer<float> params,
+    RWStructuredBuffer<float> result,
+    uniform float x0,
+    uniform float x1)
+{
+    // Define CoopMat types inside shader function
+    typealias ShMemPool = SharedMemoryPool<ShMemSizeLayer>;
+    typealias V2 = WaveTangledVector<float, ShMemPool, InputSize, SubgroupSize>;
+    typealias V1 = WaveTangledVector<float, ShMemPool, OutputSize, SubgroupSize>;
+    typealias Act = IdentityActivation<float>;
+    typealias LinearLayer = FFLayer<float, V2, V1, Storage, Act, true>;
+
+    let storage = Storage(params);
+    let layer = LinearLayer(0, 2);  // weights at 0, bias at 2
+    // Every thread builds the same two-element input from the uniforms.
+    float featsArr[InputSize] = { x0, x1 };
+    let feats = V2(featsArr);
+
+    let predV = layer.eval<Storage>(storage, NoParam(), feats);
+
+    // Only first thread writes result
+    if (gtid == 0)
+    {
+        result[0] = predV[0];
+    }
+}
+
+// Compute shader entry point for evaluating loss: mean squared error over `count` samples -> loss_out[0]
+[shader("compute")]
+[numthreads(32, 1, 1)]
+void compute_eval_loss(
+    uint3 tid : SV_DispatchThreadID,
+    uint gtid : SV_GroupIndex,
+    RWStructuredBuffer<float> params,
+    StructuredBuffer<float> xs,
+    StructuredBuffer<float> ys,
+    RWStructuredBuffer<float> loss_out,
+    uniform int count)
+{
+    typealias ShMemPool = SharedMemoryPool<ShMemSizeLayer>;
+    typealias V2 = WaveTangledVector<float, ShMemPool, InputSize, SubgroupSize>;
+    typealias V1 = WaveTangledVector<float, ShMemPool, OutputSize, SubgroupSize>;
+    typealias Act = IdentityActivation<float>;
+    typealias LinearLayer = FFLayer<float, V2, V1, Storage, Act, true>;
+
+    let storage = Storage(params);
+    let layer = LinearLayer(0, 2);
+    // Every thread runs the full sample loop; only thread 0 stores the result below.
+    float sum = 0.0;
+    for (int i = 0; i < count; i++)
+    {
+        let x = xs[i];
+        float featsArr[InputSize] = { x, x * x }; // features [x, x^2]
+        let feats = V2(featsArr);
+
+        let predV = layer.eval<Storage>(storage, NoParam(), feats);
+        let pred = predV[0];
+        let target = ys[i];
+
+        let err = pred - target;
+        sum += err * err;
+    }
+
+    if (gtid == 0)
+    {
+        loss_out[0] = sum / float(count); // NOTE(review): count == 0 divides by zero
+    }
+}
+
+// Compute shader entry point for one SGD step: analytic grads, `params` updated in place.
+[shader("compute")]
+[numthreads(32, 1, 1)]
+void compute_train_step(
+    uint3 tid : SV_DispatchThreadID,
+    uint gtid : SV_GroupIndex,
+    RWStructuredBuffer<float> params,
+    RWStructuredBuffer<float> grads,
+    StructuredBuffer<float> xs,
+    StructuredBuffer<float> ys,
+    RWStructuredBuffer<float> loss_out,
+    uniform int count,
+    uniform float learningRate)
+{
+    typealias ShMemPool = SharedMemoryPool<ShMemSizeLayer>;
+    typealias V2 = WaveTangledVector<float, ShMemPool, InputSize, SubgroupSize>;
+    typealias V1 = WaveTangledVector<float, ShMemPool, OutputSize, SubgroupSize>;
+    typealias Act = IdentityActivation<float>;
+    typealias LinearLayer = FFLayer<float, V2, V1, Storage, Act, true>;
+
+    let storage = Storage(params);
+    let layer = LinearLayer(0, 2); // weights at 0, bias at 2
+
+    // Clear grads (only first thread)
+    if (gtid == 0)
+    {
+        for (int i = 0; i < PARAM_COUNT; i++)
+            grads[i] = 0.0;
+    }
+    AllMemoryBarrierWithGroupSync(); // grads is device memory; a group-shared-only fence does not cover it
+
+    // Accumulate analytic grads for y = w0*x + w1*x^2 + b
+    float g0 = 0.0;
+    float g1 = 0.0;
+    float gb = 0.0;
+    float lossSum = 0.0;
+
+    for (int i = 0; i < count; i++)
+    {
+        let x = xs[i];
+        let t = ys[i];
+
+        float featsArr[InputSize] = { x, x * x }; // features [x, x^2]
+        let feats = V2(featsArr);
+
+        let predV = layer.eval<Storage>(storage, NoParam(), feats);
+        let pred = predV[0];
+
+        let err = pred - t;
+        lossSum += err * err;
+
+        // d(err^2)/dw0, d(err^2)/dw1 and d(err^2)/db for this sample
+        g0 += 2.0 * err * x;
+        g1 += 2.0 * err * (x * x);
+        gb += 2.0 * err;
+    }
+    // Only first thread writes and updates
+    if (gtid == 0)
+    {
+        let invN = 1.0 / float(count); // NOTE(review): count == 0 divides by zero
+        grads[0] = g0 * invN;
+        grads[1] = g1 * invN;
+        grads[2] = gb * invN;
+
+        // SGD update
+        for (int i = 0; i < PARAM_COUNT; i++)
+        {
+            params[i] = params[i] - learningRate * grads[i];
+        }
+
+        loss_out[0] = lossSum * invN; // mean squared error measured before this update
+    }
+}
diff --git a/slangpy/tests/slangpy_tests/test_neural_frontend_training.py b/slangpy/tests/slangpy_tests/test_neural_frontend_training.py
new file mode 100644
index 00000000..75b8675d
--- /dev/null
+++ b/slangpy/tests/slangpy_tests/test_neural_frontend_training.py
@@ -0,0 +1,108 @@
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""
+SlangPy integration test for neural module FFLayer (Option 2 design).
+
+Tests training convergence for a simple quadratic regression task using:
+- FFLayer with storage passed as parameter to eval<S>()
+- Manual gradient computation (analytic gradients)
+- Simple SGD optimization
+
+We fit a quadratic polynomial y = 2*x^2 - 0.5*x + 0.25 and verify convergence.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+import slangpy as spy
+from slangpy.core.calldata import SLANG_PATH
+from slangpy.testing import helpers
+
+
+def _get_device_with_native_neural(device_type: spy.DeviceType) -> spy.Device:
+    """Build a debug-layer device that can compile the experimental `neural` module."""
+    if helpers.should_skip_test_for_device(device_type):
+        pytest.skip(f"Device type {device_type.name} not selected for this test run")
+
+    # Include paths: this test directory first, then SLANG_PATH.
+    here = Path(__file__).resolve().parent
+
+    # The neural module comes pre-built from slang rather than being compiled from source;
+    # it is produced by the slang-neural-module target. Because it is an experimental
+    # module, experimental features have to be enabled.
+    settings = {
+        "include_paths": [here, SLANG_PATH],
+        "debug_info": spy.SlangDebugInfoLevel.standard,
+        "enable_experimental_features": True,
+    }
+
+    return spy.Device(
+        type=device_type,
+        enable_debug_layers=True,
+        compiler_options=spy.SlangCompilerOptions(settings),
+        label=f"uncached-slangpy-neural-frontend-{device_type.name}",
+    )
+
+
+@pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES)
+def test_neural_frontend_training_converges(device_type: spy.DeviceType) -> None:
+    """
+    Test that training converges for a simple quadratic regression task.
+
+    Uses FFLayer with Option 2 design (storage as parameter to eval<S>).
+    """
+    device = _get_device_with_native_neural(device_type)
+    try:
+        module = spy.Module(device.load_module("test_neural_frontend_training.slang"))
+
+        param_count = int(module.get_param_count())
+        assert param_count == 3
+
+        # Fit: y = 2*x^2 - 0.5*x + 0.25
+        sample_count = 256
+        xs = np.linspace(-1.0, 1.0, sample_count, dtype=np.float32)
+        ys = (2.0 * xs * xs - 0.5 * xs + 0.25).astype(np.float32)
+
+        xs_buf = device.create_buffer(data=xs, usage=spy.BufferUsage.shader_resource)
+        ys_buf = device.create_buffer(data=ys, usage=spy.BufferUsage.shader_resource)
+        # Small random initial parameters from a fixed seed, so the run is reproducible.
+        rng = np.random.default_rng(0)
+        params_init = (0.01 * rng.standard_normal(size=(param_count,))).astype(np.float32)
+
+        params = device.create_buffer(
+            data=params_init,
+            usage=spy.BufferUsage.shader_resource | spy.BufferUsage.unordered_access,
+        )
+        grads = device.create_buffer(
+            data=np.zeros((param_count,), dtype=np.float32),
+            usage=spy.BufferUsage.shader_resource | spy.BufferUsage.unordered_access,
+        )
+
+        initial_loss = float(module.eval_loss(params, xs_buf, ys_buf, sample_count))
+        # Run a fixed number of SGD steps; `train_step` updates `params` in place.
+        learning_rate = 0.1
+        steps = 200
+        for _ in range(steps):
+            module.train_step(params, grads, xs_buf, ys_buf, sample_count, learning_rate)
+
+        final_loss = float(module.eval_loss(params, xs_buf, ys_buf, sample_count))
+
+        # Convergence: should significantly reduce MSE and reach a small absolute error.
+        assert final_loss < initial_loss * 1e-2
+        assert final_loss < 1e-3
+
+        # Parameter packing: [w0, w1, bias] for y = w0*x + w1*x^2 + bias
+        learned = params.to_numpy().view(np.float32)[:param_count]
+        expected = np.array([-0.5, 2.0, 0.25], dtype=np.float32)
+        assert np.allclose(learned, expected, rtol=0.1, atol=0.1)
+
+    finally:
+        device.close()
+
+
+if __name__ == "__main__":  # lets this file be run directly instead of through pytest
+    pytest.main([__file__, "-v", "-s"])
diff --git a/slangpy/tests/slangpy_tests/test_neural_frontend_training.slang b/slangpy/tests/slangpy_tests/test_neural_frontend_training.slang
new file mode 100644
index 00000000..204d6477
--- /dev/null
+++ b/slangpy/tests/slangpy_tests/test_neural_frontend_training.slang
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// SlangPy test for FFLayer forward evaluation through eval<S>() (Option 2 design).
+// Gradients are computed analytically in train_step; autodiff is not exercised here.
+//
+// We fit a quadratic polynomial y = 2*x^2 - 0.5*x + 0.25 using a single linear layer over
+// features [x, x^2], and verify training converges.
+
+import slangpy;
+import neural;
+
+typealias Storage = StructuredBufferStorage<float>;
+typealias V2 = InlineVector<float, 2>; // layer input: features [x, x^2]
+typealias V1 = InlineVector<float, 1>; // layer output: predicted y
+typealias Act = IdentityActivation<float>;
+
+// Linear layer: Input=2 (x, x^2), Output=1 (y), with bias
+// Parameters: weights (1x2) + bias (1) = 3 params
+typealias LinearLayer = FFLayer<float, V2, V1, Storage, Act, true>;
+
+static const int PARAM_COUNT = LinearLayer.ParameterCount; // the Python test asserts this is 3
+
+int get_param_count() // lets the Python test read the layer's parameter count
+{
+    return PARAM_COUNT;
+}
+
+float eval_loss( // returns the mean squared error of the layer over the first `count` samples
+    RWStructuredBuffer<float> params,
+    StructuredBuffer<float> xs,
+    StructuredBuffer<float> ys,
+    int count)
+{
+    let storage = Storage(params);
+    // Option 2: only addresses in constructor
+    // weights at 0 (2 floats), bias at 2 (1 float)
+    let layer = LinearLayer(0, 2);
+
+    float sum = 0.0;
+    [MaxIters(1024)]
+    for (int i = 0; i < count; i++)
+    {
+        let x = xs[i];
+        // Features fed to the layer: [x, x^2]
+        float featsArr[2] = { x, x * x };
+        let feats = V2(featsArr);
+
+        // Option 2: storage passed to eval<S>()
+        let predV = layer.eval<Storage>(storage, NoParam(), feats);
+        let pred = predV[0];
+        let target = ys[i];
+
+        let err = pred - target;
+        sum += err * err;
+    }
+
+    return sum / float(count); // NOTE(review): count == 0 divides by zero
+}
+
+float train_step( // one SGD step; returns the mean squared error measured before the update
+    RWStructuredBuffer<float> params,
+    RWStructuredBuffer<float> grads,
+    no_diff StructuredBuffer<float> xs,
+    no_diff StructuredBuffer<float> ys,
+    no_diff int count,
+    no_diff float learningRate)
+{
+    let pStorage = Storage(params);
+    // `grads` is written directly below, so it needs no Storage wrapper.
+
+    // Clear gradient buffer
+    [MaxIters(1024)]
+    for (int i = 0; i < PARAM_COUNT; i++)
+        grads[i] = 0.0;
+
+    // Option 2: only addresses in constructor
+    let layer = LinearLayer(0, 2); // weights at 0, bias at 2
+
+    // Accumulate analytic grads for y = w0*x + w1*x^2 + b, loss = mean((y - t)^2)
+    float g0 = 0.0;
+    float g1 = 0.0;
+    float gb = 0.0;
+
+    float lossSum = 0.0;
+
+    [MaxIters(1024)]
+    for (int i = 0; i < count; i++)
+    {
+        let x = xs[i];
+        let t = ys[i];
+
+        float featsArr[2] = { x, x * x }; // features [x, x^2]
+        let feats = V2(featsArr);
+
+        // Option 2: storage passed to eval<S>()
+        let predV = layer.eval<Storage>(pStorage, NoParam(), feats);
+        let pred = predV[0];
+
+        let err = pred - t;
+        lossSum += err * err;
+        // d(err^2)/dw0, d(err^2)/dw1 and d(err^2)/db for this sample
+        g0 += 2.0 * err * x;
+        g1 += 2.0 * err * (x * x);
+        gb += 2.0 * err;
+    }
+
+    let invN = 1.0 / float(count); // NOTE(review): count == 0 divides by zero
+    grads[0] = g0 * invN;
+    grads[1] = g1 * invN;
+    grads[2] = gb * invN;
+
+    // Simple SGD update: params -= lr * grads
+    [MaxIters(1024)]
+    for (int i = 0; i < PARAM_COUNT; i++)
+    {
+        params[i] = params[i] - learningRate * grads[i];
+    }
+
+    return lossSum * invN;
+}
diff --git a/src/sgl/device/shader.cpp b/src/sgl/device/shader.cpp
index c733e45e..48aa28c1 100644
--- a/src/sgl/device/shader.cpp
+++ b/src/sgl/device/shader.cpp
@@ -333,6 +333,10 @@ void SlangSession::create_session(SlangSessionBuild& build)
     session_options.add(slang::CompilerOptionName::DumpIntermediates, options.dump_intermediates);
     session_options.add(slang::CompilerOptionName::DumpIntermediatePrefix, options.dump_intermediates_prefix);
 
+    // Enable experimental features (e.g., experimental modules like neural).
+    if (options.enable_experimental_features)
+        session_options.add(slang::CompilerOptionName::ExperimentalFeature, true);
+
     // Add hlsl_nvapi capability.
     session_options.add(
         slang::CompilerOptionName::Capability,
diff --git a/src/sgl/device/shader.h b/src/sgl/device/shader.h
index 538b58f8..1b6052e2 100644
--- a/src/sgl/device/shader.h
+++ b/src/sgl/device/shader.h
@@ -183,6 +183,9 @@ struct SlangCompilerOptions {
     /// Specifies a list of additional arguments to be passed to the downstream compiler.
     std::vector<std::string> downstream_args;
 
+    /// Enable experimental features (e.g., experimental modules like neural).
+    bool enable_experimental_features{false};
+
     /// When set will dump the intermediate source output.
     bool dump_intermediates{false};
 
diff --git a/src/slangpy_ext/device/shader.cpp b/src/slangpy_ext/device/shader.cpp
index 84bf61c6..39acec85 100644
--- a/src/slangpy_ext/device/shader.cpp
+++ b/src/slangpy_ext/device/shader.cpp
@@ -24,6 +24,7 @@ SGL_DICT_TO_DESC_FIELD(floating_point_mode, SlangFloatingPointMode)
 SGL_DICT_TO_DESC_FIELD(debug_info, SlangDebugInfoLevel)
 SGL_DICT_TO_DESC_FIELD(optimization, SlangOptimizationLevel)
 SGL_DICT_TO_DESC_FIELD_LIST(downstream_args, std::string)
+SGL_DICT_TO_DESC_FIELD(enable_experimental_features, bool)
 SGL_DICT_TO_DESC_FIELD(dump_intermediates, bool)
 SGL_DICT_TO_DESC_FIELD(dump_intermediates_prefix, std::string)
 SGL_DICT_TO_DESC_END()
@@ -135,6 +136,11 @@ SGL_PY_EXPORT(device_shader)
         .def_rw("debug_info", &SlangCompilerOptions::debug_info, D(SlangCompilerOptions, debug_info))
         .def_rw("optimization", &SlangCompilerOptions::optimization, D(SlangCompilerOptions, optimization))
         .def_rw("downstream_args", &SlangCompilerOptions::downstream_args, D(SlangCompilerOptions, downstream_args))
+        .def_rw(
+            "enable_experimental_features",
+            &SlangCompilerOptions::enable_experimental_features,
+            "Enable experimental features (e.g., experimental modules like neural)."
+        )
         .def_rw(
             "dump_intermediates",
             &SlangCompilerOptions::dump_intermediates,