From 69ccc1088a5d504dbc37925084dde13f6eac4f30 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Tue, 17 Sep 2024 21:54:18 -0700
Subject: [PATCH 01/10] initial version

---
 include/CL/opencl.hpp                  |   6 +
 samples/16_floatatomics/CMakeLists.txt |  10 ++
 samples/16_floatatomics/README.md      |  21 +++
 samples/16_floatatomics/main.cpp       | 205 +++++++++++++++++++++++++
 samples/CMakeLists.txt                 |   1 +
 5 files changed, 243 insertions(+)
 create mode 100644 samples/16_floatatomics/CMakeLists.txt
 create mode 100644 samples/16_floatatomics/README.md
 create mode 100644 samples/16_floatatomics/main.cpp
diff --git a/include/CL/opencl.hpp b/include/CL/opencl.hpp
index 88351de..a134a2b 100644
--- a/include/CL/opencl.hpp
+++ b/include/CL/opencl.hpp
@@ -1799,6 +1799,12 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_LO
 CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_KERNEL_CLOCK_CAPABILITIES_KHR, cl_device_kernel_clock_capabilities_khr)
 #endif /* cl_khr_kernel_clock */
 
+#if defined(cl_ext_float_atomics)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT, cl_device_fp_atomic_capabilities_ext)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_DOUBLE_FP_ATOMIC_CAPABILITIES_EXT, cl_device_fp_atomic_capabilities_ext)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT, cl_device_fp_atomic_capabilities_ext)
+#endif /* cl_ext_float_atomics */
+
 #if defined(cl_intel_command_queue_families)
 CL_HPP_PARAM_NAME_CL_INTEL_COMMAND_QUEUE_FAMILIES_(CL_HPP_DECLARE_PARAM_TRAITS_)
 #endif // cl_intel_command_queue_families
diff --git a/samples/16_floatatomics/CMakeLists.txt b/samples/16_floatatomics/CMakeLists.txt
new file mode 100644
index 0000000..c60536d
--- /dev/null
+++ b/samples/16_floatatomics/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Copyright (c) 2019-2024 Ben Ashbaugh
+#
+# SPDX-License-Identifier: MIT
+
+add_opencl_sample(
+    TEST
+    NUMBER 16
+    TARGET floatatomics
+    VERSION 120
+    SOURCES main.cpp)
diff --git a/samples/16_floatatomics/README.md b/samples/16_floatatomics/README.md
new file mode 100644
index 0000000..9c57aad
--- /dev/null
+++ b/samples/16_floatatomics/README.md
@@ -0,0 +1,21 @@
+# Floating-point Atomic Adds
+
+## Sample Purpose
+
+TODO
+
+Inspired by: https://pipinspace.github.io/blog/atomic-float-addition-in-opencl.html
+
+## Key APIs and Concepts
+
+TODO
+
+## Command Line Options
+
+| Option | Default Value | Description |
+|:--|:-:|:--|
+| `-d <index>` | 0 | Specify the index of the OpenCL device in the platform to execute on the sample on.
+| `-p <index>` | 0 | Specify the index of the OpenCL platform to execute the sample on.
+| `-i <number>` | 16 | Specify the number of iterations to execute.
+| `--gwx <number>` | 1024 | Specify the global work size to execute, which is also the number of floating-point atomics to perform.
+| `-e` | N/A | Unconditionally use the emulated floating-point atomic add.
diff --git a/samples/16_floatatomics/main.cpp b/samples/16_floatatomics/main.cpp
new file mode 100644
index 0000000..560844a
--- /dev/null
+++ b/samples/16_floatatomics/main.cpp
@@ -0,0 +1,205 @@
+/*
+// Copyright (c) 2019-2024 Ben Ashbaugh
+//
+// SPDX-License-Identifier: MIT
+*/
+
+#include <popl/popl.hpp>
+
+#include <CL/opencl.hpp>
+
+#include <chrono>
+#include <cinttypes>
+
+#include "util.hpp"
+
+static const char kernelString[] = R"CLC(
+inline float atomic_add_f(volatile global float* addr, float val)
+{
+    #if defined(__opencl_c_ext_fp32_global_atomic_add) && !defined(EMULATE)
+        //#pragma message("using cl_ext_float_atomics")
+        return atomic_fetch_add_explicit((volatile global atomic_float*)addr, val, memory_order_relaxed);
+    #elif defined(cl_nv_pragma_unroll) && !defined(EMULATE)
+        //#pragma message("using PTX atomics")
+        float ret; asm volatile("atom.global.add.f32 %0,[%1],%2;":"=f"(ret):"l"(addr),"f"(val):"memory");
+        return ret;
+    #else // fallback, see: https://forums.developer.nvidia.com/t/atomicadd-float-float-atomicmul-float-float/14639/7
+        //#pragma message("using emulated float atomics")
+        float ret = atomic_xchg(addr, 0.0f);
+        float old = ret + val;
+        while((old = atomic_xchg(addr, old)) != 0.0f) {
+            old = atomic_xchg(addr, 0.0f) + old;
+        }
+        return ret;
+    #endif
+}
+
+kernel void FloatAtomicTest(global float* dst)
+{
+    atomic_add_f(dst, 1.0f);
+}
+)CLC";
+
+static void PrintFloatAtomicCapabilities(
+    cl_device_fp_atomic_capabilities_ext caps )
+{
+    if (caps & CL_DEVICE_GLOBAL_FP_ATOMIC_LOAD_STORE_EXT ) printf("\t\tCL_DEVICE_GLOBAL_FP_ATOMIC_LOAD_STORE_EXT\n");
+    if (caps & CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT        ) printf("\t\tCL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT\n");
+    if (caps & CL_DEVICE_GLOBAL_FP_ATOMIC_MIN_MAX_EXT    ) printf("\t\tCL_DEVICE_GLOBAL_FP_ATOMIC_MIN_MAX_EXT\n");
+    if (caps & CL_DEVICE_LOCAL_FP_ATOMIC_LOAD_STORE_EXT  ) printf("\t\tCL_DEVICE_LOCAL_FP_ATOMIC_LOAD_STORE_EXT\n");
+    if (caps & CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT         ) printf("\t\tCL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT\n");
+    if (caps & CL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT     ) printf("\t\tCL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT\n");
+
+    cl_device_command_buffer_capabilities_khr extra = caps & ~(
+        CL_DEVICE_GLOBAL_FP_ATOMIC_LOAD_STORE_EXT |
+        CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT |
+        CL_DEVICE_GLOBAL_FP_ATOMIC_MIN_MAX_EXT |
+        CL_DEVICE_LOCAL_FP_ATOMIC_LOAD_STORE_EXT |
+        CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT |
+        CL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT );
+    if (extra) {
+        printf("\t\t(Unknown capability: %016" PRIx64 ")\n", extra);
+    }
+}
+
+int main(
+    int argc,
+    char** argv )
+{
+    int platformIndex = 0;
+    int deviceIndex = 0;
+
+    size_t iterations = 16;
+    size_t gwx = 1024 * 1024;
+
+    bool emulate = false;
+
+    {
+        popl::OptionParser op("Supported Options");
+        op.add<popl::Value<int>>("p", "platform", "Platform Index", platformIndex, &platformIndex);
+        op.add<popl::Value<int>>("d", "device", "Device Index", deviceIndex, &deviceIndex);
+        op.add<popl::Value<size_t>>("i", "iterations", "Iterations", iterations, &iterations);
+        op.add<popl::Value<size_t>>("", "gwx", "Global Work Size X AKA Number of Atomics", gwx, &gwx);
+        op.add<popl::Switch>("e", "emulate", "Unconditionally Emulate Float Atomics", &emulate);
+
+        bool printUsage = false;
+        try {
+            op.parse(argc, argv);
+        } catch (std::exception& e) {
+            fprintf(stderr, "Error: %s\n\n", e.what());
+            printUsage = true;
+        }
+
+        if (printUsage || !op.unknown_options().empty() || !op.non_option_args().empty()) {
+            fprintf(stderr,
+                "Usage: floatatomics [options]\n"
+                "%s", op.help().c_str());
+            return -1;
+        }
+    }
+
+    std::vector<cl::Platform> platforms;
+    cl::Platform::get(&platforms);
+
+    printf("Running on platform: %s\n",
+        platforms[platformIndex].getInfo<CL_PLATFORM_NAME>().c_str() );
+
+    std::vector<cl::Device> devices;
+    platforms[platformIndex].getDevices(CL_DEVICE_TYPE_ALL, &devices);
+
+    printf("Running on device: %s\n",
+        devices[deviceIndex].getInfo<CL_DEVICE_NAME>().c_str() );
+
+    if (checkDeviceForExtension(devices[deviceIndex], CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME)) {
+        printf("Device supports " CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME ".\n");
+
+        cl_device_fp_atomic_capabilities_ext spcaps =
+            devices[deviceIndex].getInfo<CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT>();
+        printf("CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT:\n");
+        PrintFloatAtomicCapabilities(spcaps);
+
+        cl_device_fp_atomic_capabilities_ext dpcaps =
+            devices[deviceIndex].getInfo<CL_DEVICE_DOUBLE_FP_ATOMIC_CAPABILITIES_EXT>();
+        printf("CL_DEVICE_DOUBLE_FP_ATOMIC_CAPABILITIES_EXT:\n");
+        PrintFloatAtomicCapabilities(dpcaps);
+
+        cl_device_fp_atomic_capabilities_ext hpcaps =
+            devices[deviceIndex].getInfo<CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT>();
+        printf("CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT:\n");
+        PrintFloatAtomicCapabilities(hpcaps);
+
+        if (spcaps & CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT == 0) {
+            printf("Device does not support fp32 atomic add.\n");
+        }
+    } else {
+        printf("Device does not support " CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME ".\n");
+    }
+
+    cl::Context context{devices[deviceIndex]};
+    cl::CommandQueue commandQueue{context, devices[deviceIndex]};
+
+    cl::Program program{ context, kernelString };
+    std::string buildOptions = "-cl-std=CL3.0";
+
+    if (emulate) {
+        printf("Forcing emulation.\n");
+        buildOptions += " -DEMULATE";
+    }
+
+    program.build(buildOptions);
+    cl::Kernel kernel = cl::Kernel{ program, "FloatAtomicTest" };
+
+    cl::Buffer deviceMemDst = cl::Buffer{
+        context,
+        CL_MEM_READ_WRITE,
+        sizeof(cl_float) };
+
+    // execution
+    {
+        kernel.setArg(0, deviceMemDst);
+
+        // Ensure the queue is empty and no processing is happening
+        // on the device before starting the timer.
+        commandQueue.finish();
+
+        auto start = std::chrono::system_clock::now();
+        for( size_t i = 0; i < iterations; i++ )
+        {
+            cl_float zero = 0.0f;
+            commandQueue.enqueueFillBuffer(
+                deviceMemDst,
+                zero,
+                0,
+                sizeof(zero));
+            commandQueue.enqueueNDRangeKernel(
+                kernel,
+                cl::NullRange,
+                cl::NDRange{gwx});
+        }
+
+        // Ensure all processing is complete before stopping the timer.
+        commandQueue.finish();
+
+        auto end = std::chrono::system_clock::now();
+        std::chrono::duration<float> elapsed_seconds = end - start;
+        printf("Finished in %f seconds\n", elapsed_seconds.count());
+    }
+
+    // validation
+    {
+        cl_float result = 0.0f;
+        commandQueue.enqueueReadBuffer(
+            deviceMemDst,
+            CL_TRUE,
+            0,
+            sizeof(result),
+            &result);
+        if (result != (float)gwx) {
+            printf("Error: expected %f, got %f!\n", (float)gwx, result);
+        } else {
+            printf("Success.\n");
+        }
+    }
+
+    return 0;
+}
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index f0439d7..a55e2f5 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -75,6 +75,7 @@ add_subdirectory( 05_spirvkernelfromfile )
 add_subdirectory( 06_ndrangekernelfromfile )
 
 add_subdirectory( 10_queueexperiments )
+add_subdirectory( 16_floatatomics )
 
 set(BUILD_EXTENSION_SAMPLES TRUE)
 if(NOT TARGET OpenCLExt)

From e0b855b4116e7a38eaa5e09ae86e9b5ce43faa62 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Tue, 17 Sep 2024 22:33:22 -0700
Subject: [PATCH 02/10] added documentation added AMD intrinsic variant

---
 samples/16_floatatomics/main.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/samples/16_floatatomics/main.cpp b/samples/16_floatatomics/main.cpp
index 560844a..440f573 100644
--- a/samples/16_floatatomics/main.cpp
+++ b/samples/16_floatatomics/main.cpp
@@ -23,6 +23,9 @@ inline float atomic_add_f(volatile global float* addr, float val)
         //#pragma message("using PTX atomics")
         float ret; asm volatile("atom.global.add.f32 %0,[%1],%2;":"=f"(ret):"l"(addr),"f"(val):"memory");
         return ret;
+    #elif __has_builtin(__builtin_amdgcn_global_atomic_fadd_f32) && !defined(EMULATE)
+        //#pragma message("using AMD atomics")
+        return __builtin_amdgcn_global_atomic_fadd_f32(addr, val);
     #else // fallback, see: https://forums.developer.nvidia.com/t/atomicadd-float-float-atomicmul-float-float/14639/7
         //#pragma message("using emulated float atomics")
         float ret = atomic_xchg(addr, 0.0f);
@@ -139,8 +142,10 @@ int main(
     cl::CommandQueue commandQueue{context, devices[deviceIndex]};
 
     cl::Program program{ context, kernelString };
-    std::string buildOptions = "-cl-std=CL3.0";
 
+    // On some implementations, the feature test macros for float atomics are
+    // only defined when compiling for OpenCL C 3.0 or newer.
+    std::string buildOptions = "-cl-std=CL3.0";
     if (emulate) {
         printf("Forcing emulation.\n");
         buildOptions += " -DEMULATE";

From 5f38f9ada9cf784a7d5c65f822bbb13a4ff0eafc Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Wed, 18 Sep 2024 11:12:12 -0700
Subject: [PATCH 03/10] more updates and intermediate results validation

---
 samples/16_floatatomics/main.cpp | 109 +++++++++++++++++++++----------
 1 file changed, 74 insertions(+), 35 deletions(-)

diff --git a/samples/16_floatatomics/main.cpp b/samples/16_floatatomics/main.cpp
index 440f573..5f66f4b 100644
--- a/samples/16_floatatomics/main.cpp
+++ b/samples/16_floatatomics/main.cpp
@@ -8,13 +8,15 @@
 
 #include <CL/opencl.hpp>
 
+#include <algorithm>
 #include <chrono>
 #include <cinttypes>
+#include <vector>
 
 #include "util.hpp"
 
 static const char kernelString[] = R"CLC(
-inline float atomic_add_f(volatile global float* addr, float val)
+float atomic_add_f(volatile global float* addr, float val)
 {
     #if defined(__opencl_c_ext_fp32_global_atomic_add) && !defined(EMULATE)
         //#pragma message("using cl_ext_float_atomics")
@@ -28,18 +30,20 @@ inline float atomic_add_f(volatile global float* addr, float val)
         return __builtin_amdgcn_global_atomic_fadd_f32(addr, val);
     #else // fallback, see: https://forums.developer.nvidia.com/t/atomicadd-float-float-atomicmul-float-float/14639/7
         //#pragma message("using emulated float atomics")
-        float ret = atomic_xchg(addr, 0.0f);
-        float old = ret + val;
-        while((old = atomic_xchg(addr, old)) != 0.0f) {
-            old = atomic_xchg(addr, 0.0f) + old;
-        }
-        return ret;
+        float old = val; while((old=atomic_xchg(addr, atomic_xchg(addr, 0.0f)+old))!=0.0f);
+        // Note: this emulated version cannot reliably return the previous value!
+        // This makes it unsuitable for general-purpose use, but it is sufficient
+        // for some cases, such as reductions.
+        // A more reliable version would use a compare-exchange loop, though it
+        // would be much slower.
+        return 0.0f;
     #endif
 }
 
-kernel void FloatAtomicTest(global float* dst)
+kernel void FloatAtomicTest(global float* dst, global float* results)
 {
-    atomic_add_f(dst, 1.0f);
+    int index = get_global_id(0);
+    results[index] = atomic_add_f(dst, 1.0f);
 }
 )CLC";
 
@@ -73,9 +77,10 @@ int main(
     int deviceIndex = 0;
 
     size_t iterations = 16;
-    size_t gwx = 1024 * 1024;
+    size_t gwx = 64 * 1024;
 
     bool emulate = false;
+    bool check = false;
 
     {
         popl::OptionParser op("Supported Options");
@@ -84,6 +89,7 @@ int main(
         op.add<popl::Value<size_t>>("i", "iterations", "Iterations", iterations, &iterations);
         op.add<popl::Value<size_t>>("", "gwx", "Global Work Size X AKA Number of Atomics", gwx, &gwx);
         op.add<popl::Switch>("e", "emulate", "Unconditionally Emulate Float Atomics", &emulate);
+        op.add<popl::Switch>("c", "check", "Check Intermediate Results", &check);
 
         bool printUsage = false;
         try {
@@ -113,7 +119,15 @@ int main(
     printf("Running on device: %s\n",
         devices[deviceIndex].getInfo<CL_DEVICE_NAME>().c_str() );
 
-    if (checkDeviceForExtension(devices[deviceIndex], CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME)) {
+    // On some implementations, the feature test macros for float atomics are
+    // only defined when compiling for OpenCL C 3.0 or newer.
+    std::string buildOptions = "-cl-std=CL3.0";
+    if (emulate) {
+        printf("Forcing emulation.\n");
+        buildOptions += " -DEMULATE";
+    } else if (!checkDeviceForExtension(devices[deviceIndex], CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME)) {
+        printf("Device does not support " CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME ".\n");
+    } else {
         printf("Device supports " CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME ".\n");
 
         cl_device_fp_atomic_capabilities_ext spcaps =
@@ -130,41 +144,29 @@ int main(
             devices[deviceIndex].getInfo<CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT>();
         printf("CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT:\n");
         PrintFloatAtomicCapabilities(hpcaps);
-
-        if (spcaps & CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT == 0) {
-            printf("Device does not support fp32 atomic add.\n");
-        }
-    } else {
-        printf("Device does not support " CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME ".\n");
     }
 
     cl::Context context{devices[deviceIndex]};
     cl::CommandQueue commandQueue{context, devices[deviceIndex]};
 
     cl::Program program{ context, kernelString };
-
-    // On some implementations, the feature test macros for float atomics are
-    // only defined when compiling for OpenCL C 3.0 or newer.
-    std::string buildOptions = "-cl-std=CL3.0";
-    if (emulate) {
-        printf("Forcing emulation.\n");
-        buildOptions += " -DEMULATE";
-    }
-
     program.build(buildOptions);
     cl::Kernel kernel = cl::Kernel{ program, "FloatAtomicTest" };
 
-    cl::Buffer deviceMemDst = cl::Buffer{
+    cl::Buffer dst = cl::Buffer{
         context,
         CL_MEM_READ_WRITE,
         sizeof(cl_float) };
+    cl::Buffer intermediates = cl::Buffer{
+        context,
+        CL_MEM_READ_WRITE,
+        gwx * sizeof(cl_float) };
 
     // execution
     {
-        kernel.setArg(0, deviceMemDst);
+        kernel.setArg(0, dst);
+        kernel.setArg(1, intermediates);
 
-        // Ensure the queue is empty and no processing is happening
-        // on the device before starting the timer.
         commandQueue.finish();
 
         auto start = std::chrono::system_clock::now();
@@ -172,7 +174,7 @@ int main(
         {
             cl_float zero = 0.0f;
             commandQueue.enqueueFillBuffer(
-                deviceMemDst,
+                dst,
                 zero,
                 0,
                 sizeof(zero));
@@ -182,7 +184,6 @@ int main(
                 cl::NDRange{gwx});
         }
 
-        // Ensure all processing is complete before stopping the timer.
         commandQueue.finish();
 
         auto end = std::chrono::system_clock::now();
@@ -190,11 +191,11 @@ int main(
         printf("Finished in %f seconds\n", elapsed_seconds.count());
     }
 
-    // validation
+    // basic validation
     {
         cl_float result = 0.0f;
         commandQueue.enqueueReadBuffer(
-            deviceMemDst,
+            dst,
             CL_TRUE,
             0,
             sizeof(result),
@@ -202,7 +203,45 @@ int main(
         if (result != (float)gwx) {
             printf("Error: expected %f, got %f!\n", (float)gwx, result);
         } else {
-            printf("Success.\n");
+            printf("Basic Validation: Success.\n");
+        }
+    }
+
+    // intermediate results validation
+    if (check) {
+        if (emulate) {
+            printf("Skipping The emulated float atomic add does not support intermediate results.\n");
+        } else {
+            std::vector<cl_float> test(gwx);
+            commandQueue.enqueueReadBuffer(
+                intermediates,
+                CL_TRUE,
+                0,
+                gwx * sizeof(cl_float),
+                test.data());
+
+            std::sort(test.begin(), test.end());
+
+            size_t mismatches = 0;
+            for (size_t i = 0; i < gwx; i++) {
+                if (i == 0 && !(test[i] == 0.0f)) {
+                    if (mismatches < 16) {
+                        printf("Error at index %zu: expected %f, got %f!\n", i, 0.0f, test[i]);
+                    }
+                    mismatches++;
+                } else if (i > 0 && !(test[i] > test[i-1])) {
+                    if (mismatches < 16) {
+                        printf("Error at index %zu: expected %f > %f!\n", i, test[i], test[i-1]);
+                    }
+                    mismatches++;
+                }
+            }
+
+            if (mismatches) {
+                printf("Intermediate Results Validation: Found %zu mismatches / %zu values!!!\n", mismatches, gwx);
+            } else {
+                printf("Intermediate Results Validation: Success.\n");
+            }
         }
     }
 

From bd73dc43484ce918d7be547e0ac8da006d37e670 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Thu, 26 Sep 2024 15:45:05 -0700
Subject: [PATCH 04/10] temporarily enable intermediate result checks for
 emulated atomics

---
 samples/16_floatatomics/main.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/samples/16_floatatomics/main.cpp b/samples/16_floatatomics/main.cpp
index 5f66f4b..574fad6 100644
--- a/samples/16_floatatomics/main.cpp
+++ b/samples/16_floatatomics/main.cpp
@@ -29,6 +29,7 @@ float atomic_add_f(volatile global float* addr, float val)
         //#pragma message("using AMD atomics")
         return __builtin_amdgcn_global_atomic_fadd_f32(addr, val);
     #else // fallback, see: https://forums.developer.nvidia.com/t/atomicadd-float-float-atomicmul-float-float/14639/7
+        #if 0
         //#pragma message("using emulated float atomics")
         float old = val; while((old=atomic_xchg(addr, atomic_xchg(addr, 0.0f)+old))!=0.0f);
         // Note: this emulated version cannot reliably return the previous value!
@@ -37,6 +38,12 @@ float atomic_add_f(volatile global float* addr, float val)
         // A more reliable version would use a compare-exchange loop, though it
         // would be much slower.
         return 0.0f;
+        #else
+        float old = val;
+        float ret = 0.0f;
+        while ((old = atomic_xchg(addr, ret = atomic_xchg(addr, 0.0f) + old)) != 0.0f);
+        return ret;
+        #endif
     #endif
 }
 
@@ -209,7 +216,7 @@ int main(
 
     // intermediate results validation
     if (check) {
-        if (emulate) {
+        if (false && emulate) {
             printf("Skipping The emulated float atomic add does not support intermediate results.\n");
         } else {
             std::vector<cl_float> test(gwx);

From 9860956026493aa6e5c59b1776400e8aa89fde7a Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Fri, 27 Sep 2024 17:02:54 -0700
Subject: [PATCH 05/10] add documentation, clean up

---
 samples/16_floatatomics/README.md | 23 +++++++++++++++++++----
 samples/16_floatatomics/main.cpp  | 12 ++++++++----
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/samples/16_floatatomics/README.md b/samples/16_floatatomics/README.md
index 9c57aad..35d59cc 100644
--- a/samples/16_floatatomics/README.md
+++ b/samples/16_floatatomics/README.md
@@ -2,13 +2,27 @@
 
 ## Sample Purpose
 
-TODO
+This is an advanced sample that demonstrates how to do atomic floating-point atomic addition in a kernel.
+The most standard way to perform floating-point atomic addition uses the [cl_ext_float_atomics](https://registry.khronos.org/OpenCL/extensions/ext/cl_ext_float_atomics.html) extension.
+This extension adds device queries and built-in functions to optionally support floating-point atomic add, min, max, load, and store on 16-bit, 32-bit, and 64-bit floating-point types.
+When the `cl_ext_float_atomics` extenison is supported, and 32-bit floating point atomic adds are supported, this sample will use the built-in functions added by this extension.
 
-Inspired by: https://pipinspace.github.io/blog/atomic-float-addition-in-opencl.html
+This sample also fallback implentations when the `cl_ext_float_atomics` extension is not supported:
+
+* For NVIDIA GPUs, this sample includes a fallback that does the floating-point atomic add using inline PTX assembly language.
+* For AMD GPUs, this sample includes a fallback that calls a compiler intrinsic to do the floating-point atomic add.
+* For other devices, this sample includes a fallback that emulates the floating-point atomic add using 32-bit `atomic_xchg` functions.
+This fallback implementation cannot reliably return the "old" value that was in memory before performing the atomic add, so it is unsuitable for all usages, but it does work for some important uses-cases, such as reductions.
+
+This sample was inspired by the blog post: https://pipinspace.github.io/blog/atomic-float-addition-in-opencl.html
 
 ## Key APIs and Concepts
 
-TODO
+```c
+CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT
+__opencl_c_ext_fp32_global_atomic_add
+atomic_fetch_add_explicit
+```
 
 ## Command Line Options
 
@@ -17,5 +31,6 @@ TODO
 | `-d <index>` | 0 | Specify the index of the OpenCL device in the platform to execute on the sample on.
 | `-p <index>` | 0 | Specify the index of the OpenCL platform to execute the sample on.
 | `-i <number>` | 16 | Specify the number of iterations to execute.
-| `--gwx <number>` | 1024 | Specify the global work size to execute, which is also the number of floating-point atomics to perform.
+| `--gwx <number>` | 16384 | Specify the global work size to execute, which is also the number of floating-point atomics to perform.
 | `-e` | N/A | Unconditionally use the emulated floating-point atomic add.
+| `-e` | N/A | Check intermediate results for correctness, requires non-emulated atomics, requires adding a positive value.
diff --git a/samples/16_floatatomics/main.cpp b/samples/16_floatatomics/main.cpp
index 574fad6..a50598a 100644
--- a/samples/16_floatatomics/main.cpp
+++ b/samples/16_floatatomics/main.cpp
@@ -177,8 +177,7 @@ int main(
         commandQueue.finish();
 
         auto start = std::chrono::system_clock::now();
-        for( size_t i = 0; i < iterations; i++ )
-        {
+        for (size_t i = 0; i < iterations; i++) {
             cl_float zero = 0.0f;
             commandQueue.enqueueFillBuffer(
                 dst,
@@ -200,6 +199,11 @@ int main(
 
     // basic validation
     {
+        cl_float check = 0.0f;
+        for (size_t i = 0; i < gwx; i++) {
+            check += 1.0f;
+        }
+
         cl_float result = 0.0f;
         commandQueue.enqueueReadBuffer(
             dst,
@@ -207,8 +211,8 @@ int main(
             0,
             sizeof(result),
             &result);
-        if (result != (float)gwx) {
-            printf("Error: expected %f, got %f!\n", (float)gwx, result);
+        if (result != check) {
+            printf("Error: expected %f, got %f!\n", check, result);
         } else {
             printf("Basic Validation: Success.\n");
         }

From 83aeec873dc582ca51b3a6f4f6876b814cda6c27 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Mon, 30 Sep 2024 23:15:59 -0700
Subject: [PATCH 06/10] add a slower emulated fallback that uses a
 compare-and-swap loop

Might try to shift to the older OpenCL 1.x atomics to improve
portability.
---
 samples/16_floatatomics/main.cpp | 35 ++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/samples/16_floatatomics/main.cpp b/samples/16_floatatomics/main.cpp
index a50598a..1a424a3 100644
--- a/samples/16_floatatomics/main.cpp
+++ b/samples/16_floatatomics/main.cpp
@@ -28,22 +28,26 @@ float atomic_add_f(volatile global float* addr, float val)
     #elif __has_builtin(__builtin_amdgcn_global_atomic_fadd_f32) && !defined(EMULATE)
         //#pragma message("using AMD atomics")
         return __builtin_amdgcn_global_atomic_fadd_f32(addr, val);
-    #else // fallback, see: https://forums.developer.nvidia.com/t/atomicadd-float-float-atomicmul-float-float/14639/7
-        #if 0
+    #elif !defined(SLOW_EMULATE)
+        // fallback, see: https://forums.developer.nvidia.com/t/atomicadd-float-float-atomicmul-float-float/14639/7
         //#pragma message("using emulated float atomics")
         float old = val; while((old=atomic_xchg(addr, atomic_xchg(addr, 0.0f)+old))!=0.0f);
         // Note: this emulated version cannot reliably return the previous value!
         // This makes it unsuitable for general-purpose use, but it is sufficient
         // for some cases, such as reductions.
-        // A more reliable version would use a compare-exchange loop, though it
-        // would be much slower.
         return 0.0f;
-        #else
-        float old = val;
-        float ret = 0.0f;
-        while ((old = atomic_xchg(addr, ret = atomic_xchg(addr, 0.0f) + old)) != 0.0f);
-        return ret;
-        #endif
+    #else
+        // This is the traditional fallback that uses a compare and exchange loop.
+        // It is much slower, but it supports returning the previous value.
+        //#pragma message("using slow emulated float atomics")
+        volatile global atomic_float* faddr = (volatile global atomic_float*)addr;
+        float old;
+        float new;
+        do {
+            old = atomic_load_explicit(faddr, memory_order_relaxed);
+            new = old + val;
+        } while (!atomic_compare_exchange_strong_explicit(faddr, &old, new, memory_order_relaxed, memory_order_relaxed));
+        return old;
     #endif
 }
 
@@ -87,6 +91,7 @@ int main(
     size_t gwx = 64 * 1024;
 
     bool emulate = false;
+    bool slowEmulate = false;
     bool check = false;
 
     {
@@ -96,6 +101,7 @@ int main(
         op.add<popl::Value<size_t>>("i", "iterations", "Iterations", iterations, &iterations);
         op.add<popl::Value<size_t>>("", "gwx", "Global Work Size X AKA Number of Atomics", gwx, &gwx);
         op.add<popl::Switch>("e", "emulate", "Unconditionally Emulate Float Atomics", &emulate);
+        op.add<popl::Switch>("s", "slow-emulate", "Unconditionally Emulate Float Atomics with Return Support", &slowEmulate);
         op.add<popl::Switch>("c", "check", "Check Intermediate Results", &check);
 
         bool printUsage = false;
@@ -129,7 +135,10 @@ int main(
     // On some implementations, the feature test macros for float atomics are
     // only defined when compiling for OpenCL C 3.0 or newer.
     std::string buildOptions = "-cl-std=CL3.0";
-    if (emulate) {
+    if (slowEmulate) {
+        printf("Forcing slow and safe emulation.\n");
+        buildOptions += " -DEMULATE -DSLOW_EMULATE";
+    } else if (emulate) {
         printf("Forcing emulation.\n");
         buildOptions += " -DEMULATE";
     } else if (!checkDeviceForExtension(devices[deviceIndex], CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME)) {
@@ -220,8 +229,8 @@ int main(
 
     // intermediate results validation
     if (check) {
-        if (false && emulate) {
-            printf("Skipping The emulated float atomic add does not support intermediate results.\n");
+        if (emulate && !slowEmulate) {
+            printf("The emulated float atomic add does not support intermediate results.\n");
         } else {
             std::vector<cl_float> test(gwx);
             commandQueue.enqueueReadBuffer(

From 656647f7497340425a335ee61d8824d4fc2b93f4 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Tue, 1 Oct 2024 12:04:31 -0700
Subject: [PATCH 07/10] switch to the OpenCL 1.x atomics for more portability

---
 samples/16_floatatomics/main.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/samples/16_floatatomics/main.cpp b/samples/16_floatatomics/main.cpp
index 1a424a3..2ae52f3 100644
--- a/samples/16_floatatomics/main.cpp
+++ b/samples/16_floatatomics/main.cpp
@@ -40,14 +40,15 @@ float atomic_add_f(volatile global float* addr, float val)
         // This is the traditional fallback that uses a compare and exchange loop.
         // It is much slower, but it supports returning the previous value.
         //#pragma message("using slow emulated float atomics")
-        volatile global atomic_float* faddr = (volatile global atomic_float*)addr;
-        float old;
-        float new;
+        volatile global int* iaddr = (volatile global int*)addr;
+        int old;
+        int check;
         do {
-            old = atomic_load_explicit(faddr, memory_order_relaxed);
-            new = old + val;
-        } while (!atomic_compare_exchange_strong_explicit(faddr, &old, new, memory_order_relaxed, memory_order_relaxed));
-        return old;
+            old = atomic_or(iaddr, 0);  // emulated atomic load
+            int new = as_int(as_float(old) + val);
+            check = atomic_cmpxchg(iaddr, old, new);
+        } while (check != old);
+        return as_float(old);
     #endif
 }
 

From d1207e301632ef9e16dd9e84a7e22abc8cce1dde Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Wed, 2 Oct 2024 21:42:12 -0700
Subject: [PATCH 08/10] final tidy up

---
 samples/16_floatatomics/README.md | 24 +++++++++++++++---------
 samples/16_floatatomics/main.cpp  |  2 +-
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/samples/16_floatatomics/README.md b/samples/16_floatatomics/README.md
index 35d59cc..c3cddd5 100644
--- a/samples/16_floatatomics/README.md
+++ b/samples/16_floatatomics/README.md
@@ -2,26 +2,31 @@
 
 ## Sample Purpose
 
-This is an advanced sample that demonstrates how to do atomic floating-point atomic addition in a kernel.
-The most standard way to perform floating-point atomic addition uses the [cl_ext_float_atomics](https://registry.khronos.org/OpenCL/extensions/ext/cl_ext_float_atomics.html) extension.
+This is an advanced sample that demonstrates how to do atomic floating-point addition in a kernel.
+The most standard way to perform atomic floating-point addition uses the [cl_ext_float_atomics](https://registry.khronos.org/OpenCL/extensions/ext/cl_ext_float_atomics.html) extension.
 This extension adds device queries and built-in functions to optionally support floating-point atomic add, min, max, load, and store on 16-bit, 32-bit, and 64-bit floating-point types.
-When the `cl_ext_float_atomics` extenison is supported, and 32-bit floating point atomic adds are supported, this sample will use the built-in functions added by this extension.
+When the `cl_ext_float_atomics` extension is supported, and 32-bit floating point atomic adds are supported, this sample will use the built-in functions added by this extension.
 
-This sample also fallback implentations when the `cl_ext_float_atomics` extension is not supported:
+This sample also includes fallback implementations when the `cl_ext_float_atomics` extension is not supported:
 
 * For NVIDIA GPUs, this sample includes a fallback that does the floating-point atomic add using inline PTX assembly language.
 * For AMD GPUs, this sample includes a fallback that calls a compiler intrinsic to do the floating-point atomic add.
-* For other devices, this sample includes a fallback that emulates the floating-point atomic add using 32-bit `atomic_xchg` functions.
-This fallback implementation cannot reliably return the "old" value that was in memory before performing the atomic add, so it is unsuitable for all usages, but it does work for some important uses-cases, such as reductions.
+* For other devices, this sample includes two fallback implementations:
+    * The first emulates the floating-point atomic add using 32-bit `atomic_xchg` functions.
+      This fallback implementation cannot reliably return the "old" value that was in memory before performing the atomic add, so it is unsuitable for all usages, but it does work for some important uses-cases, such as reductions.
+    * The second emulates the floating-point atomic add using 32-bit `atomic_cmpxchg` functions.
+      This is a slower emulation, but it is able to reliably return the "old" value that was in memory before performing the atomic add.
 
 This sample was inspired by the blog post: https://pipinspace.github.io/blog/atomic-float-addition-in-opencl.html
 
 ## Key APIs and Concepts
 
-```c
+```
 CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT
 __opencl_c_ext_fp32_global_atomic_add
 atomic_fetch_add_explicit
+atomic_xchg
+atomic_cmpxchg
 ```
 
 ## Command Line Options
@@ -31,6 +36,7 @@ atomic_fetch_add_explicit
 | `-d <index>` | 0 | Specify the index of the OpenCL device in the platform to execute on the sample on.
 | `-p <index>` | 0 | Specify the index of the OpenCL platform to execute the sample on.
 | `-i <number>` | 16 | Specify the number of iterations to execute.
-| `--gwx <number>` | 16384 | Specify the global work size to execute, which is also the number of floating-point atomics to perform.
+| `--gwx <number>` | 16384 | Specify the global work size, which is also the number of floating-point atomics to perform.
 | `-e` | N/A | Unconditionally use the emulated floating-point atomic add.
-| `-e` | N/A | Check intermediate results for correctness, requires non-emulated atomics, requires adding a positive value.
+| `-s` | N/A | Unconditionally use the slower and safer emulated floating-point atomic add.
+| `-e` | N/A | Check intermediate results for correctness, unsupported for the faster emulated atomics, requires adding a positive value.
diff --git a/samples/16_floatatomics/main.cpp b/samples/16_floatatomics/main.cpp
index 2ae52f3..4dff090 100644
--- a/samples/16_floatatomics/main.cpp
+++ b/samples/16_floatatomics/main.cpp
@@ -102,7 +102,7 @@ int main(
         op.add<popl::Value<size_t>>("i", "iterations", "Iterations", iterations, &iterations);
         op.add<popl::Value<size_t>>("", "gwx", "Global Work Size X AKA Number of Atomics", gwx, &gwx);
         op.add<popl::Switch>("e", "emulate", "Unconditionally Emulate Float Atomics", &emulate);
-        op.add<popl::Switch>("s", "slow-emulate", "Unconditionally Emulate Float Atomics with Return Support", &slowEmulate);
+        op.add<popl::Switch>("s", "slow-emulate", "Unconditionally Emulate Float Atomics (slowly and safely)", &slowEmulate);
         op.add<popl::Switch>("c", "check", "Check Intermediate Results", &check);
 
         bool printUsage = false;

From b5e817aba7961a27e3c87ee69aae374bc9ec97e5 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Wed, 2 Oct 2024 21:45:25 -0700
Subject: [PATCH 09/10] a few more minor fixes

---
 samples/16_floatatomics/CMakeLists.txt | 2 +-
 samples/16_floatatomics/main.cpp       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/samples/16_floatatomics/CMakeLists.txt b/samples/16_floatatomics/CMakeLists.txt
index c60536d..9e6989c 100644
--- a/samples/16_floatatomics/CMakeLists.txt
+++ b/samples/16_floatatomics/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2024 Ben Ashbaugh
+# Copyright (c) 2024 Ben Ashbaugh
 #
 # SPDX-License-Identifier: MIT
 
diff --git a/samples/16_floatatomics/main.cpp b/samples/16_floatatomics/main.cpp
index 4dff090..a233fc1 100644
--- a/samples/16_floatatomics/main.cpp
+++ b/samples/16_floatatomics/main.cpp
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2019-2024 Ben Ashbaugh
+// Copyright (c) 2024 Ben Ashbaugh
 //
 // SPDX-License-Identifier: MIT
 */

From 9986e1f5ac734284dbdd4b6d2355c1c0de11aba2 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Wed, 2 Oct 2024 21:47:53 -0700
Subject: [PATCH 10/10] fix one very long line

---
 samples/16_floatatomics/main.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/samples/16_floatatomics/main.cpp b/samples/16_floatatomics/main.cpp
index a233fc1..e611eb5 100644
--- a/samples/16_floatatomics/main.cpp
+++ b/samples/16_floatatomics/main.cpp
@@ -259,7 +259,8 @@ int main(
             }
 
             if (mismatches) {
-                printf("Intermediate Results Validation: Found %zu mismatches / %zu values!!!\n", mismatches, gwx);
+                printf("Intermediate Results Validation: Found %zu mismatches / %zu values!!!\n",
+                    mismatches, gwx);
             } else {
                 printf("Intermediate Results Validation: Success.\n");
             }