NVIDIA
diff --git a/‎.agent/skills/licudacxx-style/SKILL.md‎
Lines changed: 90 additions & 0 deletions b/‎.agent/skills/licudacxx-style/SKILL.md‎
Lines changed: 90 additions & 0 deletions
diff --git a/‎.claude/skills/licudacxx-style/SKILL.md‎
Lines changed: 6 additions & 0 deletions b/‎.claude/skills/licudacxx-style/SKILL.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎cudax/examples/stf/CMakeLists.txt‎
Lines changed: 15 additions & 1 deletion b/‎cudax/examples/stf/CMakeLists.txt‎
Lines changed: 15 additions & 1 deletion
diff --git a/‎cudax/examples/stf/binary_fhe_stackable.cu‎
Lines changed: 240 additions & 0 deletions b/‎cudax/examples/stf/binary_fhe_stackable.cu‎
Lines changed: 240 additions & 0 deletions
@@ -0,0 +1,90 @@
+---
+name: libcudacxx-style
+description: Make the code in libcudacxx/include, cudax/include compliant with the coding style
+---
+
+# libcudacxx Style
+
+## Naming style
+
+- Macros: macro style, e.g. `MY_MACRO`.
+- Template parameters: CamelCase, e.g. `MyParameter`.
+- All other symbols: snake style, e.g. `my_variable`.
+
+All non-public symbols must be C++ reserved identifiers:
+
+- `_` for macros and template parameters, e.g. `_MY_MACRO`, `_MyParameter`.
+- `__` for all other symbols, e.g. `__my_variable`.
+
+- Avoid single letter names for template parameters. Wrong: `_T`, correct: `_Tp`.
+
+## Variables
+
+- All variables that are not modified must use `const`. This includes variables initialized by casts (`static_cast`, `reinterpret_cast`, `bit_cast`), function return values, and loop-invariant computations.
+- All variables that can be evaluated at compile-time must use `constexpr`.
+- Consider using plural names for arrays, spans, and lists, e.g. `int values[4]` instead of `int value[4]`.
+
+## Function
+
+Declaration/Definition:
+
+- All functions must be marked `_CCCL_HOST_API`, `_CCCL_DEVICE_API`, or `_CCCL_API`.
+- Non-template, non-`constexpr` functions must use `inline`.
+- Most functions with a non-void return type shall use `[[nodiscard]]`. Exceptions are functions with known side effects, e.g. `cuda::std::copy`.
+- All functions that don't throw exceptions must use `noexcept`.
+- `constexpr` must be used for all functions that don't depend on run-time features, e.g. pointers.
+- If the return type is not explicit (`auto`), then a trailing return type is strongly preferred, e.g. `auto abs(float) -> float`
+
+Function call:
+
+- All calls to free functions must be fully qualified starting from the global namespace, e.g. `::cuda::ceil_div`. This includes calls to functions defined in the same namespace, e.g. inside `cuda::`, call `::cuda::ceil_div(...)`, not `ceil_div(...)`. This does not apply to (static) member functions of classes.
+
+## Types
+
+- Type names must be fully qualified, except when they are already declared in the current namespace.
+- This includes standard integer type aliases (`::cuda::std::size_t`, `::cuda::std::uintptr_t`, `::cuda::std::int32_t`, etc.) and any other `cuda::std` or standard library types. A local `using` declaration (e.g. `using ::cuda::std::size_t;`) is acceptable to avoid repetition within a function body.
+
+## Headers
+
+- All header inclusions must use the syntax `<header>`.
+- Files must include all headers related to the symbols that they are using.
+- Relying on transitive header inclusion is not allowed.
+- Unneeded headers must be removed.
+- The headers must be the most precise one, e.g. `#include <cuda/std/__type_traits/is_array.h>`.
+- Headers in `cuda/std/__cccl/` must not be included directly (they are provided by `__config` or the prologue/epilogue mechanism).
+
+- All headers must have the correct license. If the file is ported from LLVM libc++ then we *must* use the LLVM license.
+- All headers must have the include guard, with the correct name: uppercase full path from the root, separated by `_`.
+- The closing `#endif` always carries a comment repeating the guard name.
+- Right after the include guard, the code must include:
+```cpp
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+```
+- The last included header must be `#include <cuda/std/__cccl/prologue.h>` before the code, and `#include <cuda/std/__cccl/epilogue.h>` at the end of a file.
+
+## Comments
+
+- Commented code without a description is not allowed.
+- Use Doxygen-style comments, e.g. `//! @brief`.
+- When a function is documented with Doxygen, it must include: `//! @brief`, `//! @param[in/out/in,out]` for every parameter, and `//! @return` for non-void functions.
+- The `@brief/@param/@return` description must accurately reflect the current functionality of the function.
+
+## General guidelines
+
+- The code must reuse `cuda/` or `cuda/std` functionalities as much as possible, including macros.
+- Try to use modern C++ as much as possible. The repository supports C++17 but many more recent functionalities have been backported with functions and macros.
+
+## Prevent compiler errors and improve compatibility
+
+- Never allow lambda expressions in device-only or host-device code.
+- Protect host-only code with `#if !_CCCL_COMPILER(NVRTC)`.
+- Remove unused code, variables, functions, types, template parameters, headers, etc.
+- Variables that are unsigned, or that can become unsigned after template instantiation, must not check for negative values directly. Use `cuda::std::is_unsigned_v<T> ? false : (var < 0)` instead.
@@ -0,0 +1,6 @@
+---
+name: libcudacxx-style
+description: Make the code in libcudacxx/include, cudax/include compliant with the coding style
+---
+
+The skill content is in .agent/skills/licudacxx-style/SKILL.md
@@ -21,28 +21,42 @@ set(
   01-axpy-launch.cu
   01-axpy-parallel_for.cu
   binary_fhe.cu
+  binary_fhe_stackable.cu
   09-dot-reduce.cu
   cfd.cu
   custom_data_interface.cu
   fdtd_mgpu.cu
+  fdtd_while.cu
+  fdtd_repeat_n.cu
   frozen_data_init.cu
   graph_algorithms/degree_centrality.cu
   graph_algorithms/jaccard.cu
   graph_algorithms/pagerank.cu
+  graph_algorithms/pagerank_batched.cu
+  graph_algorithms/pagerank_while.cu
   graph_algorithms/tricount.cu
+  graph_scope.cu
   heat.cu
   heat_mgpu.cu
   jacobi.cu
   jacobi_pfor.cu
+  jacobi_stackable.cu
+  jacobi_stackable_raii.cu
+  jacobi_update_cond.cu
   launch_histogram.cu
   launch_scan.cu
   launch_sum.cu
   launch_sum_cub.cu
+  linear_algebra/burger.cu
+  linear_algebra/burger_sensitivity.cu
+  linear_algebra/cg_csr.cu
+  linear_algebra/cg_csr_stackable.cu
   logical_gates_composition.cu
   mandelbrot.cu
   parallel_for_2D.cu
   pi.cu
   scan.cu
+  sqrt_newton_stackable.cu
   standalone-launches.cu
   word_count.cu
   word_count_reduce.cu
@@ -52,9 +66,9 @@ set(
 set(
   stf_example_mathlib_sources
   linear_algebra/06-pdgemm.cu
+  linear_algebra/06-pdgemm-stackable.cu
   linear_algebra/07-cholesky.cu
   linear_algebra/07-potri.cu
-  linear_algebra/cg_csr.cu
   linear_algebra/cg_dense_2D.cu
   linear_algebra/strassen.cu
 )
 
@@ -0,0 +1,240 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDASTF in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+/**
+ * @file
+ * @brief A toy example to illustrate how we can compose logical operations over encrypted data
+ */
+
+#include <cuda/experimental/stf.cuh>
+
+using namespace cuda::experimental::stf;
+
+#include <memory>
+
+class ciphertext;
+
+class plaintext // Clear-text sequence of chars tied to a stackable_ctx; encrypt() produces the matching ciphertext.
+{
+public:
+  plaintext(const stackable_ctx& ctx) // Copy-constructs the ctx member only; values stays empty and ld is default-constructed.
+      : ctx(ctx)
+  {}
+
+  plaintext(stackable_ctx& ctx, ::std::vector<char> v) // Takes v by value and hands it to the values member.
+      : values(mv(v)) // mv is not defined in this file; presumably a std::move-style helper from the STF header - confirm.
+      , ctx(ctx)
+      , ld(ctx.logical_data(values.data(), values.size())) // values is declared before ld, so it is already initialized here.
+  {}
+
+  auto& set_symbol(const std::string& s) // Names the logical data and stores the same name locally.
+  {
+    ld.set_symbol(s);
+    symbol = s;
+
+    return *this; // Returned by reference so calls can be chained.
+  }
+
+  const std::string& get_symbol() const // Returns the name last stored by set_symbol (empty if never set).
+  {
+    return symbol;
+  }
+
+  // Submits a host task that resizes v and copies the plaintext into it; v is filled asynchronously.
+  void convert_to_vector(std::vector<char>& v) // v is captured by reference below, so it must stay alive until the task has run.
+  {
+    ctx.host_launch(ld.read()).set_symbol("to_vector")->*[&](auto dl) {
+      v.resize(dl.size());
+      for (size_t i = 0; i < dl.size(); i++)
+      {
+        v[i] = dl(i);
+      }
+    };
+  }
+
+  ciphertext encrypt() const; // Defined after class ciphertext, which is only forward-declared at this point.
+
+private:
+  std::vector<char> values; // Host storage that ld is created over in the two-argument constructor.
+  mutable stackable_ctx ctx; // mutable: const members such as encrypt() submit work through ctx.
+  ::std::string symbol; // Name stored by set_symbol.
+
+public:
+  mutable stackable_logical_data<slice<char>> ld; // Public because ciphertext::decrypt() assigns it directly.
+};
+
+class ciphertext // "Encrypted" data: one uint64_t word per plaintext char, with element-wise |, & and ~ operators.
+{
+public:
+  ciphertext() = default;
+
+  // Deep copy: creates fresh logical data for *this and fills it through a device task (see copy_content).
+  ciphertext(const ciphertext& other)
+      : ctx(other.ctx)
+      , symbol(other.symbol)
+  {
+    copy_content(ctx, other, *this);
+  }
+
+  ciphertext(const stackable_ctx& ctx) // Copy-constructs the ctx member only; ld is default-constructed until assigned.
+      : ctx(ctx)
+  {}
+
+  ciphertext(ciphertext&&)            = default;
+  ciphertext& operator=(ciphertext&&) = default;
+
+  static void copy_content(stackable_ctx& ctx, const ciphertext& src, ciphertext& dst) // Makes dst.ld an element-wise copy of src.ld.
+  {
+    dst.ld = ctx.logical_data(src.ld.shape()); // New logical data with the same shape as the source.
+    ctx.parallel_for(src.ld.shape(), src.ld.read(), dst.ld.write()).set_symbol("copy")->*
+      [] __device__(size_t i, auto src, auto dst) {
+        dst(i) = src(i);
+      };
+  }
+
+  auto& set_symbol(std::string s) // Names the logical data and stores the same name locally.
+  {
+    ld.set_symbol(s);
+    symbol = mv(s); // s is a by-value parameter and is not used after this line.
+
+    return *this; // Returned by reference so calls can be chained.
+  }
+
+  const std::string& get_symbol() const // Returns the name last stored by set_symbol (empty if never set).
+  {
+    return symbol;
+  }
+
+  plaintext decrypt() const // Builds a plaintext whose chars are taken from bits 32 and up of each word.
+  {
+    plaintext p(ctx);
+    p.ld = ctx.logical_data(shape_of<slice<char>>(ld.shape().size())); // Same element count, char elements.
+    ctx.parallel_for(ld.shape(), ld.read(), p.ld.write()).set_symbol("decrypt")->*
+      [] __device__(size_t i, auto cipher_data, auto plain_data) {
+        plain_data(i) = static_cast<char>(cipher_data(i) >> 32); // Undoes the << 32 applied in plaintext::encrypt.
+      };
+    return p;
+  }
+
+  // Copy assignment: deep copy, same approach as the copy constructor.
+  // Self-assignment is skipped so the object never copies from itself.
+  ciphertext& operator=(const ciphertext& other)
+  {
+    if (this != &other)
+    {
+      ctx    = other.ctx;
+      symbol = other.symbol;
+      copy_content(ctx, other, *this);
+    }
+    return *this;
+  }
+
+  ciphertext operator|(const ciphertext& other) const // Element-wise OR; assumes other has the same shape as *this - TODO confirm.
+  {
+    ciphertext result(ctx);
+    result.ld = ctx.logical_data(ld.shape());
+
+    ctx.parallel_for(ld.shape(), ld.read(), other.ld.read(), result.ld.write()).set_symbol("OR")->*
+      [] __device__(size_t i, auto d_c1, auto d_c2, auto d_res) {
+        d_res(i) = d_c1(i) | d_c2(i);
+      };
+
+    return result;
+  }
+
+  ciphertext operator&(const ciphertext& other) const // Element-wise AND; assumes other has the same shape as *this - TODO confirm.
+  {
+    ciphertext result(ctx);
+    result.ld = ctx.logical_data(ld.shape());
+
+    ctx.parallel_for(ld.shape(), ld.read(), other.ld.read(), result.ld.write()).set_symbol("AND")->*
+      [] __device__(size_t i, auto d_c1, auto d_c2, auto d_res) {
+        d_res(i) = d_c1(i) & d_c2(i);
+      };
+
+    return result;
+  }
+
+  ciphertext operator~() const // Element-wise bitwise NOT over the whole 64-bit word.
+  {
+    ciphertext result(ctx);
+    result.ld = ctx.logical_data(ld.shape());
+
+    ctx.parallel_for(ld.shape(), ld.read(), result.ld.write()).set_symbol("NOT")->*
+      [] __device__(size_t i, auto d_c, auto d_res) {
+        d_res(i) = ~d_c(i);
+      };
+
+    return result;
+  }
+
+  mutable stackable_logical_data<slice<uint64_t>> ld; // Public because plaintext::encrypt() assigns it directly.
+
+private:
+  mutable stackable_ctx ctx; // mutable: const members such as decrypt() and the operators submit work through ctx.
+  ::std::string symbol; // Name stored by set_symbol.
+};
+
+ciphertext plaintext::encrypt() const // Returns a ciphertext with one uint64_t word per plaintext char.
+{
+  ciphertext c(ctx);
+  c.ld = ctx.logical_data(shape_of<slice<uint64_t>>(ld.shape().size())); // Same element count, 64-bit elements.
+
+  ctx.parallel_for(ld.shape(), ld.read(), c.ld.write()).set_symbol("encrypt")->*
+    [] __device__(size_t i, auto dptxt, auto dctxt) {
+      // Toy scheme, not real encryption: the char goes to bits 32 and up, the low bits hold the constant 0x4.
+      dctxt(i) = ((uint64_t) (dptxt(i)) << 32 | 0x4); // << binds tighter than |, so this is (value << 32) | 0x4.
+    };
+
+  return c;
+}
+
+template <typename T>
+T circuit(const T& a, const T& b) // Bitwise XOR written with |, & and ~ only; main() uses it with both char and ciphertext.
+{
+  return ~((a | ~b) & (~a | b)); // (a | ~b) & (~a | b) is XNOR; the outer ~ turns it into XOR.
+}
+
+int main() // Encrypts two inputs, evaluates circuit() on the ciphertexts, decrypts, and checks against the clear-text result.
+{
+  stackable_ctx ctx;
+
+  const std::vector<char> vA{3, 3, 2, 2, 17};
+  plaintext pA(ctx, std::vector<char>(vA)); // Passes a copy; vA itself is kept for the host-side check at the end.
+  pA.set_symbol("A");
+
+  const std::vector<char> vB{1, 7, 7, 7, 49};
+  plaintext pB(ctx, std::vector<char>(vB)); // Passes a copy; vB itself is kept for the host-side check at the end.
+  pB.set_symbol("B");
+
+  auto s_encrypt = ctx.dot_section("encrypt"); // presumably groups the tasks below in the DOT output - confirm.
+  auto eA        = pA.encrypt().set_symbol("A"); // NOTE(review): set_symbol returns an lvalue reference, so eA is built by the deep-copy constructor.
+  auto eB        = pB.encrypt().set_symbol("B"); // Same as eA: copy-constructed from the temporary returned by encrypt().
+  s_encrypt.end();
+
+  ctx.push(); // presumably opens a nested level of the stackable context - confirm against the stackable_ctx docs.
+
+  auto s_circuit = ctx.dot_section("circuit");
+  auto out       = circuit(eA, eB); // Runs the XOR circuit on the ciphertexts.
+  s_circuit.end();
+
+  ctx.pop(); // Matches the push() above.
+
+  std::vector<char> v_out;
+  out.decrypt().convert_to_vector(v_out); // v_out is filled by a host task, so it is only read after finalize().
+
+  ctx.finalize();
+
+  for (size_t i = 0; i < v_out.size(); i++) // Recomputes the same circuit on the clear-text chars and compares.
+  {
+    char expected = circuit(vA[i], vB[i]);
+    EXPECT(expected == v_out[i]); // EXPECT is not defined in this file; presumably provided by the STF header.
+  }
+}