kul-optec
diff --git a/CMakeLists.txt b/CMakeLists.txt
Lines changed: 2 additions & 2 deletions
diff --git a/benchmarks/hyh.cpp b/benchmarks/hyh.cpp
Lines changed: 14 additions & 18 deletions
diff --git a/benchmarks/ocp.cpp b/benchmarks/ocp.cpp
Lines changed: 2 additions & 0 deletions
diff --git a/conanfile.py b/conanfile.py
Lines changed: 2 additions & 2 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
Lines changed: 1 addition & 1 deletion
diff --git a/src/config.hpp.in b/src/config.hpp.in
Lines changed: 1 addition & 1 deletion
diff --git a/src/hyhound/include/hyhound/householder-updowndate-micro-kernels.tpp b/src/hyhound/include/hyhound/householder-updowndate-micro-kernels.tpp
Lines changed: 40 additions & 40 deletions
@@ -9,8 +9,8 @@ include(CTest)
 
 # Options
 include(CMakeDependentOption)
-set(HYHOUND_DENSE_REAL_TYPE "double" CACHE STRING
-    "The main floating point type for representing real numbers")
+set(HYHOUND_DENSE_REAL_TYPE "double" "float" CACHE STRING
+    "The floating point types that the functions are instantiated for")
 set(HYHOUND_DENSE_INDEX_TYPE "long long" CACHE STRING
     "The main integer type for indices and sizes")
 # Target options
 
@@ -11,6 +11,7 @@
 
 #include <algorithm>
 #include <cstdlib>
+#include <limits>
 #include <map>
 #include <mutex>
 #include <random>
@@ -28,15 +29,15 @@ using std::pow;
 #endif
 
 struct ProblemMatrices {
-    Eigen::MatrixXd K̃, K, L, A;
+    Eigen::MatrixX<real_t> K̃, K, L, A;
 };
 using cache_t = std::map<std::pair<index_t, index_t>, ProblemMatrices>;
 std::mutex cache_mtx;
 cache_t cache;
 
 struct CholeskyFixture : benchmark::Fixture {
     index_t m, n;
-    Eigen::MatrixXd L̃;
+    Eigen::MatrixX<real_t> L̃;
     cache_t::const_iterator matrices;
 
     static cache_t::const_iterator generate_problem(index_t m, index_t n) {
@@ -53,7 +54,7 @@ struct CholeskyFixture : benchmark::Fixture {
 #endif
 
         std::mt19937 rng{12345};
-        std::uniform_real_distribution<> dist(0.0, 1.0);
+        std::uniform_real_distribution<real_t> dist(-1, 1);
         mat.K̃.resize(n, n), mat.K.resize(n, n), mat.L.resize(n, n);
         mat.A.resize(n, m);
         std::ranges::generate(mat.K.reshaped(), [&] { return dist(rng); });
@@ -96,10 +97,11 @@ struct CholeskyFixture : benchmark::Fixture {
     }
 
     void TearDown(benchmark::State &state) final {
-        Eigen::MatrixXd E = matrices->second.K̃;
-        const auto n      = static_cast<index_t>(L̃.rows()),
-                   ldL̃    = static_cast<index_t>(L̃.outerStride()),
-                   ldE    = static_cast<index_t>(E.outerStride());
+        using std::pow;
+        Eigen::MatrixX<real_t> E = matrices->second.K̃;
+        const auto n             = static_cast<index_t>(L̃.rows()),
+                   ldL̃           = static_cast<index_t>(L̃.outerStride()),
+                   ldE           = static_cast<index_t>(E.outerStride());
 #if GUANAQO_WITH_OPENMP
         int old_num_threads = omp_get_max_threads();
         omp_set_num_threads(std::thread::hardware_concurrency() / 2);
@@ -113,7 +115,8 @@ struct CholeskyFixture : benchmark::Fixture {
         E.triangularView<Eigen::StrictlyUpper>().setZero();
         real_t r          = E.lpNorm<Eigen::Infinity>();
         std::string label = "resid=" + guanaqo::float_to_str(r, 6);
-        if (!(r < 1e-9))
+        const auto ε = pow(std::numeric_limits<real_t>::epsilon(), real_t(0.5));
+        if (!(r < ε))
             label = "\x1b[0;31m" + label + "\x1b[0m";
         state.SetLabel(label);
         compute_flops(state);
@@ -129,7 +132,7 @@ struct CholeskyFixture : benchmark::Fixture {
 
     template <auto Func>
     void runUpdateBenchmark(benchmark::State &state) {
-        Eigen::MatrixXd Ã(m, n);
+        Eigen::MatrixX<real_t> Ã(m, n);
         for (auto _ : state) {
             state.PauseTiming();
             Ã = matrices->second.A;
@@ -191,7 +194,7 @@ std::vector<::benchmark::internal::Benchmark *> benchmarks;
     BENCHMARK_TEMPLATE_DEFINE_F(                                               \
         BlockedFixture, BM_BLK_IMPL_NAME(name, __VA_ARGS__), __VA_ARGS__)      \
     (benchmark::State & state) {                                               \
-        this->runUpdateBenchmark<func<{__VA_ARGS__}, updown>>(state);          \
+        this->runUpdateBenchmark<func<real_t, {__VA_ARGS__}, updown>>(state);  \
     }                                                                          \
     BM_BLK_REGISTER_F(BlockedFixture, BM_BLK_IMPL_NAME(name, __VA_ARGS__))     \
         ->Name(BM_BLK_NAME(name, __VA_ARGS__))
@@ -256,24 +259,17 @@ BENCHMARK_BLOCKED(hyh_update, update_cholesky, Downdate, 4, 4);
 BENCHMARK_BLOCKED(hyh_update, update_cholesky, Downdate, 4, 8);
 BENCHMARK_BLOCKED(hyh_update, update_cholesky, Downdate, 4, 12);
 BENCHMARK_BLOCKED(hyh_update, update_cholesky, Downdate, 4, 16);
-#if __AVX512F__
 BENCHMARK_BLOCKED(hyh_update, update_cholesky, Downdate, 4, 24);
 BENCHMARK_BLOCKED(hyh_update, update_cholesky, Downdate, 4, 32);
-#endif
-BENCHMARK_BLOCKED(hyh_update, update_cholesky, Downdate, 4, 12, 2);
-BENCHMARK_BLOCKED(hyh_update, update_cholesky, Downdate, 4, 12, 4);
 
 BENCHMARK_BLOCKED(hyh_update, update_cholesky, Downdate, 8, 8);
 BENCHMARK_BLOCKED(hyh_update, update_cholesky, Downdate, 8, 12);
 BENCHMARK_BLOCKED(hyh_update, update_cholesky, Downdate, 8, 16);
-#if __AVX512F__
 BENCHMARK_BLOCKED(hyh_update, update_cholesky, Downdate, 8, 24);
 BENCHMARK_BLOCKED(hyh_update, update_cholesky, Downdate, 8, 32);
+#if __AVX512F__
 BENCHMARK_BLOCKED(hyh_update, update_cholesky, Downdate, 16, 8);
-BENCHMARK_BLOCKED(hyh_update, update_cholesky, Downdate, 16, 12);
 BENCHMARK_BLOCKED(hyh_update, update_cholesky, Downdate, 16, 16);
-BENCHMARK_BLOCKED(hyh_update, update_cholesky, Downdate, 8, 24, 2);
-BENCHMARK_BLOCKED(hyh_update, update_cholesky, Downdate, 8, 24, 4);
 #endif
 // clang-format on
 
 
@@ -38,6 +38,7 @@ void bm_solve_riccati(benchmark::State &state) {
 }
 
 void bm_update_riccati(benchmark::State &state) {
+    using std::exp2;
     std::mt19937 rng{54321};
     std::normal_distribution<real_t> nrml{0, 10};
     std::bernoulli_distribution bern{0.25};
@@ -74,6 +75,7 @@ void bm_solve_schur(benchmark::State &state) {
 }
 
 void bm_update_schur(benchmark::State &state) {
+    using std::exp2;
     std::mt19937 rng{54321};
     std::normal_distribution<real_t> nrml{0, 10};
     std::bernoulli_distribution bern{0.25};
 
@@ -28,12 +28,12 @@ class HyhoundRecipe(ConanFile):
     options = {
         "shared": [True, False],
         "fPIC": [True, False],
-        "real_type": ["double", "float"],
+        "real_type": ["double;float", "float;double", "double", "float"],
     } | {k: [True, False] for k in bool_hyhound_options}
     default_options = {
         "shared": False,
         "fPIC": True,
-        "real_type": "double",
+        "real_type": "double;float",
     } | bool_hyhound_options
 
     # Sources are located in the same place as this recipe, copy them to the recipe
 
@@ -4,6 +4,7 @@ find_package(guanaqo REQUIRED)
 # Configuration options
 # ------------------------------------------------------------------------------
 add_library(config INTERFACE)
+list(GET HYHOUND_DENSE_REAL_TYPE 0 HYHOUND_DENSE_REAL_TYPE_0)
 configure_file("config.hpp.in"
     "${CMAKE_CURRENT_BINARY_DIR}/config/include/hyhound/config.hpp" @ONLY)
 target_sources(config INTERFACE FILE_SET headers TYPE HEADERS
@@ -25,7 +26,6 @@ target_sources(util INTERFACE FILE_SET headers TYPE HEADERS
           "util/include/hyhound/cneg.hpp"
           "util/include/hyhound/loop.hpp"
           "util/include/hyhound/lut.hpp"
-          "util/include/hyhound/matrix-view.hpp"
           "util/include/hyhound/unroll.h"
 )
 target_link_libraries(util INTERFACE hyhound::config guanaqo::guanaqo)
 
@@ -10,7 +10,7 @@ namespace hyhound {
 #ifdef __clangd__
 using real_t = double; // clangd does not support std::float128_t etc.
 #else
-using real_t = @HYHOUND_DENSE_REAL_TYPE@;
+using real_t = @HYHOUND_DENSE_REAL_TYPE_0@;
 #endif
 using index_t = @HYHOUND_DENSE_INDEX_TYPE@;
 
 
@@ -17,13 +17,12 @@ using micro_kernels::householder::mut_W_accessor;
 /// `Func<4, 8>` on the first block row of its arguments, then
 /// `Func<4, 4>` on the second block row, `Func<4, 2>`
 /// and finally `downdate_tail<4, 1>` for the bottom row.
-template <template <auto, class> class Func, Config Conf, class UpDown,
-          index_t M, index_t... Ms>
+template <class T, template <auto, class, class> class Func, Config Conf,
+          class UpDown, index_t M, index_t... Ms>
 inline void tile_tail(index_t rowsA, index_t colsA0, index_t colsA,
-                      mut_W_accessor<> W, real_t *L, index_t ldL,
-                      const real_t *B, index_t ldB, real_t *A, index_t ldA,
-                      UpDown updown) noexcept {
-    constexpr auto simd_M = micro_kernels::native_simd_size;
+                      mut_W_accessor<T> W, T *L, index_t ldL, const T *B,
+                      index_t ldB, T *A, index_t ldA, UpDown updown) noexcept {
+    constexpr auto simd_M = micro_kernels::native_simd_size<T>;
     // If the block size is larger than the config allows, skip it.
     constexpr bool skip_large_M = M > Conf.block_size_s;
     // If the block size is not efficiently vectorizable, and is not yet a
@@ -34,30 +33,30 @@ inline void tile_tail(index_t rowsA, index_t colsA0, index_t colsA,
     constexpr bool skip_suboptimal_M = M > simd_M && (M % simd_M) != 0;
     if constexpr (skip_large_M || skip_suboptimal_M) {
         if constexpr (sizeof...(Ms) > 0)
-            tile_tail<Func, Conf, UpDown, Ms...>(rowsA, colsA0, colsA, W, L,
-                                                 ldL, B, ldB, A, ldA, updown);
+            tile_tail<T, Func, Conf, UpDown, Ms...>(
+                rowsA, colsA0, colsA, W, L, ldL, B, ldB, A, ldA, updown);
         return;
     }
     while (rowsA >= M) {
-        constexpr Config NewConf {.block_size_r = Conf.block_size_r,
-                                  .block_size_s = M};
-        Func<NewConf, UpDown> {}(colsA0, colsA, W, L, ldL, B, ldB, A, ldA,
-                                 updown);
+        constexpr Config NewConf{.block_size_r = Conf.block_size_r,
+                                 .block_size_s = M};
+        Func<NewConf, T, UpDown>{}(colsA0, colsA, W, L, ldL, B, ldB, A, ldA,
+                                   updown);
         L += M;
         A += M;
         rowsA -= M;
     }
     if constexpr (sizeof...(Ms) > 0)
         if (rowsA > 0)
-            tile_tail<Func, Conf, UpDown, Ms...>(rowsA, colsA0, colsA, W, L,
-                                                 ldL, B, ldB, A, ldA, updown);
+            tile_tail<T, Func, Conf, UpDown, Ms...>(
+                rowsA, colsA0, colsA, W, L, ldL, B, ldB, A, ldA, updown);
 }
 
-template <Config Conf, class UpDown>
+template <Config Conf, class T, class UpDown>
 struct updowndate_tail_func {
     template <class... Args>
     decltype(auto) operator()(Args &&...args) const {
-        return micro_kernels::householder::updowndate_tail<Conf, UpDown>(
+        return micro_kernels::householder::updowndate_tail<Conf, T, UpDown>(
             std::forward<Args>(args)...);
     }
 };
@@ -67,44 +66,45 @@ struct updowndate_tail_func {
 /// @see @ref detail::tile_tail
 /// The sizes specified here should be instantiated in the code generated by
 /// CMake.
-template <micro_kernels::householder::Config Conf, class UpDown>
+template <micro_kernels::householder::Config Conf, class T, class UpDown>
 inline void updowndate_tile_tail(index_t rowsA, index_t colsA0, index_t colsA,
-                                 detail::mut_W_accessor<> W,
-                                 detail::mut_matrix_accessor L,
-                                 detail::matrix_accessor B,
-                                 detail::mut_matrix_accessor A, UpDown signs) {
-    detail::tile_tail<detail::updowndate_tail_func, Conf, UpDown, //
+                                 detail::mut_W_accessor<T> W,
+                                 detail::mut_matrix_accessor<T> L,
+                                 detail::matrix_accessor<T> B,
+                                 detail::mut_matrix_accessor<T> A,
+                                 UpDown signs) {
+    detail::tile_tail<T, detail::updowndate_tail_func, Conf, UpDown, //
                       32, 24, 16, 12, 8, 4, 2, 1>(
         rowsA, colsA0, colsA, W, L.data, L.outer_stride, B.data, B.outer_stride,
         A.data, A.outer_stride, signs);
 }
 
-template <micro_kernels::householder::Config Conf, class UpDown>
+template <micro_kernels::householder::Config Conf, class T, class UpDown>
 inline void
-updowndate_tail(index_t colsA0, index_t colsA, detail::mut_W_accessor<> W,
-                detail::mut_matrix_accessor L, detail::matrix_accessor B,
-                detail::mut_matrix_accessor A, UpDown signs) {
+updowndate_tail(index_t colsA0, index_t colsA, detail::mut_W_accessor<T> W,
+                detail::mut_matrix_accessor<T> L, detail::matrix_accessor<T> B,
+                detail::mut_matrix_accessor<T> A, UpDown signs) {
     using micro_kernels::householder::updowndate_tail;
-    updowndate_tail<Conf, UpDown>(colsA0, colsA, W, L.data, L.outer_stride,
-                                  B.data, B.outer_stride, A.data,
-                                  A.outer_stride, signs);
+    updowndate_tail<Conf, T, UpDown>(colsA0, colsA, W, L.data, L.outer_stride,
+                                     B.data, B.outer_stride, A.data,
+                                     A.outer_stride, signs);
 }
 
-template <index_t R, class UpDown>
-inline void updowndate_diag(index_t colsA, detail::mut_W_accessor<> W,
-                            detail::mut_matrix_accessor L,
-                            detail::mut_matrix_accessor A, UpDown signs) {
+template <index_t R, class T, class UpDown>
+inline void updowndate_diag(index_t colsA, detail::mut_W_accessor<T> W,
+                            detail::mut_matrix_accessor<T> L,
+                            detail::mut_matrix_accessor<T> A, UpDown signs) {
     using micro_kernels::householder::updowndate_diag;
-    updowndate_diag<R, UpDown>(colsA, W, L.data, L.outer_stride, A.data,
-                               A.outer_stride, signs);
+    updowndate_diag<R, T, UpDown>(colsA, W, L.data, L.outer_stride, A.data,
+                                  A.outer_stride, signs);
 }
 
-template <index_t R, class UpDown>
-inline void updowndate_full(index_t colsA, detail::mut_matrix_accessor L,
-                            detail::mut_matrix_accessor A, UpDown signs) {
+template <index_t R, class T, class UpDown>
+inline void updowndate_full(index_t colsA, detail::mut_matrix_accessor<T> L,
+                            detail::mut_matrix_accessor<T> A, UpDown signs) {
     using micro_kernels::householder::updowndate_full;
-    updowndate_full<R, UpDown>(colsA, L.data, L.outer_stride, A.data,
-                               A.outer_stride, signs);
+    updowndate_full<R, T, UpDown>(colsA, L.data, L.outer_stride, A.data,
+                                  A.outer_stride, signs);
 }
 
 } // namespace hyhound
Original file line number	Diff line number	Diff line change
`@@ -38,6 +38,7 @@ void bm_solve_riccati(benchmark::State &state) {`
`38`	`38`	`}`
`39`	`39`
`40`	`40`	`void bm_update_riccati(benchmark::State &state) {`
	`41`	`+ using std::exp2;`
`41`	`42`	`std::mt19937 rng{54321};`
`42`	`43`	`std::normal_distribution<real_t> nrml{0, 10};`
`43`	`44`	`std::bernoulli_distribution bern{0.25};`
`@@ -74,6 +75,7 @@ void bm_solve_schur(benchmark::State &state) {`
`74`	`75`	`}`
`75`	`76`
`76`	`77`	`void bm_update_schur(benchmark::State &state) {`
	`78`	`+ using std::exp2;`
`77`	`79`	`std::mt19937 rng{54321};`
`78`	`80`	`std::normal_distribution<real_t> nrml{0, 10};`
`79`	`81`	`std::bernoulli_distribution bern{0.25};`