Docs: Remove namespace nesting

ashvardanian · ashvardanian · commit 5e732256470d · 2025-05-03T17:36:22.000Z
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -1,6 +1,7 @@
 {
     "cSpell.words": [
         "ashvardanian",
+        "Autovectorized",
         "blas",
         "blasint",
         "cblas",
diff --git a/README.md b/README.md
@@ -53,7 +53,7 @@ You are expected to build this on an x86 machine with CUDA drivers installed.
 
 ```sh
 cmake -B build_release -D CMAKE_BUILD_TYPE=Release         # Generate the build files
-cmake --build build_release --config Release               # Build the project
+cmake --build build_release --config Release -j            # Build the project
 build_release/reduce_bench                                 # Run all benchmarks
 build_release/reduce_bench --benchmark_filter="cuda"       # Only CUDA-related
 PARALLEL_REDUCTIONS_LENGTH=1024 build_release/reduce_bench # Set a different input size
diff --git a/reduce_bench.cpp b/reduce_bench.cpp
@@ -60,7 +60,7 @@
 #endif
 
 namespace bm = benchmark;
-using namespace ashvardanian::reduce;
+using namespace ashvardanian;
 
 /**
  *  @brief  Wraps the memory allocated for the benchmark either from `malloc` or `mmap`.
diff --git a/reduce_blas.hpp b/reduce_blas.hpp
@@ -9,7 +9,7 @@
 #include <limits>    // `std::numeric_limits`
 #include <stdexcept> // `std::length_error`
 
-namespace ashvardanian::reduce {
+namespace ashvardanian {
 
 /**
  *  @brief Using BLAS dot-product interface to accumulate a vector.
@@ -42,4 +42,4 @@ class blas_dot_t {
     }
 };
 
-} // namespace ashvardanian::reduce
+} // namespace ashvardanian
diff --git a/reduce_cpu.hpp b/reduce_cpu.hpp
@@ -1,8 +1,8 @@
 /**
- *  @date 04/09/2019
- *  @file reduce_cpu.hpp
  *  @brief Parallel reduction with SIMD and multicore acceleration
+ *  @file reduce_cpu.hpp
  *  @author Ash Vardanian
+ *  @date 04/09/2019
  */
 #pragma once
 #include <cstring>   // `std::memcpy`
@@ -24,7 +24,7 @@
 #include <arm_sve.h> // ARM SVE intrinsics
 #endif
 
-namespace ashvardanian::reduce {
+namespace ashvardanian {
 
 /**
  *  @brief Returns the current number of logical cores on the CPU.
@@ -41,6 +41,8 @@ inline static std::size_t round_up_to_multiple(std::size_t value, std::size_t mu
     return ((value + multiple - 1) / multiple) * multiple;
 }
 
+#pragma region - Serial and Autovectorized
+
 /**
  *  @brief Computes the sum of a sequence of float values using an unrolled @b `for`-loop,
  *         accumulating into 8 separate registers and summing them at the end.
@@ -131,6 +133,11 @@ class stl_par_unseq_reduce_gt {
 
 #endif // defined(__cpp_lib_execution)
 
+#pragma endregion - Serial and Autovectorized
+
+#pragma region - Handwritten SIMD Kernels
+#pragma region x86
+
 #if defined(__SSE__)
 
 /**
@@ -614,4 +621,6 @@ class threads_gt {
     }
 };
 
-} // namespace ashvardanian::reduce
+#pragma endregion - Multicore
+
+} // namespace ashvardanian
diff --git a/reduce_cublas.cuh b/reduce_cublas.cuh
@@ -16,7 +16,7 @@
 
 using namespace nvcuda;
 
-namespace ashvardanian::reduce {
+namespace ashvardanian {
 
 /**
  *  @brief Using cuBLAS dot-product interfaces to accumulate a vector.
@@ -164,4 +164,4 @@ struct cuda_tensors_t {
     }
 };
 
-} // namespace ashvardanian::reduce
+} // namespace ashvardanian
diff --git a/reduce_cuda.cuh b/reduce_cuda.cuh
@@ -13,7 +13,7 @@
 
 #include <cub/cub.cuh>
 
-namespace ashvardanian::reduce {
+namespace ashvardanian {
 
 std::size_t cuda_device_count() noexcept {
     int count;
@@ -280,4 +280,4 @@ class cuda_cub_t {
     }
 };
 
-} // namespace ashvardanian::reduce
+} // namespace ashvardanian
diff --git a/reduce_metal.h b/reduce_metal.h
@@ -17,7 +17,7 @@
 #include <cstdio>
 #include <cstring>
 
-namespace ashvardanian::reduce {
+namespace ashvardanian {
 
 struct metal_t {
 
@@ -216,6 +216,6 @@ struct metal_t {
     }
 };
 
-} // namespace ashvardanian::reduce
+} // namespace ashvardanian
 
 #endif
diff --git a/reduce_opencl.hpp b/reduce_opencl.hpp
@@ -19,7 +19,7 @@
 #include <CL/cl.h>
 #endif
 
-namespace ashvardanian::reduce {
+namespace ashvardanian {
 
 /**
  *  @brief OpenCL target device information, including its name, driver version,
@@ -337,4 +337,4 @@ char const *opencl_error_name(cl_int code) noexcept {
     }
 }
 
-} // namespace ashvardanian::reduce
+} // namespace ashvardanian

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,7 @@`
`1`	`1`	`{`
`2`	`2`	`"cSpell.words": [`
`3`	`3`	`"ashvardanian",`
	`4`	`+ "Autovectorized",`
`4`	`5`	`"blas",`
`5`	`6`	`"blasint",`
`6`	`7`	`"cblas",`
Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,7 @@`
`19`	`19`	`#include <CL/cl.h>`
`20`	`20`	`#endif`
`21`	`21`
`22`		`-namespace ashvardanian::reduce {`
	`22`	`+namespace ashvardanian {`
`23`	`23`
`24`	`24`	`/**`
`25`	`25`	`* @brief OpenCL target device information, including its name, driver version,`
`@@ -337,4 +337,4 @@ char const *opencl_error_name(cl_int code) noexcept {`
`337`	`337`	`}`
`338`	`338`	`}`
`339`	`339`
`340`		`-} // namespace ashvardanian::reduce`
	`340`	`+} // namespace ashvardanian`