add dense linear solve

rrsettgast · rrsettgast · commit 255d74982c31 · 2025-02-20T16:35:50.000-08:00
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -53,6 +53,7 @@ target_include_directories( hpcReact
 
 add_subdirectory( reactions/bulkDebyeHuckel/unitTests )
 add_subdirectory( reactions/bulkGeneric/unitTests )
+add_subdirectory( common/unitTests )
 
 
 HPCReact_add_code_checks( PREFIX hpcReact
diff --git a/src/common/DirectSystemSolve.hpp b/src/common/DirectSystemSolve.hpp
@@ -0,0 +1,82 @@
+#pragma once
+
+#include "macros.hpp"
+
+#include<cmath>
+
+/**
+ * @brief Solve the dense N x N system A x = b by Gaussian elimination with partial (row) pivoting.
+ *
+ * Rows are never physically moved: a permutation array records the pivot order and every
+ * access to A and b goes through it.
+ *
+ * @tparam REAL_TYPE Scalar type of the matrix and vector entries (e.g. double).
+ * @tparam N Dimension of the system; must be at least 1.
+ * @param A Coefficient matrix. Modified in place by the elimination.
+ * @param b Right-hand side. Modified in place by the elimination.
+ * @param x Output: receives the solution vector.
+ *
+ * @note No singularity check is made: a zero pivot leads to a division by zero.
+ */
+template< typename REAL_TYPE, int N >
+HPCREACT_HOST_DEVICE void solveNxN_pivoted(REAL_TYPE A[N][N], REAL_TYPE b[N], REAL_TYPE x[N])
+{
+    static_assert( N > 0, "solveNxN_pivoted requires N >= 1" );
+
+    // Row index tracker: pivot[r] is the storage row currently acting as logical row r.
+    int pivot[N];
+    for (int i = 0; i < N; i++)
+    {
+        pivot[i] = i;
+    }
+
+    // **Step 1: Forward Elimination with Pivoting**
+    for (int k = 0; k < N-1; k++)
+    {
+        // **Find Pivot Row**: largest magnitude in column k among the remaining rows.
+        // The magnitude is computed inline so that no integer abs() overload can be
+        // selected by accident and the code stays callable from host and device.
+        int max_row = k;
+        REAL_TYPE max_val = A[pivot[k]][k] < 0 ? -A[pivot[k]][k] : A[pivot[k]][k];
+        for (int i = k + 1; i < N; i++)
+        {
+            REAL_TYPE const magnitude = A[pivot[i]][k] < 0 ? -A[pivot[i]][k] : A[pivot[i]][k];
+            if (magnitude > max_val)
+            {
+                max_val = magnitude;
+                max_row = i;
+            }
+        }
+
+        // **Swap Rows in Pivot Array**
+        if (max_row != k)
+        {
+            int temp = pivot[k];
+            pivot[k] = pivot[max_row];
+            pivot[max_row] = temp;
+        }
+
+        // **Gaussian Elimination**
+        for (int i = k + 1; i < N; i++)
+        {
+            REAL_TYPE factor = A[pivot[i]][k] / A[pivot[k]][k];
+            for (int j = k; j < N; j++)
+            {
+                A[pivot[i]][j] -= factor * A[pivot[k]][j];
+            }
+            b[pivot[i]] -= factor * b[pivot[k]];
+        }
+    }
+
+    // **Step 2: Back-Substitution**, last unknown first; terms are subtracted in ascending column order.
+    for (int i = N - 1; i >= 0; i--)
+    {
+        REAL_TYPE sum = b[pivot[i]];
+        for (int j = i + 1; j < N; j++)
+        {
+            sum -= A[pivot[i]][j] * x[j];
+        }
+        x[i] = sum / A[pivot[i]][i];
+    }
+}
+
diff --git a/src/common/pmpl.hpp b/src/common/pmpl.hpp
@@ -1,9 +1,11 @@
 #pragma once
 
+#include "common/macros.hpp"
 
-#include "ShivaMacros.hpp"
+#include<utility>
 
-namespace shiva
+
+namespace hpcReact
 {
 #if defined(HPCREACT_USE_DEVICE)
   #if defined(HPCREACT_USE_CUDA)
@@ -112,6 +114,7 @@ void genericKernelWrapper( int const N, DATA_TYPE * const hostData, LAMBDA && fu
 #if defined(HPCREACT_USE_DEVICE)
   DATA_TYPE * deviceData;
   deviceMalloc( &deviceData, N * sizeof(DATA_TYPE) );
+  deviceMemCpy( deviceData, hostData, N * sizeof(DATA_TYPE), cudaMemcpyHostToDevice );
   genericKernel <<< 1, 1 >>> ( std::forward< LAMBDA >( func ), deviceData );
   deviceDeviceSynchronize();
   deviceMemCpy( hostData, deviceData, N * sizeof(DATA_TYPE), cudaMemcpyDeviceToHost );
@@ -128,7 +131,7 @@ void genericKernelWrapper( int const N, DATA_TYPE * const hostData, LAMBDA && fu
  * @param data The data pointer to deallocate.
  */
 template< typename DATA_TYPE >
-HPCREACT_CONSTEXPR_HOSTDEVICE_FORCEINLINE void deallocateData( DATA_TYPE * data )
+HPCREACT_HOST_DEVICE void deallocateData( DATA_TYPE * data )
 {
 #if defined(HPCREACT_USE_DEVICE)
   deviceFree( data );
@@ -138,4 +141,4 @@ HPCREACT_CONSTEXPR_HOSTDEVICE_FORCEINLINE void deallocateData( DATA_TYPE * data
 }
 
 } // namespace pmpl
-} // namespace shiva
+} // namespace hpcReact
diff --git a/src/common/unitTests/CMakeLists.txt b/src/common/unitTests/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Unit-test sources in this directory; each file becomes its own executable.
+set( testSourceFiles testDirectSystemSolve.cpp )
+
+# Targets every test executable depends on.
+set( dependencyList hpcReact gtest )
+
+# Build and register one gtest executable per source file, named after the file.
+foreach( testSource IN LISTS testSourceFiles )
+  get_filename_component( testTarget ${testSource} NAME_WE )
+  blt_add_executable( NAME ${testTarget}
+                      SOURCES ${testSource}
+                      OUTPUT_DIR ${TEST_OUTPUT_DIRECTORY}
+                      DEPENDS_ON ${dependencyList} )
+  blt_add_test( NAME ${testTarget}
+                COMMAND ${testTarget} )
+endforeach()
diff --git a/src/common/unitTests/testDirectSystemSolve.cpp b/src/common/unitTests/testDirectSystemSolve.cpp
@@ -0,0 +1,127 @@
+#include "../DirectSystemSolve.hpp"
+#include "common/pmpl.hpp"
+
+#include <gtest/gtest.h>
+#include <limits>
+
+using namespace hpcReact;
+// TEST( testDirectSystemSolve, test3x3 )
+// {
+//   // **Define a Sample NxN Linear System**
+//   double A_host[9] = 
+//   {
+//     1.0,  2.0,  3.0,
+//     2.0, -1.0,  1.0,
+//     3.0,  4.0,  5.0
+//   };
+//   double b_host[3] = { 14.0, 3.0, 24.0 }; // Right-hand side
+//   double x_host[3]; // Solution
+
+//   // **Allocate Memory on the GPU**
+//   double *d_A, *d_b, *d_x;
+//   cudaMalloc(&d_A, sizeof(A_host));
+//   cudaMalloc(&d_b, sizeof(b_host));
+//   cudaMalloc(&d_x, sizeof(x_host));
+
+//   // **Copy Data to the GPU**
+//   cudaMemcpy(d_A, A_host, sizeof(A_host), cudaMemcpyHostToDevice);
+//   cudaMemcpy(d_b, b_host, sizeof(b_host), cudaMemcpyHostToDevice);
+
+//   // **Launch Kernel (1 block, 1 thread per system)**
+//   kernel_solveNxN_pivoted<double,3><<<1, 1>>>(d_A, d_b, d_x, 1);
+
+//   // **Copy Result Back to Host**
+//   cudaMemcpy(x_host, d_x, sizeof(x_host), cudaMemcpyDeviceToHost);
+
+//   // **Print the Solution**
+//   std::cout << "Solution: x = [" << x_host[0] << ", " << x_host[1] << ", " << x_host[2] << "]" << std::endl;
+
+//   // **Free GPU Memory**
+//   cudaFree(d_A);
+//   cudaFree(d_b);
+//   cudaFree(d_x);
+
+// }
+
+template< typename REAL_TYPE, int N >
+struct LinearSystem // One N x N system bundled in a single object so it can be passed to genericKernelWrapper as one pointer.
+{
+  REAL_TYPE A[N][N]; // coefficient matrix
+  REAL_TYPE b[N]; // right-hand side
+  REAL_TYPE x[N]; // solution, written by the solver
+};
+
+TEST( testDirectSystemSolve, test3x3 )
+{
+  // Sample 3x3 system whose exact solution is x = (0, 1, 4).
+  LinearSystem< double, 3 > linearSystem
+  {
+    { { 1.0,  2.0,  3.0 },
+      { 2.0, -1.0,  1.0 },
+      { 3.0,  4.0,  5.0 }
+    },
+    { 14.0, 3.0, 24.0 }, // Right-hand side
+    { 0.0, 0.0, 0.0 } // Solution, filled in by the solver
+  };
+
+  // Capture nothing: the lambda may execute on the device and must only touch the
+  // copy handed to it, never host stack variables reached through a reference capture.
+  pmpl::genericKernelWrapper( 1, &linearSystem, []( auto * copyOfLinearSystem )
+  {
+    solveNxN_pivoted<double,3>( copyOfLinearSystem->A, copyOfLinearSystem->b, copyOfLinearSystem->x );
+  } );
+
+  double const tolerance = std::numeric_limits< double >::epsilon()*100;
+  EXPECT_NEAR( linearSystem.x[0], 0.0, tolerance );
+  EXPECT_NEAR( linearSystem.x[1], 1.0, tolerance );
+  EXPECT_NEAR( linearSystem.x[2], 4.0, tolerance );
+}
+
+
+#if 0 // Disabled raw-CUDA variant of test3x3. NOTE(review): it launches kernel_solveNxN_pivoted, which DirectSystemSolve.hpp as added in this change does not define, and it only prints the result instead of asserting on it.
+TEST( testDirectSystemSolve, test3x3_CUDA )
+{
+  // **Define a Sample NxN Linear System**
+  double A_host[9] = 
+  {
+    1.0,  2.0,  3.0,
+    2.0, -1.0,  1.0,
+    3.0,  4.0,  5.0
+  };
+  double b_host[3] = { 14.0, 3.0, 24.0 }; // Right-hand side
+  double x_host[3]; // Solution
+
+  // **Allocate Memory on the GPU**
+  double *d_A, *d_b, *d_x;
+  cudaMalloc(&d_A, sizeof(A_host));
+  cudaMalloc(&d_b, sizeof(b_host));
+  cudaMalloc(&d_x, sizeof(x_host));
+
+  // **Copy Data to the GPU**
+  cudaMemcpy(d_A, A_host, sizeof(A_host), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_b, b_host, sizeof(b_host), cudaMemcpyHostToDevice);
+
+  // **Launch Kernel (1 block, 1 thread per system)**
+  kernel_solveNxN_pivoted<double,3><<<1, 1>>>(d_A, d_b, d_x, 1);
+
+  // **Copy Result Back to Host**
+  cudaMemcpy(x_host, d_x, sizeof(x_host), cudaMemcpyDeviceToHost);
+
+  // **Print the Solution**
+  std::cout << "Solution: x = [" << x_host[0] << ", " << x_host[1] << ", " << x_host[2] << "]" << std::endl;
+
+  // **Free GPU Memory**
+  cudaFree(d_A);
+  cudaFree(d_b);
+  cudaFree(d_x);
+
+}
+#endif
+
+
+int main( int argc, char * * argv )
+{
+  // Hand the command line to GoogleTest before any test runs.
+  ::testing::InitGoogleTest( &argc, argv );
+  return RUN_ALL_TESTS(); // the process exit code is whatever RUN_ALL_TESTS reports
+}