Skip to content

Commit 255d749

Browse files
committed
add dense linear solve
1 parent aa7b273 commit 255d749

File tree

5 files changed

+200
-4
lines changed

5 files changed

+200
-4
lines changed

src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ target_include_directories( hpcReact
5353

5454
add_subdirectory( reactions/bulkDebyeHuckel/unitTests )
5555
add_subdirectory( reactions/bulkGeneric/unitTests )
56+
add_subdirectory( common/unitTests )
5657

5758

5859
HPCReact_add_code_checks( PREFIX hpcReact

src/common/DirectSystemSolve.hpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#include "macros.hpp"
2+
3+
#include<cmath>
4+
template< typename REAL_TYPE, int N >
5+
HPCREACT_HOST_DEVICE void solveNxN_pivoted(REAL_TYPE A[N][N], REAL_TYPE b[N], REAL_TYPE x[N])
6+
{
7+
int pivot[3] = {0, 1, 2}; // Row index tracker
8+
9+
// **Step 1: Forward Elimination with Pivoting**
10+
for (int k = 0; k < N-1; k++)
11+
{
12+
// **Find Pivot Row**
13+
int max_row = k;
14+
REAL_TYPE max_val = abs(A[pivot[k]][k]);
15+
for (int i = k + 1; i < N; i++)
16+
{
17+
if (fabs(A[pivot[i]][k]) > max_val)
18+
{
19+
max_val = fabs(A[pivot[i]][k]);
20+
max_row = i;
21+
}
22+
}
23+
24+
// **Swap Rows in Pivot Array**
25+
if (max_row != k)
26+
{
27+
int temp = pivot[k];
28+
pivot[k] = pivot[max_row];
29+
pivot[max_row] = temp;
30+
}
31+
32+
// **Gaussian Elimination**
33+
for (int i = k + 1; i < N; i++)
34+
{
35+
REAL_TYPE factor = A[pivot[i]][k] / A[pivot[k]][k];
36+
for (int j = k; j < N; j++)
37+
{
38+
A[pivot[i]][j] -= factor * A[pivot[k]][j];
39+
}
40+
b[pivot[i]] -= factor * b[pivot[k]];
41+
}
42+
}
43+
44+
// **Step 2: Back-Substitution**
45+
x[2] = b[pivot[2]] / A[pivot[2]][2];
46+
x[1] = (b[pivot[1]] - A[pivot[1]][2] * x[2]) / A[pivot[1]][1];
47+
x[0] = (b[pivot[0]] - A[pivot[0]][1] * x[1] - A[pivot[0]][2] * x[2]) / A[pivot[0]][0];
48+
}
49+

src/common/pmpl.hpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
#pragma once
22

3+
#include "common/macros.hpp"
34

4-
#include "ShivaMacros.hpp"
5+
#include<utility>
56

6-
namespace shiva
7+
8+
namespace hpcReact
79
{
810
#if defined(HPCREACT_USE_DEVICE)
911
#if defined(HPCREACT_USE_CUDA)
@@ -112,6 +114,7 @@ void genericKernelWrapper( int const N, DATA_TYPE * const hostData, LAMBDA && fu
112114
#if defined(HPCREACT_USE_DEVICE)
113115
DATA_TYPE * deviceData;
114116
deviceMalloc( &deviceData, N * sizeof(DATA_TYPE) );
117+
deviceMemCpy( deviceData, hostData, N * sizeof(DATA_TYPE), cudaMemcpyHostToDevice );
115118
genericKernel <<< 1, 1 >>> ( std::forward< LAMBDA >( func ), deviceData );
116119
deviceDeviceSynchronize();
117120
deviceMemCpy( hostData, deviceData, N * sizeof(DATA_TYPE), cudaMemcpyDeviceToHost );
@@ -128,7 +131,7 @@ void genericKernelWrapper( int const N, DATA_TYPE * const hostData, LAMBDA && fu
128131
* @param data The data pointer to deallocate.
129132
*/
130133
template< typename DATA_TYPE >
131-
HPCREACT_CONSTEXPR_HOSTDEVICE_FORCEINLINE void deallocateData( DATA_TYPE * data )
134+
HPCREACT_HOST_DEVICE void deallocateData( DATA_TYPE * data )
132135
{
133136
#if defined(HPCREACT_USE_DEVICE)
134137
deviceFree( data );
@@ -138,4 +141,4 @@ HPCREACT_CONSTEXPR_HOSTDEVICE_FORCEINLINE void deallocateData( DATA_TYPE * data
138141
}
139142

140143
} // namespace pmpl
141-
} // namespace shiva
144+
} // namespace hpcReact
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Specify list of tests
2+
set( testSourceFiles
3+
testDirectSystemSolve.cpp )
4+
5+
set( dependencyList hpcReact gtest )
6+
7+
# Add gtest C++ based tests
8+
foreach(test ${testSourceFiles})
9+
get_filename_component( test_name ${test} NAME_WE )
10+
blt_add_executable( NAME ${test_name}
11+
SOURCES ${test}
12+
OUTPUT_DIR ${TEST_OUTPUT_DIRECTORY}
13+
DEPENDS_ON ${dependencyList} )
14+
blt_add_test( NAME ${test_name}
15+
COMMAND ${test_name} )
16+
endforeach()
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
2+
#include "../DirectSystemSolve.hpp"
3+
#include "common/pmpl.hpp"
4+
5+
#include <gtest/gtest.h>
6+
7+
using namespace hpcReact;
8+
// TEST( testDirectSystemSolve, test3x3 )
9+
// {
10+
// // **Define a Sample NxN Linear System**
11+
// double A_host[9] =
12+
// {
13+
// 1.0, 2.0, 3.0,
14+
// 2.0, -1.0, 1.0,
15+
// 3.0, 4.0, 5.0
16+
// };
17+
// double b_host[3] = { 14.0, 3.0, 24.0 }; // Right-hand side
18+
// double x_host[3]; // Solution
19+
20+
// // **Allocate Memory on the GPU**
21+
// double *d_A, *d_b, *d_x;
22+
// cudaMalloc(&d_A, sizeof(A_host));
23+
// cudaMalloc(&d_b, sizeof(b_host));
24+
// cudaMalloc(&d_x, sizeof(x_host));
25+
26+
// // **Copy Data to the GPU**
27+
// cudaMemcpy(d_A, A_host, sizeof(A_host), cudaMemcpyHostToDevice);
28+
// cudaMemcpy(d_b, b_host, sizeof(b_host), cudaMemcpyHostToDevice);
29+
30+
// // **Launch Kernel (1 block, 1 thread per system)**
31+
// kernel_solveNxN_pivoted<double,3><<<1, 1>>>(d_A, d_b, d_x, 1);
32+
33+
// // **Copy Result Back to Host**
34+
// cudaMemcpy(x_host, d_x, sizeof(x_host), cudaMemcpyDeviceToHost);
35+
36+
// // **Print the Solution**
37+
// std::cout << "Solution: x = [" << x_host[0] << ", " << x_host[1] << ", " << x_host[2] << "]" << std::endl;
38+
39+
// // **Free GPU Memory**
40+
// cudaFree(d_A);
41+
// cudaFree(d_b);
42+
// cudaFree(d_x);
43+
44+
// }
45+
46+
template< typename REAL_TYPE, int N >
47+
struct LinearSystem
48+
{
49+
REAL_TYPE A[N][N];
50+
REAL_TYPE b[N];
51+
REAL_TYPE x[N];
52+
};
53+
54+
TEST( testDirectSystemSolve, test3x3 )
55+
{
56+
// **Define a Sample NxN Linear System**
57+
58+
LinearSystem< double, 3 > linearSystem
59+
{
60+
{ { 1.0, 2.0, 3.0 },
61+
{ 2.0, -1.0, 1.0 },
62+
{ 3.0, 4.0, 5.0 }
63+
},
64+
{ 14.0, 3.0, 24.0 }, // Right-hand side
65+
{ 0.0, 0.0, 0.0 } // Solution
66+
};
67+
68+
pmpl::genericKernelWrapper( 1, &linearSystem, [&]( auto * copyOfLinearSystem )
69+
{
70+
solveNxN_pivoted<double,3>( copyOfLinearSystem->A, copyOfLinearSystem->b, copyOfLinearSystem->x );
71+
} );
72+
73+
EXPECT_NEAR( linearSystem.x[0], 0.0, std::numeric_limits< double >::epsilon()*100 );
74+
EXPECT_NEAR( linearSystem.x[1], 1.0, std::numeric_limits< double >::epsilon()*100 );
75+
EXPECT_NEAR( linearSystem.x[2], 4.0, std::numeric_limits< double >::epsilon()*100 );
76+
// std::cout << "Solution: x = [" << linearSystem.x[0] << ", " << linearSystem.x[1] << ", " << linearSystem.x[2] << "]" << std::endl;
77+
78+
}
79+
80+
81+
#if 0
82+
TEST( testDirectSystemSolve, test3x3_CUDA )
83+
{
84+
// **Define a Sample NxN Linear System**
85+
double A_host[9] =
86+
{
87+
1.0, 2.0, 3.0,
88+
2.0, -1.0, 1.0,
89+
3.0, 4.0, 5.0
90+
};
91+
double b_host[3] = { 14.0, 3.0, 24.0 }; // Right-hand side
92+
double x_host[3]; // Solution
93+
94+
// **Allocate Memory on the GPU**
95+
double *d_A, *d_b, *d_x;
96+
cudaMalloc(&d_A, sizeof(A_host));
97+
cudaMalloc(&d_b, sizeof(b_host));
98+
cudaMalloc(&d_x, sizeof(x_host));
99+
100+
// **Copy Data to the GPU**
101+
cudaMemcpy(d_A, A_host, sizeof(A_host), cudaMemcpyHostToDevice);
102+
cudaMemcpy(d_b, b_host, sizeof(b_host), cudaMemcpyHostToDevice);
103+
104+
// **Launch Kernel (1 block, 1 thread per system)**
105+
kernel_solveNxN_pivoted<double,3><<<1, 1>>>(d_A, d_b, d_x, 1);
106+
107+
// **Copy Result Back to Host**
108+
cudaMemcpy(x_host, d_x, sizeof(x_host), cudaMemcpyDeviceToHost);
109+
110+
// **Print the Solution**
111+
std::cout << "Solution: x = [" << x_host[0] << ", " << x_host[1] << ", " << x_host[2] << "]" << std::endl;
112+
113+
// **Free GPU Memory**
114+
cudaFree(d_A);
115+
cudaFree(d_b);
116+
cudaFree(d_x);
117+
118+
}
119+
#endif
120+
121+
122+
int main( int argc, char * * argv )
123+
{
124+
::testing::InitGoogleTest( &argc, argv );
125+
int const result = RUN_ALL_TESTS();
126+
return result;
127+
}

0 commit comments

Comments
 (0)