-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path wave_2d_parallel.cu
More file actions
409 lines (323 loc) · 13.2 KB
/
wave_2d_parallel.cu
File metadata and controls
409 lines (323 loc) · 13.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
#define _XOPEN_SOURCE 600
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <math.h>
#include <sys/time.h>
#include <cuda_runtime.h>
#include <iostream>
// TASK: T1
// Include the cooperative groups library
// BEGIN: T1
#include <cooperative_groups.h>
// END: T1
// Convert 'struct timeval' into seconds in double prec. floating point
#define WALLTIME(t) ((double)(t).tv_sec + 1e-6 * (double)(t).tv_usec)
// Option to change numerical precision: int_t for indices, real_t for field values
typedef int64_t int_t;
typedef double real_t;
// TASK: T1b
// Variables needed for implementation
// BEGIN: T1b
namespace cg = cooperative_groups;
// Properties of the device selected by init_cuda() (last enumerated device)
cudaDeviceProp device_prop;
// Device buffers for three time steps (previous/current/next), each allocated
// as (M+2)*(N+2) with one ghost row/column on every side for the boundary
real_t *d_U_prv, *d_U_cur, *d_U_nxt;
// Thread-block tile dimensions; the shared-memory tile is sized from these
#define BLOCK_SIZE_X 16
#define BLOCK_SIZE_Y 16
// Can't index past the ghost points, because device only allows
// indexing with unsigned ints
// The +1 offsets shift logical index (-1..M, -1..N) into the padded buffer;
// row stride is the padded width (d_N+2)
#define D_U_PRV(i,j) d_U_prv[((i+1))*(d_N+2)+(j+1)]
#define D_U_CUR(i,j) d_U_cur[((i+1))*(d_N+2)+(j+1)]
#define D_U_NXT(i,j) d_U_nxt[((i+1))*(d_N+2)+(j+1)]
// Per-block shared tile, padded by one ghost cell per side (local indices -1..BLOCK_SIZE)
#define SHARED_U(i,j) shared_U[(i+1)*(BLOCK_SIZE_Y+2)+(j+1)]
// Simulation parameters: size, step count, and how often to save the state
int_t
N = 128,
M = 128,
max_iteration = 1000000,
snapshot_freq = 1000;
// Wave equation parameters, time step is derived from the space step
const real_t
c = 1.0,
dx = 1.0,
dy = 1.0;
real_t
dt;
// Declare the grid dimensions as constants on the device
// Since we don't specify grid dimensions as arguments when running the program,
// we could just set them as constexpr variables on the host side, but I used __constant__ memory instead
// to use more CUDA features.
__constant__ int_t d_N;
__constant__ int_t d_M;
__constant__ int_t d_snapshot_freq;
__constant__ real_t d_dt, d_c, d_dx, d_dy;
// dim3 blockDim(16, 16);
// dim3 gridDim((N + blockDim.x - 1) / blockDim.x, (M + blockDim.y - 1) / blockDim.y);
// We only need the current time step on the host side
real_t *h_U_cur;
// Host-side accessor with the same ghost-cell offset convention as D_U_CUR
#define h_U(i,j) h_U_cur[((i)+1)*(N+2)+(j)+1]
// END: T1b
// Wrap every CUDA runtime call so failures report the call site before exiting.
#define cudaErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Print a human-readable CUDA error with file/line; exits with the error code
// unless abort is explicitly passed as false.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
// Rotate the host-side view of the three device buffers one step forward:
// prv takes cur, cur takes nxt, and the old prv becomes the scratch nxt
// buffer that the next time step will overwrite.
void move_buffer_window( void ) {
    real_t *recycled = d_U_prv;
    d_U_prv = d_U_cur;
    d_U_cur = d_U_nxt;
    d_U_nxt = recycled;
}
// Device-side counterpart of move_buffer_window: rotates the caller's three
// buffer pointers in place (prv <- cur, cur <- nxt, nxt <- old prv).
// Note this swaps the caller's local pointer copies only; it has no effect on
// the host-side globals.
__device__ void move_buffer_window_device(real_t** U_prv, real_t** U_cur, real_t** U_nxt) {
    real_t* recycled = *U_prv;
    *U_prv = *U_cur;
    *U_cur = *U_nxt;
    *U_nxt = recycled;
}
// Save the present time step in a numbered file under 'data/'.
// Writes the N interior values of each of the M rows (ghost cells excluded)
// as raw real_t, one row per fwrite. Exits on failure to open the file.
void domain_save ( int_t step )
{
    char filename[256];
    // int_t is int64_t, which is not 'long' on every ABI (e.g. LLP64); cast to
    // long explicitly so the %ld conversion is well defined everywhere.
    // snprintf instead of sprintf to guard against truncation/overflow.
    snprintf ( filename, sizeof(filename), "data/%.5ld.dat", (long)step );
    FILE *out = fopen ( filename, "wb" );
    if ( out == NULL )
    {
        // A missing 'data/' directory is the typical cause; fail loudly
        // instead of crashing on fwrite(NULL).
        fprintf ( stderr, "Failed to open '%s' for writing\n", filename );
        exit ( EXIT_FAILURE );
    }
    for ( int_t i=0; i<M; i++ )
    {
        fwrite ( &h_U(i,0), sizeof(real_t), N, out );
    }
    fclose ( out );
}
// TASK: T4
// Get rid of all the memory allocations
// Releases the host snapshot buffer and the three device buffers.
void domain_finalize ( void )
{
// BEGIN: T4
    // Free the host memory and clear the pointer so a stray later use of the
    // h_U macro faults immediately instead of touching freed memory.
    free(h_U_cur);
    h_U_cur = NULL;
    // Free the device memory; cudaFree also surfaces any sticky error left by
    // earlier asynchronous work, so check the return codes.
    cudaErrorCheck(cudaFree(d_U_prv));
    cudaErrorCheck(cudaFree(d_U_cur));
    cudaErrorCheck(cudaFree(d_U_nxt));
    d_U_prv = d_U_cur = d_U_nxt = NULL;
// END: T4
}
// TASK: T6
// Mirror interior values into this block's shared-memory ghost cells for
// threads sitting on the global domain boundary (Neumann / reflective
// condition, e.g. U(-1,j) = U(1,j)).
// Reads global memory through D_U_CUR and writes only the block's padded
// shared tile. A thread on a global boundary is necessarily the first/last
// thread of its block in that direction, so local_i/local_j +/- 1 stays
// inside the (BLOCK_SIZE+2)-padded tile.
// Must be called before the block-wide barrier that precedes the stencil.
__device__ void apply_boundary_conditions_global(real_t *d_U_cur, real_t* shared_U) {
int global_i = blockIdx.x * blockDim.x + threadIdx.x;
int global_j = blockIdx.y * blockDim.y + threadIdx.y;
int local_i = threadIdx.x;
int local_j = threadIdx.y;
// Neumann (reflective) boundary condition
if (global_i < d_M && global_j < d_N) {
// West boundary: ghost column -1 mirrors column 1
if (global_j == 0) {
SHARED_U(local_i, local_j-1) = D_U_CUR(global_i, global_j + 1);
// D_U_CUR(global_i, global_j -1) = D_U_CUR(global_i, global_j + 1);
}
// East boundary: ghost column N mirrors column N-2
if (global_j == d_N - 1) {
SHARED_U(local_i, local_j+1) = D_U_CUR(global_i, global_j - 1);
// D_U_CUR(global_i, d_N) = D_U_CUR(global_i, d_N - 2);
}
// North boundary: ghost row -1 mirrors row 1
if (global_i == 0) {
SHARED_U(local_i-1, local_j) = D_U_CUR(global_i+1, global_j);
// D_U_CUR(global_i-1, global_j) = D_U_CUR(global_i+1, global_j);
}
// South boundary: ghost row M mirrors row M-2
if (global_i == d_M - 1) {
SHARED_U(local_i+1, local_j) = D_U_CUR(global_i-1, global_j);
// D_U_CUR(d_M, global_j) = D_U_CUR(d_M - 2, global_j);
}
}
}
// END: T6
// TASK: T5
// Advance the wave field d_snapshot_freq time steps in one cooperative launch.
// Expects a 2D grid of BLOCK_SIZE_X x BLOCK_SIZE_Y blocks covering the M x N
// interior, launched with cudaLaunchCooperativeKernel (grid.sync() requires a
// cooperative launch). Each block stages the current step in a padded shared
// tile, applies the 5-point stencil, then all threads rotate their local
// buffer pointers before a grid-wide barrier.
__global__ void time_step_kernel(real_t* d_U_prv, real_t* d_U_cur, real_t* d_U_nxt) {
    // Block-local tile of the current step, one ghost cell on every side
    __shared__ real_t shared_U[(BLOCK_SIZE_X+2)*(BLOCK_SIZE_Y+2)];
    int global_i = blockIdx.x * blockDim.x + threadIdx.x;
    int global_j = blockIdx.y * blockDim.y + threadIdx.y;
    int local_i = threadIdx.x;
    int local_j = threadIdx.y;
    // True for threads that map to a real grid point (the tail blocks of a
    // grid that does not divide M/N evenly contain out-of-range threads)
    bool in_domain = (global_i < d_M && global_j < d_N);
    cg::thread_block block = cg::this_thread_block();
    cg::grid_group grid = cg::this_grid();
    // Loop over the number of time steps to take between snapshots
    for (int iter = 0; iter < d_snapshot_freq; ++iter) {
        if (in_domain) {
            // Load the current time step into shared memory
            SHARED_U(local_i, local_j) = D_U_CUR(global_i, global_j);
            // Load the ghost points for border blocks that are not global boundaries
            if (local_i == 0 && global_i > 0) {
                SHARED_U(local_i-1, local_j) = D_U_CUR(global_i-1, global_j);
            }
            if (local_i == blockDim.x - 1 && global_i < d_M - 1) {
                SHARED_U(local_i+1, local_j) = D_U_CUR(global_i+1, global_j);
            }
            if (local_j == 0 && global_j > 0) {
                SHARED_U(local_i, local_j-1) = D_U_CUR(global_i, global_j-1);
            }
            if (local_j == blockDim.y - 1 && global_j < d_N - 1) {
                SHARED_U(local_i, local_j+1) = D_U_CUR(global_i, global_j+1);
            }
            // Apply boundary conditions for the global boundaries
            apply_boundary_conditions_global(d_U_cur, shared_U);
        }
        // BUG FIX: both barriers were previously inside the in_domain guard,
        // i.e. inside divergent control flow. That is undefined behaviour (and
        // deadlocks grid.sync()) whenever the grid does not divide M x N
        // exactly; it only happened to work because 128 is a multiple of 16.
        // Barriers must be reached by every thread, so they sit outside the
        // guard.
        block.sync();
        // Perform the time step calculation (5-point stencil)
        if (in_domain) {
            D_U_NXT(global_i, global_j) = -D_U_PRV(global_i, global_j) + 2.0 * SHARED_U(local_i, local_j)
                + (d_dt * d_dt * d_c * d_c) / (d_dx * d_dy) * (
                    SHARED_U(local_i-1,local_j) + SHARED_U(local_i+1,local_j)
                    + SHARED_U(local_i,local_j-1) + SHARED_U(local_i,local_j+1)
                    - 4.0 * SHARED_U(local_i,local_j)
                );
        }
        // Every thread rotates its own local copies of the buffer pointers so
        // all copies stay consistent across iterations.
        move_buffer_window_device(&d_U_prv, &d_U_cur, &d_U_nxt);
        // Grid-wide barrier so no block starts the next step before all
        // neighbours have written their D_U_NXT values.
        grid.sync();
    }
}
// END: T5
// TASK: T7
// Main simulation driver: saves a snapshot every snapshot_freq steps and runs
// one cooperative kernel launch (snapshot_freq device-side steps) per period.
void simulate(void) {
    dim3 blockDim(BLOCK_SIZE_X, BLOCK_SIZE_Y);
    dim3 gridDim((N + BLOCK_SIZE_X - 1) / BLOCK_SIZE_X, (M + BLOCK_SIZE_Y - 1) / BLOCK_SIZE_Y);
    for (int_t iteration = 0; iteration <= max_iteration; iteration += snapshot_freq) {
        // cudaMemcpy on the default stream blocks until the previous launch
        // has finished, so d_U_cur is consistent here.
        cudaErrorCheck(cudaMemcpy(h_U_cur, d_U_cur, (M + 2) * (N + 2) * sizeof(real_t), cudaMemcpyDeviceToHost));
        domain_save(iteration / snapshot_freq);
        // The final snapshot has just been written; launching once more would
        // compute snapshot_freq steps that are never copied back or saved.
        if (iteration == max_iteration) {
            break;
        }
        // BUG FIX: the kernel takes exactly three pointer parameters; the old
        // fourth entry (&snapshot_freq) did not match the signature — the step
        // count reaches the device via __constant__ d_snapshot_freq instead.
        void* kernel_args[] = { (void*)&d_U_prv, (void*)&d_U_cur, (void*)&d_U_nxt };
        cudaErrorCheck(cudaLaunchCooperativeKernel(
            (void*)time_step_kernel,
            gridDim,
            blockDim,
            kernel_args
        ));
        // The device rotated its local pointer copies snapshot_freq times;
        // rotating three buffers is a 3-cycle, so the host only needs the net
        // rotation (snapshot_freq mod 3) to stay in sync.
        for (int i = 0; i < snapshot_freq % 3; ++i) {
            move_buffer_window();
        }
    }
    // Surface any asynchronous execution error before returning to the timer.
    cudaErrorCheck(cudaDeviceSynchronize());
}
// END: T7
// END: T7
// TASK: T8
// GPU occupancy
// Report the launch configuration and the theoretical occupancy of
// time_step_kernel on device 0.
void occupancy( void )
{
// BEGIN: T8
    cudaDeviceProp deviceProp;
    cudaErrorCheck(cudaGetDeviceProperties(&deviceProp, 0));
    // BUG FIX: time_step_kernel declares its tile as a *static* __shared__
    // array, which the occupancy calculator already accounts for from the
    // kernel image. The old code passed the tile size as *dynamic* shared
    // memory (and stored it in a real_t where size_t is expected), which
    // double-counted shared memory and under-reported occupancy.
    size_t dynamic_smem = 0;
    dim3 blockDim(BLOCK_SIZE_X, BLOCK_SIZE_Y);
    dim3 gridDim((N + BLOCK_SIZE_X - 1) / BLOCK_SIZE_X, (M + BLOCK_SIZE_Y - 1) / BLOCK_SIZE_Y);
    int maxActiveBlocks;
    cudaErrorCheck(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &maxActiveBlocks, (void *)time_step_kernel, BLOCK_SIZE_X * BLOCK_SIZE_Y, dynamic_smem
    ));
    // Occupancy = resident warps per SM / maximum warps per SM
    int maxWarpsPerMultiprocessor = deviceProp.maxThreadsPerMultiProcessor / deviceProp.warpSize;
    int activeWarps = maxActiveBlocks * (BLOCK_SIZE_X * BLOCK_SIZE_Y / deviceProp.warpSize);
    float occupancy = (float)activeWarps / maxWarpsPerMultiprocessor;
    printf("Grid size set to: %d.\n", gridDim.x * gridDim.y * gridDim.z);
    printf("Launched blocks of size: %d.\n", blockDim.x * blockDim.y * blockDim.z);
    printf("Theoretical occupancy: %f\n", occupancy);
// END: T8
}
// TASK: T2
// Make sure at least one CUDA-capable device exists
// Enumerates all CUDA devices, prints their properties, and selects the last
// one (device_prop is left holding the selected device's properties).
// Returns false if no device is found or any query fails.
static bool init_cuda()
{
// BEGIN: T2
    // Check the number of CUDA-capable devices.
    int device_count = 0;
    cudaError_t error = cudaGetDeviceCount(&device_count);
    if (error != cudaSuccess || device_count == 0) {
        std::cerr << "No CUDA-compatible device found or failed to get device count: "
                  << cudaGetErrorString(error) << std::endl;
        return false;
    }
    std::cout << "Number of CUDA-compatible devices: " << device_count << std::endl;
    // Iterate through devices and select a suitable one.
    for (int device = 0; device < device_count; ++device) {
        error = cudaGetDeviceProperties(&device_prop, device);
        if (error != cudaSuccess) {
            std::cerr << "Failed to get properties for device " << device << ": "
                      << cudaGetErrorString(error) << std::endl;
            return false;
        }
        // Print information about the device (similar to Figure 3).
        std::cout << "Device " << device << ": " << device_prop.name << std::endl;
        std::cout << "  Compute capability: " << device_prop.major << "." << device_prop.minor << std::endl;
        std::cout << "  Total global memory: " << device_prop.totalGlobalMem / (1024 * 1024) << " MB" << std::endl;
        std::cout << "  Shared memory per block: " << device_prop.sharedMemPerBlock / 1024 << " KB" << std::endl;
        std::cout << "  Registers per block: " << device_prop.regsPerBlock << std::endl;
        std::cout << "  Warp size: " << device_prop.warpSize << std::endl;
        std::cout << "  Max threads per block: " << device_prop.maxThreadsPerBlock << std::endl;
        std::cout << "  Max threads dimensions: [" << device_prop.maxThreadsDim[0] << ", "
                  << device_prop.maxThreadsDim[1] << ", " << device_prop.maxThreadsDim[2] << "]" << std::endl;
        std::cout << "  Max grid size: [" << device_prop.maxGridSize[0] << ", "
                  << device_prop.maxGridSize[1] << ", " << device_prop.maxGridSize[2] << "]" << std::endl;
        // Select the device (you can customize which one to select).
        // BUG FIX: the return value was previously ignored; a failed
        // cudaSetDevice would make every later CUDA call fail mysteriously.
        error = cudaSetDevice(device);
        if (error != cudaSuccess) {
            std::cerr << "Failed to select device " << device << ": "
                      << cudaGetErrorString(error) << std::endl;
            return false;
        }
    }
    return true;
// END: T2
}
// TASK: T3
// Set up our three buffers, and fill two with an initial perturbation
// Allocates the host snapshot buffer and the three device buffers, seeds the
// initial Gaussian perturbation, derives dt from the CFL condition, and
// uploads the simulation constants to device __constant__ memory.
void domain_initialize ( void )
{
// BEGIN: T3
    bool locate_cuda = init_cuda();
    if (!locate_cuda)
    {
        exit( EXIT_FAILURE );
    }
    // We only need the current time step on the host.
    // BUG FIX: use calloc (not malloc) so the ghost rows/columns start as
    // deterministic zeros — the old code copied uninitialized memory to the
    // device. Also check for allocation failure before writing through h_U.
    h_U_cur = (real_t *) calloc ( (M+2)*(N+2), sizeof(real_t) );
    if ( h_U_cur == NULL )
    {
        fprintf ( stderr, "Failed to allocate host buffer\n" );
        exit ( EXIT_FAILURE );
    }
    for ( int_t i=0; i<M; i++ )
    {
        for ( int_t j=0; j<N; j++ )
        {
            // Calculate delta (radial distance) adjusted for M x N grid
            real_t delta = sqrt ( ((i - M/2.0) * (i - M/2.0)) / (real_t)M +
                                  ((j - N/2.0) * (j - N/2.0)) / (real_t)N );
            h_U(i,j) = exp ( -4.0*delta*delta );
        }
    }
    // Set the time step for 2D case (CFL-stable for the 5-point stencil)
    dt = dx*dy / (c * sqrt (dx*dx+dy*dy));
    // Allocate device memory for the three time steps
    cudaErrorCheck(cudaMalloc((void **)&d_U_prv, (M+2)*(N+2)*sizeof(real_t)));
    cudaErrorCheck(cudaMalloc((void **)&d_U_cur, (M+2)*(N+2)*sizeof(real_t)));
    cudaErrorCheck(cudaMalloc((void **)&d_U_nxt, (M+2)*(N+2)*sizeof(real_t)));
    // Copy the initial conditions from host to device for d_U_prv and d_U_cur
    cudaErrorCheck(cudaMemcpy(d_U_prv, h_U_cur, (M+2)*(N+2)*sizeof(real_t), cudaMemcpyHostToDevice));
    cudaErrorCheck(cudaMemcpy(d_U_cur, h_U_cur, (M+2)*(N+2)*sizeof(real_t), cudaMemcpyHostToDevice));
    // Copy the grid size constants to the device constant memory using cudaMemcpyToSymbol
    cudaErrorCheck(cudaMemcpyToSymbol(d_N, &N, sizeof(int_t)));
    cudaErrorCheck(cudaMemcpyToSymbol(d_M, &M, sizeof(int_t)));
    // Copy the wave equation parameters to the device constant memory using cudaMemcpyToSymbol
    cudaErrorCheck(cudaMemcpyToSymbol(d_dt, &dt, sizeof(real_t)));
    cudaErrorCheck(cudaMemcpyToSymbol(d_c, &c, sizeof(real_t)));
    cudaErrorCheck(cudaMemcpyToSymbol(d_dx, &dx, sizeof(real_t)));
    cudaErrorCheck(cudaMemcpyToSymbol(d_dy, &dy, sizeof(real_t)));
    cudaErrorCheck(cudaMemcpyToSymbol(d_snapshot_freq, &snapshot_freq, sizeof(int_t)));
// END: T3
}
// Program entry point: initialize the domain, run and time the simulation,
// report occupancy, then release all resources.
int main ( void )
{
    // Set up the initial state of the domain (host + device buffers)
    domain_initialize();

    // Wall-clock the whole simulation loop
    struct timeval t_start, t_end;
    gettimeofday ( &t_start, NULL );
    simulate();
    gettimeofday ( &t_end, NULL );

    double elapsed = WALLTIME(t_end) - WALLTIME(t_start);
    printf ( "Total elapsed time: %lf seconds\n", elapsed );

    occupancy();

    // Clean up and shut down
    domain_finalize();
    exit ( EXIT_SUCCESS );
}