MSc_parallel_prog_2025_lab9/reduction.cu at main · DobDan42/MSc_parallel_prog_2025_lab9 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <chrono>
#include <iostream>


//This is a little wrapper that checks for error codes returned by CUDA API calls
#define checkCuda(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

////////////////////////////////////////////////////////////////////////////////
// CPU routines
////////////////////////////////////////////////////////////////////////////////

double reduction_gold(double* idata, const unsigned int len)
{
  double sum = 0;
  for(int i=0; i<len; i++) sum += idata[i];
  return sum;
}

////////////////////////////////////////////////////////////////////////////////
// GPU routines
////////////////////////////////////////////////////////////////////////////////

__global__ void reduction_atomic(double *g_odata, double *g_idata, int num_elements)
{
  int id = threadIdx.x + blockIdx.x * blockDim.x;
  if (id<num_elements){
    double my_value = g_idata[id];
    atomicAdd(g_odata, my_value);
  }
}
__global__ void reduction_shared1(double *g_odata, double *g_idata, int num_elements)
{
  __shared__ double local[256]; //shared by threads in the same block
  int id = threadIdx.x + blockIdx.x * blockDim.x;
  local[threadIdx.x] = id < num_elements? g_idata[id] : 0.0; //because last block may have threads that extend over the size of the array
  // In this kernel, each block has one thread that sums up all values.
  //wait for all threads to copy their data, since their order with warps may be scheduled randomly.
  __syncthreads();

  if (threadIdx.x == 0){
    double sum=0;
    for (int i = 0 ; i< 256; i++){
      sum+= local[i];
    }
    atomicAdd(g_odata, sum);
  }
}

__global__ void reduction_assoc_logtime(double *g_odata, double *g_idata, int num_elements)
{
  __shared__ double local[256]; //shared by threads in the same block
  int id = threadIdx.x + blockIdx.x * blockDim.x;
  local[threadIdx.x] = id < num_elements? g_idata[id] : 0.0; //because last block may have threads that extend over the size of the array

  //wait for all threads to copy their data

  for (int d=blockDim.x/2; d>0; d=d/2){
    __syncthreads();
    if (threadIdx.x < d) local[threadIdx.x] += local[d+threadIdx.x];
  }
  if (threadIdx.x == 0){
    atomicAdd(g_odata, local[0]);
  }
}


////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////

int main( int argc, const char** argv)
{
  int num_elements;

  double *h_data, reference;
  double *d_idata, *d_odata;

  num_elements = 1<<25;

  // allocate host memory to store the input data
  // and initialize to integer values

  h_data = new double[num_elements];

  for(int i = 0; i < num_elements; i++)
    h_data[i] = i;

  // compute reference solutions
  auto t1 = std::chrono::high_resolution_clock::now();

  reference = reduction_gold(h_data, num_elements);

  auto t2 = std::chrono::high_resolution_clock::now();


  auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1);
  // allocate device memory input and output arrays

  checkCuda( cudaMalloc((void**)&d_idata, (size_t)num_elements * sizeof(double)) );
  checkCuda( cudaMalloc((void**)&d_odata, sizeof(double)) );

  // copy host memory to device input array

  checkCuda( cudaMemcpy(d_idata, h_data, (size_t)num_elements * sizeof(double),
                              cudaMemcpyHostToDevice) );

  // execute the kernel
  dim3 threads(256);
  dim3 blocks((num_elements-1)/256+1);

  auto gpu_t1 = std::chrono::high_resolution_clock::now();
  reduction_atomic<<<blocks, threads>>>(d_odata, d_idata, num_elements);
  checkCuda(cudaDeviceSynchronize());
  auto gpu_t2 = std::chrono::high_resolution_clock::now();
  auto duration_gpu = std::chrono::duration_cast<std::chrono::milliseconds>(gpu_t2 - gpu_t1);
  auto duration_gpu_sharednano = std::chrono::duration_cast<std::chrono::microseconds>(gpu_t2 - gpu_t1);

  checkCuda(cudaMemset(d_odata,0,sizeof(double))); // zero out the sum

  gpu_t1 = std::chrono::high_resolution_clock::now();
  reduction_shared1<<<blocks, threads>>>(d_odata, d_idata, num_elements);
  checkCuda(cudaDeviceSynchronize());
  gpu_t2 = std::chrono::high_resolution_clock::now();
  auto duration_gpu_shared = std::chrono::duration_cast<std::chrono::microseconds>(gpu_t2 - gpu_t1);


  checkCuda(cudaMemset(d_odata,0,sizeof(double))); // zero out the sum

  gpu_t1 = std::chrono::high_resolution_clock::now();
  reduction_assoc_logtime<<<blocks, threads>>>(d_odata, d_idata, num_elements);
  checkCuda(cudaDeviceSynchronize());
  gpu_t2 = std::chrono::high_resolution_clock::now();
  auto duration_gpu_assoc = std::chrono::duration_cast<std::chrono::microseconds>(gpu_t2 - gpu_t1);


  // copy result from device to host

  checkCuda( cudaMemcpy(h_data, d_odata, sizeof(double),
                              cudaMemcpyDeviceToHost) );

  // check results
  printf("CPU time (ms) = %lld \n",  duration.count());

  printf("GPU time (ms) = %lld \n, and in (us) = %lld \n",  duration_gpu.count(), duration_gpu_sharednano.count());

  printf("GPU shared time (us) = %lld \n",  duration_gpu_shared.count());

  printf("GPU associativity time (us) = %lld \n",  duration_gpu_assoc.count());

  printf("reduction error = %f\n",h_data[0]-reference);

  // cleanup memory

  delete[] h_data;
  checkCuda( cudaFree(d_idata) );
  checkCuda( cudaFree(d_odata) );

  // CUDA exit -- needed to flush printf write buffer

  cudaDeviceReset();
}