|
| 1 | +/* |
| 2 | + * ===================================================================================== |
| 3 | + * |
| 4 | + * Filename: hello.cu |
| 5 | + * |
| 6 | + * Description: CUDA test |
| 7 | + * |
| 8 | + * Version: 1.0 |
| 9 | + * Created: 02/06/2012 03:54:42 PM |
| 10 | + * Revision: none |
| 11 | + * Compiler: gcc |
| 12 | + * |
| 13 | + * Author: Milan Kabat (), [email protected] |
| 14 | + * Company: FI MUNI |
| 15 | + * |
| 16 | + * ===================================================================================== |
| 17 | + */ |
| 18 | + |
| 19 | +#include <stdio.h> |
| 20 | +#include <cuda.h> |
| 21 | + |
| 22 | +#include "timer-util.h" |
| 23 | +#include "gpu.cuh" |
| 24 | + |
| 25 | +#define INTS_PER_THREAD 4 |
| 26 | + |
| 27 | +void make_compact_pcm ( char* pc_matrix, int* pcm, struct coding_params params ); |
| 28 | + |
| 29 | +void gpu_encode ( char* source_data, char* parity, int* pc_matrix, struct coding_params ); |
| 30 | + |
| 31 | +void gpu_decode ( char* received, int* error_vec, char* pc_matrix, struct coding_params ); |
| 32 | + |
/*
 * Encoding kernel: computes one parity packet per block.
 *
 * Launch layout (see gpu_encode):
 *   gridDim.x  = params.m                 -- one block per parity packet
 *   blockDim.x = packet_size/sizeof(int)/INTS_PER_THREAD
 *   dynamic shared memory = params.packet_size bytes
 *
 * pcm is the compact parity-check matrix: params.m rows of
 * (params.max_row_weight + 2) source-packet indices, -1 = unused slot.
 * Each thread XOR-accumulates INTS_PER_THREAD ints of every referenced
 * source packet into its private slice of shared memory, then writes the
 * result out as this block's parity packet.
 */
__global__ void encode ( int* data, int* parity, int* pcm, struct coding_params params )
{
    // Per-thread partial XOR result; each thread owns the slice
    // [threadIdx.x*INTS_PER_THREAD, ...+INTS_PER_THREAD). No slice is
    // shared between threads, so no __syncthreads() is required.
    extern __shared__ int shared_mem[];

    int num_packets = params.max_row_weight;
    int p_size = params.packet_size / sizeof(int);   // packet size in ints

    // Shared memory is UNDEFINED on kernel entry -- it must be zeroed
    // before accumulating. (The original version instead reset it after
    // use, assuming contents persist across launches, which CUDA does
    // not guarantee.)
    for ( int j = 0; j < INTS_PER_THREAD; j++ )
        shared_mem [ threadIdx.x*INTS_PER_THREAD + j ] = 0;

    // XOR every referenced source packet into the partial result.
    // Row stride is num_packets+2: max_row_weight data slots plus two
    // staircase-matrix slots (see make_compact_pcm).
    for ( int i = 0; i < num_packets; i++ )
    {
        int index = pcm [ blockIdx.x*(num_packets+2) + i ];

        // Only indices in [0, k) name source packets; -1 is an unused
        // slot and indices >= k belong to the staircase part.
        if ( index > -1 && index < params.k )
        {
            for ( int j = 0; j < INTS_PER_THREAD; j++ )
            {
                shared_mem [ threadIdx.x*INTS_PER_THREAD + j ] ^=
                    data [ index*p_size + threadIdx.x*INTS_PER_THREAD + j ];
            }
        }
    }

    // Publish this block's parity packet. Each thread writes only the
    // ints it accumulated itself, so no barrier is needed.
    for ( int j = 0; j < INTS_PER_THREAD; j++ )
    {
        parity [ blockIdx.x*p_size + threadIdx.x*INTS_PER_THREAD + j ] =
            shared_mem [ threadIdx.x*INTS_PER_THREAD + j ];
    }
}
| 68 | + |
/*
 * Decoding kernel -- NOT IMPLEMENTED.
 *
 * The entire body below is commented-out work in progress; as compiled,
 * this kernel performs no work. The sketch suggests the plan was: load
 * each block's row of the compact parity-check matrix into shared memory,
 * XOR the surviving neighbour packets to recover lost ones, and write the
 * repaired packets back into `received` (NOTE(review): the draft even
 * contains a typo, `num_beighbours`, and references an undeclared `pkts`
 * array -- it would not compile if uncommented as-is).
 */
__global__ void decode ( int* received, int* error_vec, int* pcm, struct coding_params params )
{
/* extern __shared__ int shared_mem[];
 *
 * int num_neighbours = params.max_row_weight + 2;
 *
 * int p_size = params.packet_size/sizeof(int);
 * int num_threads = p_size/4;
 *
 * //load the neighbouring packets into shared memory
 * int idx = blockIdx.x*blockDim.x + threadIdx.x;
 * for ( int i = 0; i < num_beighbours; i++ )
 * {
 *     if ( threadIdx.x == 0 )
 *         shared_mem[i] = pcm [ blockIdx.x*num_beighbours + i ];
 *
 *     __syncthreads();
 *
 *     if ( shared_mem[i] != -1 )
 *     {
 * //        shared_mem [ num_neighbours + threadIdx.x*4 + i ] =
 *     }
 * }
 *
 * __syncthreads();
 * for ( int i = 0; i < (params.packet_size/sizeof(int))/num_threads; i++ )
 *     received [ (params.k + blockIdx.x)*p_size + threadIdx.x*4 + i] =
 *         pkts [ idx + i ];
 *
 * __syncthreads();
 */

// for ( int i = 0; i < params.packet_size/sizeof(int); i++ )
//     received [ params.k*p_size + blockIdx.x*p_size + idx + i ] = pkts [ idx + i ];

}
| 111 | + |
/*
 * Host-side driver for the encode kernel.
 *
 * source_data: k source packets (k * packet_size bytes, host memory)
 * parity:      output buffer for m parity packets (m * packet_size bytes)
 * pcm:         compact parity-check matrix, m rows of
 *              (max_row_weight + 2) ints (host memory)
 * params:      coding parameters (k, m, packet_size, max_row_weight)
 *
 * Copies inputs to the device, launches one block per parity packet,
 * and copies the computed parity back into `parity`. Errors are reported
 * to stdout but not propagated (interface kept as in the original).
 */
void gpu_encode ( char* source_data, char* parity, int* pcm, struct coding_params params )
{
    int* src_data_d;      // device copy of the k source packets
    int* parity_data_d;   // device buffer for the m parity packets
    int* pcm_d;           // device copy of the compact parity-check matrix
    short show_info = 0;  // set non-zero to dump device properties

    cudaError_t cuda_error;

    struct cudaDeviceProp dev_prop;

    cudaGetDeviceProperties (&dev_prop, 0);

    if (show_info)
    {
        if (!dev_prop.canMapHostMemory)
            printf("Cannot map host memory.\n");
        printf ( "name: %s\n", dev_prop.name );
        // divide BEFORE narrowing so totalGlobalMem > 4 GB is not truncated
        printf ( "totalGlobalMem: %u MB\n", (unsigned int)(dev_prop.totalGlobalMem/(1024*1024)) );
        printf ( "sharedMemPerBlock: %u kB\n", (unsigned int)(dev_prop.sharedMemPerBlock/1024) );
        printf ( "maxThreadsPerBlock: %d\n", dev_prop.maxThreadsPerBlock );
        printf ( "maxThreadsDim: %d\n", dev_prop.maxThreadsDim[0] );
        printf ( "maxThreadsDim: %d\n", dev_prop.maxThreadsDim[1] );
        printf ( "maxThreadsDim: %d\n", dev_prop.maxThreadsDim[2] );
        printf ( "maxGridSize: %d\n", dev_prop.maxGridSize[0] );
    }

    cuda_error = cudaMalloc ( (void**) &src_data_d, params.k*params.packet_size);
    if ( cuda_error != cudaSuccess )
        printf ( "cudaMalloc returned %s\n", cudaGetErrorString (cuda_error) );

    cuda_error = cudaMalloc ( (void**) &parity_data_d, params.m*params.packet_size);
    if ( cuda_error != cudaSuccess )
        printf ( "cudaMalloc returned %s\n", cudaGetErrorString (cuda_error) );

    cuda_error = cudaMemset ( parity_data_d, 0, params.m*params.packet_size);
    if ( cuda_error != cudaSuccess )
        printf ( "cudaMemset returned %s\n", cudaGetErrorString (cuda_error) );

    cuda_error = cudaMemcpy ( src_data_d, source_data, params.k*params.packet_size,
                              cudaMemcpyHostToDevice );
    if ( cuda_error != cudaSuccess )
        printf ( "cudaMemcpy returned %s\n", cudaGetErrorString (cuda_error) );

    // one row per parity packet, max_row_weight data slots + 2 staircase slots
    cuda_error = cudaMalloc ( (void**) &pcm_d, params.m*(params.max_row_weight+2)*sizeof(int));
    if ( cuda_error != cudaSuccess )
        printf ( "cudaMalloc returned %s\n", cudaGetErrorString (cuda_error) );

    cuda_error = cudaMemcpy ( pcm_d, pcm, sizeof(int)*params.m*(params.max_row_weight+2),
                              cudaMemcpyHostToDevice );
    if ( cuda_error != cudaSuccess )
        printf ( "cudaMemcpy returned %s\n", cudaGetErrorString (cuda_error) );

    cuda_error = cudaDeviceSynchronize();
    if ( cuda_error != cudaSuccess )
        printf ( "cudaDeviceSynchronize returned %s\n", cudaGetErrorString (cuda_error) );

    // each thread handles INTS_PER_THREAD ints of one packet;
    // one block computes one parity packet
    int block_size = (params.packet_size / sizeof(int))/INTS_PER_THREAD;
    int block_count = params.m;

    // dynamic shared memory: one packet's worth of partial results per block
    int num_bytes_shared = params.packet_size;

    encode <<< block_count, block_size, num_bytes_shared >>> (src_data_d, parity_data_d,
                                                              pcm_d, params );

    // launch-configuration errors surface here ...
    cuda_error = cudaGetLastError();
    if ( cuda_error != cudaSuccess )
        printf("kernel execution returned %s\n", cudaGetErrorString (cuda_error));

    // ... execution errors at the next sync (cudaThreadSynchronize() is
    // deprecated; cudaDeviceSynchronize() is its replacement)
    cudaDeviceSynchronize();

    cudaMemcpy ( parity, parity_data_d, params.m*params.packet_size, cudaMemcpyDeviceToHost );
    cuda_error = cudaGetLastError();
    if ( cuda_error != cudaSuccess )
        printf("cudaMemcpy from device returned %s\n", cudaGetErrorString (cuda_error));

    cudaFree(src_data_d);
    cudaFree(parity_data_d);
    cudaFree(pcm_d);   // was leaked in the original version

}
| 200 | + |
/*
 * Host-side driver for the decode kernel -- NOT IMPLEMENTED.
 *
 * The entire body below is commented-out work in progress; as compiled,
 * this function does nothing and `received` is returned unmodified.
 * The sketch mirrors gpu_encode (allocate/copy device buffers, launch
 * `decode`, copy results back), but NOTE(review): it calls
 * make_compact_pcm with a two-argument signature that does not match the
 * three-argument prototype above, and copies `pcm` into `pcm_d` where the
 * error vector was presumably intended -- it would need fixing before
 * being uncommented.
 */
void gpu_decode ( char* received, int* error_vec, char* pc_matrix, struct coding_params params )
{
/* int* received_d;
 * int* pcm_d;
 * int* error_vec_d;
 * cudaError_t cuda_error;
 *
 * int k = params.k;
 * int m = params.m;
 * int packet_size = params.packet_size;
 *
 * int **pcm = make_compact_pcm ( pc_matrix, params );
 *
 * //allocate space and copy data to device
 * cuda_error = cudaMalloc ( (void**) &received_d, (k+m)*packet_size);
 * if ( cuda_error != cudaSuccess )
 *     printf ( "cudaMalloc return %d\n", cuda_error );
 *
 * cuda_error = cudaMemcpy ( received_d, received, (k+m)*packet_size, cudaMemcpyHostToDevice );
 * if ( cuda_error != cudaSuccess )
 *     printf ( "cudaMempcy return %d\n", cuda_error );
 *
 * cuda_error = cudaMalloc ( (void**) &pcm_d, m*params.max_row_weight*sizeof(int));
 * if ( cuda_error != cudaSuccess )
 *     printf ( "cudaMalloc return %d\n", cuda_error );
 *
 * cuda_error = cudaMemcpy ( pcm_d, pcm, sizeof(int)*m*params.max_row_weight,
 *                           cudaMemcpyHostToDevice );
 * if ( cuda_error != cudaSuccess )
 *     printf ( "cudaMempcy return %d\n", cuda_error );
 *
 * cuda_error = cudaMalloc ( (void**) &error_vec_d, params.num_lost*sizeof(int));
 * if ( cuda_error != cudaSuccess )
 *     printf ( "cudaMalloc return %d\n", cuda_error );
 *
 * cuda_error = cudaMemcpy ( pcm_d, pcm, params.num_lost*sizeof(int),
 *                           cudaMemcpyHostToDevice );
 * if ( cuda_error != cudaSuccess )
 *     printf ( "cudaMempcy pcm return %d\n", cuda_error );
 *
 * int block_size = (packet_size/sizeof(int)) / 4;
 * int block_count = m;
 * int shared_mem_size = (packet_size + sizeof(int))*(params.max_row_weight+2);
 *
 * decode <<< block_count, block_size, shared_mem_size >>> (received_d, error_vec, pcm_d, params );
 *
 * cuda_error = cudaMemcpy ( received, received_d, (k+m)*packet_size, cudaMemcpyDeviceToHost );
 * if ( cuda_error != cudaSuccess )
 *     printf ( "cudaMempcy from device return %d\n", cuda_error );
 *
 * cudaFree ( received_d );
 */
}
| 254 | + |
| 255 | +/* void make_compact_pcm ( char* pc_matrix, int* pcm, struct coding_params params) |
| 256 | + * { |
| 257 | + * //we need to create a compact representation of sparse pc_matrix |
| 258 | + * |
| 259 | + * int counter = 0; |
| 260 | + * int columns = params.max_row_weight + 2; |
| 261 | + * |
| 262 | + * for ( int i = 0; i < params.m; i++) { |
| 263 | + * for ( int j = 0; j < params.k; j++) |
| 264 | + * if ( pc_matrix[i*params.k + j] ) |
| 265 | + * { |
| 266 | + * pcm[i*columns + counter] = j; |
| 267 | + * counter++; |
| 268 | + * } |
| 269 | + * //add indices from staircase matrix |
| 270 | + * pcm[i*columns + counter] = params.k + i; |
| 271 | + * counter++; |
| 272 | + * |
| 273 | + * if ( i > 0 ) |
| 274 | + * { |
| 275 | + * pcm[i*columns + counter] = params.k + i - 1; |
| 276 | + * counter++; |
| 277 | + * } |
| 278 | + * |
| 279 | + * if ( counter < columns ) |
| 280 | + * for ( int j = counter; j < columns; j++) |
| 281 | + * pcm[i*columns + j] = -1; |
| 282 | + * counter = 0; |
| 283 | + * } |
| 284 | + * |
| 285 | + * |
| 286 | + * for ( int i = 0; i < params.m; i++) |
| 287 | + * { |
| 288 | + * for ( int j = 0; j < columns; j++ ) |
| 289 | + * printf ( "%d, ", pcm[i*columns + j] ); |
| 290 | + * printf ( "\n" ); |
| 291 | + * } |
| 292 | + * |
| 293 | + * |
| 294 | + * } |
| 295 | + */ |
0 commit comments