4040 */
4141
4242
43- #define BLKXSIZE 16
44- #define BLKYSIZE 16
43+ #define BLKXSIZE 8
44+ #define BLKYSIZE 4
4545#define idivup (a, b ) ( ((a)%(b) != 0 ) ? (a)/(b)+1 : (a)/(b) )
4646#define M_PI 3.14159265358979323846
4747#define EPS 1.0e-8
@@ -68,46 +68,56 @@ __device__ void swapUS(unsigned short *xp, unsigned short *yp)
6868__global__ void IndexSelect2D_5_kernel (float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
6969{
7070
71- long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2;
71+ long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2, ind ;
7272 float normsum;
7373
7474 float Weight_Vec[CONSTVECSIZE5];
7575 unsigned short ind_i[CONSTVECSIZE5];
7676 unsigned short ind_j[CONSTVECSIZE5];
7777
78+ for (ind=0 ; ind<CONSTVECSIZE5; ind++) {
79+ Weight_Vec[ind] = 0.0 ;
80+ ind_i[ind] = 0 ;
81+ ind_j[ind] = 0 ; }
82+
7883 int i = blockDim .x * blockIdx .x + threadIdx .x ;
7984 int j = blockDim .y * blockIdx .y + threadIdx .y ;
8085
81- long index = i*M+ j;
86+ long index = i + N* j;
8287
8388 counter = 0 ;
8489 for (i_m=-SearchWindow; i_m<=SearchWindow; i_m++) {
90+ i1 = i+i_m;
91+ if ((i1 >= 0 ) && (i1 < N)) {
8592 for (j_m=-SearchWindow; j_m<=SearchWindow; j_m++) {
86- i1 = i+i_m;
8793 j1 = j+j_m;
88- if (((i1 >= 0 ) && (i1 < N)) && (( j1 >= 0 ) && (j1 < M) )) {
94+ if ((j1 >= 0 ) && (j1 < M)) {
8995 normsum = 0 .0f ; counterG = 0 ;
9096 for (i_c=-SimilarWin; i_c<=SimilarWin; i_c++) {
97+ i2 = i1 + i_c;
98+ i3 = i + i_c;
99+ if ((i2 >= 0 ) && (i2 < N) && (i3 >= 0 ) && (i3 < N)) {
91100 for (j_c=-SimilarWin; j_c<=SimilarWin; j_c++) {
92- i2 = i1 + i_c;
93101 j2 = j1 + j_c;
94- i3 = i + i_c;
95102 j3 = j + j_c;
96- if (((i2 >= 0 ) && (i2 < N)) && ((j2 >= 0 ) && (j2 < M))) {
97- if (((i3 >= 0 ) && (i3 < N)) && ((j3 >= 0 ) && (j3 < M))) {
98- normsum += Eucl_Vec_d[counterG]*powf (Ad[i3*M + j3] - Ad[i2*M + j2], 2 );
103+ if ((j2 >= 0 ) && (j2 < M) && (j3 >= 0 ) && (j3 < M)) {
104+ normsum += Eucl_Vec_d[counterG]*powf (Ad[i3 + N*j3] - Ad[i2 + N*j2], 2 );
99105 counterG++;
100- }}
101- }}
106+ } /* if j2 j3*/
107+ }
108+ } /* if i2 i3*/
109+ }
102110 /* writing temporarily into vectors */
103111 if (normsum > EPS) {
104- Weight_Vec[counter] = __expf (-normsum/h2);
112+ Weight_Vec[counter] = expf (-normsum/h2);
105113 ind_i[counter] = i1;
106114 ind_j[counter] = j1;
107115 counter++;
108116 }
109- }
110- }}
117+ } /* if j1*/
118+ }
119+ } /* if i1*/
120+ }
111121
112122 /* do sorting to choose the most prominent weights [HIGH to LOW] */
113123 /* and re-arrange indices accordingly */
@@ -133,46 +143,56 @@ __global__ void IndexSelect2D_5_kernel(float *Ad, unsigned short *H_i_d, unsigne
133143__global__ void IndexSelect2D_7_kernel (float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
134144{
135145
136- long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2;
146+ long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2, ind ;
137147 float normsum;
138148
139149 float Weight_Vec[CONSTVECSIZE7];
140150 unsigned short ind_i[CONSTVECSIZE7];
141151 unsigned short ind_j[CONSTVECSIZE7];
142152
153+ for (ind=0 ; ind<CONSTVECSIZE7; ind++) {
154+ Weight_Vec[ind] = 0.0 ;
155+ ind_i[ind] = 0 ;
156+ ind_j[ind] = 0 ; }
157+
143158 int i = blockDim .x * blockIdx .x + threadIdx .x ;
144159 int j = blockDim .y * blockIdx .y + threadIdx .y ;
145160
146- long index = i*M+ j;
161+ long index = i + N* j;
147162
148163 counter = 0 ;
149164 for (i_m=-SearchWindow; i_m<=SearchWindow; i_m++) {
165+ i1 = i+i_m;
166+ if ((i1 >= 0 ) && (i1 < N)) {
150167 for (j_m=-SearchWindow; j_m<=SearchWindow; j_m++) {
151- i1 = i+i_m;
152168 j1 = j+j_m;
153- if (((i1 >= 0 ) && (i1 < N)) && (( j1 >= 0 ) && (j1 < M) )) {
169+ if ((j1 >= 0 ) && (j1 < M)) {
154170 normsum = 0 .0f ; counterG = 0 ;
155171 for (i_c=-SimilarWin; i_c<=SimilarWin; i_c++) {
172+ i2 = i1 + i_c;
173+ i3 = i + i_c;
174+ if ((i2 >= 0 ) && (i2 < N) && (i3 >= 0 ) && (i3 < N)) {
156175 for (j_c=-SimilarWin; j_c<=SimilarWin; j_c++) {
157- i2 = i1 + i_c;
158176 j2 = j1 + j_c;
159- i3 = i + i_c;
160177 j3 = j + j_c;
161- if (((i2 >= 0 ) && (i2 < N)) && ((j2 >= 0 ) && (j2 < M))) {
162- if (((i3 >= 0 ) && (i3 < N)) && ((j3 >= 0 ) && (j3 < M))) {
163- normsum += Eucl_Vec_d[counterG]*powf (Ad[i3*M + j3] - Ad[i2*M + j2], 2 );
178+ if ((j2 >= 0 ) && (j2 < M) && (j3 >= 0 ) && (j3 < M)) {
179+ normsum += Eucl_Vec_d[counterG]*powf (Ad[i3 + N*j3] - Ad[i2 + N*j2], 2 );
164180 counterG++;
165- }}
166- }}
181+ } /* if j2 j3*/
182+ }
183+ } /* if i2 i3*/
184+ }
167185 /* writing temporarily into vectors */
168186 if (normsum > EPS) {
169- Weight_Vec[counter] = __expf (-normsum/h2);
187+ Weight_Vec[counter] = expf (-normsum/h2);
170188 ind_i[counter] = i1;
171189 ind_j[counter] = j1;
172190 counter++;
173191 }
174- }
175- }}
192+ } /* if j1*/
193+ }
194+ } /* if i1*/
195+ }
176196
177197 /* do sorting to choose the most prominent weights [HIGH to LOW] */
178198 /* and re-arrange indices accordingly */
@@ -225,16 +245,16 @@ __global__ void IndexSelect2D_9_kernel(float *Ad, unsigned short *H_i_d, unsigne
225245 for (i_c=-SimilarWin; i_c<=SimilarWin; i_c++) {
226246 i2 = i1 + i_c;
227247 i3 = i + i_c;
228- // if ((i2 >= 0) && (i2 < N) && (i3 >= 0) && (i3 < N)) {
248+ if ((i2 >= 0 ) && (i2 < N) && (i3 >= 0 ) && (i3 < N)) {
229249 for (j_c=-SimilarWin; j_c<=SimilarWin; j_c++) {
230250 j2 = j1 + j_c;
231251 j3 = j + j_c;
232- // if ((j2 >= 0) && (j2 < M) && (j3 >= 0) && (j3 < M)) {
252+ if ((j2 >= 0 ) && (j2 < M) && (j3 >= 0 ) && (j3 < M)) {
233253 normsum += Eucl_Vec_d[counterG]*powf (Ad[i3 + N*j3] - Ad[i2 + N*j2], 2 );
234254 counterG++;
235- // } /*if j2 j3*/
255+ } /* if j2 j3*/
236256 }
237- // } /*if i2 i3*/
257+ } /* if i2 i3*/
238258 }
239259 /* writing temporarily into vectors */
240260 if (normsum > EPS) {
@@ -271,46 +291,56 @@ __global__ void IndexSelect2D_9_kernel(float *Ad, unsigned short *H_i_d, unsigne
271291__global__ void IndexSelect2D_11_kernel (float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
272292{
273293
274- long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2;
294+ long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2, ind ;
275295 float normsum;
276296
277297 float Weight_Vec[CONSTVECSIZE11];
278298 unsigned short ind_i[CONSTVECSIZE11];
279299 unsigned short ind_j[CONSTVECSIZE11];
280300
301+ for (ind=0 ; ind<CONSTVECSIZE11; ind++) {
302+ Weight_Vec[ind] = 0.0 ;
303+ ind_i[ind] = 0 ;
304+ ind_j[ind] = 0 ; }
305+
281306 int i = blockDim .x * blockIdx .x + threadIdx .x ;
282307 int j = blockDim .y * blockIdx .y + threadIdx .y ;
283308
284- long index = i*M+ j;
309+ long index = i + N* j;
285310
286311 counter = 0 ;
287312 for (i_m=-SearchWindow; i_m<=SearchWindow; i_m++) {
313+ i1 = i+i_m;
314+ if ((i1 >= 0 ) && (i1 < N)) {
288315 for (j_m=-SearchWindow; j_m<=SearchWindow; j_m++) {
289- i1 = i+i_m;
290316 j1 = j+j_m;
291- if (((i1 >= 0 ) && (i1 < N)) && (( j1 >= 0 ) && (j1 < M) )) {
317+ if ((j1 >= 0 ) && (j1 < M)) {
292318 normsum = 0 .0f ; counterG = 0 ;
293319 for (i_c=-SimilarWin; i_c<=SimilarWin; i_c++) {
320+ i2 = i1 + i_c;
321+ i3 = i + i_c;
322+ if ((i2 >= 0 ) && (i2 < N) && (i3 >= 0 ) && (i3 < N)) {
294323 for (j_c=-SimilarWin; j_c<=SimilarWin; j_c++) {
295- i2 = i1 + i_c;
296324 j2 = j1 + j_c;
297- i3 = i + i_c;
298325 j3 = j + j_c;
299- if (((i2 >= 0 ) && (i2 < N)) && ((j2 >= 0 ) && (j2 < M))) {
300- if (((i3 >= 0 ) && (i3 < N)) && ((j3 >= 0 ) && (j3 < M))) {
301- normsum += Eucl_Vec_d[counterG]*powf (Ad[i3*M + j3] - Ad[i2*M + j2], 2 );
326+ if ((j2 >= 0 ) && (j2 < M) && (j3 >= 0 ) && (j3 < M)) {
327+ normsum += Eucl_Vec_d[counterG]*powf (Ad[i3 + N*j3] - Ad[i2 + N*j2], 2 );
302328 counterG++;
303- }}
304- }}
329+ } /* if j2 j3*/
330+ }
331+ } /* if i2 i3*/
332+ }
305333 /* writing temporarily into vectors */
306334 if (normsum > EPS) {
307- Weight_Vec[counter] = __expf (-normsum/h2);
335+ Weight_Vec[counter] = expf (-normsum/h2);
308336 ind_i[counter] = i1;
309337 ind_j[counter] = j1;
310338 counter++;
311339 }
312- }
313- }}
340+ } /* if j1*/
341+ }
342+ } /* if i1*/
343+ }
314344
315345 /* do sorting to choose the most prominent weights [HIGH to LOW] */
316346 /* and re-arrange indices accordingly */
@@ -335,46 +365,56 @@ __global__ void IndexSelect2D_11_kernel(float *Ad, unsigned short *H_i_d, unsign
335365__global__ void IndexSelect2D_13_kernel (float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
336366{
337367
338- long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2;
368+ long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2, ind ;
339369 float normsum;
340370
341371 float Weight_Vec[CONSTVECSIZE13];
342372 unsigned short ind_i[CONSTVECSIZE13];
343373 unsigned short ind_j[CONSTVECSIZE13];
344374
375+ for (ind=0 ; ind<CONSTVECSIZE13; ind++) {
376+ Weight_Vec[ind] = 0.0 ;
377+ ind_i[ind] = 0 ;
378+ ind_j[ind] = 0 ; }
379+
345380 int i = blockDim .x * blockIdx .x + threadIdx .x ;
346381 int j = blockDim .y * blockIdx .y + threadIdx .y ;
347382
348- long index = i*M+ j;
383+ long index = i + N* j;
349384
350385 counter = 0 ;
351386 for (i_m=-SearchWindow; i_m<=SearchWindow; i_m++) {
387+ i1 = i+i_m;
388+ if ((i1 >= 0 ) && (i1 < N)) {
352389 for (j_m=-SearchWindow; j_m<=SearchWindow; j_m++) {
353- i1 = i+i_m;
354390 j1 = j+j_m;
355- if (((i1 >= 0 ) && (i1 < N)) && (( j1 >= 0 ) && (j1 < M) )) {
391+ if ((j1 >= 0 ) && (j1 < M)) {
356392 normsum = 0 .0f ; counterG = 0 ;
357393 for (i_c=-SimilarWin; i_c<=SimilarWin; i_c++) {
394+ i2 = i1 + i_c;
395+ i3 = i + i_c;
396+ if ((i2 >= 0 ) && (i2 < N) && (i3 >= 0 ) && (i3 < N)) {
358397 for (j_c=-SimilarWin; j_c<=SimilarWin; j_c++) {
359- i2 = i1 + i_c;
360398 j2 = j1 + j_c;
361- i3 = i + i_c;
362399 j3 = j + j_c;
363- if (((i2 >= 0 ) && (i2 < N)) && ((j2 >= 0 ) && (j2 < M))) {
364- if (((i3 >= 0 ) && (i3 < N)) && ((j3 >= 0 ) && (j3 < M))) {
365- normsum += Eucl_Vec_d[counterG]*powf (Ad[i3*M + j3] - Ad[i2*M + j2], 2 );
400+ if ((j2 >= 0 ) && (j2 < M) && (j3 >= 0 ) && (j3 < M)) {
401+ normsum += Eucl_Vec_d[counterG]*powf (Ad[i3 + N*j3] - Ad[i2 + N*j2], 2 );
366402 counterG++;
367- }}
368- }}
403+ } /* if j2 j3*/
404+ }
405+ } /* if i2 i3*/
406+ }
369407 /* writing temporarily into vectors */
370408 if (normsum > EPS) {
371- Weight_Vec[counter] = __expf (-normsum/h2);
409+ Weight_Vec[counter] = expf (-normsum/h2);
372410 ind_i[counter] = i1;
373411 ind_j[counter] = j1;
374412 counter++;
375413 }
376- }
377- }}
414+ } /* if j1*/
415+ }
416+ } /* if i1*/
417+ }
378418
379419 /* do sorting to choose the most prominent weights [HIGH to LOW] */
380420 /* and re-arrange indices accordingly */
0 commit comments