Skip to content

Commit baad13c

Browse files
Avoid image copy in VNG OpenCL code
We had some image copy operations in the VNG OpenCL processing path. They have been removed for performance; no additional cl_mem is used. Minor renaming and additional comments make the code easier to understand.
1 parent 818055e commit baad13c

File tree

1 file changed

+43
-39
lines changed

1 file changed

+43
-39
lines changed

src/iop/demosaicing/vng.c

Lines changed: 43 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ static void vng_interpolate(float *out,
150150
const int height,
151151
const uint32_t filters,
152152
const uint8_t (*const xtrans)[6],
153-
const gboolean only_vng_linear)
153+
const gboolean only_linear)
154154
{
155155
static const signed char terms[]
156156
= { -2, -2, +0, -1, 1, 0x01, -2, -2, +0, +0, 2, 0x01, -2, -1, -1, +0, 1, 0x01, -2, -1, +0, -1, 1, 0x02,
@@ -194,7 +194,7 @@ static void vng_interpolate(float *out,
194194
lin_interpolate(out, in, width, height, filters4, xtrans);
195195

196196
// if only linear interpolation is requested we can stop it here
197-
if(only_vng_linear)
197+
if(only_linear)
198198
{
199199
if(is_bayer) goto bayer_greens;
200200
else return;
@@ -332,7 +332,7 @@ static cl_int process_vng_cl(const dt_iop_module_t *self,
332332
const int width,
333333
const int height,
334334
const uint32_t filters,
335-
const gboolean only_vng_linear)
335+
const gboolean only_linear)
336336
{
337337
const dt_iop_demosaic_global_data_t *gd = self->global_data;
338338
const gboolean is_xtrans = (filters == 9u);
@@ -419,7 +419,7 @@ static cl_int process_vng_cl(const dt_iop_module_t *self,
419419
static const signed char chood[]
420420
= { -1, -1, -1, 0, -1, +1, 0, +1, +1, +1, +1, 0, +1, -1, 0, -1 };
421421

422-
if(!only_vng_linear)
422+
if(!only_linear)
423423
{
424424
const size_t ips_size = (size_t)prow * pcol * 352 * sizeof(int);
425425
ips = malloc(ips_size);
@@ -485,43 +485,55 @@ static cl_int process_vng_cl(const dt_iop_module_t *self,
485485
dev_lookup = dt_opencl_copy_host_to_device_constant(devid, lookup_size, lookup);
486486
if(dev_lookup == NULL) goto finish;
487487

488-
dev_tmp = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4);
489-
if(dev_tmp == NULL) goto finish;
490-
491-
cl_mem tmp_out = only_vng_linear ? dev_tmp : dev_out;
488+
// Only xtrans only-linear does not require a tmp buffer and can render directly to out
489+
const gboolean linear_xtrans = is_xtrans && only_linear;
490+
if(!linear_xtrans)
491+
{
492+
dev_tmp = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4);
493+
if(dev_tmp == NULL) goto finish;
494+
}
492495

493-
// manage borders for linear interpolation part
496+
/* We don't want any copy of data so we fiddle a bit
497+
linear xtrans: directly to out
498+
linear bayer: first to dev_tmp, greens always take dev_tmp to out
499+
full xtrans: first to tmp, then directly to out
500+
full bayer: first to out, then to tmp, greens take tmp to out
501+
*/
502+
const gboolean full_bayer = !is_xtrans && !only_linear;
503+
const gboolean lin_to_out = linear_xtrans || full_bayer;
504+
cl_mem dev_lin_out = lin_to_out ? dev_out : dev_tmp;
505+
cl_mem dev_full_out = is_xtrans ? dev_out : dev_tmp;
506+
507+
// write border **before** linear interpolation
494508
int border = 1;
495509
err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_vng_border_interpolate, width, height,
496-
CLARG(dev_in), CLARG(tmp_out), CLARG(width), CLARG(height), CLARG(border),
510+
CLARG(dev_in), CLARG(dev_lin_out), CLARG(width), CLARG(height), CLARG(border),
497511
CLARG(filters4), CLARG(dev_xtrans));
498512
if(err != CL_SUCCESS) goto finish;
499513

500-
{
501-
// do linear interpolation
502-
dt_opencl_local_buffer_t locopt
514+
// do linear interpolation
515+
dt_opencl_local_buffer_t locopt_lin
503516
= (dt_opencl_local_buffer_t){ .xoffset = 2*1, .xfactor = 1, .yoffset = 2*1, .yfactor = 1,
504517
.cellsize = 1 * sizeof(float), .overhead = 0,
505518
.sizex = 1 << 8, .sizey = 1 << 8 };
506-
507-
if(!dt_opencl_local_buffer_opt(devid, gd->kernel_vng_lin_interpolate, &locopt))
508-
{
509-
err = CL_INVALID_WORK_DIMENSION;
510-
goto finish;
511-
}
512-
size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
513-
size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
514-
dt_opencl_set_kernel_args(devid, gd->kernel_vng_lin_interpolate, 0,
515-
CLARG(dev_in), CLARG(tmp_out),
516-
CLARG(width), CLARG(height), CLARG(filters4), CLARG(dev_lookup), CLLOCAL(sizeof(float) * (locopt.sizex + 2) * (locopt.sizey + 2)));
517-
err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_vng_lin_interpolate, sizes, local);
518-
if(err != CL_SUCCESS) goto finish;
519+
if(!dt_opencl_local_buffer_opt(devid, gd->kernel_vng_lin_interpolate, &locopt_lin))
520+
{
521+
err = CL_INVALID_WORK_DIMENSION;
522+
goto finish;
519523
}
520524

521-
if(only_vng_linear)
525+
size_t sizes_lin[3] = { ROUNDUP(width, locopt_lin.sizex), ROUNDUP(height, locopt_lin.sizey), 1 };
526+
size_t local_lin[3] = { locopt_lin.sizex, locopt_lin.sizey, 1 };
527+
dt_opencl_set_kernel_args(devid, gd->kernel_vng_lin_interpolate, 0,
528+
CLARG(dev_in), CLARG(dev_lin_out),
529+
CLARG(width), CLARG(height), CLARG(filters4), CLARG(dev_lookup), CLLOCAL(sizeof(float) * (locopt_lin.sizex + 2) * (locopt_lin.sizey + 2)));
530+
err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_vng_lin_interpolate, sizes_lin, local_lin);
531+
if(err != CL_SUCCESS) goto finish;
532+
533+
if(only_linear)
522534
goto backcopy;
523535

524-
// do full VNG interpolation; linear data is in dev_out
536+
// do full VNG interpolation; linear data is in dev_lin_out
525537
dt_opencl_local_buffer_t locopt
526538
= (dt_opencl_local_buffer_t){ .xoffset = 2*2, .xfactor = 1, .yoffset = 2*2, .yfactor = 1,
527539
.cellsize = 4 * sizeof(float), .overhead = 0,
@@ -535,31 +547,23 @@ static cl_int process_vng_cl(const dt_iop_module_t *self,
535547
size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
536548
size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
537549
dt_opencl_set_kernel_args(devid, gd->kernel_vng_interpolate, 0,
538-
CLARG(dev_out), CLARG(dev_tmp),
550+
CLARG(dev_lin_out), CLARG(dev_full_out),
539551
CLARG(width), CLARG(height), CLARG(filters4),
540552
CLARG(dev_xtrans), CLARG(dev_ips), CLARG(dev_code), CLLOCAL(sizeof(float) * 4 * (locopt.sizex + 4) * (locopt.sizey + 4)));
541553
err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_vng_interpolate, sizes, local);
542554
if(err != CL_SUCCESS) goto finish;
543555

544-
// manage borders
556+
// overwrite border as 2nd outermost pixels were not interpolated
545557
border = 2;
546558
err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_vng_border_interpolate, width, height,
547-
CLARG(dev_in), CLARG(dev_tmp), CLARG(width), CLARG(height), CLARG(border),
559+
CLARG(dev_in), CLARG(dev_full_out), CLARG(width), CLARG(height), CLARG(border),
548560
CLARG(filters4), CLARG(dev_xtrans));
549561
if(err != CL_SUCCESS) goto finish;
550562

551563
backcopy:
552564
if(!is_xtrans)
553-
{
554565
err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_vng_green_equilibrate, width, height,
555566
CLARG(dev_tmp), CLARG(dev_out), CLARG(width), CLARG(height));
556-
}
557-
else
558-
{
559-
size_t origin[] = { 0, 0, 0 };
560-
size_t region[] = { width, height, 1 };
561-
err = dt_opencl_enqueue_copy_image(devid, dev_tmp, dev_out, origin, origin, region);
562-
}
563567

564568
finish:
565569
dt_opencl_release_mem_object(dev_tmp);

0 commit comments

Comments
 (0)