Skip to content

Commit 318e6b8

Browse files
Some minor VNG OpenCL performance improvements
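1. Subtle improvements to the VNG border_interpolate kernel: replace the avgwindow constant with #define AVGWINDOW, and use samplerA where the coordinates have already been checked.
2. In the OpenCL VNG code, don't calculate the full-VNG tables or copy them to device buffers when they are not required, i.e. when only the linear interpolation part is done.
3. Use a vectorized clamp-to-zero copy (_copy_abovezero).
4. Fix the capture log message.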
1 parent a19a9f2 commit 318e6b8

File tree

3 files changed

+70
-72
lines changed

3 files changed

+70
-72
lines changed

data/kernels/demosaic_vng.cl

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
This file is part of darktable,
3-
Copyright (C) 2016-2025 darktable developers.
3+
Copyright (C) 2016-2026 darktable developers.
44
55
darktable is free software: you can redistribute it and/or modify
66
it under the terms of the GNU General Public License as published by
@@ -19,6 +19,8 @@
1919
#include "common.h"
2020

2121
kernel void
22+
#define AVGWINDOW 1
23+
2224
vng_border_interpolate(read_only image2d_t in,
2325
write_only image2d_t out,
2426
const int width,
@@ -33,21 +35,20 @@ vng_border_interpolate(read_only image2d_t in,
3335
if(x >= width || y >= height) return;
3436

3537
const int colors = (filters == 9) ? 3 : 4;
36-
const int avgwindow = 1;
3738

3839
if(x >= border && x < width-border && y >= border && y < height-border) return;
3940

4041
float o[4] = { 0.0f };
4142
float sum[4] = { 0.0f };
4243
int count[4] = { 0 };
4344

44-
for(int j = y-avgwindow; j <= y+avgwindow; j++)
45-
for(int i = x-avgwindow; i <= x+avgwindow; i++)
45+
for(int j = y-AVGWINDOW; j <= y+AVGWINDOW; j++)
46+
for(int i = x-AVGWINDOW; i <= x+AVGWINDOW; i++)
4647
{
4748
if(j >= 0 && i >= 0 && j < height && i < width)
4849
{
4950
const int f = fcol(j, i, filters, xtrans);
50-
sum[f] += fmax(0.0f, read_imagef(in, sampleri, (int2)(i, j)).x);
51+
sum[f] += fmax(0.0f, read_imagef(in, samplerA, (int2)(i, j)).x);
5152
count[f]++;
5253
}
5354
}
@@ -118,7 +119,7 @@ vng_lin_interpolate(read_only image2d_t in, write_only image2d_t out, const int
118119
float o[4] = { 0.0f };
119120

120121
global const int *ip = lookup[y % size][x % size];
121-
int num_pixels = ip[0];
122+
const int num_pixels = ip[0];
122123
ip++;
123124

124125
// for each adjoining pixel not of this pixel's color, sum up its weighted values
@@ -239,7 +240,7 @@ vng_interpolate(read_only image2d_t in, write_only image2d_t out, const int widt
239240
return;
240241
}
241242

242-
float thold = gmin + (gmax * 0.5f);
243+
const float thold = gmin + (gmax * 0.5f);
243244
float sum[4] = { 0.0f };
244245
const int color = fcol(y, x, filters, xtrans);
245246
int num = 0;
@@ -298,7 +299,7 @@ vng_green_equilibrate(read_only image2d_t in, write_only image2d_t out, const in
298299

299300
if(x >= width || y >= height) return;
300301

301-
float4 pixel = read_imagef(in, sampleri, (int2)(x , y));
302+
float4 pixel = read_imagef(in, samplerA, (int2)(x , y));
302303

303304
pixel.y = (pixel.y + pixel.w) / 2.0f;
304305
pixel.w = 0.0f;

src/iop/demosaicing/capture.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
This file is part of darktable,
3-
Copyright (C) 2025 darktable developers.
3+
Copyright (C) 2026 darktable developers.
44
55
darktable is free software: you can redistribute it and/or modify
66
it under the terms of the GNU General Public License as published by
@@ -731,7 +731,7 @@ static void _capture_radius(dt_iop_module_t *self,
731731

732732
dt_print_pipe(DT_DEBUG_PIPE, filters != 9u ? "bayer autoradius" : "xtrans autoradius",
733733
pipe, self, DT_DEVICE_NONE, NULL, NULL,
734-
"%sradius=%.2f from %s image data is %s reliable",
734+
"%sradius=%.2f from %s image data is %sreliable",
735735
same_radius ? "unchanged" : "", radius,
736736
enough ? "enough" : "small",
737737
reliable ? "" : "not ");

src/iop/demosaicing/vng.c

Lines changed: 59 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
This file is part of darktable,
3-
Copyright (C) 2010-2025 darktable developers.
3+
Copyright (C) 2010-2026 darktable developers.
44
55
darktable is free software: you can redistribute it and/or modify
66
it under the terms of the GNU General Public License as published by
@@ -137,10 +137,11 @@ static void lin_interpolate(float *out,
137137
I've extended the basic idea to work with non-Bayer filter arrays.
138138
Gradients are numbered clockwise from NW=0 to W=7.
139139
*/
140-
static inline void _ensure_abovezero(float *to, float *from, const int floats)
140+
static void _copy_abovezero(float *to, float *from, const int pixels)
141141
{
142-
for(int i = 0; i < floats; i++)
143-
to[i] = fmaxf(0.0f, from[i]);
142+
static dt_aligned_pixel_t zero = { 0.0f, 0.0f, 0.0f, 0.0f};
143+
for(int i = 0; i < pixels; i++)
144+
dt_vector_max(&to[i*4], zero, &from[i*4]);
144145
}
145146

146147
static void vng_interpolate(float *out,
@@ -295,14 +296,14 @@ static void vng_interpolate(float *out,
295296
}
296297
}
297298
if(row > 3) /* Write buffer to image */
298-
_ensure_abovezero(out + 4 * ((row - 2) * width + 2), (float *)(brow[0] + 2), 4 * (width - 4));
299+
_copy_abovezero(out + 4 * ((row - 2) * width + 2), (float *)(brow[0] + 2), width - 4);
299300

300301
// rotate ring buffer
301302
for(int g = 0; g < 4; g++) brow[(g - 1) & 3] = brow[g];
302303
}
303304
// copy the final two rows to the image
304-
_ensure_abovezero(out + (4 * ((height - 4) * width + 2)), (float *)(brow[0] + 2), 4 * (width - 4));
305-
_ensure_abovezero(out + (4 * ((height - 3) * width + 2)), (float *)(brow[1] + 2), 4 * (width - 4));
305+
_copy_abovezero(out + (4 * ((height - 4) * width + 2)), (float *)(brow[0] + 2), width - 4);
306+
_copy_abovezero(out + (4 * ((height - 3) * width + 2)), (float *)(brow[1] + 2), width - 4);
306307
dt_free_align(buffer);
307308

308309
if(filters != 9 && !FILTERS_ARE_4BAYER(filters)) // x-trans or CYGM/RGBE
@@ -337,7 +338,7 @@ static cl_int process_vng_cl(const dt_iop_module_t *self,
337338
else
338339
filters4 = filters | 0x0c0c0c0cu;
339340

340-
const int size = (filters4 == 9u) ? 6 : 16;
341+
const int lsize = (filters4 == 9u) ? 6 : 16;
341342
const int colors = (filters4 == 9u) ? 3 : 4;
342343
const int prow = (filters4 == 9u) ? 6 : 8;
343344
const int pcol = (filters4 == 9u) ? 6 : 2;
@@ -351,53 +352,46 @@ static cl_int process_vng_cl(const dt_iop_module_t *self,
351352
cl_mem dev_ips = NULL;
352353
cl_int err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
353354

354-
int32_t(*lookup)[16][32] = NULL;
355-
356-
357-
// build interpolation lookup table for linear interpolation which for a given offset in the sensor
358-
// lists neighboring pixels from which to interpolate:
359-
// NUM_PIXELS # of neighboring pixels to read
360-
// for(1..NUM_PIXELS):
361-
// OFFSET # in bytes from current pixel
362-
// WEIGHT # how much weight to give this neighbor
363-
// COLOR # sensor color
364-
// # weights of adjoining pixels not of this pixel's color
365-
// COLORA TOT_WEIGHT
366-
// COLORB TOT_WEIGHT
367-
// COLORPIX # color of center pixel
368-
const size_t lookup_size = (size_t)16 * 16 * 32 * sizeof(int32_t);
369-
lookup = malloc(lookup_size);
370-
371-
for(int row = 0; row < size; row++)
372-
for(int col = 0; col < size; col++)
355+
const size_t lookup_size = (size_t)16 * 16 * 32 * sizeof(int32_t);
356+
int32_t(*lookup)[16][32] = malloc(lookup_size);
357+
if(!lookup) goto finish;
358+
// build interpolation lookup table for linear interpolation which for a given offset in the sensor
359+
// lists neighboring pixels from which to interpolate:
360+
for(int row = 0; row < lsize; row++)
361+
{
362+
for(int col = 0; col < lsize; col++)
363+
{
364+
int32_t *ip = &(lookup[row][col][1]);
365+
int sum[4] = { 0 };
366+
const int f = fcol(row, col, filters4, xtrans);
367+
// make list of adjoining pixel offsets by weight & color
368+
for(int y = -1; y <= 1; y++)
373369
{
374-
int32_t *ip = &(lookup[row][col][1]);
375-
int sum[4] = { 0 };
376-
const int f = fcol(row, col, filters4, xtrans);
377-
// make list of adjoining pixel offsets by weight & color
378-
for(int y = -1; y <= 1; y++)
379-
for(int x = -1; x <= 1; x++)
380-
{
381-
const int weight = 1 << ((y == 0) + (x == 0));
382-
const int color = fcol(row + y, col + x, filters4, xtrans);
383-
if(color == f) continue;
384-
*ip++ = (y << 16) | (x & 0xffffu);
385-
*ip++ = weight;
386-
*ip++ = color;
387-
sum[color] += weight;
388-
}
389-
lookup[row][col][0] = (ip - &(lookup[row][col][0])) / 3; /* # of neighboring pixels found */
390-
for(int c = 0; c < colors; c++)
391-
if(c != f)
392-
{
393-
*ip++ = c;
394-
*ip++ = sum[c];
395-
}
396-
*ip = f;
370+
for(int x = -1; x <= 1; x++)
371+
{
372+
const int weight = 1 << ((y == 0) + (x == 0));
373+
const int color = fcol(row + y, col + x, filters4, xtrans);
374+
if(color == f) continue;
375+
*ip++ = (y << 16) | (x & 0xffffu);
376+
*ip++ = weight;
377+
*ip++ = color;
378+
sum[color] += weight;
379+
}
380+
}
381+
lookup[row][col][0] = (ip - &(lookup[row][col][0])) / 3; /* # of neighboring pixels found */
382+
for(int c = 0; c < colors; c++)
383+
{
384+
if(c != f)
385+
{
386+
*ip++ = c;
387+
*ip++ = sum[c];
388+
}
397389
}
390+
*ip = f;
391+
}
392+
}
398393

399-
// Precalculate for VNG
400-
static const signed char terms[]
394+
static const signed char terms[]
401395
= { -2, -2, +0, -1, 1, 0x01, -2, -2, +0, +0, 2, 0x01, -2, -1, -1, +0, 1, 0x01, -2, -1, +0, -1, 1, 0x02,
402396
-2, -1, +0, +0, 1, 0x03, -2, -1, +0, +1, 2, 0x01, -2, +0, +0, -1, 1, 0x06, -2, +0, +0, +0, 2, 0x02,
403397
-2, +0, +0, +1, 1, 0x03, -2, +1, -1, +0, 1, 0x04, -2, +1, +0, -1, 2, 0x04, -2, +1, +0, +0, 1, 0x06,
@@ -414,16 +408,19 @@ static cl_int process_vng_cl(const dt_iop_module_t *self,
414408
+0, +0, +2, +2, 2, 0x10, +0, +1, +1, +0, 1, 0x44, +0, +1, +1, +2, 1, 0x10, +0, +1, +2, -1, 2, 0x40,
415409
+0, +1, +2, +0, 1, 0x60, +0, +1, +2, +1, 1, 0x20, +0, +1, +2, +2, 1, 0x10, +1, -2, +1, +0, 1, 0x80,
416410
+1, -1, +1, +1, 1, 0x88, +1, +0, +1, +2, 1, 0x08, +1, +0, +2, -1, 1, 0x40, +1, +0, +2, +1, 1, 0x10 };
417-
static const signed char chood[]
411+
static const signed char chood[]
418412
= { -1, -1, -1, 0, -1, +1, 0, +1, +1, +1, +1, 0, +1, -1, 0, -1 };
419413

414+
if(!only_vng_linear)
415+
{
420416
const size_t ips_size = (size_t)prow * pcol * 352 * sizeof(int);
421417
ips = malloc(ips_size);
422418

423419
int *ip = ips;
424420
int code[16][16];
425421

426422
for(int row = 0; row < prow; row++)
423+
{
427424
for(int col = 0; col < pcol; col++)
428425
{
429426
code[row][col] = ip - ips;
@@ -468,20 +465,18 @@ static cl_int process_vng_cl(const dt_iop_module_t *self,
468465
}
469466
}
470467
}
468+
}
469+
470+
dev_code = dt_opencl_copy_host_to_device_constant(devid, sizeof(code), code);
471+
if(dev_code == NULL) goto finish;
471472

473+
dev_ips = dt_opencl_copy_host_to_device_constant(devid, ips_size, ips);
474+
if(dev_ips == NULL) goto finish;
475+
}
472476

473477
dev_lookup = dt_opencl_copy_host_to_device_constant(devid, lookup_size, lookup);
474478
if(dev_lookup == NULL) goto finish;
475479

476-
dev_code = dt_opencl_copy_host_to_device_constant(devid, sizeof(code), code);
477-
if(dev_code == NULL) goto finish;
478-
479-
dev_ips = dt_opencl_copy_host_to_device_constant(devid, ips_size, ips);
480-
if(dev_ips == NULL) goto finish;
481-
482-
// need to reserve scaled auxiliary buffer or use dev_out
483-
err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
484-
485480
// manage borders for linear interpolation part
486481
int border = 1;
487482
err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_vng_border_interpolate, width, height,
@@ -514,6 +509,8 @@ static cl_int process_vng_cl(const dt_iop_module_t *self,
514509
if(only_vng_linear)
515510
goto finish;
516511

512+
// need to reserve scaled auxiliary buffer or use dev_out
513+
err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
517514
// do full VNG interpolation
518515
dev_tmp = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4);
519516
if(dev_tmp == NULL) goto finish;

0 commit comments

Comments
 (0)