|
//! Optimized alpha blending routines based on libwebp.
//!
//! Reference implementation:
//! <https://github.com/webmproject/libwebp/blob/e4f7a9f0c7c9fbfae1568bc7fa5c94b989b50872/src/demux/anim_decode.c#L215-L267>
| 4 | +
|
/// Returns the bit offset of the `i`-th 8-bit channel inside a packed `u32` pixel.
///
/// Channel 0 sits in the lowest byte, so the offset is `i` bytes expressed in bits.
fn channel_shift(i: u32) -> u32 {
    const BITS_PER_CHANNEL: u32 = 8;
    BITS_PER_CHANNEL * i
}
| 8 | + |
| 9 | +/// Blend a single channel of `src` over `dst`, given their alpha channel values. |
| 10 | +/// `src` and `dst` are assumed to be NOT pre-multiplied by alpha. |
| 11 | +fn blend_channel_nonpremult( |
| 12 | + src: u32, |
| 13 | + src_a: u8, |
| 14 | + dst: u32, |
| 15 | + dst_a: u8, |
| 16 | + scale: u32, |
| 17 | + shift: u32, |
| 18 | +) -> u8 { |
| 19 | + let src_channel = ((src >> shift) & 0xff) as u8; |
| 20 | + let dst_channel = ((dst >> shift) & 0xff) as u8; |
| 21 | + let blend_unscaled = (src_channel as u32 * src_a as u32) + (dst_channel as u32 * dst_a as u32); |
| 22 | + debug_assert!(u64::from(blend_unscaled) < (1u64 << 32) / scale as u64); |
| 23 | + ((blend_unscaled * scale) >> channel_shift(3)) as u8 |
| 24 | +} |
| 25 | + |
| 26 | +/// Blend `src` over `dst` assuming they are NOT pre-multiplied by alpha. |
| 27 | +fn blend_pixel_nonpremult(src: u32, dst: u32) -> u32 { |
| 28 | + let src_a = ((src >> channel_shift(3)) & 0xff) as u8; |
| 29 | + |
| 30 | + if src_a == 0 { |
| 31 | + dst |
| 32 | + } else { |
| 33 | + let dst_a = ((dst >> channel_shift(3)) & 0xff) as u8; |
| 34 | + // Approximate integer arithmetic for: dst_factor_a = (dst_a * (255 - src_a)) / 255 |
| 35 | + // libwebp used the following formula here: |
| 36 | + //let dst_factor_a = (dst_a as u32 * (256 - src_a as u32)) >> 8; |
| 37 | + // however, we've found that we can use a more precise approximation without losing performance: |
| 38 | + let dst_factor_a = div_by_255(dst_a as u32 * (255 - src_a as u32)); |
| 39 | + let blend_a = src_a as u32 + dst_factor_a; |
| 40 | + let scale = (1u32 << 24) / blend_a; |
| 41 | + |
| 42 | + let blend_r = |
| 43 | + blend_channel_nonpremult(src, src_a, dst, dst_factor_a as u8, scale, channel_shift(0)); |
| 44 | + let blend_g = |
| 45 | + blend_channel_nonpremult(src, src_a, dst, dst_factor_a as u8, scale, channel_shift(1)); |
| 46 | + let blend_b = |
| 47 | + blend_channel_nonpremult(src, src_a, dst, dst_factor_a as u8, scale, channel_shift(2)); |
| 48 | + debug_assert!(src_a as u32 + dst_factor_a < 256); |
| 49 | + |
| 50 | + ((blend_r as u32) << channel_shift(0)) |
| 51 | + | ((blend_g as u32) << channel_shift(1)) |
| 52 | + | ((blend_b as u32) << channel_shift(2)) |
| 53 | + | (blend_a << channel_shift(3)) |
| 54 | + } |
| 55 | +} |
| 56 | + |
| 57 | +pub(crate) fn do_alpha_blending(buffer: [u8; 4], canvas: [u8; 4]) -> [u8; 4] { |
| 58 | + // The original C code contained different shift functions for different endianness, |
| 59 | + // but they didn't work when ported to Rust directly (and probably didn't work in C either). |
| 60 | + // So instead we reverse the order of bytes on big-endian here, at the interface. |
| 61 | + // `from_le_bytes` is a no-op on little endian (most systems) and a cheap shuffle on big endian. |
| 62 | + blend_pixel_nonpremult(u32::from_le_bytes(buffer), u32::from_le_bytes(canvas)).to_le_bytes() |
| 63 | +} |
| 64 | + |
/// Divides by 255, rounding to the nearest integer (regular integer division rounds down).
/// TODO: cannot output 256, so the output is effectively u8. Plumb that through the code.
//
// Sources:
// https://arxiv.org/pdf/2202.02864
// https://github.com/image-rs/image-webp/issues/119#issuecomment-2544007820
#[inline]
fn div_by_255(v: u32) -> u32 {
    // Adding 0x80 up front is what turns the truncating shifts into round-to-nearest.
    let biased = v + 0x80;
    (biased + (biased >> 8)) >> 8
}
| 75 | + |
#[cfg(test)]
mod tests {
    use super::*;

    /// Floating-point blend of `buffer` over `canvas`, used as the ground truth that the
    /// optimized integer implementation is compared against.
    fn do_alpha_blending_reference(buffer: [u8; 4], canvas: [u8; 4]) -> [u8; 4] {
        let src_alpha = f64::from(buffer[3]);
        let dst_alpha = f64::from(canvas[3]);
        let out_alpha_exact = src_alpha + dst_alpha * (1.0 - src_alpha / 255.0);
        // The value lies between 0 and 255; the cast truncates the fractional part.
        let out_alpha = out_alpha_exact as u8;

        // Colour channels stay zero when the blended pixel is fully transparent.
        let mut out = [0, 0, 0, out_alpha];
        if out_alpha != 0 {
            for ((slot, &src), &dst) in out.iter_mut().zip(&buffer).zip(&canvas).take(3) {
                let src_f64 = f64::from(src);
                let dst_f64 = f64::from(dst);
                let blended = (src_f64 * src_alpha
                    + dst_f64 * dst_alpha * (1.0 - src_alpha / 255.0))
                    / out_alpha_exact;
                // The value lies between 0 and 255; the cast truncates the fractional part.
                *slot = blended as u8;
            }
        }

        out
    }

    /// Compares the optimized blend against the reference for one colour channel over a grid
    /// of channel and alpha values, allowing a small per-byte deviation.
    // NOTE(review): the ranges below are half-open, so the value 255 is never exercised, and
    // alpha starts at 11 (presumably the tolerance is exceeded below that) - confirm before
    // widening them.
    #[test]
    #[ignore] // takes too long to run on CI. Run this locally when changing the function.
    fn alpha_blending_optimization() {
        // libwebp doesn't do exact blending and so we don't either
        const TOLERANCE: u8 = 3;

        for r1 in 0..u8::MAX {
            for a1 in 11..u8::MAX {
                let top = [r1, 0, 0, a1];
                for r2 in 0..u8::MAX {
                    for a2 in 11..u8::MAX {
                        let bottom = [r2, 0, 0, a2];
                        let opt = do_alpha_blending(top, bottom);
                        let slow = do_alpha_blending_reference(top, bottom);
                        let within_tolerance = opt
                            .iter()
                            .zip(&slow)
                            .all(|(o, s)| o.abs_diff(*s) <= TOLERANCE);
                        assert!(within_tolerance, "Mismatch in results! opt: {opt:?}, slow: {slow:?}, blended values: [{r1}, 0, 0, {a1}], [{r2}, 0, 0, {a2}]");
                    }
                }
            }
        }
    }
}
0 commit comments