Skip to content

Commit 34223fa

Browse files
authored
Faster alpha blending (#123)
1 parent 833e6e7 commit 34223fa

File tree

5 files changed

+131
-28
lines changed

5 files changed

+131
-28
lines changed

src/alpha_blending.rs

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
//! Optimized alpha blending routines based on libwebp
2+
//!
3+
//! https://github.com/webmproject/libwebp/blob/e4f7a9f0c7c9fbfae1568bc7fa5c94b989b50872/src/demux/anim_decode.c#L215-L267
4+
5+
/// Returns the bit offset of channel number `i` inside a packed 32-bit pixel,
/// where every channel occupies 8 bits.
fn channel_shift(i: u32) -> u32 {
    const BITS_PER_CHANNEL: u32 = 8;
    i * BITS_PER_CHANNEL
}
8+
9+
/// Blend a single channel of `src` over `dst`, given their alpha channel values.
10+
/// `src` and `dst` are assumed to be NOT pre-multiplied by alpha.
11+
fn blend_channel_nonpremult(
12+
src: u32,
13+
src_a: u8,
14+
dst: u32,
15+
dst_a: u8,
16+
scale: u32,
17+
shift: u32,
18+
) -> u8 {
19+
let src_channel = ((src >> shift) & 0xff) as u8;
20+
let dst_channel = ((dst >> shift) & 0xff) as u8;
21+
let blend_unscaled = (src_channel as u32 * src_a as u32) + (dst_channel as u32 * dst_a as u32);
22+
debug_assert!(u64::from(blend_unscaled) < (1u64 << 32) / scale as u64);
23+
((blend_unscaled * scale) >> channel_shift(3)) as u8
24+
}
25+
26+
/// Blend `src` over `dst` assuming they are NOT pre-multiplied by alpha.
27+
fn blend_pixel_nonpremult(src: u32, dst: u32) -> u32 {
28+
let src_a = ((src >> channel_shift(3)) & 0xff) as u8;
29+
30+
if src_a == 0 {
31+
dst
32+
} else {
33+
let dst_a = ((dst >> channel_shift(3)) & 0xff) as u8;
34+
// Approximate integer arithmetic for: dst_factor_a = (dst_a * (255 - src_a)) / 255
35+
// libwebp used the following formula here:
36+
//let dst_factor_a = (dst_a as u32 * (256 - src_a as u32)) >> 8;
37+
// however, we've found that we can use a more precise approximation without losing performance:
38+
let dst_factor_a = div_by_255(dst_a as u32 * (255 - src_a as u32));
39+
let blend_a = src_a as u32 + dst_factor_a;
40+
let scale = (1u32 << 24) / blend_a;
41+
42+
let blend_r =
43+
blend_channel_nonpremult(src, src_a, dst, dst_factor_a as u8, scale, channel_shift(0));
44+
let blend_g =
45+
blend_channel_nonpremult(src, src_a, dst, dst_factor_a as u8, scale, channel_shift(1));
46+
let blend_b =
47+
blend_channel_nonpremult(src, src_a, dst, dst_factor_a as u8, scale, channel_shift(2));
48+
debug_assert!(src_a as u32 + dst_factor_a < 256);
49+
50+
((blend_r as u32) << channel_shift(0))
51+
| ((blend_g as u32) << channel_shift(1))
52+
| ((blend_b as u32) << channel_shift(2))
53+
| (blend_a << channel_shift(3))
54+
}
55+
}
56+
57+
pub(crate) fn do_alpha_blending(buffer: [u8; 4], canvas: [u8; 4]) -> [u8; 4] {
58+
// The original C code contained different shift functions for different endianness,
59+
// but they didn't work when ported to Rust directly (and probably didn't work in C either).
60+
// So instead we reverse the order of bytes on big-endian here, at the interface.
61+
// `from_le_bytes` is a no-op on little endian (most systems) and a cheap shuffle on big endian.
62+
blend_pixel_nonpremult(u32::from_le_bytes(buffer), u32::from_le_bytes(canvas)).to_le_bytes()
63+
}
64+
65+
/// Divides by 255, rounding to nearest (as opposed to down, like regular integer division does).
/// TODO: for the products of two `u8` values this is called with, the result
/// never reaches 256, so the output is effectively a `u8`. Plumb that through the code.
//
// Sources:
// https://arxiv.org/pdf/2202.02864
// https://github.com/image-rs/image-webp/issues/119#issuecomment-2544007820
#[inline]
fn div_by_255(v: u32) -> u32 {
    // Adding 0x80 up front is what turns the truncating shifts into rounding.
    let biased = v + 0x80;
    ((biased >> 8) + biased) >> 8
}
75+
76+
#[cfg(test)]
mod tests {
    use super::*;

    /// Floating-point blend of `buffer` over `canvas` that the optimized
    /// integer implementation is compared against.
    ///
    /// Pixels are `[r, g, b, a]` and not pre-multiplied by alpha. Every float
    /// result is expected to lie between 0 and 255 and has its fractional
    /// part truncated when converted back to `u8`.
    fn do_alpha_blending_reference(buffer: [u8; 4], canvas: [u8; 4]) -> [u8; 4] {
        let src_alpha = f64::from(buffer[3]);
        let dst_alpha = f64::from(canvas[3]);
        let out_alpha_f64 = src_alpha + dst_alpha * (1.0 - src_alpha / 255.0);
        let out_alpha = out_alpha_f64 as u8;

        // Color channels stay at zero when the blended pixel is fully transparent.
        let mut pixel = [0, 0, 0, out_alpha];
        if out_alpha != 0 {
            let colors = buffer.iter().zip(canvas.iter()).take(3);
            for (out, (&src, &dst)) in pixel.iter_mut().zip(colors) {
                let mixed = (f64::from(src) * src_alpha
                    + f64::from(dst) * dst_alpha * (1.0 - src_alpha / 255.0))
                    / out_alpha_f64;
                *out = mixed as u8;
            }
        }

        pixel
    }

    /// Sweeps red/alpha combinations and checks that the optimized blend stays
    /// within 3 of the floating-point reference on every channel.
    #[test]
    #[ignore] // takes too long to run on CI. Run this locally when changing the function.
    fn alpha_blending_optimization() {
        for r1 in 0..u8::MAX {
            for a1 in 11..u8::MAX {
                for r2 in 0..u8::MAX {
                    for a2 in 11..u8::MAX {
                        let top = [r1, 0, 0, a1];
                        let bottom = [r2, 0, 0, a2];
                        let opt = do_alpha_blending(top, bottom);
                        let slow = do_alpha_blending_reference(top, bottom);
                        // libwebp doesn't do exact blending and so we don't either
                        let close_enough = opt
                            .iter()
                            .zip(slow.iter())
                            .all(|(o, s)| o.abs_diff(*s) <= 3);
                        assert!(
                            close_enough,
                            "Mismatch in results! opt: {opt:?}, slow: {slow:?}, blended values: [{r1}, 0, 0, {a1}], [{r2}, 0, 0, {a2}]"
                        );
                    }
                }
            }
        }
    }
}

src/extended.rs

Lines changed: 2 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ use crate::decoder::DecodingError;
33
use byteorder_lite::ReadBytesExt;
44
use std::io::{BufRead, Read};
55

6+
use crate::alpha_blending::do_alpha_blending;
7+
68
#[derive(Debug, Clone)]
79
pub(crate) struct WebPExtendedInfo {
810
pub(crate) alpha: bool,
@@ -143,34 +145,6 @@ pub(crate) fn composite_frame(
143145
}
144146
}
145147

146-
fn do_alpha_blending(buffer: [u8; 4], canvas: [u8; 4]) -> [u8; 4] {
147-
let canvas_alpha = f64::from(canvas[3]);
148-
let buffer_alpha = f64::from(buffer[3]);
149-
let blend_alpha_f64 = buffer_alpha + canvas_alpha * (1.0 - buffer_alpha / 255.0);
150-
//value should be between 0 and 255, this truncates the fractional part
151-
let blend_alpha: u8 = blend_alpha_f64 as u8;
152-
153-
let blend_rgb: [u8; 3] = if blend_alpha == 0 {
154-
[0, 0, 0]
155-
} else {
156-
let mut rgb = [0u8; 3];
157-
for i in 0..3 {
158-
let canvas_f64 = f64::from(canvas[i]);
159-
let buffer_f64 = f64::from(buffer[i]);
160-
161-
let val = (buffer_f64 * buffer_alpha
162-
+ canvas_f64 * canvas_alpha * (1.0 - buffer_alpha / 255.0))
163-
/ blend_alpha_f64;
164-
//value should be between 0 and 255, this truncates the fractional part
165-
rgb[i] = val as u8;
166-
}
167-
168-
rgb
169-
};
170-
171-
[blend_rgb[0], blend_rgb[1], blend_rgb[2], blend_alpha]
172-
}
173-
174148
pub(crate) fn get_alpha_predictor(
175149
x: usize,
176150
y: usize,

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ extern crate test;
1212
pub use self::decoder::{DecodingError, LoopCount, WebPDecoder};
1313
pub use self::encoder::{ColorType, EncoderParams, EncodingError, WebPEncoder};
1414

15+
mod alpha_blending;
1516
mod decoder;
1617
mod encoder;
1718
mod extended;
-62 Bytes
Loading
-62 Bytes
Loading

0 commit comments

Comments
 (0)