Skip to content
This repository was archived by the owner on Nov 30, 2020. It is now read-only.

Commit 5dd99bf

Browse files
committed
Quality and performance improvements to Motion Blur
- Completely redesigned the reconstruction filter. It is not only faster than the previous implementation but also produces better quality.
- Split the TileMax4 pass into two TileMax2 passes. This improves the throughput of the fragment shader on some major GPUs (e.g. PS4).
- Removed the unrolled version of the reconstruction filter. The shader issue on Adreno GPUs doesn't reproduce with the new filter.
- Refactored the TileMax/NeighborMax filters.
1 parent c276434 commit 5dd99bf

File tree

4 files changed

+129
-231
lines changed

4 files changed

+129
-231
lines changed

PostProcessing/Resources/Shaders/Common.cginc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,9 @@ inline half4 Pow4(half4 x) { return x * x * x * x; }
7676
#endif
7777

7878
// Returns whichever of v1 and v2 has the greater length (compared by squared magnitude); v1 on a tie
79-
inline half2 MaxV(half2 v1, half2 v2) { return lerp(v1, v2, dot(v1, v1) < dot(v2, v2)); }
80-
inline half3 MaxV(half3 v1, half3 v2) { return lerp(v1, v2, dot(v1, v1) < dot(v2, v2)); }
81-
inline half4 MaxV(half4 v1, half4 v2) { return lerp(v1, v2, dot(v1, v1) < dot(v2, v2)); }
79+
inline half2 MaxV(half2 v1, half2 v2) { return dot(v1, v1) < dot(v2, v2) ? v2 : v1; }
80+
inline half3 MaxV(half3 v1, half3 v2) { return dot(v1, v1) < dot(v2, v2) ? v2 : v1; }
81+
inline half4 MaxV(half4 v1, half4 v2) { return dot(v1, v1) < dot(v2, v2) ? v2 : v1; }
8282

8383
// Clamp HDR value within a safe range
8484
inline half SafeHDR(half c) { return min(c, HALF_MAX); }

PostProcessing/Resources/Shaders/MotionBlur.cginc

Lines changed: 99 additions & 190 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,10 @@ float2 _TileMaxOffs;
2828

2929
// Maximum blur radius (in pixels)
3030
half _MaxBlurRadius;
31+
float _RcpMaxBlurRadius;
3132

3233
// Filter parameters/coefficients
33-
int _LoopCount;
34+
half _LoopCount;
3435

3536
// History buffer for frame blending
3637
sampler2D _History1LumaTex;
@@ -79,81 +80,38 @@ half4 FragVelocitySetup(VaryingsDefault i) : SV_Target
7980
// Sample the motion vector.
8081
float2 v = tex2D(_CameraMotionVectorsTexture, i.uv).rg;
8182

82-
// Apply the exposure time.
83-
v *= _VelocityScale;
84-
85-
// Halve the vector and convert it to the pixel space.
86-
v = v * 0.5 * _CameraMotionVectorsTexture_TexelSize.zw;
83+
// Apply the exposure time and convert to the pixel space.
84+
v *= (_VelocityScale * 0.5) * _CameraMotionVectorsTexture_TexelSize.zw;
8785

8886
// Clamp the vector with the maximum blur radius.
89-
float lv = length(v);
90-
v *= min(lv, _MaxBlurRadius) / max(lv, 1e-6);
87+
v /= max(1.0, length(v) * _RcpMaxBlurRadius);
9188

9289
// Sample the depth of the pixel.
93-
float d = SAMPLE_DEPTH_TEXTURE(_CameraDepthTexture, i.uv.xy);
94-
half z01 = LinearizeDepth(d);
90+
half d = LinearizeDepth(SAMPLE_DEPTH_TEXTURE(_CameraDepthTexture, i.uv));
9591

9692
// Pack into 10/10/10/2 format.
97-
return half4((v / _MaxBlurRadius + 1.0) / 2.0, z01, 0.0);
93+
return half4((v * _RcpMaxBlurRadius + 1.0) * 0.5, d, 0.0);
9894
}
9995

100-
// TileMax filter (4 pixels width with normalization)
101-
half4 FragTileMax4(VaryingsDefault i) : SV_Target
96+
// TileMax filter (2 pixel width with normalization)
97+
half4 FragTileMax1(VaryingsDefault i) : SV_Target
10298
{
103-
float4 d1 = _MainTex_TexelSize.xyxy * float4( 0.5, 0.5, 1.5, 1.5);
104-
float4 d2 = _MainTex_TexelSize.xyxy * float4(-0.5, 0.5, -1.5, 1.5);
105-
106-
half2 v01 = tex2D(_MainTex, i.uv - d1.zw).rg; // -1.5, -1.5
107-
half2 v02 = tex2D(_MainTex, i.uv - d1.xw).rg; // -0.5, -1.5
108-
half2 v03 = tex2D(_MainTex, i.uv - d2.xw).rg; // +0.5, -1.5
109-
half2 v04 = tex2D(_MainTex, i.uv - d2.zw).rg; // +1.5, -1.5
110-
111-
half2 v05 = tex2D(_MainTex, i.uv - d1.zy).rg; // -1.5, -0.5
112-
half2 v06 = tex2D(_MainTex, i.uv - d1.xy).rg; // -0.5, -0.5
113-
half2 v07 = tex2D(_MainTex, i.uv - d2.xy).rg; // +0.5, -0.5
114-
half2 v08 = tex2D(_MainTex, i.uv - d2.zy).rg; // +1.5, -0.5
115-
116-
half2 v09 = tex2D(_MainTex, i.uv + d2.zy).rg; // -1.5, +0.5
117-
half2 v10 = tex2D(_MainTex, i.uv + d2.xy).rg; // -0.5, +0.5
118-
half2 v11 = tex2D(_MainTex, i.uv + d1.xy).rg; // +0.5, +0.5
119-
half2 v12 = tex2D(_MainTex, i.uv + d1.zy).rg; // +1.5, +0.5
120-
121-
half2 v13 = tex2D(_MainTex, i.uv + d2.zw).rg; // -1.5, +1.5
122-
half2 v14 = tex2D(_MainTex, i.uv + d2.xw).rg; // -0.5, +1.5
123-
half2 v15 = tex2D(_MainTex, i.uv + d1.xw).rg; // +0.5, +1.5
124-
half2 v16 = tex2D(_MainTex, i.uv + d1.zw).rg; // +1.5, +1.5
125-
126-
v01 = (v01 * 2.0 - 1.0) * _MaxBlurRadius;
127-
v02 = (v02 * 2.0 - 1.0) * _MaxBlurRadius;
128-
v03 = (v03 * 2.0 - 1.0) * _MaxBlurRadius;
129-
v04 = (v04 * 2.0 - 1.0) * _MaxBlurRadius;
130-
131-
v05 = (v05 * 2.0 - 1.0) * _MaxBlurRadius;
132-
v06 = (v06 * 2.0 - 1.0) * _MaxBlurRadius;
133-
v07 = (v07 * 2.0 - 1.0) * _MaxBlurRadius;
134-
v08 = (v08 * 2.0 - 1.0) * _MaxBlurRadius;
135-
136-
v09 = (v09 * 2.0 - 1.0) * _MaxBlurRadius;
137-
v10 = (v10 * 2.0 - 1.0) * _MaxBlurRadius;
138-
v11 = (v11 * 2.0 - 1.0) * _MaxBlurRadius;
139-
v12 = (v12 * 2.0 - 1.0) * _MaxBlurRadius;
140-
141-
v13 = (v13 * 2.0 - 1.0) * _MaxBlurRadius;
142-
v14 = (v14 * 2.0 - 1.0) * _MaxBlurRadius;
143-
v15 = (v15 * 2.0 - 1.0) * _MaxBlurRadius;
144-
v16 = (v16 * 2.0 - 1.0) * _MaxBlurRadius;
145-
146-
half2 va = MaxV(MaxV(MaxV(v01, v02), v03), v04);
147-
half2 vb = MaxV(MaxV(MaxV(v05, v06), v07), v08);
148-
half2 vc = MaxV(MaxV(MaxV(v09, v10), v11), v12);
149-
half2 vd = MaxV(MaxV(MaxV(v13, v14), v15), v16);
150-
151-
half2 vo = MaxV(MaxV(MaxV(va, vb), vc), vd);
99+
float4 d = _MainTex_TexelSize.xyxy * float4(-0.5, -0.5, 0.5, 0.5);
152100

153-
return half4(vo, 0.0, 0.0);
101+
half2 v1 = tex2D(_MainTex, i.uv + d.xy).rg;
102+
half2 v2 = tex2D(_MainTex, i.uv + d.zy).rg;
103+
half2 v3 = tex2D(_MainTex, i.uv + d.xw).rg;
104+
half2 v4 = tex2D(_MainTex, i.uv + d.zw).rg;
105+
106+
v1 = (v1 * 2.0 - 1.0) * _MaxBlurRadius;
107+
v2 = (v2 * 2.0 - 1.0) * _MaxBlurRadius;
108+
v3 = (v3 * 2.0 - 1.0) * _MaxBlurRadius;
109+
v4 = (v4 * 2.0 - 1.0) * _MaxBlurRadius;
110+
111+
return half4(MaxV(MaxV(MaxV(v1, v2), v3), v4), 0.0, 0.0);
154112
}
155113

156-
// TileMax filter (2 pixels width)
114+
// TileMax filter (2 pixel width)
157115
half4 FragTileMax2(VaryingsDefault i) : SV_Target
158116
{
159117
float4 d = _MainTex_TexelSize.xyxy * float4(-0.5, -0.5, 0.5, 0.5);
@@ -163,9 +121,7 @@ half4 FragTileMax2(VaryingsDefault i) : SV_Target
163121
half2 v3 = tex2D(_MainTex, i.uv + d.xw).rg;
164122
half2 v4 = tex2D(_MainTex, i.uv + d.zw).rg;
165123

166-
half2 vo = MaxV(MaxV(MaxV(v1, v2), v3), v4);
167-
168-
return half4(vo, 0.0, 0.0);
124+
return half4(MaxV(MaxV(MaxV(v1, v2), v3), v4), 0.0, 0.0);
169125
}
170126

171127
// TileMax filter (variable width)
@@ -215,165 +171,118 @@ half4 FragNeighborMax(VaryingsDefault i) : SV_Target
215171
half2 vb = MaxV(v4, MaxV(v5, v6));
216172
half2 vc = MaxV(v7, MaxV(v8, v9));
217173

218-
return half4(MaxV(va, MaxV(vb, vc)) / cw, 0.0, 0.0);
174+
return half4(MaxV(va, MaxV(vb, vc)) * (1.0 / cw), 0.0, 0.0);
219175
}
220176

221177
// -----------------------------------------------------------------------------
222178
// Reconstruction
223179

224-
// Strength of the depth filter
225-
static const float kDepthFilterCoeff = 15.0;
226-
227-
// Safer version of vector normalization function
228-
half2 SafeNorm(half2 v)
180+
// Square wave over phase: returns true during the second half of each period of the given interval, false during the first half.
181+
bool Interval(half phase, half interval)
229182
{
230-
half l = max(length(v), EPSILON);
231-
return v / l * (l >= 0.5);
183+
return frac(phase / interval) > 0.499;
232184
}
233185

234186
// Jitter function for tile lookup
235187
float2 JitterTile(float2 uv)
236188
{
237189
float rx, ry;
238190
sincos(GradientNoise(uv + float2(2.0, 0.0)) * UNITY_PI_2, ry, rx);
239-
return float2(rx, ry) * _NeighborMaxTex_TexelSize.xy / 4.0;
240-
}
241-
242-
// Cone shaped interpolation
243-
half Cone(half T, half l_V)
244-
{
245-
return saturate(1.0 - T / l_V);
246-
}
247-
248-
// Cylinder shaped interpolation
249-
half Cylinder(half T, half l_V)
250-
{
251-
return 1.0 - smoothstep(0.95 * l_V, 1.05 * l_V, T);
252-
}
253-
254-
// Depth comparison function
255-
half CompareDepth(half za, half zb)
256-
{
257-
return saturate(1.0 - kDepthFilterCoeff * (zb - za) / min(za, zb));
258-
}
259-
260-
// Lerp and normalization
261-
half2 RNMix(half2 a, half2 b, half p)
262-
{
263-
return SafeNorm(lerp(a, b, saturate(p)));
191+
return float2(rx, ry) * _NeighborMaxTex_TexelSize.xy * 0.25;
264192
}
265193

266194
// Velocity sampling function
267195
half3 SampleVelocity(float2 uv)
268196
{
269-
half3 v = tex2D(_VelocityTex, uv).xyz;
197+
half3 v = tex2Dlod(_VelocityTex, float4(uv, 0.0, 0.0)).xyz;
270198
return half3((v.xy * 2.0 - 1.0) * _MaxBlurRadius, v.z);
271199
}
272200

273-
// Sample weighting function
274-
half SampleWeight(half2 d_n, half l_v_c, half z_p, half T, float2 S_uv, half w_A)
201+
// Reconstruction filter
202+
half4 FragReconstruction(VaryingsMultitex i) : SV_Target
275203
{
276-
half3 temp = tex2Dlod(_VelocityTex, float4(S_uv, 0.0, 0.0));
204+
// Color sample at the center point
205+
const half4 c_p = tex2D(_MainTex, i.uv0);
277206

278-
half2 v_S = (temp.xy * 2.0 - 1.0) * _MaxBlurRadius;
279-
half l_v_S = max(length(v_S), 0.5);
207+
// Velocity/Depth sample at the center point
208+
const half3 vd_p = SampleVelocity(i.uv1);
209+
const half l_v_p = max(length(vd_p.xy), 0.5);
210+
const half rcp_d_p = 1.0 / vd_p.z;
280211

281-
half z_S = temp.z;
212+
// NeighborMax vector sample at the center point
213+
const half2 v_max = tex2D(_NeighborMaxTex, i.uv1 + JitterTile(i.uv1)).xy;
214+
const half l_v_max = length(v_max);
215+
const half rcp_l_v_max = 1.0 / l_v_max;
282216

283-
half f = CompareDepth(z_p, z_S);
284-
half b = CompareDepth(z_S, z_p);
217+
// Escape early if the NeighborMax vector is small enough.
218+
if (l_v_max < 2.0) return c_p;
285219

286-
half w_B = abs(dot(v_S / l_v_S, d_n));
220+
// Use V_p as a secondary sampling direction except when it's too small
221+
// compared to V_max. This vector is rescaled to be the length of V_max.
222+
const half2 v_alt = (l_v_p * 2.0 > l_v_max) ? vd_p.xy * (l_v_max / l_v_p) : v_max;
287223

288-
half weight = 0.0;
289-
weight += f * Cone(T, l_v_S) * w_B;
290-
weight += b * Cone(T, l_v_c) * w_A;
291-
weight += Cylinder(T, min(l_v_S, l_v_c)) * max(w_A, w_B) * 2.0;
224+
// Determine the sample count.
225+
const half sc = floor(min(_LoopCount, l_v_max * 0.5));
292226

293-
return weight;
294-
}
227+
// Loop variables (starts from the outermost sample)
228+
const half dt = 1.0 / sc;
229+
const half t_offs = (GradientNoise(i.uv0) - 0.5) * dt;
230+
half t = 1.0 - dt * 0.5;
231+
half count = 0.0;
295232

296-
// Reconstruction filter
297-
half4 FragReconstruction(VaryingsMultitex i) : SV_Target
298-
{
299-
float2 p = i.uv1 * _ScreenParams.xy;
300-
float2 p_uv = i.uv1;
301-
302-
// Nonfiltered source color;
303-
half4 source = tex2D(_MainTex, i.uv0);
304-
305-
// Velocity vector at p.
306-
half3 v_c_t = SampleVelocity(p_uv);
307-
half2 v_c = v_c_t.xy;
308-
half2 v_c_n = SafeNorm(v_c);
309-
half l_v_c = max(length(v_c), 0.5);
310-
311-
// NeighborMax vector at p (with small).
312-
half2 v_max = tex2D(_NeighborMaxTex, p_uv + JitterTile(p_uv)).xy;
313-
half2 v_max_n = SafeNorm(v_max);
314-
half l_v_max = length(v_max);
315-
316-
// Escape early if the NeighborMax vector is too short.
317-
if (l_v_max < 0.5)
318-
return source;
319-
320-
// Linearized depth at p.
321-
half z_p = v_c_t.z;
322-
323-
// A vector perpendicular to v_max.
324-
half2 w_p = v_max_n.yx * float2(-1.0, 1.0);
325-
if (dot(w_p, v_c) < 0.0)
326-
w_p = -w_p;
327-
328-
// Secondary sampling direction.
329-
half2 w_c = RNMix(w_p, v_c_n, (l_v_c - 0.5) / 1.5);
330-
331-
// The center sample.
332-
half sampleCount = _LoopCount * 2.0;
333-
half totalWeight = sampleCount / (l_v_c * 40.0);
334-
half3 result = source.rgb * totalWeight;
335-
336-
// Start from t=-1 + small jitter.
337-
// The width of jitter is equivalent to 4 sample steps.
338-
half sampleJitter = 4.0 * 2.0 / (sampleCount + 4.0);
339-
half t = -1.0 + GradientNoise(p_uv) * sampleJitter;
340-
half dt = (2.0 - sampleJitter) / sampleCount;
341-
342-
// Precalculate the w_A parameters.
343-
half w_A1 = dot(w_c, v_c_n);
344-
half w_A2 = dot(w_c, v_max_n);
345-
346-
#ifndef UNROLL_LOOP_COUNT
347-
UNITY_LOOP for (int c = 0; c < _LoopCount; c++)
348-
#else
349-
UNITY_UNROLL for (int c = 0; c < UNROLL_LOOP_COUNT; c++)
350-
#endif
233+
// Background velocity
234+
// This is used for tracking the maximum velocity in the background layer.
235+
half l_v_bg = max(l_v_p, 1.0);
236+
237+
// Color accumulation
238+
half4 acc = 0.0;
239+
240+
UNITY_LOOP while (t > dt * 0.25)
351241
{
352-
// Odd-numbered sample: sample along v_c.
353-
{
354-
float2 S_uv0 = i.uv0 + t * v_c * _MainTex_TexelSize.xy;
355-
float2 S_uv1 = i.uv1 + t * v_c * _VelocityTex_TexelSize.xy;
356-
half weight = SampleWeight(v_c_n, l_v_c, z_p, abs(t * l_v_max), S_uv1, w_A1);
242+
// Sampling direction (switched every two samples)
243+
const half2 v_s = Interval(count, 4.0) ? v_alt : v_max;
357244

358-
result += tex2Dlod(_MainTex, float4(S_uv0, 0.0, 0.0)).rgb * weight;
359-
totalWeight += weight;
245+
// Sample position (sign inverted on every sample)
246+
const half t_s = (Interval(count, 2.0) ? -t : t) + t_offs;
360247

361-
t += dt;
362-
}
363-
// Even-numbered sample: sample along v_max.
364-
{
365-
float2 S_uv0 = i.uv0 + t * v_max * _MainTex_TexelSize.xy;
366-
float2 S_uv1 = i.uv1 + t * v_max * _VelocityTex_TexelSize.xy;
367-
half weight = SampleWeight(v_max_n, l_v_c, z_p, abs(t * l_v_max), S_uv1, w_A2);
248+
// Distance to the sample position
249+
const half l_t = l_v_max * abs(t_s);
368250

369-
result += tex2Dlod(_MainTex, float4(S_uv0, 0.0, 0.0)).rgb * weight;
370-
totalWeight += weight;
251+
// UVs for the sample position
252+
const float2 uv0 = i.uv0 + v_s * t_s * _MainTex_TexelSize.xy;
253+
const float2 uv1 = i.uv1 + v_s * t_s * _VelocityTex_TexelSize.xy;
371254

372-
t += dt;
373-
}
255+
// Color sample
256+
const half3 c = tex2Dlod(_MainTex, float4(uv0, 0.0, 0.0)).rgb;
257+
258+
// Velocity/Depth sample
259+
const half3 vd = SampleVelocity(uv1);
260+
261+
// Background/Foreground separation
262+
const half fg = saturate((vd_p.z - vd.z) * 20.0 * rcp_d_p);
263+
264+
// Length of the velocity vector
265+
const half l_v = lerp(l_v_bg, length(vd.xy), fg);
266+
267+
// Sample weight
268+
// (Distance test) * (Spreading out by motion) * (Triangular window)
269+
const half w = saturate(l_v - l_t) / l_v * (1.2 - t);
270+
271+
// Color accumulation
272+
acc += half4(c, 1.0) * w;
273+
274+
// Update the background velocity.
275+
l_v_bg = max(l_v_bg, l_v);
276+
277+
// Advance to the next sample.
278+
t = Interval(count, 2.0) ? t - dt : t;
279+
count += 1.0;
374280
}
375281

376-
return half4(result / totalWeight, source.a);
282+
// Add the center sample.
283+
acc += half4(c_p.rgb, 1.0) * (1.2 / (l_v_bg * sc * 2.0));
284+
285+
return half4(acc.rgb / acc.a, c_p.a);
377286
}
378287

379288
// -----------------------------------------------------------------------------

0 commit comments

Comments
 (0)