Make hierarchical Z buffer generation properly conservative. (#22603)

pcwalton · web-flow · commit 2d4bf0cfc582 · 2026-01-20T18:12:28.000Z
The single-pass downsampling (SPD) shader is properly conservative only for depth buffers with side lengths that are powers of two. This is because it assumes that, for any texel in mip level N+1, all texels in mip level N that contribute to that texel are contained within at most a 2×2 square, which is only true for textures whose side lengths are powers of two. (For textures that have side lengths that aren't powers of two, proper conservative downsampling may require sampling up to a 3×3 square.) This PR solves the problem in a conservative way, by conceptually rounding up the side lengths of the depth buffer to the *next* power of two and scaling the depth buffer appropriately before performing downsampling. This ensures that the SPD shader only sees textures with side lengths that are powers of two at every step of the operation. Note "conceptually"; in reality this patch doesn't actually generate such an intermediate scaled texture. Instead, it changes the `load_mip_0` function in the shader to return the value that *would* have been produced by sampling such a scaled depth buffer. This is obviously more efficient than actually performing such a scaling operation. The sampling operations in the mesh preprocessing occlusion culling code required no changes, as they simply use `textureDimensions` on the hierarchical Z buffer to determine its size. I did, however, have to change the meshlet code to use `textureDimensions` like the mesh preprocessing code does. The meshlet culling indeed seems less broken now (albeit still broken); the rabbits on the right side don't flicker anymore in my testing. Note that this approach, while popular (e.g. in zeux's [Niagara]), is more conservative than a single-pass downsampler that properly handles 3×3 texel blocks would be. However, such a downsampler would be complex, and I figured it was better to make our occlusion culling correct, simple, and fast rather than possibly-complex and slow. 
This fix allows us to move occlusion culling out of experimental status. I opted not to do that in this PR in order to make it easier to review, but a follow-up PR should do that. [Niagara]: zeux/niagara#15 (comment)
diff --git a/crates/bevy_core_pipeline/src/mip_generation/experimental/depth.rs b/crates/bevy_core_pipeline/src/mip_generation/experimental/depth.rs
@@ -1,8 +1,7 @@
 //! Generation of hierarchical Z buffers for occlusion culling.
 //!
-//! This is marked experimental because the shader is designed only for
-//! power-of-two texture sizes and is slightly incorrect for non-power-of-two
-//! depth buffer sizes.
+//! Currently, this module only supports generation of hierarchical Z buffers
+//! for occlusion culling.
 
 use core::array;
 
@@ -515,10 +514,11 @@ impl ViewDepthPyramid {
         texture_label: &'static str,
         texture_view_label: &'static str,
     ) -> ViewDepthPyramid {
-        // Calculate the size of the depth pyramid.
+        // Calculate the size of the depth pyramid. This is the size of the
+        // depth buffer rounded down to the previous power of two.
         let depth_pyramid_size = Extent3d {
-            width: size.x.div_ceil(2),
-            height: size.y.div_ceil(2),
+            width: previous_power_of_two(size.x),
+            height: previous_power_of_two(size.y),
             depth_or_array_layers: 1,
         };
 
@@ -616,6 +616,22 @@ impl ViewDepthPyramid {
         downsample_depth_first_pipeline: &ComputePipeline,
         downsample_depth_second_pipeline: &ComputePipeline,
     ) {
+        // We need to make sure that every mip level the single-pass
+        // downsampling (SPD) shader sees has lengths that are powers of two for
+        // correct conservative depth buffer downsampling. To do this, we
+        // maintain the fiction that we're downsampling a depth buffer scaled up
+        // so that it has side lengths rounded up to the next power of two. (If
+        // the depth buffer already has a side length that's a power of two,
+        // then we double it anyway; this ensures that we don't lose any
+        // precision in the top level of the depth pyramid.) The
+        // `downsample_depth` shader's `load_mip_0` function returns the value
+        // that sampling such a depth buffer would yield, without actually
+        // having to construct such a scaled depth buffer.
+        let virtual_view_size = uvec2(
+            (view_size.x + 1).next_power_of_two(),
+            (view_size.y + 1).next_power_of_two(),
+        );
+
         let command_encoder = render_context.command_encoder();
         let mut downsample_pass = command_encoder.begin_compute_pass(&ComputePassDescriptor {
             label: Some(label),
@@ -625,7 +641,11 @@ impl ViewDepthPyramid {
         // Pass the mip count as a push constant, for simplicity.
         downsample_pass.set_push_constants(0, &self.mip_count.to_le_bytes());
         downsample_pass.set_bind_group(0, downsample_depth_bind_group, &[]);
-        downsample_pass.dispatch_workgroups(view_size.x.div_ceil(64), view_size.y.div_ceil(64), 1);
+        downsample_pass.dispatch_workgroups(
+            virtual_view_size.x.div_ceil(64),
+            virtual_view_size.y.div_ceil(64),
+            1,
+        );
 
         if self.mip_count >= 7 {
             downsample_pass.set_pipeline(downsample_depth_second_pipeline);
@@ -712,3 +732,9 @@ pub(crate) fn prepare_downsample_depth_view_bind_groups(
             ));
     }
 }
+
+/// Returns the largest power of two that is less than or equal to `x`.
+///
+/// If `x` is already a power of two, it is returned unchanged. An input of
+/// zero has no power of two at or below it; it is clamped to 1 so that the
+/// result is always a nonzero power of two instead of underflowing the shift
+/// amount.
+fn previous_power_of_two(x: u32) -> u32 {
+    // `ilog2` is the index of the highest set bit (`31 - leading_zeros`), and
+    // it panics on zero, hence the clamp.
+    1 << x.max(1).ilog2()
+}
diff --git a/crates/bevy_core_pipeline/src/mip_generation/experimental/downsample_depth.wgsl b/crates/bevy_core_pipeline/src/mip_generation/experimental/downsample_depth.wgsl
@@ -29,6 +29,9 @@ var<push_constant> constants: Constants;
 
 /// Generates a hierarchical depth buffer.
 /// Based on FidelityFX SPD v2.1 https://github.com/GPUOpen-LibrariesAndSDKs/FidelityFX-SDK/blob/d7531ae47d8b36a5d4025663e731a47a38be882f/sdk/include/FidelityFX/gpu/spd/ffx_spd.h#L528
+///
+/// `mip_0` may be of any size, but `mip_1` and down must have side lengths that
+/// are powers of two.
 
 // TODO:
 // * Subgroup support
@@ -307,32 +310,94 @@ fn reduce_load_mip_6(tex: vec2u) -> f32 {
     ));
 }
 
+// Loads the top mip level at virtual position (x, y).
+//
+// This is the value that *would be* returned from sampling a scaled depth
+// buffer with side lengths rounded up to the next power of two, without
+// actually constructing such a depth buffer.
+//
+// `x` and `y` are texel coordinates in that virtual buffer. They are mapped to
+// the UV of the virtual texel's center, and every variant below reduces a 2×2
+// footprint of real `mip_0` texels around that UV to a single value with
+// `min`.
+//
+// See the comments in `ViewDepthPyramid::downsample_depth` for more
+// information.
 fn load_mip_0(x: u32, y: u32) -> f32 {
+    // Size of the real source texture, and the size of the virtual buffer the
+    // caller's coordinates refer to.
+    let actual_size = textureDimensions(mip_0).xy;
+    let virtual_size = vec2<u32>(
+        next_power_of_two(actual_size.x),
+        next_power_of_two(actual_size.y)
+    );
+    // UV of the center of virtual texel (x, y).
+    let virtual_uv = (vec2<f32>(f32(x), f32(y)) + 0.5) / vec2<f32>(virtual_size);
 #ifdef MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT
-    let visibility = textureLoad(mip_0, vec2(x, y)).r;
-    return bitcast<f32>(u32(visibility >> 32u));
+    // `virtual_st` is the same position expressed in texels of the real
+    // `mip_0`. Depth is taken from the bits above bit 32 of each texel.
+    let virtual_st = virtual_uv * vec2<f32>(actual_size);
+    let visibility = load_mip_0_meshlet(virtual_st, 32u);
+    return reduce_4(visibility);
 #else   // MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT
 #ifdef MESHLET
-    let visibility = textureLoad(mip_0, vec2(x, y)).r;
-    return bitcast<f32>(visibility);
+    // Here the texel bits are reinterpreted as depth without any shift.
+    let virtual_st = virtual_uv * vec2<f32>(actual_size);
+    let visibility = load_mip_0_meshlet(virtual_st, 0u);
+    return reduce_4(visibility);
 #else   // MESHLET
     // Downsample the top level.
 #ifdef MULTISAMPLE
     // The top level is multisampled, so we need to loop over all the samples
     // and reduce them to 1.
-    var result = textureLoad(mip_0, vec2(x, y), 0);
+    let virtual_st = virtual_uv * vec2<f32>(actual_size);
+    var result = load_mip_0_single_sample(virtual_st, 0);
     let sample_count = i32(textureNumSamples(mip_0));
     for (var sample = 1; sample < sample_count; sample += 1) {
-        result = min(result, textureLoad(mip_0, vec2(x, y), sample));
+        result = min(result, load_mip_0_single_sample(virtual_st, sample));
     }
     return result;
 #else   // MULTISAMPLE
-    return textureLoad(mip_0, vec2(x, y), 0);
+    // A single gather fetches the four texels around `virtual_uv`.
+    return reduce_4(textureGather(mip_0, samplr, virtual_uv));
 #endif  // MULTISAMPLE
 #endif  // MESHLET
 #endif  // MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT
 }
 
+#ifdef MESHLET
+// Loads a single 2×2 square of texels at the given position from the source
+// image and returns all four (like `textureGather` does).
+//
+// `st` should be in texels, not in the [0, 1] range like UVs. That is, `st` is
+// `uv * textureDimensions(mip_0).xy`.
+//
+// `shift` is the number of low bits to discard from each loaded texel before
+// the remaining 32 bits are reinterpreted as an `f32` depth value: 0 when the
+// texel itself is the depth, 32 when the depth occupies the upper half of a
+// 64-bit visibility texel.
+//
+// NOTE(review): for the last virtual texel along an axis, `st1` can equal the
+// texture size, so the load is out of bounds; confirm that the result WGSL
+// specifies for out-of-bounds `textureLoad` is acceptable here.
+fn load_mip_0_meshlet(st: vec2<f32>, shift: u32) -> vec4<f32> {
+    let st0 = vec2<u32>(floor(st - 0.5));
+    let st1 = st0 + 1u;
+    // The shift must be applied to the full-width texel *before* narrowing to
+    // `u32`. Narrowing first throws away the upper bits, and shifting the
+    // already-narrowed 32-bit value right by 32 cannot bring them back.
+    return vec4<f32>(
+        bitcast<f32>(u32(textureLoad(mip_0, vec2<u32>(st0.x, st0.y)).r >> shift)),
+        bitcast<f32>(u32(textureLoad(mip_0, vec2<u32>(st0.x, st1.y)).r >> shift)),
+        bitcast<f32>(u32(textureLoad(mip_0, vec2<u32>(st1.x, st0.y)).r >> shift)),
+        bitcast<f32>(u32(textureLoad(mip_0, vec2<u32>(st1.x, st1.y)).r >> shift))
+    );
+}
+#endif  // MESHLET
+
+#ifdef MULTISAMPLE
+// Reads one sample from each texel of the 2×2 footprint around `st` in the
+// multisampled source image and returns the minimum of the four values.
+//
+// `st` is measured in texels rather than normalized UVs, i.e. it equals
+// `uv * textureDimensions(mip_0).xy`. `sample` selects which sample of each
+// texel is read.
+fn load_mip_0_single_sample(st: vec2<f32>, sample: i32) -> f32 {
+    let lo = vec2<u32>(floor(st - 0.5));
+    let hi = lo + 1u;
+    let d00 = textureLoad(mip_0, lo, sample);
+    let d01 = textureLoad(mip_0, vec2<u32>(lo.x, hi.y), sample);
+    let d10 = textureLoad(mip_0, vec2<u32>(hi.x, lo.y), sample);
+    let d11 = textureLoad(mip_0, hi, sample);
+    return reduce_4(vec4<f32>(d00, d01, d10, d11));
+}
+#endif  // MULTISAMPLE
+
 // Returns the smallest of the four components of `v`.
 fn reduce_4(v: vec4f) -> f32 {
     let xy = min(v.x, v.y);
     let zw = min(v.z, v.w);
     return min(xy, zw);
 }
+
+// Returns the smallest power of two strictly greater than x.
+//
+// Unlike Rust's `next_power_of_two`, an input that is already a power of two
+// is not returned as-is; the result is the power of two after it.
+fn next_power_of_two(x: u32) -> u32 {
+    // Number of bits needed to represent x, i.e. one past its highest set bit.
+    let significant_bits = 32u - countLeadingZeros(x);
+    return 1u << significant_bits;
+}
diff --git a/crates/bevy_pbr/src/meshlet/meshlet_cull_shared.wgsl b/crates/bevy_pbr/src/meshlet/meshlet_cull_shared.wgsl
@@ -143,9 +143,8 @@ fn sample_hzb_row(sx: vec4<u32>, sy: u32, mip: i32) -> f32 {
     return min(min(a, b), min(c, d));
 }
 
-// TODO: We should probably be using a POT HZB texture?
 fn occlusion_cull_screen_aabb(aabb: ScreenAabb, screen: vec2<f32>) -> bool {
-    let hzb_size = ceil(screen * 0.5);
+    let hzb_size = vec2<f32>(textureDimensions(depth_pyramid).xy);
     let aabb_min = aabb.min.xy * hzb_size;
     let aabb_max = aabb.max.xy * hzb_size;
 
@@ -157,7 +156,6 @@ fn occlusion_cull_screen_aabb(aabb: ScreenAabb, screen: vec2<f32>) -> bool {
     // note: add 1 before max because the unsigned overflow behavior is intentional
     // it wraps around firstLeadingBit(0) = ~0 to 0
     // TODO: we actually sample a 4x4 block, so ideally this would be `max(..., 3u) - 3u`.
-    // However, since our HZB is not a power of two, we need to be extra-conservative to not over-cull, so we go up a mip.
     var mip = max(firstLeadingBit(max_size) + 1u, 2u) - 2u;
     
     if any((max_texel >> vec2(mip)) > (min_texel >> vec2(mip)) + 3) {