Skip to content

Commit 5066b03

Browse files
aevyrie and kfc35 authored
Parallel GPU buffer writes (#22314)
# Objective

- After a series of optimizations making render and `PostUpdate` more parallel, `write_batched_instance_buffers` was regularly one of the largest spans with very low thread use, sitting at 4ms in a 14ms frame. This makes it an ideal target to improve throughput. Note that this screenshot doesn't include some visibility system optimizations:

<img width="650" height="718" alt="image" src="https://github.com/user-attachments/assets/bbd6762b-5145-48f8-a427-5da3cb11a04a" />

## Solution

- Spawn tasks for writing buffers to the GPU. This is especially helpful for `current_input_buffer` and `previous_input_buffer`, which take about the same time and are the longest buffer writes - moving these to tasks effectively halves the time spent in the system.

<img width="588" height="251" alt="image" src="https://github.com/user-attachments/assets/0a086e7a-1d3c-4c17-9d66-eff94196943d" />

- In the 250k `bevymark_3d` stress test, this saves 1.7ms in the system, and 2.8ms in frame time.

Frame time:

<img width="620" height="376" alt="image" src="https://github.com/user-attachments/assets/a4c106ac-7668-4f8a-970f-71cbb8be851c" />

System:

<img width="1384" height="744" alt="image" src="https://github.com/user-attachments/assets/5c42227d-8ee5-4b84-bc1a-c04768356255" />

## Testing

- `cargo run --release --example bevymark_3d --features=debug,trace_tracy -- --benchmark --waves 250 --per-wave 1000`

---------

Co-authored-by: Kevin Chen <chen.kevin.f@gmail.com>
1 parent 8fbc556 commit 5066b03

File tree

1 file changed

+129
-82
lines changed

1 file changed

+129
-82
lines changed

crates/bevy_render/src/batching/gpu_preprocessing.rs

Lines changed: 129 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use bevy_ecs::{
1515
use bevy_encase_derive::ShaderType;
1616
use bevy_math::UVec4;
1717
use bevy_platform::collections::{hash_map::Entry, HashMap, HashSet};
18+
use bevy_tasks::ComputeTaskPool;
1819
use bevy_utils::{default, TypeIdMap};
1920
use bytemuck::{Pod, Zeroable};
2021
use encase::{internal::WriteInto, ShaderSize};
@@ -2017,56 +2018,74 @@ pub fn write_batched_instance_buffers<GFBD>(
20172018
phase_instance_buffers,
20182019
} = gpu_array_buffer.into_inner();
20192020

2020-
current_input_buffer
2021-
.buffer
2022-
.write_buffer(&render_device, &render_queue);
2023-
previous_input_buffer
2024-
.buffer
2025-
.write_buffer(&render_device, &render_queue);
2026-
2027-
for phase_instance_buffers in phase_instance_buffers.values_mut() {
2028-
let UntypedPhaseBatchedInstanceBuffers {
2029-
ref mut data_buffer,
2030-
ref mut work_item_buffers,
2031-
ref mut late_indexed_indirect_parameters_buffer,
2032-
ref mut late_non_indexed_indirect_parameters_buffer,
2033-
} = *phase_instance_buffers;
2034-
2035-
data_buffer.write_buffer(&render_device);
2036-
late_indexed_indirect_parameters_buffer.write_buffer(&render_device, &render_queue);
2037-
late_non_indexed_indirect_parameters_buffer.write_buffer(&render_device, &render_queue);
2038-
2039-
for phase_work_item_buffers in work_item_buffers.values_mut() {
2040-
match *phase_work_item_buffers {
2041-
PreprocessWorkItemBuffers::Direct(ref mut buffer_vec) => {
2042-
buffer_vec.write_buffer(&render_device, &render_queue);
2043-
}
2044-
PreprocessWorkItemBuffers::Indirect {
2045-
ref mut indexed,
2046-
ref mut non_indexed,
2047-
ref mut gpu_occlusion_culling,
2048-
} => {
2049-
indexed.write_buffer(&render_device, &render_queue);
2050-
non_indexed.write_buffer(&render_device, &render_queue);
2051-
2052-
if let Some(GpuOcclusionCullingWorkItemBuffers {
2053-
ref mut late_indexed,
2054-
ref mut late_non_indexed,
2055-
late_indirect_parameters_indexed_offset: _,
2056-
late_indirect_parameters_non_indexed_offset: _,
2057-
}) = *gpu_occlusion_culling
2058-
{
2059-
if !late_indexed.is_empty() {
2060-
late_indexed.write_buffer(&render_device);
2021+
let render_device = &*render_device;
2022+
let render_queue = &*render_queue;
2023+
2024+
ComputeTaskPool::get().scope(|scope| {
2025+
scope.spawn(async {
2026+
let _span = tracing::info_span!("write_current_input_buffers").entered();
2027+
current_input_buffer
2028+
.buffer
2029+
.write_buffer(render_device, render_queue);
2030+
});
2031+
scope.spawn(async {
2032+
let _span = tracing::info_span!("write_previous_input_buffers").entered();
2033+
previous_input_buffer
2034+
.buffer
2035+
.write_buffer(render_device, render_queue);
2036+
});
2037+
2038+
for phase_instance_buffers in phase_instance_buffers.values_mut() {
2039+
let UntypedPhaseBatchedInstanceBuffers {
2040+
ref mut data_buffer,
2041+
ref mut work_item_buffers,
2042+
ref mut late_indexed_indirect_parameters_buffer,
2043+
ref mut late_non_indexed_indirect_parameters_buffer,
2044+
} = *phase_instance_buffers;
2045+
2046+
scope.spawn(async {
2047+
let _span = tracing::info_span!("write_phase_instance_buffers").entered();
2048+
data_buffer.write_buffer(render_device);
2049+
late_indexed_indirect_parameters_buffer.write_buffer(render_device, render_queue);
2050+
late_non_indexed_indirect_parameters_buffer
2051+
.write_buffer(render_device, render_queue);
2052+
});
2053+
2054+
for phase_work_item_buffers in work_item_buffers.values_mut() {
2055+
scope.spawn(async {
2056+
let _span = tracing::info_span!("write_work_item_buffers").entered();
2057+
match *phase_work_item_buffers {
2058+
PreprocessWorkItemBuffers::Direct(ref mut buffer_vec) => {
2059+
buffer_vec.write_buffer(render_device, render_queue);
20612060
}
2062-
if !late_non_indexed.is_empty() {
2063-
late_non_indexed.write_buffer(&render_device);
2061+
PreprocessWorkItemBuffers::Indirect {
2062+
ref mut indexed,
2063+
ref mut non_indexed,
2064+
ref mut gpu_occlusion_culling,
2065+
} => {
2066+
indexed.write_buffer(render_device, render_queue);
2067+
non_indexed.write_buffer(render_device, render_queue);
2068+
2069+
if let Some(GpuOcclusionCullingWorkItemBuffers {
2070+
ref mut late_indexed,
2071+
ref mut late_non_indexed,
2072+
late_indirect_parameters_indexed_offset: _,
2073+
late_indirect_parameters_non_indexed_offset: _,
2074+
}) = *gpu_occlusion_culling
2075+
{
2076+
if !late_indexed.is_empty() {
2077+
late_indexed.write_buffer(render_device);
2078+
}
2079+
if !late_non_indexed.is_empty() {
2080+
late_non_indexed.write_buffer(render_device);
2081+
}
2082+
}
20642083
}
20652084
}
2066-
}
2085+
});
20672086
}
20682087
}
2069-
}
2088+
});
20702089
}
20712090

20722091
pub fn clear_indirect_parameters_buffers(
@@ -2082,43 +2101,71 @@ pub fn write_indirect_parameters_buffers(
20822101
render_queue: Res<RenderQueue>,
20832102
mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
20842103
) {
2085-
for phase_indirect_parameters_buffers in indirect_parameters_buffers.values_mut() {
2086-
phase_indirect_parameters_buffers
2087-
.indexed
2088-
.data
2089-
.write_buffer(&render_device);
2090-
phase_indirect_parameters_buffers
2091-
.non_indexed
2092-
.data
2093-
.write_buffer(&render_device);
2094-
2095-
phase_indirect_parameters_buffers
2096-
.indexed
2097-
.cpu_metadata
2098-
.write_buffer(&render_device, &render_queue);
2099-
phase_indirect_parameters_buffers
2100-
.non_indexed
2101-
.cpu_metadata
2102-
.write_buffer(&render_device, &render_queue);
2103-
2104-
phase_indirect_parameters_buffers
2105-
.non_indexed
2106-
.gpu_metadata
2107-
.write_buffer(&render_device);
2108-
phase_indirect_parameters_buffers
2109-
.indexed
2110-
.gpu_metadata
2111-
.write_buffer(&render_device);
2112-
2113-
phase_indirect_parameters_buffers
2114-
.indexed
2115-
.batch_sets
2116-
.write_buffer(&render_device, &render_queue);
2117-
phase_indirect_parameters_buffers
2118-
.non_indexed
2119-
.batch_sets
2120-
.write_buffer(&render_device, &render_queue);
2121-
}
2104+
let render_device = &*render_device;
2105+
let render_queue = &*render_queue;
2106+
ComputeTaskPool::get().scope(|scope| {
2107+
for phase_indirect_parameters_buffers in indirect_parameters_buffers.values_mut() {
2108+
scope.spawn(async {
2109+
let _span = tracing::info_span!("indexed_data").entered();
2110+
phase_indirect_parameters_buffers
2111+
.indexed
2112+
.data
2113+
.write_buffer(render_device);
2114+
});
2115+
scope.spawn(async {
2116+
let _span = tracing::info_span!("non_indexed_data").entered();
2117+
phase_indirect_parameters_buffers
2118+
.non_indexed
2119+
.data
2120+
.write_buffer(render_device);
2121+
});
2122+
2123+
scope.spawn(async {
2124+
let _span = tracing::info_span!("indexed_cpu_metadata").entered();
2125+
phase_indirect_parameters_buffers
2126+
.indexed
2127+
.cpu_metadata
2128+
.write_buffer(render_device, render_queue);
2129+
});
2130+
scope.spawn(async {
2131+
let _span = tracing::info_span!("non_indexed_cpu_metadata").entered();
2132+
phase_indirect_parameters_buffers
2133+
.non_indexed
2134+
.cpu_metadata
2135+
.write_buffer(render_device, render_queue);
2136+
});
2137+
2138+
scope.spawn(async {
2139+
let _span = tracing::info_span!("non_indexed_gpu_metadata").entered();
2140+
phase_indirect_parameters_buffers
2141+
.non_indexed
2142+
.gpu_metadata
2143+
.write_buffer(render_device);
2144+
});
2145+
scope.spawn(async {
2146+
let _span = tracing::info_span!("indexed_gpu_metadata").entered();
2147+
phase_indirect_parameters_buffers
2148+
.indexed
2149+
.gpu_metadata
2150+
.write_buffer(render_device);
2151+
});
2152+
2153+
scope.spawn(async {
2154+
let _span = tracing::info_span!("indexed_batch_sets").entered();
2155+
phase_indirect_parameters_buffers
2156+
.indexed
2157+
.batch_sets
2158+
.write_buffer(render_device, render_queue);
2159+
});
2160+
scope.spawn(async {
2161+
let _span = tracing::info_span!("non_indexed_batch_sets").entered();
2162+
phase_indirect_parameters_buffers
2163+
.non_indexed
2164+
.batch_sets
2165+
.write_buffer(render_device, render_queue);
2166+
});
2167+
}
2168+
});
21222169
}
21232170

21242171
#[cfg(test)]

0 commit comments

Comments
 (0)