From 47407e967f9357d421fe124b8bdae8388b891c70 Mon Sep 17 00:00:00 2001 From: Steve <61917452+awildergoose@users.noreply.github.com> Date: Sun, 19 Oct 2025 17:28:43 +0000 Subject: [PATCH 1/6] vk: move parallel aggregator to cpu-side cmake: build with -O3 --- CMakeLists.txt | 6 ++ rpcs3/Emu/RSX/VK/VKCompute.cpp | 181 +++++++++++++++++++++++++++++++-- 2 files changed, 179 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0492d13184e8..91cd7c9b15b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -132,6 +132,12 @@ if(MSVC) add_compile_options(/MP) endif() +if (CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") + add_compile_options(-msse4.1 -mavx2 -mfma -march=native -O3) +elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + add_compile_options(/arch:AVX2) +endif() + if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) message( FATAL_ERROR "RPCS3 can only be compiled on 64-bit platforms." ) endif() diff --git a/rpcs3/Emu/RSX/VK/VKCompute.cpp b/rpcs3/Emu/RSX/VK/VKCompute.cpp index 9e9e2a474640..e1649ef0cc72 100644 --- a/rpcs3/Emu/RSX/VK/VKCompute.cpp +++ b/rpcs3/Emu/RSX/VK/VKCompute.cpp @@ -3,11 +3,122 @@ #include "VKRenderPass.h" #include "vkutils/buffer_object.h" #include "VKPipelineCompiler.h" +#include +#include +#include +#include +#include +#include +#include +#include #define VK_MAX_COMPUTE_TASKS 8192 // Max number of jobs per frame namespace vk { +namespace cpu_compute { + +template +inline void parallel_for(size_t n, size_t numThreads, Worker&& worker) +{ + if (n == 0) return; + numThreads = std::max(1, std::min(numThreads, n)); + std::vector threads; + threads.reserve(numThreads); + size_t chunk = n / numThreads; + size_t rem = n % numThreads; + size_t start = 0; + for (size_t t = 0; t < numThreads; ++t) + { + size_t sz = chunk + (t < rem ? 1 : 0); + size_t s = start; + size_t e = s + sz; + threads.emplace_back([s,e, &worker](){ for (size_t i = s; i < e; ++i) worker(i); }); + start = e; + } + for (auto &th : threads) th.join(); +} + +inline void scatter_d24x8_cpu(uint32_t* base_words, + size_t total_words, + size_t block_length_words, + size_t z_offset_words, + size_t s_offset_words, + size_t numThreads = std::thread::hardware_concurrency()) +{ + if (!base_words) throw std::runtime_error("scatter_d24x8_cpu: null buffer"); + if (block_length_words == 0) return; + + if ((block_length_words + z_offset_words) > total_words) { + throw std::runtime_error("scatter_d24x8_cpu: z-offset out of range"); + } + + size_t stencil_word_count = (block_length_words + 3) / 4; + if ((stencil_word_count + s_offset_words) > total_words) { + throw std::runtime_error("scatter_d24x8_cpu: stencil offset out of range"); + } + + cpu_compute::parallel_for(block_length_words, numThreads, [&](size_t index) { + uint32_t value = base_words[index]; + base_words[index + z_offset_words] = (value >> 8); + + size_t stencil_offset = index >> 2; + uint32_t stencil_shift = static_cast((index & 3) << 3); + uint32_t stencil_mask = (value & 0xFFu) << stencil_shift; + + uint32_t* stencil_word_ptr = base_words + (stencil_offset + s_offset_words); + +#if defined(__cpp_lib_atomic_ref) + std::atomic_ref ref(*stencil_word_ptr); + ref.fetch_or(stencil_mask, std::memory_order_relaxed); +#else + std::atomic* atomic_ptr = reinterpret_cast*>(stencil_word_ptr); + uint32_t oldv = atomic_ptr->load(std::memory_order_relaxed); + uint32_t newv; + do { + newv = oldv | stencil_mask; + } while (!atomic_ptr->compare_exchange_weak(oldv, newv, std::memory_order_relaxed)); +#endif + }); +} + +// parallel sum for u32 source; returns 64-bit sum +inline uint64_t parallel_sum_u32(const uint32_t* src, size_t count, size_t numThreads = std::thread::hardware_concurrency()) +{ + if (count == 0) return 0; + numThreads = std::max(1, std::min(numThreads, count)); + std::vector partials(numThreads, 0); + + size_t chunk = count / numThreads; + size_t rem = count % numThreads; + size_t start = 0; + std::vector threads; + threads.reserve(numThreads); + + for (size_t t = 0; t < numThreads; ++t) + { + size_t sz = chunk + (t < rem ? 1 : 0); + size_t s = start; + size_t e = s + sz; + threads.emplace_back([=, &partials]() { + uint64_t acc = 0; + const uint32_t* p = src + s; + size_t len = e - s; + // plain loop — compiler will usually vectorize + for (size_t i = 0; i < len; ++i) acc += p[i]; + partials[t] = acc; + }); + start = e; + } + for (auto &th : threads) th.join(); + + uint64_t total = 0; + for (auto v : partials) total += v; + return total; +} + +} // namespace cpu_compute + std::vector compute_task::get_inputs() { std::vector result; @@ -68,7 +179,7 @@ namespace vk // Warps are multiples of 32. Increasing kernel depth seems to hurt performance (Nier, Big Duck sample) unroll_loops = true; optimal_kernel_size = 1; - optimal_group_size = 32; + optimal_group_size = 128; //32; break; case vk::driver_vendor::AMD: case vk::driver_vendor::RADV: @@ -299,13 +410,51 @@ namespace vk m_program->bind_uniform({ *m_data, m_data_offset, m_ssbo_length }, 0, 0); } - void cs_interleave_task::run(const vk::command_buffer& cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset) + void cs_interleave_task::run(const vk::command_buffer& /*cmd*/, + const vk::buffer* data, + u32 data_offset, + u32 data_length, + u32 zeta_offset, + u32 stencil_offset) { - m_params = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 }; + size_t hw = std::max(1, std::thread::hardware_concurrency()); + size_t numThreads = std::min(hw, 8); + m_params = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 }; ensure(stencil_offset > data_offset); - m_ssbo_length = stencil_offset + (data_length / 4) - data_offset; - cs_shuffle_base::run(cmd, data, data_length, data_offset); + + void* mapped = const_cast(data)->map(data_offset, data_length); + if (!mapped) throw std::runtime_error("cs_interleave_task::run: failed to map buffer"); + + uint8_t* base_bytes = reinterpret_cast(mapped); + uint8_t* region_start = base_bytes + data_offset; + uint32_t* words = reinterpret_cast(region_start); + + const size_t region_total_words = (data->size() - data_offset) / 4; + const size_t block_length_words = data_length / 4; + const size_t z_offset_words = (zeta_offset - data_offset) / 4; + const size_t s_offset_words = (stencil_offset - data_offset) / 4; + + if ((block_length_words + z_offset_words) > region_total_words) { + const_cast(data)->unmap(); + throw std::runtime_error("cs_interleave_task::run: z_offset out of bounds"); + } + size_t stencil_word_count = (block_length_words + 3) / 4; + if ((stencil_word_count + s_offset_words) > region_total_words) { + const_cast(data)->unmap(); + throw std::runtime_error("cs_interleave_task::run: stencil offset out of bounds"); + } + + cpu_compute::scatter_d24x8_cpu( + /*base_words=*/ words, + /*total_words=*/ region_total_words, + /*block_length_words=*/ block_length_words, + /*z_offset_words=*/ z_offset_words, + /*s_offset_words=*/ s_offset_words, + /*numThreads=*/ numThreads + ); + + const_cast(data)->unmap(); } cs_scatter_d24x8::cs_scatter_d24x8() @@ -359,14 +508,30 @@ namespace vk m_program->bind_uniform({ *dst, 0, 4 }, 0, 1); } - void cs_aggregator::run(const vk::command_buffer& cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words) + void cs_aggregator::run(const vk::command_buffer& /*cmd*/, const vk::buffer* dst, const vk::buffer* src, u32 num_words) { this->dst = dst; this->src = src; word_count = num_words; block_length = num_words * 4; - const u32 linear_invocations = utils::aligned_div(word_count, optimal_group_size); - compute_task::run(cmd, linear_invocations); + size_t hw = std::max(1, std::thread::hardware_concurrency()); + size_t numThreads = std::min(hw, 8); + + void* src_map = const_cast(src)->map(0, num_words * 4); + if (!src_map) throw std::runtime_error("cs_aggregator::run: failed to map src buffer"); + + const uint32_t* src_words = reinterpret_cast(src_map); + uint64_t sum = cpu_compute::parallel_sum_u32(src_words, static_cast(num_words), numThreads); + + const_cast(src)->unmap(); + + void* dst_map = const_cast(dst)->map(0, 4); + if (!dst_map) throw std::runtime_error("cs_aggregator::run: failed to map dst buffer"); + + uint32_t result32 = static_cast(sum); + std::memcpy(dst_map, &result32, sizeof(result32)); + + const_cast(dst)->unmap(); } } From fcc295ffac47ed12d3575700b25b6257bde7b180 Mon Sep 17 00:00:00 2001 From: Steve <61917452+awildergoose@users.noreply.github.com> Date: Sun, 19 Oct 2025 17:46:03 +0000 Subject: [PATCH 2/6] chore: dont throw! --- rpcs3/Emu/RSX/VK/VKCompute.cpp | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/rpcs3/Emu/RSX/VK/VKCompute.cpp b/rpcs3/Emu/RSX/VK/VKCompute.cpp index e1649ef0cc72..913874060f2b 100644 --- a/rpcs3/Emu/RSX/VK/VKCompute.cpp +++ b/rpcs3/Emu/RSX/VK/VKCompute.cpp @@ -46,16 +46,21 @@ inline void scatter_d24x8_cpu(uint32_t* base_words, size_t s_offset_words, size_t numThreads = std::thread::hardware_concurrency()) { - if (!base_words) throw std::runtime_error("scatter_d24x8_cpu: null buffer"); + if (!base_words) { + rsx_log.error("scatter_d24x8_cpu: null buffer"); + return; + } if (block_length_words == 0) return; if ((block_length_words + z_offset_words) > total_words) { - throw std::runtime_error("scatter_d24x8_cpu: z-offset out of range"); + rsx_log.error("scatter_d24x8_cpu: z-offset out of range"); + return; } size_t stencil_word_count = (block_length_words + 3) / 4; if ((stencil_word_count + s_offset_words) > total_words) { - throw std::runtime_error("scatter_d24x8_cpu: stencil offset out of range"); + rsx_log.error("scatter_d24x8_cpu: stencil offset out of range"); + return; } cpu_compute::parallel_for(block_length_words, numThreads, [&](size_t index) { @@ -424,7 +429,10 @@ inline uint64_t parallel_sum_u32(const uint32_t* src, size_t count, size_t numTh ensure(stencil_offset > data_offset); void* mapped = const_cast(data)->map(data_offset, data_length); - if (!mapped) throw std::runtime_error("cs_interleave_task::run: failed to map buffer"); + if (!mapped) { + rsx_log.error("cs_interleave_task::run: failed to map buffer"); + return; + } uint8_t* base_bytes = reinterpret_cast(mapped); uint8_t* region_start = base_bytes + data_offset; @@ -437,12 +445,14 @@ inline uint64_t parallel_sum_u32(const uint32_t* src, size_t count, size_t numTh if ((block_length_words + z_offset_words) > region_total_words) { const_cast(data)->unmap(); - throw std::runtime_error("cs_interleave_task::run: z_offset out of bounds"); + rsx_log.error("cs_interleave_task::run: z_offset out of bounds"); + return; } size_t stencil_word_count = (block_length_words + 3) / 4; if ((stencil_word_count + s_offset_words) > region_total_words) { const_cast(data)->unmap(); - throw std::runtime_error("cs_interleave_task::run: stencil offset out of bounds"); + rsx_log.error("cs_interleave_task::run: stencil offset out of bounds"); + return; } cpu_compute::scatter_d24x8_cpu( @@ -519,7 +529,10 @@ inline uint64_t parallel_sum_u32(const uint32_t* src, size_t count, size_t numTh size_t numThreads = std::min(hw, 8); void* src_map = const_cast(src)->map(0, num_words * 4); - if (!src_map) throw std::runtime_error("cs_aggregator::run: failed to map src buffer"); + if (!src_map) { + rsx_log.error("cs_aggregator::run: failed to map src buffer"); + return; + } const uint32_t* src_words = reinterpret_cast(src_map); uint64_t sum = cpu_compute::parallel_sum_u32(src_words, static_cast(num_words), numThreads); @@ -527,7 +540,10 @@ inline uint64_t parallel_sum_u32(const uint32_t* src, size_t count, size_t numTh const_cast(src)->unmap(); void* dst_map = const_cast(dst)->map(0, 4); - if (!dst_map) throw std::runtime_error("cs_aggregator::run: failed to map dst buffer"); + if (!dst_map) { + rsx_log.error("cs_aggregator::run: failed to map dst buffer"); + return; + } uint32_t result32 = static_cast(sum); std::memcpy(dst_map, &result32, sizeof(result32)); From 68644efb57a8bc5b3e95f2baa489905b81498aa0 Mon Sep 17 00:00:00 2001 From: Steve <61917452+awildergoose@users.noreply.github.com> Date: Sun, 19 Oct 2025 18:48:50 +0000 Subject: [PATCH 3/6] vk: revert gpu/cpu compute changes cmake: dont use sse and avx and other flags on arm64/aarch64 --- CMakeLists.txt | 8 +- rpcs3/Emu/RSX/VK/VKCompute.cpp | 199 ++------------------------------- 2 files changed, 15 insertions(+), 192 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 91cd7c9b15b3..1c5fcf763c83 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -133,9 +133,13 @@ if(MSVC) endif() if (CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") - add_compile_options(-msse4.1 -mavx2 -mfma -march=native -O3) + if (NOT CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64") + add_compile_options(-msse4.1 -mavx2 -mfma -march=native -O3) + else() + add_compile_options(-O3) + endif() elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC") - add_compile_options(/arch:AVX2) + add_compile_options(/arch:AVX2 /O2) endif() if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) diff --git a/rpcs3/Emu/RSX/VK/VKCompute.cpp b/rpcs3/Emu/RSX/VK/VKCompute.cpp index 913874060f2b..ac056cddb962 100644 --- a/rpcs3/Emu/RSX/VK/VKCompute.cpp +++ b/rpcs3/Emu/RSX/VK/VKCompute.cpp @@ -3,127 +3,11 @@ #include "VKRenderPass.h" #include "vkutils/buffer_object.h" #include "VKPipelineCompiler.h" -#include -#include -#include -#include -#include -#include -#include -#include #define VK_MAX_COMPUTE_TASKS 8192 // Max number of jobs per frame namespace vk { -namespace cpu_compute { - -template -inline void parallel_for(size_t n, size_t numThreads, Worker&& worker) -{ - if (n == 0) return; - numThreads = std::max(1, std::min(numThreads, n)); - std::vector threads; - threads.reserve(numThreads); - size_t chunk = n / numThreads; - size_t rem = n % numThreads; - size_t start = 0; - for (size_t t = 0; t < numThreads; ++t) - { - size_t sz = chunk + (t < rem ? 1 : 0); - size_t s = start; - size_t e = s + sz; - threads.emplace_back([s,e, &worker](){ for (size_t i = s; i < e; ++i) worker(i); }); - start = e; - } - for (auto &th : threads) th.join(); -} - -inline void scatter_d24x8_cpu(uint32_t* base_words, - size_t total_words, - size_t block_length_words, - size_t z_offset_words, - size_t s_offset_words, - size_t numThreads = std::thread::hardware_concurrency()) -{ - if (!base_words) { - rsx_log.error("scatter_d24x8_cpu: null buffer"); - return; - } - if (block_length_words == 0) return; - - if ((block_length_words + z_offset_words) > total_words) { - rsx_log.error("scatter_d24x8_cpu: z-offset out of range"); - return; - } - - size_t stencil_word_count = (block_length_words + 3) / 4; - if ((stencil_word_count + s_offset_words) > total_words) { - rsx_log.error("scatter_d24x8_cpu: stencil offset out of range"); - return; - } - - cpu_compute::parallel_for(block_length_words, numThreads, [&](size_t index) { - uint32_t value = base_words[index]; - base_words[index + z_offset_words] = (value >> 8); - - size_t stencil_offset = index >> 2; - uint32_t stencil_shift = static_cast((index & 3) << 3); - uint32_t stencil_mask = (value & 0xFFu) << stencil_shift; - - uint32_t* stencil_word_ptr = base_words + (stencil_offset + s_offset_words); - -#if defined(__cpp_lib_atomic_ref) - std::atomic_ref ref(*stencil_word_ptr); - ref.fetch_or(stencil_mask, std::memory_order_relaxed); -#else - std::atomic* atomic_ptr = reinterpret_cast*>(stencil_word_ptr); - uint32_t oldv = atomic_ptr->load(std::memory_order_relaxed); - uint32_t newv; - do { - newv = oldv | stencil_mask; - } while (!atomic_ptr->compare_exchange_weak(oldv, newv, std::memory_order_relaxed)); -#endif - }); -} - -// parallel sum for u32 source; returns 64-bit sum -inline uint64_t parallel_sum_u32(const uint32_t* src, size_t count, size_t numThreads = std::thread::hardware_concurrency()) -{ - if (count == 0) return 0; - numThreads = std::max(1, std::min(numThreads, count)); - std::vector partials(numThreads, 0); - - size_t chunk = count / numThreads; - size_t rem = count % numThreads; - size_t start = 0; - std::vector threads; - threads.reserve(numThreads); - - for (size_t t = 0; t < numThreads; ++t) - { - size_t sz = chunk + (t < rem ? 1 : 0); - size_t s = start; - size_t e = s + sz; - threads.emplace_back([=, &partials]() { - uint64_t acc = 0; - const uint32_t* p = src + s; - size_t len = e - s; - // plain loop — compiler will usually vectorize - for (size_t i = 0; i < len; ++i) acc += p[i]; - partials[t] = acc; - }); - start = e; - } - for (auto &th : threads) th.join(); - - uint64_t total = 0; - for (auto v : partials) total += v; - return total; -} - -} // namespace cpu_compute - std::vector compute_task::get_inputs() { std::vector result; @@ -184,7 +68,7 @@ inline uint64_t parallel_sum_u32(const uint32_t* src, size_t count, size_t numTh // Warps are multiples of 32. Increasing kernel depth seems to hurt performance (Nier, Big Duck sample) unroll_loops = true; optimal_kernel_size = 1; - optimal_group_size = 128; //32; + optimal_group_size = 32; break; case vk::driver_vendor::AMD: case vk::driver_vendor::RADV: @@ -415,56 +299,13 @@ inline uint64_t parallel_sum_u32(const uint32_t* src, size_t count, size_t numTh m_program->bind_uniform({ *m_data, m_data_offset, m_ssbo_length }, 0, 0); } - void cs_interleave_task::run(const vk::command_buffer& /*cmd*/, - const vk::buffer* data, - u32 data_offset, - u32 data_length, - u32 zeta_offset, - u32 stencil_offset) + void cs_interleave_task::run(const vk::command_buffer& cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset) { - size_t hw = std::max(1, std::thread::hardware_concurrency()); - size_t numThreads = std::min(hw, 8); - m_params = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 }; - ensure(stencil_offset > data_offset); - void* mapped = const_cast(data)->map(data_offset, data_length); - if (!mapped) { - rsx_log.error("cs_interleave_task::run: failed to map buffer"); - return; - } - - uint8_t* base_bytes = reinterpret_cast(mapped); - uint8_t* region_start = base_bytes + data_offset; - uint32_t* words = reinterpret_cast(region_start); - - const size_t region_total_words = (data->size() - data_offset) / 4; - const size_t block_length_words = data_length / 4; - const size_t z_offset_words = (zeta_offset - data_offset) / 4; - const size_t s_offset_words = (stencil_offset - data_offset) / 4; - - if ((block_length_words + z_offset_words) > region_total_words) { - const_cast(data)->unmap(); - rsx_log.error("cs_interleave_task::run: z_offset out of bounds"); - return; - } - size_t stencil_word_count = (block_length_words + 3) / 4; - if ((stencil_word_count + s_offset_words) > region_total_words) { - const_cast(data)->unmap(); - rsx_log.error("cs_interleave_task::run: stencil offset out of bounds"); - return; - } - - cpu_compute::scatter_d24x8_cpu( - /*base_words=*/ words, - /*total_words=*/ region_total_words, - /*block_length_words=*/ block_length_words, - /*z_offset_words=*/ z_offset_words, - /*s_offset_words=*/ s_offset_words, - /*numThreads=*/ numThreads - ); - - const_cast(data)->unmap(); + ensure(stencil_offset > data_offset); + m_ssbo_length = stencil_offset + (data_length / 4) - data_offset; + cs_shuffle_base::run(cmd, data, data_length, data_offset); } cs_scatter_d24x8::cs_scatter_d24x8() @@ -518,36 +359,14 @@ inline uint64_t parallel_sum_u32(const uint32_t* src, size_t count, size_t numTh m_program->bind_uniform({ *dst, 0, 4 }, 0, 1); } - void cs_aggregator::run(const vk::command_buffer& /*cmd*/, const vk::buffer* dst, const vk::buffer* src, u32 num_words) + void cs_aggregator::run(const vk::command_buffer& cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words) { this->dst = dst; this->src = src; word_count = num_words; block_length = num_words * 4; - size_t hw = std::max(1, std::thread::hardware_concurrency()); - size_t numThreads = std::min(hw, 8); - - void* src_map = const_cast(src)->map(0, num_words * 4); - if (!src_map) { - rsx_log.error("cs_aggregator::run: failed to map src buffer"); - return; - } - - const uint32_t* src_words = reinterpret_cast(src_map); - uint64_t sum = cpu_compute::parallel_sum_u32(src_words, static_cast(num_words), numThreads); - - const_cast(src)->unmap(); - - void* dst_map = const_cast(dst)->map(0, 4); - if (!dst_map) { - rsx_log.error("cs_aggregator::run: failed to map dst buffer"); - return; - } - - uint32_t result32 = static_cast(sum); - std::memcpy(dst_map, &result32, sizeof(result32)); - - const_cast(dst)->unmap(); + const u32 linear_invocations = utils::aligned_div(word_count, optimal_group_size); + compute_task::run(cmd, linear_invocations); } -} +} \ No newline at end of file From f932632d9a2aab6c78376f29ac67cd7cb2c100c7 Mon Sep 17 00:00:00 2001 From: Steve <61917452+awildergoose@users.noreply.github.com> Date: Sun, 19 Oct 2025 22:44:31 +0300 Subject: [PATCH 4/6] chore: fix clang build fail --- 3rdparty/OpenAL/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/3rdparty/OpenAL/CMakeLists.txt b/3rdparty/OpenAL/CMakeLists.txt index b9fee23ce5fc..368b80c54fa0 100644 --- a/3rdparty/OpenAL/CMakeLists.txt +++ b/3rdparty/OpenAL/CMakeLists.txt @@ -13,6 +13,7 @@ else() option(ALSOFT_EXAMPLES "Build example programs" OFF) set(LIBTYPE "STATIC") add_subdirectory(openal-soft EXCLUDE_FROM_ALL) + target_compile_options(alsoft.fmt PRIVATE -include cstdlib) add_library(3rdparty_openal INTERFACE) target_link_libraries(3rdparty_openal INTERFACE OpenAL::OpenAL) endif() From e1b4d963afd4b5a463f8a9d590d2861bf9fa53f7 Mon Sep 17 00:00:00 2001 From: Steve <61917452+awildergoose@users.noreply.github.com> Date: Sun, 19 Oct 2025 23:00:01 +0300 Subject: [PATCH 5/6] fix win clang --- 3rdparty/OpenAL/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/3rdparty/OpenAL/CMakeLists.txt b/3rdparty/OpenAL/CMakeLists.txt index 368b80c54fa0..b9fee23ce5fc 100644 --- a/3rdparty/OpenAL/CMakeLists.txt +++ b/3rdparty/OpenAL/CMakeLists.txt @@ -13,7 +13,6 @@ else() option(ALSOFT_EXAMPLES "Build example programs" OFF) set(LIBTYPE "STATIC") add_subdirectory(openal-soft EXCLUDE_FROM_ALL) - target_compile_options(alsoft.fmt PRIVATE -include cstdlib) add_library(3rdparty_openal INTERFACE) target_link_libraries(3rdparty_openal INTERFACE OpenAL::OpenAL) endif() From 4cc1a3952b10300e54ee664c73f31f8b607f15a6 Mon Sep 17 00:00:00 2001 From: Steve <61917452+awildergoose@users.noreply.github.com> Date: Sun, 19 Oct 2025 23:04:54 +0300 Subject: [PATCH 6/6] Add compiler options for OpenAL based on compiler --- 3rdparty/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 46307746a319..2e6207e96be6 100644 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -244,6 +244,12 @@ if(USE_SDL) endif() # OpenAL +if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU") + add_compile_options(-include cstdlib) +elseif(MSVC) + add_compile_options(/FI cstdlib) +endif() + if (NOT ANDROID) add_subdirectory(OpenAL EXCLUDE_FROM_ALL) else()