diff --git a/.gitmodules b/.gitmodules index 8f9772d342..82e532096b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -18,6 +18,10 @@ path = dependencies/imgui url = https://github.com/ocornut/imgui shallow = true +[submodule "dependencies/metal-cpp"] + path = dependencies/metal-cpp + url = https://github.com/bkaradzic/metal-cpp.git + shallow = true [submodule "dependencies/xbyak_aarch64"] path = dependencies/xbyak_aarch64 url = https://github.com/fujitsu/xbyak_aarch64 diff --git a/CMakeLists.txt b/CMakeLists.txt index 33a994d3ef..f71595f64d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -102,10 +102,20 @@ if (UNIX AND NOT APPLE) option(ENABLE_BLUEZ "Build with Bluez support" ON) endif() +if (APPLE) + set(ENABLE_METAL_DEFAULT ON) +else() + set(ENABLE_METAL_DEFAULT OFF) +endif() + option(ENABLE_OPENGL "Enables the OpenGL backend" ON) option(ENABLE_VULKAN "Enables the Vulkan backend" ON) +option(ENABLE_METAL "Enables the Metal backend" ${ENABLE_METAL_DEFAULT}) option(ENABLE_DISCORD_RPC "Enables the Discord Rich Presence feature" ON) +if (ENABLE_METAL AND NOT APPLE) + message(FATAL_ERROR "Metal backend is only supported on Apple platforms") +endif() # input backends if (WIN32) @@ -180,6 +190,12 @@ if (ENABLE_OPENGL) find_package(OpenGL REQUIRED) endif() +if (ENABLE_METAL) + include_directories(${CMAKE_SOURCE_DIR}/dependencies/metal-cpp) + + add_definitions(-DENABLE_METAL=1) +endif() + if (ENABLE_DISCORD_RPC) add_compile_definitions(ENABLE_DISCORD_RPC) endif() @@ -204,7 +220,7 @@ endif() if (ENABLE_CUBEB) if (NOT ENABLE_VCPKG) - find_package(cubeb) + find_package(cubeb) endif() if (NOT cubeb_FOUND) option(BUILD_TESTS "" OFF) diff --git a/dependencies/metal-cpp b/dependencies/metal-cpp new file mode 160000 index 0000000000..a63bd172dd --- /dev/null +++ b/dependencies/metal-cpp @@ -0,0 +1 @@ +Subproject commit a63bd172ddcba73a3d87ca32032b66ad41ddb9a6 diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index 1b0def8496..596d7139ae 100644 --- 
a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -534,7 +534,69 @@ add_library(CemuCafe ) if(APPLE) - target_sources(CemuCafe PRIVATE "HW/Latte/Renderer/Vulkan/CocoaSurface.mm") + target_sources(CemuCafe PRIVATE + HW/Latte/Renderer/Vulkan/CocoaSurface.mm + HW/Latte/Renderer/MetalView.mm + HW/Latte/Renderer/MetalView.h + ) +endif() + +if(ENABLE_METAL) + target_sources(CemuCafe PRIVATE + HW/Latte/Renderer/Metal/CachedFBOMtl.cpp + HW/Latte/Renderer/Metal/CachedFBOMtl.h + HW/Latte/Renderer/Metal/LatteTextureMtl.cpp + HW/Latte/Renderer/Metal/LatteTextureMtl.h + HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp + HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h + HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp + HW/Latte/Renderer/Metal/LatteTextureViewMtl.h + HW/Latte/Renderer/Metal/LatteToMtl.cpp + HW/Latte/Renderer/Metal/LatteToMtl.h + HW/Latte/Renderer/Metal/MetalAttachmentsInfo.cpp + HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h + HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp + HW/Latte/Renderer/Metal/MetalBufferAllocator.h + HW/Latte/Renderer/Metal/MetalCommon.h + HW/Latte/Renderer/Metal/MetalCppImpl.cpp + HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp + HW/Latte/Renderer/Metal/MetalDepthStencilCache.h + HW/Latte/Renderer/Metal/MetalLayer.h + HW/Latte/Renderer/Metal/MetalLayer.mm + HW/Latte/Renderer/Metal/MetalLayerHandle.cpp + HW/Latte/Renderer/Metal/MetalLayerHandle.h + HW/Latte/Renderer/Metal/MetalMemoryManager.cpp + HW/Latte/Renderer/Metal/MetalMemoryManager.h + HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp + HW/Latte/Renderer/Metal/MetalOutputShaderCache.h + HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h + HW/Latte/Renderer/Metal/MetalPipelineCache.cpp + HW/Latte/Renderer/Metal/MetalPipelineCache.h + HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp + HW/Latte/Renderer/Metal/MetalPipelineCompiler.h + HW/Latte/Renderer/Metal/MetalQuery.cpp + HW/Latte/Renderer/Metal/MetalQuery.h + HW/Latte/Renderer/Metal/MetalRenderer.cpp + 
HW/Latte/Renderer/Metal/MetalRenderer.h + HW/Latte/Renderer/Metal/MetalSamplerCache.cpp + HW/Latte/Renderer/Metal/MetalSamplerCache.h + HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp + HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h + HW/Latte/Renderer/Metal/RendererShaderMtl.cpp + HW/Latte/Renderer/Metal/RendererShaderMtl.h + HW/Latte/Renderer/Metal/UtilityShaderSource.h + ) + + target_sources(CemuCafe PRIVATE + HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp + HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp + HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp + ) + + target_link_libraries(CemuCafe PRIVATE + "-framework Metal" + "-framework QuartzCore" + ) endif() if(CEMU_ARCHITECTURE MATCHES "(aarch64)|(AARCH64)|(arm64)|(ARM64)") diff --git a/src/Cafe/CafeSystem.cpp b/src/Cafe/CafeSystem.cpp index abfda232ae..49650017a3 100644 --- a/src/Cafe/CafeSystem.cpp +++ b/src/Cafe/CafeSystem.cpp @@ -249,7 +249,17 @@ void InfoLog_PrintActiveSettings() if (ActiveSettings::GetGraphicsAPI() == GraphicAPI::kVulkan) { cemuLog_log(LogType::Force, "Async compile: {}", GetConfig().async_compile.GetValue() ? "true" : "false"); - if(!GetConfig().vk_accurate_barriers.GetValue()) + if (!GetConfig().vk_accurate_barriers.GetValue()) + cemuLog_log(LogType::Force, "Accurate barriers are disabled!"); + } + else if (ActiveSettings::GetGraphicsAPI() == GraphicAPI::kMetal) + { + cemuLog_log(LogType::Force, "Async compile: {}", GetConfig().async_compile.GetValue() ? "true" : "false"); + cemuLog_log(LogType::Force, "Force mesh shaders: {}", GetConfig().force_mesh_shaders.GetValue() ? "true" : "false"); + cemuLog_log(LogType::Force, "Fast math: {}", g_current_game_profile->GetShaderFastMath() ? 
"true" : "false"); + cemuLog_log(LogType::Force, "Buffer cache type: {}", g_current_game_profile->GetBufferCacheMode()); + cemuLog_log(LogType::Force, "Position invariance: {}", g_current_game_profile->GetPositionInvariance()); + if (!GetConfig().vk_accurate_barriers.GetValue()) cemuLog_log(LogType::Force, "Accurate barriers are disabled!"); } cemuLog_log(LogType::Force, "Console language: {}", stdx::to_underlying(config.console_language.GetValue())); @@ -1027,7 +1037,7 @@ namespace CafeSystem { // starting with Cemu 1.27.0 /vol/storage_mlc01/ is virtualized, meaning that it doesn't point to one singular host os folder anymore // instead it now uses a more complex solution to source titles with various formats (folder, wud, wua) from the game paths and host mlc path - + // todo - mount /vol/storage_mlc01/ with base priority to the host mlc? // since mounting titles is an expensive operation we have to avoid mounting all titles at once diff --git a/src/Cafe/GameProfile/GameProfile.cpp b/src/Cafe/GameProfile/GameProfile.cpp index 286f341158..b1a0ad7080 100644 --- a/src/Cafe/GameProfile/GameProfile.cpp +++ b/src/Cafe/GameProfile/GameProfile.cpp @@ -127,7 +127,7 @@ bool gameProfile_loadIntegerOption(IniParser& iniParser, const char* optionName, { cemuLog_log(LogType::Force, "Value '{}' is out of range for option '{}' in game profile", *option_value, optionName); return false; - } + } } template @@ -224,8 +224,11 @@ bool GameProfile::Load(uint64_t title_id) gameProfile_loadIntegerOption(&iniParser, "graphics_api", &graphicsApi, -1, 0, 1); if (graphicsApi.value != -1) m_graphics_api = (GraphicAPI)graphicsApi.value; - + gameProfile_loadEnumOption(iniParser, "accurateShaderMul", m_accurateShaderMul); + gameProfile_loadBooleanOption2(iniParser, "shaderFastMath", m_shaderFastMath); + gameProfile_loadEnumOption(iniParser, "metalBufferCacheMode2", m_metalBufferCacheMode); + gameProfile_loadEnumOption(iniParser, "positionInvariance2", m_positionInvariance); // legacy support 
auto option_precompiledShaders = iniParser.FindOption("precompiledShaders"); @@ -277,7 +280,7 @@ bool GameProfile::Load(uint64_t title_id) void GameProfile::Save(uint64_t title_id) { auto gameProfileDir = ActiveSettings::GetConfigPath("gameProfiles"); - if (std::error_code ex_ec; !fs::exists(gameProfileDir, ex_ec)) + if (std::error_code ex_ec; !fs::exists(gameProfileDir, ex_ec)) fs::create_directories(gameProfileDir, ex_ec); auto gameProfilePath = gameProfileDir / fmt::format("{:016x}.ini", title_id); FileStream* fs = FileStream::createFile2(gameProfilePath); @@ -292,22 +295,23 @@ void GameProfile::Save(uint64_t title_id) #define WRITE_OPTIONAL_ENTRY(__NAME) if (m_##__NAME) fs->writeLine(fmt::format("{} = {}", #__NAME, m_##__NAME.value()).c_str()); #define WRITE_ENTRY(__NAME) fs->writeLine(fmt::format("{} = {}", #__NAME, m_##__NAME).c_str()); +#define WRITE_ENTRY_NUMBERED(__NAME, __NUM) fs->writeLine(fmt::format("{} = {}", #__NAME #__NUM, m_##__NAME).c_str()); fs->writeLine("[General]"); WRITE_OPTIONAL_ENTRY(loadSharedLibraries); WRITE_ENTRY(startWithPadView); - fs->writeLine(""); - fs->writeLine("[CPU]"); WRITE_OPTIONAL_ENTRY(cpuMode); WRITE_ENTRY(threadQuantum); - fs->writeLine(""); fs->writeLine("[Graphics]"); WRITE_ENTRY(accurateShaderMul); + WRITE_ENTRY(shaderFastMath); + WRITE_ENTRY_NUMBERED(metalBufferCacheMode, 2); + WRITE_ENTRY_NUMBERED(positionInvariance, 2); WRITE_OPTIONAL_ENTRY(precompiledShaders); WRITE_OPTIONAL_ENTRY(graphics_api); fs->writeLine(""); @@ -323,6 +327,7 @@ void GameProfile::Save(uint64_t title_id) #undef WRITE_OPTIONAL_ENTRY #undef WRITE_ENTRY +#undef WRITE_ENTRY_NUMBERED delete fs; } @@ -337,6 +342,9 @@ void GameProfile::ResetOptional() // graphic settings m_accurateShaderMul = AccurateShaderMulOption::True; + m_shaderFastMath = true; + m_metalBufferCacheMode = MetalBufferCacheMode::Auto; + m_positionInvariance = PositionInvariance::Auto; // cpu settings m_threadQuantum = kThreadQuantumDefault; m_cpuMode.reset(); // 
CPUModeOption::kSingleCoreRecompiler; @@ -354,9 +362,12 @@ void GameProfile::Reset() // general settings m_loadSharedLibraries = true; m_startWithPadView = false; - + // graphic settings m_accurateShaderMul = AccurateShaderMulOption::True; + m_shaderFastMath = true; + m_metalBufferCacheMode = MetalBufferCacheMode::Auto; + m_positionInvariance = PositionInvariance::Auto; m_precompiledShaders = PrecompiledShaderOption::Auto; // cpu settings m_threadQuantum = kThreadQuantumDefault; diff --git a/src/Cafe/GameProfile/GameProfile.h b/src/Cafe/GameProfile/GameProfile.h index 6a1f2ebd6d..4885d4fc09 100644 --- a/src/Cafe/GameProfile/GameProfile.h +++ b/src/Cafe/GameProfile/GameProfile.h @@ -31,6 +31,9 @@ class GameProfile [[nodiscard]] const std::optional& GetGraphicsAPI() const { return m_graphics_api; } [[nodiscard]] const AccurateShaderMulOption& GetAccurateShaderMul() const { return m_accurateShaderMul; } + [[nodiscard]] bool GetShaderFastMath() const { return m_shaderFastMath; } + [[nodiscard]] MetalBufferCacheMode GetBufferCacheMode() const { return m_metalBufferCacheMode; } + [[nodiscard]] PositionInvariance GetPositionInvariance() const { return m_positionInvariance; } [[nodiscard]] const std::optional& GetPrecompiledShadersState() const { return m_precompiledShaders; } [[nodiscard]] uint32 GetThreadQuantum() const { return m_threadQuantum; } @@ -54,6 +57,9 @@ class GameProfile // graphic settings std::optional m_graphics_api{}; AccurateShaderMulOption m_accurateShaderMul = AccurateShaderMulOption::True; + bool m_shaderFastMath = true; + MetalBufferCacheMode m_metalBufferCacheMode = MetalBufferCacheMode::Auto; + PositionInvariance m_positionInvariance = PositionInvariance::Auto; std::optional m_precompiledShaders{}; // cpu settings uint32 m_threadQuantum = kThreadQuantumDefault; // values: 20000 45000 60000 80000 100000 diff --git a/src/Cafe/GraphicPack/GraphicPack2.cpp b/src/Cafe/GraphicPack/GraphicPack2.cpp index 98149c938e..aded91885b 100644 --- 
a/src/Cafe/GraphicPack/GraphicPack2.cpp +++ b/src/Cafe/GraphicPack/GraphicPack2.cpp @@ -111,7 +111,7 @@ bool GraphicPack2::LoadGraphicPack(const fs::path& rulesPath, IniParser& rules) gp->SetActivePreset(kv.first, kv.second, false); } - + gp->SetEnabled(enabled); } @@ -144,7 +144,7 @@ bool GraphicPack2::DeactivateGraphicPack(const std::shared_ptr& gr if (!graphic_pack->IsActivated()) return false; - const auto it = std::find_if(s_active_graphic_packs.begin(), s_active_graphic_packs.end(), + const auto it = std::find_if(s_active_graphic_packs.begin(), s_active_graphic_packs.end(), [graphic_pack](const GraphicPackPtr& gp) { return gp->GetNormalizedPathString() == graphic_pack->GetNormalizedPathString(); @@ -273,6 +273,8 @@ GraphicPack2::GraphicPack2(fs::path rulesPath, IniParser& rules) m_renderer_api = RendererAPI::Vulkan; else if (boost::iequals(*option_rendererFilter, "opengl")) m_renderer_api = RendererAPI::OpenGL; + else if (boost::iequals(*option_rendererFilter, "metal")) + m_renderer_api = RendererAPI::Metal; else cemuLog_log(LogType::Force, "Unknown value '{}' for rendererFilter option", *option_rendererFilter); } @@ -352,7 +354,7 @@ GraphicPack2::GraphicPack2(fs::path rulesPath, IniParser& rules) cemuLog_log(LogType::Force, "Graphic pack \"{}\": Preset in line {} skipped because it has no name option defined", GetNormalizedPathString(), rules.GetCurrentSectionLineNumber()); continue; } - + const auto category = rules.FindOption("category"); const auto condition = rules.FindOption("condition"); const auto default_selected = rules.FindOption("default"); @@ -424,13 +426,13 @@ GraphicPack2::GraphicPack2(fs::path rulesPath, IniParser& rules) { // store by category std::unordered_map> tmp_map; - + // all vars must be defined in the default preset vars before std::vector> mismatchingPresetVars; for (const auto& presetEntry : m_presets) { tmp_map[presetEntry->category].emplace_back(presetEntry); - + for (auto& presetVar : presetEntry->variables) { const auto it = 
m_preset_vars.find(presetVar.first); @@ -572,7 +574,7 @@ void GraphicPack2::ValidatePresetSelections() // // example: a preset category might be hidden entirely (e.g. due to a separate advanced options dropdown) // how to handle: leave the previously selected preset - // + // // the logic is therefore as follows: // if there is a preset category with at least 1 visible preset entry then make sure one of those is actually selected // for completely hidden preset categories we leave the selection as-is @@ -636,17 +638,17 @@ bool GraphicPack2::SetActivePreset(std::string_view category, std::string_view n // disable currently active preset std::for_each(m_presets.begin(), m_presets.end(), [category](PresetPtr& p) { - if(p->category == category) + if(p->category == category) p->active = false; }); - + if (name.empty()) return true; - + // enable new preset const auto it = std::find_if(m_presets.cbegin(), m_presets.cend(), [category, name](const PresetPtr& preset) { - return preset->category == category && preset->name == name; + return preset->category == category && preset->name == name; }); bool result; @@ -685,12 +687,14 @@ void GraphicPack2::LoadShaders() wchar_t shader_type[256]{}; if (filename.size() < 256 && swscanf(filename.c_str(), L"%" SCNx64 "_%" SCNx64 "_%ls", &shader_base_hash, &shader_aux_hash, shader_type) == 3) { + bool isMetalShader = (shader_type[2] == '_' && shader_type[3] == 'm' && shader_type[4] == 's' && shader_type[5] == 'l'); + if (shader_type[0] == 'p' && shader_type[1] == 's') - m_custom_shaders.emplace_back(LoadShader(p, shader_base_hash, shader_aux_hash, GP_SHADER_TYPE::PIXEL)); + m_custom_shaders.emplace_back(LoadShader(p, shader_base_hash, shader_aux_hash, GP_SHADER_TYPE::PIXEL, isMetalShader)); else if (shader_type[0] == 'v' && shader_type[1] == 's') - m_custom_shaders.emplace_back(LoadShader(p, shader_base_hash, shader_aux_hash, GP_SHADER_TYPE::VERTEX)); + m_custom_shaders.emplace_back(LoadShader(p, shader_base_hash, shader_aux_hash, 
GP_SHADER_TYPE::VERTEX, isMetalShader)); else if (shader_type[0] == 'g' && shader_type[1] == 's') - m_custom_shaders.emplace_back(LoadShader(p, shader_base_hash, shader_aux_hash, GP_SHADER_TYPE::GEOMETRY)); + m_custom_shaders.emplace_back(LoadShader(p, shader_base_hash, shader_aux_hash, GP_SHADER_TYPE::GEOMETRY, isMetalShader)); } else if (filename == L"output.glsl") { @@ -787,7 +791,7 @@ std::optional GraphicPack2::GetPresetVariable(const std return it->second; } } - + for (const auto& preset : presets) { if (!preset->visible) @@ -797,7 +801,7 @@ std::optional GraphicPack2::GetPresetVariable(const std return it->second; } } - + const auto it = std::find_if(m_preset_vars.cbegin(), m_preset_vars.cend(), [&var_name](auto p) { return p.first == var_name; }); if (it != m_preset_vars.cend()) { @@ -843,7 +847,7 @@ void GraphicPack2::_iterateReplacedFiles(const fs::path& currentPath, bool isAOC virtualMountPath = fs::path(virtualMountBase) / virtualMountPath; } fscDeviceRedirect_add(virtualMountPath.generic_string(), it.file_size(), it.path().generic_string(), m_fs_priority); - } + } } } @@ -863,7 +867,7 @@ void GraphicPack2::LoadReplacedFiles() std::error_code ec; if (fs::exists(contentPath, ec)) { - // setup redirections + // setup redirections fscDeviceRedirect_map(); _iterateReplacedFiles(contentPath, false, "vol/content/"); } @@ -876,7 +880,7 @@ void GraphicPack2::LoadReplacedFiles() uint64 aocTitleId = CafeSystem::GetForegroundTitleId(); aocTitleId = aocTitleId & 0xFFFFFFFFULL; aocTitleId |= 0x0005000c00000000ULL; - // setup redirections + // setup redirections fscDeviceRedirect_map(); _iterateReplacedFiles(aocPath, true, nullptr); } @@ -1003,7 +1007,7 @@ bool GraphicPack2::Activate() // enable patch groups EnablePatches(); - + // load replaced files LoadReplacedFiles(); @@ -1049,7 +1053,7 @@ bool GraphicPack2::Deactivate() m_output_shader_source.clear(); m_upscaling_shader_source.clear(); m_downscaling_shader_source.clear(); - + if (HasCustomVSyncFrequency()) { 
m_vsync_frequency = -1; @@ -1060,7 +1064,7 @@ bool GraphicPack2::Deactivate() return true; } -const std::string* GraphicPack2::FindCustomShaderSource(uint64 shaderBaseHash, uint64 shaderAuxHash, GP_SHADER_TYPE type, bool isVulkanRenderer) +const std::string* GraphicPack2::FindCustomShaderSource(uint64 shaderBaseHash, uint64 shaderAuxHash, GP_SHADER_TYPE type, bool isVulkanRenderer, bool isMetalRenderer) { for (const auto& gp : GraphicPack2::GetActiveGraphicPacks()) { @@ -1070,9 +1074,12 @@ const std::string* GraphicPack2::FindCustomShaderSource(uint64 shaderBaseHash, u if (it == gp->m_custom_shaders.end()) continue; - if(isVulkanRenderer && (*it).isPreVulkanShader) + if (isVulkanRenderer && (*it).isPreVulkanShader) continue; + if (isMetalRenderer != (*it).isMetalShader) + continue; + return &it->source; } return nullptr; @@ -1081,7 +1088,7 @@ const std::string* GraphicPack2::FindCustomShaderSource(uint64 shaderBaseHash, u std::unordered_map> GraphicPack2::GetCategorizedPresets(std::vector& order) const { order.clear(); - + std::unordered_map> result; for(const auto& entry : m_presets) { @@ -1090,13 +1097,13 @@ std::unordered_map> GraphicPac if (it == order.cend()) order.emplace_back(entry->category); } - + return result; } bool GraphicPack2::HasShaders() const { - return !GetCustomShaders().empty() + return !GetCustomShaders().empty() || !m_output_shader_source.empty() || !m_upscaling_shader_source.empty() || !m_downscaling_shader_source.empty(); } @@ -1230,7 +1237,7 @@ void GraphicPack2::ApplyShaderPresets(std::string& shader_source) const } } -GraphicPack2::CustomShader GraphicPack2::LoadShader(const fs::path& path, uint64 shader_base_hash, uint64 shader_aux_hash, GP_SHADER_TYPE shader_type) const +GraphicPack2::CustomShader GraphicPack2::LoadShader(const fs::path& path, uint64 shader_base_hash, uint64 shader_aux_hash, GP_SHADER_TYPE shader_type, bool isMetalShader) const { CustomShader shader; @@ -1249,6 +1256,7 @@ GraphicPack2::CustomShader 
GraphicPack2::LoadShader(const fs::path& path, uint64 shader.shader_aux_hash = shader_aux_hash; shader.type = shader_type; shader.isPreVulkanShader = this->m_version <= 3; + shader.isMetalShader = isMetalShader; return shader; } diff --git a/src/Cafe/GraphicPack/GraphicPack2.h b/src/Cafe/GraphicPack/GraphicPack2.h index fc9603cd7a..b83ac66cfe 100644 --- a/src/Cafe/GraphicPack/GraphicPack2.h +++ b/src/Cafe/GraphicPack/GraphicPack2.h @@ -57,7 +57,7 @@ class GraphicPack2 sint32 lod_bias = -1; // in 1/64th steps sint32 relative_lod_bias = -1; // in 1/64th steps sint32 anistropic_value = -1; // 1< vars) : name(name), variables(std::move(vars)) {} Preset(std::string_view category, std::string_view name, std::unordered_map vars) : category(category), name(name), variables(std::move(vars)) {} - + Preset(std::string_view category, std::string_view name, std::string_view condition, std::unordered_map vars) : category(category), name(name), condition(condition), variables(std::move(vars)) {} }; @@ -136,19 +137,19 @@ class GraphicPack2 bool SetActivePreset(std::string_view category, std::string_view name, bool update_visibility = true); bool SetActivePreset(std::string_view name); void UpdatePresetVisibility(); - + void AddConstantsForCurrentPreset(ExpressionParser& ep); bool ResolvePresetConstant(const std::string& varname, double& value) const; [[nodiscard]] const std::vector& GetPresets() const { return m_presets; } [[nodiscard]] std::unordered_map> GetCategorizedPresets(std::vector& order) const; - + // shaders void LoadShaders(); bool HasShaders() const; const std::vector& GetCustomShaders() const { return m_custom_shaders; } - static const std::string* FindCustomShaderSource(uint64 shaderBaseHash, uint64 shaderAuxHash, GP_SHADER_TYPE type, bool isVulkanRenderer); + static const std::string* FindCustomShaderSource(uint64 shaderBaseHash, uint64 shaderAuxHash, GP_SHADER_TYPE type, bool isVulkanRenderer, bool isMetalRenderer); const std::string& GetOutputShaderSource() const 
{ return m_output_shader_source; } const std::string& GetDownscalingShaderSource() const { return m_downscaling_shader_source; } @@ -194,7 +195,7 @@ class GraphicPack2 { for (auto& var : preset->variables) parser.AddConstant(var.first, (TType)var.second.second); - } + } } for(const auto& preset : active_presets) { @@ -202,7 +203,7 @@ class GraphicPack2 { for (auto& var : preset->variables) parser.TryAddConstant(var.first, (TType)var.second.second); - } + } } for (auto& var : m_preset_vars) @@ -228,7 +229,7 @@ class GraphicPack2 bool m_activated = false; // set if the graphic pack is currently used by the running game std::vector m_title_ids; bool m_patchedFilesLoaded = false; // set to true once patched files are loaded - + sint32 m_vsync_frequency = -1; sint32 m_fs_priority = 100; @@ -241,12 +242,12 @@ class GraphicPack2 std::vector m_presets; // default preset vars std::unordered_map m_preset_vars; - + std::vector m_custom_shaders; std::vector m_texture_rules; std::string m_output_shader_source, m_upscaling_shader_source, m_downscaling_shader_source; std::unique_ptr m_output_shader, m_upscaling_shader, m_downscaling_shader, m_output_shader_ud, m_upscaling_shader_ud, m_downscaling_shader_ud; - + template bool ParseRule(const ExpressionParser& parser, IniParser& iniParser, const char* option_name, T* value_out) const; @@ -257,7 +258,7 @@ class GraphicPack2 std::vector ParseTitleIds(IniParser& rules, const char* option_name) const; - CustomShader LoadShader(const fs::path& path, uint64 shader_base_hash, uint64 shader_aux_hash, GP_SHADER_TYPE shader_type) const; + CustomShader LoadShader(const fs::path& path, uint64 shader_base_hash, uint64 shader_aux_hash, GP_SHADER_TYPE shader_type, bool isMetalShader) const; void ApplyShaderPresets(std::string& shader_source) const; void LoadReplacedFiles(); void _iterateReplacedFiles(const fs::path& currentPath, bool isAOC, const char* virtualMountBase); @@ -330,6 +331,6 @@ std::vector GraphicPack2::ParseList(const 
ExpressionParser& parser, IniParser } catch (const std::invalid_argument&) {} } - + return result; -} \ No newline at end of file +} diff --git a/src/Cafe/HW/Latte/Core/FetchShader.cpp b/src/Cafe/HW/Latte/Core/FetchShader.cpp index 6c9893f922..96b78d8395 100644 --- a/src/Cafe/HW/Latte/Core/FetchShader.cpp +++ b/src/Cafe/HW/Latte/Core/FetchShader.cpp @@ -8,8 +8,12 @@ #include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInstructions.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/ISA/LatteInstructions.h" +#include "HW/Latte/Renderer/Renderer.h" #include "util/containers/LookupTableL3.h" #include "util/helpers/fspinlock.h" +#if ENABLE_METAL +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#endif #include /* SHA1_DIGEST_LENGTH */ #include /* EVP_Digest */ @@ -104,11 +108,30 @@ void LatteShader_calculateFSKey(LatteFetchShader* fetchShader) key = std::rotl(key, 8); key += (uint64)attrib->semanticId; key = std::rotl(key, 8); - key += (uint64)(attrib->offset & 3); - key = std::rotl(key, 2); + if (g_renderer->GetType() == RendererAPI::Metal) + { + key += (uint64)attrib->offset; + key = std::rotl(key, 7); + } + else + { + key += (uint64)(attrib->offset & 3); + key = std::rotl(key, 2); + } } } // todo - also hash invalid buffer groups? 
+ + if (g_renderer->GetType() == RendererAPI::Metal) + { + for (sint32 g = 0; g < fetchShader->bufferGroups.size(); g++) + { + LatteParsedFetchShaderBufferGroup_t& group = fetchShader->bufferGroups[g]; + key += (uint64)group.attributeBufferIndex; + key = std::rotl(key, 5); + } + } + fetchShader->key = key; } @@ -146,6 +169,29 @@ void LatteFetchShader::CalculateFetchShaderVkHash() this->vkPipelineHashFragment = h; } +void LatteFetchShader::CheckIfVerticesNeedManualFetchMtl(uint32* contextRegister) +{ +#if ENABLE_METAL + for (sint32 g = 0; g < bufferGroups.size(); g++) + { + LatteParsedFetchShaderBufferGroup_t& group = bufferGroups[g]; + uint32 bufferIndex = group.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + uint32 bufferStride = (contextRegister[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + if (bufferStride % 4 != 0) + mtlFetchVertexManually = true; + + for (sint32 f = 0; f < group.attribCount; f++) + { + auto& attr = group.attrib[f]; + if (attr.offset + GetMtlVertexFormatSize(attr.format) > bufferStride) + mtlFetchVertexManually = true; + } + } +#endif +} + void _fetchShaderDecompiler_parseInstruction_VTX_SEMANTIC(LatteFetchShader* parsedFetchShader, uint32* contextRegister, const LatteClauseInstruction_VTX* instr) { uint32 semanticId = instr->getFieldSEM_SEMANTIC_ID(); // location (attribute index inside shader) @@ -161,7 +207,7 @@ void _fetchShaderDecompiler_parseInstruction_VTX_SEMANTIC(LatteFetchShader* pars auto nfa = instr->getField_NUM_FORMAT_ALL(); bool isSigned = instr->getField_FORMAT_COMP_ALL() == LatteClauseInstruction_VTX::FORMAT_COMP::COMP_SIGNED; auto endianSwap = instr->getField_ENDIAN_SWAP(); - + // get buffer cemu_assert_debug(bufferId >= 0xA0 && bufferId < 0xB0); uint32 bufferIndex = (bufferId - 0xA0); @@ -316,7 +362,7 @@ LatteFetchShader* LatteShaderRecompiler_createFetchShader(LatteFetchShader::Cach // {0x00000002, 0x01800c00, 0x00000000, 0x8a000000, 0x2c00a001, 0x2c151000, 
0x000a0000, ...} // size 0x50 // {0x00000002, 0x01801000, 0x00000000, 0x8a000000, 0x1c00a001, 0x280d1000, 0x00090000, ...} // size 0x60 // {0x00000002, 0x01801c00, 0x00000000, 0x8a000000, 0x1c00a001, 0x280d1000, 0x00090000, ...} // size 0x90 - + // our new implementation: // {0x00000002, 0x01800400, 0x00000000, 0x8a000000, 0x0000a001, 0x2c151000, 0x00020000, ...} @@ -328,6 +374,7 @@ LatteFetchShader* LatteShaderRecompiler_createFetchShader(LatteFetchShader::Cach // these only make sense when vertex shader does not call FS? LatteShader_calculateFSKey(newFetchShader); newFetchShader->CalculateFetchShaderVkHash(); + newFetchShader->CheckIfVerticesNeedManualFetchMtl(contextRegister); return newFetchShader; } @@ -387,6 +434,7 @@ LatteFetchShader* LatteShaderRecompiler_createFetchShader(LatteFetchShader::Cach } LatteShader_calculateFSKey(newFetchShader); newFetchShader->CalculateFetchShaderVkHash(); + newFetchShader->CheckIfVerticesNeedManualFetchMtl(contextRegister); // register in cache // its possible that during multi-threaded shader cache loading, two identical (same hash) fetch shaders get created simultaneously @@ -411,7 +459,7 @@ LatteFetchShader::~LatteFetchShader() UnregisterInCache(); } -struct FetchShaderLookupInfo +struct FetchShaderLookupInfo { LatteFetchShader* fetchShader; uint32 programSize; diff --git a/src/Cafe/HW/Latte/Core/FetchShader.h b/src/Cafe/HW/Latte/Core/FetchShader.h index ac57714d01..1e580f4309 100644 --- a/src/Cafe/HW/Latte/Core/FetchShader.h +++ b/src/Cafe/HW/Latte/Core/FetchShader.h @@ -46,13 +46,17 @@ struct LatteFetchShader // Vulkan uint64 vkPipelineHashFragment{}; // hash of all fetch shader state that influences the Vulkan graphics pipeline + // Metal + bool mtlFetchVertexManually{}; + // cache info CacheHash m_cacheHash{}; bool m_isRegistered{}; // if true, fetch shader is referenced by cache (RegisterInCache() succeeded) - void CalculateFetchShaderVkHash(); + void CheckIfVerticesNeedManualFetchMtl(uint32* contextRegister); + uint64 
getVkPipelineHashFragment() const { return vkPipelineHashFragment; }; static bool isValidBufferIndex(const uint32 index) { return index < 0x10; }; @@ -69,4 +73,4 @@ struct LatteFetchShader static std::unordered_map s_fetchShaderByHash; }; -LatteFetchShader* LatteShaderRecompiler_createFetchShader(LatteFetchShader::CacheHash fsHash, uint32* contextRegister, uint32* fsProgramCode, uint32 fsProgramSize); \ No newline at end of file +LatteFetchShader* LatteShaderRecompiler_createFetchShader(LatteFetchShader::CacheHash fsHash, uint32* contextRegister, uint32* fsProgramCode, uint32 fsProgramSize); diff --git a/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp b/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp index 0395771e0f..6c36ddd320 100644 --- a/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp +++ b/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp @@ -441,7 +441,7 @@ class BufferCacheNode if (uploadBegin >= uploadEnd) return; // reserve range not within invalidation or range is zero sized - + if (uploadBegin == m_invalidationRangeBegin) { m_invalidationRangeBegin = uploadEnd; @@ -536,7 +536,7 @@ class BufferCacheNode MPTR m_invalidationRangeBegin; MPTR m_invalidationRangeEnd; - BufferCacheNode(MPTR rangeBegin, MPTR rangeEnd): m_rangeBegin(rangeBegin), m_rangeEnd(rangeEnd) + BufferCacheNode(MPTR rangeBegin, MPTR rangeEnd): m_rangeBegin(rangeBegin), m_rangeEnd(rangeEnd) { flagInUse(); cemu_assert_debug(rangeBegin < rangeEnd); @@ -740,7 +740,7 @@ class BufferCacheNode cemu_assert_debug(rangeEnd <= pageRangeEnd); cemu_assert_debug((rangeBegin & 0xF) == 0); cemu_assert_debug((rangeEnd & 0xF) == 0); - + auto pageInfo = m_pageInfo.data() + pageIndex; pageInfo->hasStreamoutData = true; @@ -805,7 +805,7 @@ class BufferCacheNode s_allCacheNodes.clear(); g_deallocateQueue.clear(); } - + static void ProcessDeallocations() { for(auto& itr : g_deallocateQueue) diff --git a/src/Cafe/HW/Latte/Core/LatteBufferData.cpp b/src/Cafe/HW/Latte/Core/LatteBufferData.cpp index 85d4cdf7a5..7620e6a778 100644 --- 
a/src/Cafe/HW/Latte/Core/LatteBufferData.cpp +++ b/src/Cafe/HW/Latte/Core/LatteBufferData.cpp @@ -62,7 +62,7 @@ void rectGenerate4thVertex(uint32be* output, uint32be* input0, uint32be* input1, // order of rectangle vertices is // v0 v1 - // v2 v3 + // v2 v3 for (sint32 f = 0; f < vectorLen*4; f++) output[f] = _swapEndianU32(output[f]); @@ -199,11 +199,14 @@ bool LatteBufferCache_Sync(uint32 minIndex, uint32 maxIndex, uint32 baseInstance #if BOOST_OS_MACOS if(bufferStride % 4 != 0) { - if (VulkanRenderer* vkRenderer = VulkanRenderer::GetInstance()) + if (g_renderer->GetType() == RendererAPI::Vulkan) { - auto fixedBuffer = vkRenderer->buffer_genStrideWorkaroundVertexBuffer(bufferAddress, fixedBufferSize, bufferStride); - vkRenderer->buffer_bindVertexStrideWorkaroundBuffer(fixedBuffer.first, fixedBuffer.second, bufferIndex, fixedBufferSize); - continue; + if (VulkanRenderer* vkRenderer = VulkanRenderer::GetInstance()) + { + auto fixedBuffer = vkRenderer->buffer_genStrideWorkaroundVertexBuffer(bufferAddress, fixedBufferSize, bufferStride); + vkRenderer->buffer_bindVertexStrideWorkaroundBuffer(fixedBuffer.first, fixedBuffer.second, bufferIndex, fixedBufferSize); + continue; + } } } #endif @@ -222,4 +225,4 @@ bool LatteBufferCache_Sync(uint32 minIndex, uint32 maxIndex, uint32 baseInstance if (pixelShader) LatteBufferCache_syncGPUUniformBuffers(pixelShader, mmSQ_PS_UNIFORM_BLOCK_START, LatteConst::ShaderType::Pixel); return true; -} \ No newline at end of file +} diff --git a/src/Cafe/HW/Latte/Core/LatteIndices.cpp b/src/Cafe/HW/Latte/Core/LatteIndices.cpp index 2bbb617d7c..260c6cd6b0 100644 --- a/src/Cafe/HW/Latte/Core/LatteIndices.cpp +++ b/src/Cafe/HW/Latte/Core/LatteIndices.cpp @@ -10,7 +10,7 @@ #include #endif -struct +struct { struct CacheEntry { @@ -115,6 +115,21 @@ uint32 LatteIndices_calculateIndexOutputSize(LattePrimitiveMode primitiveMode, L cemu_assert_suspicious(); return 0; } + else if (primitiveMode == LattePrimitiveMode::TRIANGLE_FAN && 
g_renderer->GetType() == RendererAPI::Metal) + { + if (indexType == LatteIndexType::AUTO) + { + if (count <= 0xFFFF) + return count * sizeof(uint16); + return count * sizeof(uint32); + } + if (indexType == LatteIndexType::U16_BE || indexType == LatteIndexType::U16_LE) + return count * sizeof(uint16); + if (indexType == LatteIndexType::U32_BE || indexType == LatteIndexType::U32_LE) + return count * sizeof(uint32); + cemu_assert_suspicious(); + return 0; + } else if(indexType == LatteIndexType::AUTO) return 0; else if (indexType == LatteIndexType::U16_BE || indexType == LatteIndexType::U16_LE) @@ -308,6 +323,44 @@ void LatteIndices_generateAutoLineLoopIndices(void* indexDataOutput, uint32 coun indexMax = std::max(count, 1u) - 1; } +template +void LatteIndices_unpackTriangleFanAndConvert(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax) +{ + const betype* src = (betype*)indexDataInput; + T* dst = (T*)indexDataOutput; + // TODO: check this + for (sint32 i = 0; i < count; i++) + { + uint32 i0; + if (i % 2 == 0) + i0 = i / 2; + else + i0 = count - 1 - i / 2; + T idx = src[i0]; + indexMin = std::min(indexMin, (uint32)idx); + indexMax = std::max(indexMax, (uint32)idx); + dst[i] = idx; + } +} + +template +void LatteIndices_generateAutoTriangleFanIndices(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax) +{ + const betype* src = (betype*)indexDataInput; + T* dst = (T*)indexDataOutput; + for (sint32 i = 0; i < count; i++) + { + T idx = i; + if (idx % 2 == 0) + idx = idx / 2; + else + idx = count - 1 - idx / 2; + dst[i] = idx; + } + indexMin = 0; + indexMax = std::max(count, 1u) - 1; +} + #if defined(ARCH_X86_64) ATTRIBUTE_AVX2 void LatteIndices_fastConvertU16_AVX2(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax) @@ -319,7 +372,7 @@ void LatteIndices_fastConvertU16_AVX2(const void* indexDataInput, void* indexDat sint32 
countRemaining = count & 15; if (count16) { - __m256i mMin = _mm256_set_epi16((sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, + __m256i mMin = _mm256_set_epi16((sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF); __m256i mMax = _mm256_set_epi16(0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000); __m256i mShuffle16Swap = _mm256_set_epi8(30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); @@ -794,6 +847,29 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 cemu_assert_debug(false); outputCount = count + 1; } + else if (primitiveMode == LattePrimitiveMode::TRIANGLE_FAN && g_renderer->GetType() == RendererAPI::Metal) + { + if (indexType == LatteIndexType::AUTO) + { + if (count <= 0xFFFF) + { + LatteIndices_generateAutoTriangleFanIndices(indexData, indexOutputPtr, count, indexMin, indexMax); + renderIndexType = Renderer::INDEX_TYPE::U16; + } + else + { + LatteIndices_generateAutoTriangleFanIndices(indexData, indexOutputPtr, count, indexMin, indexMax); + renderIndexType = Renderer::INDEX_TYPE::U32; + } + } + else if (indexType == LatteIndexType::U16_BE) + LatteIndices_unpackTriangleFanAndConvert(indexData, indexOutputPtr, count, indexMin, indexMax); + else if (indexType == LatteIndexType::U32_BE) + LatteIndices_unpackTriangleFanAndConvert(indexData, indexOutputPtr, count, indexMin, indexMax); + else + cemu_assert_debug(false); + outputCount = count; + } else { if (indexType == LatteIndexType::U16_BE) diff --git a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp index 
be9917e12a..1bdbc4bba5 100644 --- a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp +++ b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp @@ -449,14 +449,6 @@ bool LatteMRT::UpdateCurrentFBO() uint8 colorBufferMask = GetActiveColorBufferMask(pixelShader, LatteGPUState.contextNew); bool depthBufferMask = GetActiveDepthBufferMask(LatteGPUState.contextNew); - // if depth test is not used then detach the depth buffer - bool depthEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_Z_ENABLE(); - bool stencilTestEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ENABLE(); - bool backStencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_BACK_STENCIL_ENABLE(); - - if (!depthEnable && !stencilTestEnable && !backStencilEnable) - depthBufferMask = false; - bool hasResizedTexture = false; // set to true if any of the color buffers or the depth buffer reference a resized texture (via graphic pack texture rules) sLatteRenderTargetState.renderTargetIsResized = false; // real size @@ -723,8 +715,8 @@ void LatteRenderTarget_applyTextureColorClear(LatteTexture* texture, uint32 slic void LatteRenderTarget_applyTextureDepthClear(LatteTexture* texture, uint32 sliceIndex, uint32 mipIndex, bool hasDepthClear, bool hasStencilClear, float depthValue, uint8 stencilValue, uint64 eventCounter) { - if(texture->isDepth) - { + if(texture->isDepth) + { g_renderer->texture_clearDepthSlice(texture, sliceIndex, mipIndex, hasDepthClear, hasStencilClear, depthValue, stencilValue); } else @@ -883,7 +875,7 @@ void LatteRenderTarget_copyToBackbuffer(LatteTextureView* textureView, bool isPa textureView->baseTexture->GetEffectiveSize(effectiveWidth, effectiveHeight, 0); _currentOutputImageWidth = effectiveWidth; _currentOutputImageHeight = effectiveHeight; - + sint32 imageX, imageY; sint32 imageWidth, imageHeight; sint32 fullscreenWidth, fullscreenHeight; @@ -1037,7 +1029,7 @@ void LatteRenderTarget_updateViewport() float vpX = LatteGPUState.contextNew.PA_CL_VPORT_XOFFSET.get_OFFSET() - 
LatteGPUState.contextNew.PA_CL_VPORT_XSCALE.get_SCALE(); float vpHeight = LatteGPUState.contextNew.PA_CL_VPORT_YSCALE.get_SCALE() / -0.5f; float vpY = LatteGPUState.contextNew.PA_CL_VPORT_YOFFSET.get_OFFSET() + LatteGPUState.contextNew.PA_CL_VPORT_YSCALE.get_SCALE(); - + bool halfZ = LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_CLIP_SPACE_DEF(); // calculate near/far diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index d9f0a5ddfc..087643e48e 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -9,10 +9,15 @@ #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h" #include "Cafe/OS/libs/gx2/GX2.h" // todo - remove dependency #include "Cafe/GraphicPack/GraphicPack2.h" +#include "HW/Latte/Core/Latte.h" +#include "HW/Latte/Renderer/Renderer.h" #include "util/helpers/StringParser.h" #include "config/ActiveSettings.h" #include "Cafe/GameProfile/GameProfile.h" #include "util/containers/flat_hash_map.hpp" +#if ENABLE_METAL +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#endif #include // experimental new decompiler (WIP) @@ -77,7 +82,7 @@ inline ska::flat_hash_map& LatteSHRC_GetCacheByT if (shaderType == LatteConst::ShaderType::Vertex) return sVertexShaders; else if (shaderType == LatteConst::ShaderType::Geometry) - return sGeometryShaders; + return sGeometryShaders; cemu_assert_debug(shaderType == LatteConst::ShaderType::Pixel); return sPixelShaders; } @@ -205,11 +210,9 @@ void LatteShader_free(LatteDecompilerShader* shader) delete shader; } -// both vertex and geometry/pixel shader depend on PS inputs -// we prepare the PS import info in advance -void LatteShader_UpdatePSInputs(uint32* contextRegisters) +void LatteShader_CreatePSInputTable(LatteShaderPSInputTable* psInputTable, uint32* contextRegisters) { - // PS control + // PS control uint32 psControl0 = contextRegisters[mmSPI_PS_IN_CONTROL_0]; uint32 spi0_positionEnable = (psControl0 >> 8) & 1; uint32 
spi0_positionCentroid = (psControl0 >> 9) & 1; @@ -238,12 +241,12 @@ void LatteShader_UpdatePSInputs(uint32* contextRegisters) { key += std::rotr(spi0_paramGen, 7); key += std::rotr(spi0_paramGenAddr, 3); - _activePSImportTable.paramGen = spi0_paramGen; - _activePSImportTable.paramGenGPR = spi0_paramGenAddr; + psInputTable->paramGen = spi0_paramGen; + psInputTable->paramGenGPR = spi0_paramGenAddr; } else { - _activePSImportTable.paramGen = 0; + psInputTable->paramGen = 0; } // semantic imports from vertex shader @@ -277,9 +280,9 @@ void LatteShader_UpdatePSInputs(uint32* contextRegisters) key = std::rotl(key, 7); if (spi0_positionEnable && f == spi0_positionAddr) { - _activePSImportTable.import[f].semanticId = LATTE_ANALYZER_IMPORT_INDEX_SPIPOSITION; - _activePSImportTable.import[f].isFlat = false; - _activePSImportTable.import[f].isNoPerspective = false; + psInputTable->import[f].semanticId = LATTE_ANALYZER_IMPORT_INDEX_SPIPOSITION; + psInputTable->import[f].isFlat = false; + psInputTable->import[f].isNoPerspective = false; key += (uint64)0x33; } else @@ -292,13 +295,20 @@ void LatteShader_UpdatePSInputs(uint32* contextRegisters) semanticMask[psSemanticId >> 3] |= (1 << (psSemanticId & 7)); #endif - _activePSImportTable.import[f].semanticId = psSemanticId; - _activePSImportTable.import[f].isFlat = (psInputControl&(1 << 10)) != 0; - _activePSImportTable.import[f].isNoPerspective = (psInputControl&(1 << 12)) != 0; + psInputTable->import[f].semanticId = psSemanticId; + psInputTable->import[f].isFlat = (psInputControl&(1 << 10)) != 0; + psInputTable->import[f].isNoPerspective = (psInputControl&(1 << 12)) != 0; } } - _activePSImportTable.key = key; - _activePSImportTable.count = numPSInputs; + psInputTable->key = key; + psInputTable->count = numPSInputs; +} + +// both vertex and geometry/pixel shader depend on PS inputs +// we prepare the PS import info in advance +void LatteShader_UpdatePSInputs(uint32* contextRegisters) +{ + 
LatteShader_CreatePSInputTable(&_activePSImportTable, contextRegisters); } void LatteShader_CreateRendererShader(LatteDecompilerShader* shader, bool compileAsync) @@ -320,7 +330,7 @@ void LatteShader_CreateRendererShader(LatteDecompilerShader* shader, bool compil { shaderType = RendererShader::ShaderType::kGeometry; gpShaderType = GraphicPack2::GP_SHADER_TYPE::GEOMETRY; - } + } else if (shader->shaderType == LatteConst::ShaderType::Pixel) { shaderType = RendererShader::ShaderType::kFragment; @@ -330,7 +340,7 @@ void LatteShader_CreateRendererShader(LatteDecompilerShader* shader, bool compil // check if a custom shader is present std::string shaderSrc; - const std::string* customShaderSrc = GraphicPack2::FindCustomShaderSource(shader->baseHash, shader->auxHash, gpShaderType, g_renderer->GetType() == RendererAPI::Vulkan); + const std::string* customShaderSrc = GraphicPack2::FindCustomShaderSource(shader->baseHash, shader->auxHash, gpShaderType, g_renderer->GetType() == RendererAPI::Vulkan, g_renderer->GetType() == RendererAPI::Metal); if (customShaderSrc) { shaderSrc.assign(*customShaderSrc); @@ -443,7 +453,7 @@ void LatteShader_DumpShader(uint64 baseHash, uint64 auxHash, LatteDecompilerShad { if (!ActiveSettings::DumpShadersEnabled()) return; - + const char* suffix = ""; if (shader->shaderType == LatteConst::ShaderType::Vertex) suffix = "vs"; @@ -500,6 +510,7 @@ void LatteSHRC_UpdateVSBaseHash(uint8* vertexShaderPtr, uint32 vertexShaderSize, vsHash += tmp; auto primitiveType = LatteGPUState.contextNew.VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE(); + // TODO: include always in the hash in case of geometry shader or rect shader on Metal if (primitiveType == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS) { vsHash += 13ULL; @@ -514,6 +525,37 @@ void LatteSHRC_UpdateVSBaseHash(uint8* vertexShaderPtr, uint32 vertexShaderSize, if (LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_CLIP_SPACE_DEF()) vsHash += 0x1537; +#if ENABLE_METAL + if (g_renderer->GetType() == 
RendererAPI::Metal) + { + bool isRectVertexShader = (primitiveType == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); + + if ((usesGeometryShader || isRectVertexShader) || _activeFetchShader->mtlFetchVertexManually) + { + for (sint32 g = 0; g < _activeFetchShader->bufferGroups.size(); g++) + { + LatteParsedFetchShaderBufferGroup_t& group = _activeFetchShader->bufferGroups[g]; + uint32 bufferIndex = group.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + uint32 bufferStride = (LatteGPUState.contextRegister[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + vsHash += (uint64)bufferStride; + vsHash = std::rotl(vsHash, 7); + } + } + + if (!(usesGeometryShader || isRectVertexShader)) + { + if (LatteGPUState.contextNew.IsRasterizationEnabled()) + vsHash += 51ULL; + + // Vertex fetch + if (_activeFetchShader->mtlFetchVertexManually) + vsHash += 349ULL; + } + } +#endif + _shaderBaseHash_vs = vsHash; } @@ -539,6 +581,7 @@ void LatteSHRC_UpdatePSBaseHash(uint8* pixelShaderPtr, uint32 pixelShaderSize, b _calculateShaderProgramHash(psProgramCode, pixelShaderSize, &hashCachePS, &psHash1, &psHash2); // get vertex shader uint64 psHash = psHash1 + psHash2 + _activePSImportTable.key + (usesGeometryShader ? 
hashCacheGS.prevHash1 : 0ULL); + _shaderBaseHash_ps = psHash; } @@ -572,6 +615,7 @@ uint64 LatteSHRC_CalcVSAuxHash(LatteDecompilerShader* vertexShader, uint32* cont auxHashTex += 0x333; } } + return auxHash + auxHashTex; } @@ -605,6 +649,37 @@ uint64 LatteSHRC_CalcPSAuxHash(LatteDecompilerShader* pixelShader, uint32* conte auxHash = (auxHash << 3) | (auxHash >> 61); auxHash += (uint64)dim; } + +#if ENABLE_METAL + if (g_renderer->GetType() == RendererAPI::Metal) + { + // Textures as render targets + for (uint32 i = 0; i < pixelShader->textureUnitListCount; i++) + { + uint8 t = pixelShader->textureUnitList[i]; + auxHash = std::rotl(auxHash, 11); + auxHash += (uint64)pixelShader->textureRenderTargetIndex[t]; + } + + // Color buffers + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + auto format = LatteMRT::GetColorBufferFormat(i, LatteGPUState.contextNew); + uint8 dataType = (uint8)GetMtlPixelFormatInfo(format, false).dataType; + auxHash = std::rotl(auxHash, 7); + auxHash += (uint64)dataType; + } + + // Depth buffer + bool hasDepthBuffer = LatteMRT::GetActiveDepthBufferMask(LatteGPUState.contextNew); + if (hasDepthBuffer) + { + auxHash = std::rotl(auxHash, 5); + auxHash += 13u; + } + } +#endif + return auxHash; } @@ -613,10 +688,13 @@ LatteDecompilerShader* LatteShader_CreateShaderFromDecompilerOutput(LatteDecompi LatteDecompilerShader* shader = decompilerOutput.shader; shader->baseHash = baseHash; // copy resource mapping - if(g_renderer->GetType() == RendererAPI::Vulkan) + // HACK + if (g_renderer->GetType() == RendererAPI::Vulkan) shader->resourceMapping = decompilerOutput.resourceMappingVK; - else + else if (g_renderer->GetType() == RendererAPI::OpenGL) shader->resourceMapping = decompilerOutput.resourceMappingGL; + else + shader->resourceMapping = decompilerOutput.resourceMappingMTL; // copy texture info shader->textureUnitMask2 = decompilerOutput.textureUnitMask; // copy streamout info @@ -624,7 +702,8 @@ LatteDecompilerShader* 
LatteShader_CreateShaderFromDecompilerOutput(LatteDecompi shader->hasStreamoutBufferWrite = decompilerOutput.streamoutBufferWriteMask.any(); // copy uniform offsets // for OpenGL these are retrieved in _prepareSeparableUniforms() - if (g_renderer->GetType() == RendererAPI::Vulkan) + // HACK + if (g_renderer->GetType() == RendererAPI::Vulkan || g_renderer->GetType() == RendererAPI::Metal) { shader->uniform.loc_remapped = decompilerOutput.uniformOffsetsVK.offset_remapped; shader->uniform.loc_uniformRegister = decompilerOutput.uniformOffsetsVK.offset_uniformRegister; @@ -684,9 +763,9 @@ void LatteShader_GetDecompilerOptions(LatteDecompilerOptions& options, LatteCons { options.usesGeometryShader = geometryShaderEnabled; options.spirvInstrinsics.hasRoundingModeRTEFloat32 = false; + options.useTFViaSSBO = g_renderer->UseTFViaSSBO(); if (g_renderer->GetType() == RendererAPI::Vulkan) { - options.useTFViaSSBO = VulkanRenderer::GetInstance()->UseTFViaSSBO(); options.spirvInstrinsics.hasRoundingModeRTEFloat32 = VulkanRenderer::GetInstance()->HasSPRIVRoundingModeRTE32(); } options.strictMul = g_current_game_profile->GetAccurateShaderMul() != AccurateShaderMulOption::False; @@ -1009,4 +1088,4 @@ void LatteSHRC_UnloadAll() while(!sPixelShaders.empty()) LatteShader_free(sPixelShaders.begin()->second); cemu_assert_debug(sPixelShaders.empty()); -} \ No newline at end of file +} diff --git a/src/Cafe/HW/Latte/Core/LatteShader.h b/src/Cafe/HW/Latte/Core/LatteShader.h index f8dc6d1a3b..85d53b01b6 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.h +++ b/src/Cafe/HW/Latte/Core/LatteShader.h @@ -84,6 +84,7 @@ struct LatteShaderPSInputTable } }; +void LatteShader_CreatePSInputTable(LatteShaderPSInputTable* psInputTable, uint32* contextRegisters); void LatteShader_UpdatePSInputs(uint32* contextRegisters); LatteShaderPSInputTable* LatteSHRC_GetPSInputTable(); @@ -126,4 +127,4 @@ void LatteShaderCache_writeSeparableGeometryShader(uint64 shaderBaseHash, uint64 void 
LatteShaderCache_writeSeparablePixelShader(uint64 shaderBaseHash, uint64 shaderAuxHash, uint8* pixelShader, uint32 pixelShaderSize, uint32* contextRegisters, bool usesGeometryShader); // todo - refactor this -sint32 LatteDecompiler_getTextureSamplerBaseIndex(LatteConst::ShaderType shaderType); \ No newline at end of file +sint32 LatteDecompiler_getTextureSamplerBaseIndex(LatteConst::ShaderType shaderType); diff --git a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp index 737e92012a..14a1f9b0b3 100644 --- a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp @@ -11,6 +11,10 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/OpenGL/RendererShaderGL.h" #include "Cafe/HW/Latte/Renderer/Vulkan/RendererShaderVk.h" +#if ENABLE_METAL +#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" +#endif #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanPipelineStableCache.h" #include @@ -43,7 +47,7 @@ struct sint32 pixelShaderCount; }shaderCacheScreenStats; -struct +struct { ImTextureID textureTVId; ImTextureID textureDRCId; @@ -64,7 +68,7 @@ FileCache* s_shaderCacheGeneric = nullptr; // contains hardware and version inde #define SHADER_CACHE_TYPE_PIXEL (2) bool LatteShaderCache_readSeparableShader(uint8* shaderInfoData, sint32 shaderInfoSize); -void LatteShaderCache_LoadVulkanPipelineCache(uint64 cacheTitleId); +void LatteShaderCache_LoadPipelineCache(uint64 cacheTitleId); bool LatteShaderCache_updatePipelineLoadingProgress(); void LatteShaderCache_ShowProgress(const std::function & loadUpdateFunc, bool isPipelines); @@ -269,10 +273,14 @@ static BootSoundPlayer g_bootSndPlayer; void LatteShaderCache_finish() { - if (g_renderer->GetType() == RendererAPI::Vulkan) + if (g_renderer->GetType() == RendererAPI::Vulkan) RendererShaderVk::ShaderCacheLoading_end(); else if (g_renderer->GetType() == 
RendererAPI::OpenGL) RendererShaderGL::ShaderCacheLoading_end(); +#if ENABLE_METAL + else if (g_renderer->GetType() == RendererAPI::Metal) + RendererShaderMtl::ShaderCacheLoading_end(); +#endif } uint32 LatteShaderCache_getShaderCacheExtraVersion(uint64 titleId) @@ -355,8 +363,17 @@ void LatteShaderCache_Load() RendererShaderVk::ShaderCacheLoading_begin(cacheTitleId); else if (g_renderer->GetType() == RendererAPI::OpenGL) RendererShaderGL::ShaderCacheLoading_begin(cacheTitleId); +#if ENABLE_METAL + else if (g_renderer->GetType() == RendererAPI::Metal) + RendererShaderMtl::ShaderCacheLoading_begin(cacheTitleId); +#endif + // get cache file name - const auto pathGeneric = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_shaders.bin", cacheTitleId); + fs::path pathGeneric; + if (g_renderer->GetType() == RendererAPI::Metal) + pathGeneric = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_mtlshaders.bin", cacheTitleId); + else + pathGeneric = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_shaders.bin", cacheTitleId); // calculate extraVersion for transferable and precompiled shader cache uint32 transferableExtraVersion = SHADER_CACHE_GENERIC_EXTRA_VERSION; @@ -440,7 +457,7 @@ void LatteShaderCache_Load() }; LatteShaderCache_ShowProgress(LoadShadersUpdate, false); - + LatteShaderCache_updateCompileQueue(0); // write load time and RAM usage to log file (in dev build) #if BOOST_OS_WINDOWS @@ -453,9 +470,9 @@ void LatteShaderCache_Load() cemuLog_log(LogType::Force, "Shader cache loaded with {} shaders. Commited mem {}MB. 
Took {}ms", numLoadedShaders, (sint32)(memCommited/1024/1024), timeLoad); #endif LatteShaderCache_finish(); - // if Vulkan then also load pipeline cache - if (g_renderer->GetType() == RendererAPI::Vulkan) - LatteShaderCache_LoadVulkanPipelineCache(cacheTitleId); + // if Vulkan or Metal then also load pipeline cache + if (g_renderer->GetType() == RendererAPI::Vulkan || g_renderer->GetType() == RendererAPI::Metal) + LatteShaderCache_LoadPipelineCache(cacheTitleId); g_renderer->BeginFrame(true); @@ -488,7 +505,7 @@ void LatteShaderCache_ShowProgress(const std::function & loadUpdateF { const auto kPopupFlags = ImGuiWindowFlags_NoMove | ImGuiWindowFlags_NoDecoration | ImGuiWindowFlags_NoSavedSettings | ImGuiWindowFlags_NoFocusOnAppearing | ImGuiWindowFlags_NoNav | ImGuiWindowFlags_AlwaysAutoResize; const auto textColor = 0xFF888888; - + auto lastFrameUpdate = tick_cached(); while (true) @@ -541,7 +558,7 @@ void LatteShaderCache_ShowProgress(const std::function & loadUpdateF std::string text; if (isPipelines) { - text = "Loading cached Vulkan pipelines..."; + text = "Loading cached pipelines..."; } else { @@ -615,19 +632,35 @@ void LatteShaderCache_ShowProgress(const std::function & loadUpdateF } } -void LatteShaderCache_LoadVulkanPipelineCache(uint64 cacheTitleId) +void LatteShaderCache_LoadPipelineCache(uint64 cacheTitleId) { - auto& pipelineCache = VulkanPipelineStableCache::GetInstance(); - g_shaderCacheLoaderState.pipelineFileCount = pipelineCache.BeginLoading(cacheTitleId); + if (g_renderer->GetType() == RendererAPI::Vulkan) + g_shaderCacheLoaderState.pipelineFileCount = VulkanPipelineStableCache::GetInstance().BeginLoading(cacheTitleId); +#if ENABLE_METAL + else if (g_renderer->GetType() == RendererAPI::Metal) + g_shaderCacheLoaderState.pipelineFileCount = MetalPipelineCache::GetInstance().BeginLoading(cacheTitleId); +#endif g_shaderCacheLoaderState.loadedPipelines = 0; LatteShaderCache_ShowProgress(LatteShaderCache_updatePipelineLoadingProgress, true); - 
pipelineCache.EndLoading(); + if (g_renderer->GetType() == RendererAPI::Vulkan) + VulkanPipelineStableCache::GetInstance().EndLoading(); +#if ENABLE_METAL + else if (g_renderer->GetType() == RendererAPI::Metal) + MetalPipelineCache::GetInstance().EndLoading(); +#endif } bool LatteShaderCache_updatePipelineLoadingProgress() { uint32 pipelinesMissingShaders = 0; - return VulkanPipelineStableCache::GetInstance().UpdateLoading(g_shaderCacheLoaderState.loadedPipelines, pipelinesMissingShaders); + if (g_renderer->GetType() == RendererAPI::Vulkan) + return VulkanPipelineStableCache::GetInstance().UpdateLoading(g_shaderCacheLoaderState.loadedPipelines, pipelinesMissingShaders); +#if ENABLE_METAL + else if (g_renderer->GetType() == RendererAPI::Metal) + return MetalPipelineCache::GetInstance().UpdateLoading(g_shaderCacheLoaderState.loadedPipelines, pipelinesMissingShaders); +#endif + + return false; } uint64 LatteShaderCache_getShaderNameInTransferableCache(uint64 baseHash, uint32 shaderType) @@ -886,11 +919,19 @@ void LatteShaderCache_Close() s_shaderCacheGeneric = nullptr; } if (g_renderer->GetType() == RendererAPI::Vulkan) - RendererShaderVk::ShaderCacheLoading_Close(); - else if (g_renderer->GetType() == RendererAPI::OpenGL) - RendererShaderGL::ShaderCacheLoading_Close(); + RendererShaderVk::ShaderCacheLoading_Close(); + else if (g_renderer->GetType() == RendererAPI::OpenGL) + RendererShaderGL::ShaderCacheLoading_Close(); +#if ENABLE_METAL + else if (g_renderer->GetType() == RendererAPI::Metal) + RendererShaderMtl::ShaderCacheLoading_Close(); +#endif - // if Vulkan then also close pipeline cache + // if Vulkan or Metal then also close pipeline cache if (g_renderer->GetType() == RendererAPI::Vulkan) VulkanPipelineStableCache::GetInstance().Close(); +#if ENABLE_METAL + else if (g_renderer->GetType() == RendererAPI::Metal) + MetalPipelineCache::GetInstance().Close(); +#endif } diff --git a/src/Cafe/HW/Latte/Core/LatteShaderGL.cpp b/src/Cafe/HW/Latte/Core/LatteShaderGL.cpp 
index b8cb0ce1be..09c484e686 100644 --- a/src/Cafe/HW/Latte/Core/LatteShaderGL.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShaderGL.cpp @@ -26,7 +26,7 @@ bool gxShader_checkIfSuccessfullyLinked(GLuint glProgram) void LatteShader_prepareSeparableUniforms(LatteDecompilerShader* shader) { - if (g_renderer->GetType() == RendererAPI::Vulkan) + if (g_renderer->GetType() != RendererAPI::OpenGL) return; auto shaderGL = (RendererShaderGL*)shader->shader; diff --git a/src/Cafe/HW/Latte/Core/LatteTexture.cpp b/src/Cafe/HW/Latte/Core/LatteTexture.cpp index d885289101..4445fb26b8 100644 --- a/src/Cafe/HW/Latte/Core/LatteTexture.cpp +++ b/src/Cafe/HW/Latte/Core/LatteTexture.cpp @@ -170,7 +170,7 @@ void LatteTexture_UnregisterTextureMemoryOccupancy(LatteTexture* texture) } // calculate the actually accessed data range -// the resulting range is an estimate and may be smaller than the actual slice size (but not larger) +// the resulting range is an estimate and may be smaller than the actual slice size (but not larger) void LatteTexture_EstimateMipSliceAccessedDataRange(LatteTexture* texture, sint32 sliceIndex, sint32 mipIndex, LatteTextureSliceMipInfo* sliceMipInfo) { uint32 estAddrStart; @@ -222,7 +222,7 @@ void LatteTexture_InitSliceAndMipInfo(LatteTexture* texture) LatteAddrLib::AddrSurfaceInfo_OUT surfaceInfo; LatteAddrLib::GX2CalculateSurfaceInfo(texture->format, texture->width, texture->height, texture->depth, texture->dim, Latte::MakeGX2TileMode(texture->tileMode), 0, mipIndex, &surfaceInfo); sliceMipInfo->tileMode = surfaceInfo.hwTileMode; - + if (mipIndex == 0) sliceMipInfo->pitch = texture->pitch; // for the base level, use the pitch value configured in hardware else @@ -877,7 +877,7 @@ VIEWCOMPATIBILITY LatteTexture_CanTextureBeRepresentedAsView(LatteTexture* baseT // check pitch if(sliceMipInfo->pitch != pitch) continue; - // check all slices + // check all slices if(LatteAddrLib::TM_IsThickAndMacroTiled(baseTexture->tileMode)) continue; // todo - check only every 4th 
slice? for (sint32 s=0; sGetMipDepth(m); s++) @@ -978,7 +978,7 @@ LatteTextureView* LatteTexture_CreateMapping(MPTR physAddr, MPTR physMipAddr, si } // note: When creating an existing texture, we only allow mip and slice expansion at the end cemu_assert_debug(depth); - + cemu_assert_debug(!(depth > 1 && dimBase == Latte::E_DIM::DIM_2D)); cemu_assert_debug(!(numSlice > 1 && dimView == Latte::E_DIM::DIM_2D)); // todo, depth and numSlice are redundant @@ -1308,6 +1308,40 @@ LatteTexture::LatteTexture(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddre { this->enableReadback = true; } + + // calculate number of potential mip levels (from effective size) + sint32 effectiveWidth = width; + sint32 effectiveHeight = height; + sint32 effectiveDepth = depth; + if (this->overwriteInfo.hasResolutionOverwrite) + { + effectiveWidth = this->overwriteInfo.width; + effectiveHeight = this->overwriteInfo.height; + effectiveDepth = this->overwriteInfo.depth; + } + this->maxPossibleMipLevels = 1; + if (dim != Latte::E_DIM::DIM_3D) + { + for (sint32 i = 0; i < 20; i++) + { + if ((effectiveWidth >> i) <= 1 && (effectiveHeight >> i) <= 1) + { + this->maxPossibleMipLevels = i + 1; + break; + } + } + } + else + { + for (sint32 i = 0; i < 20; i++) + { + if ((effectiveWidth >> i) <= 1 && (effectiveHeight >> i) <= 1 && (effectiveDepth >> i) <= 1) + { + this->maxPossibleMipLevels = i + 1; + break; + } + } + } } LatteTexture::~LatteTexture() diff --git a/src/Cafe/HW/Latte/Core/LatteTextureLegacy.cpp b/src/Cafe/HW/Latte/Core/LatteTextureLegacy.cpp index 50aa4d8769..25c9f54b38 100644 --- a/src/Cafe/HW/Latte/Core/LatteTextureLegacy.cpp +++ b/src/Cafe/HW/Latte/Core/LatteTextureLegacy.cpp @@ -13,7 +13,7 @@ struct TexScaleXY float xy[2]; }; -struct +struct { TexScaleXY perUnit[Latte::GPU_LIMITS::NUM_TEXTURES_PER_STAGE]; // stores actualResolution/effectiveResolution ratio for each texture }LatteTextureScale[static_cast(LatteConst::ShaderType::TotalCount)] = { }; @@ -73,46 +73,16 @@ void 
LatteTexture_ReloadData(LatteTexture* tex) LatteTextureView* LatteTexture_CreateTexture(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) { const auto tex = g_renderer->texture_createTextureEx(dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth); + // init slice/mip info array LatteTexture_InitSliceAndMipInfo(tex); LatteTexture_RegisterTextureMemoryOccupancy(tex); cemu_assert_debug(mipLevels != 0); - // calculate number of potential mip levels (from effective size) - sint32 effectiveWidth = width; - sint32 effectiveHeight = height; - sint32 effectiveDepth = depth; - if (tex->overwriteInfo.hasResolutionOverwrite) - { - effectiveWidth = tex->overwriteInfo.width; - effectiveHeight = tex->overwriteInfo.height; - effectiveDepth = tex->overwriteInfo.depth; - } - tex->maxPossibleMipLevels = 1; - if (dim != Latte::E_DIM::DIM_3D) - { - for (sint32 i = 0; i < 20; i++) - { - if ((effectiveWidth >> i) <= 1 && (effectiveHeight >> i) <= 1) - { - tex->maxPossibleMipLevels = i + 1; - break; - } - } - } - else - { - for (sint32 i = 0; i < 20; i++) - { - if ((effectiveWidth >> i) <= 1 && (effectiveHeight >> i) <= 1 && (effectiveDepth >> i) <= 1) - { - tex->maxPossibleMipLevels = i + 1; - break; - } - } - } + LatteTexture_ReloadData(tex); LatteTC_MarkTextureStillInUse(tex); LatteTC_RegisterTexture(tex); + // create initial view that maps to the whole texture tex->baseView = tex->GetOrCreateView(0, tex->mipLevels, 0, tex->depth); return tex->baseView; @@ -371,4 +341,4 @@ uint64 LatteTexture_getNextUpdateEventCounter() void LatteTexture_init() { -} \ No newline at end of file +} diff --git a/src/Cafe/HW/Latte/Core/LatteTextureLoader.cpp b/src/Cafe/HW/Latte/Core/LatteTextureLoader.cpp index c06a3bf189..b80bd869c6 100644 --- 
a/src/Cafe/HW/Latte/Core/LatteTextureLoader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteTextureLoader.cpp @@ -602,7 +602,7 @@ void LatteTextureLoader_loadTextureDataIntoSlice(LatteTexture* hostTexture, sint void LatteTextureLoader_UpdateTextureSliceData(LatteTexture* tex, uint32 sliceIndex, uint32 mipIndex, MPTR physImagePtr, MPTR physMipPtr, Latte::E_DIM dim, uint32 width, uint32 height, uint32 depth, uint32 mipLevels, uint32 pitch, Latte::E_HWTILEMODE tileMode, uint32 swizzle, bool dumpTex) { LatteTextureLoaderCtx textureLoader = { 0 }; - + Latte::E_GX2SURFFMT format = tex->format; LatteTextureLoader_begin(&textureLoader, sliceIndex, mipIndex, physImagePtr, physMipPtr, format, dim, width, height, depth, mipLevels, pitch, tileMode, swizzle); @@ -853,7 +853,7 @@ void LatteTextureLoader_writeReadbackTextureToMemory(LatteTextureDefinition* tex pixelInput += 4; } } - } + } else { cemuLog_logDebug(LogType::Force, "Texture readback unsupported format {:04x} for tileMode 0x{:02x}", (uint32)textureData->format, textureData->tileMode); diff --git a/src/Cafe/HW/Latte/Core/LatteTextureLoader.h b/src/Cafe/HW/Latte/Core/LatteTextureLoader.h index f6de57d681..7b2c109b3d 100644 --- a/src/Cafe/HW/Latte/Core/LatteTextureLoader.h +++ b/src/Cafe/HW/Latte/Core/LatteTextureLoader.h @@ -594,7 +594,7 @@ class TextureDecoder_R4_G4_UNORM_To_RGBA4 : public TextureDecoder, public Single } }; -class TextureDecoder_R4_G4_UNORM_To_RGBA4_vk : public TextureDecoder, public SingletonClass +class TextureDecoder_R4_G4_UNORM_To_ABGR4 : public TextureDecoder, public SingletonClass { public: sint32 getBytesPerTexel(LatteTextureLoaderCtx* textureLoader) override @@ -679,6 +679,51 @@ class TextureDecoder_R4G4_UNORM_To_RGBA8 : public TextureDecoder, public Singlet } }; +class TextureDecoder_R4G4_UNORM_To_RG8 : public TextureDecoder, public SingletonClass +{ +public: + sint32 getBytesPerTexel(LatteTextureLoaderCtx* textureLoader) override + { + return 2; + } + + void decode(LatteTextureLoaderCtx* textureLoader, 
uint8* outputData) override + { + for (sint32 y = 0; y < textureLoader->height; y += textureLoader->stepY) + { + sint32 yc = y; + for (sint32 x = 0; x < textureLoader->width; x += textureLoader->stepX) + { + uint8* blockData = LatteTextureLoader_GetInput(textureLoader, x, y); + sint32 pixelOffset = (x + yc * textureLoader->width) * 2; + uint8 v0 = (*(uint8*)(blockData + 0)); + + uint8 red4 = (v0 >> 4) & 0xF; + uint8 green4 = (v0 & 0xF); + + red4 = (red4 << 4) | red4; + green4 = (green4 << 4) | green4; + + *(uint8*)(outputData + pixelOffset + 0) = red4; + *(uint8*)(outputData + pixelOffset + 1) = green4; + } + } + } + + void decodePixelToRGBA(uint8* blockData, uint8* outputPixel, uint8 blockOffsetX, uint8 blockOffsetY) override + { + uint8 v0 = *(blockData + 0); + uint8 red4 = (v0 >> 4) & 0xF; + uint8 green4 = (v0 & 0xF); + red4 = (red4 << 4) | red4; + green4 = (green4 << 4) | green4; + *(outputPixel + 0) = red4; + *(outputPixel + 1) = green4; + *(outputPixel + 2) = 0; + *(outputPixel + 3) = 255; + } +}; + class TextureDecoder_R4_G4_B4_A4_UNORM : public TextureDecoder, public SingletonClass { public: @@ -723,7 +768,6 @@ class TextureDecoder_R4_G4_B4_A4_UNORM : public TextureDecoder, public Singleton } }; - class TextureDecoder_R4G4B4A4_UNORM_To_RGBA8 : public TextureDecoder, public SingletonClass { public: @@ -2121,4 +2165,4 @@ class TextureDecoder_BC5 : public TextureDecoder, public SingletonClass Research which stages are disabled by DX_RASTERIZATION_KILL exactly + // for now we use a workaround: + if (!PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) + rasterizationEnabled = true; + + // Culling both front and back faces effectively disables rasterization + uint32 cullFront = PA_SU_SC_MODE_CNTL.get_CULL_FRONT(); + uint32 cullBack = PA_SU_SC_MODE_CNTL.get_CULL_BACK(); + if (cullFront && cullBack) + rasterizationEnabled = false; + + return rasterizationEnabled; + } }; static_assert(sizeof(LatteContextRegister) == 0x10000 * 4 + 9 * 4); @@ -1664,4 +1682,4 @@ 
static_assert(offsetof(LatteContextRegister, SQ_PGM_RESOURCES_ES) == Latte::REGA static_assert(offsetof(LatteContextRegister, SQ_PGM_START_GS) == Latte::REGADDR::SQ_PGM_START_GS * 4); static_assert(offsetof(LatteContextRegister, SQ_PGM_RESOURCES_GS) == Latte::REGADDR::SQ_PGM_RESOURCES_GS * 4); static_assert(offsetof(LatteContextRegister, SPI_VS_OUT_CONFIG) == Latte::REGADDR::SPI_VS_OUT_CONFIG * 4); -static_assert(offsetof(LatteContextRegister, LATTE_SPI_VS_OUT_ID_N) == Latte::REGADDR::SPI_VS_OUT_ID_0 * 4); \ No newline at end of file +static_assert(offsetof(LatteContextRegister, LATTE_SPI_VS_OUT_ID_N) == Latte::REGADDR::SPI_VS_OUT_ID_0 * 4); diff --git a/src/Cafe/HW/Latte/LatteAddrLib/AddrLibFastDecode.h b/src/Cafe/HW/Latte/LatteAddrLib/AddrLibFastDecode.h index b0e2cfb316..b54d6038e2 100644 --- a/src/Cafe/HW/Latte/LatteAddrLib/AddrLibFastDecode.h +++ b/src/Cafe/HW/Latte/LatteAddrLib/AddrLibFastDecode.h @@ -381,4 +381,4 @@ void optimizedDecodeLoops(LatteTextureLoaderCtx* textureLoader, uint8* outputDat } } } -} \ No newline at end of file +} diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp index 5972aacc45..13188743e7 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp @@ -323,8 +323,8 @@ bool LatteDecompiler_IsALUTransInstruction(bool isOP3, uint32 opcode) } else if( opcode == ALU_OP2_INST_MOV || opcode == ALU_OP2_INST_ADD || - opcode == ALU_OP2_INST_NOP || - opcode == ALU_OP2_INST_MUL || + opcode == ALU_OP2_INST_NOP || + opcode == ALU_OP2_INST_MUL || opcode == ALU_OP2_INST_DOT4 || opcode == ALU_OP2_INST_DOT4_IEEE || opcode == ALU_OP2_INST_MAX || // Not sure if MIN/MAX are non-transcendental? 
@@ -929,7 +929,7 @@ void LatteDecompiler_ParseTEXClause(LatteDecompilerShader* shaderContext, LatteD texInstruction.memRead.format = dataFormat; texInstruction.memRead.nfa = nfa; texInstruction.memRead.isSigned = isSigned; - + cfInstruction->instructionsTEX.emplace_back(texInstruction); } else @@ -1068,9 +1068,16 @@ void _LatteDecompiler_Process(LatteDecompilerShaderContext* shaderContext, uint8 LatteDecompiler_analyzeDataTypes(shaderContext); // emit code if (shaderContext->shader->hasError == false) - LatteDecompiler_emitGLSLShader(shaderContext, shaderContext->shader); + { + if (g_renderer->GetType() == RendererAPI::OpenGL || g_renderer->GetType() == RendererAPI::Vulkan) + LatteDecompiler_emitGLSLShader(shaderContext, shaderContext->shader); +#if ENABLE_METAL + else + LatteDecompiler_emitMSLShader(shaderContext, shaderContext->shader); +#endif + } LatteDecompiler_cleanup(shaderContext); - // fast access + // fast access _LatteDecompiler_GenerateDataForFastAccess(shaderContext->shader); } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h index 1159614e59..475bacb0cc 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h @@ -36,7 +36,7 @@ typedef struct uint16 mappedIndexOffset; // index in remapped uniform array }LatteFastAccessRemappedUniformEntry_buffer_t; -typedef struct +typedef struct { uint32 texUnit; sint32 uniformLocation; @@ -57,12 +57,16 @@ struct LatteDecompilerShaderResourceMapping // texture sint8 textureUnitToBindingPoint[LATTE_NUM_MAX_TEX_UNITS]; // uniform buffer - sint8 uniformVarsBufferBindingPoint{}; // special block for uniform registers/remapped array/custom variables + sint8 uniformVarsBufferBindingPoint{-1}; // special block for uniform registers/remapped array/custom variables sint8 uniformBuffersBindingPoint[LATTE_NUM_MAX_UNIFORM_BUFFERS]; // shader storage buffer for 
transform feedback (if alternative mode is used) sint8 tfStorageBindingPoint{-1}; // attributes (vertex shader only) sint8 attributeMapping[LATTE_NUM_MAX_ATTRIBUTE_LOCATIONS]; + // Metal exclusive + sint8 verticesPerInstanceBinding{-1}; + sint8 indexBufferBinding{-1}; + sint8 indexTypeBinding{-1}; sint32 getTextureCount() { @@ -179,9 +183,12 @@ struct LatteDecompilerShader std::bitset textureUnitMask2; uint16 textureUnitSamplerAssignment[LATTE_NUM_MAX_TEX_UNITS]{ 0 }; // LATTE_DECOMPILER_SAMPLER_NONE means undefined bool textureUsesDepthCompare[LATTE_NUM_MAX_TEX_UNITS]{}; + uint8 textureRenderTargetIndex[LATTE_NUM_MAX_TEX_UNITS]; // analyzer stage (pixel outputs) uint32 pixelColorOutputMask{ 0 }; // from LSB to MSB, 1 bit per written output. 1 if written (indices of color attachments) + // analyzer stage (depth output) + bool depthMask{ false }; // analyzer stage (geometry shader parameters/inputs) uint32 ringParameterCount{ 0 }; uint32 ringParameterCountFromPrevStage{ 0 }; // used in geometry shader to hold VS ringParameterCount @@ -198,7 +205,7 @@ struct LatteDecompilerShader // resource mapping (binding points) LatteDecompilerShaderResourceMapping resourceMapping{}; // uniforms - struct + struct { sint32 loc_remapped; // uf_remappedVS/uf_remappedGS/uf_remappedPS sint32 loc_uniformRegister; // uf_uniformRegisterVS/uf_uniformRegisterGS/uf_uniformRegisterPS @@ -215,7 +222,7 @@ struct LatteDecompilerShader sint32 uniformRangeSize; // entire size of uniform variable block }uniform{ 0 }; // fast access - struct _RemappedUniformBufferGroup + struct _RemappedUniformBufferGroup { _RemappedUniformBufferGroup(uint32 _kcacheBankIdOffset) : kcacheBankIdOffset(_kcacheBankIdOffset) {}; @@ -255,14 +262,14 @@ struct LatteDecompilerOutputUniformOffsets } }; -struct LatteDecompilerOptions +struct LatteDecompilerOptions { bool usesGeometryShader{ false }; // floating point math bool strictMul{}; // if true, 0*anything=0 rule is emulated // Vulkan-specific bool useTFViaSSBO{ false 
}; - struct + struct { bool hasRoundingModeRTEFloat32{ false }; }spirvInstrinsics; @@ -286,6 +293,7 @@ struct LatteDecompilerOutput_t // mapping and binding information LatteDecompilerShaderResourceMapping resourceMappingGL; LatteDecompilerShaderResourceMapping resourceMappingVK; + LatteDecompilerShaderResourceMapping resourceMappingMTL; }; struct LatteDecompilerSubroutineInfo; @@ -322,4 +330,4 @@ struct LatteParsedGSCopyShader }; LatteParsedGSCopyShader* LatteGSCopyShaderParser_parse(uint8* programData, uint32 programSize); -bool LatteGSCopyShaderParser_getExportTypeByOffset(LatteParsedGSCopyShader* shaderContext, uint32 offset, uint32* exportType, uint32* exportParam); \ No newline at end of file +bool LatteGSCopyShaderParser_getExportTypeByOffset(LatteParsedGSCopyShader* shaderContext, uint32 offset, uint32* exportType, uint32* exportParam); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index ff64988c24..8eb3c974dc 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -8,6 +8,14 @@ #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "Common/MemPtr.h" +#include "HW/Latte/ISA/LatteReg.h" +#if ENABLE_METAL +#include "HW/Latte/Renderer/Metal/MetalCommon.h" +#endif + +// Defined in LatteTextureLegacy.cpp +Latte::E_GX2SURFFMT LatteTexture_ReconstructGX2Format(const Latte::LATTE_SQ_TEX_RESOURCE_WORD1_N& texUnitWord1, const Latte::LATTE_SQ_TEX_RESOURCE_WORD4_N& texUnitWord4); /* * Return index of used color attachment based on shader pixel export index (0-7) @@ -289,15 +297,15 @@ void LatteDecompiler_analyzeTEXClause(LatteDecompilerShaderContext* shaderContex LatteDecompilerShader* shader = shaderContext->shader; for(auto& texInstruction : cfInstruction->instructionsTEX) 
{ - if( texInstruction.opcode == GPU7_TEX_INST_SAMPLE || - texInstruction.opcode == GPU7_TEX_INST_SAMPLE_L || - texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LB || - texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LZ || - texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C || + if( texInstruction.opcode == GPU7_TEX_INST_SAMPLE || + texInstruction.opcode == GPU7_TEX_INST_SAMPLE_L || + texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LB || + texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LZ || + texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_L || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_LZ || - texInstruction.opcode == GPU7_TEX_INST_FETCH4 || - texInstruction.opcode == GPU7_TEX_INST_SAMPLE_G || + texInstruction.opcode == GPU7_TEX_INST_FETCH4 || + texInstruction.opcode == GPU7_TEX_INST_SAMPLE_G || texInstruction.opcode == GPU7_TEX_INST_LD ) { if (texInstruction.textureFetch.textureIndex < 0 || texInstruction.textureFetch.textureIndex >= LATTE_NUM_MAX_TEX_UNITS) @@ -315,7 +323,7 @@ void LatteDecompiler_analyzeTEXClause(LatteDecompilerShaderContext* shaderContex shader->textureUnitSamplerAssignment[texInstruction.textureFetch.textureIndex] = texInstruction.textureFetch.samplerIndex; if( texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_L || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_LZ) shader->textureUsesDepthCompare[texInstruction.textureFetch.textureIndex] = true; - + bool useTexelCoords = false; if (texInstruction.opcode == GPU7_TEX_INST_SAMPLE && (texInstruction.textureFetch.unnormalized[0] && texInstruction.textureFetch.unnormalized[1] && texInstruction.textureFetch.unnormalized[2] && texInstruction.textureFetch.unnormalized[3])) useTexelCoords = true; @@ -384,7 +392,7 @@ void LatteDecompiler_analyzeExport(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader = shaderContext->shader; if( shader->shaderType == LatteConst::ShaderType::Pixel ) { 
- if( cfInstruction->exportType == 0 && cfInstruction->exportArrayBase < 8 ) + if (cfInstruction->exportType == 0 && cfInstruction->exportArrayBase < 8) { // remember color outputs that are written for(uint32 i=0; i<(cfInstruction->exportBurstCount+1); i++) @@ -393,9 +401,11 @@ void LatteDecompiler_analyzeExport(LatteDecompilerShaderContext* shaderContext, shader->pixelColorOutputMask |= (1<exportType == 0 && cfInstruction->exportArrayBase == 61 ) + else if (cfInstruction->exportType == 0 && cfInstruction->exportArrayBase == 61) { - // writes pixel depth + // Only check for depth buffer mask on Metal, as it's not in the PS hash on other backends + if (g_renderer->GetType() != RendererAPI::Metal || LatteMRT::GetActiveDepthBufferMask(*shaderContext->contextRegistersNew)) + shader->depthMask = true; } else debugBreakpoint(); @@ -421,7 +431,7 @@ void LatteDecompiler_analyzeExport(LatteDecompilerShaderContext* shaderContext, void LatteDecompiler_analyzeSubroutine(LatteDecompilerShaderContext* shaderContext, uint32 cfAddr) { // analyze CF and clauses up to RET statement - + // todo - find cfInstruction index from cfAddr cemu_assert_debug(false); @@ -500,6 +510,18 @@ namespace LatteDecompiler } } + void _initTextureBindingPointsMTL(LatteDecompilerShaderContext* decompilerContext) + { + // for Metal we use consecutive indices + for (sint32 i = 0; i < LATTE_NUM_MAX_TEX_UNITS; i++) + { + if (!decompilerContext->output->textureUnitMask[i] || decompilerContext->shader->textureRenderTargetIndex[i] != 255) + continue; + decompilerContext->output->resourceMappingMTL.textureUnitToBindingPoint[i] = decompilerContext->currentTextureBindingPointMTL; + decompilerContext->currentTextureBindingPointMTL++; + } + } + void _initHasUniformVarBlock(LatteDecompilerShaderContext* decompilerContext) { decompilerContext->hasUniformVarBlock = false; @@ -507,9 +529,9 @@ namespace LatteDecompiler decompilerContext->hasUniformVarBlock = true; else if (decompilerContext->shader->uniformMode == 
LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE) decompilerContext->hasUniformVarBlock = true; - - bool hasAnyViewportScaleDisabled = - !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_SCALE_ENA() || + + bool hasAnyViewportScaleDisabled = + !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_SCALE_ENA() || !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Y_SCALE_ENA() || !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Z_SCALE_ENA(); // we currently only support all on/off. Individual component scaling is not supported @@ -537,6 +559,15 @@ namespace LatteDecompiler { decompilerContext->hasUniformVarBlock = true; // uf_verticesPerInstance and uf_streamoutBufferBase* } +#if ENABLE_METAL + if (g_renderer->GetType() == RendererAPI::Metal) + { + bool usesGeometryShader = UseGeometryShader(*decompilerContext->contextRegistersNew, decompilerContext->options->usesGeometryShader); + + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex && usesGeometryShader) + decompilerContext->hasUniformVarBlock = true; // uf_verticesPerInstance + } +#endif } void _initUniformBindingPoints(LatteDecompilerShaderContext* decompilerContext) @@ -554,14 +585,13 @@ namespace LatteDecompiler } } // assign binding point to uniform var block - decompilerContext->output->resourceMappingGL.uniformVarsBufferBindingPoint = -1; // OpenGL currently doesnt use a uniform block if (decompilerContext->hasUniformVarBlock) { decompilerContext->output->resourceMappingVK.uniformVarsBufferBindingPoint = decompilerContext->currentBindingPointVK; decompilerContext->currentBindingPointVK++; + decompilerContext->output->resourceMappingMTL.uniformVarsBufferBindingPoint = decompilerContext->currentBufferBindingPointMTL; + decompilerContext->currentBufferBindingPointMTL++; } - else - decompilerContext->output->resourceMappingVK.uniformVarsBufferBindingPoint = -1; // assign binding points to uniform buffers if 
(decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK) { @@ -580,6 +610,8 @@ namespace LatteDecompiler decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i] = decompilerContext->currentBindingPointVK; decompilerContext->currentBindingPointVK++; + decompilerContext->output->resourceMappingMTL.uniformBuffersBindingPoint[i] = decompilerContext->currentBufferBindingPointMTL; + decompilerContext->currentBufferBindingPointMTL++; } // for OpenGL we use the relative buffer index for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++) @@ -601,6 +633,8 @@ namespace LatteDecompiler { decompilerContext->output->resourceMappingVK.tfStorageBindingPoint = decompilerContext->currentBindingPointVK; decompilerContext->currentBindingPointVK++; + decompilerContext->output->resourceMappingMTL.tfStorageBindingPoint = decompilerContext->currentBufferBindingPointMTL; + decompilerContext->currentBufferBindingPointMTL++; } } @@ -617,6 +651,7 @@ namespace LatteDecompiler { decompilerContext->output->resourceMappingGL.attributeMapping[i] = bindingIndex; decompilerContext->output->resourceMappingVK.attributeMapping[i] = bindingIndex; + decompilerContext->output->resourceMappingMTL.attributeMapping[i] = bindingIndex; bindingIndex++; } } @@ -805,7 +840,7 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD for(sint32 i=0; ioutput->textureUnitMask[i]) + if (!shaderContext->output->textureUnitMask[i]) { // texture unit not used shader->textureUnitDim[i] = (Latte::E_DIM)0xFF; @@ -827,6 +862,78 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD shader->textureUnitList[shader->textureUnitListCount] = i; shader->textureUnitListCount++; } + shader->textureRenderTargetIndex[i] = 255; + } + // check if textures are used as render targets + if (shader->shaderType == LatteConst::ShaderType::Pixel) + { + struct { + sint32 index; + MPTR physAddr; + Latte::E_GX2SURFFMT format; + 
Latte::E_HWTILEMODE tileMode; + } colorBuffers[LATTE_NUM_COLOR_TARGET]{}; + + uint8 colorBufferMask = LatteMRT::GetActiveColorBufferMask(shader, *shaderContext->contextRegistersNew); + sint32 colorBufferCount = 0; + for (sint32 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + auto& colorBuffer = colorBuffers[colorBufferCount]; + if (((colorBufferMask) & (1 << i)) == 0) + continue; // color buffer not enabled + + uint32* colorBufferRegBase = shaderContext->contextRegisters + (mmCB_COLOR0_BASE + i); + uint32 regColorBufferBase = colorBufferRegBase[mmCB_COLOR0_BASE - mmCB_COLOR0_BASE] & 0xFFFFFF00; // the low 8 bits are ignored? How to Survive seems to rely on this + + uint32 regColorInfo = colorBufferRegBase[mmCB_COLOR0_INFO - mmCB_COLOR0_BASE]; + + MPTR colorBufferPhysMem = regColorBufferBase; + Latte::E_HWTILEMODE colorBufferTileMode = (Latte::E_HWTILEMODE)((regColorInfo >> 8) & 0xF); + + Latte::E_GX2SURFFMT colorBufferFormat = LatteMRT::GetColorBufferFormat(i, *shaderContext->contextRegistersNew); + + colorBuffer = {i, colorBufferPhysMem, colorBufferFormat, colorBufferTileMode}; + colorBufferCount++; + } + + for (sint32 i = 0; i < shader->textureUnitListCount; i++) + { + sint32 textureIndex = shader->textureUnitList[i]; + const auto& texRegister = texRegs[textureIndex]; + + // get physical address of texture data + MPTR physAddr = (texRegister.word2.get_BASE_ADDRESS() << 8); + if (physAddr == MPTR_NULL) + continue; // invalid data + + auto tileMode = texRegister.word0.get_TILE_MODE(); + + // Check for dimension + auto dim = shader->textureUnitDim[textureIndex]; + // TODO: 2D arrays could be supported as well + if (dim != Latte::E_DIM::DIM_2D) + continue; + + // Check for mip level + auto lastMip = texRegister.word5.get_LAST_LEVEL(); + // TODO: multiple mip levels could be supported as well + if (lastMip != 0) + continue; + + Latte::E_GX2SURFFMT format = LatteTexture_ReconstructGX2Format(texRegister.word1, texRegister.word4); + + // Check if the texture is used as 
render target + for (sint32 j = 0; j < colorBufferCount; j++) + { + const auto& colorBuffer = colorBuffers[j]; + + if (physAddr == colorBuffer.physAddr && format == colorBuffer.format && tileMode == colorBuffer.tileMode) + { + shader->textureRenderTargetIndex[textureIndex] = colorBuffer.index; + break; + } + } + } } // for geometry shaders check the copy shader for stream writes if (shader->shaderType == LatteConst::ShaderType::Geometry && shaderContext->parsedGSCopyShader->list_streamWrites.empty() == false) @@ -1002,6 +1109,10 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD shaderContext->output->resourceMappingVK.setIndex = 2; LatteDecompiler::_initTextureBindingPointsGL(shaderContext); LatteDecompiler::_initTextureBindingPointsVK(shaderContext); + LatteDecompiler::_initTextureBindingPointsMTL(shaderContext); LatteDecompiler::_initUniformBindingPoints(shaderContext); LatteDecompiler::_initAttributeBindingPoints(shaderContext); + shaderContext->output->resourceMappingMTL.verticesPerInstanceBinding = shaderContext->currentBufferBindingPointMTL++; + shaderContext->output->resourceMappingMTL.indexBufferBinding = shaderContext->currentBufferBindingPointMTL++; + shaderContext->output->resourceMappingMTL.indexTypeBinding = shaderContext->currentBufferBindingPointMTL++; } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp new file mode 100644 index 0000000000..cb512308e4 --- /dev/null +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -0,0 +1,4447 @@ +#include "Cafe/HW/Latte/Core/LatteConst.h" +#include "Cafe/HW/Latte/Core/LatteShaderAssembly.h" +#include "Cafe/HW/Latte/ISA/RegDefines.h" +#include "Cafe/OS/libs/gx2/GX2.h" // todo - remove dependency +#include "Cafe/HW/Latte/Core/Latte.h" +#include "Cafe/HW/Latte/Core/LatteDraw.h" +#include "Cafe/HW/Latte/Core/LatteShader.h" +#include 
"Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h" +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInstructions.h" +#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "config/ActiveSettings.h" +#include "util/helpers/StringBuf.h" + +#include +#include + +#define _CRLF "\r\n" + +void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext, StringBuf* src, LatteParsedFetchShaderAttribute_t* attrib); + +/* + * Variable names: + * R0-R127 temp + * Most variables are multi-typed and the respective type is appended to the name + * Type suffixes are: f (float), i (32bit int), ui (unsigned 32bit int) + * Examples: R13ui.x, tempf.z + */ + +// local prototypes +void _emitTypeConversionPrefixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType, sint32 componentCount = 1); +void _emitTypeConversionSuffixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType); +void LatteDecompiler_emitClauseCodeMSL(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, bool isSubroutine); + +static const char* _getElementStrByIndex(uint32 channel) +{ + switch (channel) + { + case 0: + return "x"; + case 1: + return "y"; + case 2: + return "z"; + case 3: + return "w"; + } + return "UNDEFINED"; +} + +static char _tempGenString[64][256]; +static uint32 _tempGenStringIndex = 0; + +static char* _getTempString() +{ + char* str = _tempGenString[_tempGenStringIndex]; + _tempGenStringIndex = (_tempGenStringIndex+1)%64; + return str; +} + +static char* _getActiveMaskVarName(LatteDecompilerShaderContext* shaderContext, sint32 index) +{ + char* varName = _getTempString(); + if (shaderContext->isSubroutine) + sprintf(varName, 
"activeMaskStackSub%04x[%d]", shaderContext->subroutineInfo->cfAddr, index); + else + sprintf(varName, "activeMaskStack[%d]", index); + return varName; +} + +static char* _getActiveMaskCVarName(LatteDecompilerShaderContext* shaderContext, sint32 index) +{ + char* varName = _getTempString(); + if (shaderContext->isSubroutine) + sprintf(varName, "activeMaskStackCSub%04x[%d]", shaderContext->subroutineInfo->cfAddr, index); + else + sprintf(varName, "activeMaskStackC[%d]", index); + return varName; +} + +static char* _getRegisterVarName(LatteDecompilerShaderContext* shaderContext, uint32 index, sint32 destRelIndexMode=-1) +{ + auto type = shaderContext->typeTracker.defaultDataType; + char* tempStr = _getTempString(); + if (shaderContext->typeTracker.useArrayGPRs == false) + { + if (type == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + sprintf(tempStr, "R%di", index); + else if (type == LATTE_DECOMPILER_DTYPE_FLOAT) + sprintf(tempStr, "R%df", index); + } + else + { + char destRelOffset[32]; + if (destRelIndexMode >= 0) + { + if (destRelIndexMode == GPU7_INDEX_AR_X) + strcpy(destRelOffset, "ARi.x"); + else if (destRelIndexMode == GPU7_INDEX_AR_Y) + strcpy(destRelOffset, "ARi.y"); + else if (destRelIndexMode == GPU7_INDEX_AR_Z) + strcpy(destRelOffset, "ARi.z"); + else if (destRelIndexMode == GPU7_INDEX_AR_W) + strcpy(destRelOffset, "ARi.w"); + else + debugBreakpoint(); + if (type == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + sprintf(tempStr, "Ri[%d+%s]", index, destRelOffset); + } + else if (type == LATTE_DECOMPILER_DTYPE_FLOAT) + { + sprintf(tempStr, "Rf[%d+%s]", index, destRelOffset); + } + } + else + { + if (type == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + sprintf(tempStr, "Ri[%d]", index); + } + else if (type == LATTE_DECOMPILER_DTYPE_FLOAT) + { + sprintf(tempStr, "Rf[%d]", index); + } + } + } + return tempStr; +} + +static void _appendRegisterTypeSuffix(StringBuf* src, sint32 dataType) +{ + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->add("i"); + else if (dataType 
== LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) + src->add("ui"); + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->add("f"); + else + cemu_assert_unimplemented(); +} + +// appends x/y/z/w +static void _appendChannel(StringBuf* src, sint32 channelIndex) +{ + cemu_assert_debug(channelIndex >= 0 && channelIndex <= 3); + switch (channelIndex) + { + case 0: + src->add("x"); + return; + case 1: + src->add("y"); + return; + case 2: + src->add("z"); + return; + case 3: + src->add("w"); + return; + } +} + +// appends .x/.y/.z/.w +static void _appendChannelAccess(StringBuf* src, sint32 channelIndex) +{ + cemu_assert_debug(channelIndex >= 0 && channelIndex <= 3); + switch (channelIndex) + { + case 0: + src->add(".x"); + return; + case 1: + src->add(".y"); + return; + case 2: + src->add(".z"); + return; + case 3: + src->add(".w"); + return; + } +} + +static void _appendPVPS(LatteDecompilerShaderContext* shaderContext, StringBuf* src, uint32 groupIndex, uint8 aluUnit) +{ + cemu_assert_debug(aluUnit < 5); + if (aluUnit == 4) + { + src->addFmt("PS{}", (groupIndex & 1)); + _appendRegisterTypeSuffix(src, shaderContext->typeTracker.defaultDataType); + return; + } + src->addFmt("PV{}", (groupIndex & 1)); + _appendRegisterTypeSuffix(src, shaderContext->typeTracker.defaultDataType); + _appendChannel(src, aluUnit); +} + +std::string _FormatFloatAsConstant(float f) +{ + char floatAsStr[64]; + size_t floatAsStrLen = fmt::format_to_n(floatAsStr, 64, "{:#}", f).size; + size_t floatAsStrLenOrg = floatAsStrLen; + if(floatAsStrLen > 0 && floatAsStr[floatAsStrLen-1] == '.') + { + floatAsStr[floatAsStrLen] = '0'; + floatAsStrLen++; + } + cemu_assert(floatAsStrLen < 50); // constant suspiciously long? 
+ floatAsStr[floatAsStrLen] = '\0'; + cemu_assert_debug(floatAsStrLen >= 3); // shortest possible form is "0.0" + return floatAsStr; +} + +// tracks PV/PS and register backups +struct ALUClauseTemporariesState +{ + struct PVPSAlias + { + enum class LOCATION_TYPE : uint8 + { + LOCATION_NONE, + LOCATION_GPR, + LOCATION_PVPS, + }; + + LOCATION_TYPE location{ LOCATION_TYPE::LOCATION_NONE }; + uint8 index; // GPR index or temporary index + uint8 aluUnit; // x,y,z,w (or 4 for PS) + + void SetLocationGPR(uint8 gprIndex, uint8 channel) + { + cemu_assert_debug(channel < 4); + this->location = LOCATION_TYPE::LOCATION_GPR; + this->index = gprIndex; + this->aluUnit = channel; + } + + void SetLocationPSPVTemporary(uint8 aluUnit, uint32 groupIndex) + { + cemu_assert_debug(aluUnit < 5); + this->location = LOCATION_TYPE::LOCATION_PVPS; + this->index = groupIndex & 1; + this->aluUnit = aluUnit; + } + }; + + struct GPRTemporary + { + GPRTemporary(uint8 gprIndex, uint8 channel, uint8 backupVarIndex) : gprIndex(gprIndex), channel(channel), backupVarIndex(backupVarIndex) {} + + uint8 gprIndex; + uint8 channel; + uint8 backupVarIndex; + }; + + void TrackGroupOutputPVPS(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstr, size_t numInstr) + { + // unset current + for (auto& it : m_pvps) + it.location = PVPSAlias::LOCATION_TYPE::LOCATION_NONE; + for (size_t i = 0; i < numInstr; i++) + { + LatteDecompilerALUInstruction& inst = aluInstr[i]; + if (!inst.isOP3 && inst.opcode == ALU_OP2_INST_NOP) + continue; // skip NOP instruction + + if (inst.writeMask == 0) + { + // map to temporary + m_pvps[inst.aluUnit].SetLocationPSPVTemporary(inst.aluUnit, aluInstr->instructionGroupIndex); + } + else + { + // map to GPR + if(inst.destRel == 0) // is PV/PS set for indexed writes? 
+ m_pvps[inst.aluUnit].SetLocationGPR(inst.destGpr, inst.destElem); + } + } + } + + bool HasPVPS(uint8 aluUnitIndex) const + { + cemu_assert_debug(aluUnitIndex < 5); + return m_pvps[aluUnitIndex].location != PVPSAlias::LOCATION_TYPE::LOCATION_NONE; + } + + void EmitPVPSAccess(LatteDecompilerShaderContext* shaderContext, uint8 aluUnitIndex, uint32 currentGroupIndex) const + { + switch (m_pvps[aluUnitIndex].location) + { + case PVPSAlias::LOCATION_TYPE::LOCATION_GPR: + { + sint32 temporaryIndex = GetTemporaryForGPR(m_pvps[aluUnitIndex].index, m_pvps[aluUnitIndex].aluUnit); + if (temporaryIndex < 0) + { + shaderContext->shaderSource->add(_getRegisterVarName(shaderContext, m_pvps[aluUnitIndex].index, -1)); + _appendChannelAccess(shaderContext->shaderSource, m_pvps[aluUnitIndex].aluUnit); + } + else + { + // use temporary instead of GPR + shaderContext->shaderSource->addFmt("backupReg{}", temporaryIndex); + _appendRegisterTypeSuffix(shaderContext->shaderSource, shaderContext->typeTracker.defaultDataType); + } + break; + } + case PVPSAlias::LOCATION_TYPE::LOCATION_PVPS: + _appendPVPS(shaderContext, shaderContext->shaderSource, currentGroupIndex-1, m_pvps[aluUnitIndex].aluUnit); + break; + default: + cemuLog_log(LogType::Force, "Shader {:016x} accesses PV/PS without writing to it", shaderContext->shaderBaseHash); + cemu_assert_suspicious(); + break; + } + } + + /* + * Check for GPR channels which are modified before they are read within the same group + * These registers need to be copied to a temporary + */ + void CreateGPRTemporaries(LatteDecompilerShaderContext* shaderContext, std::span aluInstructions) + { + uint8 registerChannelWriteMask[(LATTE_NUM_GPR * 4 + 7) / 8] = { 0 }; + + m_gprTemporaries.clear(); + for (auto& aluInstruction : aluInstructions) + { + // ignore NOP instructions + if (aluInstruction.isOP3 == false && aluInstruction.opcode == ALU_OP2_INST_NOP) + continue; + cemu_assert_debug(aluInstruction.destElem <= 3); + // check if any previously written 
register is read + for (sint32 f = 0; f < 3; f++) + { + uint32 readGPRIndex; + uint32 readGPRChannel; + if (GPU7_ALU_SRC_IS_GPR(aluInstruction.sourceOperand[f].sel)) + { + readGPRIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction.sourceOperand[f].sel); + cemu_assert_debug(aluInstruction.sourceOperand[f].chan <= 3); + readGPRChannel = aluInstruction.sourceOperand[f].chan; + } + else if (GPU7_ALU_SRC_IS_PV(aluInstruction.sourceOperand[f].sel) || GPU7_ALU_SRC_IS_PS(aluInstruction.sourceOperand[f].sel)) + { + uint8 aluUnitIndex = 0; + if (GPU7_ALU_SRC_IS_PV(aluInstruction.sourceOperand[f].sel)) + aluUnitIndex = aluInstruction.sourceOperand[f].chan; + else + aluUnitIndex = 4; + // if aliased to a GPR, then consider it a GPR read + if(m_pvps[aluUnitIndex].location != PVPSAlias::LOCATION_TYPE::LOCATION_GPR) + continue; + readGPRIndex = m_pvps[aluUnitIndex].index; + readGPRChannel = m_pvps[aluUnitIndex].aluUnit; + } + else + continue; + // track GPR read + if ((registerChannelWriteMask[(readGPRIndex * 4 + aluInstruction.sourceOperand[f].chan) / 8] & (1 << ((readGPRIndex * 4 + aluInstruction.sourceOperand[f].chan) % 8))) != 0) + { + // register is overwritten by previous instruction, a temporary variable is required + if (GetTemporaryForGPR(readGPRIndex, readGPRChannel) < 0) + m_gprTemporaries.emplace_back(readGPRIndex, readGPRChannel, m_gprTemporaries.size()); + } + } + // track write + if (aluInstruction.writeMask != 0) + registerChannelWriteMask[(aluInstruction.destGpr * 4 + aluInstruction.destElem) / 8] |= (1 << ((aluInstruction.destGpr * 4 + aluInstruction.destElem) % 8)); + } + // output code to move GPRs into temporaries + StringBuf* src = shaderContext->shaderSource; + for (auto& it : m_gprTemporaries) + { + src->addFmt("backupReg{}", it.backupVarIndex); + _appendRegisterTypeSuffix(src, shaderContext->typeTracker.defaultDataType); + src->add(" = "); + src->add(_getRegisterVarName(shaderContext, it.gprIndex)); + _appendChannelAccess(src, it.channel); + src->add(";" 
_CRLF); + } + } + + // returns -1 if none present + sint32 GetTemporaryForGPR(uint8 gprIndex, uint8 channel) const + { + for (auto& it : m_gprTemporaries) + { + if (it.gprIndex == gprIndex && it.channel == channel) + return (sint32)it.backupVarIndex; + } + return -1; + } + +private: + PVPSAlias m_pvps[5]{}; + boost::container::small_vector m_gprTemporaries; +}; + +sint32 _getVertexShaderOutParamSemanticId(uint32* contextRegisters, sint32 index); +sint32 _getInputRegisterDataType(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex); +sint32 _getALUInstructionOutputDataType(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction); +bool _isReductionInstruction(LatteDecompilerALUInstruction* aluInstruction); + +/* + * Writes the name of the output variable and channel + * E.g. R5f.x or tempf.x if writeMask is 0 + */ +static void _emitInstructionOutputVariableName(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction) +{ + auto src = shaderContext->shaderSource; + sint32 outputDataType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); + if( aluInstruction->writeMask == 0 ) + { + // does not output to GPR + if( !_isReductionInstruction(aluInstruction) ) + { + // output to PV/PS + _appendPVPS(shaderContext, src, aluInstruction->instructionGroupIndex, aluInstruction->aluUnit); + return; + } + else + { + // output to temp + src->add("temp"); + _appendRegisterTypeSuffix(src, outputDataType); + } + _appendChannelAccess(src, aluInstruction->aluUnit); + } + else + { + // output to GPR. 
Aliasing to PV/PS happens at the end of the group + src->add(_getRegisterVarName(shaderContext, aluInstruction->destGpr, aluInstruction->destRel==0?-1:aluInstruction->indexMode)); + _appendChannelAccess(src, aluInstruction->destElem); + } +} + +static void _emitInstructionPVPSOutputVariableName(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction) +{ + _appendPVPS(shaderContext, shaderContext->shaderSource, aluInstruction->instructionGroupIndex, aluInstruction->aluUnit); +} + +static void _emitRegisterAccessCode(LatteDecompilerShaderContext* shaderContext, sint32 gprIndex, sint32 channel0, sint32 channel1, sint32 channel2, sint32 channel3, sint32 dataType = -1) +{ + StringBuf* src = shaderContext->shaderSource; + sint32 registerElementDataType = shaderContext->typeTracker.defaultDataType; + cemu_assert_debug(gprIndex >= 0 && gprIndex <= 127); + + sint32 channelArray[4]; + channelArray[0] = channel0; + channelArray[1] = channel1; + channelArray[2] = channel2; + channelArray[3] = channel3; + + sint32 numComponents = 0; + for (sint32 i = 0; i < 4; i++) + { + if (channelArray[i] >= 0 && channelArray[i] <= 3) + numComponents++; + } + + if (dataType >= 0) + { + _emitTypeConversionPrefixMSL(shaderContext, registerElementDataType, dataType, numComponents); + } + if (shaderContext->typeTracker.useArrayGPRs) + src->add("R"); + else + src->addFmt("R{}", gprIndex); + _appendRegisterTypeSuffix(src, registerElementDataType); + if (shaderContext->typeTracker.useArrayGPRs) + src->addFmt("[{}]", gprIndex); + + src->add("."); + + for (sint32 i = 0; i < 4; i++) + { + if (channelArray[i] >= 0 && channelArray[i] <= 3) + src->add(_getElementStrByIndex(channelArray[i])); + else if (channelArray[i] == -1) + { + // channel not used + } + else + { + cemu_assert_unimplemented(); + } + } + if (dataType >= 0) + _emitTypeConversionSuffixMSL(shaderContext, registerElementDataType, dataType); +} + +// optimized variant of _emitRegisterAccessCode for raw one 
channel reads +static void _emitRegisterChannelAccessCode(LatteDecompilerShaderContext* shaderContext, sint32 gprIndex, sint32 channel, sint32 dataType) +{ + cemu_assert_debug(gprIndex >= 0 && gprIndex <= 127); + cemu_assert_debug(channel >= 0 && channel < 4); + StringBuf* src = shaderContext->shaderSource; + sint32 registerElementDataType = shaderContext->typeTracker.defaultDataType; + _emitTypeConversionPrefixMSL(shaderContext, registerElementDataType, dataType); + if (shaderContext->typeTracker.useArrayGPRs) + src->add("R"); + else + src->addFmt("R{}", gprIndex); + _appendRegisterTypeSuffix(src, registerElementDataType); + if (shaderContext->typeTracker.useArrayGPRs) + src->addFmt("[{}]", gprIndex); + src->add("."); + src->add(_getElementStrByIndex(channel)); + _emitTypeConversionSuffixMSL(shaderContext, registerElementDataType, dataType); +} + +static void _emitALURegisterInputAccessCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex) +{ + StringBuf* src = shaderContext->shaderSource; + sint32 currentRegisterElementType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + cemu_assert_debug(GPU7_ALU_SRC_IS_GPR(aluInstruction->sourceOperand[operandIndex].sel)); + sint32 gprIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + sint32 temporaryIndex = shaderContext->aluPVPSState->GetTemporaryForGPR(gprIndex, aluInstruction->sourceOperand[operandIndex].chan); + if(temporaryIndex >= 0) + { + // access via backup variable + src->addFmt("backupReg{}", temporaryIndex); + _appendRegisterTypeSuffix(src, currentRegisterElementType); + } + else + { + // access via register variable + _emitRegisterAccessCode(shaderContext, gprIndex, aluInstruction->sourceOperand[operandIndex].chan, -1, -1, -1); + } +} + +static void _emitPVPSAccessCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex, uint8 
aluUnitIndex) +{ + cemu_assert_debug(aluInstruction->instructionGroupIndex > 0); // PV/PS is uninitialized for group 0 + // PV/PS vars are currently always using the default type (shaderContext->typeTracker.defaultDataType) + shaderContext->aluPVPSState->EmitPVPSAccess(shaderContext, aluUnitIndex, aluInstruction->instructionGroupIndex); +} + +/* + * Emits the expression used for calculating the index for uniform access + * For static access, this is a number + * For dynamic access, this is AR.* + base + */ +static void _emitUniformAccessIndexCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex) +{ + StringBuf* src = shaderContext->shaderSource; + bool isUniformRegister = GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel); + sint32 uniformOffset = 0; // index into array, for relative accesses this is the base offset + if( isUniformRegister ) + { + uniformOffset = GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + } + else + { + if( GPU7_ALU_SRC_IS_CBANK0(aluInstruction->sourceOperand[operandIndex].sel) ) + { + uniformOffset = GPU7_ALU_SRC_GET_CBANK0_INDEX(aluInstruction->sourceOperand[operandIndex].sel) + aluInstruction->cfInstruction->cBank0AddrBase; + } + else + { + uniformOffset = GPU7_ALU_SRC_GET_CBANK1_INDEX(aluInstruction->sourceOperand[operandIndex].sel) + aluInstruction->cfInstruction->cBank1AddrBase; + } + } + if( aluInstruction->sourceOperand[operandIndex].rel != 0 ) + { + if (aluInstruction->indexMode == GPU7_INDEX_AR_X) + src->addFmt("ARi.x+{}", uniformOffset); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_Y) + src->addFmt("ARi.y+{}", uniformOffset); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_Z) + src->addFmt("ARi.z+{}", uniformOffset); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_W) + src->addFmt("ARi.w+{}", uniformOffset); + else + cemu_assert_unimplemented(); + } + else + { + src->addFmt("{}", uniformOffset); + 
} +} + +static void _emitUniformAccessCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex, sint32 requiredType) +{ + StringBuf* src = shaderContext->shaderSource; + if(shaderContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_REMAPPED ) + { + // uniform registers or buffers are accessed statically with predictable offsets + // find entry in remapped uniform + if( aluInstruction->sourceOperand[operandIndex].rel != 0 ) + debugBreakpoint(); + bool isUniformRegister = GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel); + sint32 uniformOffset = 0; // index into array + sint32 uniformBufferIndex = 0; + if( isUniformRegister ) + { + uniformOffset = GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + uniformBufferIndex = 0; + } + else + { + if( GPU7_ALU_SRC_IS_CBANK0(aluInstruction->sourceOperand[operandIndex].sel) ) + { + uniformOffset = GPU7_ALU_SRC_GET_CBANK0_INDEX(aluInstruction->sourceOperand[operandIndex].sel) + aluInstruction->cfInstruction->cBank0AddrBase; + uniformBufferIndex = aluInstruction->cfInstruction->cBank0Index; + } + else + { + uniformOffset = GPU7_ALU_SRC_GET_CBANK1_INDEX(aluInstruction->sourceOperand[operandIndex].sel) + aluInstruction->cfInstruction->cBank1AddrBase; + uniformBufferIndex = aluInstruction->cfInstruction->cBank1Index; + } + } + LatteDecompilerRemappedUniformEntry_t* remappedUniformEntry = NULL; + for(size_t i=0; i< shaderContext->shader->list_remappedUniformEntries.size(); i++) + { + LatteDecompilerRemappedUniformEntry_t* remappedUniformEntryItr = shaderContext->shader->list_remappedUniformEntries.data() + i; + if( remappedUniformEntryItr->isRegister && isUniformRegister ) + { + if( remappedUniformEntryItr->index == uniformOffset ) + { + remappedUniformEntry = remappedUniformEntryItr; + break; + } + } + else + { + if( remappedUniformEntryItr->kcacheBankId == uniformBufferIndex && remappedUniformEntryItr->index == 
uniformOffset ) + { + remappedUniformEntry = remappedUniformEntryItr; + break; + } + } + } + cemu_assert_debug(remappedUniformEntry); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + src->addFmt("supportBuffer.remapped[{}]", remappedUniformEntry->mappedIndex); + _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + } + else if( shaderContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE ) + { + // uniform registers are accessed with unpredictable (dynamic) offset + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + src->add("supportBuffer.uniformRegister["); + _emitUniformAccessIndexCode(shaderContext, aluInstruction, operandIndex); + src->add("]"); + + _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + } + else if( shaderContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK ) + { + // uniform buffers are available as a whole + bool isUniformRegister = GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel); + if( isUniformRegister ) + debugBreakpoint(); + sint32 uniformBufferIndex = 0; + if( GPU7_ALU_SRC_IS_CBANK0(aluInstruction->sourceOperand[operandIndex].sel) ) + { + uniformBufferIndex = aluInstruction->cfInstruction->cBank0Index; + } + else + { + uniformBufferIndex = aluInstruction->cfInstruction->cBank1Index; + } + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + src->addFmt("ubuff{}.d[", uniformBufferIndex); + _emitUniformAccessIndexCode(shaderContext, aluInstruction, operandIndex); + src->addFmt("]"); + + _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, 
LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + } + else + debugBreakpoint(); +} + +// Generates (slow) code to read an indexed GPR +static void _emitCodeToReadRelativeGPR(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex, sint32 requiredType) +{ + StringBuf* src = shaderContext->shaderSource; + uint32 gprBaseIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + cemu_assert_debug(aluInstruction->sourceOperand[operandIndex].rel != 0); + + if( shaderContext->typeTracker.useArrayGPRs ) + { + _emitTypeConversionPrefixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, requiredType); + src->add(_getRegisterVarName(shaderContext, gprBaseIndex, aluInstruction->indexMode)); + _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, requiredType); + return; + } + + char indexAccessCode[64]; + if (aluInstruction->indexMode == GPU7_INDEX_AR_X) + sprintf(indexAccessCode, "ARi.x"); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_Y) + sprintf(indexAccessCode, "ARi.y"); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_Z) + sprintf(indexAccessCode, "ARi.z"); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_W) + sprintf(indexAccessCode, "ARi.w"); + else + cemu_assert_unimplemented(); + + if( LATTE_DECOMPILER_DTYPE_SIGNED_INT != requiredType ) + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + + // generated code looks like this: + // result = ((lookupIndex==0)?GPR5:(lookupIndex==1)?GPR6:(lookupIndex==2)?GPR7:...:(lookupIndex==122)?GPR127:0) + src->add("("); + for(sint32 i=gprBaseIndex; ianalyzer.gprUseMask[i / 8] & (1 << (i % 8))) == 0 ) + continue; + src->addFmt("({}=={})?", indexAccessCode, i-gprBaseIndex); + // code to access gpr + uint32 gprIndex = i; + src->add(_getRegisterVarName(shaderContext, i)); + 
_appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + src->add(":"); + } + src->add("0)"); + if( LATTE_DECOMPILER_DTYPE_SIGNED_INT != requiredType ) + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); +} + +static void _emitOperandInputCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex, sint32 requiredType) +{ + StringBuf* src = shaderContext->shaderSource; + if( operandIndex < 0 || operandIndex >= 3 ) + debugBreakpoint(); + sint32 requiredTypeOut = requiredType; + if( requiredType != LATTE_DECOMPILER_DTYPE_FLOAT && (aluInstruction->sourceOperand[operandIndex].abs != 0 || aluInstruction->sourceOperand[operandIndex].neg != 0) ) + { + // we need to apply float operations on the input but it's not read as a float + // force internal required type to float and then cast it back to whatever type is actually required + requiredType = LATTE_DECOMPILER_DTYPE_FLOAT; + } + + if( requiredTypeOut != requiredType ) + _emitTypeConversionPrefixMSL(shaderContext, requiredType, requiredTypeOut); + + if( aluInstruction->sourceOperand[operandIndex].neg != 0 ) + src->add("-("); + if( aluInstruction->sourceOperand[operandIndex].abs != 0 ) + src->add("abs("); + + if( GPU7_ALU_SRC_IS_GPR(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if( aluInstruction->sourceOperand[operandIndex].rel != 0 ) + { + _emitCodeToReadRelativeGPR(shaderContext, aluInstruction, operandIndex, requiredType); + } + else + { + uint32 gprIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + // signed int 32bit + sint32 currentRegisterElementType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + // write code for register input + _emitTypeConversionPrefixMSL(shaderContext, currentRegisterElementType, requiredType); + 
_emitALURegisterInputAccessCode(shaderContext, aluInstruction, operandIndex); + _emitTypeConversionSuffixMSL(shaderContext, currentRegisterElementType, requiredType); + } + else if( requiredType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT ) + { + // unsigned int 32bit + sint32 currentRegisterElementType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + // need to convert from int to uint + src->add("uint("); + } + else if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT ) + { + // no extra work necessary + } + else + debugBreakpoint(); + // write code for register input + _emitALURegisterInputAccessCode(shaderContext, aluInstruction, operandIndex); + if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + src->add(")"); + } + } + else if( requiredType == LATTE_DECOMPILER_DTYPE_FLOAT ) + { + // float 32bit + sint32 currentRegisterElementType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + // need to convert (not cast) from int bits to float + src->add("as_type("); // TODO: correct? 
+ } + else if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_FLOAT ) + { + // no extra work necessary + } + else + debugBreakpoint(); + // write code for register input + _emitALURegisterInputAccessCode(shaderContext, aluInstruction, operandIndex); + if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + src->add(")"); + } + } + else + debugBreakpoint(); + } + } + else if( GPU7_ALU_SRC_IS_CONST_0F(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if(requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT || requiredType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) + src->add("0"); + else if( requiredType == LATTE_DECOMPILER_DTYPE_FLOAT ) + src->add("0.0"); + } + else if( GPU7_ALU_SRC_IS_CONST_1F(aluInstruction->sourceOperand[operandIndex].sel) ) + { + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + src->add("1.0"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + } + else if( GPU7_ALU_SRC_IS_CONST_0_5F(aluInstruction->sourceOperand[operandIndex].sel) ) + { + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + src->add("0.5"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + } + else if( GPU7_ALU_SRC_IS_CONST_1I(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if (requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->add("int(1)"); + else if (requiredType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) + src->add("uint(1)"); + else + cemu_assert_suspicious(); + } + else if( GPU7_ALU_SRC_IS_CONST_M1I(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + src->add("int(-1)"); + else + cemu_assert_suspicious(); + } + else if( GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + src->addFmt("int(0x{:x})", 
aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]); + else if( requiredType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT ) + src->addFmt("uint(0x{:x})", aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]); + else if (requiredType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + uint32 constVal = aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]; + sint32 exponent = (constVal >> 23) & 0xFF; + exponent -= 127; + if ((constVal & 0xFF) == 0 && exponent >= -10 && exponent <= 10) + { + src->add(_FormatFloatAsConstant(*(float*)&constVal)); + } + else + src->addFmt("as_type(0x{:08x})", constVal); + } + } + else if( GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel) ) + { + _emitUniformAccessCode(shaderContext, aluInstruction, operandIndex, requiredType); + } + else if( GPU7_ALU_SRC_IS_CBANK0(aluInstruction->sourceOperand[operandIndex].sel) || + GPU7_ALU_SRC_IS_CBANK1(aluInstruction->sourceOperand[operandIndex].sel) ) + { + _emitUniformAccessCode(shaderContext, aluInstruction, operandIndex, requiredType); + } + else if( GPU7_ALU_SRC_IS_PV(aluInstruction->sourceOperand[operandIndex].sel) ) + { + sint32 currentPVDataType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + _emitTypeConversionPrefixMSL(shaderContext, currentPVDataType, requiredType); + _emitPVPSAccessCode(shaderContext, aluInstruction, operandIndex, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, currentPVDataType, requiredType); + } + else if( GPU7_ALU_SRC_IS_PS(aluInstruction->sourceOperand[operandIndex].sel) ) + { + sint32 currentPSDataType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + _emitTypeConversionPrefixMSL(shaderContext, currentPSDataType, requiredType); + _emitPVPSAccessCode(shaderContext, aluInstruction, operandIndex, 4); + _emitTypeConversionSuffixMSL(shaderContext, currentPSDataType, requiredType); + } + 
else + { + cemuLog_log(LogType::Force, "Unsupported shader ALU operand sel {:#x}\n", aluInstruction->sourceOperand[operandIndex].sel); + debugBreakpoint(); + } + + if( aluInstruction->sourceOperand[operandIndex].abs != 0 ) + src->add(")"); + if( aluInstruction->sourceOperand[operandIndex].neg != 0 ) + src->add(")"); + + if( requiredTypeOut != requiredType ) + _emitTypeConversionSuffixMSL(shaderContext, requiredType, requiredTypeOut); +} + +void _emitTypeConversionPrefixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType, sint32 componentCount) +{ + if( sourceType == destinationType ) + return; + StringBuf* src = shaderContext->shaderSource; + if (destinationType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + if (componentCount == 1) + src->add("as_type("); + else + src->addFmt("as_type(", componentCount); + } + else if (destinationType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) + { + if (componentCount == 1) + src->add("as_type("); + else + src->addFmt("as_type(", componentCount); + } + else if (destinationType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + if (componentCount == 1) + src->add("as_type("); + else + src->addFmt("as_type(", componentCount); + } + else + cemu_assert_debug(false); +} + +void _emitTypeConversionSuffixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType) +{ + if( sourceType == destinationType ) + return; + StringBuf* src = shaderContext->shaderSource; + src->add(")"); +} + +template +static void _emitALUOperationBinary(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, const char* operandStr) +{ + StringBuf* src = shaderContext->shaderSource; + sint32 outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, TDataType, outputType); + _emitOperandInputCode(shaderContext, aluInstruction, 
0, TDataType); + src->add((char*)operandStr); + _emitOperandInputCode(shaderContext, aluInstruction, 1, TDataType); + _emitTypeConversionSuffixMSL(shaderContext, TDataType, outputType); + src->add(";" _CRLF); +} + +static bool _isSameGPROperand(LatteDecompilerALUInstruction* aluInstruction, sint32 opIndexA, sint32 opIndexB) +{ + if (aluInstruction->sourceOperand[opIndexA].sel != aluInstruction->sourceOperand[opIndexB].sel) + return false; + if (!GPU7_ALU_SRC_IS_GPR(aluInstruction->sourceOperand[opIndexA].sel)) + return false; + if (aluInstruction->sourceOperand[opIndexA].chan != aluInstruction->sourceOperand[opIndexB].chan) + return false; + if (aluInstruction->sourceOperand[opIndexA].abs != aluInstruction->sourceOperand[opIndexB].abs) + return false; + if (aluInstruction->sourceOperand[opIndexA].neg != aluInstruction->sourceOperand[opIndexB].neg) + return false; + if (aluInstruction->sourceOperand[opIndexA].rel != aluInstruction->sourceOperand[opIndexB].rel) + return false; + return true; +} + +static bool _operandHasModifiers(LatteDecompilerALUInstruction* aluInstruction, sint32 opIndex) +{ + return aluInstruction->sourceOperand[opIndex].abs != 0 || aluInstruction->sourceOperand[opIndex].neg != 0; +} + +static void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, LatteDecompilerALUInstruction* aluInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + sint32 outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); // data type of output + if( aluInstruction->opcode == ALU_OP2_INST_MOV ) + { + bool requiresFloatMove = false; + requiresFloatMove = aluInstruction->sourceOperand[0].abs != 0 || aluInstruction->sourceOperand[0].neg != 0; + if( requiresFloatMove ) + { + // abs/neg operations are applied to source operand, do float based move + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, 
LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitOperandInputCode(shaderContext, aluInstruction, 0, outputType); + src->add(";" _CRLF); + } + } + else if( aluInstruction->opcode == ALU_OP2_INST_MOVA_FLOOR ) + { + cemu_assert_debug(aluInstruction->writeMask == 0); + cemu_assert_debug(aluInstruction->omod == 0); + src->add("tempResultf = "); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(";" _CRLF); + src->add("tempResultf = floor(tempResultf);" _CRLF); + src->add("tempResultf = clamp(tempResultf, -256.0, 255.0);" _CRLF); + // set AR + if( aluInstruction->destElem == 0 ) + src->add("ARi.x = int(tempResultf);" _CRLF); + else if( aluInstruction->destElem == 1 ) + src->add("ARi.y = int(tempResultf);" _CRLF); + else if( aluInstruction->destElem == 2 ) + src->add("ARi.z = int(tempResultf);" _CRLF); + else + src->add("ARi.w = int(tempResultf);" _CRLF); + // set output + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + if( outputType != LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + debugBreakpoint(); // todo + src->add("as_type(tempResultf)"); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_MOVA_INT ) + { + cemu_assert_debug(aluInstruction->writeMask == 0); + cemu_assert_debug(aluInstruction->omod == 0); + src->add("tempResulti = "); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(";" _CRLF); + src->add("tempResulti = clamp(tempResulti, -256, 255);" _CRLF); + // set AR + if( aluInstruction->destElem == 0 ) + src->add("ARi.x = tempResulti;" _CRLF); + else if( aluInstruction->destElem == 1 ) + src->add("ARi.y 
= tempResulti;" _CRLF); + else if( aluInstruction->destElem == 2 ) + src->add("ARi.z = tempResulti;" _CRLF); + else + src->add("ARi.w = tempResulti;" _CRLF); + // set output + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + if( outputType != LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + debugBreakpoint(); // todo + src->add("tempResulti"); + src->add(";" _CRLF); + + } + else if( aluInstruction->opcode == ALU_OP2_INST_ADD ) + { + _emitALUOperationBinary(shaderContext, aluInstruction, " + "); + } + else if( aluInstruction->opcode == ALU_OP2_INST_MUL ) + { + // 0*anything is always 0 + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + + // if any operand is a non-zero literal or constant we can use standard multiplication + bool useDefaultMul = false; + if (GPU7_ALU_SRC_IS_CONST_0F(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_CONST_0F(aluInstruction->sourceOperand[1].sel)) + { + // result is always zero + src->add("0.0"); + } + else + { + // multiply + if (GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[1].sel) || + GPU7_ALU_SRC_IS_ANY_CONST(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_ANY_CONST(aluInstruction->sourceOperand[1].sel)) + { + useDefaultMul = true; + } + if (shaderContext->options->strictMul && useDefaultMul == false) + { + src->add("mul_nonIEEE("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } + else + { + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(" * "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + } + } + + 
_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_MUL_IEEE ) + { + // 0*anything according to IEEE rules + _emitALUOperationBinary(shaderContext, aluInstruction, " * "); + } + else if (aluInstruction->opcode == ALU_OP2_INST_RECIP_IEEE) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("1.0"); + src->add(" / "); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if (aluInstruction->opcode == ALU_OP2_INST_RECIP_FF) + { + // untested (BotW bombs) + src->add("tempResultf = 1.0 / ("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(");" _CRLF); + // INF becomes 0.0 + src->add("if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF); + // -INF becomes -0.0 + src->add("else if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF); + // assign result to output + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("tempResultf"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_IEEE || + aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_CLAMPED || + aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_FF ) + { + // todo: This should be correct but testing is needed + src->add("tempResultf = 1.0 / sqrt("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, 
LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(");" _CRLF); + if (aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_CLAMPED) + { + // note: if( -INF < 0.0 ) does not resolve to true + src->add("if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) != 0 ) tempResultf = -3.40282347E+38F;" _CRLF); + src->add("else if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) == 0 ) tempResultf = 3.40282347E+38F;" _CRLF); + } + else if (aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_FF) + { + // untested (BotW bombs) + src->add("if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF); + src->add("else if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF); + } + // assign result to output + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("tempResultf"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_MAX || + aluInstruction->opcode == ALU_OP2_INST_MIN || + aluInstruction->opcode == ALU_OP2_INST_MAX_DX10 || + aluInstruction->opcode == ALU_OP2_INST_MIN_DX10 ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + if( aluInstruction->opcode == ALU_OP2_INST_MAX ) + src->add("max"); + else if( aluInstruction->opcode == ALU_OP2_INST_MIN ) + src->add("min"); + else if (aluInstruction->opcode == ALU_OP2_INST_MAX_DX10) + src->add("max"); + else if (aluInstruction->opcode == ALU_OP2_INST_MIN_DX10) + src->add("min"); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, 
LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_FLOOR || + aluInstruction->opcode == ALU_OP2_INST_FRACT || + aluInstruction->opcode == ALU_OP2_INST_TRUNC ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + if( aluInstruction->opcode == ALU_OP2_INST_FLOOR ) + src->add("floor"); + else if( aluInstruction->opcode == ALU_OP2_INST_FRACT ) + src->add("fract"); + else + src->add("trunc"); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_LOG_CLAMPED || + aluInstruction->opcode == ALU_OP2_INST_LOG_IEEE ) + { + src->add("tempResultf = max(0.0, "); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(");" _CRLF); + + src->add("tempResultf = log2(tempResultf);" _CRLF); + if( aluInstruction->opcode == ALU_OP2_INST_LOG_CLAMPED ) + { + src->add("if( isinf(tempResultf) == true ) tempResultf = -3.40282347E+38F;" _CRLF); + } + // assign result to output + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("tempResultf"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_RNDNE ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + 
src->add("rint("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_EXP_IEEE ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("exp2"); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_SQRT_IEEE ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("sqrt"); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_SIN || + aluInstruction->opcode == ALU_OP2_INST_COS ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + if( aluInstruction->opcode == ALU_OP2_INST_SIN ) + src->add("sin"); + else + src->add("cos"); + src->add("(("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")/0.1591549367)"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_FLT_TO_INT ) + { + _emitInstructionOutputVariableName(shaderContext, 
aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("int"); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_FLT_TO_UINT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT, outputType); + src->add("uint"); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_INT_TO_FLOAT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("float("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_UINT_TO_FLOAT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("float("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if (aluInstruction->opcode == ALU_OP2_INST_AND_INT) + 
_emitALUOperationBinary(shaderContext, aluInstruction, " & "); + else if (aluInstruction->opcode == ALU_OP2_INST_OR_INT) + _emitALUOperationBinary(shaderContext, aluInstruction, " | "); + else if (aluInstruction->opcode == ALU_OP2_INST_XOR_INT) + _emitALUOperationBinary(shaderContext, aluInstruction, " ^ "); + else if( aluInstruction->opcode == ALU_OP2_INST_NOT_INT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("~("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_ADD_INT ) + _emitALUOperationBinary(shaderContext, aluInstruction, " + "); + else if( aluInstruction->opcode == ALU_OP2_INST_MAX_INT || aluInstruction->opcode == ALU_OP2_INST_MIN_INT || + aluInstruction->opcode == ALU_OP2_INST_MAX_UINT || aluInstruction->opcode == ALU_OP2_INST_MIN_UINT) + { + // not verified + bool isUnsigned = aluInstruction->opcode == ALU_OP2_INST_MAX_UINT || aluInstruction->opcode == ALU_OP2_INST_MIN_UINT; + auto opType = isUnsigned ? 
LATTE_DECOMPILER_DTYPE_UNSIGNED_INT : LATTE_DECOMPILER_DTYPE_SIGNED_INT; + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, opType, outputType); + if( aluInstruction->opcode == ALU_OP2_INST_MAX_INT || aluInstruction->opcode == ALU_OP2_INST_MAX_UINT ) + src->add("max("); + else + src->add("min("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, opType); + src->add(", "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, opType); + _emitTypeConversionSuffixMSL(shaderContext, opType, outputType); + src->add(");" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_SUB_INT ) + { + // note: The AMD doc says src1 is on the left side but tests indicate otherwise. It's src0 - src1. + _emitALUOperationBinary(shaderContext, aluInstruction, " - "); + } + else if (aluInstruction->opcode == ALU_OP2_INST_MULLO_INT) + _emitALUOperationBinary(shaderContext, aluInstruction, " * "); + else if (aluInstruction->opcode == ALU_OP2_INST_MULLO_UINT) + _emitALUOperationBinary(shaderContext, aluInstruction, " * "); + else if( aluInstruction->opcode == ALU_OP2_INST_LSHL_INT ) + _emitALUOperationBinary(shaderContext, aluInstruction, " << "); + else if( aluInstruction->opcode == ALU_OP2_INST_LSHR_INT ) + _emitALUOperationBinary(shaderContext, aluInstruction, " >> "); + else if( aluInstruction->opcode == ALU_OP2_INST_ASHR_INT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(" >> "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == 
ALU_OP2_INST_SETGT || + aluInstruction->opcode == ALU_OP2_INST_SETGE || + aluInstruction->opcode == ALU_OP2_INST_SETNE || + aluInstruction->opcode == ALU_OP2_INST_SETE ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + if( aluInstruction->opcode == ALU_OP2_INST_SETGT ) + src->add(" > "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETGE ) + src->add(" >= "); + else if (aluInstruction->opcode == ALU_OP2_INST_SETNE) + src->add(" != "); + else if (aluInstruction->opcode == ALU_OP2_INST_SETE) + src->add(" == "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")?1.0:0.0"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_SETGT_DX10 || + aluInstruction->opcode == ALU_OP2_INST_SETE_DX10 || + aluInstruction->opcode == ALU_OP2_INST_SETNE_DX10 || + aluInstruction->opcode == ALU_OP2_INST_SETGE_DX10 ) + { + if( aluInstruction->omod != 0 ) + debugBreakpoint(); + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("(("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + if( aluInstruction->opcode == ALU_OP2_INST_SETE_DX10 ) + src->add(" == "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETNE_DX10 ) + src->add(" != "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETGT_DX10 ) + src->add(" > "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETGE_DX10 ) + src->add(" >= "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + 
src->add(")?-1:0)"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";"); + src->add(_CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_SETE_INT || + aluInstruction->opcode == ALU_OP2_INST_SETNE_INT || + aluInstruction->opcode == ALU_OP2_INST_SETGT_INT || + aluInstruction->opcode == ALU_OP2_INST_SETGE_INT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + if( aluInstruction->opcode == ALU_OP2_INST_SETE_INT ) + src->add(" == "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETNE_INT ) + src->add(" != "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETGT_INT ) + src->add(" > "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETGE_INT ) + src->add(" >= "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")?-1:0"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_SETGE_UINT || + aluInstruction->opcode == ALU_OP2_INST_SETGT_UINT ) + { + // todo: Unsure if the result is unsigned or signed + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT); + if( aluInstruction->opcode == ALU_OP2_INST_SETGE_UINT ) + src->add(" >= "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETGT_UINT ) + src->add(" > "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT); + 
src->add(")?int(0xFFFFFFFF):int(0x0)"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETE || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE_INT || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETE_INT || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE_INT || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT_INT ) + { + cemu_assert_debug(aluInstruction->writeMask == 0); + bool isIntPred = (aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE_INT) || (aluInstruction->opcode == ALU_OP2_INST_PRED_SETE_INT) || (aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE_INT) || (aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT_INT); + + src->add("predResult"); + src->add(" = ("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, isIntPred?LATTE_DECOMPILER_DTYPE_SIGNED_INT:LATTE_DECOMPILER_DTYPE_FLOAT); + + if (aluInstruction->opcode == ALU_OP2_INST_PRED_SETE || aluInstruction->opcode == ALU_OP2_INST_PRED_SETE_INT) + src->add(" == "); + else if (aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT || aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT_INT) + src->add(" > "); + else if (aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE || aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE_INT) + src->add(" >= "); + else if (aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE || aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE_INT) + src->add(" != "); + else + cemu_assert_debug(false); + + _emitOperandInputCode(shaderContext, aluInstruction, 1, isIntPred?LATTE_DECOMPILER_DTYPE_SIGNED_INT:LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(");" _CRLF); + // handle result of predicate instruction based on current ALU clause type + if( cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE ) + { + 
src->addFmt("{} = predResult;" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth)); + src->addFmt("{} = predResult == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth)); + } + else if( cfInstruction->type == GPU7_CF_INST_ALU_BREAK ) + { + // leave current loop + src->add("if( predResult == false ) break;" _CRLF); + } + else + cemu_assert_debug(false); + } + else if( aluInstruction->opcode == ALU_OP2_INST_KILLE_INT || + aluInstruction->opcode == ALU_OP2_INST_KILLNE_INT || + aluInstruction->opcode == ALU_OP2_INST_KILLGT_INT) + { + src->add("if( "); + src->add(" ("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + if( aluInstruction->opcode == ALU_OP2_INST_KILLE_INT ) + src->add(" == "); + else if (aluInstruction->opcode == ALU_OP2_INST_KILLNE_INT) + src->add(" != "); + else if (aluInstruction->opcode == ALU_OP2_INST_KILLGT_INT) + src->add(" > "); + else + debugBreakpoint(); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")"); + src->add(") discard_fragment();"); + src->add(_CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_KILLGT || + aluInstruction->opcode == ALU_OP2_INST_KILLGE || + aluInstruction->opcode == ALU_OP2_INST_KILLE ) + { + src->add("if( "); + src->add(" ("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + if( aluInstruction->opcode == ALU_OP2_INST_KILLGT ) + src->add(" > "); + else if( aluInstruction->opcode == ALU_OP2_INST_KILLGE ) + src->add(" >= "); + else if( aluInstruction->opcode == ALU_OP2_INST_KILLE ) + src->add(" == "); + else + debugBreakpoint(); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + src->add(") discard_fragment();"); + src->add(_CRLF); + } + else + { + 
src->add("Unsupported instruction;" _CRLF); + debug_printf("Unsupported ALU op2 instruction 0x%x\n", aluInstruction->opcode); + shaderContext->shader->hasError = true; + } +} + +/* Emits the MSL statement for one three-operand (OP3) ALU instruction into shaderContext->shaderSource. Conditional moves/selects whose operands 1 and 2 are reported identical by _isSameGPROperand (and whose operand 1 has no modifiers) collapse to a plain assignment. Otherwise the MULADD variants, the integer-compare selects (CNDE_INT, CNDGT_INT, CMOVGE_INT) and the float-compare selects (CMOVE, CMOVGE, CMOVGT) each get a dedicated emitter. Any other opcode writes the text "Unsupported instruction;", logs the opcode and sets shader->hasError. The cfInstruction parameter is not referenced in this function. */ static void _emitALUOP3InstructionCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, LatteDecompilerALUInstruction* aluInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + cemu_assert_debug(aluInstruction->destRel == 0); // todo + sint32 outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); + + /* check for common no-op or mov-like instructions */ + if (aluInstruction->opcode == ALU_OP3_INST_CMOVGE || + aluInstruction->opcode == ALU_OP3_INST_CMOVE || + aluInstruction->opcode == ALU_OP3_INST_CMOVGT || + aluInstruction->opcode == ALU_OP3_INST_CNDE_INT || + aluInstruction->opcode == ALU_OP3_INST_CNDGT_INT || + aluInstruction->opcode == ALU_OP3_INST_CMOVGE_INT) + { + if (_isSameGPROperand(aluInstruction, 1, 2) && !_operandHasModifiers(aluInstruction, 1)) + { + // the condition is irrelevant as both operands are the same + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, outputType); + src->add(";" _CRLF); + return; + } + } + + + /* generic handlers */ + if( aluInstruction->opcode == ALU_OP3_INST_MULADD || + aluInstruction->opcode == ALU_OP3_INST_MULADD_D2 || + aluInstruction->opcode == ALU_OP3_INST_MULADD_M2 || + aluInstruction->opcode == ALU_OP3_INST_MULADD_M4 || + aluInstruction->opcode == ALU_OP3_INST_MULADD_IEEE ) + { + // todo: The difference between MULADD and MULADD IEEE is that the former has 0*anything=0 rule similar to MUL/MUL_IEEE?
+ _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + if (aluInstruction->opcode != ALU_OP3_INST_MULADD) // avoid unnecessary parenthesis to improve code readability slightly + src->add("("); + + /* useDefaultMul selects the plain " * " form: it is set when either factor is a literal or constant source, or when the opcode is MULADD_IEEE. Only when it stays false and options->strictMul is enabled is the product routed through mul_nonIEEE(). */ bool useDefaultMul = false; + if (GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[1].sel) || + GPU7_ALU_SRC_IS_ANY_CONST(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_ANY_CONST(aluInstruction->sourceOperand[1].sel)) + { + useDefaultMul = true; + } + if (aluInstruction->opcode == ALU_OP3_INST_MULADD_IEEE) + useDefaultMul = true; + + if (shaderContext->options->strictMul && useDefaultMul == false) + { + src->add("mul_nonIEEE("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } + else + { + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(" * "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + } + + src->add(" + "); + _emitOperandInputCode(shaderContext, aluInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); + if(aluInstruction->opcode != ALU_OP3_INST_MULADD) + src->add(")"); + if( aluInstruction->opcode == ALU_OP3_INST_MULADD_D2 ) + src->add("/2.0"); + else if( aluInstruction->opcode == ALU_OP3_INST_MULADD_M2 ) + src->add("*2.0"); + else if( aluInstruction->opcode == ALU_OP3_INST_MULADD_M4 ) + src->add("*4.0"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if(aluInstruction->opcode == ALU_OP3_INST_CNDE_INT || aluInstruction->opcode == ALU_OP3_INST_CNDGT_INT || aluInstruction->opcode == ALU_OP3_INST_CMOVGE_INT) + { + /* When sourceOperand[1] or sourceOperand[2] has .neg set, the two selected operands and the result conversion use the float data type; otherwise they use signed int. The compared operand 0 is always read as signed int. */ bool requiresFloatResult = 
(aluInstruction->sourceOperand[1].neg != 0) || (aluInstruction->sourceOperand[2].neg != 0); + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, requiresFloatResult?LATTE_DECOMPILER_DTYPE_FLOAT:LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("(("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + if (aluInstruction->opcode == ALU_OP3_INST_CNDE_INT) + src->add(" == "); + else if (aluInstruction->opcode == ALU_OP3_INST_CNDGT_INT) + src->add(" > "); + else if (aluInstruction->opcode == ALU_OP3_INST_CMOVGE_INT) + src->add(" >= "); + src->add("0)?("); + + _emitOperandInputCode(shaderContext, aluInstruction, 1, requiresFloatResult?LATTE_DECOMPILER_DTYPE_FLOAT:LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add("):("); + _emitOperandInputCode(shaderContext, aluInstruction, 2, requiresFloatResult?LATTE_DECOMPILER_DTYPE_FLOAT:LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add("))"); + _emitTypeConversionSuffixMSL(shaderContext, requiresFloatResult?LATTE_DECOMPILER_DTYPE_FLOAT:LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP3_INST_CMOVGE || + aluInstruction->opcode == ALU_OP3_INST_CMOVE || + aluInstruction->opcode == ALU_OP3_INST_CMOVGT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("(("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + if (aluInstruction->opcode == ALU_OP3_INST_CMOVE) + src->add(" == "); + else if (aluInstruction->opcode == ALU_OP3_INST_CMOVGE) + src->add(" >= "); + else if (aluInstruction->opcode == ALU_OP3_INST_CMOVGT) + src->add(" > "); + src->add("0.0)?("); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); +
src->add("):("); + _emitOperandInputCode(shaderContext, aluInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add("))"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else + { + src->add("Unsupported instruction;" _CRLF); + debug_printf("Unsupported ALU op3 instruction 0x%x\n", aluInstruction->opcode); + shaderContext->shader->hasError = true; + } +} + +/* Emits MSL for a reduction operation made of four ALU instructions (aluRedcInstruction[0..3]); the opcode is taken from slot 0. DOT4 and DOT4_IEEE are emitted identically as one dot(float4(...),float4(...)) built from operand 0 and operand 1 of the four slots, assigned to the output of slot 0. CUBE emits a redcCUBE(...) call that receives the two float4 operand vectors plus cubeMapSTM and cubeMapFaceId, then assigns cubeMapSTM.x, .y, .z and cubeMapFaceId to the outputs of slots 0, 1, 2 and 3 respectively. Any other opcode reaches cemu_assert_unimplemented(). */ static void _emitALUReductionInstructionCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluRedcInstruction[4]) +{ + StringBuf* src = shaderContext->shaderSource; + if( aluRedcInstruction[0]->isOP3 == false && (aluRedcInstruction[0]->opcode == ALU_OP2_INST_DOT4 || aluRedcInstruction[0]->opcode == ALU_OP2_INST_DOT4_IEEE) ) + { + // todo: Figure out and implement the difference between normal DOT4 and DOT4_IEEE + sint32 outputType = _getALUInstructionOutputDataType(shaderContext, aluRedcInstruction[0]); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[0]); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + + // dot(float4(op0),float4(op1)) + src->add("dot(float4("); + _emitOperandInputCode(shaderContext, aluRedcInstruction[0], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[1], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[2], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[3], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("),float4("); + _emitOperandInputCode(shaderContext, aluRedcInstruction[0], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[1], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[2], 1, 
LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[3], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("))"); + + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluRedcInstruction[0]->isOP3 == false && (aluRedcInstruction[0]->opcode == ALU_OP2_INST_CUBE) ) + { + /* + * How the CUBE instruction works (guessed mostly, based on DirectX/OpenGL spec): + Input: float4, 3d direction vector (can be unnormalized) + w component (which can be ignored, since it only scales the vector but does not affect the direction) + + First we figure out the major axis (closest axis-aligned vector). There are six possible vectors: + +rx 0 + -rx 1 + +ry 2 + -ry 3 + +rz 4 + -rz 5 + The major axis vector is calculated by looking at the largest (absolute) 3d vector component and then setting the other components to 0.0 + The value that remains in the axis vector is referred to as 'MajorAxis' by the AMD documentation. + The S,T coordinates are taken from the other two components. + Example: -0.5,0.2,0.4 -> -rx -> -0.5,0.0,0.0 MajorAxis: -0.5, S: 0.2 T: 0.4 + + The CUBE reduction instruction requires a specific mapping for the input vector: + src0 = Rn.zzxy + src1 = Rn.yxzz + It's probably related to the way the instruction works internally?
+ If we look at the individual components per ALU unit: + z y -> Compare y/z + z x -> Compare x/z + x z -> Compare x/z + y z -> Compare y/z + */ + + sint32 outputType; + + src->add("redcCUBE("); + src->add("float4("); + _emitOperandInputCode(shaderContext, aluRedcInstruction[0], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[1], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[2], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[3], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("),"); + src->add("float4("); + _emitOperandInputCode(shaderContext, aluRedcInstruction[0], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[1], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[2], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[3], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("),"); + src->add("cubeMapSTM,cubeMapFaceId);" _CRLF); + + // dst.X (S) + outputType = _getALUInstructionOutputDataType(shaderContext, aluRedcInstruction[0]); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[0]); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("cubeMapSTM.x"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + // dst.Y (T) + outputType = _getALUInstructionOutputDataType(shaderContext, aluRedcInstruction[1]); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[1]); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("cubeMapSTM.y"); + _emitTypeConversionSuffixMSL(shaderContext, 
LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + // dst.Z (MajorAxis) + outputType = _getALUInstructionOutputDataType(shaderContext, aluRedcInstruction[2]); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[2]); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("cubeMapSTM.z"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + // dst.W (FaceId) + outputType = _getALUInstructionOutputDataType(shaderContext, aluRedcInstruction[3]); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[3]); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("cubeMapFaceId"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else + cemu_assert_unimplemented(); +} + +/* Determines the extent of the ALU instruction group that begins at startIndex: it counts the consecutive entries of cfInstruction->instructionsALU, starting there, that share the same instructionGroupIndex (stopping at the first different index or at the end of the vector). That contiguous span is then handed to shaderContext->aluPVPSState->CreateGPRTemporaries(). */ static void _emitALUClauseRegisterBackupCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, sint32 startIndex) +{ + sint32 instructionGroupIndex = cfInstruction->instructionsALU[startIndex].instructionGroupIndex; + size_t groupSize = 1; + while ((startIndex + groupSize) < cfInstruction->instructionsALU.size()) + { + if (instructionGroupIndex != cfInstruction->instructionsALU[startIndex + groupSize].instructionGroupIndex) + break; + groupSize++; + } + shaderContext->aluPVPSState->CreateGPRTemporaries(shaderContext, { cfInstruction->instructionsALU.data() + startIndex, groupSize }); +} + +/* +bool _isPVUsedInNextGroup(LatteDecompilerCFInstruction* cfInstruction, sint32 startIndex, sint32 pvUnit) +{ + sint32 currentGroupIndex = cfInstruction->instructionsALU[startIndex].instructionGroupIndex; + for (sint32 i = startIndex + 1; i < (sint32)cfInstruction->instructionsALU.size(); i++) + { + LatteDecompilerALUInstruction& aluInstructionItr = 
cfInstruction->instructionsALU[i]; + if(aluInstructionItr.instructionGroupIndex == currentGroupIndex ) + continue; + if ((sint32)aluInstructionItr.instructionGroupIndex > currentGroupIndex + 1) + return false; + // check OP code type + if (aluInstructionItr.isOP3) + { + // op0 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[0].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[0].chan; + if (pvUnit == chan) + return true; + } + // op1 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[1].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[1].chan; + if (pvUnit == chan) + return true; + } + // op2 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[2].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[2].chan; + if (pvUnit == chan) + return true; + } + } + else + { + // op0 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[0].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[0].chan; + if (pvUnit == chan) + return true; + } + // op1 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[1].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[1].chan; + if (pvUnit == chan) + return true; + } + // todo: Not all operations use both operands + } + } + return false; +} +*/ + +/* Emits a three-component vector constructor whose components are one operand from each of three ALU instructions (aluInstN / opIdxN select the instruction and operand index). For LATTE_DECOMPILER_DTYPE_FLOAT it writes float3(a,b,c) with the operands read as float; for LATTE_DECOMPILER_DTYPE_SIGNED_INT it writes int3(a,b,c) with the operands read as signed int. Any other dataType reaches cemu_assert_unimplemented(). */ static void _emitFloat3(LatteDecompilerShaderContext* shaderContext, uint32 dataType, LatteDecompilerALUInstruction* aluInst0, sint32 opIdx0, LatteDecompilerALUInstruction* aluInst1, sint32 opIdx1, LatteDecompilerALUInstruction* aluInst2, sint32 opIdx2) +{ + StringBuf* src = shaderContext->shaderSource; + if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + src->add("float3("); + _emitOperandInputCode(shaderContext, aluInst0, opIdx0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluInst1, opIdx1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluInst2, opIdx2, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } + else if (dataType == 
LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + src->add("int3("); + _emitOperandInputCode(shaderContext, aluInst0, opIdx0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(","); + _emitOperandInputCode(shaderContext, aluInst1, opIdx1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(","); + _emitOperandInputCode(shaderContext, aluInst2, opIdx2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")"); + } + else + cemu_assert_unimplemented(); +} + +static void _emitGPRVectorAssignment(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction** aluInstructions, sint32 count) +{ + StringBuf* src = shaderContext->shaderSource; + // output var name (GPR) + src->add(_getRegisterVarName(shaderContext, aluInstructions[0]->destGpr, -1)); + src->add("."); + for (sint32 f = 0; f < count; f++) + { + src->add(_getElementStrByIndex(aluInstructions[f]->destElem)); + } + src->add(" = "); +} + +static void _emitALUClauseCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + ALUClauseTemporariesState pvpsState; + shaderContext->aluPVPSState = &pvpsState; + StringBuf* src = shaderContext->shaderSource; + LatteDecompilerALUInstruction* aluRedcInstruction[4]; + size_t groupStartIndex = 0; + for(size_t i=0; iinstructionsALU.size(); i++) + { + LatteDecompilerALUInstruction& aluInstruction = cfInstruction->instructionsALU[i]; + if( aluInstruction.indexInGroup == 0 ) + { + src->addFmt("// {}" _CRLF, aluInstruction.instructionGroupIndex); + // apply PV/PS updates for previous group + if (i > 0) + { + pvpsState.TrackGroupOutputPVPS(shaderContext, cfInstruction->instructionsALU.data() + groupStartIndex, i - groupStartIndex); + } + groupStartIndex = i; + // backup registers which are read after being written + _emitALUClauseRegisterBackupCode(shaderContext, cfInstruction, i); + } + // detect reduction instructions and use a special handler + bool isReductionOperation = _isReductionInstruction(&aluInstruction); + if( isReductionOperation ) + { + 
cemu_assert_debug((i + 4) <= cfInstruction->instructionsALU.size()); + aluRedcInstruction[0] = &aluInstruction; + aluRedcInstruction[1] = &cfInstruction->instructionsALU[i + 1]; + aluRedcInstruction[2] = &cfInstruction->instructionsALU[i + 2]; + aluRedcInstruction[3] = &cfInstruction->instructionsALU[i + 3]; + if( aluRedcInstruction[0]->isOP3 != aluRedcInstruction[1]->isOP3 || aluRedcInstruction[1]->isOP3 != aluRedcInstruction[2]->isOP3 || aluRedcInstruction[2]->isOP3 != aluRedcInstruction[3]->isOP3 ) + debugBreakpoint(); + if( aluRedcInstruction[0]->opcode != aluRedcInstruction[1]->opcode || aluRedcInstruction[1]->opcode != aluRedcInstruction[2]->opcode || aluRedcInstruction[2]->opcode != aluRedcInstruction[3]->opcode ) + debugBreakpoint(); + if( aluRedcInstruction[0]->omod != aluRedcInstruction[1]->omod || aluRedcInstruction[1]->omod != aluRedcInstruction[2]->omod || aluRedcInstruction[2]->omod != aluRedcInstruction[3]->omod ) + debugBreakpoint(); + if( aluRedcInstruction[0]->destClamp != aluRedcInstruction[1]->destClamp || aluRedcInstruction[1]->destClamp != aluRedcInstruction[2]->destClamp || aluRedcInstruction[2]->destClamp != aluRedcInstruction[3]->destClamp ) + debugBreakpoint(); + _emitALUReductionInstructionCode(shaderContext, aluRedcInstruction); + i += 3; // skip the instructions that are part of the reduction operation + } + else /* not a reduction operation */ + { + if( aluInstruction.isOP3 ) + { + // op3 + _emitALUOP3InstructionCode(shaderContext, cfInstruction, &aluInstruction); + } + else + { + // op2 + if( aluInstruction.opcode == ALU_OP2_INST_NOP ) + continue; // skip NOP instruction + _emitALUOP2InstructionCode(shaderContext, cfInstruction, &aluInstruction); + } + } + // handle omod + sint32 outputDataType = _getALUInstructionOutputDataType(shaderContext, &aluInstruction); + if( aluInstruction.omod != ALU_OMOD_NONE ) + { + if( outputDataType == LATTE_DECOMPILER_DTYPE_FLOAT ) + { + _emitInstructionOutputVariableName(shaderContext, 
&aluInstruction); + if( aluInstruction.omod == ALU_OMOD_MUL2 ) + src->add(" *= 2.0;" _CRLF); + else if( aluInstruction.omod == ALU_OMOD_MUL4 ) + src->add(" *= 4.0;" _CRLF); + else if( aluInstruction.omod == ALU_OMOD_DIV2 ) + src->add(" /= 2.0;" _CRLF); + } + else if( outputDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(" = "); + src->add("as_type(as_type("); // TODO: correct? + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(")"); + if( aluInstruction.omod == 1 ) + src->add(" * 2.0"); + else if( aluInstruction.omod == 2 ) + src->add(" * 4.0"); + else if( aluInstruction.omod == 3 ) + src->add(" / 2.0"); + src->add(");" _CRLF); + } + else + { + cemu_assert_unimplemented(); + } + } + // handle clamp + if( aluInstruction.destClamp != 0 ) + { + if( outputDataType == LATTE_DECOMPILER_DTYPE_FLOAT ) + { + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(" = clamp("); + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(", 0.0, 1.0);" _CRLF); + } + else if( outputDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(" = clampFI32("); + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(");" _CRLF); + } + else + { + cemu_assert_unimplemented(); + } + } + // handle result broadcasting for reduction instructions + if( isReductionOperation ) + { + // reduction operations set all four PV components (todo: Needs further research. According to AMD docs, dot4 only sets PV.x? update: Unlike DOT4, CUBE sets all PV elements accordingly to their GPR output?) 
+ if( aluRedcInstruction[0]->opcode == ALU_OP2_INST_CUBE ) + { + // CUBE + for (sint32 f = 0; f < 4; f++) + { + if (aluRedcInstruction[f]->writeMask != 0) + continue; + _emitInstructionPVPSOutputVariableName(shaderContext, aluRedcInstruction[f]); + src->add(" = "); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[0]); + src->add(";" _CRLF); + } + } + else + { + // DOT4, DOT4_IEEE, etc. + // reduction operation result is only set for output in redc[0], we also need to update redc[1] to redc[3] + for(sint32 f=0; f<4; f++) + { + if( aluRedcInstruction[f]->writeMask == 0 ) + _emitInstructionPVPSOutputVariableName(shaderContext, aluRedcInstruction[f]); + else + { + if (f == 0) + continue; + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[f]); + } + src->add(" = "); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[0]); + src->add(";" _CRLF); + } + } + } + } + shaderContext->aluPVPSState = nullptr; +} + +/* + * Emits code to access one component (xyzw) of the texture coordinate input vector + */ +static void _emitTEXSampleCoordInputComponent(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction, sint32 componentIndex, sint32 interpretSrcAsType) +{ + cemu_assert(componentIndex >= 0 && componentIndex < 4); + cemu_assert_debug(interpretSrcAsType == LATTE_DECOMPILER_DTYPE_SIGNED_INT || interpretSrcAsType == LATTE_DECOMPILER_DTYPE_FLOAT); + StringBuf* src = shaderContext->shaderSource; + sint32 elementSel = texInstruction->textureFetch.srcSel[componentIndex]; + if (elementSel < 4) + { + _emitRegisterChannelAccessCode(shaderContext, texInstruction->srcGpr, elementSel, interpretSrcAsType); + return; + } + const char* resultElemTable[4] = {"x","y","z","w"}; + if(interpretSrcAsType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + if( elementSel == 4 ) + src->add("as_type(0.0)"); + else if( elementSel == 5 ) + src->add("as_type(1.0)"); + } + else if(interpretSrcAsType == 
LATTE_DECOMPILER_DTYPE_FLOAT ) + { + if( elementSel == 4 ) + src->add("0.0"); + else if( elementSel == 5 ) + src->add("1.0"); + } +} + +static const char* _texGprAccessElemTable[8] = {"x","y","z","w","_","_","_","_"}; + +static char* _getTexGPRAccess(LatteDecompilerShaderContext* shaderContext, sint32 gprIndex, uint32 dataType, sint8 selX, sint8 selY, sint8 selZ, sint8 selW, char* tempBuffer) +{ + // as_type(R{}i.w) + *tempBuffer = '\0'; + uint8 elemCount = (selX >= 0 ? 1 : 0) + (selY >= 0 ? 1 : 0) + (selZ >= 0 ? 1 : 0) + (selW >= 0 ? 1 : 0); + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + ; // no conversion + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + if (elemCount == 1) + strcat(tempBuffer, "as_type("); + else + strcat(tempBuffer, ("as_type(").c_str()); + } + else + cemu_assert_unimplemented(); + strcat(tempBuffer, _getRegisterVarName(shaderContext, gprIndex)); + // _texGprAccessElemTable + strcat(tempBuffer, "."); + if (selX >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selX]); + if (selY >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selY]); + if (selZ >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selZ]); + if (selW >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selW]); + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + ; // no conversion + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + strcat(tempBuffer, ")"); + else + cemu_assert_unimplemented(); + } + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + cemu_assert_unimplemented(); + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + ; // no conversion + else + cemu_assert_unimplemented(); + strcat(tempBuffer, _getRegisterVarName(shaderContext, gprIndex)); + // _texGprAccessElemTable + strcat(tempBuffer, "."); + if (selX >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selX]); + if 
(selY >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selY]); + if (selZ >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selZ]); + if (selW >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selW]); + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + cemu_assert_unimplemented(); + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + ; // no conversion + else + cemu_assert_unimplemented(); + } + else + cemu_assert_unimplemented(); + return tempBuffer; +} + +static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + if (texInstruction->textureFetch.textureIndex < 0 || texInstruction->textureFetch.textureIndex >= LATTE_NUM_MAX_TEX_UNITS) + { + // skip out of bounds texture unit access + return; + } + + auto texDim = shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex]; + + char tempBuffer0[32]; + char tempBuffer1[32]; + src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr)); + src->add("."); + const char* resultElemTable[4] = {"x","y","z","w"}; + sint32 numWrittenElements = 0; + for(sint32 f = 0; f < 4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[f]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + debugBreakpoint(); + } + } + // texture sampler opcode + uint32 texOpcode = texInstruction->opcode; + // TODO: is this needed? + if (shaderContext->shaderType == LatteConst::ShaderType::Vertex) + { + // vertex shader forces LOD to zero, but certain sampler types don't support textureLod(...) 
API + if (texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) + texOpcode = GPU7_TEX_INST_SAMPLE_C; + } + // check if offset is used + bool hasOffset = false; + if( texInstruction->textureFetch.offsetX != 0 || texInstruction->textureFetch.offsetY != 0 || texInstruction->textureFetch.offsetZ != 0 ) + hasOffset = true; + // emit sample code + if (shaderContext->shader->textureIsIntegerFormat[texInstruction->textureFetch.textureIndex]) + { + // integer samplers + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) // uint to int + { + if (numWrittenElements == 1) + src->add(" = int("); + else + shaderContext->shaderSource->addFmt(" = int{}(", numWrittenElements); + } + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + if (numWrittenElements == 1) + src->add(" = as_type("); + else + shaderContext->shaderSource->addFmt(" = as_type(", numWrittenElements); + } + } + else + { + // float samplers + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + if (numWrittenElements == 1) + src->add(" = as_type("); + else + shaderContext->shaderSource->addFmt(" = as_type(", numWrittenElements); + } + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->add(" = ("); + } + + bool isCompare = shaderContext->shader->textureUsesDepthCompare[texInstruction->textureFetch.textureIndex]; + bool emulateCompare = (isCompare && !IsValidDepthTextureType(texDim)); + bool isGather = (texOpcode == GPU7_TEX_INST_FETCH4); + + bool unnormalizationHandled = false; + bool useTexelCoordinates = false; + bool isRead = ((texOpcode == GPU7_TEX_INST_SAMPLE && (texInstruction->textureFetch.unnormalized[0] && texInstruction->textureFetch.unnormalized[1] && texInstruction->textureFetch.unnormalized[2] && texInstruction->textureFetch.unnormalized[3])) || texOpcode == GPU7_TEX_INST_LD); + + // handle illegal combinations + if (texOpcode == GPU7_TEX_INST_FETCH4 && (texDim == 
Latte::E_DIM::DIM_1D || texDim == Latte::E_DIM::DIM_1D_ARRAY)) + { + // fetch4 is not allowed on 1D textures + // seen in YWW during boss fight of Level 1-4 + // todo - investigate what this returns on actual HW + if (numWrittenElements == 1) + shaderContext->shaderSource->add("0.0"); + else + shaderContext->shaderSource->addFmt("float{}(0.0)", numWrittenElements); + shaderContext->shaderSource->add(");" _CRLF); + return; + } + + // Do a framebuffer fetch if possible + uint8 renderTargetIndex = shaderContext->shader->textureRenderTargetIndex[texInstruction->textureFetch.textureIndex]; + if (static_cast(g_renderer.get())->SupportsFramebufferFetch() && renderTargetIndex != 255) + { + // TODO: support comparison samplers + // TODO: support swizzling + src->addFmt("col{}", renderTargetIndex); + } + else + { + // sample_compare returns a float, need to convert to float4 + if (isCompare) + src->addFmt("float4("); + + if (emulateCompare) + { + cemu_assert_debug(!isGather); + + src->add("sampleCompareEmulate("); + } + + src->addFmt("tex{}", texInstruction->textureFetch.textureIndex); + if (!emulateCompare) + { + src->add("."); + if (isRead) + { + if (hasOffset) + cemu_assert_unimplemented(); + src->add("read("); + unnormalizationHandled = true; + useTexelCoordinates = true; + } + else + { + if (isGather) + src->add("gather"); + else + src->add("sample"); + if (isCompare) + src->add("_compare"); + src->addFmt("(samplr{}, ", texInstruction->textureFetch.textureIndex); + } + } + else + { + src->addFmt(", samplr{}, ", texInstruction->textureFetch.textureIndex); + } + + // for textureGather() add shift (todo: depends on rounding mode set in sampler registers?) 
+ if (texOpcode == GPU7_TEX_INST_FETCH4) + { + if (texDim == Latte::E_DIM::DIM_2D) + { + //src->addFmt2("(vec2(-0.1) / vec2(textureSize(tex{},0).xy)) + ", texInstruction->textureIndex); + + // vec2(-0.00001) is minimum to break Nvidia + // vec2(0.0001) is minimum to fix shadows on Intel, also fixes it on AMD (Windows and Linux) + + // todo - emulating coordinate rounding mode correctly is tricky + // GX2 supports two modes: Truncate or rounding according to DX9 rules + // Vulkan uses truncate mode when point sampling (min and mag is both nearest) otherwise it uses rounding + + // adding a small fixed bias is enough to avoid vendor-specific cases where small inaccuracies cause the number to get rounded down due to truncation + src->addFmt("float2(0.0001) + "); + } + } + + const sint32 texCoordDataType = (texOpcode == GPU7_TEX_INST_LD) ? LATTE_DECOMPILER_DTYPE_SIGNED_INT : LATTE_DECOMPILER_DTYPE_FLOAT; + if(useTexelCoordinates) + { + // handle integer coordinates for texelFetch + if (texDim == Latte::E_DIM::DIM_2D || texDim == Latte::E_DIM::DIM_2D_MSAA) + { + src->add("uint2("); + src->add("float2("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, texCoordDataType); + src->addFmt(", "); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, texCoordDataType); + + src->addFmt(")*supportBuffer.tex{}Scale", texInstruction->textureFetch.textureIndex); // close float2 and scale + + src->add("), 0"); // close int2 and lod param + // todo - lod + } + else if (texDim == Latte::E_DIM::DIM_1D) + { + // VC DS games forget to initialize textures and use texel fetch on an uninitialized texture (a dim of 0 maps to 1D) + src->add("uint("); + src->add("float("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, (texOpcode == GPU7_TEX_INST_LD) ? 
LATTE_DECOMPILER_DTYPE_SIGNED_INT : LATTE_DECOMPILER_DTYPE_FLOAT); + src->addFmt(")*supportBuffer.tex{}Scale.x", texInstruction->textureFetch.textureIndex); + src->add("), 0"); + // todo - lod + } + else + cemu_assert_debug(false); + } + else /* useTexelCoordinates == false */ + { + // float coordinates + if ( (texOpcode == GPU7_TEX_INST_SAMPLE_C || texOpcode == GPU7_TEX_INST_SAMPLE_C_L || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) ) + { + // shadow sampler + if (texDim == Latte::E_DIM::DIM_2D_ARRAY) + { + // 3 coords + compare value + src->add("float2("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("), uint(rint("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("))"); + + src->addFmt(", {}", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer0)); + } + else if (texDim == Latte::E_DIM::DIM_CUBEMAP) + { + // 2 coords + faceId + if (texInstruction->textureFetch.srcSel[0] >= 4 || texInstruction->textureFetch.srcSel[1] >= 4) + { + debugBreakpoint(); + } + src->addFmt("redcCUBEReverse({},", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0)); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->addFmt(")"); + src->addFmt(", uint(cubeMapArrayIndex{})", texInstruction->textureFetch.textureIndex); // cubemap index + } + else if (texDim == Latte::E_DIM::DIM_1D) + { + // 1 coord + 1 unused coord (per spec) + compare value + if (texInstruction->textureFetch.srcSel[0] >= 4) + { + debugBreakpoint(); + } + src->addFmt("{}, {}", 
_getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], -1, -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); + } + else + { + // 2 coords + compare value (as float3) + if (texInstruction->textureFetch.srcSel[0] >= 4 && texInstruction->textureFetch.srcSel[1] >= 4) + { + debugBreakpoint(); + } + src->addFmt("float2({}), {}", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); + } + } + else if(texDim == Latte::E_DIM::DIM_2D_ARRAY) + { + // 3 coords + src->add("float2("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("), uint(rint("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("))"); + } + else if(texDim == Latte::E_DIM::DIM_3D) + { + // 3 coords + src->add("float3("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } + else if( texDim == Latte::E_DIM::DIM_CUBEMAP ) + { + // 2 coords + faceId + cemu_assert_debug(texInstruction->textureFetch.srcSel[0] < 4); + cemu_assert_debug(texInstruction->textureFetch.srcSel[1] < 4); + 
src->addFmt("redcCUBEReverse({},", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0)); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")"); + src->addFmt(", uint(cubeMapArrayIndex{})", texInstruction->textureFetch.textureIndex); // cubemap index + } + else if( texDim == Latte::E_DIM::DIM_1D ) + { + // 1 coord + src->add(_getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], -1, -1, -1, tempBuffer0)); + } + else + { + // 2 coords + src->add("float2("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + // avoid truncate to effectively round downwards on texel edges + if (ActiveSettings::ForceSamplerRoundToPrecision()) + src->addFmt("+ float2(1.0)/float2(tex{}.get_width(), tex{}.get_height())/512.0", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); + } + // lod or lod bias parameter + // 1D textures don't support lod + if (texDim != Latte::E_DIM::DIM_1D && texDim != Latte::E_DIM::DIM_1D_ARRAY) + { + if (texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LB || texOpcode == GPU7_TEX_INST_SAMPLE_C_L) + { + src->add(", "); + if (texOpcode == GPU7_TEX_INST_SAMPLE_LB) + { + src->addFmt("bias({})", _FormatFloatAsConstant((float)texInstruction->textureFetch.lodBias / 16.0f)); + } + else + { + // TODO: is this correct? 
+ src->add("level("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 3, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } + } + else if (texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) + { + src->add(", level(0.0)"); + } + } + } + // gradient parameters + if (texOpcode == GPU7_TEX_INST_SAMPLE_G) + { + if (texDim == Latte::E_DIM::DIM_2D || + texDim == Latte::E_DIM::DIM_1D) + { + src->add(", gradient2d(gradH.xy, gradV.xy)"); + } + else + { + cemu_assert_unimplemented(); + } + } + + // offset + if( texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ || texOpcode == GPU7_TEX_INST_SAMPLE || texOpcode == GPU7_TEX_INST_SAMPLE_C ) + { + if( hasOffset ) + { + uint8 offsetComponentCount = 0; + if( texDim == Latte::E_DIM::DIM_1D ) + offsetComponentCount = 1; + else if( texDim == Latte::E_DIM::DIM_2D ) + offsetComponentCount = 2; + else if( texDim == Latte::E_DIM::DIM_3D ) + offsetComponentCount = 3; + else if( texDim == Latte::E_DIM::DIM_2D_ARRAY ) + offsetComponentCount = 2; + else + cemu_assert_unimplemented(); + + if( (texInstruction->textureFetch.offsetX&1) ) + cemu_assert_unimplemented(); + if( (texInstruction->textureFetch.offsetY&1) ) + cemu_assert_unimplemented(); + if ((texInstruction->textureFetch.offsetZ & 1)) + cemu_assert_unimplemented(); + + if( offsetComponentCount == 1 ) + src->addFmt(",{}", texInstruction->textureFetch.offsetX/2); + else if( offsetComponentCount == 2 ) + src->addFmt(",int2({},{})", texInstruction->textureFetch.offsetX/2, texInstruction->textureFetch.offsetY/2, texInstruction->textureFetch.offsetZ/2); + else if( offsetComponentCount == 3 ) + src->addFmt(",int3({},{},{})", texInstruction->textureFetch.offsetX/2, texInstruction->textureFetch.offsetY/2, texInstruction->textureFetch.offsetZ/2); + } + } + + // lod bias (TODO: wht?) 
+ + src->add(")"); + } + + if (isCompare) + src->add(")"); + + if (texOpcode == GPU7_TEX_INST_SAMPLE_C || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) + { + src->add("."); + + if (numWrittenElements > 1) + { + // result is copied into multiple channels + for (sint32 f = 0; f < numWrittenElements; f++) + { + cemu_assert_debug(texInstruction->dstSel[f] == 0); // only x component is defined + src->add("x"); + } + } + else + { + src->add("x"); + } + } + else + { + src->add("."); + for (sint32 f = 0; f < 4; f++) + { + if (texInstruction->dstSel[f] < 4) + { + uint8 elemIndex = texInstruction->dstSel[f]; + if (isGather) + { + // 's textureGather() and GPU7's FETCH4 instruction have a different order of elements + // xyzw: top-left, top-right, bottom-right, bottom-left + // textureGather xyzw + // fetch4 yzxw + // translate index from fetch4 to textureGather order + static uint8 fetchToGather[4] = + { + 2, // x -> z + 0, // y -> x + 1, // z -> y + 3, // w -> w + }; + elemIndex = fetchToGather[elemIndex]; + } + src->add(resultElemTable[elemIndex]); + } + else if (texInstruction->dstSel[f] == 7) + { + // masked and not written + } + else + { + cemu_assert_unimplemented(); + } + } + } + src->add(");"); + + // debug +#ifdef CEMU_DEBUG_ASSERT + if(texInstruction->opcode == GPU7_TEX_INST_LD ) + src->add(" // TEX_INST_LD"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE ) + src->add(" // TEX_INST_SAMPLE"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE_L ) + src->add(" // TEX_INST_SAMPLE_L"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE_LZ ) + src->add(" // TEX_INST_SAMPLE_LZ"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE_C ) + src->add(" // TEX_INST_SAMPLE_C"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE_G ) + src->add(" // TEX_INST_SAMPLE_G"); + else + src->addFmt(" // 0x{:02x}", texInstruction->opcode); + if (texInstruction->opcode != texOpcode) + src->addFmt(" (applied as 0x{:02x})", texOpcode); + src->addFmt(" OffsetXYZ 
{:02x} {:02x} {:02x}", (uint8)texInstruction->textureFetch.offsetX&0xFF, (uint8)texInstruction->textureFetch.offsetY&0xFF, (uint8)texInstruction->textureFetch.offsetZ&0xFF); +#endif + src->add("" _CRLF); +} + +static void _emitTEXGetTextureResInfoCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + src->addFmt("R{}", texInstruction->dstGpr); + src->add("i"); + src->add("."); + + const char* resultElemTable[4] = {"x","y","z","w"}; + sint32 numWrittenElements = 0; + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[f]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + cemu_assert_unimplemented(); + } + } + + // todo - mip index parameter? + + if (static_cast(g_renderer.get())->SupportsFramebufferFetch() && shaderContext->shader->textureRenderTargetIndex[texInstruction->textureFetch.textureIndex] != 255) + { + // TODO: use the render target size + src->addFmt(" = int4(1920, 1080, 1, 1)."); + } + else + { + auto texDim = shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex]; + + if (texDim == Latte::E_DIM::DIM_1D) + src->addFmt(" = int4(tex{}.get_width(), 1, 1, 1).", texInstruction->textureFetch.textureIndex); + else if (texDim == Latte::E_DIM::DIM_1D_ARRAY) + src->addFmt(" = int4(tex{}.get_width(), tex{}.get_array_size(), 1, 1).", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); + else if (texDim == Latte::E_DIM::DIM_2D || texDim == Latte::E_DIM::DIM_2D_MSAA) + src->addFmt(" = int4(tex{}.get_width(), tex{}.get_height(), 1, 1).", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); + else if (texDim == Latte::E_DIM::DIM_2D_ARRAY) + src->addFmt(" = int4(tex{}.get_width(), tex{}.get_height(), tex{}.get_array_size(), 1).", 
texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); + else + { + cemu_assert_debug(false); + src->addFmt(" = int4(tex{}.get_width(), tex{}.get_height(), 1, 1).", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); + } + } + + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[texInstruction->dstSel[f]]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + debugBreakpoint(); + } + } + src->add(";" _CRLF); +} + +static void _emitTEXGetCompTexLodCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr)); + src->add("."); + + const char* resultElemTable[4] = {"x","y","z","w"}; + sint32 numWrittenElements = 0; + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[f]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + debugBreakpoint(); + } + } + + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType); + + if (static_cast(g_renderer.get())->SupportsFramebufferFetch() && shaderContext->shader->textureRenderTargetIndex[texInstruction->textureFetch.textureIndex] != 255) + { + // We assume that textures accessed as framebuffer fetch are always sampled at pixel coordinates, therefore the lod would always be 0.0 + src->add("float4(0.0, 0.0, 0.0, 0.0)"); + } + else + { + if (shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex] == Latte::E_DIM::DIM_CUBEMAP) + { + // 3 coordinates + if(shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + 
src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, {}.{}{}{}), 0.0, 0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); + else + src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, as_type({}.{}{}{})), 0.0, 0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); + } + else + { + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, {}.{}{}), 0.0, 0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); + else + src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, as_type({}.{}{})), 0.0, 0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); + debugBreakpoint(); + } + } + + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType); + src->add("."); + + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[texInstruction->dstSel[f]]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // 
masked and not written + } + else + { + debugBreakpoint(); + } + } + src->add(";" _CRLF); +} + +static void _emitTEXSetCubemapIndexCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + src->addFmt("cubeMapArrayIndex{}", texInstruction->textureFetch.textureIndex); + const char* resultElemTable[4] = {"x","y","z","w"}; + + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt(" = as_type(R{}i.{});" _CRLF, texInstruction->srcGpr, resultElemTable[texInstruction->textureFetch.srcSel[0]]); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt(" = R{}f.{};" _CRLF, texInstruction->srcGpr, resultElemTable[texInstruction->textureFetch.srcSel[0]]); + else + cemu_assert_unimplemented(); +} + +static void _emitTEXGetGradientsHV(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + sint32 componentCount = 0; + for (sint32 i = 0; i < 4; i++) + { + if (texInstruction->dstSel[i] == 7) + continue; + componentCount++; + } + src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr)); + src->add("."); + const char* resultElemTable[4] = { "x","y","z","w" }; + sint32 numWrittenElements = 0; + for (sint32 f = 0; f < 4; f++) + { + if (texInstruction->dstSel[f] < 4) + { + src->add(resultElemTable[f]); + numWrittenElements++; + } + else if (texInstruction->dstSel[f] == 7) + { + // masked and not written + } + else + { + debugBreakpoint(); + } + } + + const char* funcName; + if (texInstruction->opcode == GPU7_TEX_INST_GET_GRADIENTS_H) + funcName = "dfdx"; + else + funcName = "dfdy"; + + src->add(" = "); + + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType, componentCount); + + src->addFmt("{}(", funcName); + 
_emitRegisterAccessCode(shaderContext, texInstruction->srcGpr, (componentCount >= 1) ? texInstruction->textureFetch.srcSel[0] : -1, (componentCount >= 2) ? texInstruction->textureFetch.srcSel[1] : -1, (componentCount >= 3) ? texInstruction->textureFetch.srcSel[2] : -1, (componentCount >= 4) ? texInstruction->textureFetch.srcSel[3] : -1, LATTE_DECOMPILER_DTYPE_FLOAT); + + src->add(")"); + + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType); + + src->add(";" _CRLF); + +} + +static void _emitTEXSetGradientsHV(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + if (texInstruction->opcode == GPU7_TEX_INST_SET_GRADIENTS_H) + src->add("gradH = "); + else + src->add("gradV = "); + + _emitRegisterAccessCode(shaderContext, texInstruction->srcGpr, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], texInstruction->textureFetch.srcSel[2], texInstruction->textureFetch.srcSel[3], LATTE_DECOMPILER_DTYPE_FLOAT); + + src->add(";" _CRLF); +} + +static void _emitGSReadInputVFetchCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr)); + + src->add("."); + + const char* resultElemTable[4] = {"x","y","z","w"}; + sint32 numWrittenElements = 0; + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[f]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + cemu_assert_unimplemented(); + } + } + + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType, numWrittenElements); + src->add("(objectPayload.vertexOut["); + if 
(texInstruction->textureFetch.srcSel[0] >= 4) + cemu_assert_unimplemented(); + if (texInstruction->textureFetch.srcSel[1] >= 4) + cemu_assert_unimplemented(); + src->add("vertexIndex"); + src->addFmt("].passParameterSem{}.", texInstruction->textureFetch.offset/16); + + + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[texInstruction->dstSel[f]]); + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + cemu_assert_unimplemented(); + } + } + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType); + src->add(";" _CRLF); +} + +static sint32 _writeDestMaskXYZW(LatteDecompilerShaderContext* shaderContext, sint8* dstSel) +{ + StringBuf* src = shaderContext->shaderSource; + const char* resultElemTable[4] = { "x","y","z","w" }; + sint32 numWrittenElements = 0; + for (sint32 f = 0; f < 4; f++) + { + if (dstSel[f] < 4) + { + src->add(resultElemTable[f]); + numWrittenElements++; + } + else if (dstSel[f] == 7) + { + // masked and not written + } + else + { + cemu_assert_unimplemented(); + } + } + return numWrittenElements; +} + +static void _emitTEXVFetchCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + // handle special case where geometry shader reads input attributes from vertex shader via ringbuffer + StringBuf* src = shaderContext->shaderSource; + if( texInstruction->textureFetch.textureIndex == 0x9F && shaderContext->shaderType == LatteConst::ShaderType::Geometry ) + { + _emitGSReadInputVFetchCode(shaderContext, texInstruction); + return; + } + + src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr)); + src->add("."); + + _writeDestMaskXYZW(shaderContext, texInstruction->dstSel); + const char* resultElemTable[4] = {"x","y","z","w"}; + uint32 numWrittenElements = 0; + for (sint32 f=0; f<4; f++) + { + if (texInstruction->dstSel[f] < 4) 
+ numWrittenElements++; + } + + src->add(" = "); + + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + if (numWrittenElements == 1) + src->add("as_type("); + else + src->addFmt("as_type(", numWrittenElements); + } + else + src->add("("); + + src->addFmt("ubuff{}.d[", texInstruction->textureFetch.textureIndex - 0x80); + + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{}.{}", _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]]); + else + src->addFmt("as_type({}.{})", _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]]); + src->add("]."); + + + for (sint32 f=0; f<4; f++) + { + if (texInstruction->dstSel[f] < 4) + { + src->add(resultElemTable[texInstruction->dstSel[f]]); + } + else if (texInstruction->dstSel[f] == 7) + { + // masked and not written + } + else + { + debugBreakpoint(); + } + } + src->add(");" _CRLF); +} + +static void _emitTEXReadMemCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr)); + src->add("."); + sint32 count = _writeDestMaskXYZW(shaderContext, texInstruction->dstSel); + + src->add(" = "); + + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + if (count == 1) + src->add("as_type("); + else + src->addFmt("as_type(", count); + } + else + src->add("("); + + sint32 readCount; + + if (texInstruction->memRead.format == FMT_32_FLOAT) + { + readCount = 1; + // todo + src->add("0.0"); + } + else if (texInstruction->memRead.format == FMT_32_32_FLOAT) + { + readCount = 2; + // todo + src->add("float2(0.0,0.0)"); + } + else if (texInstruction->memRead.format == FMT_32_32_32_FLOAT) + { + readCount = 3; + // todo + 
src->add("float3(0.0,0.0,0.0)"); + } + else + { + cemu_assert_unimplemented(); + } + + if (count < readCount) + { + if (count == 1) + src->add(".x"); + else if (count == 2) + src->add(".xy"); + else if (count == 3) + src->add(".xyz"); + } + src->add(");" _CRLF); +} + +static void _emitTEXClauseCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + cemu_assert_debug(cfInstruction->instructionsALU.empty()); + for(auto& texInstruction : cfInstruction->instructionsTEX) + { + if( texInstruction.opcode == GPU7_TEX_INST_SAMPLE || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_L || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LB || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LZ || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_L || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_LZ || texInstruction.opcode == GPU7_TEX_INST_FETCH4 || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_G || texInstruction.opcode == GPU7_TEX_INST_LD ) + _emitTEXSampleTextureCode(shaderContext, &texInstruction); + else if( texInstruction.opcode == GPU7_TEX_INST_GET_TEXTURE_RESINFO ) + _emitTEXGetTextureResInfoCode(shaderContext, &texInstruction); + else if( texInstruction.opcode == GPU7_TEX_INST_GET_COMP_TEX_LOD ) + _emitTEXGetCompTexLodCode(shaderContext, &texInstruction); + else if( texInstruction.opcode == GPU7_TEX_INST_SET_CUBEMAP_INDEX ) + _emitTEXSetCubemapIndexCode(shaderContext, &texInstruction); + else if (texInstruction.opcode == GPU7_TEX_INST_GET_GRADIENTS_H || + texInstruction.opcode == GPU7_TEX_INST_GET_GRADIENTS_V) + _emitTEXGetGradientsHV(shaderContext, &texInstruction); + else if (texInstruction.opcode == GPU7_TEX_INST_SET_GRADIENTS_H || + texInstruction.opcode == GPU7_TEX_INST_SET_GRADIENTS_V) + _emitTEXSetGradientsHV(shaderContext, &texInstruction); + else if (texInstruction.opcode == GPU7_TEX_INST_VFETCH) + _emitTEXVFetchCode(shaderContext, &texInstruction); + else if 
(texInstruction.opcode == GPU7_TEX_INST_MEM) + _emitTEXReadMemCode(shaderContext, &texInstruction); + else + cemu_assert_unimplemented(); + } +} + +// generate the code for reading the source input GPR (or constants) for exports +static void _emitExportGPRReadCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, sint32 requiredType, uint32 burstIndex) +{ + StringBuf* src = shaderContext->shaderSource; + uint32 numOutputs = 4; + if( cfInstruction->type == GPU7_CF_INST_MEM_RING_WRITE ) + { + numOutputs = (cfInstruction->memWriteCompMask&1)?1:0; + numOutputs += (cfInstruction->memWriteCompMask&2)?1:0; + numOutputs += (cfInstruction->memWriteCompMask&4)?1:0; + numOutputs += (cfInstruction->memWriteCompMask&8)?1:0; + } + if (requiredType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + if(numOutputs == 1) + src->add("float("); + else + src->addFmt("float{}(", numOutputs); + } + else if (requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + if (numOutputs == 1) + src->add("int("); + else + src->addFmt("int{}(", numOutputs); + } + else + cemu_assert_unimplemented(); + sint32 actualOutputs = 0; + for(sint32 i=0; i<4; i++) + { + // todo: Use type of register element based on information from type tracker (currently we assume it's always a signed integer) + uint32 exportSel = 0; + if( cfInstruction->type == GPU7_CF_INST_MEM_RING_WRITE ) + { + exportSel = i; + if( (cfInstruction->memWriteCompMask&(1<exportComponentSel[i]; + } + if( actualOutputs > 0 ) + src->add(", "); + actualOutputs++; + if( exportSel < 4 ) + { + _emitRegisterAccessCode(shaderContext, cfInstruction->exportSourceGPR+burstIndex, exportSel, -1, -1, -1, requiredType); + } + else if (exportSel == 4) + { + // constant zero + src->add("0"); + } + else if (exportSel == 5) + { + // constant one + src->add("1.0"); + } + else if( exportSel == 7 ) + { + // element masked (which means 0 is exported?) 
+ src->add("0"); + } + else + { + cemu_assert_debug(false); + src->add("0"); + } + } + if( requiredType == LATTE_DECOMPILER_DTYPE_FLOAT ) + src->add(")"); + else if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + src->add(")"); + else + cemu_assert_unimplemented(); +} + +static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + src->add("// export" _CRLF); + if(shaderContext->shaderType == LatteConst::ShaderType::Vertex ) + { + if (!shaderContext->contextRegistersNew->IsRasterizationEnabled()) + { + src->add("// Rasterization disabled" _CRLF); + return; + } + + if( cfInstruction->exportBurstCount != 0 ) + debugBreakpoint(); + if (cfInstruction->exportType == 1 && cfInstruction->exportArrayBase == GPU7_DECOMPILER_CF_EXPORT_BASE_POSITION) + { + // export position + // GX2 special state 0 disables rasterizer viewport offset and scaling (probably, exact mechanism is not known). 
Handle this here + bool hasAnyViewportScaleDisabled = + !shaderContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_SCALE_ENA() || + !shaderContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Y_SCALE_ENA() || + !shaderContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Z_SCALE_ENA(); + + if (hasAnyViewportScaleDisabled) + { + src->add("float4 finalPos = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(";" _CRLF); + src->add("finalPos.xy = finalPos.xy * supportBuffer.windowSpaceToClipSpaceTransform - float2(1.0,1.0);" _CRLF); + src->add("SET_POSITION(finalPos);"); + } + else + { + src->add("SET_POSITION("); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(");" _CRLF); + } + } + else if (cfInstruction->exportType == 1 && cfInstruction->exportArrayBase == GPU7_DECOMPILER_CF_EXPORT_POINT_SIZE ) + { + // export gl_PointSize + if (shaderContext->analyzer.outputPointSize) + { + cemu_assert_debug(shaderContext->analyzer.writesPointSize); + src->add("out.pointSize = ("); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(").x"); + src->add(";" _CRLF); + } + } + else if( cfInstruction->exportType == 2 && cfInstruction->exportArrayBase < 32 ) + { + // export parameter + sint32 paramIndex = cfInstruction->exportArrayBase; + uint32 vsSemanticId = _getVertexShaderOutParamSemanticId(shaderContext->contextRegisters, paramIndex); + if (vsSemanticId != 0xFF) + { + src->addFmt("out.passParameterSem{} = ", vsSemanticId); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(";" _CRLF); + } + else + { + src->add("// skipped export to semanticId 255" _CRLF); + } + } + else + cemu_assert_unimplemented(); + } + else if(shaderContext->shaderType == LatteConst::ShaderType::Pixel ) + { + if( cfInstruction->exportType == 0 && cfInstruction->exportArrayBase < 8 ) + { + for(uint32 i=0; 
i<(cfInstruction->exportBurstCount+1); i++) + { + sint32 pixelColorOutputIndex = LatteDecompiler_getColorOutputIndexFromExportIndex(shaderContext, cfInstruction->exportArrayBase+i); + // if color output is for target 0, then also handle alpha test + bool alphaTestEnable = shaderContext->contextRegistersNew->SX_ALPHA_TEST_CONTROL.get_ALPHA_TEST_ENABLE(); + auto alphaTestFunc = shaderContext->contextRegistersNew->SX_ALPHA_TEST_CONTROL.get_ALPHA_FUNC(); + if( pixelColorOutputIndex == 0 && alphaTestEnable && alphaTestFunc == Latte::E_COMPAREFUNC::NEVER ) + { + // never pass alpha test + src->add("discard_fragment();" _CRLF); + } + else if( pixelColorOutputIndex == 0 && alphaTestEnable && alphaTestFunc != Latte::E_COMPAREFUNC::ALWAYS) + { + src->add("if( (("); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, i); + src->add(").a "); + + switch( alphaTestFunc ) + { + case Latte::E_COMPAREFUNC::LESS: + src->add("<"); + break; + case Latte::E_COMPAREFUNC::EQUAL: + src->add("=="); + break; + case Latte::E_COMPAREFUNC::LEQUAL: + src->add("<="); + break; + case Latte::E_COMPAREFUNC::GREATER: + src->add(">"); + break; + case Latte::E_COMPAREFUNC::NOTEQUAL: + src->add("!="); + break; + case Latte::E_COMPAREFUNC::GEQUAL: + src->add(">="); + break; + } + src->add(" supportBuffer.alphaTestRef"); + src->add(") == false) discard_fragment();" _CRLF); + } + // pixel color output + auto dataType = GetColorBufferDataType(pixelColorOutputIndex, *shaderContext->contextRegistersNew); + if (dataType != MetalDataType::NONE) + { + src->addFmt("out.passPixelColor{} = as_type<{}>(", pixelColorOutputIndex, GetDataTypeStr(dataType)); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, i); + src->add(");" _CRLF); + } + + if( cfInstruction->exportArrayBase+i >= 8 ) + cemu_assert_unimplemented(); + } + } + else if( cfInstruction->exportType == 0 && cfInstruction->exportArrayBase == 61 ) + { + // pixel depth or gl_FragStencilRefARB + 
if( cfInstruction->exportBurstCount > 0 ) + cemu_assert_unimplemented(); + + if (cfInstruction->exportComponentSel[0] == 7) + { + cemu_assert_unimplemented(); // gl_FragDepth ? + } + if (cfInstruction->exportComponentSel[1] != 7) + { + cemu_assert_unimplemented(); // exporting to gl_FragStencilRefARB + } + if (cfInstruction->exportComponentSel[2] != 7) + { + cemu_assert_unimplemented(); // ukn + } + if (cfInstruction->exportComponentSel[3] != 7) + { + cemu_assert_unimplemented(); // ukn + } + + if (!shaderContext->shader->depthMask) + return; + + src->add("out.passDepth = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(".x"); + src->add(";" _CRLF); + } + else + cemu_assert_unimplemented(); + } +} + +static void _emitXYZWByMask(StringBuf* src, uint32 mask) +{ + if( (mask&(1<<0)) != 0 ) + src->add("x"); + if( (mask&(1<<1)) != 0 ) + src->add("y"); + if( (mask&(1<<2)) != 0 ) + src->add("z"); + if( (mask&(1<<3)) != 0 ) + src->add("w"); +} + +static void _emitCFRingWriteCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + // calculate parameter output (based on ring buffer output offset relative to GS unit) + uint32 bytesPerVertex = shaderContext->contextRegisters[mmSQ_GS_VERT_ITEMSIZE] * 4; + bytesPerVertex = std::max(bytesPerVertex, (uint32)1); // avoid division by zero + uint32 parameterOffset = ((cfInstruction->exportArrayBase * 4) % bytesPerVertex); + // for geometry shaders with streamout, MEM_RING_WRITE is used to pass the data to the copy shader, which then uses STREAM*_WRITE + if (shaderContext->shaderType == LatteConst::ShaderType::Geometry && shaderContext->analyzer.hasStreamoutEnable) + { + // if streamout is enabled, we generate transform feedback output code instead of the normal gs output + for (uint32 burstIndex = 0; burstIndex < (cfInstruction->exportBurstCount + 1); burstIndex++) + { + parameterOffset = 
((cfInstruction->exportArrayBase * 4 + burstIndex*0x10) % bytesPerVertex); + // find matching stream write in copy shader + LatteGSCopyShaderStreamWrite_t* streamWrite = nullptr; + for (auto& it : shaderContext->parsedGSCopyShader->list_streamWrites) + { + if (it.offset == parameterOffset) + { + streamWrite = ⁢ + break; + } + } + if (streamWrite == nullptr) + { + cemu_assert_suspicious(); + return; + } + + for (sint32 i = 0; i < 4; i++) + { + if ((cfInstruction->memWriteCompMask&(1 << i)) == 0) + continue; + + uint32 u32Offset = streamWrite->exportArrayBase + i; + src->addFmt("sb[sbBase{} + {}]", streamWrite->bufferIndex, u32Offset); + + src->add(" = "); + + _emitTypeConversionPrefixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + + src->addFmt("{}.", _getRegisterVarName(shaderContext, cfInstruction->exportSourceGPR+burstIndex)); + if (i == 0) + src->add("x"); + else if (i == 1) + src->add("y"); + else if (i == 2) + src->add("z"); + else if (i == 3) + src->add("w"); + + _emitTypeConversionSuffixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + + src->add(";" _CRLF); + } + } + return; + } + + if (shaderContext->shaderType == LatteConst::ShaderType::Vertex) + { + if (!shaderContext->contextRegistersNew->IsRasterizationEnabled()) + { + src->add("// Rasterization disabled" _CRLF); + return; + } + + if (cfInstruction->memWriteElemSize != 3) + cemu_assert_unimplemented(); + if ((cfInstruction->exportArrayBase & 3) != 0) + cemu_assert_unimplemented(); + for (sint32 burstIndex = 0; burstIndex < (sint32)(cfInstruction->exportBurstCount + 1); burstIndex++) + { + src->addFmt("out.passParameterSem{}.", (cfInstruction->exportArrayBase) / 4 + burstIndex); + _emitXYZWByMask(src, cfInstruction->memWriteCompMask); + src->addFmt(" = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_SIGNED_INT, burstIndex); + src->add(";" _CRLF); + } + } + else if 
(shaderContext->shaderType == LatteConst::ShaderType::Geometry) + { + cemu_assert_debug(cfInstruction->memWriteElemSize == 3); + //if (cfInstruction->memWriteElemSize != 3) + // debugBreakpoint(); + cemu_assert_debug((cfInstruction->exportArrayBase & 3) == 0); + + for (uint32 burstIndex = 0; burstIndex < (cfInstruction->exportBurstCount + 1); burstIndex++) + { + uint32 parameterExportType = 0; + uint32 parameterExportBase = 0; + if (LatteGSCopyShaderParser_getExportTypeByOffset(shaderContext->parsedGSCopyShader, parameterOffset + burstIndex * (cfInstruction->memWriteElemSize+1)*4, ¶meterExportType, ¶meterExportBase) == false) + { + cemu_assert_debug(false); + shaderContext->hasError = true; + return; + } + + if (parameterExportType == 1 && parameterExportBase == GPU7_DECOMPILER_CF_EXPORT_BASE_POSITION) + { + src->add("{" _CRLF); + src->addFmt("float4 pos = float4(0.0,0.0,0.0,1.0);" _CRLF); + src->addFmt("pos."); + _emitXYZWByMask(src, cfInstruction->memWriteCompMask); + src->addFmt(" = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, burstIndex); + src->add(";" _CRLF); + src->add("SET_POSITION(pos);" _CRLF); + src->add("}" _CRLF); + } + else if (parameterExportType == 2 && parameterExportBase < 16) + { + src->addFmt("out.passParameterSem{}.", parameterExportBase); + _emitXYZWByMask(src, cfInstruction->memWriteCompMask); + src->addFmt(" = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, burstIndex); + src->add(";" _CRLF); + } + else + cemu_assert_debug(false); + } + } + else + debugBreakpoint(); // todo +} + +static void _emitStreamWriteCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + if (shaderContext->analyzer.hasStreamoutEnable == false) + { +#ifdef CEMU_DEBUG_ASSERT + src->add("// omitted streamout write" _CRLF); +#endif + return; + } + uint32 streamoutBufferIndex; + if 
(cfInstruction->type == GPU7_CF_INST_MEM_STREAM0_WRITE) + streamoutBufferIndex = 0; + else if (cfInstruction->type == GPU7_CF_INST_MEM_STREAM1_WRITE) + streamoutBufferIndex = 1; + else + cemu_assert_unimplemented(); + + if (shaderContext->shaderType == LatteConst::ShaderType::Vertex) + { + uint32 arraySize = cfInstruction->memWriteArraySize + 1; + + for (sint32 i = 0; i < (sint32)arraySize; i++) + { + if ((cfInstruction->memWriteCompMask&(1 << i)) == 0) + continue; + + uint32 u32Offset = cfInstruction->exportArrayBase + i; + src->addFmt("sb[sbBase{} + {}]", streamoutBufferIndex, u32Offset); + + src->add(" = "); + + _emitTypeConversionPrefixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + + src->add(_getRegisterVarName(shaderContext, cfInstruction->exportSourceGPR)); + _appendChannelAccess(src, i); + _emitTypeConversionSuffixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + + src->add(";" _CRLF); + } + } + else + cemu_assert_debug(false); +} + +static void _emitCFCall(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + uint32 subroutineAddr = cfInstruction->addr; + LatteDecompilerSubroutineInfo* subroutineInfo = nullptr; + // find subroutine + for (auto& subroutineItr : shaderContext->list_subroutines) + { + if (subroutineItr.cfAddr == subroutineAddr) + { + subroutineInfo = &subroutineItr; + break; + } + } + if (subroutineInfo == nullptr) + { + cemu_assert_debug(false); + return; + } + // inline function + if (shaderContext->isSubroutine) + { + cemu_assert_debug(false); // inlining with cascaded function calls not supported + return; + } + // init CF stack variables + src->addFmt("activeMaskStackSub{:04x}[0] = true;" _CRLF, subroutineInfo->cfAddr); + src->addFmt("activeMaskStackCSub{:04x}[0] = true;" _CRLF, subroutineInfo->cfAddr); + src->addFmt("activeMaskStackCSub{:04x}[1] 
= true;" _CRLF, subroutineInfo->cfAddr); + + shaderContext->isSubroutine = true; + shaderContext->subroutineInfo = subroutineInfo; + for(auto& cfInstruction : subroutineInfo->instructions) + LatteDecompiler_emitClauseCodeMSL(shaderContext, &cfInstruction, true); + shaderContext->isSubroutine = false; + shaderContext->subroutineInfo = nullptr; +} + +void LatteDecompiler_emitClauseCodeMSL(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, bool isSubroutine) +{ + StringBuf* src = shaderContext->shaderSource; + + if( cfInstruction->type == GPU7_CF_INST_ALU || cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE || cfInstruction->type == GPU7_CF_INST_ALU_POP_AFTER || cfInstruction->type == GPU7_CF_INST_ALU_POP2_AFTER || cfInstruction->type == GPU7_CF_INST_ALU_BREAK || cfInstruction->type == GPU7_CF_INST_ALU_ELSE_AFTER ) + { + // emit ALU code + if (shaderContext->analyzer.modifiesPixelActiveState) + { + if(cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE) + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1 - 1)); + else + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1)); + } + if (cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE) + { + src->addFmt("{} = {};" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth-1)); + src->addFmt("{} = {};" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth)); + } + _emitALUClauseCode(shaderContext, cfInstruction); + if( shaderContext->analyzer.modifiesPixelActiveState ) + src->add("}" _CRLF); + cemu_assert_debug(!(shaderContext->analyzer.modifiesPixelActiveState == false && cfInstruction->type != GPU7_CF_INST_ALU)); + // handle ELSE case of PUSH_BEFORE + if( 
cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE ) + { + src->add("else {" _CRLF); + src->addFmt("{} = false;" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth)); + src->addFmt("{} = false;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1)); + src->add("}" _CRLF); + } + // post clause handler + if( cfInstruction->type == GPU7_CF_INST_ALU_POP_AFTER ) + { + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1 - 1), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth - 1), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth - 1)); + } + else if( cfInstruction->type == GPU7_CF_INST_ALU_POP2_AFTER ) + { + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1 - 2), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth - 2), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth - 2)); + } + else if( cfInstruction->type == GPU7_CF_INST_ALU_ELSE_AFTER ) + { + // no condition test + // pop stack + if( cfInstruction->popCount != 0 ) + debugBreakpoint(); + // else operation + src->addFmt("{} = {} == false;" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth)); + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth)); + } + } + else if( cfInstruction->type == GPU7_CF_INST_TEX ) + { + // emit TEX code + if (shaderContext->analyzer.modifiesPixelActiveState) + { + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth+1)); + } + 
_emitTEXClauseCode(shaderContext, cfInstruction); + if (shaderContext->analyzer.modifiesPixelActiveState) + { + src->add("}" _CRLF); + } + } + else if( cfInstruction->type == GPU7_CF_INST_EXPORT || cfInstruction->type == GPU7_CF_INST_EXPORT_DONE ) + { + // emit export code + _emitExportCode(shaderContext, cfInstruction); + } + else if( cfInstruction->type == GPU7_CF_INST_ELSE ) + { + // todo: Condition test, popCount? + src->addFmt("{} = {} == false;" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth)); + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth)); + } + else if( cfInstruction->type == GPU7_CF_INST_POP ) + { + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1 - cfInstruction->popCount), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth - cfInstruction->popCount), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth - cfInstruction->popCount)); + } + else if( cfInstruction->type == GPU7_CF_INST_LOOP_START_DX10 || + cfInstruction->type == GPU7_CF_INST_LOOP_START_NO_AL) + { + // start of loop + // if pixel is disabled, then skip loop + if (ActiveSettings::ShaderPreventInfiniteLoopsEnabled()) + { + // with iteration limit to prevent infinite loops + src->addFmt("int loopCounter{} = 0;" _CRLF, (sint32)cfInstruction->cfAddr); + src->addFmt("while( {} == true && loopCounter{} < 500 )" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), (sint32)cfInstruction->cfAddr); + src->add("{" _CRLF); + src->addFmt("loopCounter{}++;" _CRLF, (sint32)cfInstruction->cfAddr); + } + else + { + src->addFmt("while( {} == true )" 
_CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1)); + src->add("{" _CRLF); + } + } + else if( cfInstruction->type == GPU7_CF_INST_LOOP_END ) + { + // this might not always work + if( cfInstruction->popCount != 0 ) + debugBreakpoint(); + src->add("}" _CRLF); + } + else if( cfInstruction->type == GPU7_CF_INST_LOOP_BREAK ) + { + if( cfInstruction->popCount != 0 ) + debugBreakpoint(); + if (shaderContext->analyzer.modifiesPixelActiveState) + { + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1)); + } + // note: active stack level is set to the same level as the loop begin. popCount is ignored + src->add("break;" _CRLF); + + if (shaderContext->analyzer.modifiesPixelActiveState) + src->add("}" _CRLF); + + } + else if( cfInstruction->type == GPU7_CF_INST_MEM_STREAM0_WRITE || + cfInstruction->type == GPU7_CF_INST_MEM_STREAM1_WRITE ) + { + _emitStreamWriteCode(shaderContext, cfInstruction); + } + else if( cfInstruction->type == GPU7_CF_INST_MEM_RING_WRITE ) + { + _emitCFRingWriteCode(shaderContext, cfInstruction); + } + else if( cfInstruction->type == GPU7_CF_INST_EMIT_VERTEX ) + { + if( shaderContext->analyzer.modifiesPixelActiveState ) + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1)); + // write point size + if (shaderContext->analyzer.outputPointSize && shaderContext->analyzer.writesPointSize == false) + src->add("out.pointSize = supportBuffer.pointSize;" _CRLF); + src->add("mesh.set_vertex(vertexIndex, out);" _CRLF); + src->add("vertexIndex++;" _CRLF); + // increment transform feedback pointer + for (sint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) + { + if (!shaderContext->output->streamoutBufferWriteMask[i]) + continue; + cemu_assert_debug((shaderContext->output->streamoutBufferStride[i] & 3) == 0); + src->addFmt("sbBase{} += {};" _CRLF, i, shaderContext->output->streamoutBufferStride[i] / 4); + } + + 
if( shaderContext->analyzer.modifiesPixelActiveState ) + src->add("}" _CRLF); + } + else if (cfInstruction->type == GPU7_CF_INST_CALL) + { + _emitCFCall(shaderContext, cfInstruction); + } + else if (cfInstruction->type == GPU7_CF_INST_RETURN) + { + // todo (handle properly) + } + else + { + cemu_assert_debug(false); + } +} + +void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderContext, StringBuf* fCStr_shaderSource) +{ + if( shaderContext->analyzer.hasRedcCUBE ) + { + fCStr_shaderSource->add("void redcCUBE(float4 src0, float4 src1, thread float3& stm, thread int& faceId)\r\n" + "{\r\n" + "// stm -> x .. s, y .. t, z .. MajorAxis*2.0\r\n" + + "float3 inputCoord = normalize(float3(src1.y, src1.x, src0.x));\r\n" + + "float rx = inputCoord.x;\r\n" + "float ry = inputCoord.y;\r\n" + "float rz = inputCoord.z;\r\n" + "if( abs(rx) > abs(ry) && abs(rx) > abs(rz) )\r\n" + "{\r\n" + "stm.z = rx*2.0;\r\n" + "stm.xy = float2(ry,rz); \r\n" + "if( rx >= 0.0 )\r\n" + "{\r\n" + "faceId = 0;\r\n" + "}\r\n" + "else\r\n" + "{\r\n" + "faceId = 1;\r\n" + "}\r\n" + "}\r\n" + "else if( abs(ry) > abs(rx) && abs(ry) > abs(rz) )\r\n" + "{\r\n" + "stm.z = ry*2.0;\r\n" + "stm.xy = float2(rx,rz); \r\n" + "if( ry >= 0.0 )\r\n" + "{\r\n" + "faceId = 2;\r\n" + "}\r\n" + "else\r\n" + "{\r\n" + "faceId = 3;\r\n" + "}\r\n" + "}\r\n" + "else //if( abs(rz) > abs(ry) && abs(rz) > abs(rx) )\r\n" + "{\r\n" + "stm.z = rz*2.0;\r\n" + "stm.xy = float2(rx,ry); \r\n" + "if( rz >= 0.0 )\r\n" + "{\r\n" + "faceId = 4;\r\n" + "}\r\n" + "else\r\n" + "{\r\n" + "faceId = 5;\r\n" + "}\r\n" + "}\r\n" + "}\r\n"); + } + + if( shaderContext->analyzer.hasCubeMapTexture ) + { + fCStr_shaderSource->add("float3 redcCUBEReverse(float2 st, int faceId)\r\n" + "{\r\n" + "st.yx = st.xy;\r\n" + "float3 v;\r\n" + "float majorAxis = 1.0;\r\n" + "if( faceId == 0 )\r\n" + "{\r\n" + "v.yz = (st-float2(1.5))*(majorAxis*2.0);\r\n" + "v.x = 1.0;\r\n" + "}\r\n" + "else if( faceId == 1 )\r\n" + "{\r\n" + "v.yz = 
(st-float2(1.5))*(majorAxis*2.0);\r\n" + "v.x = -1.0;\r\n" + "}\r\n" + "else if( faceId == 2 )\r\n" + "{\r\n" + "v.xz = (st-float2(1.5))*(majorAxis*2.0);\r\n" + "v.y = 1.0;\r\n" + "}\r\n" + "else if( faceId == 3 )\r\n" + "{\r\n" + "v.xz = (st-float2(1.5))*(majorAxis*2.0);\r\n" + "v.y = -1.0;\r\n" + "}\r\n" + "else if( faceId == 4 )\r\n" + "{\r\n" + "v.xy = (st-float2(1.5))*(majorAxis*2.0);\r\n" + "v.z = 1.0;\r\n" + "}\r\n" + "else\r\n" + "{\r\n" + "v.xy = (st-float2(1.5))*(majorAxis*2.0);\r\n" + "v.z = -1.0;\r\n" + "}\r\n" + + "return v;\r\n" + "}\r\n"); + } + + // Sample compare emulate + // TODO: only add when needed + // TODO: lod_options overload + // TODO: when the sampler has linear min mag filter, use gather and filter manually + // TODO: offset? + fCStr_shaderSource->add("" + "template\r\n" + "float sampleCompareEmulate(TextureT tex, sampler samplr, CoordT coord, float compareValue) {\r\n" + "return compareValue < tex.sample(samplr, coord).x ? 1.0 : 0.0;\r\n" + "}\r\n" + ); + + // Texture calculate lod + // TODO: only add when needed + fCStr_shaderSource->add("" + "template\r\n" + "float2 textureCalculateLod(TextureT tex, sampler samplr, CoordT coord) {\r\n" + "float lod = tex.calculate_unclamped_lod(samplr, coord);\r\n" + "return float2(floor(lod), fract(lod));\r\n" + "}\r\n"); + + // clamp + fCStr_shaderSource->add("" + "int clampFI32(int v)\r\n" + "{\r\n" + "if( v == 0x7FFFFFFF )\r\n" + " return as_type(1.0);\r\n" + "else if( v == 0xFFFFFFFF )\r\n" + " return as_type(0.0);\r\n" + "return as_type(clamp(as_type(v), 0.0, 1.0));\r\n" + "}\r\n"); + + // mul non-ieee way (0*NaN/INF => 0.0) + if (shaderContext->options->strictMul) + { + // things we tried: + //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ return mix(a*b,0.0,a==0.0||b==0.0); }" STR_LINEBREAK); + //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ return mix(vec2(a*b,0.0),vec2(0.0,0.0),(equal(vec2(a),vec2(0.0,0.0))||equal(vec2(b),vec2(0.0,0.0)))).x; }" STR_LINEBREAK); 
+ //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ if( a == 0.0 || b == 0.0 ) return 0.0; return a*b; }" STR_LINEBREAK); + //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){float r = a*b;r = intBitsToFloat(floatBitsToInt(r)&(((floatBitsToInt(a) != 0) && (floatBitsToInt(b) != 0))?0xFFFFFFFF:0));return r;}" STR_LINEBREAK); works + + // for "min" it used to be: float mul_nonIEEE(float a, float b){ return min(a*b,min(abs(a)*3.40282347E+38F,abs(b)*3.40282347E+38F)); } + + if( LatteGPUState.glVendor == GLVENDOR_NVIDIA && !ActiveSettings::DumpShadersEnabled()) + fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){return mix(0.0, a*b, (a != 0.0) && (b != 0.0));}" _CRLF); // compiles faster on Nvidia and also results in lower RAM usage (OpenGL) + else + fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ if( a == 0.0 || b == 0.0 ) return 0.0; return a*b; }" _CRLF); + + // DXKV-like: fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ return (b==0.0 ? 0.0 : a) * (a==0.0 ? 
0.0 : b); }" _CRLF); + } +} + +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp" + +static void LatteDecompiler_emitAttributeImport(LatteDecompilerShaderContext* shaderContext, LatteParsedFetchShaderAttribute_t& attrib) +{ + auto src = shaderContext->shaderSource; + + static const char* dsMappingTableFloat[6] = { "int(attrDecoder.x)", "int(attrDecoder.y)", "int(attrDecoder.z)", "int(attrDecoder.w)", /*"floatBitsToInt(0.0)"*/ "0", /*"floatBitsToInt(1.0)"*/ "0x3f800000" }; + static const char* dsMappingTableInt[6] = { "int(attrDecoder.x)", "int(attrDecoder.y)", "int(attrDecoder.z)", "int(attrDecoder.w)", "0", "1" }; + + // get register index based on vtx semantic table + uint32 attributeShaderLoc = 0xFFFFFFFF; + for (sint32 f = 0; f < 32; f++) + { + if (shaderContext->contextRegisters[mmSQ_VTX_SEMANTIC_0 + f] == attrib.semanticId) + { + attributeShaderLoc = f; + break; + } + } + if (attributeShaderLoc == 0xFFFFFFFF) + return; // attribute is not mapped to VS input + uint32 registerIndex = attributeShaderLoc + 1; // R0 is skipped + // is register used? 
+ if ((shaderContext->analyzer.gprUseMask[registerIndex / 8] & (1 << (registerIndex % 8))) == 0) + { + src->addFmt("// skipped unused attribute for r{}" _CRLF, registerIndex); + return; + } + + LatteDecompiler_emitAttributeDecodeMSL(shaderContext->shader, src, &attrib); + + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{} = int4(", _getRegisterVarName(shaderContext, registerIndex)); + else + src->addFmt("{} = float4(", _getRegisterVarName(shaderContext, registerIndex)); + for (sint32 f = 0; f < 4; f++) + { + uint8 ds = attrib.ds[f]; + if (f > 0) + src->add(", "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType); + if (ds >= 6) + { + cemu_assert_unimplemented(); + ds = 4; // read as 0.0 + } + if (attrib.nfa != 1) + { + src->add(dsMappingTableFloat[ds]); + } + else + { + src->add(dsMappingTableInt[ds]); + } + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType); + } + src->add(");" _CRLF); +} + +void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader) +{ + bool isRectVertexShader = UseRectEmulation(*shaderContext->contextRegistersNew); + bool usesGeometryShader = UseGeometryShader(*shaderContext->contextRegistersNew, shaderContext->options->usesGeometryShader); + bool fetchVertexManually = (usesGeometryShader || (shaderContext->fetchShader && shaderContext->fetchShader->mtlFetchVertexManually)); + + StringBuf* src = new StringBuf(1024*1024*12); // reserve 12MB for generated source (we resize-to-fit at the end) + shaderContext->shaderSource = src; + + // debug info + src->addFmt("// shader {:016x}" _CRLF, shaderContext->shaderBaseHash); +#ifdef CEMU_DEBUG_ASSERT + src->addFmt("// usesIntegerValues: {}" _CRLF, shaderContext->analyzer.usesIntegerValues ? 
"true" : "false"); + src->addFmt(_CRLF); +#endif + // include metal standard library + src->add("#include " _CRLF); + src->add("using namespace metal;" _CRLF); + // header part (definitions for inputs and outputs) + LatteDecompiler::emitHeader(shaderContext, isRectVertexShader, usesGeometryShader, fetchVertexManually); + // helper functions + LatteDecompiler_emitHelperFunctions(shaderContext, src); + const char* functionType = ""; + const char* outputTypeName = ""; + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + if (fetchVertexManually) + { + // TODO: clean this up + // fetchVertex will modify vid in case of an object shader and an indexed draw + + // Vertex buffers + std::string vertexBufferDefinitions = "#define VERTEX_BUFFER_DEFINITIONS "; + std::string vertexBuffers = "#define VERTEX_BUFFERS "; + std::string inputFetchDefinition = "VertexIn fetchVertex("; + if (usesGeometryShader) + inputFetchDefinition += "thread uint&"; + else + inputFetchDefinition += "uint"; + inputFetchDefinition += " vid, uint iid"; + if (usesGeometryShader) + inputFetchDefinition += ", device uint* indexBuffer, uchar indexType"; + inputFetchDefinition += " VERTEX_BUFFER_DEFINITIONS) {\n"; + + // Index buffer + if (usesGeometryShader) + { + inputFetchDefinition += "if (indexType == 1) // UShort\n"; + inputFetchDefinition += "vid = ((device ushort*)indexBuffer)[vid];\n"; + inputFetchDefinition += "else if (indexType == 2) // UInt\n"; + inputFetchDefinition += "vid = ((device uint*)indexBuffer)[vid];\n"; + } + + inputFetchDefinition += "VertexIn in;\n"; + for (auto& bufferGroup : shaderContext->fetchShader->bufferGroups) + { + std::optional fetchType; + + uint32 bufferIndex = bufferGroup.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + uint32 bufferStride = (shaderContext->contextRegisters[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + for (sint32 j = 0; j < bufferGroup.attribCount; ++j) + { + auto& 
attr = bufferGroup.attrib[j]; + + uint32 semanticId = shaderContext->output->resourceMappingMTL.attributeMapping[attr.semanticId]; + if (semanticId == (uint32)-1) + continue; // attribute not used? + + std::string formatName; + uint8 componentCount = 0; + switch (GetMtlVertexFormat(attr.format)) + { + case MTL::VertexFormatUChar: + formatName = "uchar"; + componentCount = 1; + break; + case MTL::VertexFormatUChar2: + formatName = "uchar2"; + componentCount = 2; + break; + case MTL::VertexFormatUChar3: + formatName = "uchar3"; + componentCount = 3; + break; + case MTL::VertexFormatUChar4: + formatName = "uchar4"; + componentCount = 4; + break; + case MTL::VertexFormatUShort: + formatName = "ushort"; + componentCount = 1; + break; + case MTL::VertexFormatUShort2: + formatName = "ushort2"; + componentCount = 2; + break; + case MTL::VertexFormatUShort3: + formatName = "ushort3"; + componentCount = 3; + break; + case MTL::VertexFormatUShort4: + formatName = "ushort4"; + componentCount = 4; + break; + case MTL::VertexFormatUInt: + formatName = "uint"; + componentCount = 1; + break; + case MTL::VertexFormatUInt2: + formatName = "uint2"; + componentCount = 2; + break; + case MTL::VertexFormatUInt3: + formatName = "uint3"; + componentCount = 3; + break; + case MTL::VertexFormatUInt4: + formatName = "uint4"; + componentCount = 4; + break; + } + + // Get the fetch type + std::string fetchTypeStr; + if (attr.fetchType == LatteConst::VertexFetchType2::VERTEX_DATA) + fetchTypeStr = "vid"; + else if (attr.fetchType == LatteConst::VertexFetchType2::INSTANCE_DATA) + fetchTypeStr = "iid"; + else if (attr.fetchType == LatteConst::VertexFetchType2::NO_INDEX_OFFSET_DATA) + fetchTypeStr = "0"; // TODO: correct? 
+ + // Fetch the attribute + inputFetchDefinition += fmt::format("in.ATTRIBUTE_NAME{} = uint4(uint", semanticId); + if (componentCount != 1) + inputFetchDefinition += fmt::format("{}", componentCount); + inputFetchDefinition += fmt::format("(*(device {}*)", formatName); + inputFetchDefinition += fmt::format("(vertexBuffer{}", attr.attributeBufferIndex); + inputFetchDefinition += fmt::format(" + {} * {} + {}))", fetchTypeStr, bufferStride, attr.offset); + for (uint8 i = 0; i < (4 - componentCount); i++) + inputFetchDefinition += ", 0"; + inputFetchDefinition += ");\n"; + + if (fetchType.has_value()) + cemu_assert_debug(fetchType == attr.fetchType); + else + fetchType = attr.fetchType; + + if (attr.fetchType == LatteConst::INSTANCE_DATA) + { + cemu_assert_debug(attr.aluDivisor == 1); // other divisor not yet supported + } + } + + // TODO: fetch type + + vertexBufferDefinitions += fmt::format(", device uchar* vertexBuffer{} [[buffer({})]]", bufferIndex, GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); + vertexBuffers += fmt::format(", vertexBuffer{}", bufferIndex); + } + + inputFetchDefinition += "return in;\n"; + inputFetchDefinition += "}\n"; + + src->add(vertexBufferDefinitions.c_str()); + src->add("\n"); + src->add(vertexBuffers.c_str()); + src->add("\n"); + src->add(inputFetchDefinition.c_str()); + } + + if (usesGeometryShader) + { + functionType = "[[object, max_total_threads_per_threadgroup(VERTICES_PER_VERTEX_PRIMITIVE), max_total_threadgroups_per_mesh_grid(1)]]"; + outputTypeName = "void"; + } + else + { + functionType = "vertex"; + if (shaderContext->contextRegistersNew->IsRasterizationEnabled()) + outputTypeName = "VertexOut"; + else + outputTypeName = "void"; + } + break; + case LatteConst::ShaderType::Geometry: + functionType = "[[mesh, max_total_threads_per_threadgroup(1)]]"; + outputTypeName = "void"; + break; + case LatteConst::ShaderType::Pixel: + functionType = "fragment"; + outputTypeName = "FragmentOut"; + break; + } + // start of main + src->addFmt("{} 
{} main0(", functionType, outputTypeName); + LatteDecompiler::emitInputs(shaderContext, isRectVertexShader, usesGeometryShader, fetchVertexManually); + src->add(") {" _CRLF); + if (fetchVertexManually && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)) + { + if (shader->shaderType == LatteConst::ShaderType::Vertex) + { + if (usesGeometryShader) + { + // Calculate the imaginary vertex id + LattePrimitiveMode vsOutPrimType = shaderContext->contextRegistersNew->VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE(); + if (PrimitiveRequiresConnection(vsOutPrimType)) + src->add("uint vid = tig + tid;" _CRLF); + else + src->add("uint vid = tig * VERTICES_PER_VERTEX_PRIMITIVE + tid;" _CRLF); + src->add("uint iid = vid / supportBuffer.verticesPerInstance;" _CRLF); + src->add("vid %= supportBuffer.verticesPerInstance;" _CRLF); + + // Fetch the input + src->add("VertexIn in = fetchVertex(vid, iid, indexBuffer, indexType VERTEX_BUFFERS);" _CRLF); + + // Output is defined as object payload + src->add("object_data VertexOut& out = objectPayload.vertexOut[tid];" _CRLF); + } + else + { + // Fetch the input + src->add("VertexIn in = fetchVertex(vid, iid VERTEX_BUFFERS);" _CRLF); + } + } + else if (shader->shaderType == LatteConst::ShaderType::Geometry) + { + src->add("GeometryOut out;" _CRLF); + // The index of the current vertex that is being emitted + src->add("uint vertexIndex = 0;" _CRLF); + } + } + + if (shader->shaderType == LatteConst::ShaderType::Pixel || (shaderContext->contextRegistersNew->IsRasterizationEnabled() && !usesGeometryShader)) + { + src->addFmt("{} out;" _CRLF, outputTypeName); + } + + // variable definition + if (shaderContext->typeTracker.useArrayGPRs == false) + { + // each register is a separate variable + for (sint32 i = 0; i < 128; i++) + { + if (shaderContext->analyzer.usesRelativeGPRRead || (shaderContext->analyzer.gprUseMask[i / 8] & (1 << (i & 7))) != 0) + { + if 
(shaderContext->typeTracker.genIntReg) + src->addFmt("int4 R{}i = int4(0);" _CRLF, i); + else if (shaderContext->typeTracker.genFloatReg) + src->addFmt("float4 R{}f = float4(0.0);" _CRLF, i); + } + } + } + else + { + // registers are represented using a single large array + if (shaderContext->typeTracker.genIntReg) + src->addFmt("int4 Ri[128];" _CRLF); + else if (shaderContext->typeTracker.genFloatReg) + src->addFmt("float4 Rf[128];" _CRLF); + for (sint32 i = 0; i < 128; i++) + { + if (shaderContext->typeTracker.genIntReg) + src->addFmt("Ri[{}] = int4(0);" _CRLF, i); + else if (shaderContext->typeTracker.genFloatReg) + src->addFmt("Rf[{}] = float4(0.0);" _CRLF, i); + } + } + + if( shader->shaderType == LatteConst::ShaderType::Vertex ) + src->addFmt("uint4 attrDecoder;" _CRLF); + if (shaderContext->typeTracker.genIntReg) + src->addFmt("int backupReg0i, backupReg1i, backupReg2i, backupReg3i, backupReg4i;" _CRLF); + if (shaderContext->typeTracker.genFloatReg) + src->addFmt("float backupReg0f, backupReg1f, backupReg2f, backupReg3f, backupReg4f;" _CRLF); + if (shaderContext->typeTracker.genIntReg) + { + src->addFmt("int PV0ix = 0, PV0iy = 0, PV0iz = 0, PV0iw = 0, PV1ix = 0, PV1iy = 0, PV1iz = 0, PV1iw = 0;" _CRLF); + src->addFmt("int PS0i = 0, PS1i = 0;" _CRLF); + src->addFmt("int4 tempi = int4(0);" _CRLF); + } + if (shaderContext->typeTracker.genFloatReg) + { + src->addFmt("float PV0fx = 0.0, PV0fy = 0.0, PV0fz = 0.0, PV0fw = 0.0, PV1fx = 0.0, PV1fy = 0.0, PV1fz = 0.0, PV1fw = 0.0;" _CRLF); + src->addFmt("float PS0f = 0.0, PS1f = 0.0;" _CRLF); + src->addFmt("float4 tempf = float4(0.0);" _CRLF); + } + if (shaderContext->analyzer.hasGradientLookup) + { + src->add("float4 gradH;" _CRLF); + src->add("float4 gradV;" _CRLF); + } + src->add("float tempResultf;" _CRLF); + src->add("int tempResulti;" _CRLF); + src->add("int4 ARi = int4(0);" _CRLF); + src->add("bool predResult = true;" _CRLF); + if(shaderContext->analyzer.modifiesPixelActiveState ) + { + src->addFmt("bool 
activeMaskStack[{}];" _CRLF, shaderContext->analyzer.activeStackMaxDepth+1); + src->addFmt("bool activeMaskStackC[{}];" _CRLF, shaderContext->analyzer.activeStackMaxDepth+2); + for (sint32 i = 0; i < shaderContext->analyzer.activeStackMaxDepth; i++) + { + src->addFmt("activeMaskStack[{}] = false;" _CRLF, i); + } + for (sint32 i = 0; i < shaderContext->analyzer.activeStackMaxDepth+1; i++) + { + src->addFmt("activeMaskStackC[{}] = false;" _CRLF, i); + } + src->addFmt("activeMaskStack[0] = true;" _CRLF); + src->addFmt("activeMaskStackC[0] = true;" _CRLF); + src->addFmt("activeMaskStackC[1] = true;" _CRLF); + // generate vars for each subroutine + for (auto& subroutineInfo : shaderContext->list_subroutines) + { + sint32 subroutineMaxStackDepth = 0; + src->addFmt("bool activeMaskStackSub{:04x}[{}];" _CRLF, subroutineInfo.cfAddr, subroutineMaxStackDepth + 1); + src->addFmt("bool activeMaskStackCSub{:04x}[{}];" _CRLF, subroutineInfo.cfAddr, subroutineMaxStackDepth + 2); + } + } + // helper variables for cube maps (todo: Only emit when used) + if (shaderContext->analyzer.hasRedcCUBE) + { + src->add("float3 cubeMapSTM;" _CRLF); + src->add("int cubeMapFaceId;" _CRLF); + } + for(sint32 i=0; ioutput->textureUnitMask[i]) + continue; + if( shader->textureUnitDim[i] != Latte::E_DIM::DIM_CUBEMAP ) + continue; + src->addFmt("float cubeMapArrayIndex{} = 0.0;" _CRLF, i); + } + // init base offset for streamout buffer writes + if (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry) + { + for (sint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) + { + if(!shaderContext->output->streamoutBufferWriteMask[i]) + continue; + + cemu_assert_debug((shaderContext->output->streamoutBufferStride[i]&3) == 0); + + if (shader->shaderType == LatteConst::ShaderType::Vertex) // vertex shader + src->addFmt("int sbBase{} = supportBuffer.streamoutBufferBase{}/4 + (vid + supportBuffer.verticesPerInstance * iid)*{};" _CRLF, i, i, 
shaderContext->output->streamoutBufferStride[i] / 4); + else // geometry shader + { + uint32 gsOutPrimType = shaderContext->contextRegisters[mmVGT_GS_OUT_PRIM_TYPE]; + uint32 bytesPerVertex = shaderContext->contextRegisters[mmSQ_GS_VERT_ITEMSIZE] * 4; + uint32 maxVerticesInGS = ((shaderContext->contextRegisters[mmSQ_GSVS_RING_ITEMSIZE] & 0x7FFF) * 4) / bytesPerVertex; + + cemu_assert_debug(gsOutPrimType == 0); // currently we only properly handle GS output primitive points + + src->addFmt("int sbBase{} = supportBuffer.streamoutBufferBase{}/4 + (gl_PrimitiveIDIn * {})*{};" _CRLF, i, i, maxVerticesInGS, shaderContext->output->streamoutBufferStride[i] / 4); + } + } + + } + // code to load inputs from previous stage + if( shader->shaderType == LatteConst::ShaderType::Vertex ) + { + if( (shaderContext->analyzer.gprUseMask[0/8]&(1<<(0%8))) != 0 ) + { + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{} = int4(vid, 0, 0, iid);" _CRLF, _getRegisterVarName(shaderContext, 0)); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{} = float4(vid, 0, 0, iid);" _CRLF, _getRegisterVarName(shaderContext, 0)); // TODO: as_type(float4(vid, 0, 0, iid))? 
+ else + cemu_assert_unimplemented(); + } + + LatteFetchShader* parsedFetchShader = shaderContext->fetchShader; + for(auto& bufferGroup : parsedFetchShader->bufferGroups) + { + for(sint32 i=0; ibufferGroupsInvalid) + { + // these attributes point to non-existent buffers + // todo - figure out how the hardware actually handles this, currently we assume the input values are zero + for (sint32 i = 0; i < bufferGroup.attribCount; i++) + LatteDecompiler_emitAttributeImport(shaderContext, bufferGroup.attrib[i]); + } + } + else if (shader->shaderType == LatteConst::ShaderType::Pixel) + { + LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); + + uint32 psControl0 = shaderContext->contextRegisters[mmSPI_PS_IN_CONTROL_0]; + uint32 psControl1 = shaderContext->contextRegisters[mmSPI_PS_IN_CONTROL_1]; + + uint32 spiInterpControl = shaderContext->contextRegisters[mmSPI_INTERP_CONTROL_0]; + uint8 spriteEnable = (spiInterpControl >> 1) & 1; + cemu_assert_debug(spriteEnable == 0); + + uint8 frontFace_enabled = (psControl1 >> 8) & 1; + uint8 frontFace_chan = (psControl1 >> 9) & 3; + uint8 frontFace_allBits = (psControl1 >> 11) & 1; + uint8 frontFace_regIndex = (psControl1 >> 12) & 0x1F; + + // handle param_gen + if (psInputTable->paramGen != 0) + { + cemu_assert_debug((psInputTable->paramGen) == 1); // handle the other bits (the same set of coordinates with different perspective/projection settings?) 
+ uint32 paramGenGPRIndex = psInputTable->paramGenGPR; + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{} = pointCoord.xyxy;" _CRLF, _getRegisterVarName(shaderContext, paramGenGPRIndex)); + else + src->addFmt("{} = as_type(pointCoord.xyxy);" _CRLF, _getRegisterVarName(shaderContext, paramGenGPRIndex)); + } + + for (sint32 i = 0; i < psInputTable->count; i++) + { + uint32 psControl0 = shaderContext->contextRegisters[mmSPI_PS_IN_CONTROL_0]; + uint32 spi0_paramGen = (psControl0 >> 15) & 0xF; + + sint32 gprIndex = i;// +spi0_paramGen + paramRegOffset; + if ((shaderContext->analyzer.gprUseMask[gprIndex / 8] & (1 << (gprIndex % 8))) == 0 && shaderContext->analyzer.usesRelativeGPRRead == false) + continue; + uint32 psInputSemanticId = psInputTable->import[i].semanticId; + if (psInputSemanticId == LATTE_ANALYZER_IMPORT_INDEX_SPIPOSITION) + { + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{} = GET_FRAGCOORD();" _CRLF, _getRegisterVarName(shaderContext, gprIndex)); + else + src->addFmt("{} = as_type(GET_FRAGCOORD());" _CRLF, _getRegisterVarName(shaderContext, gprIndex)); + continue; + } + + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{} = as_type(in.passParameterSem{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{} = in.passParameterSem{};" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); + else + cemu_assert_unimplemented(); + } + // front facing attribute + if (frontFace_enabled) + { + if ((shaderContext->analyzer.gprUseMask[0 / 8] & (1 << (0 % 8))) != 0) + { + if (frontFace_allBits) + cemu_assert_debug(false); + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{}.{} = as_type(frontFacing ? 
1.0 : 0.0);" _CRLF, _getRegisterVarName(shaderContext, frontFace_regIndex), _getElementStrByIndex(frontFace_chan)); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{}.{} = frontFacing ? 1.0 : 0.0;" _CRLF, _getRegisterVarName(shaderContext, frontFace_regIndex), _getElementStrByIndex(frontFace_chan)); + else + cemu_assert_debug(false); + } + } + } + for(auto& cfInstruction : shaderContext->cfInstructions) + LatteDecompiler_emitClauseCodeMSL(shaderContext, &cfInstruction, false); + //if(shader->shaderType == LatteConst::ShaderType::Geometry) + // src->add("EndPrimitive();" _CRLF); + // vertex shader should write renderstate point size at the end if required but not modified by shader + if (shaderContext->analyzer.outputPointSize && !shaderContext->analyzer.writesPointSize) + { + if (shader->shaderType == LatteConst::ShaderType::Vertex && !shaderContext->options->usesGeometryShader && shaderContext->contextRegistersNew->IsRasterizationEnabled()) + src->add("out.pointSize = supportBuffer.pointSize;" _CRLF); + } + + if (usesGeometryShader && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)) + { + if (shader->shaderType == LatteConst::ShaderType::Vertex) + { + src->add("if (tid == 0) {" _CRLF); + src->add("meshGridProperties.set_threadgroups_per_grid(uint3(1, 1, 1));" _CRLF); + src->add("}" _CRLF); + } + else if (shader->shaderType == LatteConst::ShaderType::Geometry) + { + src->add("mesh.set_primitive_count(GET_PRIMITIVE_COUNT(vertexIndex));" _CRLF); + + // Set indices + if (shaderContext->contextRegisters[mmVGT_GS_OUT_PRIM_TYPE] == 1) // Line strip + { + src->add("for (uint8_t i = 0; i < GET_PRIMITIVE_COUNT(vertexIndex) * 2; i++) {" _CRLF); + src->add("mesh.set_index(i, (i 2 3) + i % 2);" _CRLF); + src->add("}" _CRLF); + } + else if (shaderContext->contextRegisters[mmVGT_GS_OUT_PRIM_TYPE] == 2) // Triangle strip + { + src->add("for (uint8_t i = 0; i < 
GET_PRIMITIVE_COUNT(vertexIndex) * 3; i++) {" _CRLF); + src->add("mesh.set_index(i, (i / 3) + i % 3);" _CRLF); + src->add("}" _CRLF); + } + else + { + src->add("for (uint8_t i = 0; i < vertexIndex; i++) {" _CRLF); + src->add("mesh.set_index(i, i);" _CRLF); + src->add("}" _CRLF); + } + } + } + + if (shader->shaderType == LatteConst::ShaderType::Pixel || (shaderContext->contextRegistersNew->IsRasterizationEnabled() && !usesGeometryShader)) + { + // Return + src->add("return out;" _CRLF); + } + + // end of shader main + src->add("}" _CRLF); + src->shrink_to_fit(); + shader->strBuf_shaderSource = src; +} diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp new file mode 100644 index 0000000000..9ee5c31f1f --- /dev/null +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp @@ -0,0 +1,511 @@ +#include "Cafe/HW/Latte/Core/LatteConst.h" +#include "Cafe/HW/Latte/Core/LatteShaderAssembly.h" +#include "Cafe/HW/Latte/ISA/RegDefines.h" +#include "Cafe/HW/Latte/Core/Latte.h" +#include "Cafe/HW/Latte/Core/LatteDraw.h" +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" +#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "util/helpers/StringBuf.h" + +#define _CRLF "\r\n" + +static void _readLittleEndianAttributeU32x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = in.attrDataSem{};" _CRLF, attributeInputIndex); +} + +static void _readLittleEndianAttributeU32x3(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = uint4(in.attrDataSem{}.xyz,0);" _CRLF, attributeInputIndex); +} + +static void _readLittleEndianAttributeU32x2(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = 
uint4(in.attrDataSem{}.xy,0,0);" _CRLF, attributeInputIndex); +} + +static void _readLittleEndianAttributeU32x1(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = uint4(in.attrDataSem{}.x,0,0,0);" _CRLF, attributeInputIndex); +} + +static void _readLittleEndianAttributeU16x2(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = uint4(in.attrDataSem{}.xy,0,0);" _CRLF, attributeInputIndex); +} + +static void _readLittleEndianAttributeU16x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = in.attrDataSem{};" _CRLF, attributeInputIndex); +} + +static void _readBigEndianAttributeU32x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = in.attrDataSem{};" _CRLF, attributeInputIndex); + src->add("attrDecoder = (attrDecoder>>24)|((attrDecoder>>8)&0xFF00)|((attrDecoder<<8)&0xFF0000)|((attrDecoder<<24));" _CRLF); +} + +static void _readBigEndianAttributeU32x3(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xyz = in.attrDataSem{}.xyz;" _CRLF, attributeInputIndex); + src->add("attrDecoder.xyz = (attrDecoder.xyz>>24)|((attrDecoder.xyz>>8)&0xFF00)|((attrDecoder.xyz<<8)&0xFF0000)|((attrDecoder.xyz<<24));" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU32x2(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xy = in.attrDataSem{}.xy;" _CRLF, attributeInputIndex); + src->add("attrDecoder.xy = (attrDecoder.xy>>24)|((attrDecoder.xy>>8)&0xFF00)|((attrDecoder.xy<<8)&0xFF0000)|((attrDecoder.xy<<24));" _CRLF); + src->add("attrDecoder.z = 0;" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU32x1(LatteDecompilerShader* shaderContext, 
StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.x = in.attrDataSem{}.x;" _CRLF, attributeInputIndex); + src->add("attrDecoder.x = (attrDecoder.x>>24)|((attrDecoder.x>>8)&0xFF00)|((attrDecoder.x<<8)&0xFF0000)|((attrDecoder.x<<24));" _CRLF); + src->add("attrDecoder.y = 0;" _CRLF); + src->add("attrDecoder.z = 0;" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU16x1(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xy = in.attrDataSem{}.xy;" _CRLF, attributeInputIndex); + src->add("attrDecoder.x = ((attrDecoder.x>>8)&0xFF)|((attrDecoder.x<<8)&0xFF00);" _CRLF); + src->add("attrDecoder.y = 0;" _CRLF); + src->add("attrDecoder.z = 0;" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU16x2(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xy = in.attrDataSem{}.xy;" _CRLF, attributeInputIndex); + src->add("attrDecoder.xy = ((attrDecoder.xy>>8)&0xFF)|((attrDecoder.xy<<8)&0xFF00);" _CRLF); + src->add("attrDecoder.z = 0;" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU16x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xyzw = in.attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); + src->add("attrDecoder = ((attrDecoder>>8)&0xFF)|((attrDecoder<<8)&0xFF00);" _CRLF); +} + +void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext, StringBuf* src, LatteParsedFetchShaderAttribute_t* attrib) +{ + if (attrib->attributeBufferIndex >= Latte::GPU_LIMITS::NUM_VERTEX_BUFFERS) + { + src->add("attrDecoder = int4(0);" _CRLF); + return; + } + + uint32 attributeInputIndex = attrib->semanticId; + if( attrib->endianSwap == LatteConst::VertexFetchEndianMode::SWAP_U32 ) + { + if( attrib->format == FMT_32_32_32_32_FLOAT && attrib->nfa == 2 ) + { 
+ _readBigEndianAttributeU32x4(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_32_32_32_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU32x3(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_32_32_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU32x2(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_32_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_2_10_10_10 && attrib->nfa == 0 ) + { + _readBigEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + // Bayonetta 2 uses this format to store normals + src->add("attrDecoder.xyzw = uint4((attrDecoder.x>>0)&0x3FF,(attrDecoder.x>>10)&0x3FF,(attrDecoder.x>>20)&0x3FF,(attrDecoder.x>>30)&0x3);" _CRLF); + if (attrib->isSigned != 0) + { + src->add("if( (attrDecoder.x&0x200) != 0 ) attrDecoder.x |= 0xFFFFFC00;" _CRLF); + src->add("if( (attrDecoder.y&0x200) != 0 ) attrDecoder.y |= 0xFFFFFC00;" _CRLF); + src->add("if( (attrDecoder.z&0x200) != 0 ) attrDecoder.z |= 0xFFFFFC00;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/511.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/511.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/511.0,-1.0));" _CRLF); + } + else + { + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/1023.0,-1.0));" _CRLF); + } + src->add("attrDecoder.w = as_type(float(attrDecoder.w));" _CRLF); // unsure? 
+ + } + else if( attrib->format == FMT_32_32_32_32 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU32x4(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_32_32_32 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU32x3(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_32_32 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU32x2(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32 && attrib->nfa == 1 && attrib->isSigned == 0) + { + _readBigEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32 && attrib->nfa == 1 && attrib->isSigned == 1) + { + // we can just read the signed s32 as a u32 since no sign-extension is necessary + _readBigEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned == 0 ) + { + // seen in Minecraft Wii U Edition + src->addFmt("attrDecoder.xyzw = as_type(float4(in.attrDataSem{}.wzyx)/255.0);" _CRLF, attributeInputIndex); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned != 0 ) + { + // seen in Minecraft Wii U Edition + src->addFmt("attrDecoder.xyzw = in.attrDataSem{}.wzyx;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.z&0x80) != 0 ) attrDecoder.z |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.w&0x80) != 0 ) attrDecoder.w |= 0xFFFFFF00;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/127.0,-1.0));" _CRLF); + 
src->add("attrDecoder.w = as_type(max(float(int(attrDecoder.w))/127.0,-1.0));" _CRLF); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + // seen in Minecraft Wii U Edition + src->addFmt("attrDecoder.xyzw = in.attrDataSem{}.wzyx;" _CRLF, attributeInputIndex); + } + else if (attrib->format == FMT_8_8_8_8 && attrib->nfa == 2 && attrib->isSigned == 0) + { + // seen in Ben 10 Omniverse + src->addFmt("attrDecoder.xyzw = as_type(float4(in.attrDataSem{}.wzyx));" _CRLF, attributeInputIndex); + } + else + { + cemuLog_log(LogType::Force, "_emitAttributeDecode(): Unsupported fmt {:02x} nfa {} signed {} endian {}\n", attrib->format, attrib->nfa, attrib->isSigned, attrib->endianSwap); + cemu_assert_unimplemented(); + } + } + else if( attrib->endianSwap == LatteConst::VertexFetchEndianMode::SWAP_NONE ) + { + if( attrib->format == FMT_32_32_32_32_FLOAT && attrib->nfa == 2 ) + { + _readLittleEndianAttributeU32x4(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32_32_32_FLOAT && attrib->nfa == 2) + { + _readLittleEndianAttributeU32x3(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32_32_FLOAT && attrib->nfa == 2) + { + // seen in Cities of Gold + _readLittleEndianAttributeU32x2(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32 && attrib->nfa == 1 && attrib->isSigned == 0) + { + // seen in Nano Assault Neo + _readLittleEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_2_10_10_10 && attrib->nfa == 0 && attrib->isSigned == 0) + { + // seen in Fast Racing Neo + _readLittleEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xyzw = uint4((attrDecoder.x>>0)&0x3FF,(attrDecoder.x>>10)&0x3FF,(attrDecoder.x>>20)&0x3FF,(attrDecoder.x>>30)&0x3);" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.y = 
as_type(max(float(int(attrDecoder.y))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.w = as_type(float(attrDecoder.w));" _CRLF); // todo - is this correct? + } + else if (attrib->format == FMT_16_16_16_16 && attrib->nfa == 0 && attrib->isSigned != 0) + { + // seen in CoD ghosts + _readLittleEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.w = as_type(max(float(int(attrDecoder.w))/32767.0,-1.0));" _CRLF); + } + else if( attrib->format == FMT_16_16_16_16 && attrib->nfa == 2 && attrib->isSigned == 1 ) + { + // seen in Rabbids Land + _readLittleEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.xyzw = as_type(float4(int4(attrDecoder)));" _CRLF); + } + else if (attrib->format == FMT_16_16_16_16_FLOAT && attrib->nfa == 2) + { + // seen in Giana Sisters: Twisted Dreams + _readLittleEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + // TODO: uint4? 
+ src->add("attrDecoder.xyzw = as_type(float4(float2(as_type(attrDecoder.x|(attrDecoder.y<<16))),float2(as_type(attrDecoder.z|(attrDecoder.w<<16)))));" _CRLF); + } + else if (attrib->format == FMT_16_16 && attrib->nfa == 0 && attrib->isSigned != 0) + { + // seen in Nano Assault Neo + _readLittleEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/32767.0,-1.0));" _CRLF); + } + else if (attrib->format == FMT_16_16_FLOAT && attrib->nfa == 2) + { + // seen in Giana Sisters: Twisted Dreams + _readLittleEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xy = as_type(float2(as_type(attrDecoder.x|(attrDecoder.y<<16))));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned == 0 ) + { + src->addFmt("attrDecoder.xyzw = as_type(float4(in.attrDataSem{}.xyzw)/255.0);" _CRLF, attributeInputIndex); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned != 0 ) + { + src->addFmt("attrDecoder.xyzw = in.attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.z&0x80) != 0 ) attrDecoder.z |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.w&0x80) != 0 ) attrDecoder.w |= 0xFFFFFF00;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.z = 
as_type(max(float(int(attrDecoder.z))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.w = as_type(max(float(int(attrDecoder.w))/127.0,-1.0));" _CRLF); + } + else if (attrib->format == FMT_8_8_8_8 && attrib->nfa == 1 && attrib->isSigned == 0) + { + src->addFmt("attrDecoder.xyzw = in.attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); + } + else if (attrib->format == FMT_8_8_8_8 && attrib->nfa == 1 && attrib->isSigned != 0) + { + // seen in Sonic Lost World + src->addFmt("attrDecoder.xyzw = in.attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.z&0x80) != 0 ) attrDecoder.z |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.w&0x80) != 0 ) attrDecoder.w |= 0xFFFFFF00;" _CRLF); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 2 && attrib->isSigned == 0 ) + { + // seen in One Piece + // TODO: uint4? 
+ src->addFmt("attrDecoder.xyzw = as_type(float4(in.attrDataSem{}.xyzw));" _CRLF, attributeInputIndex); + } + else if (attrib->format == FMT_8_8 && attrib->nfa == 0 && attrib->isSigned == 0) + { + if( (attrib->offset&3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL ) + { + // AMD workaround + src->addFmt("attrDecoder.xy = as_type(float2(in.attrDataSem{}.zw)/255.0);" _CRLF, attributeInputIndex); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else + { + src->addFmt("attrDecoder.xy = as_type(float2(in.attrDataSem{}.xy)/255.0);" _CRLF, attributeInputIndex); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + } + else if (attrib->format == FMT_8_8 && attrib->nfa == 2 && attrib->isSigned == 0) + { + // seen in BotW + if ((attrib->offset & 3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL) + { + // AMD workaround + src->addFmt("attrDecoder.xy = as_type(float2(in.attrDataSem{}.zw));" _CRLF, attributeInputIndex); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else + { + src->addFmt("attrDecoder.xy = as_type(float2(in.attrDataSem{}.xy));" _CRLF, attributeInputIndex); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + } + else if (attrib->format == FMT_8_8 && attrib->nfa == 0 && attrib->isSigned != 0) + { + if ((attrib->offset & 3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL) + { + // AMD workaround + src->addFmt("attrDecoder.xy = in.attrDataSem{}.zw;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else + { + src->addFmt("attrDecoder.xy = 
in.attrDataSem{}.xy;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + } + else if (attrib->format == FMT_8_8 && attrib->nfa == 1 && attrib->isSigned == 0) + { + if ((attrib->offset & 3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL) + { + // AMD workaround + src->addFmt("attrDecoder.xyzw = uint4(in.attrDataSem{}.zw,0,0);" _CRLF, attributeInputIndex); + } + else + { + src->addFmt("attrDecoder.xyzw = uint4(in.attrDataSem{}.xy,0,0);" _CRLF, attributeInputIndex); + } + } + else if( attrib->format == FMT_8 && attrib->nfa == 0 && attrib->isSigned == 0 ) + { + // seen in Pikmin 3 + src->addFmt("attrDecoder.x = as_type(float(in.attrDataSem{}.x)/255.0);" _CRLF, attributeInputIndex); + src->add("attrDecoder.yzw = uint3(0);" _CRLF); + } + else if( attrib->format == FMT_8 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + src->addFmt("attrDecoder.xyzw = uint4(in.attrDataSem{}.x,0,0,0);" _CRLF, attributeInputIndex); + } + else + { + cemuLog_log(LogType::Force, "_emitAttributeDecode(): Unsupported fmt {:02x} nfa {} signed {} endian {}\n", attrib->format, attrib->nfa, attrib->isSigned, attrib->endianSwap); + cemu_assert_debug(false); + } + } + else if( attrib->endianSwap == LatteConst::VertexFetchEndianMode::SWAP_U16 ) + { + if( attrib->format == FMT_16_16_16_16_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + // TODO: uint4? 
+ src->add("attrDecoder.xyzw = as_type(float4(float2(as_type(attrDecoder.x|(attrDecoder.y<<16))),float2(as_type(attrDecoder.z|(attrDecoder.w<<16)))));" _CRLF); + } + else if (attrib->format == FMT_16_16_16_16 && attrib->nfa == 0 && attrib->isSigned != 0) + { + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.w = as_type(max(float(int(attrDecoder.w))/32767.0,-1.0));" _CRLF); + } + else if (attrib->format == FMT_16_16_16_16 && attrib->nfa == 0 && attrib->isSigned == 0) + { + // seen in BotW + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.x = as_type(float(int(attrDecoder.x))/65535.0);" _CRLF); + src->add("attrDecoder.y = as_type(float(int(attrDecoder.y))/65535.0);" _CRLF); + src->add("attrDecoder.z = as_type(float(int(attrDecoder.z))/65535.0);" _CRLF); + src->add("attrDecoder.w = as_type(float(int(attrDecoder.w))/65535.0);" _CRLF); + } + else if( attrib->format == FMT_16_16_16_16 && attrib->nfa == 2 && attrib->isSigned != 0 ) + { + // seen in Minecraft Wii U Edition + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;" 
_CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type(float(int(attrDecoder.x)));" _CRLF); + src->add("attrDecoder.y = as_type(float(int(attrDecoder.y)));" _CRLF); + src->add("attrDecoder.z = as_type(float(int(attrDecoder.z)));" _CRLF); + src->add("attrDecoder.w = as_type(float(int(attrDecoder.w)));" _CRLF); + } + else if( attrib->format == FMT_16_16_16_16 && attrib->nfa == 1 && attrib->isSigned != 0 ) + { + // seen in Minecraft Wii U Edition + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + } + else if( attrib->format == FMT_16_16_16_16 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_16_16_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xy = as_type(float2(as_type(attrDecoder.x|(attrDecoder.y<<16))));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 0 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xy = as_type(float2(float(attrDecoder.x), float(attrDecoder.y))/65535.0);" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 0 && attrib->isSigned != 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( 
(attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 1 && attrib->isSigned != 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 2 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xy = as_type(float2(float(attrDecoder.x), float(attrDecoder.y)));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 2 && attrib->isSigned != 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.xy = as_type(float2(float(int(attrDecoder.x)), float(int(attrDecoder.y))));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if (attrib->format == FMT_16 && attrib->nfa == 1 && attrib->isSigned == 0) + { + _readBigEndianAttributeU16x1(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_16 && attrib->nfa == 0 && attrib->isSigned == 0) + { + // seen in CoD ghosts + _readBigEndianAttributeU16x1(shaderContext, src, 
attributeInputIndex); + src->add("attrDecoder.x = as_type(float(int(attrDecoder.x))/65535.0);" _CRLF); + } + else + { + cemuLog_logDebug(LogType::Force, "_emitAttributeDecode(): Unsupported fmt {:02x} nfa {} signed {} endian {}", attrib->format, attrib->nfa, attrib->isSigned, attrib->endianSwap); + } + } + else + { + cemu_assert_debug(false); + } +} diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp new file mode 100644 index 0000000000..f61abcb184 --- /dev/null +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -0,0 +1,554 @@ +#pragma once + +#include "Common/precompiled.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Core/LatteShader.h" + +namespace LatteDecompiler +{ + static void _emitUniformVariables(LatteDecompilerShaderContext* decompilerContext, bool usesGeometryShader) + { + auto src = decompilerContext->shaderSource; + + auto& uniformOffsets = decompilerContext->output->uniformOffsetsVK; + + src->add("struct SupportBuffer {" _CRLF); + + sint32 uniformCurrentOffset = 0; + auto shader = decompilerContext->shader; + auto shaderType = decompilerContext->shader->shaderType; + if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_REMAPPED) + { + // uniform registers or buffers are accessed statically with predictable offsets + // this allows us to remap the used entries into a more compact array + src->addFmt("int4 remapped[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); + uniformOffsets.offset_remapped = uniformCurrentOffset; + uniformCurrentOffset += 16 * shader->list_remappedUniformEntries.size(); + } + else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE) + { + uint32 cfileSize = decompilerContext->analyzer.uniformRegisterAccessTracker.DetermineSize(decompilerContext->shaderBaseHash, 256); + // full or 
partial uniform register file has to be present + src->addFmt("int4 uniformRegister[{}];" _CRLF, cfileSize); + uniformOffsets.offset_uniformRegister = uniformCurrentOffset; + uniformOffsets.count_uniformRegister = cfileSize; + uniformCurrentOffset += 16 * cfileSize; + } + // special uniforms + bool hasAnyViewportScaleDisabled = + !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_SCALE_ENA() || + !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Y_SCALE_ENA() || + !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Z_SCALE_ENA(); + + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex && hasAnyViewportScaleDisabled) + { + // aka GX2 special state 0 + uniformCurrentOffset = (uniformCurrentOffset + 7)&~7; + src->add("float2 windowSpaceToClipSpaceTransform;" _CRLF); + uniformOffsets.offset_windowSpaceToClipSpaceTransform = uniformCurrentOffset; + uniformCurrentOffset += 8; + } + bool alphaTestEnable = decompilerContext->contextRegistersNew->SX_ALPHA_TEST_CONTROL.get_ALPHA_TEST_ENABLE(); + if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel && alphaTestEnable) + { + uniformCurrentOffset = (uniformCurrentOffset + 3)&~3; + src->add("float alphaTestRef;" _CRLF); + uniformOffsets.offset_alphaTestRef = uniformCurrentOffset; + uniformCurrentOffset += 4; + } + if (decompilerContext->analyzer.outputPointSize && decompilerContext->analyzer.writesPointSize == false) + { + if ((decompilerContext->shaderType == LatteConst::ShaderType::Vertex && !decompilerContext->options->usesGeometryShader) || + decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + uniformCurrentOffset = (uniformCurrentOffset + 3)&~3; + src->add("float pointSize;" _CRLF); + uniformOffsets.offset_pointSize = uniformCurrentOffset; + uniformCurrentOffset += 4; + } + } + // define fragCoordScale which holds the xy scale for render target resolution vs effective resolution + if (shader->shaderType == 
LatteConst::ShaderType::Pixel) + { + uniformCurrentOffset = (uniformCurrentOffset + 7)&~7; + src->add("float2 fragCoordScale;" _CRLF); + uniformOffsets.offset_fragCoordScale = uniformCurrentOffset; + uniformCurrentOffset += 8; + } + // provide scale factor for every texture that is accessed via texel coordinates (texelFetch) + for (sint32 t = 0; t < LATTE_NUM_MAX_TEX_UNITS; t++) + { + if (decompilerContext->analyzer.texUnitUsesTexelCoordinates.test(t) == false) + continue; + uniformCurrentOffset = (uniformCurrentOffset + 7) & ~7; + src->addFmt("float2 tex{}Scale;" _CRLF, t); + uniformOffsets.offset_texScale[t] = uniformCurrentOffset; + uniformCurrentOffset += 8; + } + // define verticesPerInstance + streamoutBufferBaseX + if ((shader->shaderType == LatteConst::ShaderType::Vertex && + usesGeometryShader) || + (decompilerContext->analyzer.useSSBOForStreamout && + (shader->shaderType == LatteConst::ShaderType::Vertex && !decompilerContext->options->usesGeometryShader) || + (shader->shaderType == LatteConst::ShaderType::Geometry))) + { + src->add("int verticesPerInstance;" _CRLF); + uniformOffsets.offset_verticesPerInstance = uniformCurrentOffset; + uniformCurrentOffset += 4; + for (uint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) + { + if (decompilerContext->output->streamoutBufferWriteMask[i]) + { + src->addFmt("int streamoutBufferBase{};" _CRLF, i); + uniformOffsets.offset_streamoutBufferBase[i] = uniformCurrentOffset; + uniformCurrentOffset += 4; + } + } + } + + src->add("};" _CRLF _CRLF); + + uniformOffsets.offset_endOfBlock = uniformCurrentOffset; + } + + static void _emitUniformBuffers(LatteDecompilerShaderContext* decompilerContext) + { + auto shaderSrc = decompilerContext->shaderSource; + // uniform buffer definition + if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK) + { + for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++) + { + if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess()) + 
continue; + + cemu_assert_debug(decompilerContext->output->resourceMappingMTL.uniformBuffersBindingPoint[i] >= 0); + + shaderSrc->addFmt("struct UBuff{} {{" _CRLF, i); + shaderSrc->addFmt("float4 d[{}];" _CRLF, decompilerContext->analyzer.uniformBufferAccessTracker[i].DetermineSize(decompilerContext->shaderBaseHash, LATTE_GLSL_DYNAMIC_UNIFORM_BLOCK_SIZE)); + shaderSrc->add("};" _CRLF _CRLF); + } + } + else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_REMAPPED) + { + // already generated in _emitUniformVariables + } + else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE) + { + // already generated in _emitUniformVariables + } + else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_NONE) + { + // no uniforms used + } + else + { + cemu_assert_debug(false); + } + } + + static void _emitAttributes(LatteDecompilerShaderContext* decompilerContext, bool fetchVertexManually) + { + auto src = decompilerContext->shaderSource; + std::string attributeNames; + + if (decompilerContext->shader->shaderType == LatteConst::ShaderType::Vertex) + { + src->add("struct VertexIn {" _CRLF); + // attribute inputs + for (uint32 i = 0; i < LATTE_NUM_MAX_ATTRIBUTE_LOCATIONS; i++) + { + if (decompilerContext->analyzer.inputAttributSemanticMask[i]) + { + cemu_assert_debug(decompilerContext->output->resourceMappingMTL.attributeMapping[i] >= 0); + + src->addFmt("uint4 attrDataSem{}", i); + if (fetchVertexManually) + attributeNames += "#define ATTRIBUTE_NAME" + std::to_string((sint32)decompilerContext->output->resourceMappingMTL.attributeMapping[i]) + " attrDataSem" + std::to_string(i) + "\n"; + else + src->addFmt(" [[attribute({})]]", (sint32)decompilerContext->output->resourceMappingMTL.attributeMapping[i]); + src->add(";" _CRLF); + } + } + src->add("};" _CRLF _CRLF); + } + src->addFmt("{}", attributeNames); + } + + static void _emitVSOutputs(LatteDecompilerShaderContext* shaderContext, bool 
isRectVertexShader) + { + auto* src = shaderContext->shaderSource; + + src->add("struct VertexOut {" _CRLF); + src->add("float4 position [[position]] [[invariant]];" _CRLF); + if (shaderContext->analyzer.outputPointSize) + src->add("float pointSize [[point_size]];" _CRLF); + + LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); + auto parameterMask = shaderContext->shader->outputParameterMask; + bool psInputsWritten[GPU7_PS_MAX_INPUTS] = {false}; + for (uint32 i = 0; i < 32; i++) + { + if ((parameterMask&(1 << i)) == 0) + continue; + uint32 vsSemanticId = _getVertexShaderOutParamSemanticId(shaderContext->contextRegisters, i); + if (vsSemanticId > LATTE_ANALYZER_IMPORT_INDEX_PARAM_MAX) + continue; + // get import based on semanticId + sint32 psInputIndex = -1; + for (sint32 f = 0; f < psInputTable->count; f++) + { + if (psInputTable->import[f].semanticId == vsSemanticId) + { + psInputIndex = f; + break; + } + } + if (psInputIndex == -1) + continue; // no ps input + + psInputsWritten[psInputIndex] = true; + + src->addFmt("float4 passParameterSem{}", psInputTable->import[psInputIndex].semanticId); + if (!isRectVertexShader) + { + src->addFmt(" [[user(locn{})]]", psInputIndex); + if (psInputTable->import[psInputIndex].isFlat) + src->add(" [[flat]]"); + if (psInputTable->import[psInputIndex].isNoPerspective) + src->add(" [[center_no_perspective]]"); + } + src->addFmt(";" _CRLF); + } + + // TODO: handle this in the fragment shader instead? 
+ // Declare all PS inputs that are not written by the VS + for (uint32 i = 0; i < psInputTable->count; i++) + { + if (psInputsWritten[i]) + continue; + + if (psInputTable->import[i].semanticId > LATTE_ANALYZER_IMPORT_INDEX_PARAM_MAX) + continue; + + src->addFmt("float4 unknown{} [[user(locn{})]];" _CRLF, psInputTable->import[i].semanticId, i); + } + + src->add("};" _CRLF _CRLF); + + if (isRectVertexShader) + { + src->add("struct ObjectPayload {" _CRLF); + src->add("VertexOut vertexOut[VERTICES_PER_VERTEX_PRIMITIVE];" _CRLF); + src->add("};" _CRLF _CRLF); + } + } + + static void _emitPSInputs(LatteDecompilerShaderContext* shaderContext) + { + auto* src = shaderContext->shaderSource; + + src->add("#define GET_FRAGCOORD() float4(in.position.xy * supportBuffer.fragCoordScale.xy, in.position.z, 1.0 / in.position.w)" _CRLF); + + src->add("struct FragmentIn {" _CRLF); + src->add("float4 position [[position]];" _CRLF); + + LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); + for (sint32 i = 0; i < psInputTable->count; i++) + { + if (psInputTable->import[i].semanticId > LATTE_ANALYZER_IMPORT_INDEX_PARAM_MAX) + continue; + src->addFmt("float4 passParameterSem{}", psInputTable->import[i].semanticId); + src->addFmt(" [[user(locn{})]]", i); + if (psInputTable->import[i].isFlat) + src->add(" [[flat]]"); + if (psInputTable->import[i].isNoPerspective) + src->add(" [[center_no_perspective]]"); + src->add(";" _CRLF); + } + + src->add("};" _CRLF _CRLF); + } + + static void _emitInputsAndOutputs(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader, bool usesGeometryShader, bool fetchVertexManually) + { + auto src = decompilerContext->shaderSource; + + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) + { + _emitAttributes(decompilerContext, fetchVertexManually); + } + else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) + { + _emitPSInputs(decompilerContext); + + src->add("struct FragmentOut {" _CRLF); + + 
// generate pixel outputs for pixel shader + for (uint32 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + if ((decompilerContext->shader->pixelColorOutputMask & (1 << i)) != 0) + { + auto dataType = GetColorBufferDataType(i, *decompilerContext->contextRegistersNew); + if (dataType != MetalDataType::NONE) + { + src->addFmt("{} passPixelColor{} [[color({})]];" _CRLF, GetDataTypeStr(dataType), i, i); + } + } + } + + // generate depth output for pixel shader + if (decompilerContext->shader->depthMask) + src->add("float passDepth [[depth(any)]];" _CRLF); + + src->add("};" _CRLF _CRLF); + } + + if (!usesGeometryShader || isRectVertexShader) + { + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex && decompilerContext->contextRegistersNew->IsRasterizationEnabled()) + _emitVSOutputs(decompilerContext, isRectVertexShader); + } + else + { + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex || decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + src->add("struct VertexOut {" _CRLF); + uint32 ringParameterCountVS2GS = 0; + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) + { + ringParameterCountVS2GS = decompilerContext->shader->ringParameterCount; + } + else + { + ringParameterCountVS2GS = decompilerContext->shader->ringParameterCountFromPrevStage; + } + for (uint32 f = 0; f < ringParameterCountVS2GS; f++) + src->addFmt("int4 passParameterSem{};" _CRLF, f); + src->add("};" _CRLF _CRLF); + src->add("struct ObjectPayload {" _CRLF); + src->add("VertexOut vertexOut[VERTICES_PER_VERTEX_PRIMITIVE];" _CRLF); + src->add("};" _CRLF _CRLF); + } + if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + // parameters shared between geometry and pixel shader + uint32 ringItemSize = decompilerContext->contextRegisters[mmSQ_GSVS_RING_ITEMSIZE] & 0x7FFF; + if ((ringItemSize & 0xF) != 0) + debugBreakpoint(); + if (((decompilerContext->contextRegisters[mmSQ_GSVS_RING_ITEMSIZE] & 0x7FFF) & 0xF) != 0) + 
debugBreakpoint(); + + src->add("struct GeometryOut {" _CRLF); + src->add("float4 position [[position]];" _CRLF); + for (sint32 p = 0; p < decompilerContext->parsedGSCopyShader->numParam; p++) + { + if (decompilerContext->parsedGSCopyShader->paramMapping[p].exportType != 2) + continue; + src->addFmt("float4 passParameterSem{} [[user(locn{})]];" _CRLF, (sint32)decompilerContext->parsedGSCopyShader->paramMapping[p].exportParam, decompilerContext->parsedGSCopyShader->paramMapping[p].exportParam & 0x7F); + } + src->add("};" _CRLF _CRLF); + + const uint32 MAX_VERTEX_COUNT = 32; + + // Define the mesh shader output type + src->addFmt("using MeshType = mesh;" _CRLF, MAX_VERTEX_COUNT, MAX_VERTEX_COUNT); + } + } + } + + static void emitHeader(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader, bool usesGeometryShader, bool fetchVertexManually) + { + auto src = decompilerContext->shaderSource; + + if (usesGeometryShader && (decompilerContext->shaderType == LatteConst::ShaderType::Vertex || decompilerContext->shaderType == LatteConst::ShaderType::Geometry)) + { + LattePrimitiveMode vsOutPrimType = decompilerContext->contextRegistersNew->VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE(); + src->addFmt("#define VERTICES_PER_VERTEX_PRIMITIVE {}" _CRLF, GetVerticesPerPrimitive(vsOutPrimType)); + + uint32 gsOutPrimType = decompilerContext->contextRegisters[mmVGT_GS_OUT_PRIM_TYPE]; + if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + switch (gsOutPrimType) + { + case 0: // Point + src->add("#define MTL_PRIMITIVE_TYPE point" _CRLF); + src->add("#define GET_PRIMITIVE_COUNT(vertexCount) (vertexCount / 1)" _CRLF); + break; + case 1: // Line strip + src->add("#define MTL_PRIMITIVE_TYPE line" _CRLF); + src->add("#define GET_PRIMITIVE_COUNT(vertexCount) (vertexCount - 1)" _CRLF); + break; + case 2: // Triangle strip + src->add("#define MTL_PRIMITIVE_TYPE triangle" _CRLF); + src->add("#define GET_PRIMITIVE_COUNT(vertexCount) (vertexCount - 2)" _CRLF); 
+ break; + default: + cemuLog_log(LogType::Force, "Unknown geometry out primitive type {}", gsOutPrimType); + break; + } + } + } + + if (decompilerContext->contextRegistersNew->PA_CL_CLIP_CNTL.get_DX_CLIP_SPACE_DEF()) + src->add("#define SET_POSITION(_v) out.position = _v" _CRLF); + else + src->add("#define SET_POSITION(_v) out.position = _v; out.position.z = (out.position.z + out.position.w) / 2.0" _CRLF); + + const bool dump_shaders_enabled = ActiveSettings::DumpShadersEnabled(); + if(dump_shaders_enabled) + decompilerContext->shaderSource->add("// start of shader inputs/outputs, predetermined by Cemu. Do not touch" _CRLF); + // uniform variables + _emitUniformVariables(decompilerContext, usesGeometryShader); + // uniform buffers + _emitUniformBuffers(decompilerContext); + // inputs and outputs + _emitInputsAndOutputs(decompilerContext, isRectVertexShader, usesGeometryShader, fetchVertexManually); + + if (dump_shaders_enabled) + decompilerContext->shaderSource->add("// end of shader inputs/outputs" _CRLF); + } + + static void _emitUniformBufferDefinitions(LatteDecompilerShaderContext* decompilerContext) + { + auto src = decompilerContext->shaderSource; + // uniform buffer definition + if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK) + { + for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++) + { + if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess()) + continue; + + cemu_assert_debug(decompilerContext->output->resourceMappingMTL.uniformBuffersBindingPoint[i] >= 0); + + src->addFmt(", constant UBuff{}& ubuff{} [[buffer({})]]", i, i, (sint32)decompilerContext->output->resourceMappingMTL.uniformBuffersBindingPoint[i]); + } + } + } + + static void _emitTextureDefinitions(LatteDecompilerShaderContext* shaderContext) + { + bool renderTargetIndexUsed[LATTE_NUM_COLOR_TARGET] = {false}; + + auto src = shaderContext->shaderSource; + // texture sampler definition + for (sint32 i = 0; i < 
LATTE_NUM_MAX_TEX_UNITS; i++) + { + if (!shaderContext->output->textureUnitMask[i]) + continue; + + uint8 renderTargetIndex = shaderContext->shader->textureRenderTargetIndex[i]; + if (static_cast(g_renderer.get())->SupportsFramebufferFetch() && renderTargetIndex != 255) + { + if (!renderTargetIndexUsed[renderTargetIndex]) + { + src->addFmt(", {} col{} [[color({})]]", GetDataTypeStr(GetColorBufferDataType(renderTargetIndex, *shaderContext->contextRegistersNew)), renderTargetIndex, renderTargetIndex); + renderTargetIndexUsed[renderTargetIndex] = true; + } + } + else + { + src->add(", "); + + // Only certain texture dimensions can be used with comparison samplers + if (shaderContext->shader->textureUsesDepthCompare[i] && IsValidDepthTextureType(shaderContext->shader->textureUnitDim[i])) + src->add("depth"); + else + src->add("texture"); + + if (shaderContext->shader->textureIsIntegerFormat[i]) + { + // integer samplers + if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_1D) + src->add("1d"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D || shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_MSAA) + src->add("2d"); + else + cemu_assert_unimplemented(); + } + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D || shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_MSAA) + src->add("2d"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_1D) + src->add("1d"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_ARRAY) + src->add("2d_array"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_CUBEMAP) + src->add("cube_array"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_3D) + src->add("3d"); + else + { + cemu_assert_unimplemented(); + } + + uint32 binding = shaderContext->output->resourceMappingMTL.textureUnitToBindingPoint[i]; + //uint32 textureBinding = 
shaderContext->output->resourceMappingMTL.textureUnitToBindingPoint[i] % 31; + //uint32 samplerBinding = textureBinding % 16; + src->addFmt(" tex{} [[texture({})]]", i, binding); + src->addFmt(", sampler samplr{} [[sampler({})]]", i, binding); + } + } + } + + static void emitInputs(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader, bool usesGeometryShader, bool fetchVertexManually) + { + auto src = decompilerContext->shaderSource; + + switch (decompilerContext->shaderType) + { + case LatteConst::ShaderType::Vertex: + if (usesGeometryShader) + { + src->add("object_data ObjectPayload& objectPayload [[payload]]"); + src->add(", mesh_grid_properties meshGridProperties"); + src->add(", uint tig [[threadgroup_position_in_grid]]"); + src->add(", uint tid [[thread_index_in_threadgroup]]"); + // TODO: only include index buffer if needed + src->addFmt(", device uint* indexBuffer [[buffer({})]]", decompilerContext->output->resourceMappingMTL.indexBufferBinding); + // TODO: put into the support buffer? + src->addFmt(", constant uchar& indexType [[buffer({})]]", decompilerContext->output->resourceMappingMTL.indexTypeBinding); + } + else + { + // TODO: only include these if needed? + src->add("uint vid [[vertex_id]]"); + src->add(", uint iid [[instance_id]]"); + } + + if (fetchVertexManually) + src->add(" VERTEX_BUFFER_DEFINITIONS"); + else + src->add(", VertexIn in [[stage_in]]"); + + break; + case LatteConst::ShaderType::Geometry: + src->add("MeshType mesh"); + src->add(", const object_data ObjectPayload& objectPayload [[payload]]"); + break; + case LatteConst::ShaderType::Pixel: + src->add("FragmentIn in [[stage_in]]"); + // TODO: only include these if needed? 
+ src->add(", float2 pointCoord [[point_coord]]"); + src->add(", bool frontFacing [[front_facing]]"); + break; + default: + break; + } + + if (decompilerContext->output->resourceMappingMTL.uniformVarsBufferBindingPoint >= 0) + src->addFmt(", constant SupportBuffer& supportBuffer [[buffer({})]]", decompilerContext->output->resourceMappingMTL.uniformVarsBufferBindingPoint); + + // streamout buffer (transform feedback) + if ((decompilerContext->shaderType == LatteConst::ShaderType::Vertex && !decompilerContext->options->usesGeometryShader) || decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + if (decompilerContext->analyzer.hasStreamoutEnable && decompilerContext->analyzer.hasStreamoutWrite) + src->addFmt(", device int* sb [[buffer({})]]" _CRLF, decompilerContext->output->resourceMappingMTL.tfStorageBindingPoint); + } + + // uniform buffers + _emitUniformBufferDefinitions(decompilerContext); + // textures + _emitTextureDefinitions(decompilerContext); + } +} diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h index ed1858bae8..f4135640f0 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h @@ -47,7 +47,7 @@ struct LatteDecompilerTEXInstruction sint32 dstGpr; sint8 dstSel[4]; // texture fetch - struct + struct { sint32 textureIndex{}; sint32 samplerIndex{}; @@ -216,7 +216,7 @@ struct LatteDecompilerShaderContext bool genIntReg; // if set, generate R*i register variables bool useArrayGPRs; // if set, an array is used to represent GPRs instead of individual variables }typeTracker; - // analyzer + // analyzer struct { // general @@ -260,6 +260,8 @@ struct LatteDecompilerShaderContext // emitter bool hasUniformVarBlock; sint32 currentBindingPointVK{}; + sint32 currentBufferBindingPointMTL{}; + sint32 currentTextureBindingPointMTL{}; struct 
ALUClauseTemporariesState* aluPVPSState{nullptr}; // misc std::vector list_subroutines; @@ -268,9 +270,10 @@ struct LatteDecompilerShaderContext void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader); void LatteDecompiler_analyzeDataTypes(LatteDecompilerShaderContext* shaderContext); void LatteDecompiler_emitGLSLShader(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader); +void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader); void LatteDecompiler_cleanup(LatteDecompilerShaderContext* shaderContext); // helper functions -sint32 LatteDecompiler_getColorOutputIndexFromExportIndex(LatteDecompilerShaderContext* shaderContext, sint32 exportIndex); \ No newline at end of file +sint32 LatteDecompiler_getColorOutputIndexFromExportIndex(LatteDecompilerShaderContext* shaderContext, sint32 exportIndex); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp new file mode 100644 index 0000000000..a7e87c7943 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp @@ -0,0 +1,64 @@ +#include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" + +CachedFBOMtl::CachedFBOMtl(class MetalRenderer* metalRenderer, uint64 key) : LatteCachedFBO(key) +{ + m_renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + + bool hasAttachment = false; + for (int i = 0; i < 8; ++i) + { + const auto& buffer = colorBuffer[i]; + auto textureView = (LatteTextureViewMtl*)buffer.texture; + if (!textureView) + { + continue; + } + auto colorAttachment = m_renderPassDescriptor->colorAttachments()->object(i); + colorAttachment->setTexture(textureView->GetRGBAView()); + colorAttachment->setLoadAction(MTL::LoadActionLoad); + 
colorAttachment->setStoreAction(MTL::StoreActionStore); + + hasAttachment = true; + } + + // setup depth attachment + if (depthBuffer.texture) + { + auto textureView = static_cast(depthBuffer.texture); + auto depthAttachment = m_renderPassDescriptor->depthAttachment(); + depthAttachment->setTexture(textureView->GetRGBAView()); + depthAttachment->setLoadAction(MTL::LoadActionLoad); + depthAttachment->setStoreAction(MTL::StoreActionStore); + + // setup stencil attachment + if (depthBuffer.hasStencil && GetMtlPixelFormatInfo(depthBuffer.texture->format, true).hasStencil) + { + auto stencilAttachment = m_renderPassDescriptor->stencilAttachment(); + stencilAttachment->setTexture(textureView->GetRGBAView()); + stencilAttachment->setLoadAction(MTL::LoadActionLoad); + stencilAttachment->setStoreAction(MTL::StoreActionStore); + } + + hasAttachment = true; + } + + // HACK: setup a dummy color attachment to prevent Metal from discarding draws for streamout draws in Super Smash Bros. for Wii U (works fine on MoltenVK without this hack though) + if (!hasAttachment) + { + auto colorAttachment = m_renderPassDescriptor->colorAttachments()->object(0); + colorAttachment->setTexture(metalRenderer->GetNullTexture2D()); + colorAttachment->setLoadAction(MTL::LoadActionDontCare); + colorAttachment->setStoreAction(MTL::StoreActionDontCare); + } + + // Visibility buffer + m_renderPassDescriptor->setVisibilityResultBuffer(metalRenderer->GetOcclusionQueryResultBuffer()); +} + +CachedFBOMtl::~CachedFBOMtl() +{ + m_renderPassDescriptor->release(); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h new file mode 100644 index 0000000000..f1221eb224 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h @@ -0,0 +1,22 @@ +#pragma once + +#include + +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" +#include "Cafe/HW/Latte/Core/LatteCachedFBO.h" + +class CachedFBOMtl : public LatteCachedFBO +{ +public: +
CachedFBOMtl(class MetalRenderer* metalRenderer, uint64 key); + + ~CachedFBOMtl(); + + MTL::RenderPassDescriptor* GetRenderPassDescriptor() + { + return m_renderPassDescriptor; + } + +private: + MTL::RenderPassDescriptor* m_renderPassDescriptor = nullptr; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp new file mode 100644 index 0000000000..c5d1f54063 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -0,0 +1,107 @@ +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" + +LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, + Latte::E_HWTILEMODE tileMode, bool isDepth) + : LatteTexture(dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth), m_mtlr(mtlRenderer) +{ + NS_STACK_SCOPED MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init(); + desc->setStorageMode(MTL::StorageModePrivate); + //desc->setCpuCacheMode(MTL::CPUCacheModeWriteCombined); + + sint32 effectiveBaseWidth = width; + sint32 effectiveBaseHeight = height; + sint32 effectiveBaseDepth = depth; + if (overwriteInfo.hasResolutionOverwrite) + { + effectiveBaseWidth = overwriteInfo.width; + effectiveBaseHeight = overwriteInfo.height; + effectiveBaseDepth = overwriteInfo.depth; + } + effectiveBaseWidth = std::max(1, effectiveBaseWidth); + effectiveBaseHeight = std::max(1, effectiveBaseHeight); + effectiveBaseDepth = std::max(1, effectiveBaseDepth); + + MTL::TextureType textureType; + switch (dim) + { + case Latte::E_DIM::DIM_1D: + textureType = MTL::TextureType1D; + 
effectiveBaseHeight = 1; + break; + case Latte::E_DIM::DIM_2D: + case Latte::E_DIM::DIM_2D_MSAA: + textureType = MTL::TextureType2D; + break; + case Latte::E_DIM::DIM_2D_ARRAY: + textureType = MTL::TextureType2DArray; + break; + case Latte::E_DIM::DIM_3D: + textureType = MTL::TextureType3D; + break; + case Latte::E_DIM::DIM_CUBEMAP: + cemu_assert_debug(effectiveBaseDepth % 6 == 0 && "cubemaps must have an array length multiple of 6"); + + textureType = MTL::TextureTypeCubeArray; + break; + default: + cemu_assert_unimplemented(); + textureType = MTL::TextureType2D; + break; + } + desc->setTextureType(textureType); + + // Clamp mip levels + mipLevels = std::min(mipLevels, (uint32)maxPossibleMipLevels); + mipLevels = std::max(mipLevels, (uint32)1); + + desc->setWidth(effectiveBaseWidth); + desc->setHeight(effectiveBaseHeight); + desc->setMipmapLevelCount(mipLevels); + + if (textureType == MTL::TextureType3D) + { + desc->setDepth(effectiveBaseDepth); + } + else if (textureType == MTL::TextureTypeCubeArray) + { + desc->setArrayLength(effectiveBaseDepth / 6); + } + else if (textureType == MTL::TextureType2DArray) + { + desc->setArrayLength(effectiveBaseDepth); + } + + auto pixelFormat = GetMtlPixelFormat(format, isDepth); + desc->setPixelFormat(pixelFormat); + + MTL::TextureUsage usage = MTL::TextureUsageShaderRead | MTL::TextureUsagePixelFormatView; + if (FormatIsRenderable(format)) + usage |= MTL::TextureUsageRenderTarget; + desc->setUsage(usage); + + m_texture = mtlRenderer->GetDevice()->newTexture(desc); +} + +LatteTextureMtl::~LatteTextureMtl() +{ + m_texture->release(); +} + +LatteTextureView* LatteTextureMtl::CreateView(Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount) +{ + cemu_assert_debug(mipCount > 0); + cemu_assert_debug(sliceCount > 0); + cemu_assert_debug((firstMip + mipCount) <= this->mipLevels); + cemu_assert_debug((firstSlice + sliceCount) <= this->depth); + + return new 
LatteTextureViewMtl(m_mtlr, this, dim, format, firstMip, mipCount, firstSlice, sliceCount); +} + +// TODO: lazy allocation? +void LatteTextureMtl::AllocateOnHost() +{ + // The texture is already allocated +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h new file mode 100644 index 0000000000..884a5c5b02 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h @@ -0,0 +1,29 @@ +#pragma once + +#include + +#include "Cafe/HW/Latte/Core/LatteTexture.h" +#include "HW/Latte/ISA/LatteReg.h" +#include "util/ChunkedHeap/ChunkedHeap.h" + +class LatteTextureMtl : public LatteTexture +{ +public: + LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, + uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth); + ~LatteTextureMtl(); + + MTL::Texture* GetTexture() const { + return m_texture; + } + + void AllocateOnHost() override; + +protected: + LatteTextureView* CreateView(Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount) override; + +private: + class MetalRenderer* m_mtlr; + + MTL::Texture* m_texture; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp new file mode 100644 index 0000000000..405c49df49 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp @@ -0,0 +1,52 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" + +LatteTextureReadbackInfoMtl::~LatteTextureReadbackInfoMtl() +{ + if (m_commandBuffer) + m_commandBuffer->release(); +} + +void 
LatteTextureReadbackInfoMtl::StartTransfer() +{ + cemu_assert(m_textureView); + + auto* baseTexture = (LatteTextureMtl*)m_textureView->baseTexture; + + cemu_assert_debug(m_textureView->firstSlice == 0); + cemu_assert_debug(m_textureView->firstMip == 0); + cemu_assert_debug(m_textureView->baseTexture->dim != Latte::E_DIM::DIM_3D); + + size_t bytesPerRow = GetMtlTextureBytesPerRow(baseTexture->format, baseTexture->isDepth, baseTexture->width); + size_t bytesPerImage = GetMtlTextureBytesPerImage(baseTexture->format, baseTexture->isDepth, baseTexture->height, bytesPerRow); + + auto blitCommandEncoder = m_mtlr->GetBlitCommandEncoder(); + + blitCommandEncoder->copyFromTexture(baseTexture->GetTexture(), 0, 0, MTL::Origin{0, 0, 0}, MTL::Size{(uint32)baseTexture->width, (uint32)baseTexture->height, 1}, m_mtlr->GetTextureReadbackBuffer(), m_bufferOffset, bytesPerRow, bytesPerImage); + + m_commandBuffer = m_mtlr->GetCurrentCommandBuffer()->retain(); + // TODO: uncomment? + //m_mtlr->RequestSoonCommit(); + m_mtlr->CommitCommandBuffer(); +} + +bool LatteTextureReadbackInfoMtl::IsFinished() +{ + // Command buffer wasn't even committed, let's commit immediately + //if (m_mtlr->GetCurrentCommandBuffer() == m_commandBuffer) + // m_mtlr->CommitCommandBuffer(); + + return CommandBufferCompleted(m_commandBuffer); +} + +void LatteTextureReadbackInfoMtl::ForceFinish() +{ + m_commandBuffer->waitUntilCompleted(); +} + +uint8* LatteTextureReadbackInfoMtl::GetData() +{ + return (uint8*)m_mtlr->GetTextureReadbackBuffer()->contents() + m_bufferOffset; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h new file mode 100644 index 0000000000..19ca6574a7 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h @@ -0,0 +1,25 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "Cafe/HW/Latte/Core/LatteTextureReadbackInfo.h" + +class LatteTextureReadbackInfoMtl :
public LatteTextureReadbackInfo +{ +public: + LatteTextureReadbackInfoMtl(class MetalRenderer* mtlRenderer, LatteTextureView* textureView, uint32 bufferOffset) : LatteTextureReadbackInfo(textureView), m_mtlr{mtlRenderer}, m_bufferOffset{bufferOffset} {} + ~LatteTextureReadbackInfoMtl(); + + void StartTransfer() override; + + bool IsFinished() override; + void ForceFinish() override; + + uint8* GetData() override; + +private: + class MetalRenderer* m_mtlr; + + MTL::CommandBuffer* m_commandBuffer = nullptr; + + uint32 m_bufferOffset = 0; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp new file mode 100644 index 0000000000..a06b11f025 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -0,0 +1,191 @@ +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Metal/MTLTexture.hpp" + +uint32 LatteTextureMtl_AdjustTextureCompSel(Latte::E_GX2SURFFMT format, uint32 compSel) +{ + switch (format) + { + case Latte::E_GX2SURFFMT::R8_UNORM: // R8 is replicated on all channels (while OpenGL would return 1.0 for BGA instead) + case Latte::E_GX2SURFFMT::R8_SNORM: // probably the same as _UNORM, but needs testing + if (compSel >= 1 && compSel <= 3) + compSel = 0; + break; + case Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM: // order of components is reversed (RGBA -> ABGR) + if (compSel >= 0 && compSel <= 3) + compSel = 3 - compSel; + break; + case Latte::E_GX2SURFFMT::BC4_UNORM: + case Latte::E_GX2SURFFMT::BC4_SNORM: + if (compSel >= 1 && compSel <= 3) + compSel = 0; + break; + case Latte::E_GX2SURFFMT::BC5_UNORM: + case Latte::E_GX2SURFFMT::BC5_SNORM: + // RG maps to RG + // B maps to ? 
+ // A maps to G (guessed) + if (compSel == 3) + compSel = 1; // read Alpha as Green + break; + case Latte::E_GX2SURFFMT::A2_B10_G10_R10_UNORM: + // reverse components (Wii U: ABGR, OpenGL: RGBA) + // used in Resident Evil Revelations + if (compSel >= 0 && compSel <= 3) + compSel = 3 - compSel; + break; + case Latte::E_GX2SURFFMT::X24_G8_UINT: + // map everything to alpha? + if (compSel >= 0 && compSel <= 3) + compSel = 3; + break; + case Latte::E_GX2SURFFMT::R4_G4_UNORM: + // red and green swapped + if (compSel == 0) + compSel = 1; + else if (compSel == 1) + compSel = 0; + break; + default: + break; + } + return compSel; +} + +LatteTextureViewMtl::LatteTextureViewMtl(MetalRenderer* mtlRenderer, LatteTextureMtl* texture, Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount) + : LatteTextureView(texture, firstMip, mipCount, firstSlice, sliceCount, dim, format), m_mtlr(mtlRenderer), m_baseTexture(texture) +{ + m_rgbaView = CreateSwizzledView(RGBA_SWIZZLE); +} + +LatteTextureViewMtl::~LatteTextureViewMtl() +{ + m_rgbaView->release(); + for (sint32 i = 0; i < std::size(m_viewCache); i++) + { + if (m_viewCache[i].key != INVALID_SWIZZLE) + m_viewCache[i].texture->release(); + } + + for (auto& [key, texture] : m_fallbackViewCache) + { + texture->release(); + } +} + +MTL::Texture* LatteTextureViewMtl::GetSwizzledView(uint32 gpuSamplerSwizzle) +{ + // Mask out + gpuSamplerSwizzle &= 0x0FFF0000; + + // RGBA swizzle == no swizzle + if (gpuSamplerSwizzle == RGBA_SWIZZLE) + { + return m_rgbaView; + } + + // First, try to find a view in the cache + + // Fast cache + sint32 freeIndex = -1; + for (sint32 i = 0; i < std::size(m_viewCache); i++) + { + const auto& entry = m_viewCache[i]; + if (entry.key == gpuSamplerSwizzle) + { + return entry.texture; + } + else if (entry.key == INVALID_SWIZZLE && freeIndex == -1) + { + freeIndex = i; + } + } + + // Fallback cache + auto& fallbackEntry = 
m_fallbackViewCache[gpuSamplerSwizzle]; + if (fallbackEntry) + { + return fallbackEntry; + } + + MTL::Texture* texture = CreateSwizzledView(gpuSamplerSwizzle); + if (freeIndex != -1) + m_viewCache[freeIndex] = {gpuSamplerSwizzle, texture}; + else + fallbackEntry = texture; + + return texture; +} + +MTL::Texture* LatteTextureViewMtl::CreateSwizzledView(uint32 gpuSamplerSwizzle) +{ + uint32 compSelR = (gpuSamplerSwizzle >> 16) & 0x7; + uint32 compSelG = (gpuSamplerSwizzle >> 19) & 0x7; + uint32 compSelB = (gpuSamplerSwizzle >> 22) & 0x7; + uint32 compSelA = (gpuSamplerSwizzle >> 25) & 0x7; + compSelR = LatteTextureMtl_AdjustTextureCompSel(format, compSelR); + compSelG = LatteTextureMtl_AdjustTextureCompSel(format, compSelG); + compSelB = LatteTextureMtl_AdjustTextureCompSel(format, compSelB); + compSelA = LatteTextureMtl_AdjustTextureCompSel(format, compSelA); + + MTL::TextureType textureType; + switch (dim) + { + case Latte::E_DIM::DIM_1D: + textureType = MTL::TextureType1D; + break; + case Latte::E_DIM::DIM_2D: + case Latte::E_DIM::DIM_2D_MSAA: + textureType = MTL::TextureType2D; + break; + case Latte::E_DIM::DIM_2D_ARRAY: + textureType = MTL::TextureType2DArray; + break; + case Latte::E_DIM::DIM_3D: + textureType = MTL::TextureType3D; + break; + case Latte::E_DIM::DIM_CUBEMAP: + cemu_assert_debug(this->numSlice % 6 == 0 && "cubemaps must have an array length multiple of 6"); + + textureType = MTL::TextureTypeCubeArray; + break; + default: + cemu_assert_unimplemented(); + textureType = MTL::TextureType2D; + break; + } + + uint32 baseLevel = firstMip; + uint32 levelCount = this->numMip; + uint32 baseLayer = 0; + uint32 layerCount = 1; + // TODO: check if base texture is 3D texture as well + if (textureType == MTL::TextureType3D) + { + cemu_assert_debug(firstMip == 0); + cemu_assert_debug(this->numSlice == baseTexture->depth); + } + else + { + baseLayer = firstSlice; + if (textureType == MTL::TextureTypeCubeArray || textureType == MTL::TextureType2DArray) + 
layerCount = this->numSlice; + } + + MTL::TextureSwizzleChannels swizzle; + swizzle.red = GetMtlTextureSwizzle(compSelR); + swizzle.green = GetMtlTextureSwizzle(compSelG); + swizzle.blue = GetMtlTextureSwizzle(compSelB); + swizzle.alpha = GetMtlTextureSwizzle(compSelA); + + // Clamp mip levels + levelCount = std::min(levelCount, m_baseTexture->maxPossibleMipLevels - baseLevel); + levelCount = std::max(levelCount, (uint32)1); + + auto pixelFormat = GetMtlPixelFormat(format, m_baseTexture->isDepth); + MTL::Texture* texture = m_baseTexture->GetTexture()->newTextureView(pixelFormat, textureType, NS::Range::Make(baseLevel, levelCount), NS::Range::Make(baseLayer, layerCount), swizzle); + + return texture; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h new file mode 100644 index 0000000000..2634735ef7 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h @@ -0,0 +1,37 @@ +#pragma once + +#include +#include + +#include "Cafe/HW/Latte/Core/LatteTexture.h" + +#define RGBA_SWIZZLE 0x06880000 +#define INVALID_SWIZZLE 0xFFFFFFFF + +class LatteTextureViewMtl : public LatteTextureView +{ +public: + LatteTextureViewMtl(class MetalRenderer* mtlRenderer, class LatteTextureMtl* texture, Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount); + ~LatteTextureViewMtl(); + + MTL::Texture* GetSwizzledView(uint32 gpuSamplerSwizzle); + + MTL::Texture* GetRGBAView() + { + return GetSwizzledView(RGBA_SWIZZLE); + } + +private: + class MetalRenderer* m_mtlr; + + class LatteTextureMtl* m_baseTexture; + + MTL::Texture* m_rgbaView; + struct { + uint32 key; + MTL::Texture* texture; + } m_viewCache[4] = {{INVALID_SWIZZLE, nullptr}, {INVALID_SWIZZLE, nullptr}, {INVALID_SWIZZLE, nullptr}, {INVALID_SWIZZLE, nullptr}}; + std::unordered_map m_fallbackViewCache; + + MTL::Texture* CreateSwizzledView(uint32 gpuSamplerSwizzle); +}; diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp new file mode 100644 index 0000000000..7bf295df89 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -0,0 +1,511 @@ +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Cemu/Logging/CemuLogging.h" +#include "HW/Latte/Core/LatteTextureLoader.h" +#include "HW/Latte/Renderer/Metal/MetalCommon.h" + +std::map MTL_COLOR_FORMAT_TABLE = { + {Latte::E_GX2SURFFMT::INVALID_FORMAT, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}}, + + {Latte::E_GX2SURFFMT::R4_G4_UNORM, {MTL::PixelFormatABGR4Unorm, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::R5_G6_B5_UNORM, {MTL::PixelFormatB5G6R5Unorm, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM, {MTL::PixelFormatBGR5A1Unorm, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM, {MTL::PixelFormatABGR4Unorm, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM, {MTL::PixelFormatA1BGR5Unorm, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::R8_UNORM, {MTL::PixelFormatR8Unorm, MetalDataType::FLOAT, 1}}, + {Latte::E_GX2SURFFMT::R8_SNORM, {MTL::PixelFormatR8Snorm, MetalDataType::FLOAT, 1}}, + {Latte::E_GX2SURFFMT::R8_UINT, {MTL::PixelFormatR8Uint, MetalDataType::UINT, 1}}, + {Latte::E_GX2SURFFMT::R8_SINT, {MTL::PixelFormatR8Sint, MetalDataType::INT, 1}}, + {Latte::E_GX2SURFFMT::R8_G8_UNORM, {MTL::PixelFormatRG8Unorm, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_SNORM, {MTL::PixelFormatRG8Snorm, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_UINT, {MTL::PixelFormatRG8Uint, MetalDataType::UINT, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_SINT, {MTL::PixelFormatRG8Sint, MetalDataType::INT, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM, {MTL::PixelFormatRGBA8Unorm, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SNORM, {MTL::PixelFormatRGBA8Snorm, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_UINT, 
{MTL::PixelFormatRGBA8Uint, MetalDataType::UINT, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SINT, {MTL::PixelFormatRGBA8Sint, MetalDataType::INT, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB, {MTL::PixelFormatRGBA8Unorm_sRGB, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM, {MTL::PixelFormatRGB10A2Unorm, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM, {MTL::PixelFormatRGBA16Snorm, MetalDataType::FLOAT, 8}}, + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UINT, {MTL::PixelFormatRGB10A2Uint, MetalDataType::UINT, 4}}, + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SINT, {MTL::PixelFormatRGBA16Sint, MetalDataType::INT, 8}}, + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB, {MTL::PixelFormatRGB10A2Unorm, MetalDataType::FLOAT, 4}}, // TODO: sRGB? + {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UNORM, {MTL::PixelFormatBGR10A2Unorm, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UINT, {MTL::PixelFormatRGB10A2Uint, MetalDataType::UINT, 4}}, + {Latte::E_GX2SURFFMT::R16_UNORM, {MTL::PixelFormatR16Unorm, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::R16_SNORM, {MTL::PixelFormatR16Snorm, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::R16_UINT, {MTL::PixelFormatR16Uint, MetalDataType::UINT, 2}}, + {Latte::E_GX2SURFFMT::R16_SINT, {MTL::PixelFormatR16Sint, MetalDataType::INT, 2}}, + {Latte::E_GX2SURFFMT::R16_FLOAT, {MTL::PixelFormatR16Float, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::R16_G16_UNORM, {MTL::PixelFormatRG16Unorm, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_SNORM, {MTL::PixelFormatRG16Snorm, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_UINT, {MTL::PixelFormatRG16Uint, MetalDataType::UINT, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_SINT, {MTL::PixelFormatRG16Sint, MetalDataType::INT, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_FLOAT, {MTL::PixelFormatRG16Float, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UNORM, {MTL::PixelFormatRGBA16Unorm, MetalDataType::FLOAT, 8}}, + 
{Latte::E_GX2SURFFMT::R16_G16_B16_A16_SNORM, {MTL::PixelFormatRGBA16Snorm, MetalDataType::FLOAT, 8}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UINT, {MTL::PixelFormatRGBA16Uint, MetalDataType::UINT, 8}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_SINT, {MTL::PixelFormatRGBA16Sint, MetalDataType::INT, 8}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT, {MTL::PixelFormatRGBA16Float, MetalDataType::FLOAT, 8}}, + {Latte::E_GX2SURFFMT::R24_X8_UNORM, {MTL::PixelFormatR32Float, MetalDataType::FLOAT, 4}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R24_X8_FLOAT, {MTL::PixelFormatR32Float, MetalDataType::FLOAT, 4}}, // TODO: correct? + {Latte::E_GX2SURFFMT::X24_G8_UINT, {MTL::PixelFormatRGBA8Uint, MetalDataType::UINT, 4}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R32_X8_FLOAT, {MTL::PixelFormatR32Float, MetalDataType::FLOAT, 4}}, // TODO: correct? + {Latte::E_GX2SURFFMT::X32_G8_UINT_X24, {MTL::PixelFormatRGBA16Uint, MetalDataType::UINT, 8}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT, {MTL::PixelFormatRG11B10Float, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::R32_UINT, {MTL::PixelFormatR32Uint, MetalDataType::UINT, 4}}, + {Latte::E_GX2SURFFMT::R32_SINT, {MTL::PixelFormatR32Sint, MetalDataType::INT, 4}}, + {Latte::E_GX2SURFFMT::R32_FLOAT, {MTL::PixelFormatR32Float, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::R32_G32_UINT, {MTL::PixelFormatRG32Uint, MetalDataType::UINT, 8}}, + {Latte::E_GX2SURFFMT::R32_G32_SINT, {MTL::PixelFormatRG32Sint, MetalDataType::INT, 8}}, + {Latte::E_GX2SURFFMT::R32_G32_FLOAT, {MTL::PixelFormatRG32Float, MetalDataType::FLOAT, 8}}, + {Latte::E_GX2SURFFMT::R32_G32_B32_A32_UINT, {MTL::PixelFormatRGBA32Uint, MetalDataType::UINT, 16}}, + {Latte::E_GX2SURFFMT::R32_G32_B32_A32_SINT, {MTL::PixelFormatRGBA32Sint, MetalDataType::INT, 16}}, + {Latte::E_GX2SURFFMT::R32_G32_B32_A32_FLOAT, {MTL::PixelFormatRGBA32Float, MetalDataType::FLOAT, 16}}, + {Latte::E_GX2SURFFMT::BC1_UNORM, {MTL::PixelFormatBC1_RGBA, MetalDataType::FLOAT, 8, {4, 
4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC1_SRGB, {MTL::PixelFormatBC1_RGBA_sRGB, MetalDataType::FLOAT, 8, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC2_UNORM, {MTL::PixelFormatBC2_RGBA, MetalDataType::FLOAT, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC2_SRGB, {MTL::PixelFormatBC2_RGBA_sRGB, MetalDataType::FLOAT, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC3_UNORM, {MTL::PixelFormatBC3_RGBA, MetalDataType::FLOAT, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC3_SRGB, {MTL::PixelFormatBC3_RGBA_sRGB, MetalDataType::FLOAT, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC4_UNORM, {MTL::PixelFormatBC4_RUnorm, MetalDataType::FLOAT, 8, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC4_SNORM, {MTL::PixelFormatBC4_RSnorm, MetalDataType::FLOAT, 8, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC5_UNORM, {MTL::PixelFormatBC5_RGUnorm, MetalDataType::FLOAT, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC5_SNORM, {MTL::PixelFormatBC5_RGSnorm, MetalDataType::FLOAT, 16, {4, 4}}}, // TODO: correct? 
+};
+
+// Depth/stencil counterpart of the color table: maps a Latte depth surface format to
+// {pixelFormat, dataType, bytesPerBlock, blockTexelSize, hasStencil}. The textureDecoder
+// member of each entry is filled in later by CheckForPixelFormatSupport().
+std::map<Latte::E_GX2SURFFMT, MetalPixelFormatInfo> MTL_DEPTH_FORMAT_TABLE = {
+    {Latte::E_GX2SURFFMT::INVALID_FORMAT, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}},
+
+    {Latte::E_GX2SURFFMT::D24_S8_UNORM, {MTL::PixelFormatDepth24Unorm_Stencil8, MetalDataType::NONE, 4, {1, 1}, true}},
+    {Latte::E_GX2SURFFMT::D24_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, MetalDataType::NONE, 4, {1, 1}, true}},
+    {Latte::E_GX2SURFFMT::D32_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, MetalDataType::NONE, 5, {1, 1}, true}},
+    {Latte::E_GX2SURFFMT::D16_UNORM, {MTL::PixelFormatDepth16Unorm, MetalDataType::NONE, 2, {1, 1}}},
+    {Latte::E_GX2SURFFMT::D32_FLOAT, {MTL::PixelFormatDepth32Float, MetalDataType::NONE, 4, {1, 1}}},
+};
+
+// TODO: R10_G10_B10_A2_UINT and R10_G10_B10_A2_SINT
+// TODO: A2_B10_G10_R10_UNORM and A2_B10_G10_R10_UINT
+// Finalizes the global color and depth format tables.
+//
+// First assigns a texture decoder to the listed entries of both tables. Then, for each
+// capability that `support` reports as missing, the affected entries are redirected to a
+// substitute Metal pixel format together with a decoder that produces that format.
+// Whenever the substitute has a different texel size, bytesPerBlock has to be updated as
+// well, because GetMtlTextureBytesPerRow() derives the row pitch from it.
+//
+// Note: this mutates MTL_COLOR_FORMAT_TABLE / MTL_DEPTH_FORMAT_TABLE through operator[],
+// which would default-insert an entry for a key that is missing from the initializer.
+void CheckForPixelFormatSupport(const MetalPixelFormatSupport& support)
+{
+    // Texture decoders
+
+    // Color
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R32_G32_B32_A32_FLOAT].textureDecoder = TextureDecoder_R32_G32_B32_A32_FLOAT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R32_G32_B32_A32_UINT].textureDecoder = TextureDecoder_R32_G32_B32_A32_UINT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT].textureDecoder = TextureDecoder_R16_G16_B16_A16_FLOAT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_G16_B16_A16_UINT].textureDecoder = TextureDecoder_R16_G16_B16_A16_UINT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_G16_B16_A16_UNORM].textureDecoder = TextureDecoder_R16_G16_B16_A16::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_G16_B16_A16_SNORM].textureDecoder = TextureDecoder_R16_G16_B16_A16::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM].textureDecoder = TextureDecoder_R8_G8_B8_A8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_B8_A8_SNORM].textureDecoder = TextureDecoder_R8_G8_B8_A8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB].textureDecoder = TextureDecoder_R8_G8_B8_A8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_B8_A8_UINT].textureDecoder = TextureDecoder_R8_G8_B8_A8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_B8_A8_SINT].textureDecoder = TextureDecoder_R8_G8_B8_A8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R32_G32_FLOAT].textureDecoder = TextureDecoder_R32_G32_FLOAT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R32_G32_UINT].textureDecoder = TextureDecoder_R32_G32_UINT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_G16_UNORM].textureDecoder = TextureDecoder_R16_G16::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_G16_FLOAT].textureDecoder = TextureDecoder_R16_G16_FLOAT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_UNORM].textureDecoder = TextureDecoder_R8_G8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_SNORM].textureDecoder = TextureDecoder_R8_G8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].textureDecoder = TextureDecoder_R4_G4_UNORM_To_ABGR4::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R32_FLOAT].textureDecoder = TextureDecoder_R32_FLOAT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R32_UINT].textureDecoder = TextureDecoder_R32_UINT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_FLOAT].textureDecoder = TextureDecoder_R16_FLOAT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_UNORM].textureDecoder = TextureDecoder_R16_UNORM::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_SNORM].textureDecoder = TextureDecoder_R16_SNORM::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_UINT].textureDecoder = TextureDecoder_R16_UINT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_UNORM].textureDecoder = TextureDecoder_R8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_SNORM].textureDecoder = TextureDecoder_R8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_UINT].textureDecoder = TextureDecoder_R8_UINT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G6_B5_UNORM].textureDecoder = TextureDecoder_R5_G6_B5_swappedRB::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM].textureDecoder = TextureDecoder_R5_G5_B5_A1_UNORM_swappedRB::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM].textureDecoder = TextureDecoder_A1_B5_G5_R5_UNORM::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT].textureDecoder = TextureDecoder_R11_G11_B10_FLOAT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].textureDecoder = TextureDecoder_R4_G4_B4_A4_UNORM::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM].textureDecoder = TextureDecoder_R10_G10_B10_A2_UNORM::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM].textureDecoder = TextureDecoder_R10_G10_B10_A2_SNORM_To_RGBA16::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB].textureDecoder = TextureDecoder_R10_G10_B10_A2_UNORM::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC1_SRGB].textureDecoder = TextureDecoder_BC1::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC1_UNORM].textureDecoder = TextureDecoder_BC1::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC2_UNORM].textureDecoder = TextureDecoder_BC2::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC2_SRGB].textureDecoder = TextureDecoder_BC2::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC3_UNORM].textureDecoder = TextureDecoder_BC3::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC3_SRGB].textureDecoder = TextureDecoder_BC3::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC4_UNORM].textureDecoder = TextureDecoder_BC4::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC4_SNORM].textureDecoder = TextureDecoder_BC4::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC5_UNORM].textureDecoder = TextureDecoder_BC5::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC5_SNORM].textureDecoder = TextureDecoder_BC5::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R24_X8_UNORM].textureDecoder = TextureDecoder_R24_X8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::X24_G8_UINT].textureDecoder = TextureDecoder_X24_G8_UINT::getInstance();
+
+    // Without packed 16-bit support every packed 16-bit entry is widened to an 8-bit-per-channel
+    // format. Each substitution below sets pixelFormat, bytesPerBlock and textureDecoder together
+    // so the three always describe the same decoded layout.
+    if (!support.m_supportsPacked16BitFormats)
+    {
+        // B5G6R5Unorm
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G6_B5_UNORM].pixelFormat = MTL::PixelFormatRGBA8Unorm;
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G6_B5_UNORM].bytesPerBlock = 4;
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G6_B5_UNORM].textureDecoder = TextureDecoder_R5G6B5_UNORM_To_RGBA8::getInstance();
+
+        // A1BGR5Unorm
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM].pixelFormat = MTL::PixelFormatRGBA8Unorm;
+        // RGBA8Unorm is 4 bytes per texel; the table default of 2 would halve the computed row pitch
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM].bytesPerBlock = 4;
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM].textureDecoder = TextureDecoder_A1_B5_G5_R5_UNORM_vulkan_To_RGBA8::getInstance();
+
+        // ABGR4Unorm
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].pixelFormat = MTL::PixelFormatRG8Unorm;
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].bytesPerBlock = 2;
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].textureDecoder = TextureDecoder_R4G4_UNORM_To_RG8::getInstance();
+
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].pixelFormat = MTL::PixelFormatRGBA8Unorm;
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].bytesPerBlock = 4;
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].textureDecoder = TextureDecoder_R4G4B4A4_UNORM_To_RGBA8::getInstance();
+
+        // BGR5A1Unorm
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM].pixelFormat = MTL::PixelFormatRGBA8Unorm;
+        // RGBA8Unorm is 4 bytes per texel; the table default of 2 would halve the computed row pitch
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM].bytesPerBlock = 4;
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM].textureDecoder = TextureDecoder_R5_G5_B5_A1_UNORM_swappedRB_To_RGBA8::getInstance();
+    }
+
+    // Depth
+    MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D24_S8_UNORM].textureDecoder = TextureDecoder_D24_S8::getInstance();
+    MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D24_S8_FLOAT].textureDecoder = TextureDecoder_NullData64::getInstance(); // TODO: why?
+    MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D32_FLOAT].textureDecoder = TextureDecoder_R32_FLOAT::getInstance();
+    MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D16_UNORM].textureDecoder = TextureDecoder_R16_UNORM::getInstance();
+    MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D32_S8_FLOAT].textureDecoder = TextureDecoder_D32_S8_UINT_X24::getInstance();
+
+    if (!support.m_supportsDepth24Unorm_Stencil8)
+    {
+        // Depth24Unorm_Stencil8
+        // NOTE(review): only the pixel format is swapped here; the entry keeps TextureDecoder_D24_S8
+        // and its original bytesPerBlock until the decoder below is implemented.
+        MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D24_S8_UNORM].pixelFormat = MTL::PixelFormatDepth32Float_Stencil8;
+        // TODO: implement the decoder
+        //MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D24_S8_UNORM].textureDecoder = TextureDecoder_D24_S8_To_D32_S8::getInstance();
+    }
+}
+
+// Looks up the Metal description of a Latte surface format.
+//
+// `isDepth` selects the depth table or the color table. Formats missing from the selected
+// table do not fail: a hard-coded fallback is returned instead (Depth16Unorm for depth,
+// R8Unorm for color), with the remaining members left at their defaults.
+// The info is returned by value, so callers receive a copy of the table entry.
+const MetalPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, bool isDepth)
+{
+    if (isDepth)
+    {
+        auto it = MTL_DEPTH_FORMAT_TABLE.find(format);
+        if (it == MTL_DEPTH_FORMAT_TABLE.end())
+            return {MTL::PixelFormatDepth16Unorm, MetalDataType::NONE, 2}; // Fallback
+        else
+            return it->second;
+    }
+    else
+    {
+        auto it = MTL_COLOR_FORMAT_TABLE.find(format);
+        if (it == MTL_COLOR_FORMAT_TABLE.end())
+            return {MTL::PixelFormatR8Unorm, MetalDataType::FLOAT, 1}; // Fallback
+        else
+            return it->second;
+    }
+}
+
+// Convenience wrapper around GetMtlPixelFormatInfo() that returns only the Metal pixel format
+// and writes a log entry when the lookup yields MTL::PixelFormatInvalid.
+MTL::PixelFormat GetMtlPixelFormat(Latte::E_GX2SURFFMT format, bool isDepth)
+{
+    auto pixelFormat = GetMtlPixelFormatInfo(format, isDepth).pixelFormat;
+    if (pixelFormat == MTL::PixelFormatInvalid)
+        cemuLog_log(LogType::Force,
"invalid pixel format 0x{:x}, is depth: {}\n", format, isDepth); + + return pixelFormat; +} + +inline uint32 CeilDivide(uint32 a, uint32 b) { + return (a + b - 1) / b; +} + +size_t GetMtlTextureBytesPerRow(Latte::E_GX2SURFFMT format, bool isDepth, uint32 width) +{ + const auto& formatInfo = GetMtlPixelFormatInfo(format, isDepth); + + return CeilDivide(width, formatInfo.blockTexelSize.x) * formatInfo.bytesPerBlock; +} + +size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, bool isDepth, uint32 height, size_t bytesPerRow) +{ + const auto& formatInfo = GetMtlPixelFormatInfo(format, isDepth); + + return CeilDivide(height, formatInfo.blockTexelSize.y) * bytesPerRow; +} + +MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode primitiveMode) +{ + switch (primitiveMode) + { + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::POINTS: + return MTL::PrimitiveTypePoint; + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::LINES: + return MTL::PrimitiveTypeLine; + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::LINE_STRIP: + return MTL::PrimitiveTypeLineStrip; + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::LINE_LOOP: + return MTL::PrimitiveTypeLineStrip; // line loops are emulated as line strips with an extra connecting strip at the end + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::LINE_STRIP_ADJACENT: // Tropical Freeze level 3-6 + cemuLog_logOnce(LogType::Force, "Metal doesn't support line strip adjacent primitive, using line strip instead"); + return MTL::PrimitiveTypeLineStrip; + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::TRIANGLES: + return MTL::PrimitiveTypeTriangle; + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::TRIANGLE_FAN: + return MTL::PrimitiveTypeTriangleStrip; + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::TRIANGLE_STRIP: + return MTL::PrimitiveTypeTriangleStrip; + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::QUADS: + return MTL::PrimitiveTypeTriangle; // quads are 
emulated as 2 triangles + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::QUAD_STRIP: + return MTL::PrimitiveTypeTriangle; // quad strips are emulated as (count-2)/2 triangles + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS: + return MTL::PrimitiveTypeTriangle; // rects are emulated as 2 triangles + default: + cemuLog_log(LogType::Force, "Unsupported primitive mode {}", primitiveMode); + cemu_assert_debug(false); + return MTL::PrimitiveTypeTriangle; + } +} + +MTL::VertexFormat GetMtlVertexFormat(uint8 format) +{ + switch (format) + { + case FMT_32_32_32_32_FLOAT: + return MTL::VertexFormatUInt4; + case FMT_32_32_32_FLOAT: + return MTL::VertexFormatUInt3; + case FMT_32_32_FLOAT: + return MTL::VertexFormatUInt2; + case FMT_32_FLOAT: + return MTL::VertexFormatUInt; + case FMT_8_8_8_8: + return MTL::VertexFormatUChar4; + case FMT_8_8_8: + return MTL::VertexFormatUChar3; + case FMT_8_8: + return MTL::VertexFormatUChar2; + case FMT_8: + return MTL::VertexFormatUChar; + case FMT_32_32_32_32: + return MTL::VertexFormatUInt4; + case FMT_32_32_32: + return MTL::VertexFormatUInt3; + case FMT_32_32: + return MTL::VertexFormatUInt2; + case FMT_32: + return MTL::VertexFormatUInt; + case FMT_16_16_16_16: + return MTL::VertexFormatUShort4; // verified to match OpenGL + case FMT_16_16_16: + return MTL::VertexFormatUShort3; + case FMT_16_16: + return MTL::VertexFormatUShort2; + case FMT_16: + return MTL::VertexFormatUShort; + case FMT_16_16_16_16_FLOAT: + return MTL::VertexFormatUShort4; // verified to match OpenGL + case FMT_16_16_16_FLOAT: + return MTL::VertexFormatUShort3; + case FMT_16_16_FLOAT: + return MTL::VertexFormatUShort2; + case FMT_16_FLOAT: + return MTL::VertexFormatUShort; + case FMT_2_10_10_10: + return MTL::VertexFormatUInt; // verified to match OpenGL + default: + cemuLog_log(LogType::Force, "unsupported vertex format {}", (uint32)format); + assert_dbg(); + return MTL::VertexFormatInvalid; + } +} + +uint32 GetMtlVertexFormatSize(uint8 
format) +{ + switch (format) + { + case FMT_32_32_32_32_FLOAT: + return 16; + case FMT_32_32_32_FLOAT: + return 12; + case FMT_32_32_FLOAT: + return 8; + case FMT_32_FLOAT: + return 4; + case FMT_8_8_8_8: + return 4; + case FMT_8_8_8: + return 3; + case FMT_8_8: + return 2; + case FMT_8: + return 1; + case FMT_32_32_32_32: + return 16; + case FMT_32_32_32: + return 12; + case FMT_32_32: + return 8; + case FMT_32: + return 4; + case FMT_16_16_16_16: + return 8; + case FMT_16_16_16: + return 6; + case FMT_16_16: + return 4; + case FMT_16: + return 2; + case FMT_16_16_16_16_FLOAT: + return 8; + case FMT_16_16_16_FLOAT: + return 6; + case FMT_16_16_FLOAT: + return 4; + case FMT_16_FLOAT: + return 2; + case FMT_2_10_10_10: + return 4; + default: + return 0; + } +} + +MTL::IndexType GetMtlIndexType(Renderer::INDEX_TYPE indexType) +{ + switch (indexType) + { + case Renderer::INDEX_TYPE::U16: + return MTL::IndexTypeUInt16; + case Renderer::INDEX_TYPE::U32: + return MTL::IndexTypeUInt32; + default: + cemu_assert_suspicious(); + return MTL::IndexTypeUInt32; + } +} + +MTL::BlendOperation GetMtlBlendOp(Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC combineFunc) +{ + switch (combineFunc) + { + case Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC::DST_PLUS_SRC: + return MTL::BlendOperationAdd; + case Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC::SRC_MINUS_DST: + return MTL::BlendOperationSubtract; + case Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC::MIN_DST_SRC: + return MTL::BlendOperationMin; + case Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC::MAX_DST_SRC: + return MTL::BlendOperationMax; + case Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC::DST_MINUS_SRC: + return MTL::BlendOperationReverseSubtract; + default: + cemu_assert_suspicious(); + return MTL::BlendOperationAdd; + } +} + +const MTL::BlendFactor MTL_BLEND_FACTORS[] = +{ + /* 0x00 */ MTL::BlendFactorZero, + /* 0x01 */ MTL::BlendFactorOne, + /* 0x02 */ MTL::BlendFactorSourceColor, + /* 0x03 */ 
MTL::BlendFactorOneMinusSourceColor, + /* 0x04 */ MTL::BlendFactorSourceAlpha, + /* 0x05 */ MTL::BlendFactorOneMinusSourceAlpha, + /* 0x06 */ MTL::BlendFactorDestinationAlpha, + /* 0x07 */ MTL::BlendFactorOneMinusDestinationAlpha, + /* 0x08 */ MTL::BlendFactorDestinationColor, + /* 0x09 */ MTL::BlendFactorOneMinusDestinationColor, + /* 0x0A */ MTL::BlendFactorSourceAlphaSaturated, + /* 0x0B */ MTL::BlendFactorZero, // TODO + /* 0x0C */ MTL::BlendFactorZero, // TODO + /* 0x0D */ MTL::BlendFactorBlendColor, + /* 0x0E */ MTL::BlendFactorOneMinusBlendColor, + /* 0x0F */ MTL::BlendFactorSource1Color, + /* 0x10 */ MTL::BlendFactorOneMinusSource1Color, + /* 0x11 */ MTL::BlendFactorSource1Alpha, + /* 0x12 */ MTL::BlendFactorOneMinusSource1Alpha, + /* 0x13 */ MTL::BlendFactorBlendAlpha, + /* 0x14 */ MTL::BlendFactorOneMinusBlendAlpha +}; + +MTL::BlendFactor GetMtlBlendFactor(Latte::LATTE_CB_BLENDN_CONTROL::E_BLENDFACTOR factor) +{ + cemu_assert_debug((uint32)factor < std::size(MTL_BLEND_FACTORS)); + return MTL_BLEND_FACTORS[(uint32)factor]; +} + +const MTL::CompareFunction MTL_COMPARE_FUNCTIONS[8] = +{ + MTL::CompareFunctionNever, + MTL::CompareFunctionLess, + MTL::CompareFunctionEqual, + MTL::CompareFunctionLessEqual, + MTL::CompareFunctionGreater, + MTL::CompareFunctionNotEqual, + MTL::CompareFunctionGreaterEqual, + MTL::CompareFunctionAlways +}; + +MTL::CompareFunction GetMtlCompareFunc(Latte::E_COMPAREFUNC func) +{ + cemu_assert_debug((uint32)func < std::size(MTL_COMPARE_FUNCTIONS)); + return MTL_COMPARE_FUNCTIONS[(uint32)func]; +} + +// TODO: clamp to border color? 
(should be fine though) +const MTL::SamplerAddressMode MTL_SAMPLER_ADDRESS_MODES[] = { + MTL::SamplerAddressModeRepeat, // WRAP + MTL::SamplerAddressModeMirrorRepeat, // MIRROR + MTL::SamplerAddressModeClampToEdge, // CLAMP_LAST_TEXEL + MTL::SamplerAddressModeMirrorClampToEdge, // MIRROR_ONCE_LAST_TEXEL + MTL::SamplerAddressModeClampToEdge, // unsupported HALF_BORDER + MTL::SamplerAddressModeClampToBorderColor, // unsupported MIRROR_ONCE_HALF_BORDER + MTL::SamplerAddressModeClampToBorderColor, // CLAMP_BORDER + MTL::SamplerAddressModeClampToBorderColor // MIRROR_ONCE_BORDER +}; + +MTL::SamplerAddressMode GetMtlSamplerAddressMode(Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_CLAMP clamp) +{ + cemu_assert_debug((uint32)clamp < std::size(MTL_SAMPLER_ADDRESS_MODES)); + return MTL_SAMPLER_ADDRESS_MODES[(uint32)clamp]; +} + +const MTL::TextureSwizzle MTL_TEXTURE_SWIZZLES[] = { + MTL::TextureSwizzleRed, + MTL::TextureSwizzleGreen, + MTL::TextureSwizzleBlue, + MTL::TextureSwizzleAlpha, + MTL::TextureSwizzleZero, + MTL::TextureSwizzleOne, + MTL::TextureSwizzleZero, + MTL::TextureSwizzleZero +}; + +MTL::TextureSwizzle GetMtlTextureSwizzle(uint32 swizzle) +{ + cemu_assert_debug(swizzle < std::size(MTL_TEXTURE_SWIZZLES)); + return MTL_TEXTURE_SWIZZLES[swizzle]; +} + +const MTL::StencilOperation MTL_STENCIL_OPERATIONS[8] = { + MTL::StencilOperationKeep, + MTL::StencilOperationZero, + MTL::StencilOperationReplace, + MTL::StencilOperationIncrementClamp, + MTL::StencilOperationDecrementClamp, + MTL::StencilOperationInvert, + MTL::StencilOperationIncrementWrap, + MTL::StencilOperationDecrementWrap +}; + +MTL::StencilOperation GetMtlStencilOp(Latte::LATTE_DB_DEPTH_CONTROL::E_STENCILACTION action) +{ + cemu_assert_debug((uint32)action < std::size(MTL_STENCIL_OPERATIONS)); + return MTL_STENCIL_OPERATIONS[(uint32)action]; +} + +MTL::ColorWriteMask GetMtlColorWriteMask(uint8 mask) +{ + MTL::ColorWriteMask mtlMask = MTL::ColorWriteMaskNone; + if (mask & 0x1) mtlMask |= MTL::ColorWriteMaskRed; + 
if (mask & 0x2) mtlMask |= MTL::ColorWriteMaskGreen; + if (mask & 0x4) mtlMask |= MTL::ColorWriteMaskBlue; + if (mask & 0x8) mtlMask |= MTL::ColorWriteMaskAlpha; + + return mtlMask; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h new file mode 100644 index 0000000000..ef25ca5d51 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -0,0 +1,86 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" + +#include "Cafe/HW/Latte/ISA/LatteReg.h" +#include "Cafe/HW/Latte/Core/LatteConst.h" +//#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "Common/precompiled.h" +#include "HW/Latte/Core/LatteTextureLoader.h" + +struct Uvec2 { + uint32 x; + uint32 y; +}; + +enum class MetalDataType +{ + NONE, + INT, + UINT, + FLOAT, +}; + +struct MetalPixelFormatInfo { + MTL::PixelFormat pixelFormat; + MetalDataType dataType; + size_t bytesPerBlock; + Uvec2 blockTexelSize = {1, 1}; + bool hasStencil = false; + TextureDecoder* textureDecoder = nullptr; +}; + +void CheckForPixelFormatSupport(const MetalPixelFormatSupport& support); + +const MetalPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, bool isDepth); + +MTL::PixelFormat GetMtlPixelFormat(Latte::E_GX2SURFFMT format, bool isDepth); + +inline MetalDataType GetColorBufferDataType(const uint32 index, const LatteContextRegister& lcr) +{ + auto format = LatteMRT::GetColorBufferFormat(index, lcr); + return GetMtlPixelFormatInfo(format, false).dataType; +} + +inline const char* GetDataTypeStr(MetalDataType dataType) +{ + switch (dataType) + { + case MetalDataType::INT: + return "int4"; + case MetalDataType::UINT: + return "uint4"; + case MetalDataType::FLOAT: + return "float4"; + default: + cemu_assert_suspicious(); + return "INVALID"; + } +} + +size_t GetMtlTextureBytesPerRow(Latte::E_GX2SURFFMT format, bool isDepth, uint32 width); + +size_t 
GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, bool isDepth, uint32 height, size_t bytesPerRow); + +MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode primitiveMode); + +MTL::VertexFormat GetMtlVertexFormat(uint8 format); + +uint32 GetMtlVertexFormatSize(uint8 format); + +MTL::IndexType GetMtlIndexType(Renderer::INDEX_TYPE indexType); + +MTL::BlendOperation GetMtlBlendOp(Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC combineFunc); + +MTL::BlendFactor GetMtlBlendFactor(Latte::LATTE_CB_BLENDN_CONTROL::E_BLENDFACTOR factor); + +MTL::CompareFunction GetMtlCompareFunc(Latte::E_COMPAREFUNC func); + +MTL::SamplerAddressMode GetMtlSamplerAddressMode(Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_CLAMP clamp); + +MTL::TextureSwizzle GetMtlTextureSwizzle(uint32 swizzle); + +MTL::StencilOperation GetMtlStencilOp(Latte::LATTE_DB_DEPTH_CONTROL::E_STENCILACTION action); + +MTL::ColorWriteMask GetMtlColorWriteMask(uint8 mask); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.cpp new file mode 100644 index 0000000000..88a2dface7 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.cpp @@ -0,0 +1,48 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" +#include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" + +MetalAttachmentsInfo::MetalAttachmentsInfo(class CachedFBOMtl* fbo) +{ + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + const auto& colorBuffer = fbo->colorBuffer[i]; + auto texture = static_cast(colorBuffer.texture); + if (!texture) + continue; + + colorFormats[i] = texture->format; + } + + // Depth stencil attachment + if (fbo->depthBuffer.texture) + { + auto texture = static_cast(fbo->depthBuffer.texture); + depthFormat = texture->format; + hasStencil = fbo->depthBuffer.hasStencil; + } +} + 
+// Builds the attachment description from GPU register state rather than from an FBO.
+// A color format is recorded only for targets whose bit is set in the active color-buffer
+// mask; depth format and stencil presence are recorded only when the depth buffer is active.
+// Every other member keeps its in-class default.
+MetalAttachmentsInfo::MetalAttachmentsInfo(const LatteContextRegister& lcr, const LatteDecompilerShader* pixelShader)
+{
+    uint8 cbMask = LatteMRT::GetActiveColorBufferMask(pixelShader, lcr);
+    bool dbMask = LatteMRT::GetActiveDepthBufferMask(lcr);
+
+    // Color attachments
+    // Bounded by the size of colorFormats (same constant the FBO constructor uses) instead of a
+    // literal 8, so the loop cannot write past the array if the constant ever differs.
+    for (int i = 0; i < LATTE_NUM_COLOR_TARGET; ++i)
+    {
+        if ((cbMask & (1 << i)) == 0)
+            continue;
+
+        colorFormats[i] = LatteMRT::GetColorBufferFormat(i, lcr);
+    }
+
+    // Depth stencil attachment
+    if (dbMask)
+    {
+        Latte::E_GX2SURFFMT format = LatteMRT::GetDepthBufferFormat(lcr);
+        depthFormat = format;
+        hasStencil = GetMtlPixelFormatInfo(format, true).hasStencil;
+    }
+}
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h
new file mode 100644
index 0000000000..c8ebe7c115
--- /dev/null
+++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h"
+
+// Plain description of a render target setup: one Latte format per color target plus the
+// depth format and whether a stencil aspect is present.
+class MetalAttachmentsInfo
+{
+public:
+    MetalAttachmentsInfo() = default;
+    // Takes the formats from the textures bound to an existing FBO.
+    MetalAttachmentsInfo(class CachedFBOMtl* fbo);
+    // Takes the formats from register state and the pixel shader's active outputs.
+    MetalAttachmentsInfo(const LatteContextRegister& lcr, const class LatteDecompilerShader* pixelShader);
+
+    // NOTE(review): the initializer names only element 0; the remaining elements are
+    // value-initialized, which equals INVALID_FORMAT only if that enumerator is 0 - confirm.
+    Latte::E_GX2SURFFMT colorFormats[LATTE_NUM_COLOR_TARGET] = {Latte::E_GX2SURFFMT::INVALID_FORMAT};
+    Latte::E_GX2SURFFMT depthFormat = Latte::E_GX2SURFFMT::INVALID_FORMAT;
+    bool hasStencil = false;
+};
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp
new file mode 100644
index 0000000000..05d169b309
--- /dev/null
+++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp
@@ -0,0 +1,217 @@
+#include "Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h"
+
+// Releases every Metal buffer that allocateNewChunk() stored in m_chunkBuffers.
+MetalBufferChunkedHeap::~MetalBufferChunkedHeap()
+{
+    for (auto& chunk : m_chunkBuffers)
+        chunk->release();
+}
+
+// Creates the backing Metal buffer for chunk `chunkIndex` and appends it to m_chunkBuffers.
+// The buffer is at least m_minimumBufferAllocationSize bytes, or `minimumAllocationSize`
+// if that is larger. Returns the size that was actually allocated.
+uint32 MetalBufferChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize)
+{
+    // explicit <size_t> so operands of different integer widths still resolve to one overload
+    size_t allocationSize = std::max<size_t>(m_minimumBufferAllocationSize, minimumAllocationSize);
+    MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(allocationSize, m_options);
+    cemu_assert_debug(buffer);
+    // chunks must be created in order: the new buffer's position in the vector is its chunk index
+    cemu_assert_debug(m_chunkBuffers.size() == chunkIndex);
+    m_chunkBuffers.emplace_back(buffer);
+
+    // the interface reports the size as uint32; make the narrowing from size_t explicit
+    return static_cast<uint32>(allocationSize);
+}
+
+// Records `offset` as a sync point tied to the renderer's current command buffer.
+// At most one sync point is queued per command buffer and upload buffer: repeated calls
+// while the same command buffer is current return without queuing anything.
+void MetalSynchronizedRingAllocator::addUploadBufferSyncPoint(AllocatorBuffer_t& buffer, uint32 offset)
+{
+    auto commandBuffer = m_mtlr->GetCurrentCommandBuffer();
+    if (commandBuffer == buffer.lastSyncpointCommandBuffer)
+        return;
+    buffer.lastSyncpointCommandBuffer = commandBuffer;
+    buffer.queue_syncPoints.emplace(commandBuffer, offset);
+}
+
+// Appends one more upload buffer to m_buffers, large enough for `sizeRequiredForAlloc` bytes.
+void MetalSynchronizedRingAllocator::allocateAdditionalUploadBuffer(uint32 sizeRequiredForAlloc)
+{
+    // calculate buffer size, should be a multiple of bufferAllocSize that is at least as large as sizeRequiredForAlloc
+    uint32 bufferAllocSize = m_minimumBufferAllocSize;
+    while (bufferAllocSize < sizeRequiredForAlloc)
+        bufferAllocSize += m_minimumBufferAllocSize;
+
+    AllocatorBuffer_t newBuffer{};
+    newBuffer.writeIndex = 0;
+    newBuffer.basePtr = nullptr;
+    newBuffer.mtlBuffer = m_mtlr->GetDevice()->newBuffer(bufferAllocSize, m_options);
+    // same debug-time check allocateNewChunk() performs on its buffer
+    cemu_assert_debug(newBuffer.mtlBuffer);
+    newBuffer.basePtr = (uint8*)newBuffer.mtlBuffer->contents();
+    newBuffer.size = bufferAllocSize;
+    newBuffer.index = (uint32)m_buffers.size();
+    m_buffers.push_back(newBuffer);
+}
+
+// Reserves `size` bytes of upload memory and returns the buffer, CPU pointer and offset.
+//
+// The alignment is raised to at least 128 and the size is rounded up to a multiple of 128.
+// Each existing buffer is tried in order; a buffer is skipped when the reservation would
+// run into its oldest queued sync point. If no buffer fits, a new one is allocated and the
+// request is retried.
+MetalSynchronizedRingAllocator::AllocatorReservation_t MetalSynchronizedRingAllocator::AllocateBufferMemory(uint32 size, uint32 alignment)
+{
+    if (alignment < 128)
+        alignment = 128;
+    size = (size + 127) & ~127;
+
+    for (auto& itr : m_buffers)
+    {
+        // align pointer
+        uint32 alignmentPadding = (alignment - (itr.writeIndex % alignment)) % alignment;
+        uint32 distanceToSyncPoint;
+        if (!itr.queue_syncPoints.empty())
+        {
+            // a sync point behind the write index does not limit writes up to the end of the buffer
+            if (itr.queue_syncPoints.front().offset < itr.writeIndex)
+                distanceToSyncPoint = 0xFFFFFFFF;
+            else
+                distanceToSyncPoint = itr.queue_syncPoints.front().offset - itr.writeIndex;
+        }
+        else
+            distanceToSyncPoint = 0xFFFFFFFF;
+        uint32 spaceNeeded = alignmentPadding + size;
+        if (spaceNeeded > distanceToSyncPoint)
+            continue; // not enough space in current buffer
+        if ((itr.writeIndex + spaceNeeded) > itr.size)
+        {
+            // wrap-around
+            // restarting at offset 0 needs no padding
+            spaceNeeded = size;
+            alignmentPadding = 0;
+            // check if there is enough space in current buffer after wrap-around
+            if (!itr.queue_syncPoints.empty())
+            {
+                distanceToSyncPoint = itr.queue_syncPoints.front().offset - 0;
+                if (spaceNeeded > distanceToSyncPoint)
+                    continue;
+            }
+            else if (spaceNeeded > itr.size)
+                continue;
+            itr.writeIndex = 0;
+        }
+        // the sync point is taken at the write index before padding is applied
+        addUploadBufferSyncPoint(itr, itr.writeIndex);
+        itr.writeIndex += alignmentPadding;
+        uint32 offset = itr.writeIndex;
+        itr.writeIndex += size;
+        // the buffer was just used, so restart the idle count CleanupBuffer() relies on
+        itr.cleanupCounter = 0;
+        MetalSynchronizedRingAllocator::AllocatorReservation_t res;
+        res.mtlBuffer = itr.mtlBuffer;
+        res.memPtr = itr.basePtr + offset;
+        res.bufferOffset = offset;
+        res.size = size;
+        res.bufferIndex = itr.index;
+
+        return res;
+    }
+
+    // allocate new buffer
+    allocateAdditionalUploadBuffer(size);
+
+    // retry; the buffer added above is sized to hold `size`
+    return AllocateBufferMemory(size, alignment);
+}
+
+// When RequiresFlush() is true, tells Metal that the CPU modified the reserved byte range
+// of the buffer; otherwise does nothing.
+void MetalSynchronizedRingAllocator::FlushReservation(AllocatorReservation_t& uploadReservation)
+{
+    if (RequiresFlush())
+    {
+        uploadReservation.mtlBuffer->didModifyRange(NS::Range(uploadReservation.bufferOffset, uploadReservation.size));
+    }
+}
+
+// Retires sync points that belong to `latestFinishedCommandBuffer` and frees idle memory.
+//
+// For every buffer, leading sync points recorded for that command buffer are popped. A buffer
+// with no sync points left has its cleanupCounter incremented. If at least two buffers exist
+// and the last one's counter has reached 1000, that buffer is released and removed.
+void MetalSynchronizedRingAllocator::CleanupBuffer(MTL::CommandBuffer* latestFinishedCommandBuffer)
+{
+    for (auto& itr : m_buffers)
+    {
+        while (!itr.queue_syncPoints.empty() && latestFinishedCommandBuffer == itr.queue_syncPoints.front().commandBuffer)
+        {
+            itr.queue_syncPoints.pop();
+        }
+        if (itr.queue_syncPoints.empty())
+            itr.cleanupCounter++;
+    }
+
+    // check if last buffer is available for deletion
+    if (m_buffers.size() >= 2)
+    {
+        auto& lastBuffer = m_buffers.back();
+        if (lastBuffer.cleanupCounter >= 1000)
+        {
+            // release buffer
+            lastBuffer.mtlBuffer->release();
+            m_buffers.pop_back();
+        }
+    }
+}
+ +MTL::Buffer* MetalSynchronizedRingAllocator::GetBufferByIndex(uint32 index) const +{ + return m_buffers[index].mtlBuffer; +} + +void MetalSynchronizedRingAllocator::GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const +{ + numBuffers = (uint32)m_buffers.size(); + totalBufferSize = 0; + freeBufferSize = 0; + for (auto& itr : m_buffers) + { + totalBufferSize += itr.size; + // calculate free space in buffer + uint32 distanceToSyncPoint; + if (!itr.queue_syncPoints.empty()) + { + if (itr.queue_syncPoints.front().offset < itr.writeIndex) + distanceToSyncPoint = (itr.size - itr.writeIndex) + itr.queue_syncPoints.front().offset; // size with wrap-around + else + distanceToSyncPoint = itr.queue_syncPoints.front().offset - itr.writeIndex; + } + else + distanceToSyncPoint = itr.size; + freeBufferSize += distanceToSyncPoint; + } +} + +/* MetalSynchronizedHeapAllocator */ + +MetalSynchronizedHeapAllocator::AllocatorReservation* MetalSynchronizedHeapAllocator::AllocateBufferMemory(uint32 size, uint32 alignment) +{ + CHAddr addr = m_chunkedHeap.alloc(size, alignment); + m_activeAllocations.emplace_back(addr); + AllocatorReservation* res = m_poolAllocatorReservation.allocObj(); + res->bufferIndex = addr.chunkIndex; + res->bufferOffset = addr.offset; + res->size = size; + res->mtlBuffer = m_chunkedHeap.GetBufferByIndex(addr.chunkIndex); + res->memPtr = m_chunkedHeap.GetChunkPtr(addr.chunkIndex) + addr.offset; + + return res; +} + +void MetalSynchronizedHeapAllocator::FreeReservation(AllocatorReservation* uploadReservation) +{ + // put the allocation on a delayed release queue for the current command buffer + MTL::CommandBuffer* currentCommandBuffer = m_mtlr->GetCurrentCommandBuffer(); + auto it = std::find_if(m_activeAllocations.begin(), m_activeAllocations.end(), [&uploadReservation](const TrackedAllocation& allocation) { return allocation.allocation.chunkIndex == uploadReservation->bufferIndex && allocation.allocation.offset == 
uploadReservation->bufferOffset; }); + cemu_assert_debug(it != m_activeAllocations.end()); + m_releaseQueue[currentCommandBuffer].emplace_back(it->allocation); + m_activeAllocations.erase(it); + m_poolAllocatorReservation.freeObj(uploadReservation); +} + +void MetalSynchronizedHeapAllocator::FlushReservation(AllocatorReservation* uploadReservation) +{ + if (m_chunkedHeap.RequiresFlush()) + { + uploadReservation->mtlBuffer->didModifyRange(NS::Range(uploadReservation->bufferOffset, uploadReservation->size)); + } +} + +void MetalSynchronizedHeapAllocator::CleanupBuffer(MTL::CommandBuffer* latestFinishedCommandBuffer) +{ + auto it = m_releaseQueue.find(latestFinishedCommandBuffer); + if (it == m_releaseQueue.end()) + return; + + // release allocations + for (auto& addr : it->second) + m_chunkedHeap.free(addr); + m_releaseQueue.erase(it); +} + +void MetalSynchronizedHeapAllocator::GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const +{ + m_chunkedHeap.GetStats(numBuffers, totalBufferSize, freeBufferSize); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h new file mode 100644 index 0000000000..2a62de19cb --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -0,0 +1,163 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Metal/MTLResource.hpp" +#include "util/ChunkedHeap/ChunkedHeap.h" +#include "util/helpers/MemoryPool.h" + +#include + +inline MTL::ResourceOptions GetResourceOptions(MTL::ResourceOptions options) +{ + if (options & MTL::ResourceStorageModeShared || options & MTL::ResourceStorageModeManaged) + options |= MTL::ResourceCPUCacheModeWriteCombined; + + return options; +} + +class MetalBufferChunkedHeap : private ChunkedHeap<> +{ + public: + MetalBufferChunkedHeap(const class MetalRenderer* mtlRenderer, MTL::ResourceOptions options, size_t minimumBufferAllocationSize) : m_mtlr(mtlRenderer), 
m_options(GetResourceOptions(options)), m_minimumBufferAllocationSize(minimumBufferAllocationSize) { }; + ~MetalBufferChunkedHeap(); + + using ChunkedHeap::alloc; + using ChunkedHeap::free; + + uint8* GetChunkPtr(uint32 index) const + { + if (index >= m_chunkBuffers.size()) + return nullptr; + + return (uint8*)m_chunkBuffers[index]->contents(); + } + + MTL::Buffer* GetBufferByIndex(uint32 index) const + { + cemu_assert_debug(index < m_chunkBuffers.size()); + + return m_chunkBuffers[index]; + } + + bool RequiresFlush() const + { + return m_options & MTL::ResourceStorageModeManaged; + } + + void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const + { + numBuffers = m_chunkBuffers.size(); + totalBufferSize = m_numHeapBytes; + freeBufferSize = m_numHeapBytes - m_numAllocatedBytes; + } + + private: + uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override; + + const class MetalRenderer* m_mtlr; + + MTL::ResourceOptions m_options; + size_t m_minimumBufferAllocationSize; + + std::vector m_chunkBuffers; +}; + +// a circular ring-buffer which tracks and releases memory per command-buffer +class MetalSynchronizedRingAllocator +{ +public: + MetalSynchronizedRingAllocator(class MetalRenderer* mtlRenderer, MTL::ResourceOptions options, uint32 minimumBufferAllocSize) : m_mtlr(mtlRenderer), m_options(GetResourceOptions(options)), m_minimumBufferAllocSize(minimumBufferAllocSize) {}; + MetalSynchronizedRingAllocator(const MetalSynchronizedRingAllocator&) = delete; // disallow copy + + struct BufferSyncPoint_t + { + // todo - modularize sync point + MTL::CommandBuffer* commandBuffer; + uint32 offset; + + BufferSyncPoint_t(MTL::CommandBuffer* _commandBuffer, uint32 _offset) : commandBuffer(_commandBuffer), offset(_offset) {}; + }; + + struct AllocatorBuffer_t + { + MTL::Buffer* mtlBuffer; + uint8* basePtr; + uint32 size; + uint32 writeIndex; + std::queue queue_syncPoints; + MTL::CommandBuffer* lastSyncpointCommandBuffer{ 
nullptr }; + uint32 index; + uint32 cleanupCounter{ 0 }; // increased by one every time CleanupBuffer() is called if there is no sync point. If it reaches 300 then the buffer is released + }; + + struct AllocatorReservation_t + { + MTL::Buffer* mtlBuffer; + uint8* memPtr; + uint32 bufferOffset; + uint32 size; + uint32 bufferIndex; + }; + + AllocatorReservation_t AllocateBufferMemory(uint32 size, uint32 alignment); + void FlushReservation(AllocatorReservation_t& uploadReservation); + void CleanupBuffer(MTL::CommandBuffer* latestFinishedCommandBuffer); + MTL::Buffer* GetBufferByIndex(uint32 index) const; + + bool RequiresFlush() const + { + return m_options & MTL::ResourceStorageModeManaged; + } + + void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const; + +private: + void allocateAdditionalUploadBuffer(uint32 sizeRequiredForAlloc); + void addUploadBufferSyncPoint(AllocatorBuffer_t& buffer, uint32 offset); + + const class MetalRenderer* m_mtlr; + + MTL::ResourceOptions m_options; + const uint32 m_minimumBufferAllocSize; + + std::vector m_buffers; +}; + +// heap style allocator with released memory being freed after the current command buffer finishes +class MetalSynchronizedHeapAllocator +{ + struct TrackedAllocation + { + TrackedAllocation(CHAddr allocation) : allocation(allocation) {}; + CHAddr allocation; + }; + + public: + MetalSynchronizedHeapAllocator(class MetalRenderer* mtlRenderer, MTL::ResourceOptions options, size_t minimumBufferAllocSize) : m_mtlr(mtlRenderer), m_chunkedHeap(m_mtlr, options, minimumBufferAllocSize) {} + MetalSynchronizedHeapAllocator(const MetalSynchronizedHeapAllocator&) = delete; // disallow copy + + struct AllocatorReservation + { + MTL::Buffer* mtlBuffer; + uint8* memPtr; + uint32 bufferOffset; + uint32 size; + uint32 bufferIndex; + }; + + AllocatorReservation* AllocateBufferMemory(uint32 size, uint32 alignment); + void FreeReservation(AllocatorReservation* uploadReservation); + void 
FlushReservation(AllocatorReservation* uploadReservation); + + void CleanupBuffer(MTL::CommandBuffer* latestFinishedCommandBuffer); + + void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const; + private: + const class MetalRenderer* m_mtlr; + MetalBufferChunkedHeap m_chunkedHeap; + // allocations + std::vector m_activeAllocations; + MemoryPool m_poolAllocatorReservation{32}; + // release queue + std::unordered_map> m_releaseQueue; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h new file mode 100644 index 0000000000..d16c559760 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -0,0 +1,221 @@ +#pragma once + +#include +#include + +#include "Cafe/HW/Latte/Core/LatteConst.h" + +struct MetalPixelFormatSupport +{ + bool m_supportsR8Unorm_sRGB; + bool m_supportsRG8Unorm_sRGB; + bool m_supportsPacked16BitFormats; + bool m_supportsDepth24Unorm_Stencil8; + + MetalPixelFormatSupport() = default; + MetalPixelFormatSupport(MTL::Device* device) + { + m_supportsR8Unorm_sRGB = device->supportsFamily(MTL::GPUFamilyApple1); + m_supportsRG8Unorm_sRGB = device->supportsFamily(MTL::GPUFamilyApple1); + m_supportsPacked16BitFormats = device->supportsFamily(MTL::GPUFamilyApple1); + m_supportsDepth24Unorm_Stencil8 = device->depth24Stencil8PixelFormatSupported(); + } +}; + +// TODO: don't define a new struct for this +struct MetalQueryRange +{ + uint32 begin; + uint32 end; +}; + +#define MAX_MTL_BUFFERS 31 +// Buffer indices 28-30 are reserved for the helper shaders +#define MTL_RESERVED_BUFFERS 3 +#define MAX_MTL_VERTEX_BUFFERS (MAX_MTL_BUFFERS - MTL_RESERVED_BUFFERS) +#define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_VERTEX_BUFFERS - index - 1) + +#define MAX_MTL_TEXTURES 31 +#define MAX_MTL_SAMPLERS 16 + +#define GET_HELPER_BUFFER_BINDING(index) (28 + index) +#define GET_HELPER_TEXTURE_BINDING(index) (29 + index) +#define GET_HELPER_SAMPLER_BINDING(index) (14 + index) + 
+constexpr uint32 INVALID_UINT32 = std::numeric_limits::max(); +constexpr size_t INVALID_OFFSET = std::numeric_limits::max(); + +inline size_t Align(size_t size, size_t alignment) +{ + return (size + alignment - 1) & ~(alignment - 1); +} + +__attribute__((unused)) static inline void ETStackAutoRelease(void* object) +{ + (*(NS::Object**)object)->release(); +} + +#define NS_STACK_SCOPED __attribute__((cleanup(ETStackAutoRelease))) __attribute__((unused)) + +// Cast from const char* to NS::String* +inline NS::String* ToNSString(const char* str) +{ + return NS::String::string(str, NS::ASCIIStringEncoding); +} + +// Cast from std::string to NS::String* +inline NS::String* ToNSString(const std::string& str) +{ + return ToNSString(str.c_str()); +} + +// Cast from const char* to NS::URL* +inline NS::URL* ToNSURL(const char* str) +{ + return NS::URL::fileURLWithPath(ToNSString(str)); +} + +// Cast from std::string to NS::URL* +inline NS::URL* ToNSURL(const std::string& str) +{ + return ToNSURL(str.c_str()); +} + +inline NS::String* GetLabel(const std::string& label, const void* identifier) +{ + return ToNSString(label + " (" + std::to_string(reinterpret_cast(identifier)) + ")"); +} + +constexpr MTL::RenderStages ALL_MTL_RENDER_STAGES = MTL::RenderStageVertex | MTL::RenderStageObject | MTL::RenderStageMesh | MTL::RenderStageFragment; + +inline bool IsValidDepthTextureType(Latte::E_DIM dim) +{ + return (dim == Latte::E_DIM::DIM_2D || dim == Latte::E_DIM::DIM_2D_MSAA || dim == Latte::E_DIM::DIM_2D_ARRAY || dim == Latte::E_DIM::DIM_2D_ARRAY_MSAA || dim == Latte::E_DIM::DIM_CUBEMAP); +} + +inline bool CommandBufferCompleted(MTL::CommandBuffer* commandBuffer) +{ + auto status = commandBuffer->status(); + return (status == MTL::CommandBufferStatusCompleted || status == MTL::CommandBufferStatusError); +} + +inline bool FormatIsRenderable(Latte::E_GX2SURFFMT format) +{ + return !Latte::IsCompressedFormat(format); +} + +template +inline bool executeCommand(fmt::format_string fmt, 
T&&... args) { + std::string command = fmt::format(fmt, std::forward(args)...); + int res = system(command.c_str()); + if (res != 0) + { + cemuLog_log(LogType::Force, "command \"{}\" failed with exit code {}", command, res); + return false; + } + + return true; +} + +/* +class MemoryMappedFile +{ +public: + MemoryMappedFile(const std::string& filePath) + { + // Open the file + m_fd = open(filePath.c_str(), O_RDONLY); + if (m_fd == -1) { + cemuLog_log(LogType::Force, "failed to open file: {}", filePath); + return; + } + + // Get the file size + // Use a loop to handle the case where the file size is 0 (more of a safety net) + struct stat fileStat; + while (true) + { + if (fstat(m_fd, &fileStat) == -1) + { + close(m_fd); + cemuLog_log(LogType::Force, "failed to get file size: {}", filePath); + return; + } + m_fileSize = fileStat.st_size; + + if (m_fileSize == 0) + { + cemuLog_logOnce(LogType::Force, "file size is 0: {}", filePath); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + continue; + } + + break; + } + + // Memory map the file + m_data = mmap(nullptr, m_fileSize, PROT_READ, MAP_PRIVATE, m_fd, 0); + if (m_data == MAP_FAILED) + { + close(m_fd); + cemuLog_log(LogType::Force, "failed to memory map file: {}", filePath); + return; + } + } + + ~MemoryMappedFile() + { + if (m_data && m_data != MAP_FAILED) + munmap(m_data, m_fileSize); + + if (m_fd != -1) + close(m_fd); + } + + uint8* data() const { return static_cast(m_data); } + size_t size() const { return m_fileSize; } + +private: + int m_fd = -1; + void* m_data = nullptr; + size_t m_fileSize = 0; +}; +*/ + +inline uint32 GetVerticesPerPrimitive(LattePrimitiveMode primitiveMode) +{ + switch (primitiveMode) + { + case LattePrimitiveMode::POINTS: + return 1; + case LattePrimitiveMode::LINES: + return 2; + case LattePrimitiveMode::LINE_STRIP: + // Same as line, but requires connection + return 2; + case LattePrimitiveMode::TRIANGLES: + return 3; + case LattePrimitiveMode::RECTS: + return 3; + default: 
+ cemuLog_log(LogType::Force, "Unimplemented primitive type {}", primitiveMode); + return 0; + } +} + +inline bool PrimitiveRequiresConnection(LattePrimitiveMode primitiveMode) +{ + if (primitiveMode == LattePrimitiveMode::LINE_STRIP) + return true; + else + return false; +} + +inline bool UseRectEmulation(const LatteContextRegister& lcr) { + const LattePrimitiveMode primitiveMode = lcr.VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE(); + return (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); +} + +inline bool UseGeometryShader(const LatteContextRegister& lcr, bool hasGeometryShader) { + return hasGeometryShader || UseRectEmulation(lcr); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCppImpl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalCppImpl.cpp new file mode 100644 index 0000000000..13cd9dd67b --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCppImpl.cpp @@ -0,0 +1,6 @@ +#define NS_PRIVATE_IMPLEMENTATION +#define CA_PRIVATE_IMPLEMENTATION +#define MTL_PRIVATE_IMPLEMENTATION +#include +#include +#include diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp new file mode 100644 index 0000000000..1fe680bb45 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp @@ -0,0 +1,119 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "HW/Latte/ISA/RegDefines.h" +#include "HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Metal/MTLDepthStencil.hpp" + +MetalDepthStencilCache::~MetalDepthStencilCache() +{ + for (auto& pair : m_depthStencilCache) + { + pair.second->release(); + } + m_depthStencilCache.clear(); +} + +MTL::DepthStencilState* MetalDepthStencilCache::GetDepthStencilState(const LatteContextRegister& lcr) +{ + uint64 stateHash = CalculateDepthStencilHash(lcr); + auto& depthStencilState = m_depthStencilCache[stateHash]; + if (depthStencilState) + 
return depthStencilState; + + // Depth stencil state + bool depthEnable = lcr.DB_DEPTH_CONTROL.get_Z_ENABLE(); + auto depthFunc = lcr.DB_DEPTH_CONTROL.get_Z_FUNC(); + bool depthWriteEnable = lcr.DB_DEPTH_CONTROL.get_Z_WRITE_ENABLE(); + + NS_STACK_SCOPED MTL::DepthStencilDescriptor* desc = MTL::DepthStencilDescriptor::alloc()->init(); + if (depthEnable) + { + desc->setDepthWriteEnabled(depthWriteEnable); + desc->setDepthCompareFunction(GetMtlCompareFunc(depthFunc)); + } + + // Stencil state + bool stencilEnable = lcr.DB_DEPTH_CONTROL.get_STENCIL_ENABLE(); + if (stencilEnable) + { + // get stencil control parameters + bool backStencilEnable = lcr.DB_DEPTH_CONTROL.get_BACK_STENCIL_ENABLE(); + auto frontStencilFunc = lcr.DB_DEPTH_CONTROL.get_STENCIL_FUNC_F(); + auto frontStencilZPass = lcr.DB_DEPTH_CONTROL.get_STENCIL_ZPASS_F(); + auto frontStencilZFail = lcr.DB_DEPTH_CONTROL.get_STENCIL_ZFAIL_F(); + auto frontStencilFail = lcr.DB_DEPTH_CONTROL.get_STENCIL_FAIL_F(); + auto backStencilFunc = lcr.DB_DEPTH_CONTROL.get_STENCIL_FUNC_B(); + auto backStencilZPass = lcr.DB_DEPTH_CONTROL.get_STENCIL_ZPASS_B(); + auto backStencilZFail = lcr.DB_DEPTH_CONTROL.get_STENCIL_ZFAIL_B(); + auto backStencilFail = lcr.DB_DEPTH_CONTROL.get_STENCIL_FAIL_B(); + // get stencil control parameters + uint32 stencilCompareMaskFront = lcr.DB_STENCILREFMASK.get_STENCILMASK_F(); + uint32 stencilWriteMaskFront = lcr.DB_STENCILREFMASK.get_STENCILWRITEMASK_F(); + uint32 stencilCompareMaskBack = lcr.DB_STENCILREFMASK_BF.get_STENCILMASK_B(); + uint32 stencilWriteMaskBack = lcr.DB_STENCILREFMASK_BF.get_STENCILWRITEMASK_B(); + + NS_STACK_SCOPED MTL::StencilDescriptor* frontStencil = MTL::StencilDescriptor::alloc()->init(); + frontStencil->setReadMask(stencilCompareMaskFront); + frontStencil->setWriteMask(stencilWriteMaskFront); + frontStencil->setStencilCompareFunction(GetMtlCompareFunc(frontStencilFunc)); + frontStencil->setDepthFailureOperation(GetMtlStencilOp(frontStencilZFail)); + 
frontStencil->setStencilFailureOperation(GetMtlStencilOp(frontStencilFail)); + frontStencil->setDepthStencilPassOperation(GetMtlStencilOp(frontStencilZPass)); + desc->setFrontFaceStencil(frontStencil); + + NS_STACK_SCOPED MTL::StencilDescriptor* backStencil = MTL::StencilDescriptor::alloc()->init(); + if (backStencilEnable) + { + backStencil->setReadMask(stencilCompareMaskBack); + backStencil->setWriteMask(stencilWriteMaskBack); + backStencil->setStencilCompareFunction(GetMtlCompareFunc(backStencilFunc)); + backStencil->setDepthFailureOperation(GetMtlStencilOp(backStencilZFail)); + backStencil->setStencilFailureOperation(GetMtlStencilOp(backStencilFail)); + backStencil->setDepthStencilPassOperation(GetMtlStencilOp(backStencilZPass)); + } + else + { + backStencil->setReadMask(stencilCompareMaskFront); + backStencil->setWriteMask(stencilWriteMaskFront); + backStencil->setStencilCompareFunction(GetMtlCompareFunc(frontStencilFunc)); + backStencil->setDepthFailureOperation(GetMtlStencilOp(frontStencilZFail)); + backStencil->setStencilFailureOperation(GetMtlStencilOp(frontStencilFail)); + backStencil->setDepthStencilPassOperation(GetMtlStencilOp(frontStencilZPass)); + } + desc->setBackFaceStencil(backStencil); + } + + depthStencilState = m_mtlr->GetDevice()->newDepthStencilState(desc); + + return depthStencilState; +} + +uint64 MetalDepthStencilCache::CalculateDepthStencilHash(const LatteContextRegister& lcr) +{ + uint32* ctxRegister = lcr.GetRawView(); + + // Hash + uint64 stateHash = 0; + uint32 depthControl = ctxRegister[Latte::REGADDR::DB_DEPTH_CONTROL]; + bool stencilTestEnable = depthControl & 1; + if (stencilTestEnable) + { + stateHash += ctxRegister[mmDB_STENCILREFMASK]; + stateHash = std::rotl(stateHash, 17); + if(depthControl & (1<<7)) // back stencil enable + { + stateHash += ctxRegister[mmDB_STENCILREFMASK_BF]; + stateHash = std::rotl(stateHash, 13); + } + } + else + { + // zero out stencil related bits (8-31) + depthControl &= 0xFF; + } + + stateHash = 
std::rotl(stateHash, 17); + stateHash += depthControl; + + return stateHash; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h new file mode 100644 index 0000000000..4ce05c286d --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +#include "HW/Latte/ISA/LatteReg.h" + +class MetalDepthStencilCache +{ +public: + MetalDepthStencilCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} + ~MetalDepthStencilCache(); + + MTL::DepthStencilState* GetDepthStencilState(const LatteContextRegister& lcr); + +private: + class MetalRenderer* m_mtlr; + + std::map m_depthStencilCache; + + uint64 CalculateDepthStencilHash(const LatteContextRegister& lcr); +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.h new file mode 100644 index 0000000000..d2b30667ca --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.h @@ -0,0 +1,3 @@ +#pragma once + +void* CreateMetalLayer(void* handle, float& scaleX, float& scaleY); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.mm b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.mm new file mode 100644 index 0000000000..16a7aa676a --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.mm @@ -0,0 +1,22 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalLayer.h" + +#include "Cafe/HW/Latte/Renderer/MetalView.h" + +void* CreateMetalLayer(void* handle, float& scaleX, float& scaleY) +{ + NSView* view = (NSView*)handle; + + MetalView* childView = [[MetalView alloc] initWithFrame:view.bounds]; + childView.autoresizingMask = NSViewWidthSizable | NSViewHeightSizable; + childView.wantsLayer = YES; + + [view addSubview:childView]; + + const NSRect points = [childView frame]; + const NSRect pixels = [childView convertRectToBacking:points]; + + scaleX = (float)(pixels.size.width / points.size.width); + scaleY = 
(float)(pixels.size.height / points.size.height); + + return childView.layer; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp new file mode 100644 index 0000000000..ddc9417723 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp @@ -0,0 +1,46 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalLayer.h" + +#include "gui/interface/WindowSystem.h" + +MetalLayerHandle::MetalLayerHandle(MTL::Device* device, const Vector2i& size, bool mainWindow) +{ + const auto& windowInfo = (mainWindow ? WindowSystem::GetWindowInfo().window_main : WindowSystem::GetWindowInfo().window_pad); + + m_layer = (CA::MetalLayer*)CreateMetalLayer(windowInfo.surface, m_layerScaleX, m_layerScaleY); + m_layer->setDevice(device); + m_layer->setDrawableSize(CGSize{(float)size.x * m_layerScaleX, (float)size.y * m_layerScaleY}); + m_layer->setFramebufferOnly(true); +} + +MetalLayerHandle::~MetalLayerHandle() +{ + if (m_layer) + m_layer->release(); +} + +void MetalLayerHandle::Resize(const Vector2i& size) +{ + m_layer->setDrawableSize(CGSize{(float)size.x * m_layerScaleX, (float)size.y * m_layerScaleY}); +} + +bool MetalLayerHandle::AcquireDrawable() +{ + if (m_drawable) + return true; + + m_drawable = m_layer->nextDrawable(); + if (!m_drawable) + { + cemuLog_log(LogType::Force, "layer {} failed to acquire next drawable", (void*)this); + return false; + } + + return true; +} + +void MetalLayerHandle::PresentDrawable(MTL::CommandBuffer* commandBuffer) +{ + commandBuffer->presentDrawable(m_drawable); + m_drawable = nullptr; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h new file mode 100644 index 0000000000..014d2d432f --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h @@ -0,0 +1,31 @@ +#pragma once + +#include + +#include 
"Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "util/math/vector2.h" + +class MetalLayerHandle +{ +public: + MetalLayerHandle() = default; + MetalLayerHandle(MTL::Device* device, const Vector2i& size, bool mainWindow); + + ~MetalLayerHandle(); + + void Resize(const Vector2i& size); + + bool AcquireDrawable(); + + void PresentDrawable(MTL::CommandBuffer* commandBuffer); + + CA::MetalLayer* GetLayer() const { return m_layer; } + + CA::MetalDrawable* GetDrawable() const { return m_drawable; } + +private: + CA::MetalLayer* m_layer = nullptr; + float m_layerScaleX, m_layerScaleY; + + CA::MetalDrawable* m_drawable = nullptr; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp new file mode 100644 index 0000000000..d2fac40183 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -0,0 +1,128 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h" + +#include "CafeSystem.h" +#include "Cemu/Logging/CemuLogging.h" +#include "Common/precompiled.h" +#include "HW/MMU/MMU.h" +#include "config/CemuConfig.h" + +MetalMemoryManager::~MetalMemoryManager() +{ + if (m_bufferCache) + { + m_bufferCache->release(); + } +} + +void* MetalMemoryManager::AcquireTextureUploadBuffer(size_t size) +{ + if (m_textureUploadBuffer.size() < size) + { + m_textureUploadBuffer.resize(size); + } + + return m_textureUploadBuffer.data(); +} + +void MetalMemoryManager::ReleaseTextureUploadBuffer(uint8* mem) +{ + cemu_assert_debug(m_textureUploadBuffer.data() == mem); + m_textureUploadBuffer.clear(); +} + +void MetalMemoryManager::InitBufferCache(size_t size) +{ + cemu_assert_debug(!m_bufferCache); + + m_metalBufferCacheMode = g_current_game_profile->GetBufferCacheMode(); + + if (m_metalBufferCacheMode == MetalBufferCacheMode::Auto) + { + // TODO: do this for all unified 
memory systems? + if (m_mtlr->IsAppleGPU()) + { + switch (CafeSystem::GetForegroundTitleId()) + { + // The Legend of Zelda: Wind Waker HD + case 0x0005000010143600: // EUR + case 0x0005000010143500: // USA + case 0x0005000010143400: // JPN + // TODO: use host instead? + m_metalBufferCacheMode = MetalBufferCacheMode::DeviceShared; + break; + default: + m_metalBufferCacheMode = MetalBufferCacheMode::DevicePrivate; + break; + } + } + else + { + m_metalBufferCacheMode = MetalBufferCacheMode::DevicePrivate; + } + } + + // First, try to import the host memory as a buffer + if (m_metalBufferCacheMode == MetalBufferCacheMode::Host) + { + if (m_mtlr->HasUnifiedMemory()) + { + m_importedMemBaseAddress = mmuRange_MEM2.getBase(); + m_hostAllocationSize = mmuRange_MEM2.getSize(); + m_bufferCache = m_mtlr->GetDevice()->newBuffer(memory_getPointerFromVirtualOffset(m_importedMemBaseAddress), m_hostAllocationSize, MTL::ResourceStorageModeShared, nullptr); + if (!m_bufferCache) + { + cemuLog_log(LogType::Force, "Failed to import host memory as a buffer, using device shared mode instead"); + m_metalBufferCacheMode = MetalBufferCacheMode::DeviceShared; + } + } + else + { + cemuLog_log(LogType::Force, "Host buffer cache mode is only available on unified memory systems, using device shared mode instead"); + m_metalBufferCacheMode = MetalBufferCacheMode::DeviceShared; + } + } + + if (!m_bufferCache) + m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, (m_metalBufferCacheMode == MetalBufferCacheMode::DevicePrivate ? 
MTL::ResourceStorageModePrivate : MTL::ResourceStorageModeShared)); + +#ifdef CEMU_DEBUG_ASSERT + m_bufferCache->setLabel(GetLabel("Buffer cache", m_bufferCache)); +#endif +} + +void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, size_t size) +{ + cemu_assert_debug(m_metalBufferCacheMode != MetalBufferCacheMode::Host); + cemu_assert_debug(m_bufferCache); + cemu_assert_debug((offset + size) <= m_bufferCache->length()); + + if (m_metalBufferCacheMode == MetalBufferCacheMode::DevicePrivate) + { + auto blitCommandEncoder = m_mtlr->GetBlitCommandEncoder(); + + auto allocation = m_stagingAllocator.AllocateBufferMemory(size, 1); + memcpy(allocation.memPtr, data, size); + m_stagingAllocator.FlushReservation(allocation); + + blitCommandEncoder->copyFromBuffer(allocation.mtlBuffer, allocation.bufferOffset, m_bufferCache, offset, size); + + //m_mtlr->CopyBufferToBuffer(allocation.mtlBuffer, allocation.bufferOffset, m_bufferCache, offset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); + } + else + { + memcpy((uint8*)m_bufferCache->contents() + offset, data, size); + } +} + +void MetalMemoryManager::CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size) +{ + cemu_assert_debug(m_metalBufferCacheMode != MetalBufferCacheMode::Host); + cemu_assert_debug(m_bufferCache); + + if (m_metalBufferCacheMode == MetalBufferCacheMode::DevicePrivate) + m_mtlr->CopyBufferToBuffer(m_bufferCache, srcOffset, m_bufferCache, dstOffset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); + else + memcpy((uint8*)m_bufferCache->contents() + dstOffset, (uint8*)m_bufferCache->contents() + srcOffset, size); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h new file mode 100644 index 0000000000..9730cc6327 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -0,0 +1,76 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h" + +#include 
"GameProfile/GameProfile.h" + +class MetalMemoryManager +{ +public: + MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_stagingAllocator(m_mtlr, m_mtlr->GetOptimalBufferStorageMode(), 32u * 1024 * 1024), m_indexAllocator(m_mtlr, m_mtlr->GetOptimalBufferStorageMode(), 4u * 1024 * 1024) {} + ~MetalMemoryManager(); + + MetalSynchronizedRingAllocator& GetStagingAllocator() + { + return m_stagingAllocator; + } + + MetalSynchronizedHeapAllocator& GetIndexAllocator() + { + return m_indexAllocator; + } + + MTL::Buffer* GetBufferCache() + { + return m_bufferCache; + } + + void CleanupBuffers(MTL::CommandBuffer* latestFinishedCommandBuffer) + { + m_stagingAllocator.CleanupBuffer(latestFinishedCommandBuffer); + m_indexAllocator.CleanupBuffer(latestFinishedCommandBuffer); + } + + // Texture upload buffer + void* AcquireTextureUploadBuffer(size_t size); + void ReleaseTextureUploadBuffer(uint8* mem); + + // Buffer cache + void InitBufferCache(size_t size); + void UploadToBufferCache(const void* data, size_t offset, size_t size); + void CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size); + + // Getters + bool UseHostMemoryForCache() const + { + return (m_metalBufferCacheMode == MetalBufferCacheMode::Host); + } + + bool NeedsReducedLatency() const + { + return (m_metalBufferCacheMode == MetalBufferCacheMode::DeviceShared || m_metalBufferCacheMode == MetalBufferCacheMode::Host); + } + + MPTR GetImportedMemBaseAddress() const + { + return m_importedMemBaseAddress; + } + + size_t GetHostAllocationSize() const + { + return m_hostAllocationSize; + } + +private: + class MetalRenderer* m_mtlr; + + std::vector m_textureUploadBuffer; + + MetalSynchronizedRingAllocator m_stagingAllocator; + MetalSynchronizedHeapAllocator m_indexAllocator; + + MTL::Buffer* m_bufferCache = nullptr; + MetalBufferCacheMode m_metalBufferCacheMode; + MPTR m_importedMemBaseAddress; + size_t m_hostAllocationSize = 0; +}; diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp new file mode 100644 index 0000000000..48cca54faa --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp @@ -0,0 +1,37 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" + +MetalOutputShaderCache::~MetalOutputShaderCache() +{ + for (uint8 i = 0; i < METAL_OUTPUT_SHADER_CACHE_SIZE; i++) + { + if (m_cache[i]) + m_cache[i]->release(); + } +} + +MTL::RenderPipelineState* MetalOutputShaderCache::GetPipeline(RendererOutputShader* shader, uint8 shaderIndex, bool usesSRGB) +{ + uint8 cacheIndex = (usesSRGB ? METAL_SHADER_TYPE_COUNT : 0) + shaderIndex; + auto& renderPipelineState = m_cache[cacheIndex]; + if (renderPipelineState) + return renderPipelineState; + + // Create a new render pipeline state + auto vertexShaderMtl = static_cast(shader->GetVertexShader())->GetFunction(); + auto fragmentShaderMtl = static_cast(shader->GetFragmentShader())->GetFunction(); + + NS_STACK_SCOPED auto renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); + renderPipelineDescriptor->setVertexFunction(vertexShaderMtl); + renderPipelineDescriptor->setFragmentFunction(fragmentShaderMtl); + renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(usesSRGB ? 
MTL::PixelFormatBGRA8Unorm_sRGB : MTL::PixelFormatBGRA8Unorm); + + NS::Error* error = nullptr; + renderPipelineState = m_mtlr->GetDevice()->newRenderPipelineState(renderPipelineDescriptor, &error); + if (error) + { + cemuLog_log(LogType::Force, "error creating output render pipeline state: {}", error->localizedDescription()->utf8String()); + } + + return renderPipelineState; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h new file mode 100644 index 0000000000..85b9e8b243 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h @@ -0,0 +1,20 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" + +constexpr uint8 METAL_SHADER_TYPE_COUNT = 6; +constexpr uint8 METAL_OUTPUT_SHADER_CACHE_SIZE = 2 * METAL_SHADER_TYPE_COUNT; + +class MetalOutputShaderCache +{ +public: + MetalOutputShaderCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} + ~MetalOutputShaderCache(); + + MTL::RenderPipelineState* GetPipeline(RendererOutputShader* shader, uint8 shaderIndex, bool usesSRGB); + +private: + class MetalRenderer* m_mtlr; + + MTL::RenderPipelineState* m_cache[METAL_OUTPUT_SHADER_CACHE_SIZE] = {nullptr}; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h new file mode 100644 index 0000000000..bdbaa84b9e --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h @@ -0,0 +1,26 @@ +#pragma once + +class MetalPerformanceMonitor +{ +public: + // Per frame data + uint32 m_commandBuffers = 0; + uint32 m_renderPasses = 0; + uint32 m_clears = 0; + uint32 m_manualVertexFetchDraws = 0; + uint32 m_meshDraws = 0; + uint32 m_triangleFans = 0; + + MetalPerformanceMonitor() = default; + ~MetalPerformanceMonitor() = default; + + void ResetPerFrameData() + { + m_commandBuffers = 0; + m_renderPasses = 0; + m_clears = 0; + m_manualVertexFetchDraws = 0; + 
m_meshDraws = 0; + m_triangleFans = 0; + } +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp new file mode 100644 index 0000000000..a922365b2a --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -0,0 +1,621 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h" + +#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/ISA/RegDefines.h" +#include "Cafe/HW/Latte/Core/LatteConst.h" +#include "Cafe/HW/Latte/Common/RegisterSerializer.h" +#include "Cafe/HW/Latte/Core/LatteShaderCache.h" +#include "Cafe/HW/Latte/Core/LatteShader.h" +#include "Cafe/HW/Latte/ISA/LatteReg.h" +#include "Cemu/FileCache/FileCache.h" +#include "Common/precompiled.h" +#include "util/helpers/helpers.h" +#include "config/ActiveSettings.h" + +#include + +static bool g_compilePipelineThreadInit{false}; +static std::mutex g_compilePipelineMutex; +static std::condition_variable g_compilePipelineCondVar; +static std::queue g_compilePipelineRequests; + +static void compileThreadFunc(sint32 threadIndex) +{ + SetThreadName("compilePl"); + + // one thread runs at normal priority while the others run at lower priority + if (threadIndex != 0) + ; // TODO: set thread priority + + while (true) + { + std::unique_lock lock(g_compilePipelineMutex); + while (g_compilePipelineRequests.empty()) + g_compilePipelineCondVar.wait(lock); + + MetalPipelineCompiler* request = g_compilePipelineRequests.front(); + + g_compilePipelineRequests.pop(); + + lock.unlock(); + + request->Compile(true, false, true); + delete request; + } +} + +static void initCompileThread() +{ + uint32 numCompileThreads; + + uint32 cpuCoreCount = GetPhysicalCoreCount(); + if (cpuCoreCount <= 2) + numCompileThreads = 1; + else + numCompileThreads = 
2 + (cpuCoreCount - 3); // 2 plus one additionally for every extra core above 3 + + numCompileThreads = std::min(numCompileThreads, 8u); // cap at 8 + + for (uint32 i = 0; i < numCompileThreads; i++) + { + std::thread compileThread(compileThreadFunc, i); + compileThread.detach(); + } +} + +static void queuePipeline(MetalPipelineCompiler* v) +{ + std::unique_lock lock(g_compilePipelineMutex); + g_compilePipelineRequests.push(std::move(v)); + lock.unlock(); + g_compilePipelineCondVar.notify_one(); +} + +// make a guess if a pipeline is not essential +// non-essential means that skipping these drawcalls shouldn't lead to permanently corrupted graphics +bool IsAsyncPipelineAllowed(const MetalAttachmentsInfo& attachmentsInfo, Vector2i extend, uint32 indexCount) +{ + if (extend.x == 1600 && extend.y == 1600) + return false; // Splatoon ink mechanics use 1600x1600 R8 and R8G8 framebuffers, this resolution is rare enough that we can just blacklist it globally + + if (attachmentsInfo.depthFormat != Latte::E_GX2SURFFMT::INVALID_FORMAT) + return true; // aggressive filter but seems to work well so far + + // small index count (3,4,5,6) is often associated with full-viewport quads (which are considered essential due to often being used to generate persistent textures) + if (indexCount <= 6) + return false; + + return true; +} + +MetalPipelineCache* g_mtlPipelineCache = nullptr; + +MetalPipelineCache& MetalPipelineCache::GetInstance() +{ + return *g_mtlPipelineCache; +} + +MetalPipelineCache::MetalPipelineCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} +{ + g_mtlPipelineCache = this; +} + +MetalPipelineCache::~MetalPipelineCache() +{ + for (auto& [key, pipelineObj] : m_pipelineCache) + { + pipelineObj->m_pipeline->release(); + delete pipelineObj; + } +} + +PipelineObject* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const 
LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, Vector2i extend, uint32 indexCount, const LatteContextRegister& lcr) +{ + uint64 hash = CalculatePipelineHash(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); + PipelineObject*& pipelineObj = m_pipelineCache[hash]; + if (pipelineObj) + return pipelineObj; + + pipelineObj = new PipelineObject(); + + MetalPipelineCompiler* compiler = new MetalPipelineCompiler(m_mtlr, *pipelineObj); + compiler->InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); + + bool allowAsyncCompile = false; + if (GetConfig().async_compile) + allowAsyncCompile = IsAsyncPipelineAllowed(activeAttachmentsInfo, extend, indexCount); + + if (allowAsyncCompile) + { + if (!g_compilePipelineThreadInit) + { + initCompileThread(); + g_compilePipelineThreadInit = true; + } + + queuePipeline(compiler); + } + else + { + // Also force compile to ensure that the pipeline is ready + cemu_assert_debug(compiler->Compile(true, true, true)); + delete compiler; + } + + // Save to cache + AddCurrentStateToCache(hash, lastUsedAttachmentsInfo); + + return pipelineObj; +} + +uint64 MetalPipelineCache::CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) +{ + // Hash + uint64 stateHash = 0; + for (int i = 0; i < Latte::GPU_LIMITS::NUM_COLOR_ATTACHMENTS; ++i) + { + Latte::E_GX2SURFFMT format = lastUsedAttachmentsInfo.colorFormats[i]; + if (format == Latte::E_GX2SURFFMT::INVALID_FORMAT) + continue; + + stateHash += GetMtlPixelFormat(format, false) + i * 31; + stateHash = 
std::rotl(stateHash, 7); + + if (activeAttachmentsInfo.colorFormats[i] == Latte::E_GX2SURFFMT::INVALID_FORMAT) + { + stateHash += 1; + stateHash = std::rotl(stateHash, 1); + } + } + + if (lastUsedAttachmentsInfo.depthFormat != Latte::E_GX2SURFFMT::INVALID_FORMAT) + { + stateHash += GetMtlPixelFormat(lastUsedAttachmentsInfo.depthFormat, true); + stateHash = std::rotl(stateHash, 7); + + if (activeAttachmentsInfo.depthFormat == Latte::E_GX2SURFFMT::INVALID_FORMAT) + { + stateHash += 1; + stateHash = std::rotl(stateHash, 1); + } + } + + for (auto& group : fetchShader->bufferGroups) + { + uint32 bufferStride = group.getCurrentBufferStride(lcr.GetRawView()); + stateHash = std::rotl(stateHash, 7); + stateHash += bufferStride * 3; + } + + stateHash += fetchShader->getVkPipelineHashFragment(); + stateHash = std::rotl(stateHash, 7); + + stateHash += lcr.GetRawView()[mmVGT_STRMOUT_EN]; + stateHash = std::rotl(stateHash, 7); + + if(lcr.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL()) + stateHash += 0x333333; + + stateHash = (stateHash >> 8) + (stateHash * 0x370531ull) % 0x7F980D3BF9B4639Dull; + + uint32* ctxRegister = lcr.GetRawView(); + + if (vertexShader) + stateHash += vertexShader->baseHash; + + stateHash = std::rotl(stateHash, 13); + + if (pixelShader) + stateHash += pixelShader->baseHash + pixelShader->auxHash; + + stateHash = std::rotl(stateHash, 13); + + uint32 polygonCtrl = lcr.PA_SU_SC_MODE_CNTL.getRawValue(); + stateHash += polygonCtrl; + stateHash = std::rotl(stateHash, 7); + + stateHash += ctxRegister[Latte::REGADDR::PA_CL_CLIP_CNTL]; + stateHash = std::rotl(stateHash, 7); + + const auto colorControlReg = ctxRegister[Latte::REGADDR::CB_COLOR_CONTROL]; + stateHash += colorControlReg; + + stateHash += ctxRegister[Latte::REGADDR::CB_TARGET_MASK]; + + const uint32 blendEnableMask = (colorControlReg >> 8) & 0xFF; + if (blendEnableMask) + { + for (auto i = 0; i < 8; ++i) + { + if (((blendEnableMask & (1 << i))) == 0) + continue; + stateHash = std::rotl(stateHash, 7); + 
stateHash += ctxRegister[Latte::REGADDR::CB_BLEND0_CONTROL + i]; + } + } + + // Mesh pipeline + const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); + bool isPrimitiveRect = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); + + bool usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); + + if (usesGeometryShader) + { + stateHash += lcr.GetRawView()[mmVGT_PRIMITIVE_TYPE]; + stateHash = std::rotl(stateHash, 7); + } + + return stateHash; +} + +struct +{ + uint32 pipelineLoadIndex; + uint32 pipelineMaxFileIndex; + + std::atomic_uint32_t pipelinesQueued; + std::atomic_uint32_t pipelinesLoaded; +} g_mtlCacheState; + +uint32 MetalPipelineCache::BeginLoading(uint64 cacheTitleId) +{ + std::error_code ec; + fs::create_directories(ActiveSettings::GetCachePath("shaderCache/transferable"), ec); + const auto pathCacheFile = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_mtlpipeline.bin", cacheTitleId); + + // init cache loader state + g_mtlCacheState.pipelineLoadIndex = 0; + g_mtlCacheState.pipelineMaxFileIndex = 0; + g_mtlCacheState.pipelinesLoaded = 0; + g_mtlCacheState.pipelinesQueued = 0; + + // start async compilation threads + m_compilationCount.store(0); + m_compilationQueue.clear(); + + // get core count + uint32 cpuCoreCount = GetPhysicalCoreCount(); + m_numCompilationThreads = std::clamp(cpuCoreCount, 1u, 8u); + // TODO: uncomment? 
+ //if (VulkanRenderer::GetInstance()->GetDisableMultithreadedCompilation()) + // m_numCompilationThreads = 1; + + for (uint32 i = 0; i < m_numCompilationThreads; i++) + { + std::thread compileThread(&MetalPipelineCache::CompilerThread, this); + compileThread.detach(); + } + + // open cache file or create it + cemu_assert_debug(s_cache == nullptr); + s_cache = FileCache::Open(pathCacheFile, true, LatteShaderCache_getPipelineCacheExtraVersion(cacheTitleId)); + if (!s_cache) + { + cemuLog_log(LogType::Force, "Failed to open or create Metal pipeline cache file: {}", _pathToUtf8(pathCacheFile)); + return 0; + } + else + { + s_cache->UseCompression(false); + g_mtlCacheState.pipelineMaxFileIndex = s_cache->GetMaximumFileIndex(); + } + return s_cache->GetFileCount(); +} + +bool MetalPipelineCache::UpdateLoading(uint32& pipelinesLoadedTotal, uint32& pipelinesMissingShaders) +{ + pipelinesLoadedTotal = g_mtlCacheState.pipelinesLoaded; + pipelinesMissingShaders = 0; + while (g_mtlCacheState.pipelineLoadIndex <= g_mtlCacheState.pipelineMaxFileIndex) + { + if (m_compilationQueue.size() >= 50) + { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + return true; // queue up to 50 entries at a time + } + + uint64 fileNameA, fileNameB; + std::vector fileData; + if (s_cache->GetFileByIndex(g_mtlCacheState.pipelineLoadIndex, &fileNameA, &fileNameB, fileData)) + { + // queue for async compilation + g_mtlCacheState.pipelinesQueued++; + m_compilationQueue.push(std::move(fileData)); + g_mtlCacheState.pipelineLoadIndex++; + return true; + } + g_mtlCacheState.pipelineLoadIndex++; + } + if (g_mtlCacheState.pipelinesLoaded != g_mtlCacheState.pipelinesQueued) + { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + return true; // pipelines still compiling + } + return false; // done +} + +void MetalPipelineCache::EndLoading() +{ + // shut down compilation threads + uint32 threadCount = m_numCompilationThreads; + m_numCompilationThreads = 0; // signal thread shutdown + 
for (uint32 i = 0; i < threadCount; i++) + { + m_compilationQueue.push({}); // push empty workload for every thread. Threads then will shutdown after checking for m_numCompilationThreads == 0 + } + // keep cache file open for writing of new pipelines +} + +void MetalPipelineCache::Close() +{ + if(s_cache) + { + delete s_cache; + s_cache = nullptr; + } +} + +struct CachedPipeline +{ + struct ShaderHash + { + uint64 baseHash; + uint64 auxHash; + bool isPresent{}; + + void set(uint64 baseHash, uint64 auxHash) + { + this->baseHash = baseHash; + this->auxHash = auxHash; + this->isPresent = true; + } + }; + + ShaderHash vsHash; // includes fetch shader + ShaderHash gsHash; + ShaderHash psHash; + + MetalAttachmentsInfo lastUsedAttachmentsInfo; + + Latte::GPUCompactedRegisterState gpuState; +}; + +void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) +{ + static FSpinlock s_spinlockSharedInternal; + + // deserialize file + LatteContextRegister* lcr = new LatteContextRegister(); + s_spinlockSharedInternal.lock(); + CachedPipeline* cachedPipeline = new CachedPipeline(); + s_spinlockSharedInternal.unlock(); + + MemStreamReader streamReader(fileData.data(), fileData.size()); + if (!DeserializePipeline(streamReader, *cachedPipeline)) + { + // failed to deserialize + s_spinlockSharedInternal.lock(); + delete lcr; + delete cachedPipeline; + s_spinlockSharedInternal.unlock(); + return; + } + // restored register view from compacted state + Latte::LoadGPURegisterState(*lcr, cachedPipeline->gpuState); + + LatteDecompilerShader* vertexShader = nullptr; + LatteDecompilerShader* geometryShader = nullptr; + LatteDecompilerShader* pixelShader = nullptr; + // find vertex shader + if (cachedPipeline->vsHash.isPresent) + { + vertexShader = LatteSHRC_FindVertexShader(cachedPipeline->vsHash.baseHash, cachedPipeline->vsHash.auxHash); + if (!vertexShader) + { + cemuLog_log(LogType::Force, "Vertex shader not found in cache"); + return; + } + } + // find geometry shader + if 
(cachedPipeline->gsHash.isPresent) + { + geometryShader = LatteSHRC_FindGeometryShader(cachedPipeline->gsHash.baseHash, cachedPipeline->gsHash.auxHash); + if (!geometryShader) + { + cemuLog_log(LogType::Force, "Geometry shader not found in cache"); + return; + } + } + // find pixel shader + if (cachedPipeline->psHash.isPresent) + { + pixelShader = LatteSHRC_FindPixelShader(cachedPipeline->psHash.baseHash, cachedPipeline->psHash.auxHash); + if (!pixelShader) + { + cemuLog_log(LogType::Force, "Pixel shader not found in cache"); + return; + } + } + + if (!pixelShader) + { + cemu_assert_debug(false); + return; + } + + MetalAttachmentsInfo attachmentsInfo(*lcr, pixelShader); + + PipelineObject* pipelineObject = new PipelineObject(); + + // compile + { + MetalPipelineCompiler pp(m_mtlr, *pipelineObject); + pp.InitFromState(vertexShader->compatibleFetchShader, vertexShader, geometryShader, pixelShader, cachedPipeline->lastUsedAttachmentsInfo, attachmentsInfo, *lcr); + pp.Compile(true, true, false); + // destroy pp early + } + + // Cache the pipeline + uint64 pipelineStateHash = CalculatePipelineHash(vertexShader->compatibleFetchShader, vertexShader, geometryShader, pixelShader, cachedPipeline->lastUsedAttachmentsInfo, attachmentsInfo, *lcr); + m_pipelineCacheLock.lock(); + m_pipelineCache[pipelineStateHash] = pipelineObject; + m_pipelineCacheLock.unlock(); + + // clean up + s_spinlockSharedInternal.lock(); + delete lcr; + delete cachedPipeline; + s_spinlockSharedInternal.unlock(); +} + +ConcurrentQueue g_mtlPipelineCachingQueue; + +void MetalPipelineCache::AddCurrentStateToCache(uint64 pipelineStateHash, const MetalAttachmentsInfo& lastUsedAttachmentsInfo) +{ + if (!m_pipelineCacheStoreThread) + { + m_pipelineCacheStoreThread = new std::thread(&MetalPipelineCache::WorkerThread, this); + m_pipelineCacheStoreThread->detach(); + } + // fill job structure with cached GPU state + // for each cached pipeline we store: + // - Active shaders (referenced by hash) + // - An 
almost-complete register state of the GPU (minus some ALU uniform constants which aren't relevant) + CachedPipeline* job = new CachedPipeline(); + auto vs = LatteSHRC_GetActiveVertexShader(); + auto gs = LatteSHRC_GetActiveGeometryShader(); + auto ps = LatteSHRC_GetActivePixelShader(); + if (vs) + job->vsHash.set(vs->baseHash, vs->auxHash); + if (gs) + job->gsHash.set(gs->baseHash, gs->auxHash); + if (ps) + job->psHash.set(ps->baseHash, ps->auxHash); + job->lastUsedAttachmentsInfo = lastUsedAttachmentsInfo; + Latte::StoreGPURegisterState(LatteGPUState.contextNew, job->gpuState); + // queue job + g_mtlPipelineCachingQueue.push(job); +} + +bool MetalPipelineCache::SerializePipeline(MemStreamWriter& memWriter, CachedPipeline& cachedPipeline) +{ + memWriter.writeBE(0x01); // version + uint8 presentMask = 0; + if (cachedPipeline.vsHash.isPresent) + presentMask |= 1; + if (cachedPipeline.gsHash.isPresent) + presentMask |= 2; + if (cachedPipeline.psHash.isPresent) + presentMask |= 4; + memWriter.writeBE(presentMask); + if (cachedPipeline.vsHash.isPresent) + { + memWriter.writeBE(cachedPipeline.vsHash.baseHash); + memWriter.writeBE(cachedPipeline.vsHash.auxHash); + } + if (cachedPipeline.gsHash.isPresent) + { + memWriter.writeBE(cachedPipeline.gsHash.baseHash); + memWriter.writeBE(cachedPipeline.gsHash.auxHash); + } + if (cachedPipeline.psHash.isPresent) + { + memWriter.writeBE(cachedPipeline.psHash.baseHash); + memWriter.writeBE(cachedPipeline.psHash.auxHash); + } + + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + memWriter.writeBE((uint16)cachedPipeline.lastUsedAttachmentsInfo.colorFormats[i]); + memWriter.writeBE((uint16)cachedPipeline.lastUsedAttachmentsInfo.depthFormat); + + Latte::SerializeRegisterState(cachedPipeline.gpuState, memWriter); + + return true; +} + +bool MetalPipelineCache::DeserializePipeline(MemStreamReader& memReader, CachedPipeline& cachedPipeline) +{ + // version + if (memReader.readBE() != 1) + { + cemuLog_log(LogType::Force, "Cached Metal 
pipeline corrupted or has unknown version"); + return false; + } + // shader hashes + uint8 presentMask = memReader.readBE(); + if (presentMask & 1) + { + uint64 baseHash = memReader.readBE(); + uint64 auxHash = memReader.readBE(); + cachedPipeline.vsHash.set(baseHash, auxHash); + } + if (presentMask & 2) + { + uint64 baseHash = memReader.readBE(); + uint64 auxHash = memReader.readBE(); + cachedPipeline.gsHash.set(baseHash, auxHash); + } + if (presentMask & 4) + { + uint64 baseHash = memReader.readBE(); + uint64 auxHash = memReader.readBE(); + cachedPipeline.psHash.set(baseHash, auxHash); + } + + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + cachedPipeline.lastUsedAttachmentsInfo.colorFormats[i] = (Latte::E_GX2SURFFMT)memReader.readBE(); + cachedPipeline.lastUsedAttachmentsInfo.depthFormat = (Latte::E_GX2SURFFMT)memReader.readBE(); + + // deserialize GPU state + if (!Latte::DeserializeRegisterState(cachedPipeline.gpuState, memReader)) + { + return false; + } + cemu_assert_debug(!memReader.hasError()); + + return true; +} + +int MetalPipelineCache::CompilerThread() +{ + SetThreadName("plCacheCompiler"); + while (m_numCompilationThreads != 0) + { + std::vector pipelineData = m_compilationQueue.pop(); + if(pipelineData.empty()) + continue; + LoadPipelineFromCache(pipelineData); + ++g_mtlCacheState.pipelinesLoaded; + } + return 0; +} + +void MetalPipelineCache::WorkerThread() +{ + SetThreadName("plCacheWriter"); + while (true) + { + CachedPipeline* job; + g_mtlPipelineCachingQueue.pop(job); + if (!s_cache) + { + delete job; + continue; + } + // serialize + MemStreamWriter memWriter(1024 * 4); + SerializePipeline(memWriter, *job); + auto blob = memWriter.getResult(); + // file name is derived from data hash + uint8 hash[SHA256_DIGEST_LENGTH]; + SHA256(blob.data(), blob.size(), hash); + uint64 nameA = *(uint64be*)(hash + 0); + uint64 nameB = *(uint64be*)(hash + 8); + s_cache->AddFileAsync({ nameA, nameB }, blob.data(), blob.size()); + delete job; + } +} diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h new file mode 100644 index 0000000000..270c2db722 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h @@ -0,0 +1,52 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h" +#include "util/helpers/ConcurrentQueue.h" +#include "util/helpers/fspinlock.h" +#include "util/math/vector2.h" + +class MetalPipelineCache +{ +public: + static MetalPipelineCache& GetInstance(); + + MetalPipelineCache(class MetalRenderer* metalRenderer); + ~MetalPipelineCache(); + + PipelineObject* GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, Vector2i extend, uint32 indexCount, const LatteContextRegister& lcr); + + // Cache loading + uint32 BeginLoading(uint64 cacheTitleId); // returns count of pipelines stored in cache + bool UpdateLoading(uint32& pipelinesLoadedTotal, uint32& pipelinesMissingShaders); + void EndLoading(); + void LoadPipelineFromCache(std::span fileData); + void Close(); // called on title exit + + // Debug + size_t GetPipelineCacheSize() const { return m_pipelineCache.size(); } + +private: + class MetalRenderer* m_mtlr; + + std::map m_pipelineCache; + FSpinlock m_pipelineCacheLock; + + std::thread* m_pipelineCacheStoreThread; + + class FileCache* s_cache; + + std::atomic_uint32_t m_numCompilationThreads{ 0 }; + ConcurrentQueue> m_compilationQueue; + std::atomic_uint32_t m_compilationCount; + + static uint64 CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class 
MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); + + void AddCurrentStateToCache(uint64 pipelineStateHash, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo); + + // pipeline serialization for file + bool SerializePipeline(class MemStreamWriter& memWriter, struct CachedPipeline& cachedPipeline); + bool DeserializePipeline(class MemStreamReader& memReader, struct CachedPipeline& cachedPipeline); + + int CompilerThread(); + void WorkerThread(); +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp new file mode 100644 index 0000000000..6193ab1036 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -0,0 +1,484 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" + +#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/ISA/RegDefines.h" +#include "Cafe/HW/Latte/Core/LatteConst.h" +#include "Cafe/HW/Latte/Core/LatteShader.h" + +#include + +extern std::atomic_int g_compiling_pipelines; +extern std::atomic_int g_compiling_pipelines_async; +extern std::atomic_uint64_t g_compiling_pipelines_syncTimeSum; + +static void rectsEmulationGS_outputSingleVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable& psInputTable, sint32 vIdx, const LatteContextRegister& latteRegister) +{ + auto parameterMask = vertexShader->outputParameterMask; + for (uint32 i = 0; i < 32; i++) + { + if ((parameterMask & (1 << i)) == 0) + continue; + sint32 vsSemanticId = psInputTable.getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); 
+ if (vsSemanticId < 0) + continue; + // make sure PS has matching input + if (!psInputTable.hasPSImportForSemanticId(vsSemanticId)) + continue; + gsSrc.append(fmt::format("out.passParameterSem{} = objectPayload.vertexOut[{}].passParameterSem{};\r\n", vsSemanticId, vIdx, vsSemanticId)); + } + gsSrc.append(fmt::format("out.position = objectPayload.vertexOut[{}].position;\r\n", vIdx)); + gsSrc.append(fmt::format("mesh.set_vertex({}, out);\r\n", vIdx)); +} + +static void rectsEmulationGS_outputGeneratedVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable& psInputTable, const char* variant, const LatteContextRegister& latteRegister) +{ + auto parameterMask = vertexShader->outputParameterMask; + for (uint32 i = 0; i < 32; i++) + { + if ((parameterMask & (1 << i)) == 0) + continue; + sint32 vsSemanticId = psInputTable.getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); + if (vsSemanticId < 0) + continue; + // make sure PS has matching input + if (!psInputTable.hasPSImportForSemanticId(vsSemanticId)) + continue; + gsSrc.append(fmt::format("out.passParameterSem{} = gen4thVertex{}(objectPayload.vertexOut[0].passParameterSem{}, objectPayload.vertexOut[1].passParameterSem{}, objectPayload.vertexOut[2].passParameterSem{});\r\n", vsSemanticId, variant, vsSemanticId, vsSemanticId, vsSemanticId)); + } + gsSrc.append(fmt::format("out.position = gen4thVertex{}(objectPayload.vertexOut[0].position, objectPayload.vertexOut[1].position, objectPayload.vertexOut[2].position);\r\n", variant)); + gsSrc.append(fmt::format("mesh.set_vertex(3, out);\r\n")); +} + +static void rectsEmulationGS_outputVerticesCode(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable& psInputTable, sint32 p0, sint32 p1, sint32 p2, sint32 p3, const char* variant, const LatteContextRegister& latteRegister) +{ + sint32 pList[4] = { p0, p1, p2, p3 }; + for (sint32 i = 0; i < 4; i++) + { + if (pList[i] == 3) + 
rectsEmulationGS_outputGeneratedVertex(gsSrc, vertexShader, psInputTable, variant, latteRegister); + else + rectsEmulationGS_outputSingleVertex(gsSrc, vertexShader, psInputTable, pList[i], latteRegister); + } + gsSrc.append(fmt::format("mesh.set_index(0, {});\r\n", pList[0])); + gsSrc.append(fmt::format("mesh.set_index(1, {});\r\n", pList[1])); + gsSrc.append(fmt::format("mesh.set_index(2, {});\r\n", pList[2])); + gsSrc.append(fmt::format("mesh.set_index(3, {});\r\n", pList[1])); + gsSrc.append(fmt::format("mesh.set_index(4, {});\r\n", pList[2])); + gsSrc.append(fmt::format("mesh.set_index(5, {});\r\n", pList[3])); +} + +static RendererShaderMtl* rectsEmulationGS_generate(MetalRenderer* metalRenderer, const LatteDecompilerShader* vertexShader, const LatteContextRegister& latteRegister) +{ + std::string gsSrc; + gsSrc.append("#include \r\n"); + gsSrc.append("using namespace metal;\r\n"); + + LatteShaderPSInputTable psInputTable; + LatteShader_CreatePSInputTable(&psInputTable, latteRegister.GetRawView()); + + // inputs & outputs + std::string vertexOutDefinition = "struct VertexOut {\r\n"; + vertexOutDefinition += "float4 position;\r\n"; + std::string geometryOutDefinition = "struct GeometryOut {\r\n"; + geometryOutDefinition += "float4 position [[position]];\r\n"; + auto parameterMask = vertexShader->outputParameterMask; + for (uint32 i = 0; i < 32; i++) + { + if ((parameterMask & (1 << i)) == 0) + continue; + sint32 vsSemanticId = psInputTable.getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); + if (vsSemanticId < 0) + continue; + auto psImport = psInputTable.getPSImportBySemanticId(vsSemanticId); + if (psImport == nullptr) + continue; + + // VertexOut + vertexOutDefinition += fmt::format("float4 passParameterSem{};\r\n", vsSemanticId); + + // GeometryOut + geometryOutDefinition += fmt::format("float4 passParameterSem{}", vsSemanticId); + + geometryOutDefinition += fmt::format(" [[user(locn{})]]", 
psInputTable.getPSImportLocationBySemanticId(vsSemanticId)); + if (psImport->isFlat) + geometryOutDefinition += " [[flat]]"; + if (psImport->isNoPerspective) + geometryOutDefinition += " [[center_no_perspective]]"; + geometryOutDefinition += ";\r\n"; + } + vertexOutDefinition += "};\r\n"; + geometryOutDefinition += "};\r\n"; + + gsSrc.append(vertexOutDefinition); + gsSrc.append(geometryOutDefinition); + + gsSrc.append("struct ObjectPayload {\r\n"); + gsSrc.append("VertexOut vertexOut[3];\r\n"); + gsSrc.append("};\r\n"); + + // gen function + gsSrc.append("float4 gen4thVertexA(float4 a, float4 b, float4 c)\r\n"); + gsSrc.append("{\r\n"); + gsSrc.append("return b - (c - a);\r\n"); + gsSrc.append("}\r\n"); + + gsSrc.append("float4 gen4thVertexB(float4 a, float4 b, float4 c)\r\n"); + gsSrc.append("{\r\n"); + gsSrc.append("return c - (b - a);\r\n"); + gsSrc.append("}\r\n"); + + gsSrc.append("float4 gen4thVertexC(float4 a, float4 b, float4 c)\r\n"); + gsSrc.append("{\r\n"); + gsSrc.append("return c + (b - a);\r\n"); + gsSrc.append("}\r\n"); + + // main + gsSrc.append("using MeshType = mesh;\r\n"); + gsSrc.append("[[mesh, max_total_threads_per_threadgroup(1)]]\r\n"); + gsSrc.append("void main0(MeshType mesh, const object_data ObjectPayload& objectPayload [[payload]])\r\n"); + gsSrc.append("{\r\n"); + gsSrc.append("GeometryOut out;\r\n"); + + // there are two possible winding orders that need different triangle generation: + // 0 1 + // 2 3 + // and + // 0 1 + // 3 2 + // all others are just symmetries of these cases + + // we can determine the case by comparing the distance 0<->1 and 0<->2 + + gsSrc.append("float dist0_1 = length(objectPayload.vertexOut[1].position.xy - objectPayload.vertexOut[0].position.xy);\r\n"); + gsSrc.append("float dist0_2 = length(objectPayload.vertexOut[2].position.xy - objectPayload.vertexOut[0].position.xy);\r\n"); + gsSrc.append("float dist1_2 = length(objectPayload.vertexOut[2].position.xy - objectPayload.vertexOut[1].position.xy);\r\n"); + + 
// emit vertices + gsSrc.append("if(dist0_1 > dist0_2 && dist0_1 > dist1_2)\r\n"); + gsSrc.append("{\r\n"); + // p0 to p1 is diagonal + rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 2, 1, 0, 3, "A", latteRegister); + gsSrc.append("} else if ( dist0_2 > dist0_1 && dist0_2 > dist1_2 ) {\r\n"); + // p0 to p2 is diagonal + rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 1, 2, 0, 3, "B", latteRegister); + gsSrc.append("} else {\r\n"); + // p1 to p2 is diagonal + rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 0, 1, 2, 3, "C", latteRegister); + gsSrc.append("}\r\n"); + + gsSrc.append("mesh.set_primitive_count(2);\r\n"); + + gsSrc.append("}\r\n"); + + auto mtlShader = new RendererShaderMtl(metalRenderer, RendererShader::ShaderType::kGeometry, 0, 0, false, false, gsSrc); + mtlShader->PreponeCompilation(true); + + return mtlShader; +} + +#define INVALID_TITLE_ID 0xFFFFFFFFFFFFFFFF + +uint64 s_cacheTitleId = INVALID_TITLE_ID; + +extern std::atomic_int g_compiled_shaders_total; +extern std::atomic_int g_compiled_shaders_async; + +template +void SetFragmentState(T* desc, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, bool rasterizationEnabled, const LatteContextRegister& lcr) +{ + // TODO: check if the pixel shader is valid as well? 
+ if (!rasterizationEnabled/* || !pixelShaderMtl*/) + { + desc->setRasterizationEnabled(false); + return; + } + + // Color attachments + const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = lcr.CB_COLOR_CONTROL; + uint32 blendEnableMask = colorControlReg.get_BLEND_MASK(); + uint32 renderTargetMask = lcr.CB_TARGET_MASK.get_MASK(); + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + Latte::E_GX2SURFFMT format = lastUsedAttachmentsInfo.colorFormats[i]; + if (format == Latte::E_GX2SURFFMT::INVALID_FORMAT) + continue; + + MTL::PixelFormat pixelFormat = GetMtlPixelFormat(format, false); + auto colorAttachment = desc->colorAttachments()->object(i); + colorAttachment->setPixelFormat(pixelFormat); + + // Disable writes if not in the active FBO + if (activeAttachmentsInfo.colorFormats[i] == Latte::E_GX2SURFFMT::INVALID_FORMAT) + { + colorAttachment->setWriteMask(MTL::ColorWriteMaskNone); + continue; + } + + colorAttachment->setWriteMask(GetMtlColorWriteMask((renderTargetMask >> (i * 4)) & 0xF)); + + // Blending + bool blendEnabled = ((blendEnableMask & (1 << i))) != 0; + // Only float data type is blendable + if (blendEnabled && GetMtlPixelFormatInfo(format, false).dataType == MetalDataType::FLOAT) + { + colorAttachment->setBlendingEnabled(true); + + const auto& blendControlReg = lcr.CB_BLENDN_CONTROL[i]; + + auto rgbBlendOp = GetMtlBlendOp(blendControlReg.get_COLOR_COMB_FCN()); + auto srcRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_SRCBLEND()); + auto dstRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_DSTBLEND()); + + colorAttachment->setRgbBlendOperation(rgbBlendOp); + colorAttachment->setSourceRGBBlendFactor(srcRgbBlendFactor); + colorAttachment->setDestinationRGBBlendFactor(dstRgbBlendFactor); + if (blendControlReg.get_SEPARATE_ALPHA_BLEND()) + { + colorAttachment->setAlphaBlendOperation(GetMtlBlendOp(blendControlReg.get_ALPHA_COMB_FCN())); + 
colorAttachment->setSourceAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_SRCBLEND())); + colorAttachment->setDestinationAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_DSTBLEND())); + } + else + { + colorAttachment->setAlphaBlendOperation(rgbBlendOp); + colorAttachment->setSourceAlphaBlendFactor(srcRgbBlendFactor); + colorAttachment->setDestinationAlphaBlendFactor(dstRgbBlendFactor); + } + } + } + + // Depth stencil attachment + if (lastUsedAttachmentsInfo.depthFormat != Latte::E_GX2SURFFMT::INVALID_FORMAT) + { + MTL::PixelFormat pixelFormat = GetMtlPixelFormat(lastUsedAttachmentsInfo.depthFormat, true); + desc->setDepthAttachmentPixelFormat(pixelFormat); + if (lastUsedAttachmentsInfo.hasStencil) + desc->setStencilAttachmentPixelFormat(pixelFormat); + } +} + +MetalPipelineCompiler::~MetalPipelineCompiler() +{ + /* + for (auto& pair : m_pipelineCache) + { + pair.second->release(); + } + m_pipelineCache.clear(); + + NS::Error* error = nullptr; + m_binaryArchive->serializeToURL(m_binaryArchiveURL, &error); + if (error) + { + cemuLog_log(LogType::Force, "error serializing binary archive: {}", error->localizedDescription()->utf8String()); + error->release(); + } + m_binaryArchive->release(); + + m_binaryArchiveURL->release(); + */ + if (m_pipelineDescriptor) + m_pipelineDescriptor->release(); +} + +void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) +{ + m_usesGeometryShader = UseGeometryShader(lcr, geometryShader != nullptr); + if (m_usesGeometryShader && !m_mtlr->SupportsMeshShaders()) + return; + + // Rasterization + m_rasterizationEnabled = lcr.IsRasterizationEnabled(); + + // Shaders + m_vertexShaderMtl = static_cast(vertexShader->shader); + if 
(geometryShader) + m_geometryShaderMtl = static_cast(geometryShader->shader); + else if (UseRectEmulation(lcr)) + m_geometryShaderMtl = rectsEmulationGS_generate(m_mtlr, vertexShader, lcr); + else + m_geometryShaderMtl = nullptr; + m_pixelShaderMtl = static_cast(pixelShader->shader); + + if (m_usesGeometryShader) + InitFromStateMesh(fetchShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); + else + InitFromStateRender(fetchShader, vertexShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); +} + +bool MetalPipelineCompiler::Compile(bool forceCompile, bool isRenderThread, bool showInOverlay) +{ + if (m_usesGeometryShader && !m_mtlr->SupportsMeshShaders()) + return false; + + if (forceCompile) + { + // if some shader stages are not compiled yet, compile them now + if (m_vertexShaderMtl && !m_vertexShaderMtl->IsCompiled()) + m_vertexShaderMtl->PreponeCompilation(isRenderThread); + if (m_geometryShaderMtl && !m_geometryShaderMtl->IsCompiled()) + m_geometryShaderMtl->PreponeCompilation(isRenderThread); + if (m_pixelShaderMtl && !m_pixelShaderMtl->IsCompiled()) + m_pixelShaderMtl->PreponeCompilation(isRenderThread); + } + else + { + // fail early if some shader stages are not compiled + if (m_vertexShaderMtl && !m_vertexShaderMtl->IsCompiled()) + return false; + if (m_geometryShaderMtl && !m_geometryShaderMtl->IsCompiled()) + return false; + if (m_pixelShaderMtl && !m_pixelShaderMtl->IsCompiled()) + return false; + } + + // Compile + MTL::RenderPipelineState* pipeline = nullptr; + NS::Error* error = nullptr; + + auto start = std::chrono::high_resolution_clock::now(); + if (m_usesGeometryShader) + { + auto desc = static_cast(m_pipelineDescriptor); + + // Shaders + desc->setObjectFunction(m_vertexShaderMtl->GetFunction()); + desc->setMeshFunction(m_geometryShaderMtl->GetFunction()); + if (m_rasterizationEnabled) + desc->setFragmentFunction(m_pixelShaderMtl->GetFunction()); + +#ifdef CEMU_DEBUG_ASSERT + desc->setLabel(GetLabel("Mesh render pipeline state", 
desc)); +#endif + pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionNone, nullptr, &error); + } + else + { + auto desc = static_cast(m_pipelineDescriptor); + + // Shaders + desc->setVertexFunction(m_vertexShaderMtl->GetFunction()); + if (m_rasterizationEnabled) + desc->setFragmentFunction(m_pixelShaderMtl->GetFunction()); + +#ifdef CEMU_DEBUG_ASSERT + desc->setLabel(GetLabel("Render pipeline state", desc)); +#endif + pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionNone, nullptr, &error); + } + auto end = std::chrono::high_resolution_clock::now(); + + auto creationDuration = std::chrono::duration_cast(end - start).count(); + + if (error) + { + cemuLog_log(LogType::Force, "error creating render pipeline state: {}", error->localizedDescription()->utf8String()); + } + + if (showInOverlay) + { + if (isRenderThread) + g_compiling_pipelines_syncTimeSum += creationDuration; + else + g_compiling_pipelines_async++; + g_compiling_pipelines++; + } + + m_pipelineObj.m_pipeline = pipeline; + + return true; +} + +void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) +{ + // Render pipeline state + MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init(); + + // Vertex descriptor + if (!fetchShader->mtlFetchVertexManually) + { + NS_STACK_SCOPED MTL::VertexDescriptor* vertexDescriptor = MTL::VertexDescriptor::alloc()->init(); + for (auto& bufferGroup : fetchShader->bufferGroups) + { + std::optional fetchType; + + uint32 minBufferStride = 0; + for (sint32 j = 0; j < bufferGroup.attribCount; ++j) + { + auto& attr = bufferGroup.attrib[j]; + + uint32 semanticId = vertexShader->resourceMapping.attributeMapping[attr.semanticId]; + if (semanticId == (uint32)-1) + continue; // attribute not 
used? + + auto attribute = vertexDescriptor->attributes()->object(semanticId); + attribute->setOffset(attr.offset); + attribute->setBufferIndex(GET_MTL_VERTEX_BUFFER_INDEX(attr.attributeBufferIndex)); + attribute->setFormat(GetMtlVertexFormat(attr.format)); + + minBufferStride = std::max(minBufferStride, attr.offset + GetMtlVertexFormatSize(attr.format)); + + if (fetchType.has_value()) + cemu_assert_debug(fetchType == attr.fetchType); + else + fetchType = attr.fetchType; + + if (attr.fetchType == LatteConst::INSTANCE_DATA) + { + cemu_assert_debug(attr.aluDivisor == 1); // other divisor not yet supported + } + } + + uint32 bufferIndex = bufferGroup.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + uint32 bufferStride = (lcr.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + auto layout = vertexDescriptor->layouts()->object(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); + if (bufferStride == 0) + { + // Buffer stride cannot be zero, let's use the minimum stride + bufferStride = minBufferStride; + + // Additionally, constant vertex function must be used + layout->setStepFunction(MTL::VertexStepFunctionConstant); + layout->setStepRate(0); + } + else + { + if (!fetchType.has_value() || fetchType == LatteConst::VertexFetchType2::VERTEX_DATA) + layout->setStepFunction(MTL::VertexStepFunctionPerVertex); + else if (fetchType == LatteConst::VertexFetchType2::INSTANCE_DATA) + layout->setStepFunction(MTL::VertexStepFunctionPerInstance); + else + { + cemuLog_log(LogType::Force, "unimplemented vertex fetch type {}", (uint32)fetchType.value()); + cemu_assert(false); + } + } + bufferStride = Align(bufferStride, 4); + layout->setStride(bufferStride); + } + + desc->setVertexDescriptor(vertexDescriptor); + } + + SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, m_rasterizationEnabled, lcr); + + m_pipelineDescriptor = desc; +} + +void MetalPipelineCompiler::InitFromStateMesh(const LatteFetchShader* 
fetchShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) +{ + // Render pipeline state + MTL::MeshRenderPipelineDescriptor* desc = MTL::MeshRenderPipelineDescriptor::alloc()->init(); + + SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, m_rasterizationEnabled, lcr); + + m_pipelineDescriptor = desc; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h new file mode 100644 index 0000000000..4d21e53d54 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h @@ -0,0 +1,38 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" + +#include "Cafe/HW/Latte/ISA/LatteReg.h" +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" + +struct PipelineObject +{ + MTL::RenderPipelineState* m_pipeline = nullptr; +}; + +class MetalPipelineCompiler +{ +public: + MetalPipelineCompiler(class MetalRenderer* metalRenderer, PipelineObject& pipelineObj) : m_mtlr{metalRenderer}, m_pipelineObj{pipelineObj} {} + ~MetalPipelineCompiler(); + + void InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); + + bool Compile(bool forceCompile, bool isRenderThread, bool showInOverlay); + +private: + class MetalRenderer* m_mtlr; + PipelineObject& m_pipelineObj; + + class RendererShaderMtl* m_vertexShaderMtl; + class RendererShaderMtl* m_geometryShaderMtl; + class RendererShaderMtl* m_pixelShaderMtl; + bool m_usesGeometryShader; + bool m_rasterizationEnabled; + + NS::Object* m_pipelineDescriptor = nullptr; + + void InitFromStateRender(const LatteFetchShader* fetchShader, const 
LatteDecompilerShader* vertexShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); + + void InitFromStateMesh(const LatteFetchShader* fetchShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp new file mode 100644 index 0000000000..ee79f2dd8b --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp @@ -0,0 +1,38 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalQuery.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" + +bool LatteQueryObjectMtl::getResult(uint64& numSamplesPassed) +{ + if (m_commandBuffer && !CommandBufferCompleted(m_commandBuffer)) + return false; + + uint64* resultPtr = m_mtlr->GetOcclusionQueryResultsPtr(); + + numSamplesPassed = 0; + for (uint32 i = m_range.begin; i != m_range.end; i = (i + 1) % MetalRenderer::OCCLUSION_QUERY_POOL_SIZE) + numSamplesPassed += resultPtr[i]; + + return true; +} + +LatteQueryObjectMtl::~LatteQueryObjectMtl() +{ + if (m_commandBuffer) + m_commandBuffer->release(); +} + +void LatteQueryObjectMtl::begin() +{ + m_range.begin = m_mtlr->GetOcclusionQueryIndex(); + m_mtlr->BeginOcclusionQuery(); +} + +void LatteQueryObjectMtl::end() +{ + m_range.end = m_mtlr->GetOcclusionQueryIndex(); + m_mtlr->EndOcclusionQuery(); + + m_commandBuffer = m_mtlr->GetAndRetainCurrentCommandBufferIfNotCompleted(); + if (m_commandBuffer) + m_mtlr->RequestSoonCommit(); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h new file mode 100644 index 0000000000..3de0939a0a --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h @@ -0,0 +1,28 @@ +#pragma once + +#include "Cafe/HW/Latte/Core/LatteQueryObject.h" + +#include 
"Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" + +class LatteQueryObjectMtl : public LatteQueryObject +{ +public: + LatteQueryObjectMtl(class MetalRenderer* mtlRenderer) : m_mtlr{mtlRenderer} {} + ~LatteQueryObjectMtl(); + + bool getResult(uint64& numSamplesPassed) override; + void begin() override; + void end() override; + + void GrowRange() + { + m_range.end++; + } + +private: + class MetalRenderer* m_mtlr; + + MetalQueryRange m_range = {INVALID_UINT32, INVALID_UINT32}; + // TODO: make this a list of command buffers? + MTL::CommandBuffer* m_commandBuffer = nullptr; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp new file mode 100644 index 0000000000..360d6def57 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -0,0 +1,2321 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalQuery.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h" + +#include "Cafe/HW/Latte/Core/LatteShader.h" +#include "Cafe/HW/Latte/Core/LatteIndices.h" +#include "Cafe/HW/Latte/Core/LatteBufferCache.h" +#include "CafeSystem.h" +#include "Cemu/Logging/CemuLogging.h" +#include 
"Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/Core/LatteConst.h" +#include "config/CemuConfig.h" + +#define IMGUI_IMPL_METAL_CPP +#include "imgui/imgui_extension.h" +#include "imgui/imgui_impl_metal.h" + +#define EVENT_VALUE_WRAP 4096 + +extern bool hasValidFramebufferAttached; + +float supportBufferData[512 * 4]; + +// Defined in the OpenGL renderer +void LatteDraw_handleSpecialState8_clearAsDepth(); + +std::vector MetalRenderer::GetDevices() +{ + NS_STACK_SCOPED auto devices = MTL::CopyAllDevices(); + std::vector result; + result.reserve(devices->count()); + for (uint32 i = 0; i < devices->count(); i++) + { + MTL::Device* device = static_cast(devices->object(i)); + result.push_back({std::string(device->name()->utf8String()), device->registryID()}); + } + + return result; +} + +MetalRenderer::MetalRenderer() +{ + // Options + + // Position invariance + switch (g_current_game_profile->GetPositionInvariance()) + { + case PositionInvariance::Auto: + switch (CafeSystem::GetForegroundTitleId()) + { + // Bayonetta + case 0x0005000010157F00: // EUR + case 0x0005000010157E00: // USA + case 0x000500001014DB00: // JPN + // Bayonetta 2 + case 0x0005000010172700: // EUR + case 0x0005000010172600: // USA + // Disney Planes + case 0x0005000010136900: // EUR + case 0x0005000010136A00: // EUR (TODO: check) + case 0x0005000010136B00: // EUR (TODO: check) + case 0x000500001011C500: // USA (TODO: check) + // LEGO STAR WARS: The Force Awakens + case 0x00050000101DAA00: // EUR + case 0x00050000101DAB00: // USA + // Mario Kart 8 + case 0x000500001010ED00: // EUR + case 0x000500001010EC00: // USA + case 0x000500001010EB00: // JPN + case 0x0005000010183A00: // JPN (TODO: check) + // Minecraft: Story Mode + case 0x000500001020A300: // EUR + case 0x00050000101E0100: // USA + //case 0x000500001020a200: // USA + // Ninja Gaiden 3: Razor's Edge + case 0x0005000010110B00: // EUR + case 0x0005000010139B00: // EUR (TODO: check) + case 0x0005000010110A00: // USA + case 
0x0005000010110900: // JPN + // Resident Evil: Revelations + case 0x000500001012B400: // EUR + case 0x000500001012CF00: // USA + // Star Fox Zero + case 0x00050000101B0500: // EUR + case 0x0005000010201C00: // EUR (TODO: check) + case 0x00050000101B0400: // USA + case 0x0005000010201B00: // USA (TODO: check) + // The Legend of Zelda: Breath of the Wild + case 0x00050000101C9500: // EUR + case 0x00050000101C9400: // USA + case 0x00050000101C9300: // JPN + // Wonderful 101 + case 0x0005000010135300: // EUR + case 0x000500001012DC00: // USA + case 0x0005000010116300: // JPN + case 0x0005000010185600: // JPN (TODO: check) + m_positionInvariance = true; + break; + default: + m_positionInvariance = false; + break; + } + break; + case PositionInvariance::False: + m_positionInvariance = false; + break; + case PositionInvariance::True: + m_positionInvariance = true; + break; + } + + // Pick a device + auto& config = GetConfig(); + const bool hasDeviceSet = config.mtl_graphic_device_uuid != 0; + + // If a device is set, try to find it + if (hasDeviceSet) + { + NS_STACK_SCOPED auto devices = MTL::CopyAllDevices(); + for (uint32 i = 0; i < devices->count(); i++) + { + MTL::Device* device = static_cast(devices->object(i)); + if (device->registryID() == config.mtl_graphic_device_uuid) + { + m_device = device; + break; + } + } + } + + if (!m_device) + { + if (hasDeviceSet) + { + cemuLog_log(LogType::Force, "The selected GPU ({}) could not be found. 
Using the system default device.", config.mtl_graphic_device_uuid); + config.mtl_graphic_device_uuid = 0; + } + // Use the system default device + m_device = MTL::CreateSystemDefaultDevice(); + } + + // Vendor + const char* deviceName = m_device->name()->utf8String(); + if (memcmp(deviceName, "Apple", 5) == 0) + m_vendor = GfxVendor::Apple; + else if (memcmp(deviceName, "AMD", 3) == 0) + m_vendor = GfxVendor::AMD; + else if (memcmp(deviceName, "Intel", 5) == 0) + m_vendor = GfxVendor::Intel; + else if (memcmp(deviceName, "NVIDIA", 6) == 0) + m_vendor = GfxVendor::Nvidia; + else + m_vendor = GfxVendor::Generic; + + // Feature support + m_isAppleGPU = m_device->supportsFamily(MTL::GPUFamilyApple1); + m_supportsFramebufferFetch = GetConfig().framebuffer_fetch.GetValue() ? m_device->supportsFamily(MTL::GPUFamilyApple2) : false; + m_hasUnifiedMemory = m_device->hasUnifiedMemory(); + m_supportsMetal3 = m_device->supportsFamily(MTL::GPUFamilyMetal3); + m_supportsMeshShaders = (m_supportsMetal3 && (m_vendor != GfxVendor::Intel || GetConfig().force_mesh_shaders.GetValue())); // Intel GPUs have issues with mesh shaders + m_recommendedMaxVRAMUsage = m_device->recommendedMaxWorkingSetSize(); + m_pixelFormatSupport = MetalPixelFormatSupport(m_device); + + CheckForPixelFormatSupport(m_pixelFormatSupport); + + // Command queue + m_commandQueue = m_device->newCommandQueue(); + + // Synchronization resources + m_event = m_device->newEvent(); + + // Resources + NS_STACK_SCOPED MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); +#ifdef CEMU_DEBUG_ASSERT + samplerDescriptor->setLabel(GetLabel("Nearest sampler state", samplerDescriptor)); +#endif + m_nearestSampler = m_device->newSamplerState(samplerDescriptor); + + samplerDescriptor->setMinFilter(MTL::SamplerMinMagFilterLinear); + samplerDescriptor->setMagFilter(MTL::SamplerMinMagFilterLinear); +#ifdef CEMU_DEBUG_ASSERT + samplerDescriptor->setLabel(GetLabel("Linear sampler state", 
samplerDescriptor)); +#endif + m_linearSampler = m_device->newSamplerState(samplerDescriptor); + + // Null resources + NS_STACK_SCOPED MTL::TextureDescriptor* textureDescriptor = MTL::TextureDescriptor::alloc()->init(); + textureDescriptor->setTextureType(MTL::TextureType1D); + textureDescriptor->setWidth(1); + textureDescriptor->setUsage(MTL::TextureUsageShaderRead); + m_nullTexture1D = m_device->newTexture(textureDescriptor); +#ifdef CEMU_DEBUG_ASSERT + m_nullTexture1D->setLabel(GetLabel("Null texture 1D", m_nullTexture1D)); +#endif + + textureDescriptor->setTextureType(MTL::TextureType2D); + textureDescriptor->setHeight(1); + textureDescriptor->setUsage(MTL::TextureUsageShaderRead | MTL::TextureUsageRenderTarget); + m_nullTexture2D = m_device->newTexture(textureDescriptor); +#ifdef CEMU_DEBUG_ASSERT + m_nullTexture2D->setLabel(GetLabel("Null texture 2D", m_nullTexture2D)); +#endif + + m_memoryManager = new MetalMemoryManager(this); + m_outputShaderCache = new MetalOutputShaderCache(this); + m_pipelineCache = new MetalPipelineCache(this); + m_depthStencilCache = new MetalDepthStencilCache(this); + m_samplerCache = new MetalSamplerCache(this); + + // Lower the commit treshold when buffer cache needs reduced latency + if (m_memoryManager->NeedsReducedLatency()) + m_defaultCommitTreshlod = 64; + else + m_defaultCommitTreshlod = 196; + + // Occlusion queries + m_occlusionQuery.m_resultBuffer = m_device->newBuffer(OCCLUSION_QUERY_POOL_SIZE * sizeof(uint64), MTL::ResourceStorageModeShared); +#ifdef CEMU_DEBUG_ASSERT + m_occlusionQuery.m_resultBuffer->setLabel(GetLabel("Occlusion query result buffer", m_occlusionQuery.m_resultBuffer)); +#endif + m_occlusionQuery.m_resultsPtr = (uint64*)m_occlusionQuery.m_resultBuffer->contents(); + + // Reset vertex and uniform buffers + for (uint32 i = 0; i < MAX_MTL_VERTEX_BUFFERS; i++) + m_state.m_vertexBufferOffsets[i] = INVALID_OFFSET; + + for (uint32 i = 0; i < METAL_SHADER_TYPE_TOTAL; i++) + { + for (uint32 j = 0; j < 
MAX_MTL_BUFFERS; j++) + m_state.m_uniformBufferOffsets[i][j] = INVALID_OFFSET; + } + + // Utility shader library + + // Create the library + NS::Error* error = nullptr; + NS_STACK_SCOPED MTL::Library* utilityLibrary = m_device->newLibrary(ToNSString(utilityShaderSource), nullptr, &error); + if (error) + { + cemuLog_log(LogType::Force, "failed to create utility library (error: {})", error->localizedDescription()->utf8String()); + } + + // Pipelines + NS_STACK_SCOPED MTL::Function* vertexFullscreenFunction = utilityLibrary->newFunction(ToNSString("vertexFullscreen")); + NS_STACK_SCOPED MTL::Function* fragmentCopyDepthToColorFunction = utilityLibrary->newFunction(ToNSString("fragmentCopyDepthToColor")); + + m_copyDepthToColorDesc = MTL::RenderPipelineDescriptor::alloc()->init(); + m_copyDepthToColorDesc->setVertexFunction(vertexFullscreenFunction); + m_copyDepthToColorDesc->setFragmentFunction(fragmentCopyDepthToColorFunction); + + // Void vertex pipelines + if (m_isAppleGPU) + m_copyBufferToBufferPipeline = new MetalVoidVertexPipeline(this, utilityLibrary, "vertexCopyBufferToBuffer"); + + // HACK: for some reason, this variable ends up being initialized to some garbage data, even though its declared as bool m_captureFrame = false; + m_occlusionQuery.m_lastCommandBuffer = nullptr; + m_captureFrame = false; +} + +MetalRenderer::~MetalRenderer() +{ + if (m_isAppleGPU) + delete m_copyBufferToBufferPipeline; + //delete m_copyTextureToTexturePipeline; + //delete m_restrideBufferPipeline; + + m_copyDepthToColorDesc->release(); + for (const auto [pixelFormat, pipeline] : m_copyDepthToColorPipelines) + pipeline->release(); + + delete m_outputShaderCache; + delete m_pipelineCache; + delete m_depthStencilCache; + delete m_samplerCache; + delete m_memoryManager; + + m_nullTexture1D->release(); + m_nullTexture2D->release(); + + m_nearestSampler->release(); + m_linearSampler->release(); + + if (m_readbackBuffer) + m_readbackBuffer->release(); + + if (m_xfbRingBuffer) + 
m_xfbRingBuffer->release(); + + m_occlusionQuery.m_resultBuffer->release(); + + m_event->release(); + + m_commandQueue->release(); + m_device->release(); +} + +void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) +{ + auto& layer = GetLayer(mainWindow); + layer = MetalLayerHandle(m_device, size, mainWindow); + layer.GetLayer()->setPixelFormat(MTL::PixelFormatBGRA8Unorm); +} + +void MetalRenderer::ShutdownLayer(bool mainWindow) +{ + GetLayer(mainWindow) = MetalLayerHandle(); +} + +void MetalRenderer::ResizeLayer(const Vector2i& size, bool mainWindow) +{ + GetLayer(mainWindow).Resize(size); +} + +void MetalRenderer::Initialize() +{ + Renderer::Initialize(); + RendererShaderMtl::Initialize(); +} + +void MetalRenderer::Shutdown() +{ + // TODO: should shutdown both layers + ImGui_ImplMetal_Shutdown(); + CommitCommandBuffer(); + Renderer::Shutdown(); + RendererShaderMtl::Shutdown(); +} + +bool MetalRenderer::IsPadWindowActive() +{ + return (GetLayer(false).GetLayer() != nullptr); +} + +bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const +{ + // Subtract host memory from total VRAM, since it's shared with the CPU + usageInMB = (m_device->currentAllocatedSize() - m_memoryManager->GetHostAllocationSize()) / 1024 / 1024; + totalInMB = m_recommendedMaxVRAMUsage / 1024 / 1024; + + return true; +} + +void MetalRenderer::ClearColorbuffer(bool padView) +{ + if (!AcquireDrawable(!padView)) + return; + + ClearColorTextureInternal(GetLayer(!padView).GetDrawable()->texture(), 0, 0, 0.0f, 0.0f, 0.0f, 0.0f); +} + +void MetalRenderer::DrawEmptyFrame(bool mainWindow) +{ + if (!BeginFrame(mainWindow)) + return; + SwapBuffers(mainWindow, !mainWindow); +} + +void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) +{ + if (swapTV) + SwapBuffer(true); + if (swapDRC) + SwapBuffer(false); + + // Reset the command buffers (they are released by TemporaryBufferAllocator) + CommitCommandBuffer(); + + // Debug + m_performanceMonitor.ResetPerFrameData(); + + 
// GPU capture
+    if (m_capturing)
+    {
+        EndCapture();
+    }
+    else if (m_captureFrame)
+    {
+        StartCapture();
+        m_captureFrame = false;
+    }
+}
+
+// Services a pending screenshot request by copying the texture behind texView into staging
+// memory, converting it to tightly packed RGB and handing it to SaveScreenshot.
+// padView selects whether this call is for the pad view or the main window.
+void MetalRenderer::HandleScreenshotRequest(LatteTextureView* texView, bool padView)
+{
+    if (!m_screenshot_requested && m_screenshot_state == ScreenshotState::None)
+        return;
+
+    if (m_mainLayer.GetDrawable())
+    {
+        // we already took a pad view screenshot and want a main window screenshot
+        if (m_screenshot_state == ScreenshotState::Main && padView)
+            return;
+
+        if (m_screenshot_state == ScreenshotState::Pad && !padView)
+            return;
+
+        // remember which screenshot is left to take
+        if (m_screenshot_state == ScreenshotState::None)
+            m_screenshot_state = padView ? ScreenshotState::Main : ScreenshotState::Pad;
+        else
+            m_screenshot_state = ScreenshotState::None;
+    }
+    else
+        m_screenshot_state = ScreenshotState::None;
+
+    auto texMtl = static_cast<LatteTextureMtl*>(texView->baseTexture);
+
+    // Only 8-bit RGBA sources can be converted; reject everything else before recording GPU work
+    // TODO: implement more formats
+    const auto pixelFormat = texMtl->GetTexture()->pixelFormat();
+    const bool isSRGB = (pixelFormat == MTL::PixelFormatRGBA8Unorm_sRGB);
+    if (pixelFormat != MTL::PixelFormatRGBA8Unorm && !isSRGB)
+    {
+        cemuLog_log(LogType::Force, "Unsupported screenshot texture pixel format {}", pixelFormat);
+        return;
+    }
+
+    int width, height;
+    texMtl->GetEffectiveSize(width, height, 0);
+
+    uint32 bytesPerRow = GetMtlTextureBytesPerRow(texMtl->format, texMtl->isDepth, width);
+    uint32 size = GetMtlTextureBytesPerImage(texMtl->format, texMtl->isDepth, height, bytesPerRow);
+
+    auto blitCommandEncoder = GetBlitCommandEncoder();
+
+    auto& bufferAllocator = m_memoryManager->GetStagingAllocator();
+    auto buffer = bufferAllocator.AllocateBufferMemory(size, 1);
+
+    blitCommandEncoder->copyFromTexture(texMtl->GetTexture(), 0, 0, MTL::Origin(0, 0, 0), MTL::Size(width, height, 1), buffer.mtlBuffer, buffer.bufferOffset, bytesPerRow, 0);
+
+    // The blit above is only recorded at this point. Commit it and wait for the GPU so that the
+    // staging memory actually contains the pixels before the CPU reads them.
+    // NOTE(review): assumes the staging allocation stays readable after the command buffer has
+    // completed - confirm against the staging allocator.
+    Flush(true);
+
+    // Convert RGBA to RGB row by row so that any row padding in bytesPerRow is skipped
+    std::vector<uint8> rgb_data;
+    rgb_data.reserve((size_t)3 * width * height);
+    for (int y = 0; y < height; y++)
+    {
+        const auto* pixel = buffer.memPtr + (size_t)y * bytesPerRow;
+        for (int x = 0; x < width; x++, pixel += 4)
+        {
+            if (isSRGB)
+            {
+                rgb_data.emplace_back(SRGBComponentToRGB(pixel[0]));
+                rgb_data.emplace_back(SRGBComponentToRGB(pixel[1]));
+                rgb_data.emplace_back(SRGBComponentToRGB(pixel[2]));
+            }
+            else
+            {
+                rgb_data.emplace_back(pixel[0]);
+                rgb_data.emplace_back(pixel[1]);
+                rgb_data.emplace_back(pixel[2]);
+            }
+        }
+    }
+
+    SaveScreenshot(rgb_data, width, height, !padView);
+}
+
+// Draws texView into the drawable of the pad window (padView == true) or the main window using
+// the given output shader. imageX/imageY/imageWidth/imageHeight define both the viewport and the
+// scissor rectangle. Does nothing when no drawable could be acquired.
+void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter,
+    sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight,
+    bool padView, bool clearBackground)
+{
+    if (!AcquireDrawable(!padView))
+        return;
+
+    MTL::Texture* presentTexture = static_cast<LatteTextureViewMtl*>(texView)->GetRGBAView();
+
+    // Create render pass
+    auto& layer = GetLayer(!padView);
+
+    NS_STACK_SCOPED MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init();
+    auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0);
+    colorAttachment->setTexture(layer.GetDrawable()->texture());
+    colorAttachment->setLoadAction(clearBackground ? MTL::LoadActionClear : MTL::LoadActionLoad);
+    colorAttachment->setStoreAction(MTL::StoreActionStore);
+
+    auto renderCommandEncoder = GetTemporaryRenderCommandEncoder(renderPassDescriptor);
+
+    // Get a render pipeline
+
+    // Find out which shader we are using (255 = none of the built-in output shaders)
+    uint8 shaderIndex = 255;
+    if (shader == RendererOutputShader::s_copy_shader) shaderIndex = 0;
+    else if (shader == RendererOutputShader::s_bicubic_shader) shaderIndex = 1;
+    else if (shader == RendererOutputShader::s_hermit_shader) shaderIndex = 2;
+    else if (shader == RendererOutputShader::s_copy_shader_ud) shaderIndex = 3;
+    else if (shader == RendererOutputShader::s_bicubic_shader_ud) shaderIndex = 4;
+    else if (shader == RendererOutputShader::s_hermit_shader_ud) shaderIndex = 5;
+
+    // The "_ud" variants share the type of their regular counterpart (0 = copy, 1 = bicubic, 2 = hermit)
+    uint8 shaderType = shaderIndex % 3;
+
+    // Get the render pipeline state
+    auto renderPipelineState = m_outputShaderCache->GetPipeline(shader, shaderIndex, m_state.m_usesSRGB);
+
+    // Draw to Metal layer
+    renderCommandEncoder->setRenderPipelineState(renderPipelineState);
+    renderCommandEncoder->setFragmentTexture(presentTexture, 0);
+    renderCommandEncoder->setFragmentSamplerState((useLinearTexFilter ? m_linearSampler : m_nearestSampler), 0);
+
+    // Set uniforms (only shader type 2 receives the output size)
+    float outputSize[2] = {(float)imageWidth, (float)imageHeight};
+    switch (shaderType)
+    {
+    case 2:
+        renderCommandEncoder->setFragmentBytes(outputSize, sizeof(outputSize), 0);
+        break;
+    default:
+        break;
+    }
+
+    renderCommandEncoder->setViewport(MTL::Viewport{(double)imageX, (double)imageY, (double)imageWidth, (double)imageHeight, 0.0, 1.0});
+    renderCommandEncoder->setScissorRect(MTL::ScissorRect{(uint32)imageX, (uint32)imageY, (uint32)imageWidth, (uint32)imageHeight});
+
+    renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3));
+
+    EndEncoding();
+}
+
+// A frame can begin as soon as a drawable for the selected window is available.
+bool MetalRenderer::BeginFrame(bool mainWindow)
+{
+    return AcquireDrawable(mainWindow);
+}
+
+// Commits the current command buffer when draw calls were recorded or when waitIdle is set.
+// With waitIdle, additionally blocks until the most recently submitted command buffer completed.
+void MetalRenderer::Flush(bool waitIdle)
+{
+    if (m_recordedDrawcalls > 0 || waitIdle)
+        CommitCommandBuffer();
+
+    if (waitIdle && m_executingCommandBuffers.size() != 0)
+        m_executingCommandBuffers.back()->waitUntilCompleted();
+}
+
+// Currently a no-op; the commit-on-idle logic is disabled.
+void MetalRenderer::NotifyLatteCommandProcessorIdle()
+{
+    //if (m_commitOnIdle)
+    //    CommitCommandBuffer();
+}
+
+bool MetalRenderer::ImguiBegin(bool mainWindow)
+{
+    if (!Renderer::ImguiBegin(mainWindow))
+        return false;
+
+    if (!AcquireDrawable(mainWindow))
+        return false;
+
+    EnsureImGuiBackend();
+
+    // Check if the font texture needs to be built
+    ImGuiIO& io = ImGui::GetIO();
+    if (!io.Fonts->IsBuilt())
+        ImGui_ImplMetal_CreateFontsTexture(m_device);
+
+    auto& layer = GetLayer(mainWindow);
+
+    // Render pass descriptor
+    NS_STACK_SCOPED MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init();
+    auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0);
+    colorAttachment->setTexture(layer.GetDrawable()->texture());
+    colorAttachment->setLoadAction(MTL::LoadActionLoad);
+    colorAttachment->setStoreAction(MTL::StoreActionStore);
+
+    // New frame
+    ImGui_ImplMetal_NewFrame(renderPassDescriptor);
+    
ImGui_UpdateWindowInformation(mainWindow);
+    ImGui::NewFrame();
+
+    if (m_encoderType != MetalEncoderType::Render)
+        GetTemporaryRenderCommandEncoder(renderPassDescriptor);
+
+    return true;
+}
+
+// Renders the ImGui draw data into the current render command encoder and ends the encoding.
+// Logs once and returns without drawing when no render command encoder is active.
+void MetalRenderer::ImguiEnd()
+{
+    EnsureImGuiBackend();
+
+    if (m_encoderType != MetalEncoderType::Render)
+    {
+        cemuLog_logOnce(LogType::Force, "no render command encoder, cannot draw ImGui");
+        return;
+    }
+
+    ImGui::Render();
+    ImGui_ImplMetal_RenderDrawData(ImGui::GetDrawData(), GetCurrentCommandBuffer(), (MTL::RenderCommandEncoder*)m_commandEncoder);
+    //ImGui::EndFrame();
+
+    EndEncoding();
+}
+
+// Creates an RGBA8 texture of the given size from tightly packed RGB data (3 bytes per pixel)
+// and returns it as an ImGui texture id. Alpha is set to 0xFF for every copied pixel; pixels
+// that data does not cover stay zero. Returns nullptr for a non-positive size or when an
+// exception is thrown. The caller releases the texture through DeleteTexture.
+ImTextureID MetalRenderer::GenerateTexture(const std::vector<uint8>& data, const Vector2i& size)
+{
+    if (size.x <= 0 || size.y <= 0)
+    {
+        cemuLog_log(LogType::Force, "can't generate imgui texture: invalid size {}x{}", size.x, size.y);
+        return nullptr;
+    }
+
+    try
+    {
+        // Do the size math in size_t to avoid int overflow for large textures
+        const size_t pixelCount = (size_t)size.x * (size_t)size.y;
+        std::vector<uint8> tmp(pixelCount * 4);
+
+        // Never expand more pixels than the destination holds, otherwise oversized input
+        // would write past the end of tmp
+        const size_t sourcePixels = data.size() / 3;
+        const size_t copyCount = (sourcePixels < pixelCount ? sourcePixels : pixelCount);
+        for (size_t i = 0; i < copyCount; ++i)
+        {
+            tmp[(i * 4) + 0] = data[(i * 3) + 0];
+            tmp[(i * 4) + 1] = data[(i * 3) + 1];
+            tmp[(i * 4) + 2] = data[(i * 3) + 2];
+            tmp[(i * 4) + 3] = 0xFF;
+        }
+
+        NS_STACK_SCOPED MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init();
+        desc->setTextureType(MTL::TextureType2D);
+        desc->setPixelFormat(MTL::PixelFormatRGBA8Unorm);
+        desc->setWidth(size.x);
+        desc->setHeight(size.y);
+        desc->setStorageMode(m_isAppleGPU ? MTL::StorageModeShared : MTL::StorageModeManaged);
+        desc->setUsage(MTL::TextureUsageShaderRead);
+
+        MTL::Texture* texture = m_device->newTexture(desc);
+
+        // TODO: do a GPU copy?
+        texture->replaceRegion(MTL::Region(0, 0, size.x, size.y), 0, 0, tmp.data(), size.x * 4, 0);
+
+        return (ImTextureID)texture;
+    }
+    catch (const std::exception& ex)
+    {
+        cemuLog_log(LogType::Force, "can't generate imgui texture: {}", ex.what());
+        return nullptr;
+    }
+}
+
+// Releases a texture that was handed out as an ImGui texture id.
+void MetalRenderer::DeleteTexture(ImTextureID id)
+{
+    EnsureImGuiBackend();
+
+    ((MTL::Texture*)id)->release();
+}
+
+// Destroys the font texture owned by the ImGui Metal backend.
+void MetalRenderer::DeleteFontTextures()
+{
+    EnsureImGuiBackend();
+
+    ImGui_ImplMetal_DestroyFontsTexture();
+}
+
+void MetalRenderer::AppendOverlayDebugInfo()
+{
+    ImGui::Text("--- GPU info ---");
+    ImGui::Text("GPU %s", m_device->name()->utf8String());
+    ImGui::Text("Is Apple GPU %s", (m_isAppleGPU ? "yes" : "no"));
+    ImGui::Text("Supports framebuffer fetch %s", (m_supportsFramebufferFetch ? "yes" : "no"));
+    ImGui::Text("Has unified memory %s", (m_hasUnifiedMemory ? "yes" : "no"));
+    ImGui::Text("Supports Metal3 %s", (m_supportsMetal3 ? "yes" : "no"));
+
+    ImGui::Text("--- Metal info ---");
+    ImGui::Text("Render pipeline states %zu", m_pipelineCache->GetPipelineCacheSize());
+
+    ImGui::Text("--- Metal info (per frame) ---");
+    ImGui::Text("Command buffers %u", m_performanceMonitor.m_commandBuffers);
+    ImGui::Text("Render passes %u", m_performanceMonitor.m_renderPasses);
+    ImGui::Text("Clears %u", m_performanceMonitor.m_clears);
+    ImGui::Text("Manual vertex fetch draws %u (mesh draws: %u)", m_performanceMonitor.m_manualVertexFetchDraws, m_performanceMonitor.m_meshDraws);
+    ImGui::Text("Triangle fans %u", m_performanceMonitor.m_triangleFans);
+
+    ImGui::Text("--- Cache debug info ---");
+
+    uint32 bufferCacheHeapSize = 0;
+    uint32 bufferCacheAllocationSize = 0;
+    uint32 bufferCacheNumAllocations = 0;
+
+    LatteBufferCache_getStats(bufferCacheHeapSize, bufferCacheAllocationSize, bufferCacheNumAllocations);
+
+    ImGui::Text("Buffer");
+    ImGui::SameLine(60.0f);
+    ImGui::Text("%06uKB / %06uKB Allocs: %u", (uint32)(bufferCacheAllocationSize + 1023) / 1024,
((uint32)bufferCacheHeapSize + 1023) / 1024, (uint32)bufferCacheNumAllocations);
+
+    uint32 numBuffers;
+    size_t totalSize, freeSize;
+
+    m_memoryManager->GetStagingAllocator().GetStats(numBuffers, totalSize, freeSize);
+    ImGui::Text("Staging");
+    ImGui::SameLine(60.0f);
+    ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers);
+
+    m_memoryManager->GetIndexAllocator().GetStats(numBuffers, totalSize, freeSize);
+    ImGui::Text("Index");
+    ImGui::SameLine(60.0f);
+    ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers);
+}
+
+// Stores the viewport in the renderer state; it is applied to the encoder at draw time.
+void MetalRenderer::renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ)
+{
+    // halfZ is handled in the shader
+
+    m_state.m_viewport = MTL::Viewport{x, y, width, height, nearZ, farZ};
+}
+
+// Stores the scissor rectangle in the renderer state. The signed inputs are converted to uint32.
+void MetalRenderer::renderTarget_setScissor(sint32 scissorX, sint32 scissorY, sint32 scissorWidth, sint32 scissorHeight)
+{
+    m_state.m_scissor = MTL::ScissorRect{(uint32)scissorX, (uint32)scissorY, (uint32)scissorWidth, (uint32)scissorHeight};
+}
+
+// Allocates a new Metal cached FBO for the given key.
+LatteCachedFBO* MetalRenderer::rendertarget_createCachedFBO(uint64 key)
+{
+    return new CachedFBOMtl(this, key);
+}
+
+// Clears the active FBO state when it refers to the FBO being deleted.
+// The object itself is not deleted here.
+void MetalRenderer::rendertarget_deleteCachedFBO(LatteCachedFBO* cfbo)
+{
+    if (cfbo == (LatteCachedFBO*)m_state.m_activeFBO.m_fbo)
+        m_state.m_activeFBO = {nullptr};
+}
+
+// Makes cfbo the active FBO together with its attachments info and flags the FBO as changed.
+void MetalRenderer::rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo)
+{
+    m_state.m_activeFBO = {(CachedFBOMtl*)cfbo, MetalAttachmentsInfo((CachedFBOMtl*)cfbo)};
+    m_state.m_fboChanged = true;
+}
+
+// Forwards to the memory manager's texture upload buffer.
+void* MetalRenderer::texture_acquireTextureUploadBuffer(uint32 size)
+{
+    return m_memoryManager->AcquireTextureUploadBuffer(size);
+}
+
+// Returns a buffer obtained from texture_acquireTextureUploadBuffer to the memory manager.
+void MetalRenderer::texture_releaseTextureUploadBuffer(uint8* mem)
+{
+    m_memoryManager->ReleaseTextureUploadBuffer(mem);
+}
+
+// Looks up the texture decoder in the pixel format info; dim, width and height are not used.
+TextureDecoder* MetalRenderer::texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height)
+{
+    return GetMtlPixelFormatInfo(format, isDepth).textureDecoder;
+}
+
+// Clears one slice/mip to zero: depth textures through the depth/stencil clear (stencil only
+// when the texture has stencil), everything else through the color clear.
+void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex)
+{
+    if (hostTexture->isDepth)
+    {
+        texture_clearDepthSlice(hostTexture, sliceIndex, mipIndex, true, hostTexture->hasStencil, 0.0f, 0);
+    }
+    else
+    {
+        texture_clearColorSlice(hostTexture, sliceIndex, mipIndex, 0.0f, 0.0f, 0.0f, 0.0f);
+    }
+}
+
+// Uploads pixelData (compressedImageSize bytes) into one slice/mip of hostTexture by staging
+// it in a temporary buffer and recording a buffer-to-texture blit.
+// TODO: do a cpu copy on Apple Silicon?
+void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize)
+{
+    auto textureMtl = (LatteTextureMtl*)hostTexture;
+
+    // For 3D textures the slice index addresses the Z offset instead of an array layer
+    uint32 offsetZ = 0;
+    if (textureMtl->Is3DTexture())
+    {
+        offsetZ = sliceIndex;
+        sliceIndex = 0;
+    }
+
+    size_t bytesPerRow = GetMtlTextureBytesPerRow(textureMtl->format, textureMtl->isDepth, width);
+    // No need to set bytesPerImage for 3D textures, since we always load just one slice
+    //size_t bytesPerImage = GetMtlTextureBytesPerImage(textureMtl->GetFormat(), textureMtl->isDepth, height, bytesPerRow);
+    //if (m_isAppleGPU)
+    //{
+    //    textureMtl->GetTexture()->replaceRegion(MTL::Region(0, 0, offsetZ, width, height, 1), mipIndex, sliceIndex, pixelData, bytesPerRow, 0);
+    //}
+    //else
+    //{
+        auto blitCommandEncoder = GetBlitCommandEncoder();
+
+        // Allocate a temporary buffer
+        auto& bufferAllocator = m_memoryManager->GetStagingAllocator();
+        auto allocation = bufferAllocator.AllocateBufferMemory(compressedImageSize, 1);
+        memcpy(allocation.memPtr, pixelData, compressedImageSize);
+        bufferAllocator.FlushReservation(allocation);
+
+        // TODO: specify blit options when copying to a depth stencil texture?
+        // Copy the data from the temporary buffer to the texture
+        blitCommandEncoder->copyFromBuffer(allocation.mtlBuffer, allocation.bufferOffset, bytesPerRow, 0, MTL::Size(width, height, 1), textureMtl->GetTexture(), sliceIndex, mipIndex, MTL::Origin(0, 0, offsetZ));
+    //}
+}
+
+// Clears one slice/mip of a color texture to the given color. Textures whose format is not
+// renderable are skipped with a one-time log message.
+void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a)
+{
+    if (!FormatIsRenderable(hostTexture->format))
+    {
+        cemuLog_logOnce(LogType::Force, "cannot clear color texture with format {}, because it's not renderable", hostTexture->format);
+        return;
+    }
+
+    auto mtlTexture = static_cast(hostTexture)->GetTexture();
+
+    ClearColorTextureInternal(mtlTexture, sliceIndex, mipIndex, r, g, b, a);
+}
+
+void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sliceIndex, sint32 mipIndex, bool clearDepth, bool clearStencil, float depthValue, uint32 stencilValue)
+{
+    clearStencil = (clearStencil && GetMtlPixelFormatInfo(hostTexture->format, true).hasStencil);
+    if (!clearDepth && !clearStencil)
+    {
+        cemuLog_logOnce(LogType::Force, "skipping depth/stencil clear");
+        return;
+    }
+
+    auto mtlTexture = static_cast(hostTexture)->GetTexture();
+
+    NS_STACK_SCOPED MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init();
+    if (clearDepth)
+    {
+        auto depthAttachment = renderPassDescriptor->depthAttachment();
+        depthAttachment->setTexture(mtlTexture);
+        depthAttachment->setClearDepth(depthValue);
+        depthAttachment->setLoadAction(MTL::LoadActionClear);
+        depthAttachment->setStoreAction(MTL::StoreActionStore);
+        depthAttachment->setSlice(sliceIndex);
+        depthAttachment->setLevel(mipIndex);
+    }
+    if (clearStencil)
+    {
+        auto stencilAttachment = renderPassDescriptor->stencilAttachment();
+        stencilAttachment->setTexture(mtlTexture);
+        stencilAttachment->setClearStencil(stencilValue);
+        stencilAttachment->setLoadAction(MTL::LoadActionClear);
+        
stencilAttachment->setStoreAction(MTL::StoreActionStore);
+        stencilAttachment->setSlice(sliceIndex);
+        stencilAttachment->setLevel(mipIndex);
+    }
+
+    GetTemporaryRenderCommandEncoder(renderPassDescriptor);
+    EndEncoding();
+
+    // Debug
+    m_performanceMonitor.m_clears++;
+}
+
+// Allocates a new Metal texture object; all parameters are passed through to LatteTextureMtl.
+LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth)
+{
+    return new LatteTextureMtl(this, dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth);
+}
+
+// Records the texture view bound to the given texture unit in the renderer state.
+void MetalRenderer::texture_setLatteTexture(LatteTextureView* textureView, uint32 textureUnit)
+{
+    m_state.m_textures[textureUnit] = static_cast<LatteTextureViewMtl*>(textureView);
+}
+
+// Records blit commands that copy a region of src into dst.
+// srcSlice/dstSlice address an array layer for non-3D textures and a Z offset for 3D textures.
+// srcDepth_ is the number of slices to copy and applies to both source and destination.
+void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, sint32 effectiveSrcX, sint32 effectiveSrcY, sint32 srcSlice, LatteTexture* dst, sint32 dstMip, sint32 effectiveDstX, sint32 effectiveDstY, sint32 dstSlice, sint32 effectiveCopyWidth, sint32 effectiveCopyHeight, sint32 srcDepth_)
+{
+    // Source size seems to apply to the destination texture as well, therefore we need to adjust it when block size doesn't match
+    Uvec2 srcBlockTexelSize = GetMtlPixelFormatInfo(src->format, src->isDepth).blockTexelSize;
+    Uvec2 dstBlockTexelSize = GetMtlPixelFormatInfo(dst->format, dst->isDepth).blockTexelSize;
+    if (srcBlockTexelSize.x != dstBlockTexelSize.x || srcBlockTexelSize.y != dstBlockTexelSize.y)
+    {
+        uint32 multX = (srcBlockTexelSize.x > dstBlockTexelSize.x ? srcBlockTexelSize.x / dstBlockTexelSize.x : dstBlockTexelSize.x / srcBlockTexelSize.x);
+        effectiveCopyWidth *= multX;
+
+        uint32 multY = (srcBlockTexelSize.y > dstBlockTexelSize.y ? srcBlockTexelSize.y / dstBlockTexelSize.y : dstBlockTexelSize.y / srcBlockTexelSize.y);
+        effectiveCopyHeight *= multY;
+    }
+
+    auto blitCommandEncoder = GetBlitCommandEncoder();
+
+    auto mtlSrc = static_cast<LatteTextureMtl*>(src)->GetTexture();
+    auto mtlDst = static_cast<LatteTextureMtl*>(dst)->GetTexture();
+
+    uint32 srcBaseLayer = 0;
+    uint32 dstBaseLayer = 0;
+    uint32 srcOffsetZ = 0;
+    uint32 dstOffsetZ = 0;
+    uint32 srcLayerCount = 1;
+    uint32 dstLayerCount = 1;
+    uint32 srcDepth = 1;
+    uint32 dstDepth = 1;
+
+    if (src->Is3DTexture())
+    {
+        srcOffsetZ = srcSlice;
+        srcDepth = srcDepth_;
+    }
+    else
+    {
+        srcBaseLayer = srcSlice;
+        srcLayerCount = srcDepth_;
+    }
+
+    if (dst->Is3DTexture())
+    {
+        dstOffsetZ = dstSlice;
+        dstDepth = srcDepth_;
+    }
+    else
+    {
+        dstBaseLayer = dstSlice;
+        dstLayerCount = srcDepth_;
+    }
+
+    // If copying whole textures, we can do a more efficient copy
+    if (effectiveSrcX == 0 && effectiveSrcY == 0 && effectiveDstX == 0 && effectiveDstY == 0 &&
+        srcOffsetZ == 0 && dstOffsetZ == 0 &&
+        effectiveCopyWidth == src->GetMipWidth(srcMip) && effectiveCopyHeight == src->GetMipHeight(srcMip) && srcDepth == src->GetMipDepth(srcMip) &&
+        effectiveCopyWidth == dst->GetMipWidth(dstMip) && effectiveCopyHeight == dst->GetMipHeight(dstMip) && dstDepth == dst->GetMipDepth(dstMip) &&
+        srcLayerCount == dstLayerCount)
+    {
+        blitCommandEncoder->copyFromTexture(mtlSrc, srcBaseLayer, srcMip, mtlDst, dstBaseLayer, dstMip, srcLayerCount, 1);
+    }
+    else
+    {
+        if (srcLayerCount == dstLayerCount)
+        {
+            for (uint32 i = 0; i < srcLayerCount; i++)
+            {
+                blitCommandEncoder->copyFromTexture(mtlSrc, srcBaseLayer + i, srcMip, MTL::Origin(effectiveSrcX, effectiveSrcY, srcOffsetZ), MTL::Size(effectiveCopyWidth, effectiveCopyHeight, srcDepth), mtlDst, dstBaseLayer + i, dstMip, MTL::Origin(effectiveDstX, effectiveDstY, dstOffsetZ));
+            }
+        }
+        else
+        {
+            // The layer counts only differ when one side is a 3D texture and the other one is
+            // layered, so the slices are copied one at a time. The side with a single layer is
+            // stepped through its depth (Z offset), the other side through its array layers.
+            // Slice i is addressed relative to the start slice, so the first copy begins
+            // exactly at srcSlice/dstSlice.
+            const uint32 sliceCount = std::max(srcLayerCount, dstLayerCount);
+            for (uint32 i = 0; i < sliceCount; i++)
+            {
+                const uint32 srcLayer = (srcLayerCount == 1 ? srcBaseLayer : srcBaseLayer + i);
+                const uint32 srcZ = (srcLayerCount == 1 ? srcOffsetZ + i : srcOffsetZ);
+                const uint32 dstLayer = (dstLayerCount == 1 ? dstBaseLayer : dstBaseLayer + i);
+                const uint32 dstZ = (dstLayerCount == 1 ? dstOffsetZ + i : dstOffsetZ);
+
+                blitCommandEncoder->copyFromTexture(mtlSrc, srcLayer, srcMip, MTL::Origin(effectiveSrcX, effectiveSrcY, srcZ), MTL::Size(effectiveCopyWidth, effectiveCopyHeight, 1), mtlDst, dstLayer, dstMip, MTL::Origin(effectiveDstX, effectiveDstY, dstZ));
+            }
+        }
+    }
+}
+
+// Creates a readback object for the texture view and reserves space for it in the readback
+// buffer. The write offset wraps to 0 when the texture would not fit behind the current offset.
+LatteTextureReadbackInfo* MetalRenderer::texture_createReadback(LatteTextureView* textureView)
+{
+    size_t uploadSize = static_cast<LatteTextureMtl*>(textureView->baseTexture)->GetTexture()->allocatedSize();
+
+    if ((m_readbackBufferWriteOffset + uploadSize) > TEXTURE_READBACK_SIZE)
+    {
+        m_readbackBufferWriteOffset = 0;
+    }
+
+    auto* result = new LatteTextureReadbackInfoMtl(this, textureView, m_readbackBufferWriteOffset);
+    m_readbackBufferWriteOffset += uploadSize;
+
+    return result;
+}
+
+// Copies a width x height region (scaled to the source's effective size) from the origin of
+// the source slice/mip to the origin of the destination slice/mip.
+void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* sourceTexture, sint32 srcMip, sint32 srcSlice, LatteTexture* destinationTexture, sint32 dstMip, sint32 dstSlice, sint32 width, sint32 height)
+{
+    // scale copy size to effective size
+    sint32 effectiveCopyWidth = width;
+    sint32 effectiveCopyHeight = height;
+    LatteTexture_scaleToEffectiveSize(sourceTexture, &effectiveCopyWidth, &effectiveCopyHeight, 0);
+    //sint32 sourceEffectiveWidth, sourceEffectiveHeight;
+    //sourceTexture->GetEffectiveSize(sourceEffectiveWidth, sourceEffectiveHeight, srcMip);
+
+    texture_copyImageSubData(sourceTexture, srcMip, 0, 0, srcSlice, destinationTexture, dstMip, 0, 0, dstSlice, effectiveCopyWidth, effectiveCopyHeight, 1);
+}
+
+// Forwards buffer cache creation to the memory manager.
+void MetalRenderer::bufferCache_init(const sint32 bufferSize)
+{
+    m_memoryManager->InitBufferCache(bufferSize);
+}
+
+// Forwards the upload to the memory manager (note the different argument order).
+void MetalRenderer::bufferCache_upload(uint8* buffer, sint32 size, uint32 bufferOffset)
+{
+    m_memoryManager->UploadToBufferCache(buffer, bufferOffset, size);
+}
+
+void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 size)
+{
+    m_memoryManager->CopyBufferCache(srcOffset, dstOffset,
size);
+}
+
+// Copies streamout data from the transform feedback ring buffer into the buffer cache.
+void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size)
+{
+    // With host memory as cache, the destination offset is made relative to the imported memory base
+    if (m_memoryManager->UseHostMemoryForCache())
+        dstOffset -= m_memoryManager->GetImportedMemBaseAddress();
+
+    CopyBufferToBuffer(GetXfbRingBuffer(), srcOffset, m_memoryManager->GetBufferCache(), dstOffset, size, MTL::RenderStageVertex | MTL::RenderStageMesh, ALL_MTL_RENDER_STAGES);
+}
+
+// Records the buffer cache offset of a vertex buffer; size is not used.
+// Only expected to be called when host memory is not used for the cache.
+void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size)
+{
+    cemu_assert_debug(!m_memoryManager->UseHostMemoryForCache());
+    cemu_assert_debug(bufferIndex < LATTE_MAX_VERTEX_BUFFERS);
+
+    m_state.m_vertexBufferOffsets[bufferIndex] = offset;
+}
+
+// Records the buffer cache offset of a uniform buffer for the given shader type; size is not used.
+// Only expected to be called when host memory is not used for the cache.
+void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size)
+{
+    cemu_assert_debug(!m_memoryManager->UseHostMemoryForCache());
+
+    m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shaderType)][bufferIndex] = offset;
+}
+
+// Allocates a new Metal shader object (note that source is passed as the last constructor argument).
+RendererShader* MetalRenderer::shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool isGfxPackShader)
+{
+    return new RendererShaderMtl(this, type, baseHash, auxHash, isGameShader, isGfxPackShader, source);
+}
+
+// Enables the streamout buffer and records its ring buffer offset; rangeAddr and rangeSize are not used.
+void MetalRenderer::streamout_setupXfbBuffer(uint32 bufferIndex, sint32 ringBufferOffset, uint32 rangeAddr, uint32 rangeSize)
+{
+    m_state.m_streamoutState.buffers[bufferIndex].enabled = true;
+    m_state.m_streamoutState.buffers[bufferIndex].ringBufferOffset = ringBufferOffset;
+}
+
+void MetalRenderer::streamout_begin()
+{
+    // Do nothing
+}
+
+void MetalRenderer::streamout_rendererFinishDrawcall()
+{
+    // Do nothing
+}
+
+// Prepares the state for a sequence of draw calls: updates the active shaders, the render
+// target, the textures, the viewport and the scissor box. Sets m_skipDrawSequence when the
+// sequence cannot or need not be drawn.
+void MetalRenderer::draw_beginSequence()
+{
+    m_state.m_skipDrawSequence = false;
+
+    bool streamoutEnable = LatteGPUState.contextRegister[mmVGT_STRMOUT_EN] != 0;
+
+    // update shader state
+    LatteSHRC_UpdateActiveShaders();
+    if (LatteGPUState.activeShaderHasError)
+    {
+        cemuLog_logOnce(LogType::Force, "Skipping drawcalls due to shader error\n");
+        m_state.m_skipDrawSequence = true;
+        cemu_assert_debug(false);
+        return;
+    }
+
+    // update render target and texture state
+    // (repeated for as long as the texture update requests another initialization pass)
+    LatteGPUState.requiresTextureBarrier = false;
+    while (true)
+    {
+        LatteGPUState.repeatTextureInitialization = false;
+        if (!LatteMRT::UpdateCurrentFBO())
+        {
+            cemuLog_logOnce(LogType::Force, "Rendertarget invalid\n");
+            m_state.m_skipDrawSequence = true;
+            return; // no render target
+        }
+
+        // without any attachment a draw is only useful when streamout is enabled
+        if (!hasValidFramebufferAttached && !streamoutEnable)
+        {
+            cemuLog_logOnce(LogType::Force, "Drawcall with no color buffer or depth buffer attached\n");
+            m_state.m_skipDrawSequence = true;
+            return; // no render target
+        }
+        LatteTexture_updateTextures();
+        if (!LatteGPUState.repeatTextureInitialization)
+            break;
+    }
+
+    // apply render target
+    LatteMRT::ApplyCurrentState();
+
+    // viewport and scissor box
+    LatteRenderTarget_updateViewport();
+    LatteRenderTarget_updateScissorBox();
+
+    // with rasterization disabled, only streamout makes the draws worth executing
+    if (!LatteGPUState.contextNew.IsRasterizationEnabled() && !streamoutEnable)
+        m_state.m_skipDrawSequence = true;
+}
+
+void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst)
+{
+    if (m_state.m_skipDrawSequence)
+    {
+        LatteGPUState.drawCallCounter++;
+        return;
+    }
+
+    // fast clear color as depth
+    if (LatteGPUState.contextNew.GetSpecialStateValues()[8] != 0)
+    {
+        LatteDraw_handleSpecialState8_clearAsDepth();
+        LatteGPUState.drawCallCounter++;
+        return;
+    }
+    else if (LatteGPUState.contextNew.GetSpecialStateValues()[5] != 0)
+    {
+        draw_handleSpecialState5();
+        LatteGPUState.drawCallCounter++;
+        return;
+    }
+
+    auto& encoderState = m_state.m_encoderState;
+
+    // Shaders
+    LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader();
+    LatteDecompilerShader* geometryShader = LatteSHRC_GetActiveGeometryShader();
+    LatteDecompilerShader*
pixelShader = LatteSHRC_GetActivePixelShader(); + const auto fetchShader = LatteSHRC_GetActiveFetchShader(); + + /* + bool neverSkipAccurateBarrier = false; + + // "Accurate barriers" is usually enabled globally but since the CPU cost is substantial we allow users to disable it (debug -> 'Accurate barriers' option) + // We always force accurate barriers for known problematic shaders + if (pixelShader) + { + if (pixelShader->baseHash == 0x6f6f6e7b9aae57af && pixelShader->auxHash == 0x00078787f9249249) // BotW lava + neverSkipAccurateBarrier = true; + if (pixelShader->baseHash == 0x4c0bd596e3aef4a6 && pixelShader->auxHash == 0x003c3c3fc9269249) // BotW foam layer for water on the bottom of waterfalls + neverSkipAccurateBarrier = true; + } + + // Check if we need to end the render pass + if (!m_state.m_isFirstDrawInRenderPass && (GetConfig().vk_accurate_barriers || neverSkipAccurateBarrier)) + { + // Fragment shader is most likely to require a render pass flush, so check for it first + bool endRenderPass = CheckIfRenderPassNeedsFlush(pixelShader); + if (!endRenderPass) + endRenderPass = CheckIfRenderPassNeedsFlush(vertexShader); + if (!endRenderPass && geometryShader) + endRenderPass = CheckIfRenderPassNeedsFlush(geometryShader); + + if (endRenderPass) + { + EndEncoding(); + // TODO: only log in debug? 
+ cemuLog_logOnce(LogType::Force, "Ending render pass due to render target self-dependency\n"); + } + } + */ + + // Primitive type + const LattePrimitiveMode primitiveMode = LatteGPUState.contextNew.VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE(); + auto mtlPrimitiveType = GetMtlPrimitiveType(primitiveMode); + + bool usesGeometryShader = UseGeometryShader(LatteGPUState.contextNew, geometryShader != nullptr); + if (usesGeometryShader && !m_supportsMeshShaders) + return; + + bool fetchVertexManually = (usesGeometryShader || fetchShader->mtlFetchVertexManually); + + // Index buffer + Renderer::INDEX_TYPE hostIndexType; + uint32 hostIndexCount; + uint32 indexMin = 0; + uint32 indexMax = 0; + Renderer::IndexAllocation indexAllocation; + LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexAllocation); + auto indexAllocationMtl = static_cast(indexAllocation.rendererInternal); + + // Buffer cache + if (m_memoryManager->UseHostMemoryForCache()) + { + // direct memory access (Wii U memory space imported as a buffer), update buffer bindings + draw_updateVertexBuffersDirectAccess(); + if (vertexShader) + draw_updateUniformBuffersDirectAccess(vertexShader, mmSQ_VTX_UNIFORM_BLOCK_START); + if (geometryShader) + draw_updateUniformBuffersDirectAccess(geometryShader, mmSQ_GS_UNIFORM_BLOCK_START); + if (pixelShader) + draw_updateUniformBuffersDirectAccess(pixelShader, mmSQ_PS_UNIFORM_BLOCK_START); + } + else + { + // synchronize vertex and uniform cache and update buffer bindings + // We need to call this before getting the render command encoder, since it can cause buffer copies + LatteBufferCache_Sync(indexMin + baseVertex, indexMax + baseVertex, baseInstance, instanceCount); + } + + // Render pass + auto renderCommandEncoder = GetRenderCommandEncoder(); + + // Render pipeline state + PipelineObject* pipelineObj = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, 
geometryShader, pixelShader, m_state.m_lastUsedFBO.m_attachmentsInfo, m_state.m_activeFBO.m_attachmentsInfo, m_state.m_activeFBO.m_fbo->m_size, count, LatteGPUState.contextNew); + if (!pipelineObj->m_pipeline) + return; + + if (pipelineObj->m_pipeline != encoderState.m_renderPipelineState) + { + renderCommandEncoder->setRenderPipelineState(pipelineObj->m_pipeline); + encoderState.m_renderPipelineState = pipelineObj->m_pipeline; + } + + // Depth stencil state + + // Disable depth write when there is no depth attachment + auto& depthControl = LatteGPUState.contextNew.DB_DEPTH_CONTROL; + bool depthWriteEnable = depthControl.get_Z_WRITE_ENABLE(); + if (!m_state.m_activeFBO.m_fbo->depthBuffer.texture) + depthControl.set_Z_WRITE_ENABLE(false); + + MTL::DepthStencilState* depthStencilState = m_depthStencilCache->GetDepthStencilState(LatteGPUState.contextNew); + if (depthStencilState != encoderState.m_depthStencilState) + { + renderCommandEncoder->setDepthStencilState(depthStencilState); + encoderState.m_depthStencilState = depthStencilState; + } + + // Restore the original depth write state + depthControl.set_Z_WRITE_ENABLE(depthWriteEnable); + + // Stencil reference + bool stencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ENABLE(); + if (stencilEnable) + { + bool backStencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_BACK_STENCIL_ENABLE(); + uint32 stencilRefFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILREF_F(); + uint32 stencilRefBack; + if (backStencilEnable) + stencilRefBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILREF_B(); + else + stencilRefBack = stencilRefFront; + + if (stencilRefFront != encoderState.m_stencilRefFront || stencilRefBack != encoderState.m_stencilRefBack) + { + renderCommandEncoder->setStencilReferenceValues(stencilRefFront, stencilRefBack); + + encoderState.m_stencilRefFront = stencilRefFront; + encoderState.m_stencilRefBack = stencilRefBack; + } + } + + // Blend color + uint32* 
blendColorConstantU32 = LatteGPUState.contextRegister + Latte::REGADDR::CB_BLEND_RED; + + if (blendColorConstantU32[0] != encoderState.m_blendColor[0] || blendColorConstantU32[1] != encoderState.m_blendColor[1] || blendColorConstantU32[2] != encoderState.m_blendColor[2] || blendColorConstantU32[3] != encoderState.m_blendColor[3]) + { + float* blendColorConstant = (float*)LatteGPUState.contextRegister + Latte::REGADDR::CB_BLEND_RED; + renderCommandEncoder->setBlendColor(blendColorConstant[0], blendColorConstant[1], blendColorConstant[2], blendColorConstant[3]); + + encoderState.m_blendColor[0] = blendColorConstantU32[0]; + encoderState.m_blendColor[1] = blendColorConstantU32[1]; + encoderState.m_blendColor[2] = blendColorConstantU32[2]; + encoderState.m_blendColor[3] = blendColorConstantU32[3]; + } + + // polygon control + const auto& polygonControlReg = LatteGPUState.contextNew.PA_SU_SC_MODE_CNTL; + const auto frontFace = polygonControlReg.get_FRONT_FACE(); + uint32 cullFront = polygonControlReg.get_CULL_FRONT(); + uint32 cullBack = polygonControlReg.get_CULL_BACK(); + uint32 polyOffsetFrontEnable = polygonControlReg.get_OFFSET_FRONT_ENABLED(); + + if (polyOffsetFrontEnable) + { + uint32 frontScaleU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_SCALE.getRawValue(); + uint32 frontOffsetU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_OFFSET.getRawValue(); + uint32 offsetClampU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_CLAMP.getRawValue(); + + if (frontOffsetU32 != encoderState.m_depthBias || frontScaleU32 != encoderState.m_depthSlope || offsetClampU32 != encoderState.m_depthClamp) + { + float frontScale = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_SCALE.get_SCALE(); + float frontOffset = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_OFFSET.get_OFFSET(); + float offsetClamp = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_CLAMP.get_CLAMP(); + + frontScale /= 16.0f; + + renderCommandEncoder->setDepthBias(frontOffset, frontScale, offsetClamp); + 
+ encoderState.m_depthBias = frontOffsetU32; + encoderState.m_depthSlope = frontScaleU32; + encoderState.m_depthClamp = offsetClampU32; + } + } + else + { + if (0 != encoderState.m_depthBias || 0 != encoderState.m_depthSlope || 0 != encoderState.m_depthClamp) + { + renderCommandEncoder->setDepthBias(0.0f, 0.0f, 0.0f); + + encoderState.m_depthBias = 0; + encoderState.m_depthSlope = 0; + encoderState.m_depthClamp = 0; + } + } + + // Depth clip mode + cemu_assert_debug(LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_NEAR_DISABLE() == LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_FAR_DISABLE()); // near or far clipping can be disabled individually + bool zClipEnable = LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_FAR_DISABLE() == false; + + if (zClipEnable != encoderState.m_depthClipEnable) + { + renderCommandEncoder->setDepthClipMode(zClipEnable ? MTL::DepthClipModeClip : MTL::DepthClipModeClamp); + encoderState.m_depthClipEnable = zClipEnable; + } + + // Visibility result mode + if (m_occlusionQuery.m_active) + { + auto mode = (m_occlusionQuery.m_currentIndex == INVALID_UINT32 ? MTL::VisibilityResultModeDisabled : MTL::VisibilityResultModeCounting); + renderCommandEncoder->setVisibilityResultMode(mode, m_occlusionQuery.m_currentIndex * sizeof(uint64)); + } + + // todo - how does culling behave with rects? 
+ // right now we just assume that their winding is always CW + if (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS) + { + if (frontFace == Latte::LATTE_PA_SU_SC_MODE_CNTL::E_FRONTFACE::CW) + cullFront = cullBack; + else + cullBack = cullFront; + } + + // Cull mode + + // Cull front and back is handled by disabling rasterization + if (!(cullFront && cullBack)) + { + MTL::CullMode cullMode; + if (cullFront) + cullMode = MTL::CullModeFront; + else if (cullBack) + cullMode = MTL::CullModeBack; + else + cullMode = MTL::CullModeNone; + + if (cullMode != encoderState.m_cullMode) + { + renderCommandEncoder->setCullMode(cullMode); + encoderState.m_cullMode = cullMode; + } + } + + // Front face + MTL::Winding frontFaceWinding; + if (frontFace == Latte::LATTE_PA_SU_SC_MODE_CNTL::E_FRONTFACE::CCW) + frontFaceWinding = MTL::WindingCounterClockwise; + else + frontFaceWinding = MTL::WindingClockwise; + + if (frontFaceWinding != encoderState.m_frontFaceWinding) + { + renderCommandEncoder->setFrontFacingWinding(frontFaceWinding); + encoderState.m_frontFaceWinding = frontFaceWinding; + } + + // Viewport + if (m_state.m_viewport.originX != encoderState.m_viewport.originX || + m_state.m_viewport.originY != encoderState.m_viewport.originY || + m_state.m_viewport.width != encoderState.m_viewport.width || + m_state.m_viewport.height != encoderState.m_viewport.height || + m_state.m_viewport.znear != encoderState.m_viewport.znear || + m_state.m_viewport.zfar != encoderState.m_viewport.zfar) + { + renderCommandEncoder->setViewport(m_state.m_viewport); + + encoderState.m_viewport = m_state.m_viewport; + } + + // Scissor + if (m_state.m_scissor.x != encoderState.m_scissor.x || + m_state.m_scissor.y != encoderState.m_scissor.y || + m_state.m_scissor.width != encoderState.m_scissor.width || + m_state.m_scissor.height != encoderState.m_scissor.height) + { + encoderState.m_scissor = m_state.m_scissor; + + // TODO: clamp scissor to render target dimensions? 
+ //scissor.width = ; + //scissor.height = ; + renderCommandEncoder->setScissorRect(encoderState.m_scissor); + } + + // Resources + + // Vertex buffers + for (uint8 i = 0; i < MAX_MTL_VERTEX_BUFFERS; i++) + { + size_t offset = m_state.m_vertexBufferOffsets[i]; + if (offset != INVALID_OFFSET) + { + // Bind + SetBuffer(renderCommandEncoder, GetMtlShaderType(vertexShader->shaderType, usesGeometryShader), m_memoryManager->GetBufferCache(), offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); + } + } + + // Prepare streamout + m_state.m_streamoutState.verticesPerInstance = count; + LatteStreamout_PrepareDrawcall(count, instanceCount); + + // Uniform buffers, textures and samplers + BindStageResources(renderCommandEncoder, vertexShader, usesGeometryShader); + if (usesGeometryShader && geometryShader) + BindStageResources(renderCommandEncoder, geometryShader, usesGeometryShader); + BindStageResources(renderCommandEncoder, pixelShader, usesGeometryShader); + + // Draw + if (usesGeometryShader) + { + if (hostIndexType != INDEX_TYPE::NONE) + SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_OBJECT, indexAllocationMtl->mtlBuffer, indexAllocationMtl->bufferOffset, vertexShader->resourceMapping.indexBufferBinding); + + uint8 hostIndexTypeU8 = (uint8)hostIndexType; + renderCommandEncoder->setObjectBytes(&hostIndexTypeU8, sizeof(hostIndexTypeU8), vertexShader->resourceMapping.indexTypeBinding); + encoderState.m_buffers[METAL_SHADER_TYPE_OBJECT][vertexShader->resourceMapping.indexTypeBinding] = {nullptr}; + + uint32 verticesPerPrimitive = GetVerticesPerPrimitive(primitiveMode); + uint32 threadgroupCount = count * instanceCount; + if (PrimitiveRequiresConnection(primitiveMode)) + threadgroupCount -= verticesPerPrimitive - 1; + else + threadgroupCount /= verticesPerPrimitive; + + renderCommandEncoder->drawMeshThreadgroups(MTL::Size(threadgroupCount, 1, 1), MTL::Size(verticesPerPrimitive, 1, 1), MTL::Size(1, 1, 1)); + } + else + { + if (hostIndexType != INDEX_TYPE::NONE) + { + auto mtlIndexType 
= GetMtlIndexType(hostIndexType); + renderCommandEncoder->drawIndexedPrimitives(mtlPrimitiveType, hostIndexCount, mtlIndexType, indexAllocationMtl->mtlBuffer, indexAllocationMtl->bufferOffset, instanceCount, baseVertex, baseInstance); + } + else + { + renderCommandEncoder->drawPrimitives(mtlPrimitiveType, baseVertex, count, instanceCount, baseInstance); + } + } + + m_state.m_isFirstDrawInRenderPass = false; + + // Occlusion queries + if (m_occlusionQuery.m_active) + m_occlusionQuery.m_currentIndex = (m_occlusionQuery.m_currentIndex + 1) % OCCLUSION_QUERY_POOL_SIZE; + + // Streamout + LatteStreamout_FinishDrawcall(m_memoryManager->UseHostMemoryForCache()); + + // Debug + if (fetchVertexManually) + m_performanceMonitor.m_manualVertexFetchDraws++; + if (usesGeometryShader) + m_performanceMonitor.m_meshDraws++; + if (primitiveMode == LattePrimitiveMode::TRIANGLE_FAN) + m_performanceMonitor.m_triangleFans++; + + LatteGPUState.drawCallCounter++; +} + +void MetalRenderer::draw_endSequence() +{ + LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); + // post-drawcall logic + if (pixelShader) + LatteRenderTarget_trackUpdates(); + bool hasReadback = LatteTextureReadback_Update(); + m_recordedDrawcalls++; + // The number of draw calls needs to twice as big, since we are interrupting the render pass + // TODO: ucomment? + if (m_recordedDrawcalls >= m_commitTreshold * 2/* || hasReadback*/) + { + CommitCommandBuffer(); + + // TODO: where should this be called? 
+ LatteTextureReadback_UpdateFinishedTransfers(false); + } +} + +void MetalRenderer::draw_updateVertexBuffersDirectAccess() +{ + LatteFetchShader* parsedFetchShader = LatteSHRC_GetActiveFetchShader(); + if (!parsedFetchShader) + return; + + for (auto& bufferGroup : parsedFetchShader->bufferGroups) + { + uint32 bufferIndex = bufferGroup.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + MPTR bufferAddress = LatteGPUState.contextRegister[bufferBaseRegisterIndex + 0]; + + if (bufferAddress == MPTR_NULL) [[unlikely]] + bufferAddress = m_memoryManager->GetImportedMemBaseAddress(); + + m_state.m_vertexBufferOffsets[bufferIndex] = bufferAddress - m_memoryManager->GetImportedMemBaseAddress(); + } +} + +void MetalRenderer::draw_updateUniformBuffersDirectAccess(LatteDecompilerShader* shader, const uint32 uniformBufferRegOffset) +{ + if (shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK) + { + for (const auto& buf : shader->list_quickBufferList) + { + sint32 i = buf.index; + MPTR physicalAddr = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 0]; + uint32 uniformSize = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 1] + 1; + + if (physicalAddr == MPTR_NULL) [[unlikely]] + { + cemu_assert_unimplemented(); + continue; + } + uniformSize = std::min(uniformSize, buf.size); + + cemu_assert_debug(physicalAddr < 0x50000000); + + uint32 bufferIndex = i; + cemu_assert_debug(bufferIndex < 16); + + m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shader->shaderType)][bufferIndex] = physicalAddr - m_memoryManager->GetImportedMemBaseAddress(); + } + } +} + +void MetalRenderer::draw_handleSpecialState5() +{ + LatteMRT::UpdateCurrentFBO(); + LatteRenderTarget_updateViewport(); + + LatteTextureView* colorBuffer = LatteMRT::GetColorAttachment(0); + LatteTextureView* depthBuffer = LatteMRT::GetDepthAttachment(); + auto colorTextureMtl = static_cast(colorBuffer); + auto depthTextureMtl = 
static_cast(depthBuffer); + + sint32 vpWidth, vpHeight; + LatteMRT::GetVirtualViewportDimensions(vpWidth, vpHeight); + + // Get the pipeline + MTL::PixelFormat colorPixelFormat = colorTextureMtl->GetRGBAView()->pixelFormat(); + auto& pipeline = m_copyDepthToColorPipelines[colorPixelFormat]; + if (!pipeline) + { + m_copyDepthToColorDesc->colorAttachments()->object(0)->setPixelFormat(colorPixelFormat); + + NS::Error* error = nullptr; + pipeline = m_device->newRenderPipelineState(m_copyDepthToColorDesc, &error); + if (error) + { + cemuLog_log(LogType::Force, "failed to create copy depth to color pipeline (error: {})", error->localizedDescription()->utf8String()); + } + } + + // Sadly, we need to end encoding to ensure that the depth data is up-to-date + EndEncoding(); + + // Copy depth to color + auto renderCommandEncoder = GetRenderCommandEncoder(); + + auto& encoderState = m_state.m_encoderState; + + renderCommandEncoder->setRenderPipelineState(pipeline); + // TODO: make a helper function for this + encoderState.m_renderPipelineState = pipeline; + SetTexture(renderCommandEncoder, METAL_SHADER_TYPE_FRAGMENT, depthTextureMtl->GetRGBAView(), GET_HELPER_TEXTURE_BINDING(0)); + // TODO: make a helper function for this + renderCommandEncoder->setFragmentBytes(&vpWidth, sizeof(sint32), GET_HELPER_BUFFER_BINDING(0)); + encoderState.m_buffers[METAL_SHADER_TYPE_FRAGMENT][GET_HELPER_BUFFER_BINDING(0)] = {nullptr}; + + renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); +} + +Renderer::IndexAllocation MetalRenderer::indexData_reserveIndexMemory(uint32 size) +{ + auto allocation = m_memoryManager->GetIndexAllocator().AllocateBufferMemory(size, 128); + + return {allocation->memPtr, allocation}; +} + +void MetalRenderer::indexData_releaseIndexMemory(IndexAllocation& allocation) +{ + m_memoryManager->GetIndexAllocator().FreeReservation(static_cast(allocation.rendererInternal)); +} + +void 
MetalRenderer::indexData_uploadIndexMemory(IndexAllocation& allocation) +{ + m_memoryManager->GetIndexAllocator().FlushReservation(static_cast(allocation.rendererInternal)); +} + +LatteQueryObject* MetalRenderer::occlusionQuery_create() { + return new LatteQueryObjectMtl(this); +} + +void MetalRenderer::occlusionQuery_destroy(LatteQueryObject* queryObj) { + auto queryObjMtl = static_cast(queryObj); + delete queryObjMtl; +} + +void MetalRenderer::occlusionQuery_flush() { + if (m_occlusionQuery.m_lastCommandBuffer) + m_occlusionQuery.m_lastCommandBuffer->waitUntilCompleted(); +} + +void MetalRenderer::occlusionQuery_updateState() { + ProcessFinishedCommandBuffers(); +} + +void MetalRenderer::SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Buffer* buffer, size_t offset, uint32 index) +{ + auto& boundBuffer = m_state.m_encoderState.m_buffers[shaderType][index]; + if (buffer == boundBuffer.m_buffer && offset == boundBuffer.m_offset) + return; + + if (buffer == boundBuffer.m_buffer) + { + // Update just the offset + boundBuffer.m_offset = offset; + + switch (shaderType) + { + case METAL_SHADER_TYPE_VERTEX: + renderCommandEncoder->setVertexBufferOffset(offset, index); + break; + case METAL_SHADER_TYPE_OBJECT: + renderCommandEncoder->setObjectBufferOffset(offset, index); + break; + case METAL_SHADER_TYPE_MESH: + renderCommandEncoder->setMeshBufferOffset(offset, index); + break; + case METAL_SHADER_TYPE_FRAGMENT: + renderCommandEncoder->setFragmentBufferOffset(offset, index); + break; + } + + return; + } + + boundBuffer = {buffer, offset}; + + switch (shaderType) + { + case METAL_SHADER_TYPE_VERTEX: + renderCommandEncoder->setVertexBuffer(buffer, offset, index); + break; + case METAL_SHADER_TYPE_OBJECT: + renderCommandEncoder->setObjectBuffer(buffer, offset, index); + break; + case METAL_SHADER_TYPE_MESH: + renderCommandEncoder->setMeshBuffer(buffer, offset, index); + break; + case METAL_SHADER_TYPE_FRAGMENT: + 
renderCommandEncoder->setFragmentBuffer(buffer, offset, index); + break; + } +} + +void MetalRenderer::SetTexture(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Texture* texture, uint32 index) +{ + auto& boundTexture = m_state.m_encoderState.m_textures[shaderType][index]; + if (texture == boundTexture) + return; + + boundTexture = texture; + + switch (shaderType) + { + case METAL_SHADER_TYPE_VERTEX: + renderCommandEncoder->setVertexTexture(texture, index); + break; + case METAL_SHADER_TYPE_OBJECT: + renderCommandEncoder->setObjectTexture(texture, index); + break; + case METAL_SHADER_TYPE_MESH: + renderCommandEncoder->setMeshTexture(texture, index); + break; + case METAL_SHADER_TYPE_FRAGMENT: + renderCommandEncoder->setFragmentTexture(texture, index); + break; + } +} + +void MetalRenderer::SetSamplerState(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::SamplerState* samplerState, uint32 index) +{ + auto& boundSamplerState = m_state.m_encoderState.m_samplers[shaderType][index]; + if (samplerState == boundSamplerState) + return; + + boundSamplerState = samplerState; + + switch (shaderType) + { + case METAL_SHADER_TYPE_VERTEX: + renderCommandEncoder->setVertexSamplerState(samplerState, index); + break; + case METAL_SHADER_TYPE_OBJECT: + renderCommandEncoder->setObjectSamplerState(samplerState, index); + break; + case METAL_SHADER_TYPE_MESH: + renderCommandEncoder->setMeshSamplerState(samplerState, index); + break; + case METAL_SHADER_TYPE_FRAGMENT: + renderCommandEncoder->setFragmentSamplerState(samplerState, index); + break; + } +} + +MTL::CommandBuffer* MetalRenderer::GetCommandBuffer() +{ + bool needsNewCommandBuffer = (!m_currentCommandBuffer.m_commandBuffer || m_currentCommandBuffer.m_commited); + if (needsNewCommandBuffer) + { + // Debug + //m_commandQueue->insertDebugCaptureBoundary(); + + MTL::CommandBuffer* mtlCommandBuffer = m_commandQueue->commandBuffer(); + m_currentCommandBuffer = 
{mtlCommandBuffer}; + + // Wait for the previous command buffer + if (m_eventValue != -1) + mtlCommandBuffer->encodeWait(m_event, m_eventValue); + + m_recordedDrawcalls = 0; + m_commitTreshold = m_defaultCommitTreshlod; + + // Debug + m_performanceMonitor.m_commandBuffers++; + + return mtlCommandBuffer; + } + else + { + return m_currentCommandBuffer.m_commandBuffer; + } +} + +MTL::RenderCommandEncoder* MetalRenderer::GetTemporaryRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor) +{ + EndEncoding(); + + auto commandBuffer = GetCommandBuffer(); + + auto renderCommandEncoder = commandBuffer->renderCommandEncoder(renderPassDescriptor); +#ifdef CEMU_DEBUG_ASSERT + renderCommandEncoder->setLabel(GetLabel("Temporary render command encoder", renderCommandEncoder)); +#endif + m_commandEncoder = renderCommandEncoder; + m_encoderType = MetalEncoderType::Render; + + // Debug + m_performanceMonitor.m_renderPasses++; + + return renderCommandEncoder; +} + +// Some render passes clear the attachments, forceRecreate is supposed to be used in those cases +MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecreate) +{ + bool fboChanged = m_state.m_fboChanged; + m_state.m_fboChanged = false; + + // Check if we need to begin a new render pass + if (m_commandEncoder) + { + if (!forceRecreate) + { + if (m_encoderType == MetalEncoderType::Render) + { + bool needsNewRenderPass = false; + if (fboChanged) + { + needsNewRenderPass = (m_state.m_lastUsedFBO.m_fbo == nullptr); + if (!needsNewRenderPass) + { + for (uint8 i = 0; i < 8; i++) + { + if (m_state.m_activeFBO.m_fbo->colorBuffer[i].texture && m_state.m_activeFBO.m_fbo->colorBuffer[i].texture != m_state.m_lastUsedFBO.m_fbo->colorBuffer[i].texture) + { + needsNewRenderPass = true; + break; + } + } + } + + if (!needsNewRenderPass) + { + if (m_state.m_activeFBO.m_fbo->depthBuffer.texture && (m_state.m_activeFBO.m_fbo->depthBuffer.texture != m_state.m_lastUsedFBO.m_fbo->depthBuffer.texture || ( 
m_state.m_activeFBO.m_fbo->depthBuffer.hasStencil && !m_state.m_lastUsedFBO.m_fbo->depthBuffer.hasStencil))) + { + needsNewRenderPass = true; + } + } + } + + if (!needsNewRenderPass) + { + return (MTL::RenderCommandEncoder*)m_commandEncoder; + } + } + } + + EndEncoding(); + } + + auto commandBuffer = GetCommandBuffer(); + + auto renderCommandEncoder = commandBuffer->renderCommandEncoder(m_state.m_activeFBO.m_fbo->GetRenderPassDescriptor()); +#ifdef CEMU_DEBUG_ASSERT + renderCommandEncoder->setLabel(GetLabel("Render command encoder", renderCommandEncoder)); +#endif + m_commandEncoder = renderCommandEncoder; + m_encoderType = MetalEncoderType::Render; + + // Update state + m_state.m_lastUsedFBO = m_state.m_activeFBO; + m_state.m_isFirstDrawInRenderPass = true; + + ResetEncoderState(); + + // Debug + m_performanceMonitor.m_renderPasses++; + + return renderCommandEncoder; +} + +MTL::ComputeCommandEncoder* MetalRenderer::GetComputeCommandEncoder() +{ + if (m_commandEncoder) + { + if (m_encoderType == MetalEncoderType::Compute) + { + return (MTL::ComputeCommandEncoder*)m_commandEncoder; + } + + EndEncoding(); + } + + auto commandBuffer = GetCommandBuffer(); + + auto computeCommandEncoder = commandBuffer->computeCommandEncoder(); + m_commandEncoder = computeCommandEncoder; + m_encoderType = MetalEncoderType::Compute; + + ResetEncoderState(); + + return computeCommandEncoder; +} + +MTL::BlitCommandEncoder* MetalRenderer::GetBlitCommandEncoder() +{ + if (m_commandEncoder) + { + if (m_encoderType == MetalEncoderType::Blit) + { + return (MTL::BlitCommandEncoder*)m_commandEncoder; + } + + EndEncoding(); + } + + auto commandBuffer = GetCommandBuffer(); + + auto blitCommandEncoder = commandBuffer->blitCommandEncoder(); + m_commandEncoder = blitCommandEncoder; + m_encoderType = MetalEncoderType::Blit; + + ResetEncoderState(); + + return blitCommandEncoder; +} + +void MetalRenderer::EndEncoding() +{ + if (m_commandEncoder) + { + m_commandEncoder->endEncoding(); + 
m_commandEncoder->release(); + m_commandEncoder = nullptr; + m_encoderType = MetalEncoderType::None; + + // Commit the command buffer if enough draw calls have been recorded + if (m_recordedDrawcalls >= m_commitTreshold) + CommitCommandBuffer(); + } +} + +void MetalRenderer::CommitCommandBuffer() +{ + if (!m_currentCommandBuffer.m_commandBuffer) + return; + + EndEncoding(); + + ProcessFinishedCommandBuffers(); + + // Commit the command buffer + if (!m_currentCommandBuffer.m_commited) + { + // Handled differently, since it seems like Metal doesn't always call the completion handler + //commandBuffer.m_commandBuffer->addCompletedHandler(^(MTL::CommandBuffer*) { + // m_memoryManager->GetTemporaryBufferAllocator().CommandBufferFinished(commandBuffer.m_commandBuffer); + //}); + + // Signal event + m_eventValue = (m_eventValue + 1) % EVENT_VALUE_WRAP; + auto mtlCommandBuffer = m_currentCommandBuffer.m_commandBuffer; + mtlCommandBuffer->encodeSignalEvent(m_event, m_eventValue); + + mtlCommandBuffer->commit(); + m_currentCommandBuffer.m_commited = true; + + m_executingCommandBuffers.push_back(mtlCommandBuffer); + + // Debug + //m_commandQueue->insertDebugCaptureBoundary(); + } +} + +void MetalRenderer::ProcessFinishedCommandBuffers() +{ + // Check for finished command buffers + for (auto it = m_executingCommandBuffers.begin(); it != m_executingCommandBuffers.end();) + { + auto commandBuffer = *it; + if (CommandBufferCompleted(commandBuffer)) + { + m_memoryManager->CleanupBuffers(commandBuffer); + commandBuffer->release(); + it = m_executingCommandBuffers.erase(it); + } + else + { + ++it; + } + } +} + +bool MetalRenderer::AcquireDrawable(bool mainWindow) +{ + auto& layer = GetLayer(mainWindow); + if (!layer.GetLayer()) + return false; + + const bool latteBufferUsesSRGB = mainWindow ? LatteGPUState.tvBufferUsesSRGB : LatteGPUState.drcBufferUsesSRGB; + if (latteBufferUsesSRGB != m_state.m_usesSRGB) + { + layer.GetLayer()->setPixelFormat(latteBufferUsesSRGB ? 
MTL::PixelFormatBGRA8Unorm_sRGB : MTL::PixelFormatBGRA8Unorm); + m_state.m_usesSRGB = latteBufferUsesSRGB; + } + + return layer.AcquireDrawable(); +} + +/* +bool MetalRenderer::CheckIfRenderPassNeedsFlush(LatteDecompilerShader* shader) +{ + sint32 textureCount = shader->resourceMapping.getTextureCount(); + for (int i = 0; i < textureCount; ++i) + { + const auto relative_textureUnit = shader->resourceMapping.getTextureUnitFromBindingPoint(i); + auto hostTextureUnit = relative_textureUnit; + auto textureDim = shader->textureUnitDim[relative_textureUnit]; + + // Texture is accessed as a framebuffer fetch, therefore there is no need to flush it + if (shader->textureRenderTargetIndex[relative_textureUnit] != 255) + continue; + + auto texUnitRegIndex = hostTextureUnit * 7; + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + hostTextureUnit += LATTE_CEMU_VS_TEX_UNIT_BASE; + texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_VS; + break; + case LatteConst::ShaderType::Pixel: + hostTextureUnit += LATTE_CEMU_PS_TEX_UNIT_BASE; + texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_PS; + break; + case LatteConst::ShaderType::Geometry: + hostTextureUnit += LATTE_CEMU_GS_TEX_UNIT_BASE; + texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_GS; + break; + default: + UNREACHABLE; + } + + auto textureView = m_state.m_textures[hostTextureUnit]; + if (!textureView) + continue; + + LatteTexture* baseTexture = textureView->baseTexture; + + // If the texture is also used in the current render pass, we need to end the render pass to "flush" the texture + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + auto colorTarget = m_state.m_activeFBO.m_fbo->colorBuffer[i].texture; + if (colorTarget && colorTarget->baseTexture == baseTexture) + return true; + } + } + + return false; +} +*/ + +void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader, bool usesGeometryShader) +{ + auto 
mtlShaderType = GetMtlShaderType(shader->shaderType, usesGeometryShader); + + sint32 textureCount = shader->resourceMapping.getTextureCount(); + for (int i = 0; i < textureCount; ++i) + { + const auto relative_textureUnit = shader->resourceMapping.getTextureUnitFromBindingPoint(i); + auto hostTextureUnit = relative_textureUnit; + + // Don't bind textures that are accessed with a framebuffer fetch + if (m_supportsFramebufferFetch && shader->textureRenderTargetIndex[relative_textureUnit] != 255) + continue; + + auto textureDim = shader->textureUnitDim[relative_textureUnit]; + auto texUnitRegIndex = hostTextureUnit * 7; + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + hostTextureUnit += LATTE_CEMU_VS_TEX_UNIT_BASE; + texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_VS; + break; + case LatteConst::ShaderType::Pixel: + hostTextureUnit += LATTE_CEMU_PS_TEX_UNIT_BASE; + texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_PS; + break; + case LatteConst::ShaderType::Geometry: + hostTextureUnit += LATTE_CEMU_GS_TEX_UNIT_BASE; + texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_GS; + break; + default: + UNREACHABLE; + } + + // TODO: correct? 
+ uint32 binding = shader->resourceMapping.getTextureBaseBindingPoint() + i; + if (binding >= MAX_MTL_TEXTURES) + { + cemuLog_logOnce(LogType::Force, "invalid texture binding {}", binding); + continue; + } + + auto textureView = m_state.m_textures[hostTextureUnit]; + if (!textureView) + { + if (textureDim == Latte::E_DIM::DIM_1D) + SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture1D, binding); + else + SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture2D, binding); + SetSamplerState(renderCommandEncoder, mtlShaderType, m_nearestSampler, binding); + continue; + } + + if (textureDim == Latte::E_DIM::DIM_1D && (textureView->dim != Latte::E_DIM::DIM_1D)) + { + SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture1D, binding); + continue; + } + else if (textureDim == Latte::E_DIM::DIM_2D && (textureView->dim != Latte::E_DIM::DIM_2D && textureView->dim != Latte::E_DIM::DIM_2D_MSAA)) + { + SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture2D, binding); + continue; + } + + LatteTexture* baseTexture = textureView->baseTexture; + + uint32 stageSamplerIndex = shader->textureUnitSamplerAssignment[relative_textureUnit]; + MTL::SamplerState* sampler; + if (stageSamplerIndex != LATTE_DECOMPILER_SAMPLER_NONE) + { + uint32 samplerIndex = stageSamplerIndex + LatteDecompiler_getTextureSamplerBaseIndex(shader->shaderType); + _LatteRegisterSetSampler* samplerWords = LatteGPUState.contextNew.SQ_TEX_SAMPLER + samplerIndex; + + // Overwriting + + // Lod bias + //if (baseTexture->overwriteInfo.hasLodBias) + // samplerWords->WORD1.set_LOD_BIAS(baseTexture->overwriteInfo.lodBias); + //else if (baseTexture->overwriteInfo.hasRelativeLodBias) + // samplerWords->WORD1.set_LOD_BIAS(samplerWords->WORD1.get_LOD_BIAS() + baseTexture->overwriteInfo.relativeLodBias); + + // Max anisotropy + if (baseTexture->overwriteInfo.anisotropicLevel >= 0) + samplerWords->WORD0.set_MAX_ANISO_RATIO(baseTexture->overwriteInfo.anisotropicLevel); + + sampler = 
m_samplerCache->GetSamplerState(LatteGPUState.contextNew, shader->shaderType, stageSamplerIndex, samplerWords); + } + else + { + sampler = m_nearestSampler; + } + SetSamplerState(renderCommandEncoder, mtlShaderType, sampler, binding); + + // get texture register word 0 + uint32 word4 = LatteGPUState.contextRegister[texUnitRegIndex + 4]; + auto& boundTexture = m_state.m_encoderState.m_textures[mtlShaderType][binding]; + MTL::Texture* mtlTexture = textureView->GetSwizzledView(word4); + SetTexture(renderCommandEncoder, mtlShaderType, mtlTexture, binding); + } + + // Support buffer + auto GET_UNIFORM_DATA_PTR = [&](size_t index) { return supportBufferData + (index / 4); }; + + sint32 shaderAluConst; + sint32 shaderUniformRegisterOffset; + + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + shaderAluConst = 0x400; + shaderUniformRegisterOffset = mmSQ_VTX_UNIFORM_BLOCK_START; + break; + case LatteConst::ShaderType::Pixel: + shaderAluConst = 0; + shaderUniformRegisterOffset = mmSQ_PS_UNIFORM_BLOCK_START; + break; + case LatteConst::ShaderType::Geometry: + shaderAluConst = 0; // geometry shader has no ALU const + shaderUniformRegisterOffset = mmSQ_GS_UNIFORM_BLOCK_START; + break; + default: + UNREACHABLE; + } + + if (shader->resourceMapping.uniformVarsBufferBindingPoint >= 0) + { + if (shader->uniform.list_ufTexRescale.empty() == false) + { + for (auto& entry : shader->uniform.list_ufTexRescale) + { + float* xyScale = LatteTexture_getEffectiveTextureScale(shader->shaderType, entry.texUnit); + memcpy(entry.currentValue, xyScale, sizeof(float) * 2); + memcpy(GET_UNIFORM_DATA_PTR(entry.uniformLocation), xyScale, sizeof(float) * 2); + } + } + if (shader->uniform.loc_alphaTestRef >= 0) + { + *GET_UNIFORM_DATA_PTR(shader->uniform.loc_alphaTestRef) = LatteGPUState.contextNew.SX_ALPHA_REF.get_ALPHA_TEST_REF(); + } + if (shader->uniform.loc_pointSize >= 0) + { + const auto& pointSizeReg = LatteGPUState.contextNew.PA_SU_POINT_SIZE; + float pointWidth = 
(float)pointSizeReg.get_WIDTH() / 8.0f; + if (pointWidth == 0.0f) + pointWidth = 1.0f / 8.0f; // minimum size + *GET_UNIFORM_DATA_PTR(shader->uniform.loc_pointSize) = pointWidth; + } + if (shader->uniform.loc_remapped >= 0) + { + LatteBufferCache_LoadRemappedUniforms(shader, GET_UNIFORM_DATA_PTR(shader->uniform.loc_remapped)); + } + if (shader->uniform.loc_uniformRegister >= 0) + { + uint32* uniformRegData = (uint32*)(LatteGPUState.contextRegister + mmSQ_ALU_CONSTANT0_0 + shaderAluConst); + memcpy(GET_UNIFORM_DATA_PTR(shader->uniform.loc_uniformRegister), uniformRegData, shader->uniform.count_uniformRegister * 16); + } + if (shader->uniform.loc_windowSpaceToClipSpaceTransform >= 0) + { + sint32 viewportWidth; + sint32 viewportHeight; + LatteRenderTarget_GetCurrentVirtualViewportSize(&viewportWidth, &viewportHeight); // always call after _updateViewport() + float* v = GET_UNIFORM_DATA_PTR(shader->uniform.loc_windowSpaceToClipSpaceTransform); + v[0] = 2.0f / (float)viewportWidth; + v[1] = 2.0f / (float)viewportHeight; + } + if (shader->uniform.loc_fragCoordScale >= 0) + { + LatteMRT::GetCurrentFragCoordScale(GET_UNIFORM_DATA_PTR(shader->uniform.loc_fragCoordScale)); + } + if (shader->uniform.loc_verticesPerInstance >= 0) + { + *(int*)(supportBufferData + ((size_t)shader->uniform.loc_verticesPerInstance / 4)) = m_state.m_streamoutState.verticesPerInstance; + for (sint32 b = 0; b < LATTE_NUM_STREAMOUT_BUFFER; b++) + { + if (shader->uniform.loc_streamoutBufferBase[b] >= 0) + { + *(uint32*)GET_UNIFORM_DATA_PTR(shader->uniform.loc_streamoutBufferBase[b]) = m_state.m_streamoutState.buffers[b].ringBufferOffset; + } + } + } + + size_t size = shader->uniform.uniformRangeSize; + auto& bufferAllocator = m_memoryManager->GetStagingAllocator(); + auto allocation = bufferAllocator.AllocateBufferMemory(size, 1); + memcpy(allocation.memPtr, supportBufferData, size); + bufferAllocator.FlushReservation(allocation); + + SetBuffer(renderCommandEncoder, mtlShaderType, 
allocation.mtlBuffer, allocation.bufferOffset, shader->resourceMapping.uniformVarsBufferBindingPoint); + } + + // Uniform buffers + for (sint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++) + { + if (shader->resourceMapping.uniformBuffersBindingPoint[i] >= 0) + { + uint32 binding = shader->resourceMapping.uniformBuffersBindingPoint[i]; + if (binding >= MAX_MTL_BUFFERS) + { + cemuLog_logOnce(LogType::Force, "invalid buffer binding {}", binding); + continue; + } + + size_t offset = m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shader->shaderType)][i]; + if (offset == INVALID_OFFSET) + continue; + + SetBuffer(renderCommandEncoder, mtlShaderType, m_memoryManager->GetBufferCache(), offset, binding); + } + } + + // Storage buffer + if (shader->resourceMapping.tfStorageBindingPoint >= 0) + { + SetBuffer(renderCommandEncoder, mtlShaderType, m_xfbRingBuffer, 0, shader->resourceMapping.tfStorageBindingPoint); + } +} + +void MetalRenderer::ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) +{ + NS_STACK_SCOPED MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0); + colorAttachment->setTexture(mtlTexture); + colorAttachment->setClearColor(MTL::ClearColor(r, g, b, a)); + colorAttachment->setLoadAction(MTL::LoadActionClear); + colorAttachment->setStoreAction(MTL::StoreActionStore); + colorAttachment->setSlice(sliceIndex); + colorAttachment->setLevel(mipIndex); + + GetTemporaryRenderCommandEncoder(renderPassDescriptor); + EndEncoding(); + + // Debug + m_performanceMonitor.m_clears++; +} + +void MetalRenderer::CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL::Buffer* dst, uint32 dstOffset, uint32 size, MTL::RenderStages after, MTL::RenderStages before) +{ + // TODO: uncomment and fix performance issues + // Do the copy in a vertex shader on Apple GPUs + /* + if 
(m_isAppleGPU && m_encoderType == MetalEncoderType::Render) + { + auto renderCommandEncoder = static_cast(m_commandEncoder); + + MTL::Resource* barrierBuffers[] = {src}; + renderCommandEncoder->memoryBarrier(barrierBuffers, 1, after, after | MTL::RenderStageVertex); + + renderCommandEncoder->setRenderPipelineState(m_copyBufferToBufferPipeline->GetRenderPipelineState()); + m_state.m_encoderState.m_renderPipelineState = m_copyBufferToBufferPipeline->GetRenderPipelineState(); + + SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, src, srcOffset, GET_HELPER_BUFFER_BINDING(0)); + SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, dst, dstOffset, GET_HELPER_BUFFER_BINDING(1)); + + renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypePoint, NS::UInteger(0), NS::UInteger(size)); + + barrierBuffers[0] = dst; + renderCommandEncoder->memoryBarrier(barrierBuffers, 1, before | MTL::RenderStageVertex, before); + } + else + { + */ + auto blitCommandEncoder = GetBlitCommandEncoder(); + + blitCommandEncoder->copyFromBuffer(src, srcOffset, dst, dstOffset, size); + //} +} + +void MetalRenderer::SwapBuffer(bool mainWindow) +{ + if (!AcquireDrawable(mainWindow)) + return; + + auto commandBuffer = GetCommandBuffer(); + GetLayer(mainWindow).PresentDrawable(commandBuffer); +} + +void MetalRenderer::EnsureImGuiBackend() +{ + if (!ImGui::GetIO().BackendRendererUserData) + { + ImGui_ImplMetal_Init(m_device); + //ImGui_ImplMetal_CreateFontsTexture(m_device); + } +} + +void MetalRenderer::StartCapture() +{ + auto captureManager = MTL::CaptureManager::sharedCaptureManager(); + auto desc = MTL::CaptureDescriptor::alloc()->init(); + desc->setCaptureObject(m_device); + + // Check if a debugger with support for GPU capture is attached + if (captureManager->supportsDestination(MTL::CaptureDestinationDeveloperTools)) + { + desc->setDestination(MTL::CaptureDestinationDeveloperTools); + } + else + { + if (GetConfig().gpu_capture_dir.GetValue().empty()) + { + cemuLog_log(LogType::Force, 
"No GPU capture directory specified, cannot do a GPU capture"); + return; + } + + // Check if the GPU trace document destination is available + if (!captureManager->supportsDestination(MTL::CaptureDestinationGPUTraceDocument)) + { + cemuLog_log(LogType::Force, "GPU trace document destination is not available, cannot do a GPU capture"); + return; + } + + // Get current date and time as a string + auto now = std::chrono::system_clock::now(); + std::time_t now_time = std::chrono::system_clock::to_time_t(now); + std::ostringstream oss; + oss << std::put_time(std::localtime(&now_time), "%Y-%m-%d_%H-%M-%S"); + std::string now_str = oss.str(); + + std::string capturePath = fmt::format("{}/cemu_{}.gputrace", GetConfig().gpu_capture_dir.GetValue(), now_str); + desc->setDestination(MTL::CaptureDestinationGPUTraceDocument); + desc->setOutputURL(ToNSURL(capturePath)); + } + + NS::Error* error = nullptr; + captureManager->startCapture(desc, &error); + if (error) + { + cemuLog_log(LogType::Force, "Failed to start GPU capture: {}", error->localizedDescription()->utf8String()); + } + + m_capturing = true; +} + +void MetalRenderer::EndCapture() +{ + auto captureManager = MTL::CaptureManager::sharedCaptureManager(); + captureManager->stopCapture(); + + m_capturing = false; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h new file mode 100644 index 0000000000..428284f20d --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -0,0 +1,570 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Renderer.h" + +#include "Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" + +enum MetalGeneralShaderType +{ + METAL_GENERAL_SHADER_TYPE_VERTEX, + METAL_GENERAL_SHADER_TYPE_GEOMETRY, + METAL_GENERAL_SHADER_TYPE_FRAGMENT, + + 
METAL_GENERAL_SHADER_TYPE_TOTAL +}; + +inline MetalGeneralShaderType GetMtlGeneralShaderType(LatteConst::ShaderType shaderType) +{ + switch (shaderType) + { + case LatteConst::ShaderType::Vertex: + return METAL_GENERAL_SHADER_TYPE_VERTEX; + case LatteConst::ShaderType::Geometry: + return METAL_GENERAL_SHADER_TYPE_GEOMETRY; + case LatteConst::ShaderType::Pixel: + return METAL_GENERAL_SHADER_TYPE_FRAGMENT; + default: + return METAL_GENERAL_SHADER_TYPE_TOTAL; + } +} + +enum MetalShaderType +{ + METAL_SHADER_TYPE_VERTEX, + METAL_SHADER_TYPE_OBJECT, + METAL_SHADER_TYPE_MESH, + METAL_SHADER_TYPE_FRAGMENT, + + METAL_SHADER_TYPE_TOTAL +}; + +inline MetalShaderType GetMtlShaderType(LatteConst::ShaderType shaderType, bool usesGeometryShader) +{ + switch (shaderType) + { + case LatteConst::ShaderType::Vertex: + if (usesGeometryShader) + return METAL_SHADER_TYPE_OBJECT; + else + return METAL_SHADER_TYPE_VERTEX; + case LatteConst::ShaderType::Geometry: + return METAL_SHADER_TYPE_MESH; + case LatteConst::ShaderType::Pixel: + return METAL_SHADER_TYPE_FRAGMENT; + default: + return METAL_SHADER_TYPE_TOTAL; + } +} + +struct MetalEncoderState +{ + MTL::RenderPipelineState* m_renderPipelineState = nullptr; + MTL::DepthStencilState* m_depthStencilState = nullptr; + MTL::CullMode m_cullMode = MTL::CullModeNone; + MTL::Winding m_frontFaceWinding = MTL::WindingClockwise; + MTL::Viewport m_viewport; + MTL::ScissorRect m_scissor; + uint32 m_stencilRefFront = 0; + uint32 m_stencilRefBack = 0; + uint32 m_blendColor[4] = {0}; + uint32 m_depthBias = 0; + uint32 m_depthSlope = 0; + uint32 m_depthClamp = 0; + bool m_depthClipEnable = true; + struct { + MTL::Buffer* m_buffer; + size_t m_offset; + } m_buffers[METAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS]; + MTL::Texture* m_textures[METAL_SHADER_TYPE_TOTAL][MAX_MTL_TEXTURES]; + MTL::SamplerState* m_samplers[METAL_SHADER_TYPE_TOTAL][MAX_MTL_SAMPLERS]; +}; + +struct MetalStreamoutState +{ + struct + { + bool enabled; + uint32 ringBufferOffset; + } 
buffers[LATTE_NUM_STREAMOUT_BUFFER]; + sint32 verticesPerInstance; +}; + +struct MetalActiveFBOState +{ + class CachedFBOMtl* m_fbo = nullptr; + MetalAttachmentsInfo m_attachmentsInfo; +}; + +struct MetalState +{ + MetalEncoderState m_encoderState{}; + + bool m_usesSRGB = false; + + bool m_skipDrawSequence = false; + bool m_isFirstDrawInRenderPass = true; + + MetalActiveFBOState m_activeFBO; + // If the FBO changes, but it's the same FBO as the last one with some omitted attachments, this FBO doesn't change + MetalActiveFBOState m_lastUsedFBO; + bool m_fboChanged = false; + + size_t m_vertexBufferOffsets[MAX_MTL_VERTEX_BUFFERS]; + class LatteTextureViewMtl* m_textures[LATTE_NUM_MAX_TEX_UNITS * 3] = {nullptr}; + size_t m_uniformBufferOffsets[METAL_GENERAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS]; + + MTL::Viewport m_viewport; + MTL::ScissorRect m_scissor; + + MetalStreamoutState m_streamoutState; +}; + +struct MetalCommandBuffer +{ + MTL::CommandBuffer* m_commandBuffer = nullptr; + bool m_commited = false; +}; + +enum class MetalEncoderType +{ + None, + Render, + Compute, + Blit, +}; + +class MetalRenderer : public Renderer +{ +public: + static constexpr uint32 OCCLUSION_QUERY_POOL_SIZE = 1024; + static constexpr uint32 TEXTURE_READBACK_SIZE = 32 * 1024 * 1024; // 32 MB + + struct DeviceInfo + { + std::string name; + uint64 uuid; + }; + + static std::vector GetDevices(); + + MetalRenderer(); + ~MetalRenderer() override; + + RendererAPI GetType() override + { + return RendererAPI::Metal; + } + + static MetalRenderer* GetInstance() { + return static_cast(g_renderer.get()); + } + + // Helper functions + MTL::Device* GetDevice() const { + return m_device; + } + + void InitializeLayer(const Vector2i& size, bool mainWindow); + void ShutdownLayer(bool mainWindow); + void ResizeLayer(const Vector2i& size, bool mainWindow); + + void Initialize() override; + void Shutdown() override; + bool IsPadWindowActive() override; + + bool GetVRAMInfo(int& usageInMB, int& totalInMB) const 
override; + + void ClearColorbuffer(bool padView) override; + void DrawEmptyFrame(bool mainWindow) override; + void SwapBuffers(bool swapTV, bool swapDRC) override; + + void HandleScreenshotRequest(LatteTextureView* texView, bool padView) override; + + void DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, + sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, + bool padView, bool clearBackground) override; + bool BeginFrame(bool mainWindow) override; + + // flush control + void Flush(bool waitIdle = false) override; // called when explicit flush is required (e.g. by imgui) + void NotifyLatteCommandProcessorIdle() override; // called when command processor has no more commands available or when stalled + + // imgui + bool ImguiBegin(bool mainWindow) override; + void ImguiEnd() override; + ImTextureID GenerateTexture(const std::vector& data, const Vector2i& size) override; + void DeleteTexture(ImTextureID id) override; + void DeleteFontTextures() override; + + bool UseTFViaSSBO() const override { return true; } + void AppendOverlayDebugInfo() override; + + // rendertarget + void renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ = false) override; + void renderTarget_setScissor(sint32 scissorX, sint32 scissorY, sint32 scissorWidth, sint32 scissorHeight) override; + + LatteCachedFBO* rendertarget_createCachedFBO(uint64 key) override; + void rendertarget_deleteCachedFBO(LatteCachedFBO* fbo) override; + void rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo) override; + + // texture functions + void* texture_acquireTextureUploadBuffer(uint32 size) override; + void texture_releaseTextureUploadBuffer(uint8* mem) override; + + TextureDecoder* texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) override; + + void texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 
mipIndex) override; + void texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize) override; + void texture_clearColorSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) override; + void texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sliceIndex, sint32 mipIndex, bool clearDepth, bool clearStencil, float depthValue, uint32 stencilValue) override; + + LatteTexture* texture_createTextureEx(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) override; + + void texture_setLatteTexture(LatteTextureView* textureView, uint32 textureUnit) override; + void texture_copyImageSubData(LatteTexture* src, sint32 srcMip, sint32 effectiveSrcX, sint32 effectiveSrcY, sint32 srcSlice, LatteTexture* dst, sint32 dstMip, sint32 effectiveDstX, sint32 effectiveDstY, sint32 dstSlice, sint32 effectiveCopyWidth, sint32 effectiveCopyHeight, sint32 srcDepth) override; + + LatteTextureReadbackInfo* texture_createReadback(LatteTextureView* textureView) override; + + // surface copy + void surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* sourceTexture, sint32 srcMip, sint32 srcSlice, LatteTexture* destinationTexture, sint32 dstMip, sint32 dstSlice, sint32 width, sint32 height) override; + + // buffer cache + void bufferCache_init(const sint32 bufferSize) override; + void bufferCache_upload(uint8* buffer, sint32 size, uint32 bufferOffset) override; + void bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 size) override; + void bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size) override; + + void buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) override; + void 
buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) override; + + // shader + RendererShader* shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool compileAsync, bool isGfxPackSource) override; + + // streamout + void streamout_setupXfbBuffer(uint32 bufferIndex, sint32 ringBufferOffset, uint32 rangeAddr, uint32 rangeSize) override; + void streamout_begin() override; + void streamout_rendererFinishDrawcall() override; + + // core drawing logic + void draw_beginSequence() override; + void draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) override; + void draw_endSequence() override; + + void draw_updateVertexBuffersDirectAccess(); + void draw_updateUniformBuffersDirectAccess(LatteDecompilerShader* shader, const uint32 uniformBufferRegOffset); + + void draw_handleSpecialState5(); + + // index + IndexAllocation indexData_reserveIndexMemory(uint32 size) override; + void indexData_releaseIndexMemory(IndexAllocation& allocation) override; + void indexData_uploadIndexMemory(IndexAllocation& allocation) override; + + // occlusion queries + LatteQueryObject* occlusionQuery_create() override; + void occlusionQuery_destroy(LatteQueryObject* queryObj) override; + void occlusionQuery_flush() override; + void occlusionQuery_updateState() override; + + // Helpers + MetalPerformanceMonitor& GetPerformanceMonitor() { return m_performanceMonitor; } + + void SetShouldMaximizeConcurrentCompilation(bool shouldMaximizeConcurrentCompilation) + { + if (m_supportsMetal3) + m_device->setShouldMaximizeConcurrentCompilation(shouldMaximizeConcurrentCompilation); + } + + bool IsCommandBufferActive() const + { + return (m_currentCommandBuffer.m_commandBuffer && !m_currentCommandBuffer.m_commited); + } + + MTL::CommandBuffer* GetCurrentCommandBuffer() const 
+ { + cemu_assert_debug(m_currentCommandBuffer.m_commandBuffer); + + return m_currentCommandBuffer.m_commandBuffer; + } + + MTL::CommandBuffer* GetAndRetainCurrentCommandBufferIfNotCompleted() const + { + // The command buffer has been commited and has finished execution + if (m_currentCommandBuffer.m_commited && m_executingCommandBuffers.size() == 0) + return nullptr; + + return GetCurrentCommandBuffer()->retain(); + } + + void RequestSoonCommit() + { + m_commitTreshold = m_recordedDrawcalls + 8; + } + + MTL::CommandEncoder* GetCommandEncoder() + { + return m_commandEncoder; + } + + MetalEncoderType GetEncoderType() + { + return m_encoderType; + } + + void ResetEncoderState() + { + m_state.m_encoderState = {}; + + // TODO: set viewport and scissor to render target dimensions if render commands + + for (uint32 i = 0; i < METAL_SHADER_TYPE_TOTAL; i++) + { + for (uint32 j = 0; j < MAX_MTL_BUFFERS; j++) + m_state.m_encoderState.m_buffers[i][j] = {nullptr}; + for (uint32 j = 0; j < MAX_MTL_TEXTURES; j++) + m_state.m_encoderState.m_textures[i][j] = nullptr; + for (uint32 j = 0; j < MAX_MTL_SAMPLERS; j++) + m_state.m_encoderState.m_samplers[i][j] = nullptr; + } + } + + MetalEncoderState& GetEncoderState() + { + return m_state.m_encoderState; + } + + void SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Buffer* buffer, size_t offset, uint32 index); + void SetTexture(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Texture* texture, uint32 index); + void SetSamplerState(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::SamplerState* samplerState, uint32 index); + + MTL::CommandBuffer* GetCommandBuffer(); + MTL::RenderCommandEncoder* GetTemporaryRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor); + MTL::RenderCommandEncoder* GetRenderCommandEncoder(bool forceRecreate = false); + MTL::ComputeCommandEncoder* GetComputeCommandEncoder(); + 
MTL::BlitCommandEncoder* GetBlitCommandEncoder(); + void EndEncoding(); + void CommitCommandBuffer(); + void ProcessFinishedCommandBuffers(); + + bool AcquireDrawable(bool mainWindow); + + //bool CheckIfRenderPassNeedsFlush(LatteDecompilerShader* shader); + void BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader, bool usesGeometryShader); + + void ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a); + + void CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL::Buffer* dst, uint32 dstOffset, uint32 size, MTL::RenderStages after, MTL::RenderStages before); + + // Getters + bool GetPositionInvariance() const + { + return m_positionInvariance; + } + + bool IsAppleGPU() const + { + return m_isAppleGPU; + } + + bool SupportsFramebufferFetch() const + { + return m_supportsFramebufferFetch; + } + + bool HasUnifiedMemory() const + { + return m_hasUnifiedMemory; + } + + bool SupportsMetal3() const + { + return m_supportsMetal3; + } + + bool SupportsMeshShaders() const + { + return m_supportsMeshShaders; + } + + //MTL::StorageMode GetOptimalTextureStorageMode() const + //{ + // return (m_isAppleGPU ? MTL::StorageModeShared : MTL::StorageModePrivate); + //} + + MTL::ResourceOptions GetOptimalBufferStorageMode() const + { + return (m_hasUnifiedMemory ? 
MTL::ResourceStorageModeShared : MTL::ResourceStorageModeManaged); + } + + MTL::Texture* GetNullTexture2D() const + { + return m_nullTexture2D; + } + + MTL::Buffer* GetTextureReadbackBuffer() + { + if (!m_readbackBuffer) + { + m_readbackBuffer = m_device->newBuffer(TEXTURE_READBACK_SIZE, MTL::ResourceStorageModeShared); +#ifdef CEMU_DEBUG_ASSERT + m_readbackBuffer->setLabel(GetLabel("Texture readback buffer", m_readbackBuffer)); +#endif + } + + return m_readbackBuffer; + } + + MTL::Buffer* GetXfbRingBuffer() + { + if (!m_xfbRingBuffer) + { + // HACK: using just LatteStreamout_GetRingBufferSize will cause page faults + m_xfbRingBuffer = m_device->newBuffer(LatteStreamout_GetRingBufferSize() * 4, MTL::ResourceStorageModePrivate); +#ifdef CEMU_DEBUG_ASSERT + m_xfbRingBuffer->setLabel(GetLabel("Transform feedback buffer", m_xfbRingBuffer)); +#endif + } + + return m_xfbRingBuffer; + } + + MTL::Buffer* GetOcclusionQueryResultBuffer() const + { + return m_occlusionQuery.m_resultBuffer; + } + + uint64* GetOcclusionQueryResultsPtr() + { + return m_occlusionQuery.m_resultsPtr; + } + + uint32 GetOcclusionQueryIndex() + { + return m_occlusionQuery.m_currentIndex; + } + + void BeginOcclusionQuery() + { + m_occlusionQuery.m_active = true; + } + + void EndOcclusionQuery() + { + m_occlusionQuery.m_active = false; + + // Release the old command buffer + if (m_occlusionQuery.m_lastCommandBuffer) + m_occlusionQuery.m_lastCommandBuffer->release(); + + // Get and retain the current command buffer + m_occlusionQuery.m_lastCommandBuffer = GetAndRetainCurrentCommandBufferIfNotCompleted(); + } + + // GPU capture + void CaptureFrame() + { + m_captureFrame = true; + } + +private: + MetalLayerHandle m_mainLayer; + MetalLayerHandle m_padLayer; + + MetalPerformanceMonitor m_performanceMonitor; + + // Options + bool m_positionInvariance; + + // Metal objects + MTL::Device* m_device = nullptr; + MTL::CommandQueue* m_commandQueue; + + // Feature support + bool m_isAppleGPU; + bool 
m_supportsFramebufferFetch; + bool m_hasUnifiedMemory; + bool m_supportsMetal3; + bool m_supportsMeshShaders; + uint32 m_recommendedMaxVRAMUsage; + MetalPixelFormatSupport m_pixelFormatSupport; + + // Managers and caches + class MetalMemoryManager* m_memoryManager; + class MetalOutputShaderCache* m_outputShaderCache; + class MetalPipelineCache* m_pipelineCache; + class MetalDepthStencilCache* m_depthStencilCache; + class MetalSamplerCache* m_samplerCache; + + // Pipelines + MTL::RenderPipelineDescriptor* m_copyDepthToColorDesc; + std::map m_copyDepthToColorPipelines; + + // Void vertex pipelines + class MetalVoidVertexPipeline* m_copyBufferToBufferPipeline; + + // Synchronization resources + MTL::Event* m_event; + int32_t m_eventValue = -1; + + // Resources + MTL::SamplerState* m_nearestSampler; + MTL::SamplerState* m_linearSampler; + + // Null resources + MTL::Texture* m_nullTexture1D; + MTL::Texture* m_nullTexture2D; + + // Texture readback + MTL::Buffer* m_readbackBuffer = nullptr; + uint32 m_readbackBufferWriteOffset = 0; + + // Transform feedback + MTL::Buffer* m_xfbRingBuffer = nullptr; + + // Occlusion queries + struct + { + MTL::Buffer* m_resultBuffer; + uint64* m_resultsPtr; + uint32 m_currentIndex = 0; + bool m_active = false; + MTL::CommandBuffer* m_lastCommandBuffer = nullptr; + } m_occlusionQuery; + + // Autorelease pool + NS::AutoreleasePool* m_autoreleasePool; + + // Active objects + MetalCommandBuffer m_currentCommandBuffer{}; + std::vector m_executingCommandBuffers; + MetalEncoderType m_encoderType = MetalEncoderType::None; + MTL::CommandEncoder* m_commandEncoder = nullptr; + + uint32 m_recordedDrawcalls; + uint32 m_defaultCommitTreshlod; + uint32 m_commitTreshold; + + // State + MetalState m_state; + + // GPU capture + bool m_captureFrame = false; + bool m_capturing = false; + + // Helpers + MetalLayerHandle& GetLayer(bool mainWindow) + { + return (mainWindow ? 
m_mainLayer : m_padLayer); + } + + void SwapBuffer(bool mainWindow); + + void EnsureImGuiBackend(); + + // GPU capture + void StartCapture(); + void EndCapture(); +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp new file mode 100644 index 0000000000..3a1371a51d --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp @@ -0,0 +1,190 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Core/LatteShader.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" + +MTL::SamplerBorderColor GetBorderColor(LatteConst::ShaderType shaderType, uint32 stageSamplerIndex, const _LatteRegisterSetSampler* samplerWords, bool logWorkaround = false) +{ + auto borderType = samplerWords->WORD0.get_BORDER_COLOR_TYPE(); + + MTL::SamplerBorderColor borderColor; + if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::TRANSPARENT_BLACK) + borderColor = MTL::SamplerBorderColorTransparentBlack; + else if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::OPAQUE_BLACK) + borderColor = MTL::SamplerBorderColorOpaqueBlack; + else if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::OPAQUE_WHITE) + borderColor = MTL::SamplerBorderColorOpaqueWhite; + else [[unlikely]] + { + _LatteRegisterSetSamplerBorderColor* borderColorReg; + if (shaderType == LatteConst::ShaderType::Vertex) + borderColorReg = LatteGPUState.contextNew.TD_VS_SAMPLER_BORDER_COLOR + stageSamplerIndex; + else if (shaderType == LatteConst::ShaderType::Pixel) + borderColorReg = LatteGPUState.contextNew.TD_PS_SAMPLER_BORDER_COLOR + stageSamplerIndex; + else // geometry + borderColorReg = LatteGPUState.contextNew.TD_GS_SAMPLER_BORDER_COLOR + stageSamplerIndex; + float r = borderColorReg->red.get_channelValue(); + float g = borderColorReg->green.get_channelValue(); + float b = 
borderColorReg->blue.get_channelValue(); + float a = borderColorReg->alpha.get_channelValue(); + + // Metal doesn't support custom border color + // Let's find the best match + bool opaque = (a == 1.0f); + bool white = (r == 1.0f); + if (opaque) + { + if (white) + borderColor = MTL::SamplerBorderColorOpaqueWhite; + else + borderColor = MTL::SamplerBorderColorOpaqueBlack; + } + else + { + borderColor = MTL::SamplerBorderColorTransparentBlack; + } + + if (logWorkaround) + { + float newR, newG, newB, newA; + switch (borderColor) + { + case MTL::SamplerBorderColorTransparentBlack: + newR = 0.0f; + newG = 0.0f; + newB = 0.0f; + newA = 0.0f; + break; + case MTL::SamplerBorderColorOpaqueBlack: + newR = 0.0f; + newG = 0.0f; + newB = 0.0f; + newA = 1.0f; + break; + case MTL::SamplerBorderColorOpaqueWhite: + newR = 1.0f; + newG = 1.0f; + newB = 1.0f; + newA = 1.0f; + break; + } + + if (r != newR || g != newG || b != newB || a != newA) + cemuLog_log(LogType::Force, "Custom border color ({}, {}, {}, {}) is not supported on Metal, using ({}, {}, {}, {}) instead", r, g, b, a, newR, newG, newB, newA); + } + } + + return borderColor; +} + +MetalSamplerCache::~MetalSamplerCache() +{ + for (auto& pair : m_samplerCache) + { + pair.second->release(); + } + m_samplerCache.clear(); +} + +MTL::SamplerState* MetalSamplerCache::GetSamplerState(const LatteContextRegister& lcr, LatteConst::ShaderType shaderType, uint32 stageSamplerIndex, const _LatteRegisterSetSampler* samplerWords) +{ + uint64 stateHash = CalculateSamplerHash(lcr, shaderType, stageSamplerIndex, samplerWords); + auto& samplerState = m_samplerCache[stateHash]; + if (samplerState) + return samplerState; + + // Sampler state + + + NS_STACK_SCOPED MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); + + // lod + uint32 iMinLOD = samplerWords->WORD1.get_MIN_LOD(); + uint32 iMaxLOD = samplerWords->WORD1.get_MAX_LOD(); + //sint32 iLodBias = samplerWords->WORD1.get_LOD_BIAS(); + + auto filterMip = 
samplerWords->WORD0.get_MIP_FILTER(); + if (filterMip == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_Z_FILTER::NONE) + { + samplerDescriptor->setMipFilter(MTL::SamplerMipFilterNearest); + samplerDescriptor->setLodMinClamp(0.0f); + samplerDescriptor->setLodMaxClamp(0.25f); + } + else if (filterMip == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_Z_FILTER::POINT) + { + samplerDescriptor->setMipFilter(MTL::SamplerMipFilterNearest); + samplerDescriptor->setLodMinClamp((float)iMinLOD / 64.0f); + samplerDescriptor->setLodMaxClamp((float)iMaxLOD / 64.0f); + } + else if (filterMip == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_Z_FILTER::LINEAR) + { + samplerDescriptor->setMipFilter(MTL::SamplerMipFilterLinear); + samplerDescriptor->setLodMinClamp((float)iMinLOD / 64.0f); + samplerDescriptor->setLodMaxClamp((float)iMaxLOD / 64.0f); + } + else + { + // fallback for invalid constants + samplerDescriptor->setMipFilter(MTL::SamplerMipFilterLinear); + samplerDescriptor->setLodMinClamp((float)iMinLOD / 64.0f); + samplerDescriptor->setLodMaxClamp((float)iMaxLOD / 64.0f); + } + + auto filterMin = samplerWords->WORD0.get_XY_MIN_FILTER(); + cemu_assert_debug(filterMin != Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::BICUBIC); // todo + samplerDescriptor->setMinFilter((filterMin == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::POINT || filterMin == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::ANISO_POINT) ? MTL::SamplerMinMagFilterNearest : MTL::SamplerMinMagFilterLinear); + + auto filterMag = samplerWords->WORD0.get_XY_MAG_FILTER(); + samplerDescriptor->setMagFilter((filterMag == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::POINT || filterMin == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::ANISO_POINT) ? MTL::SamplerMinMagFilterNearest : MTL::SamplerMinMagFilterLinear); + + auto filterZ = samplerWords->WORD0.get_Z_FILTER(); + // todo: z-filter for texture array samplers is customizable for GPU7 but OpenGL/Vulkan doesn't expose this functionality? 
+ + auto clampX = samplerWords->WORD0.get_CLAMP_X(); + auto clampY = samplerWords->WORD0.get_CLAMP_Y(); + auto clampZ = samplerWords->WORD0.get_CLAMP_Z(); + + samplerDescriptor->setSAddressMode(GetMtlSamplerAddressMode(clampX)); + samplerDescriptor->setTAddressMode(GetMtlSamplerAddressMode(clampY)); + samplerDescriptor->setRAddressMode(GetMtlSamplerAddressMode(clampZ)); + + auto maxAniso = samplerWords->WORD0.get_MAX_ANISO_RATIO(); + + if (maxAniso > 0) + samplerDescriptor->setMaxAnisotropy(1 << maxAniso); + + // TODO: set lod bias + //samplerInfo.mipLodBias = (float)iLodBias / 64.0f; + + // depth compare + //uint8 depthCompareMode = shader->textureUsesDepthCompare[relative_textureUnit] ? 1 : 0; + // TODO: is it okay to just cast? + samplerDescriptor->setCompareFunction(GetMtlCompareFunc((Latte::E_COMPAREFUNC)samplerWords->WORD0.get_DEPTH_COMPARE_FUNCTION())); + + // Border color + auto borderColor = GetBorderColor(shaderType, stageSamplerIndex, samplerWords, true); + samplerDescriptor->setBorderColor(borderColor); + + samplerState = m_mtlr->GetDevice()->newSamplerState(samplerDescriptor); + + return samplerState; +} + +uint64 MetalSamplerCache::CalculateSamplerHash(const LatteContextRegister& lcr, LatteConst::ShaderType shaderType, uint32 stageSamplerIndex, const _LatteRegisterSetSampler* samplerWords) +{ + uint64 hash = 0; + hash = std::rotl(hash, 17); + hash += (uint64)samplerWords->WORD0.getRawValue(); + hash = std::rotl(hash, 17); + hash += (uint64)samplerWords->WORD1.getRawValue(); + hash = std::rotl(hash, 17); + hash += (uint64)samplerWords->WORD2.getRawValue(); + + auto borderColor = GetBorderColor(shaderType, stageSamplerIndex, samplerWords); + + hash = std::rotl(hash, 5); + hash += (uint64)borderColor; + + // TODO: check this + return hash; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h new file mode 100644 index 0000000000..cbb02cf3b1 --- /dev/null +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h @@ -0,0 +1,22 @@ +#pragma once + +#include + +#include "HW/Latte/Core/LatteConst.h" +#include "HW/Latte/ISA/LatteReg.h" + +class MetalSamplerCache +{ +public: + MetalSamplerCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} + ~MetalSamplerCache(); + + MTL::SamplerState* GetSamplerState(const LatteContextRegister& lcr, LatteConst::ShaderType shaderType, uint32 stageSamplerIndex, const _LatteRegisterSetSampler* samplerWords); + +private: + class MetalRenderer* m_mtlr; + + std::map m_samplerCache; + + uint64 CalculateSamplerHash(const LatteContextRegister& lcr, LatteConst::ShaderType shaderType, uint32 stageSamplerIndex, const _LatteRegisterSetSampler* samplerWords); +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp new file mode 100644 index 0000000000..7e810e67b7 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp @@ -0,0 +1,23 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h" + +MetalVoidVertexPipeline::MetalVoidVertexPipeline(class MetalRenderer* mtlRenderer, MTL::Library* library, const std::string& vertexFunctionName) +{ + // Render pipeline state + NS_STACK_SCOPED MTL::Function* vertexFunction = library->newFunction(ToNSString(vertexFunctionName)); + + NS_STACK_SCOPED MTL::RenderPipelineDescriptor* renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); + renderPipelineDescriptor->setVertexFunction(vertexFunction); + renderPipelineDescriptor->setRasterizationEnabled(false); + + NS::Error* error = nullptr; + m_renderPipelineState = mtlRenderer->GetDevice()->newRenderPipelineState(renderPipelineDescriptor, &error); + if (error) + { + cemuLog_log(LogType::Force, "error creating hybrid render pipeline state: {}", error->localizedDescription()->utf8String()); + } +} + +MetalVoidVertexPipeline::~MetalVoidVertexPipeline() +{ + 
m_renderPipelineState->release(); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h new file mode 100644 index 0000000000..57666a57a7 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h @@ -0,0 +1,16 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Metal/MTLLibrary.hpp" +#include "Metal/MTLRenderPipeline.hpp" + +class MetalVoidVertexPipeline +{ +public: + MetalVoidVertexPipeline(class MetalRenderer* mtlRenderer, MTL::Library* library, const std::string& vertexFunctionName); + ~MetalVoidVertexPipeline(); + + MTL::RenderPipelineState* GetRenderPipelineState() const { return m_renderPipelineState; } + +private: + MTL::RenderPipelineState* m_renderPipelineState; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp new file mode 100644 index 0000000000..f00af85aca --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -0,0 +1,407 @@ +#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" + +//#include "Cemu/FileCache/FileCache.h" +//#include "config/ActiveSettings.h" +#include "Cemu/Logging/CemuLogging.h" +#include "Common/precompiled.h" +#include "GameProfile/GameProfile.h" +#include "util/helpers/helpers.h" + +#define METAL_AIR_CACHE_NAME "Cemu_AIR_cache" +#define METAL_AIR_CACHE_PATH "/Volumes/" METAL_AIR_CACHE_NAME +#define METAL_AIR_CACHE_SIZE (16 * 1024 * 1024) +#define METAL_AIR_CACHE_BLOCK_COUNT (METAL_AIR_CACHE_SIZE / 512) + +static bool s_isLoadingShadersMtl{false}; +//static bool s_hasRAMFilesystem{false}; +//class FileCache* s_airCache{nullptr}; + +extern std::atomic_int g_compiled_shaders_total; +extern std::atomic_int g_compiled_shaders_async; + +class 
ShaderMtlThreadPool +{ +public: + void StartThreads() + { + if (m_threadsActive.exchange(true)) + return; + + // Create thread pool + const uint32 threadCount = 2; + for (uint32 i = 0; i < threadCount; ++i) + s_threads.emplace_back(&ShaderMtlThreadPool::CompilerThreadFunc, this); + + // Create AIR cache thread + /* + s_airCacheThread = new std::thread(&ShaderMtlThreadPool::AIRCacheThreadFunc, this); + + // Set priority + sched_param schedParam; + schedParam.sched_priority = 20; + if (pthread_setschedparam(s_airCacheThread->native_handle(), SCHED_FIFO, &schedParam) != 0) { + cemuLog_log(LogType::Force, "failed to set FIFO thread priority"); + } + + if (pthread_setschedparam(s_airCacheThread->native_handle(), SCHED_RR, &schedParam) != 0) { + cemuLog_log(LogType::Force, "failed to set RR thread priority"); + } + */ + } + + void StopThreads() + { + if (!m_threadsActive.exchange(false)) + return; + for (uint32 i = 0; i < s_threads.size(); ++i) + s_compilationQueueCount.increment(); + for (auto& it : s_threads) + it.join(); + s_threads.clear(); + + /* + if (s_airCacheThread) + { + s_airCacheQueueCount.increment(); + s_airCacheThread->join(); + delete s_airCacheThread; + } + */ + } + + ~ShaderMtlThreadPool() + { + StopThreads(); + } + + void CompilerThreadFunc() + { + SetThreadName("mtlShaderComp"); + while (m_threadsActive.load(std::memory_order::relaxed)) + { + s_compilationQueueCount.decrementWithWait(); + s_compilationQueueMutex.lock(); + if (s_compilationQueue.empty()) + { + // queue empty again, shaders compiled synchronously via PreponeCompilation() + s_compilationQueueMutex.unlock(); + continue; + } + RendererShaderMtl* job = s_compilationQueue.front(); + s_compilationQueue.pop_front(); + // set compilation state + cemu_assert_debug(job->m_compilationState.getValue() == RendererShaderMtl::COMPILATION_STATE::QUEUED); + job->m_compilationState.setValue(RendererShaderMtl::COMPILATION_STATE::COMPILING); + s_compilationQueueMutex.unlock(); + // compile + 
job->CompileInternal(); + if (job->ShouldCountCompilation()) + ++g_compiled_shaders_async; + // mark as compiled + cemu_assert_debug(job->m_compilationState.getValue() == RendererShaderMtl::COMPILATION_STATE::COMPILING); + job->m_compilationState.setValue(RendererShaderMtl::COMPILATION_STATE::DONE); + } + } + + /* + void AIRCacheThreadFunc() + { + SetThreadName("mtlAIRCache"); + while (m_threadsActive.load(std::memory_order::relaxed)) + { + s_airCacheQueueCount.decrementWithWait(); + s_airCacheQueueMutex.lock(); + if (s_airCacheQueue.empty()) + { + s_airCacheQueueMutex.unlock(); + continue; + } + + // Create RAM filesystem + if (!s_hasRAMFilesystem) + { + executeCommand("diskutil erasevolume HFS+ {} $(hdiutil attach -nomount ram://{})", METAL_AIR_CACHE_NAME, METAL_AIR_CACHE_BLOCK_COUNT); + s_hasRAMFilesystem = true; + } + + RendererShaderMtl* job = s_airCacheQueue.front(); + s_airCacheQueue.pop_front(); + s_airCacheQueueMutex.unlock(); + // compile + job->CompileToAIR(); + } + } + */ + + bool HasThreadsRunning() const { return m_threadsActive; } + +public: + std::vector s_threads; + //std::thread* s_airCacheThread{nullptr}; + + std::deque s_compilationQueue; + CounterSemaphore s_compilationQueueCount; + std::mutex s_compilationQueueMutex; + + /* + std::deque s_airCacheQueue; + CounterSemaphore s_airCacheQueueCount; + std::mutex s_airCacheQueueMutex; + */ + +private: + std::atomic m_threadsActive; +} shaderMtlThreadPool; + +// TODO: find out if it would be possible to cache compiled Metal shaders +void RendererShaderMtl::ShaderCacheLoading_begin(uint64 cacheTitleId) +{ + s_isLoadingShadersMtl = true; + + // Open AIR cache + /* + if (s_airCache) + { + delete s_airCache; + s_airCache = nullptr; + } + uint32 airCacheMagic = GeneratePrecompiledCacheId(); + const std::string cacheFilename = fmt::format("{:016x}_air.bin", cacheTitleId); + const fs::path cachePath = ActiveSettings::GetCachePath("shaderCache/precompiled/{}", cacheFilename); + s_airCache = 
FileCache::Open(cachePath, true, airCacheMagic); + if (!s_airCache) + cemuLog_log(LogType::Force, "Unable to open AIR cache {}", cacheFilename); + */ + + // Maximize shader compilation speed + static_cast(g_renderer.get())->SetShouldMaximizeConcurrentCompilation(true); +} + +void RendererShaderMtl::ShaderCacheLoading_end() +{ + s_isLoadingShadersMtl = false; + + // Reset shader compilation speed + static_cast(g_renderer.get())->SetShouldMaximizeConcurrentCompilation(false); +} + +void RendererShaderMtl::ShaderCacheLoading_Close() +{ + // Close the AIR cache + /* + if (s_airCache) + { + delete s_airCache; + s_airCache = nullptr; + } + + // Close RAM filesystem + if (s_hasRAMFilesystem) + executeCommand("diskutil eject {}", METAL_AIR_CACHE_PATH); + */ +} + +void RendererShaderMtl::Initialize() +{ + shaderMtlThreadPool.StartThreads(); +} + +void RendererShaderMtl::Shutdown() +{ + shaderMtlThreadPool.StopThreads(); +} + +RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode) + : RendererShader(type, baseHash, auxHash, isGameShader, isGfxPackShader), m_mtlr{mtlRenderer}, m_mslCode{mslCode} +{ + // start async compilation + shaderMtlThreadPool.s_compilationQueueMutex.lock(); + m_compilationState.setValue(COMPILATION_STATE::QUEUED); + shaderMtlThreadPool.s_compilationQueue.push_back(this); + shaderMtlThreadPool.s_compilationQueueCount.increment(); + shaderMtlThreadPool.s_compilationQueueMutex.unlock(); + cemu_assert_debug(shaderMtlThreadPool.HasThreadsRunning()); // make sure .StartThreads() was called +} + +RendererShaderMtl::~RendererShaderMtl() +{ + if (m_function) + m_function->release(); +} + +void RendererShaderMtl::PreponeCompilation(bool isRenderThread) +{ + shaderMtlThreadPool.s_compilationQueueMutex.lock(); + bool isStillQueued = m_compilationState.hasState(COMPILATION_STATE::QUEUED); + if (isStillQueued) + { + // remove from queue + 
shaderMtlThreadPool.s_compilationQueue.erase(std::remove(shaderMtlThreadPool.s_compilationQueue.begin(), shaderMtlThreadPool.s_compilationQueue.end(), this), shaderMtlThreadPool.s_compilationQueue.end()); + m_compilationState.setValue(COMPILATION_STATE::COMPILING); + } + shaderMtlThreadPool.s_compilationQueueMutex.unlock(); + if (!isStillQueued) + { + m_compilationState.waitUntilValue(COMPILATION_STATE::DONE); + if (ShouldCountCompilation()) + --g_compiled_shaders_async; // compilation caused a stall so we don't consider this one async + return; + } + else + { + // compile synchronously + CompileInternal(); + m_compilationState.setValue(COMPILATION_STATE::DONE); + } +} + +bool RendererShaderMtl::IsCompiled() +{ + return m_compilationState.hasState(COMPILATION_STATE::DONE); +}; + +bool RendererShaderMtl::WaitForCompiled() +{ + m_compilationState.waitUntilValue(COMPILATION_STATE::DONE); + return true; +} + +bool RendererShaderMtl::ShouldCountCompilation() const +{ + return !s_isLoadingShadersMtl && m_isGameShader; +} + +MTL::Library* RendererShaderMtl::LibraryFromSource() +{ + // Compile from source + NS_STACK_SCOPED MTL::CompileOptions* options = MTL::CompileOptions::alloc()->init(); + if (g_current_game_profile->GetShaderFastMath()) + options->setFastMathEnabled(true); + + if (m_mtlr->GetPositionInvariance()) + { + // TODO: filter out based on GPU state + options->setPreserveInvariance(true); + } + + NS::Error* error = nullptr; + MTL::Library* library = m_mtlr->GetDevice()->newLibrary(ToNSString(m_mslCode), options, &error); + if (error) + { + cemuLog_log(LogType::Force, "failed to create library from source: {} -> {}", error->localizedDescription()->utf8String(), m_mslCode.c_str()); + return nullptr; + } + + return library; +} + +/* +MTL::Library* RendererShaderMtl::LibraryFromAIR(std::span data) +{ + dispatch_data_t dispatchData = dispatch_data_create(data.data(), data.size(), nullptr, DISPATCH_DATA_DESTRUCTOR_DEFAULT); + + NS::Error* error = nullptr; + 
MTL::Library* library = m_mtlr->GetDevice()->newLibrary(dispatchData, &error); + if (error) + { + cemuLog_log(LogType::Force, "failed to create library from AIR: {}", error->localizedDescription()->utf8String()); + return nullptr; + } + + return library; +} +*/ + +void RendererShaderMtl::CompileInternal() +{ + MTL::Library* library = nullptr; + + // First, try to retrieve the compiled shader from the AIR cache + /* + if (s_isLoadingShadersMtl && (m_isGameShader && !m_isGfxPackShader) && s_airCache) + { + cemu_assert_debug(m_baseHash != 0); + uint64 h1, h2; + GenerateShaderPrecompiledCacheFilename(m_type, m_baseHash, m_auxHash, h1, h2); + std::vector cacheFileData; + if (s_airCache->GetFile({ h1, h2 }, cacheFileData)) + { + library = LibraryFromAIR(std::span(cacheFileData.data(), cacheFileData.size())); + FinishCompilation(); + } + } + */ + + // Not in the cache, compile from source + if (!library) + { + // Compile from source + library = LibraryFromSource(); + FinishCompilation(); + if (!library) + return; + + // Store in the AIR cache + /* + shaderMtlThreadPool.s_airCacheQueueMutex.lock(); + shaderMtlThreadPool.s_airCacheQueue.push_back(this); + shaderMtlThreadPool.s_airCacheQueueCount.increment(); + shaderMtlThreadPool.s_airCacheQueueMutex.unlock(); + */ + } + + m_function = library->newFunction(ToNSString("main0")); + library->release(); + + // Count shader compilation + if (ShouldCountCompilation()) + g_compiled_shaders_total++; +} + +/* +void RendererShaderMtl::CompileToAIR() +{ + uint64 h1, h2; + GenerateShaderPrecompiledCacheFilename(m_type, m_baseHash, m_auxHash, h1, h2); + + // The shader is not in the cache, compile it + std::string baseFilename = fmt::format("{}/{}_{}", METAL_AIR_CACHE_PATH, h1, h2); + + // Source + std::ofstream mslFile; + mslFile.open(fmt::format("{}.metal", baseFilename)); + mslFile << m_mslCode; + mslFile.close(); + + // Compile + if (!executeCommand("xcrun -sdk macosx metal -o {}.ir -c {}.metal -w", baseFilename, baseFilename)) + 
return; + if (!executeCommand("xcrun -sdk macosx metallib -o {}.metallib {}.ir", baseFilename, baseFilename)) + return; + + // Clean up + executeCommand("rm {}.metal", baseFilename); + executeCommand("rm {}.ir", baseFilename); + + // Load from the newly generated AIR + MemoryMappedFile airFile(fmt::format("{}.metallib", baseFilename)); + std::span airData = std::span(airFile.data(), airFile.size()); + //library = LibraryFromAIR(std::span(airData.data(), airData.size())); + + // Store in the cache + s_airCache->AddFile({ h1, h2 }, airData.data(), airData.size()); + + // Clean up + executeCommand("rm {}.metallib", baseFilename); + + FinishCompilation(); +} +*/ + +void RendererShaderMtl::FinishCompilation() +{ + m_mslCode.clear(); + m_mslCode.shrink_to_fit(); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h new file mode 100644 index 0000000000..9953ba7467 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h @@ -0,0 +1,79 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/RendererShader.h" +#include "HW/Latte/Renderer/Metal/CachedFBOMtl.h" +#include "HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "util/helpers/ConcurrentQueue.h" +#include "util/helpers/Semaphore.h" + +#include + +class RendererShaderMtl : public RendererShader +{ + friend class ShaderMtlThreadPool; + + enum class COMPILATION_STATE : uint32 + { + NONE, + QUEUED, + COMPILING, + DONE + }; + +public: + static void ShaderCacheLoading_begin(uint64 cacheTitleId); + static void ShaderCacheLoading_end(); + static void ShaderCacheLoading_Close(); + + static void Initialize(); + static void Shutdown(); + + RendererShaderMtl(class MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode); + virtual ~RendererShaderMtl(); + + MTL::Function* GetFunction() const + { + return m_function; + } + + sint32 GetUniformLocation(const 
char* name) override + { + cemu_assert_suspicious(); + return 0; + } + + void SetUniform2fv(sint32 location, void* data, sint32 count) override + { + cemu_assert_suspicious(); + } + + void SetUniform4iv(sint32 location, void* data, sint32 count) override + { + cemu_assert_suspicious(); + } + + void PreponeCompilation(bool isRenderThread) override; + bool IsCompiled() override; + bool WaitForCompiled() override; + +private: + class MetalRenderer* m_mtlr; + + MTL::Function* m_function = nullptr; + + StateSemaphore m_compilationState{ COMPILATION_STATE::NONE }; + + std::string m_mslCode; + + bool ShouldCountCompilation() const; + + MTL::Library* LibraryFromSource(); + + //MTL::Library* LibraryFromAIR(std::span data); + + void CompileInternal(); + + //void CompileToAIR(); + + void FinishCompilation(); +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h new file mode 100644 index 0000000000..2041f4f88a --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h @@ -0,0 +1,51 @@ +#pragma once + +#define __STRINGIFY(x) #x +#define _STRINGIFY(x) __STRINGIFY(x) + +constexpr const char* utilityShaderSource = R"(#include +using namespace metal; + +#define GET_BUFFER_BINDING(index) (28 + index) +#define GET_TEXTURE_BINDING(index) (29 + index) +#define GET_SAMPLER_BINDING(index) (14 + index) + +constant float2 positions[] = {float2(-1.0, -3.0), float2(-1.0, 1.0), float2(3.0, 1.0)}; + +struct VertexOut { + float4 position [[position]]; + float2 texCoord; +}; + +vertex VertexOut vertexFullscreen(ushort vid [[vertex_id]]) { + VertexOut out; + out.position = float4(positions[vid], 0.0, 1.0); + out.texCoord = positions[vid] * 0.5 + 0.5; + out.texCoord.y = 1.0 - out.texCoord.y; + + return out; +} + +//fragment float4 fragmentPresent(VertexOut in [[stage_in]], texture2d tex [[texture(0)]], //sampler samplr [[sampler(0)]]) { +// return tex.sample(samplr, in.texCoord); +//} + +vertex void 
vertexCopyBufferToBuffer(uint vid [[vertex_id]], device uint8_t* src [[buffer(GET_BUFFER_BINDING(0))]], device uint8_t* dst [[buffer(GET_BUFFER_BINDING(1))]]) { + dst[vid] = src[vid]; +} + +fragment float4 fragmentCopyDepthToColor(VertexOut in [[stage_in]], texture2d src [[texture(GET_TEXTURE_BINDING(0))]]) { + return float4(src.read(uint2(in.position.xy)).r, 0.0, 0.0, 0.0); +} + +//struct RestrideParams { +// uint oldStride; +// uint newStride; +//}; + +//vertex void vertexRestrideBuffer(uint vid [[vertex_id]], device uint8_t* src [[buffer//(GET_BUFFER_BINDING(0))]], device uint8_t* dst [[buffer(GET_BUFFER_BINDING(1))]], constant //RestrideParams& params [[buffer(GET_BUFFER_BINDING(2))]]) { +// for (uint32_t i = 0; i < params.oldStride; i++) { +// dst[vid * params.newStride + i] = src[vid * params.oldStride + i]; +// } +//} +)"; diff --git a/src/Cafe/HW/Latte/Renderer/MetalView.h b/src/Cafe/HW/Latte/Renderer/MetalView.h new file mode 100644 index 0000000000..43e5c7b3f9 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/MetalView.h @@ -0,0 +1,7 @@ +#pragma once + +#import +#import + +@interface MetalView : NSView +@end diff --git a/src/Cafe/HW/Latte/Renderer/MetalView.mm b/src/Cafe/HW/Latte/Renderer/MetalView.mm new file mode 100644 index 0000000000..5ca17b5ef3 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/MetalView.mm @@ -0,0 +1,26 @@ +#include "Cafe/HW/Latte/Renderer/MetalView.h" + +@implementation MetalView + +-(BOOL) wantsUpdateLayer { return YES; } + ++(Class) layerClass { return [CAMetalLayer class]; } + +// copied from https://github.com/KhronosGroup/MoltenVK/blob/master/Demos/Cube/macOS/DemoViewController.m + +-(CALayer*) makeBackingLayer +{ + CALayer* layer = [self.class.layerClass layer]; + CGSize viewScale = [self convertSizeToBacking: CGSizeMake(1.0, 1.0)]; + layer.contentsScale = MIN(viewScale.width, viewScale.height); + return layer; +} + +-(BOOL) layer: (CALayer *)layer shouldInheritContentsScale: (CGFloat)newScale fromWindow: (NSWindow *)window +{ + if 
(newScale == layer.contentsScale) { return NO; } + + layer.contentsScale = newScale; + return YES; +} +@end diff --git a/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h b/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h index eaa2c3a337..3ed4c27bd4 100644 --- a/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h @@ -270,11 +270,11 @@ class OpenGLRenderer : public Renderer // occlusion queries std::vector list_queryCacheOcclusion; // cache for unused queries - // resource garbage collection + // resource garbage collection struct BufferCacheReleaseQueueEntry { BufferCacheReleaseQueueEntry(VirtualBufferHeap_t* heap, VirtualBufferHeapEntry_t* entry) : m_heap(heap), m_entry(entry) {}; - + void free() { virtualBufferHeap_free(m_heap, m_entry); diff --git a/src/Cafe/HW/Latte/Renderer/Renderer.h b/src/Cafe/HW/Latte/Renderer/Renderer.h index 5e5090d16f..01a6d3b2e6 100644 --- a/src/Cafe/HW/Latte/Renderer/Renderer.h +++ b/src/Cafe/HW/Latte/Renderer/Renderer.h @@ -33,6 +33,7 @@ enum class RendererAPI { OpenGL, Vulkan, + Metal, MAX }; @@ -70,9 +71,9 @@ class Renderer void CancelScreenshotRequest(); virtual void HandleScreenshotRequest(LatteTextureView* texView, bool padView){} - - virtual void DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, - sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, + + virtual void DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, + sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, bool padView, bool clearBackground) = 0; virtual bool BeginFrame(bool mainWindow) = 0; @@ -88,6 +89,7 @@ class Renderer virtual void DeleteFontTextures() = 0; GfxVendor GetVendor() const { return m_vendor; } + virtual bool UseTFViaSSBO() const { return false; } virtual void AppendOverlayDebugInfo() = 0; // rendertarget diff --git 
a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp index afe53a16cb..9613d9734a 100644 --- a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp +++ b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp @@ -9,6 +9,19 @@ void main() } )"; +const std::string RendererOutputShader::s_copy_shader_source_mtl = +R"(#include +using namespace metal; + +struct VertexOut { + float2 uv; +}; + +fragment float4 main0(VertexOut in [[stage_in]], texture2d textureSrc [[texture(0)]], sampler samplr [[sampler(0)]]) { + return float4(textureSrc.sample(samplr, in.uv).rgb, 1.0); +} +)"; + const std::string RendererOutputShader::s_bicubic_shader_source = R"( vec4 cubic(float x) @@ -55,6 +68,57 @@ void main(){ } )"; +const std::string RendererOutputShader::s_bicubic_shader_source_mtl = +R"(#include +using namespace metal; + +float4 cubic(float x) { + float x2 = x * x; + float x3 = x2 * x; + float4 w; + w.x = -x3 + 3 * x2 - 3 * x + 1; + w.y = 3 * x3 - 6 * x2 + 4; + w.z = -3 * x3 + 3 * x2 + 3 * x + 1; + w.w = x3; + return w / 6.0; +} + +float4 bcFilter(texture2d textureSrc, sampler samplr, float2 texcoord, float2 texscale) { + float fx = fract(texcoord.x); + float fy = fract(texcoord.y); + texcoord.x -= fx; + texcoord.y -= fy; + + float4 xcubic = cubic(fx); + float4 ycubic = cubic(fy); + + float4 c = float4(texcoord.x - 0.5, texcoord.x + 1.5, texcoord.y - 0.5, texcoord.y + 1.5); + float4 s = float4(xcubic.x + xcubic.y, xcubic.z + xcubic.w, ycubic.x + ycubic.y, ycubic.z + ycubic.w); + float4 offset = c + float4(xcubic.y, xcubic.w, ycubic.y, ycubic.w) / s; + + float4 sample0 = textureSrc.sample(samplr, float2(offset.x, offset.z) * texscale); + float4 sample1 = textureSrc.sample(samplr, float2(offset.y, offset.z) * texscale); + float4 sample2 = textureSrc.sample(samplr, float2(offset.x, offset.w) * texscale); + float4 sample3 = textureSrc.sample(samplr, float2(offset.y, offset.w) * texscale); + + float sx = s.x / (s.x + s.y); + float sy = 
s.z / (s.z + s.w); + + return mix( + mix(sample3, sample2, sx), + mix(sample1, sample0, sx), sy); +} + +struct VertexOut { + float2 uv; +}; + +fragment float4 main0(VertexOut in [[stage_in]], texture2d textureSrc [[texture(0)]], sampler samplr [[sampler(0)]]) { + float2 textureSrcResolution = float2(textureSrc.get_width(), textureSrc.get_height()); + return float4(bcFilter(textureSrc, samplr, in.uv * textureSrcResolution, float2(1.0, 1.0) / textureSrcResolution).rgb, 1.0); +} +)"; + const std::string RendererOutputShader::s_hermite_shader_source = R"( // https://www.shadertoy.com/view/MllSzX @@ -67,7 +131,7 @@ vec3 CubicHermite (vec3 A, vec3 B, vec3 C, vec3 D, float t) vec3 b = A - (5.0*B)/2.0 + 2.0*C - D / 2.0; vec3 c = -A/2.0 + C/2.0; vec3 d = B; - + return a*t3 + b*t2 + c*t + d; } @@ -75,36 +139,36 @@ vec3 CubicHermite (vec3 A, vec3 B, vec3 C, vec3 D, float t) vec3 BicubicHermiteTexture(vec2 uv, vec4 texelSize) { vec2 pixel = uv*texelSize.zw + 0.5; - vec2 frac = fract(pixel); + vec2 frac = fract(pixel); pixel = floor(pixel) / texelSize.zw - vec2(texelSize.xy/2.0); - + vec4 doubleSize = texelSize*2.0; vec3 C00 = texture(textureSrc, pixel + vec2(-texelSize.x ,-texelSize.y)).rgb; vec3 C10 = texture(textureSrc, pixel + vec2( 0.0 ,-texelSize.y)).rgb; vec3 C20 = texture(textureSrc, pixel + vec2( texelSize.x ,-texelSize.y)).rgb; vec3 C30 = texture(textureSrc, pixel + vec2( doubleSize.x,-texelSize.y)).rgb; - + vec3 C01 = texture(textureSrc, pixel + vec2(-texelSize.x , 0.0)).rgb; vec3 C11 = texture(textureSrc, pixel + vec2( 0.0 , 0.0)).rgb; vec3 C21 = texture(textureSrc, pixel + vec2( texelSize.x , 0.0)).rgb; - vec3 C31 = texture(textureSrc, pixel + vec2( doubleSize.x, 0.0)).rgb; - + vec3 C31 = texture(textureSrc, pixel + vec2( doubleSize.x, 0.0)).rgb; + vec3 C02 = texture(textureSrc, pixel + vec2(-texelSize.x , texelSize.y)).rgb; vec3 C12 = texture(textureSrc, pixel + vec2( 0.0 , texelSize.y)).rgb; vec3 C22 = texture(textureSrc, pixel + vec2( texelSize.x , 
texelSize.y)).rgb; - vec3 C32 = texture(textureSrc, pixel + vec2( doubleSize.x, texelSize.y)).rgb; - + vec3 C32 = texture(textureSrc, pixel + vec2( doubleSize.x, texelSize.y)).rgb; + vec3 C03 = texture(textureSrc, pixel + vec2(-texelSize.x , doubleSize.y)).rgb; vec3 C13 = texture(textureSrc, pixel + vec2( 0.0 , doubleSize.y)).rgb; vec3 C23 = texture(textureSrc, pixel + vec2( texelSize.x , doubleSize.y)).rgb; - vec3 C33 = texture(textureSrc, pixel + vec2( doubleSize.x, doubleSize.y)).rgb; - + vec3 C33 = texture(textureSrc, pixel + vec2( doubleSize.x, doubleSize.y)).rgb; + vec3 CP0X = CubicHermite(C00, C10, C20, C30, frac.x); vec3 CP1X = CubicHermite(C01, C11, C21, C31, frac.x); vec3 CP2X = CubicHermite(C02, C12, C22, C32, frac.x); vec3 CP3X = CubicHermite(C03, C13, C23, C33, frac.x); - + return CubicHermite(CP0X, CP1X, CP2X, CP3X, frac.y); } @@ -114,9 +178,77 @@ void main(){ } )"; +const std::string RendererOutputShader::s_hermite_shader_source_mtl = +R"(#include +using namespace metal; + +// https://www.shadertoy.com/view/MllSzX + +float3 CubicHermite(float3 A, float3 B, float3 C, float3 D, float t) { + float t2 = t*t; + float t3 = t*t*t; + float3 a = -A/2.0 + (3.0*B)/2.0 - (3.0*C)/2.0 + D/2.0; + float3 b = A - (5.0*B)/2.0 + 2.0*C - D / 2.0; + float3 c = -A/2.0 + C/2.0; + float3 d = B; + + return a*t3 + b*t2 + c*t + d; +} + + +float3 BicubicHermiteTexture(texture2d textureSrc, sampler samplr, float2 uv, float4 texelSize) { + float2 pixel = uv*texelSize.zw + 0.5; + float2 frac = fract(pixel); + pixel = floor(pixel) / texelSize.zw - float2(texelSize.xy/2.0); + + float4 doubleSize = texelSize*2.0; /* two-texel offset for the outermost taps, same as the GLSL variant (texelSize*2.0) */ + + float3 C00 = textureSrc.sample(samplr, pixel + float2(-texelSize.x ,-texelSize.y)).rgb; + float3 C10 = textureSrc.sample(samplr, pixel + float2( 0.0 ,-texelSize.y)).rgb; + float3 C20 = textureSrc.sample(samplr, pixel + float2( texelSize.x ,-texelSize.y)).rgb; + float3 C30 = textureSrc.sample(samplr, pixel + float2( doubleSize.x,-texelSize.y)).rgb; + + float3 C01 =
textureSrc.sample(samplr, pixel + float2(-texelSize.x , 0.0)).rgb; + float3 C11 = textureSrc.sample(samplr, pixel + float2( 0.0 , 0.0)).rgb; + float3 C21 = textureSrc.sample(samplr, pixel + float2( texelSize.x , 0.0)).rgb; + float3 C31 = textureSrc.sample(samplr, pixel + float2( doubleSize.x, 0.0)).rgb; + + float3 C02 = textureSrc.sample(samplr, pixel + float2(-texelSize.x , texelSize.y)).rgb; + float3 C12 = textureSrc.sample(samplr, pixel + float2( 0.0 , texelSize.y)).rgb; + float3 C22 = textureSrc.sample(samplr, pixel + float2( texelSize.x , texelSize.y)).rgb; + float3 C32 = textureSrc.sample(samplr, pixel + float2( doubleSize.x, texelSize.y)).rgb; + + float3 C03 = textureSrc.sample(samplr, pixel + float2(-texelSize.x , doubleSize.y)).rgb; + float3 C13 = textureSrc.sample(samplr, pixel + float2( 0.0 , doubleSize.y)).rgb; + float3 C23 = textureSrc.sample(samplr, pixel + float2( texelSize.x , doubleSize.y)).rgb; + float3 C33 = textureSrc.sample(samplr, pixel + float2( doubleSize.x, doubleSize.y)).rgb; + + float3 CP0X = CubicHermite(C00, C10, C20, C30, frac.x); + float3 CP1X = CubicHermite(C01, C11, C21, C31, frac.x); + float3 CP2X = CubicHermite(C02, C12, C22, C32, frac.x); + float3 CP3X = CubicHermite(C03, C13, C23, C33, frac.x); + + return CubicHermite(CP0X, CP1X, CP2X, CP3X, frac.y); +} + +struct VertexOut { + float4 position [[position]]; + float2 uv; +}; + +fragment float4 main0(VertexOut in [[stage_in]], texture2d textureSrc [[texture(0)]], sampler samplr [[sampler(0)]], constant float2& outputResolution [[buffer(0)]]) { + float4 texelSize = float4(1.0 / outputResolution.xy, outputResolution.xy); + return float4(BicubicHermiteTexture(textureSrc, samplr, in.uv, texelSize), 1.0); +} +)"; + RendererOutputShader::RendererOutputShader(const std::string& vertex_source, const std::string& fragment_source) { - auto finalFragmentSrc = PrependFragmentPreamble(fragment_source); + std::string finalFragmentSrc; + if (g_renderer->GetType() == RendererAPI::Metal) + 
finalFragmentSrc = fragment_source; + else + finalFragmentSrc = PrependFragmentPreamble(fragment_source); m_vertex_shader.reset(g_renderer->shader_create(RendererShader::ShaderType::kVertex, 0, 0, vertex_source, false, false)); m_fragment_shader.reset(g_renderer->shader_create(RendererShader::ShaderType::kFragment, 0, 0, finalFragmentSrc, false, false)); @@ -190,9 +322,9 @@ std::string RendererOutputShader::GetOpenGlVertexSource(bool render_upside_down) R"(#version 420 layout(location = 0) smooth out vec2 passUV; -out gl_PerVertex -{ - vec4 gl_Position; +out gl_PerVertex +{ + vec4 gl_Position; }; void main(){ @@ -226,7 +358,7 @@ void main(){ vertex_source << R"( passUV = vUV; - gl_Position = vec4(vPos, 0.0, 1.0); + gl_Position = vec4(vPos, 0.0, 1.0); } )"; return vertex_source.str(); @@ -240,9 +372,9 @@ std::string RendererOutputShader::GetVulkanVertexSource(bool render_upside_down) R"(#version 450 layout(location = 0) out vec2 passUV; -out gl_PerVertex -{ - vec4 gl_Position; +out gl_PerVertex +{ + vec4 gl_Position; }; void main(){ @@ -276,7 +408,45 @@ void main(){ vertex_source << R"( passUV = vUV; - gl_Position = vec4(vPos, 0.0, 1.0); + gl_Position = vec4(vPos, 0.0, 1.0); +} +)"; + return vertex_source.str(); +} + +std::string RendererOutputShader::GetMetalVertexSource(bool render_upside_down) +{ + // vertex shader + std::ostringstream vertex_source; + vertex_source << + R"(#include +using namespace metal; + +struct VertexOut { + float4 position [[position]]; + float2 uv; +}; + +vertex VertexOut main0(ushort vid [[vertex_id]]) { + VertexOut out; + float2 pos; + if (vid == 0) pos = float2(-1.0, -3.0); + else if (vid == 1) pos = float2(-1.0, 1.0); + else if (vid == 2) pos = float2(3.0, 1.0); + out.uv = pos * 0.5 + 0.5; + out.uv.y = 1.0 - out.uv.y; +)"; + + if (render_upside_down) + { + vertex_source << + R"( pos.y = -pos.y; + )"; + } + + vertex_source << + R"( out.position = float4(pos, 0.0, 1.0); + return out; } )"; return vertex_source.str(); @@ -304,26 +474,43 
@@ layout(location = 0) out vec4 colorOut0; } void RendererOutputShader::InitializeStatic() { - std::string vertex_source, vertex_source_ud; - // vertex shader - if (g_renderer->GetType() == RendererAPI::OpenGL) - { - vertex_source = GetOpenGlVertexSource(false); - vertex_source_ud = GetOpenGlVertexSource(true); - } - else - { - vertex_source = GetVulkanVertexSource(false); - vertex_source_ud = GetVulkanVertexSource(true); - } - s_copy_shader = new RendererOutputShader(vertex_source, s_copy_shader_source); - s_copy_shader_ud = new RendererOutputShader(vertex_source_ud, s_copy_shader_source); - - s_bicubic_shader = new RendererOutputShader(vertex_source, s_bicubic_shader_source); - s_bicubic_shader_ud = new RendererOutputShader(vertex_source_ud, s_bicubic_shader_source); - - s_hermit_shader = new RendererOutputShader(vertex_source, s_hermite_shader_source); - s_hermit_shader_ud = new RendererOutputShader(vertex_source_ud, s_hermite_shader_source); + if (g_renderer->GetType() == RendererAPI::Metal) + { + std::string vertex_source = GetMetalVertexSource(false); + std::string vertex_source_ud = GetMetalVertexSource(true); + + s_copy_shader = new RendererOutputShader(vertex_source, s_copy_shader_source_mtl); + s_copy_shader_ud = new RendererOutputShader(vertex_source_ud, s_copy_shader_source_mtl); + + s_bicubic_shader = new RendererOutputShader(vertex_source, s_bicubic_shader_source_mtl); + s_bicubic_shader_ud = new RendererOutputShader(vertex_source_ud, s_bicubic_shader_source_mtl); + + s_hermit_shader = new RendererOutputShader(vertex_source, s_hermite_shader_source_mtl); + s_hermit_shader_ud = new RendererOutputShader(vertex_source_ud, s_hermite_shader_source_mtl); + } + else + { + std::string vertex_source, vertex_source_ud; + // vertex shader + if (g_renderer->GetType() == RendererAPI::OpenGL) + { + vertex_source = GetOpenGlVertexSource(false); + vertex_source_ud = GetOpenGlVertexSource(true); + } + else if (g_renderer->GetType() == RendererAPI::Vulkan) + { + 
vertex_source = GetVulkanVertexSource(false); + vertex_source_ud = GetVulkanVertexSource(true); + } + s_copy_shader = new RendererOutputShader(vertex_source, s_copy_shader_source); + s_copy_shader_ud = new RendererOutputShader(vertex_source_ud, s_copy_shader_source); + + s_bicubic_shader = new RendererOutputShader(vertex_source, s_bicubic_shader_source); + s_bicubic_shader_ud = new RendererOutputShader(vertex_source_ud, s_bicubic_shader_source); + + s_hermit_shader = new RendererOutputShader(vertex_source, s_hermite_shader_source); + s_hermit_shader_ud = new RendererOutputShader(vertex_source_ud, s_hermite_shader_source); + } } void RendererOutputShader::ShutdownStatic() diff --git a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.h b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.h index b12edf8b4a..a8f240d245 100644 --- a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.h +++ b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.h @@ -41,8 +41,9 @@ class RendererOutputShader static RendererOutputShader* s_hermit_shader; static RendererOutputShader* s_hermit_shader_ud; - static std::string GetVulkanVertexSource(bool render_upside_down); static std::string GetOpenGlVertexSource(bool render_upside_down); + static std::string GetVulkanVertexSource(bool render_upside_down); + static std::string GetMetalVertexSource(bool render_upside_down); static std::string PrependFragmentPreamble(const std::string& shaderSrc); @@ -64,4 +65,8 @@ class RendererOutputShader static const std::string s_bicubic_shader_source_vk; static const std::string s_hermite_shader_source_vk; + + static const std::string s_copy_shader_source_mtl; + static const std::string s_bicubic_shader_source_mtl; + static const std::string s_hermite_shader_source_mtl; }; diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/CocoaSurface.mm b/src/Cafe/HW/Latte/Renderer/Vulkan/CocoaSurface.mm index 731a6a2673..a68174c93f 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/CocoaSurface.mm +++ 
b/src/Cafe/HW/Latte/Renderer/Vulkan/CocoaSurface.mm @@ -1,36 +1,7 @@ #include "Cafe/HW/Latte/Renderer/Vulkan/CocoaSurface.h" #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h" -#import -#import - -@interface MetalView : NSView -@end - -@implementation MetalView - --(BOOL) wantsUpdateLayer { return YES; } - -+(Class) layerClass { return [CAMetalLayer class]; } - -// copied from https://github.com/KhronosGroup/MoltenVK/blob/master/Demos/Cube/macOS/DemoViewController.m - --(CALayer*) makeBackingLayer -{ - CALayer* layer = [self.class.layerClass layer]; - CGSize viewScale = [self convertSizeToBacking: CGSizeMake(1.0, 1.0)]; - layer.contentsScale = MIN(viewScale.width, viewScale.height); - return layer; -} - --(BOOL) layer: (CALayer *)layer shouldInheritContentsScale: (CGFloat)newScale fromWindow: (NSWindow *)window -{ - if (newScale == layer.contentsScale) { return NO; } - - layer.contentsScale = newScale; - return YES; -} -@end +#include "Cafe/HW/Latte/Renderer/MetalView.h" VkSurfaceKHR CreateCocoaSurface(VkInstance instance, void* handle) { diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp index 60db6ff769..33b83809e2 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp @@ -65,7 +65,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL DebugUtilsCallback(VkDebugUtilsMessageSeverityFla if (strstr(pCallbackData->pMessage, "consumes input location")) return VK_FALSE; // false means we dont care if (strstr(pCallbackData->pMessage, "blend")) - return VK_FALSE; // + return VK_FALSE; // // note: Check if previously used location in VK_EXT_debug_report callback is the same as messageIdNumber under the new extension // validation errors which are difficult to fix @@ -403,8 +403,8 @@ VulkanRenderer::VulkanRenderer() auto surface = CreateFramebufferSurface(m_instance, WindowSystem::GetWindowInfo().window_main); auto& config = GetConfig(); - 
decltype(config.graphic_device_uuid) zero{}; - const bool has_device_set = config.graphic_device_uuid != zero; + decltype(config.vk_graphic_device_uuid) zero{}; + const bool has_device_set = config.vk_graphic_device_uuid != zero; VkPhysicalDevice fallbackDevice = VK_NULL_HANDLE; @@ -424,7 +424,7 @@ VulkanRenderer::VulkanRenderer() physDeviceProps.pNext = &physDeviceIDProps; vkGetPhysicalDeviceProperties2(device, &physDeviceProps); - if (memcmp(config.graphic_device_uuid.data(), physDeviceIDProps.deviceUUID, VK_UUID_SIZE) != 0) + if (memcmp(config.vk_graphic_device_uuid.data(), physDeviceIDProps.deviceUUID, VK_UUID_SIZE) != 0) continue; } @@ -437,7 +437,7 @@ VulkanRenderer::VulkanRenderer() { cemuLog_log(LogType::Force, "The selected GPU could not be found or is not suitable. Falling back to first available device instead"); m_physicalDevice = fallbackDevice; - config.graphic_device_uuid = {}; // resetting device selection + config.vk_graphic_device_uuid = {}; // resetting device selection } else if (m_physicalDevice == VK_NULL_HANDLE) { @@ -2398,7 +2398,7 @@ void VulkanRenderer::GetTextureFormatInfoVK(Latte::E_GX2SURFFMT format, bool isD } else { formatInfoOut->vkImageFormat = VK_FORMAT_R4G4B4A4_UNORM_PACK16; - formatInfoOut->decoder = TextureDecoder_R4_G4_UNORM_To_RGBA4_vk::getInstance(); + formatInfoOut->decoder = TextureDecoder_R4_G4_UNORM_To_ABGR4::getInstance(); } } else diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h index 7290fdd7dc..5942f10536 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h @@ -73,11 +73,11 @@ class PipelineInfo return true; } - + template struct direct_hash { - size_t operator()(const uint64& k) const noexcept + size_t operator()(const uint64& k) const noexcept { return k; } @@ -282,7 +282,6 @@ class VulkanRenderer : public Renderer // texture functions void* texture_acquireTextureUploadBuffer(uint32 
size) override; void texture_releaseTextureUploadBuffer(uint8* mem) override; - TextureDecoder* texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) override; @@ -376,7 +375,7 @@ class VulkanRenderer : public Renderer VkRect2D currentScissorRect{}; // vertex bindings - struct + struct { uint32 offset; }currentVertexBinding[LATTE_MAX_VERTEX_BUFFERS]{}; @@ -471,12 +470,12 @@ class VulkanRenderer : public Renderer bool debug_utils = false; // VK_EXT_DEBUG_UTILS }instanceExtensions; - struct + struct { bool useTFEmulationViaSSBO = true; // emulate transform feedback via shader writes to a storage buffer }mode; - struct + struct { uint32 minUniformBufferOffsetAlignment = 256; uint32 nonCoherentAtomSize = 256; @@ -506,7 +505,7 @@ class VulkanRenderer : public Renderer void CreateCommandBuffers(); void swapchain_createDescriptorSetLayout(); - + // shader bool IsAsyncPipelineAllowed(uint32 numIndices); @@ -521,6 +520,8 @@ class VulkanRenderer : public Renderer void DeleteFontTextures() override; bool BeginFrame(bool mainWindow) override; + bool UseTFViaSSBO() const override { return m_featureControl.mode.useTFEmulationViaSSBO; } + // drawcall emulation PipelineInfo* draw_createGraphicsPipeline(uint32 indexCount); PipelineInfo* draw_getOrCreateGraphicsPipeline(uint32 indexCount); @@ -583,7 +584,7 @@ class VulkanRenderer : public Renderer VkDevice m_logicalDevice = VK_NULL_HANDLE; VkDebugUtilsMessengerEXT m_debugCallback = nullptr; volatile bool m_destructionRequested = false; - + QueueFamilyIndices m_indices{}; Semaphore m_pipeline_cache_semaphore; @@ -594,7 +595,7 @@ class VulkanRenderer : public Renderer std::unordered_map m_backbufferBlitDescriptorSetCache; VkPipelineLayout m_pipelineLayout{nullptr}; VkCommandPool m_commandPool{ nullptr }; - + // buffer to cache uniform vars VkBuffer m_uniformVarBuffer = VK_NULL_HANDLE; VkDeviceMemory m_uniformVarBufferMemory = VK_NULL_HANDLE; @@ -666,19 +667,19 @@ class 
VulkanRenderer : public Renderer bool m_submitOnIdle{}; // submit current buffer if Latte command processor goes into idle state (no more commands or waiting for externally signaled condition) // tracking for dynamic offsets - struct + struct { uint32 uniformVarBufferOffset[VulkanRendererConst::SHADER_STAGE_INDEX_COUNT]; - struct + struct { uint32 uniformBufferOffset[LATTE_NUM_MAX_UNIFORM_BUFFERS]; }shaderUB[VulkanRendererConst::SHADER_STAGE_INDEX_COUNT]; }dynamicOffsetInfo{}; // streamout - struct + struct { - struct + struct { bool enabled; uint32 ringBufferOffset; @@ -728,11 +729,11 @@ class VulkanRenderer : public Renderer accessFlags = 0; if constexpr ((TSyncOp & BUFFER_SHADER_READ) != 0) { - // in theory: VK_ACCESS_INDEX_READ_BIT should be set here too but indices are currently separated + // in theory: VK_ACCESS_INDEX_READ_BIT should be set here too but indices are currently separated stages |= VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; accessFlags |= VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT | VK_ACCESS_SHADER_READ_BIT; } - + if constexpr ((TSyncOp & BUFFER_SHADER_WRITE) != 0) { stages |= VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; @@ -935,7 +936,6 @@ class VulkanRenderer : public Renderer public: bool GetDisableMultithreadedCompilation() const { return m_featureControl.disableMultithreadedCompilation; } - bool UseTFViaSSBO() const { return m_featureControl.mode.useTFEmulationViaSSBO; } bool HasSPRIVRoundingModeRTE32() const { return m_featureControl.shaderFloatControls.shaderRoundingModeRTEFloat32; } bool IsDebugUtilsEnabled() const { return m_featureControl.debugMarkersSupported && m_featureControl.instanceExtensions.debug_utils; } @@ -945,7 +945,7 @@ class VulkanRenderer : public Renderer void debug_genericBarrier(); // shaders - struct + struct { 
RendererShaderVk* copySurface_vs{}; RendererShaderVk* copySurface_psDepth2Color{}; diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp index 32ef700764..4534e03eb6 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp @@ -60,7 +60,7 @@ uint64 VulkanRenderer::draw_calculateGraphicsPipelineHash(const LatteFetchShader uint64 stateHash; stateHash = draw_calculateMinimalGraphicsPipelineHash(fetchShader, lcr); stateHash = (stateHash >> 8) + (stateHash * 0x370531ull) % 0x7F980D3BF9B4639Dull; - + uint32* ctxRegister = lcr.GetRawView(); if (vertexShader) @@ -103,7 +103,7 @@ uint64 VulkanRenderer::draw_calculateGraphicsPipelineHash(const LatteFetchShader } stateHash += renderPassObj->m_hashForPipeline; - + uint32 depthControl = ctxRegister[Latte::REGADDR::DB_DEPTH_CONTROL]; bool stencilTestEnable = depthControl & 1; if (stencilTestEnable) @@ -111,7 +111,7 @@ uint64 VulkanRenderer::draw_calculateGraphicsPipelineHash(const LatteFetchShader stateHash += ctxRegister[mmDB_STENCILREFMASK]; stateHash = std::rotl(stateHash, 17); if(depthControl & (1<<7)) // back stencil enable - { + { stateHash += ctxRegister[mmDB_STENCILREFMASK_BF]; stateHash = std::rotl(stateHash, 13); } @@ -303,7 +303,7 @@ PipelineInfo* VulkanRenderer::draw_createGraphicsPipeline(uint32 indexCount) pipelineCompiler->TrackAsCached(vsBaseHash, pipelineHash); // use heuristics based on parameter patterns to determine if the current drawcall is essential (non-skipable) - bool allowAsyncCompile = false; + bool allowAsyncCompile = false; if (GetConfig().async_compile) allowAsyncCompile = IsAsyncPipelineAllowed(indexCount); @@ -736,8 +736,8 @@ VkDescriptorSetInfo* VulkanRenderer::draw_getOrCreateDescriptorSet(PipelineInfo* LatteTexture* baseTexture = textureView->baseTexture; // get texture register word 0 uint32 word4 = LatteGPUState.contextRegister[texUnitRegIndex 
+ 4]; - - auto imageViewObj = textureView->GetSamplerView(word4); + + auto imageViewObj = textureView->GetSamplerView(word4); info.imageView = imageViewObj->m_textureImageView; vkObjDS->addRef(imageViewObj); @@ -807,7 +807,7 @@ VkDescriptorSetInfo* VulkanRenderer::draw_getOrCreateDescriptorSet(PipelineInfo* VK_SAMPLER_ADDRESS_MODE_REPEAT, // WRAP VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT, // MIRROR VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, // CLAMP_LAST_TEXEL - VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE, // MIRROR_ONCE_LAST_TEXEL + VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE, // MIRROR_ONCE_LAST_TEXEL VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, // unsupported HALF_BORDER VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, // unsupported MIRROR_ONCE_HALF_BORDER VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, // CLAMP_BORDER @@ -935,7 +935,7 @@ VkDescriptorSetInfo* VulkanRenderer::draw_getOrCreateDescriptorSet(PipelineInfo* uniformVarsBufferInfo.buffer = m_uniformVarBuffer; uniformVarsBufferInfo.offset = 0; // fixed offset is always zero since we only use dynamic offsets uniformVarsBufferInfo.range = shader->uniform.uniformRangeSize; - + VkWriteDescriptorSet write_descriptor{}; write_descriptor.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; write_descriptor.dstSet = result; @@ -1235,7 +1235,7 @@ void VulkanRenderer::draw_setRenderPass() draw_endRenderPass(); if (m_state.descriptorSetsChanged) sync_inputTexturesChanged(); - + // assume that FBO changed, update self-dependency state m_state.hasRenderSelfDependency = fboVk->CheckForCollision(m_state.activeVertexDS, m_state.activeGeometryDS, m_state.activePixelDS); diff --git a/src/Cemu/FileCache/FileCache.cpp b/src/Cemu/FileCache/FileCache.cpp index b284b66bd1..820115d665 100644 --- a/src/Cemu/FileCache/FileCache.cpp +++ b/src/Cemu/FileCache/FileCache.cpp @@ -111,7 +111,7 @@ FileCache* FileCache::Create(const fs::path& path, uint32 extraVersion) fileCache->fileTableEntries[0].fileOffset = fileCache->fileTableOffset; 
fileCache->fileTableEntries[0].fileSize = fileCache->fileTableSize; // write header - + fs->writeU32(FILECACHE_MAGIC_V3); fs->writeU32(fileCache->extraVersion); fs->writeU64(fileCache->dataOffset); @@ -316,7 +316,7 @@ bool _uncompressFileData(const uint8* rawData, size_t rawSize, std::vectorSetPosition(this->dataOffset + currentStartOffset); fileStream->writeData(rawData, rawSize); +#ifdef __APPLE__ + fileStream->Flush(); +#endif // write file table entry fileStream->SetPosition(this->dataOffset + this->fileTableOffset + (uint64)(sizeof(FileTableEntry)*entryIndex)); fileStream->writeData(this->fileTableEntries + entryIndex, sizeof(FileTableEntry)); +#ifdef __APPLE__ + fileStream->Flush(); +#endif if (isCompressed) free(rawData); } diff --git a/src/Cemu/Logging/CemuLogging.cpp b/src/Cemu/Logging/CemuLogging.cpp index 6a01e75a61..811ac6b497 100644 --- a/src/Cemu/Logging/CemuLogging.cpp +++ b/src/Cemu/Logging/CemuLogging.cpp @@ -220,7 +220,7 @@ bool cemuLog_log(LogType type, std::string_view text) bool cemuLog_log(LogType type, std::u8string_view text) { - std::basic_string_view s((char*)text.data(), text.size()); + std::basic_string_view s((char*)text.data(), text.size()); return cemuLog_log(type, s); } diff --git a/src/Cemu/Logging/CemuLogging.h b/src/Cemu/Logging/CemuLogging.h index 88b4f3206d..39b27e308d 100644 --- a/src/Cemu/Logging/CemuLogging.h +++ b/src/Cemu/Logging/CemuLogging.h @@ -52,7 +52,7 @@ enum class LogType : sint32 template <> struct fmt::formatter : formatter { template - auto format(std::u8string_view v, FormatContext& ctx) + auto format(std::u8string_view v, FormatContext& ctx) { string_view s((char*)v.data(), v.size()); return formatter::format(s, ctx); @@ -100,7 +100,7 @@ bool cemuLog_log(LogType type, std::basic_string formatStr, TArgs&&... args) } return true; } - + template bool cemuLog_log(LogType type, const T* format, TArgs&&... 
args) { diff --git a/src/Common/unix/FileStream_unix.cpp b/src/Common/unix/FileStream_unix.cpp index 4bc9b52636..0e9f118952 100644 --- a/src/Common/unix/FileStream_unix.cpp +++ b/src/Common/unix/FileStream_unix.cpp @@ -116,6 +116,11 @@ void FileStream::extract(std::vector& data) readData(data.data(), fileSize); } +void FileStream::Flush() +{ + m_fileStream.flush(); +} + uint32 FileStream::readData(void* data, uint32 length) { SyncReadWriteSeek(false); diff --git a/src/Common/unix/FileStream_unix.h b/src/Common/unix/FileStream_unix.h index 12c971d148..0a2fa7ed9f 100644 --- a/src/Common/unix/FileStream_unix.h +++ b/src/Common/unix/FileStream_unix.h @@ -22,6 +22,8 @@ class FileStream bool SetEndOfFile(); void extract(std::vector& data); + void Flush(); + // reading uint32 readData(void* data, uint32 length); bool readU64(uint64& v); diff --git a/src/config/CemuConfig.cpp b/src/config/CemuConfig.cpp index 620f005e8c..dcf136eb76 100644 --- a/src/config/CemuConfig.cpp +++ b/src/config/CemuConfig.cpp @@ -29,7 +29,7 @@ XMLConfigParser CemuConfig::Load(XMLConfigParser& parser) mlc_path = mlc; permanent_storage = parser.get("permanent_storage", permanent_storage); - + proxy_server = parser.get("proxy_server", ""); disable_screensaver = parser.get("disable_screensaver", disable_screensaver); play_boot_sound = parser.get("play_boot_sound", play_boot_sound); @@ -113,7 +113,7 @@ XMLConfigParser CemuConfig::Load(XMLConfigParser& parser) { graphic_pack_entries[path].try_emplace("_disabled", "true"); } - + for (auto preset = element.get("Preset"); preset.valid(); preset = element.get("Preset", preset)) { const std::string category = preset.get("category", ""); @@ -121,13 +121,14 @@ XMLConfigParser CemuConfig::Load(XMLConfigParser& parser) graphic_pack_entries[path].try_emplace(category, active_preset); } } - + } // graphics auto graphic = parser.get("Graphic"); graphic_api = graphic.get("api", kOpenGL); - graphic.get("device", graphic_device_uuid); + graphic.get("vkDevice", 
vk_graphic_device_uuid); + mtl_graphic_device_uuid = graphic.get("mtlDevice", 0); vsync = graphic.get("VSync", 0); gx2drawdone_sync = graphic.get("GX2DrawdoneSync", true); upscale_filter = graphic.get("UpscaleFilter", kBicubicHermiteFilter); @@ -135,6 +136,7 @@ XMLConfigParser CemuConfig::Load(XMLConfigParser& parser) fullscreen_scaling = graphic.get("FullscreenScaling", kKeepAspectRatio); async_compile = graphic.get("AsyncCompile", async_compile); vk_accurate_barriers = graphic.get("vkAccurateBarriers", true); // this used to be "VulkanAccurateBarriers" but because we changed the default to true in 1.27.1 the option name had to be changed + force_mesh_shaders = graphic.get("ForceMeshShaders", false); auto overlay_node = graphic.get("Overlay"); if(overlay_node.valid()) @@ -261,6 +263,8 @@ XMLConfigParser CemuConfig::Load(XMLConfigParser& parser) crash_dump = debug.get("CrashDumpUnix", crash_dump); #endif gdb_port = debug.get("GDBPort", 1337); + gpu_capture_dir = debug.get("GPUCaptureDir", ""); + framebuffer_fetch = debug.get("FramebufferFetch", true); // input auto input = parser.get("Input"); @@ -291,7 +295,7 @@ XMLConfigParser CemuConfig::Save(XMLConfigParser& parser) // config.set("cpu_mode", cpu_mode.GetValue()); //config.set("console_region", console_region.GetValue()); config.set("console_language", console_language.GetValue()); - + // game paths auto game_path_parser = config.set("GamePaths"); for (const auto& entry : game_paths) @@ -332,11 +336,11 @@ XMLConfigParser CemuConfig::Save(XMLConfigParser& parser) entry.set_attribute("disabled", true); continue; } - + auto preset = entry.set("Preset"); if(!kv.first.empty()) preset.set("category", kv.first.c_str()); - + preset.set("preset", kv.second.c_str()); } } @@ -344,9 +348,11 @@ XMLConfigParser CemuConfig::Save(XMLConfigParser& parser) // graphics auto graphic = config.set("Graphic"); graphic.set("api", graphic_api); - graphic.set("device", graphic_device_uuid); + graphic.set("vkDevice", 
vk_graphic_device_uuid); + graphic.set("mtlDevice", mtl_graphic_device_uuid); graphic.set("VSync", vsync); graphic.set("GX2DrawdoneSync", gx2drawdone_sync); + graphic.set("ForceMeshShaders", force_mesh_shaders); //graphic.set("PrecompiledShaders", precompiled_shaders.GetValue()); graphic.set("UpscaleFilter", upscale_filter); graphic.set("DownscaleFilter", downscale_filter); @@ -413,6 +419,8 @@ XMLConfigParser CemuConfig::Save(XMLConfigParser& parser) debug.set("CrashDumpUnix", crash_dump.GetValue()); #endif debug.set("GDBPort", gdb_port); + debug.set("GPUCaptureDir", gpu_capture_dir); + debug.set("FramebufferFetch", framebuffer_fetch); // input auto input = config.set("Input"); diff --git a/src/config/CemuConfig.h b/src/config/CemuConfig.h index ff892fb801..0cdd642e43 100644 --- a/src/config/CemuConfig.h +++ b/src/config/CemuConfig.h @@ -29,7 +29,7 @@ struct GameEntry std::wstring save_folder; std::wstring update_folder; std::wstring dlc_folder; - + uint64 legacy_time_played = 0; uint64 legacy_last_played = 0; @@ -71,6 +71,7 @@ enum GraphicAPI { kOpenGL = 0, kVulkan, + kMetal, }; enum AudioChannels @@ -102,7 +103,7 @@ enum class ScreenPosition kTopRight, kBottomLeft, kBottomCenter, - kBottomRight, + kBottomRight, }; enum class PrecompiledShaderOption @@ -120,6 +121,23 @@ enum class AccurateShaderMulOption }; ENABLE_ENUM_ITERATORS(AccurateShaderMulOption, AccurateShaderMulOption::False, AccurateShaderMulOption::True); +enum class MetalBufferCacheMode +{ + Auto, + DevicePrivate, + DeviceShared, + Host, +}; +ENABLE_ENUM_ITERATORS(MetalBufferCacheMode, MetalBufferCacheMode::Auto, MetalBufferCacheMode::Host); + +enum class PositionInvariance +{ + Auto, + False, + True, +}; +ENABLE_ENUM_ITERATORS(PositionInvariance, PositionInvariance::False, PositionInvariance::True); + enum class CPUMode { SinglecoreInterpreter = 0, @@ -131,7 +149,7 @@ enum class CPUMode ENABLE_ENUM_ITERATORS(CPUMode, CPUMode::SinglecoreInterpreter, CPUMode::Auto); -enum class CPUModeLegacy +enum class 
CPUModeLegacy { SinglecoreInterpreter = 0, SinglecoreRecompiler = 1, @@ -218,6 +236,37 @@ struct fmt::formatter : formatter { } }; template <> +struct fmt::formatter : formatter { + template + auto format(const MetalBufferCacheMode c, FormatContext &ctx) const { + string_view name; + switch (c) + { + case MetalBufferCacheMode::Auto: name = "auto"; break; + case MetalBufferCacheMode::DevicePrivate: name = "device private"; break; + case MetalBufferCacheMode::DeviceShared: name = "device shared"; break; + case MetalBufferCacheMode::Host: name = "host"; break; + default: name = "unknown"; break; + } + return formatter::format(name, ctx); + } +}; +template <> +struct fmt::formatter : formatter { + template + auto format(const PositionInvariance c, FormatContext &ctx) const { + string_view name; + switch (c) + { + case PositionInvariance::Auto: name = "auto"; break; + case PositionInvariance::False: name = "false"; break; + case PositionInvariance::True: name = "true"; break; + default: name = "unknown"; break; + } + return formatter::format(name, ctx); + } +}; +template <> struct fmt::formatter : formatter { template auto format(const CPUMode c, FormatContext &ctx) const { @@ -267,7 +316,7 @@ struct fmt::formatter : formatter { case CafeConsoleRegion::TWN: name = TR_NOOP("Taiwan"); break; case CafeConsoleRegion::Auto: name = TR_NOOP("Auto"); break; default: name = TR_NOOP("many"); break; - + } return formatter::format(name, ctx); } @@ -309,7 +358,7 @@ struct fmt::formatter : formatter { case CrashDump::Lite: name = "Lite"; break; case CrashDump::Full: name = "Full"; break; default: name = "unknown"; break; - + } return formatter::format(name, ctx); } @@ -349,7 +398,7 @@ struct CemuConfig ConfigValue advanced_ppc_logging{ false }; ConfigValue permanent_storage{ true }; - + ConfigValue mlc_path{}; ConfigValue proxy_server{}; @@ -369,7 +418,7 @@ struct CemuConfig // optimized access std::set game_cache_favorites; // per titleId - + struct _path_hash { std::size_t 
operator()(const fs::path& path) const { return fs::hash_value(path); @@ -383,11 +432,13 @@ struct CemuConfig // graphics ConfigValue graphic_api{ kVulkan }; - std::array graphic_device_uuid; - ConfigValue vsync{ 0 }; // 0 = off, 1+ = on depending on render backend - ConfigValue gx2drawdone_sync {true}; + std::array vk_graphic_device_uuid; + uint64 mtl_graphic_device_uuid{ 0 }; + ConfigValue vsync{ 0 }; // 0 = off, 1+ = depending on render backend + ConfigValue gx2drawdone_sync { true }; ConfigValue render_upside_down{ false }; ConfigValue async_compile{ true }; + ConfigValue force_mesh_shaders{ false }; ConfigValue vk_accurate_barriers{ true }; @@ -446,6 +497,8 @@ struct CemuConfig // debug ConfigValueBounds crash_dump{ CrashDump::Disabled }; ConfigValue gdb_port{ 1337 }; + ConfigValue gpu_capture_dir{ "" }; + ConfigValue framebuffer_fetch{ true }; XMLConfigParser Load(XMLConfigParser& parser); XMLConfigParser Save(XMLConfigParser& parser); @@ -457,7 +510,7 @@ struct CemuConfig NetworkService GetAccountNetworkService(uint32 persistentId); void SetAccountSelectedService(uint32 persistentId, NetworkService serviceIndex); - + // emulated usb devices struct { diff --git a/src/gui/wxgui/CMakeLists.txt b/src/gui/wxgui/CMakeLists.txt index 1aa3e0a541..ff66e3d2cd 100644 --- a/src/gui/wxgui/CMakeLists.txt +++ b/src/gui/wxgui/CMakeLists.txt @@ -1,4 +1,4 @@ -add_library(CemuWxGui +add_library(CemuWxGui canvas/IRenderCanvas.h canvas/OpenGLCanvas.cpp canvas/OpenGLCanvas.h @@ -118,6 +118,13 @@ add_library(CemuWxGui wxHelper.h ) +if(ENABLE_METAL) + target_sources(CemuWxGui PRIVATE + canvas/MetalCanvas.cpp + canvas/MetalCanvas.h + ) +endif() + set_property(TARGET CemuWxGui PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") @@ -170,4 +177,4 @@ endif() if(ALLOW_PORTABLE) target_compile_definitions(CemuWxGui PRIVATE CEMU_ALLOW_PORTABLE) -endif () \ No newline at end of file +endif () diff --git a/src/gui/wxgui/CemuApp.cpp b/src/gui/wxgui/CemuApp.cpp index 
c35fb12e7b..1df8611a2e 100644 --- a/src/gui/wxgui/CemuApp.cpp +++ b/src/gui/wxgui/CemuApp.cpp @@ -591,5 +591,3 @@ void CemuApp::ActivateApp(wxActivateEvent& event) g_window_info.app_active = event.GetActive(); event.Skip(); } - - diff --git a/src/gui/wxgui/GameProfileWindow.cpp b/src/gui/wxgui/GameProfileWindow.cpp index 0092bb84bb..f183539540 100644 --- a/src/gui/wxgui/GameProfileWindow.cpp +++ b/src/gui/wxgui/GameProfileWindow.cpp @@ -61,7 +61,7 @@ GameProfileWindow::GameProfileWindow(wxWindow* parent, uint64_t title_id) const sint32 m_cpu_modeNChoices = std::size(cpu_modes); m_cpu_mode = new wxChoice(box, wxID_ANY, wxDefaultPosition, wxDefaultSize, m_cpu_modeNChoices, cpu_modes, 0); m_cpu_mode->SetToolTip(_("Set the CPU emulation mode")); - first_row->Add(m_cpu_mode, 0, wxALL, 5); + first_row->Add(m_cpu_mode, 0, wxALL, 5); first_row->Add(new wxStaticText(box, wxID_ANY, _("Thread quantum")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); @@ -112,10 +112,14 @@ GameProfileWindow::GameProfileWindow(wxWindow* parent, uint64_t title_id) first_row->Add(new wxStaticText(panel, wxID_ANY, _("Graphics API")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); - wxString gapi_values[] = { "", "OpenGL", "Vulkan" }; + wxString gapi_values[] = { "", "OpenGL", "Vulkan", +#if ENABLE_METAL + "Metal" +#endif + }; m_graphic_api = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(gapi_values), gapi_values); first_row->Add(m_graphic_api, 0, wxALL, 5); - + first_row->Add(new wxStaticText(panel, wxID_ANY, _("Shader multiplication accuracy")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); wxString mul_values[] = { _("false"), _("true")}; @@ -123,6 +127,27 @@ GameProfileWindow::GameProfileWindow(wxWindow* parent, uint64_t title_id) m_shader_mul_accuracy->SetToolTip(_("EXPERT OPTION\nControls the accuracy of floating point multiplication in shaders.\n\nRecommended: true")); first_row->Add(m_shader_mul_accuracy, 0, wxALL, 5); + first_row->Add(new wxStaticText(panel, wxID_ANY, 
_("Shader fast math")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); + + wxString math_values[] = { _("false"), _("true") }; + m_shader_fast_math = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(math_values), math_values); + m_shader_fast_math->SetToolTip(_("EXPERT OPTION\nEnables fast math for all shaders. May (rarely) cause graphical bugs.\n\nMetal only\n\nRecommended: true")); + first_row->Add(m_shader_fast_math, 0, wxALL, 5); + + first_row->Add(new wxStaticText(panel, wxID_ANY, _("Metal buffer cache mode")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); + + wxString cache_values[] = { _("auto"), _("device private"), _("device shared"), _("host") }; + m_metal_buffer_cache_mode = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(cache_values), cache_values); + m_metal_buffer_cache_mode->SetToolTip(_("EXPERT OPTION\nDecides how the buffer cache memory will be managed.\n\nMetal only\n\nRecommended: auto")); + first_row->Add(m_metal_buffer_cache_mode, 0, wxALL, 5); + + first_row->Add(new wxStaticText(panel, wxID_ANY, _("Position invariance")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); + + wxString pos_values[] = { _("auto"), _("false"), _("true") }; + m_position_invariance = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(pos_values), pos_values); + m_position_invariance->SetToolTip(_("EXPERT OPTION\nDisables most optimizations for vertex positions. 
May fix polygon cutouts or flickering in some games.\n\nMetal only\n\nRecommended: auto")); + first_row->Add(m_position_invariance, 0, wxALL, 5); + /*first_row->Add(new wxStaticText(panel, wxID_ANY, _("GPU buffer cache accuracy")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); wxString accuarcy_values[] = { _("high"), _("medium"), _("low") }; m_cache_accuracy = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(accuarcy_values), accuarcy_values); @@ -249,7 +274,7 @@ void GameProfileWindow::ApplyProfile() // general m_load_libs->SetValue(m_game_profile.m_loadSharedLibraries.value()); m_start_with_padview->SetValue(m_game_profile.m_startWithPadView); - + // cpu // wxString cpu_modes[] = { _("Singlecore-Interpreter"), _("Singlecore-Recompiler"), _("Triplecore-Recompiler"), _("Auto (recommended)") }; switch(m_game_profile.m_cpuMode.value()) @@ -258,24 +283,27 @@ void GameProfileWindow::ApplyProfile() case CPUMode::SinglecoreRecompiler: m_cpu_mode->SetSelection(1); break; case CPUMode::DualcoreRecompiler: m_cpu_mode->SetSelection(2); break; case CPUMode::MulticoreRecompiler: m_cpu_mode->SetSelection(2); break; - default: m_cpu_mode->SetSelection(3); + default: m_cpu_mode->SetSelection(3); } - + m_thread_quantum->SetStringSelection(fmt::format("{}", m_game_profile.m_threadQuantum)); // gpu if (!m_game_profile.m_graphics_api.has_value()) m_graphic_api->SetSelection(0); // selecting "" else - m_graphic_api->SetSelection(1 + m_game_profile.m_graphics_api.value()); // "", OpenGL, Vulkan + m_graphic_api->SetSelection(1 + m_game_profile.m_graphics_api.value()); // "", OpenGL, Vulkan, Metal m_shader_mul_accuracy->SetSelection((int)m_game_profile.m_accurateShaderMul); + m_shader_fast_math->SetSelection((int)m_game_profile.m_shaderFastMath); + m_metal_buffer_cache_mode->SetSelection((int)m_game_profile.m_metalBufferCacheMode); + m_position_invariance->SetSelection((int)m_game_profile.m_positionInvariance); //// audio 
//m_disable_audio->Set3StateValue(GetCheckboxState(m_game_profile.disableAudio)); // controller auto profiles = InputManager::get_profiles(); - + for (const auto& cb : m_controller_profile) { cb->Clear(); @@ -293,7 +321,7 @@ void GameProfileWindow::ApplyProfile() const auto& v = m_game_profile.m_controllerProfile[i].value(); m_controller_profile[i]->SetStringSelection(wxString::FromUTF8(v)); } - + else m_controller_profile[i]->SetSelection(wxNOT_FOUND); } @@ -317,7 +345,7 @@ void GameProfileWindow::SaveProfile() m_game_profile.m_cpuMode = CPUMode::Auto; } - + const wxString thread_quantum = m_thread_quantum->GetStringSelection(); if (!thread_quantum.empty()) { @@ -330,11 +358,14 @@ void GameProfileWindow::SaveProfile() m_game_profile.m_accurateShaderMul = (AccurateShaderMulOption)m_shader_mul_accuracy->GetSelection(); if (m_game_profile.m_accurateShaderMul != AccurateShaderMulOption::False && m_game_profile.m_accurateShaderMul != AccurateShaderMulOption::True) m_game_profile.m_accurateShaderMul = AccurateShaderMulOption::True; // force a legal value + m_game_profile.m_shaderFastMath = (bool)m_shader_fast_math->GetSelection(); + m_game_profile.m_metalBufferCacheMode = (MetalBufferCacheMode)m_metal_buffer_cache_mode->GetSelection(); + m_game_profile.m_positionInvariance = (PositionInvariance)m_position_invariance->GetSelection(); if (m_graphic_api->GetSelection() == 0) m_game_profile.m_graphics_api = {}; else - m_game_profile.m_graphics_api = (GraphicAPI)(m_graphic_api->GetSelection() - 1); // "", OpenGL, Vulkan + m_game_profile.m_graphics_api = (GraphicAPI)(m_graphic_api->GetSelection() - 1); // "", OpenGL, Vulkan, Metal // controller for (int i = 0; i < 8; ++i) diff --git a/src/gui/wxgui/GameProfileWindow.h b/src/gui/wxgui/GameProfileWindow.h index 6ca36de681..6eaed1a1a4 100644 --- a/src/gui/wxgui/GameProfileWindow.h +++ b/src/gui/wxgui/GameProfileWindow.h @@ -40,6 +40,9 @@ class GameProfileWindow : public wxFrame wxChoice* m_graphic_api; wxChoice* 
m_shader_mul_accuracy; + wxChoice* m_shader_fast_math; + wxChoice* m_metal_buffer_cache_mode; + wxChoice* m_position_invariance; //wxChoice* m_cache_accuracy; // audio @@ -47,4 +50,4 @@ class GameProfileWindow : public wxFrame // controller wxComboBox* m_controller_profile[8]; -}; \ No newline at end of file +}; diff --git a/src/gui/wxgui/GeneralSettings2.cpp b/src/gui/wxgui/GeneralSettings2.cpp index 85648be6a3..4c83f6987e 100644 --- a/src/gui/wxgui/GeneralSettings2.cpp +++ b/src/gui/wxgui/GeneralSettings2.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -28,6 +29,9 @@ #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h" #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h" +#if ENABLE_METAL +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#endif #include "Cafe/Account/Account.h" #include @@ -94,6 +98,19 @@ class wxVulkanUUID : public wxClientData VulkanRenderer::DeviceInfo m_device_info; }; +#if ENABLE_METAL +class wxMetalUUID : public wxClientData +{ +public: + wxMetalUUID(const MetalRenderer::DeviceInfo& info) + : m_device_info(info) {} + const MetalRenderer::DeviceInfo& GetDeviceInfo() const { return m_device_info; } + +private: + MetalRenderer::DeviceInfo m_device_info; +}; +#endif + class wxAccountData : public wxClientData { public: @@ -102,7 +119,7 @@ class wxAccountData : public wxClientData Account& GetAccount() { return m_account; } const Account& GetAccount() const { return m_account; } - + private: Account m_account; }; @@ -336,12 +353,14 @@ wxPanel* GeneralSettings2::AddGraphicsPage(wxNotebook* notebook) row->Add(new wxStaticText(box, wxID_ANY, _("Graphics API")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); sint32 api_size = 1; - wxString choices[2] = { "OpenGL" }; + wxString choices[3] = { "OpenGL" }; if (g_vulkan_available) { - choices[1] = "Vulkan"; - api_size = 2; + choices[api_size++] = "Vulkan"; } +#if ENABLE_METAL + choices[api_size++] = "Metal"; +#endif m_graphic_api = new wxChoice(box, wxID_ANY, 
wxDefaultPosition, wxDefaultSize, api_size, choices); m_graphic_api->SetSelection(0); @@ -373,6 +392,10 @@ wxPanel* GeneralSettings2::AddGraphicsPage(wxNotebook* notebook) m_gx2drawdone_sync->SetToolTip(_("If synchronization is requested by the game, the emulated CPU will wait for the GPU to finish all operations.\nThis is more accurate behavior, but may cause lower performance")); graphic_misc_row->Add(m_gx2drawdone_sync, 0, wxALL, 5); + m_force_mesh_shaders = new wxCheckBox(box, wxID_ANY, _("Force mesh shaders")); + m_force_mesh_shaders->SetToolTip(_("Force mesh shaders on all GPUs that support them. Mesh shaders are disabled by default on Intel GPUs due to potential stability issues")); + graphic_misc_row->Add(m_force_mesh_shaders, 0, wxALL, 5); + box_sizer->Add(graphic_misc_row, 1, wxEXPAND, 5); graphics_panel_sizer->Add(box_sizer, 0, wxEXPAND | wxALL, 5); } @@ -818,7 +841,7 @@ wxPanel* GeneralSettings2::AddAccountPage(wxNotebook* notebook) auto* row = new wxFlexGridSizer(0, 2, 0, 0); row->SetFlexibleDirection(wxBOTH); row->SetNonFlexibleGrowMode(wxFLEX_GROWMODE_SPECIFIED); - + const wxImage tmp = wxBITMAP_PNG_FROM_DATA(PNG_ERROR).ConvertToImage(); m_validate_online = new wxBitmapButton(box, wxID_ANY, tmp.Scale(16, 16)); m_validate_online->Bind(wxEVT_BUTTON, &GeneralSettings2::OnShowOnlineValidator, this); @@ -828,7 +851,7 @@ wxPanel* GeneralSettings2::AddAccountPage(wxNotebook* notebook) row->Add(m_online_status, 1, wxALL | wxALIGN_CENTRE_VERTICAL, 5); box_sizer->Add(row, 1, wxEXPAND, 5); - + auto* tutorial_link = new wxHyperlinkCtrl(box, wxID_ANY, _("Online play tutorial"), "https://cemu.info/online-guide"); box_sizer->Add(tutorial_link, 0, wxALL, 5); @@ -928,6 +951,33 @@ wxPanel* GeneralSettings2::AddDebugPage(wxNotebook* notebook) debug_panel_sizer->Add(debug_row, 0, wxALL | wxEXPAND, 5); } + { + auto* debug_row = new wxFlexGridSizer(0, 2, 0, 0); + debug_row->SetFlexibleDirection(wxBOTH); + debug_row->SetNonFlexibleGrowMode(wxFLEX_GROWMODE_SPECIFIED); + + 
debug_row->Add(new wxStaticText(panel, wxID_ANY, _("GPU capture save directory"), wxDefaultPosition, wxDefaultSize, 0), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); + + m_gpu_capture_dir = new wxTextCtrl(panel, wxID_ANY, wxEmptyString, wxDefaultPosition, wxDefaultSize, wxTE_DONTWRAP); + m_gpu_capture_dir->SetMinSize(wxSize(150, -1)); + m_gpu_capture_dir->SetToolTip(_("Cemu will save the GPU captures done by selecting Debug -> GPU capture in the menu bar in this directory. If a debugger with support for GPU captures (like Xcode) is attached, the capture will be opened in that debugger instead. If such debugger is not attached, METAL_CAPTURE_ENABLED must be set to 1 as an environment variable.")); + + debug_row->Add(m_gpu_capture_dir, 0, wxALL | wxEXPAND, 5); + debug_panel_sizer->Add(debug_row, 0, wxALL | wxEXPAND, 5); + } + + { + auto* debug_row = new wxFlexGridSizer(0, 2, 0, 0); + debug_row->SetFlexibleDirection(wxBOTH); + debug_row->SetNonFlexibleGrowMode(wxFLEX_GROWMODE_SPECIFIED); + + m_framebuffer_fetch = new wxCheckBox(panel, wxID_ANY, _("Framebuffer fetch")); + m_framebuffer_fetch->SetToolTip(_("Enable framebuffer fetch for eligible textures on supported devices.")); + + debug_row->Add(m_framebuffer_fetch, 0, wxALL | wxEXPAND, 5); + debug_panel_sizer->Add(debug_row, 0, wxALL | wxEXPAND, 5); + } + panel->SetSizerAndFit(debug_panel_sizer); return panel; @@ -943,14 +993,14 @@ GeneralSettings2::GeneralSettings2(wxWindow* parent, bool game_launched) notebook->AddPage(AddGeneralPage(notebook), _("General")); notebook->AddPage(AddGraphicsPage(notebook), _("Graphics")); - notebook->AddPage(AddAudioPage(notebook), _("Audio")); + notebook->AddPage(AddAudioPage(notebook), _("Audio")); notebook->AddPage(AddOverlayPage(notebook), _("Overlay")); notebook->AddPage(AddAccountPage(notebook), _("Account")); notebook->AddPage(AddDebugPage(notebook), _("Debug")); Bind(wxEVT_CLOSE_WINDOW, &GeneralSettings2::OnClose, this); - // + // sizer->Add(notebook, 1, wxEXPAND | wxALL, 5); @@ 
-965,7 +1015,7 @@ GeneralSettings2::GeneralSettings2(wxWindow* parent, bool game_launched) ApplyConfig(); HandleGraphicsApiSelection(); - + DisableSettings(game_launched); } @@ -977,7 +1027,7 @@ uint32 GeneralSettings2::GetSelectedAccountPersistentId() return dynamic_cast(m_active_account->GetClientObject(active_account))->GetAccount().GetPersistentId(); } -void GeneralSettings2::StoreConfig() +void GeneralSettings2::StoreConfig() { auto* app = (CemuApp*)wxTheApp; auto& config = GetConfig(); @@ -1002,7 +1052,6 @@ void GeneralSettings2::StoreConfig() ScreenSaver::SetInhibit(config.disable_screensaver); } - // -1 is default wx widget value -> set to dummy 0 so mainwindow and padwindow will update it wxGuiConfig.window_position = m_save_window_position_size->IsChecked() ? Vector2i{ 0,0 } : Vector2i{-1,-1}; wxGuiConfig.window_size = m_save_window_position_size->IsChecked() ? Vector2i{ 0,0 } : Vector2i{-1,-1}; @@ -1045,7 +1094,7 @@ void GeneralSettings2::StoreConfig() config.pad_channels = kStereo; // (AudioChannels)m_pad_channels->GetSelection(); //config.input_channels = (AudioChannels)m_input_channels->GetSelection(); config.input_channels = kMono; // (AudioChannels)m_input_channels->GetSelection(); - + config.tv_volume = m_tv_volume->GetValue(); config.pad_volume = m_pad_volume->GetValue(); config.input_volume = m_input_volume->GetValue(); @@ -1091,26 +1140,45 @@ void GeneralSettings2::StoreConfig() config.graphic_api = (GraphicAPI)m_graphic_api->GetSelection(); selection = m_graphic_device->GetSelection(); - if(selection != wxNOT_FOUND) - { - const auto* info = (wxVulkanUUID*)m_graphic_device->GetClientObject(selection); - if(info) - config.graphic_device_uuid = info->GetDeviceInfo().uuid; - else - config.graphic_device_uuid = {}; + if (config.graphic_api == GraphicAPI::kVulkan) + { + if (selection != wxNOT_FOUND) + { + const auto* info = (wxVulkanUUID*)m_graphic_device->GetClientObject(selection); + if (info) + config.vk_graphic_device_uuid = 
info->GetDeviceInfo().uuid; + else + config.vk_graphic_device_uuid = {}; + } + else + config.vk_graphic_device_uuid = {}; + } + else if (config.graphic_api == GraphicAPI::kMetal) + { + if (selection != wxNOT_FOUND) + { +#if ENABLE_METAL + const auto* info = (wxMetalUUID*)m_graphic_device->GetClientObject(selection); + if (info) + config.mtl_graphic_device_uuid = info->GetDeviceInfo().uuid; + else + config.mtl_graphic_device_uuid = {}; +#endif + } + else + config.mtl_graphic_device_uuid = {}; } - else - config.graphic_device_uuid = {}; - + config.vsync = m_vsync->GetSelection(); config.gx2drawdone_sync = m_gx2drawdone_sync->IsChecked(); + config.force_mesh_shaders = m_force_mesh_shaders->IsChecked(); config.async_compile = m_async_compile->IsChecked(); - + config.upscale_filter = m_upscale_filter->GetSelection(); config.downscale_filter = m_downscale_filter->GetSelection(); config.fullscreen_scaling = m_fullscreen_scaling->GetSelection(); - + config.overlay.position = (ScreenPosition)m_overlay_position->GetSelection(); wxASSERT((int)config.overlay.position <= (int)ScreenPosition::kBottomRight); config.overlay.text_color = m_overlay_font_color->GetColour().GetRGBA(); config.overlay.text_scale = m_overlay_scale->GetSelection() * 25 + 50; @@ -1137,6 +1205,8 @@ void GeneralSettings2::StoreConfig() // debug config.crash_dump = (CrashDump)m_crash_dump->GetSelection(); config.gdb_port = m_gdb_port->GetValue(); + config.gpu_capture_dir = m_gpu_capture_dir->GetValue().utf8_string(); + config.framebuffer_fetch = m_framebuffer_fetch->IsChecked(); GetConfigHandle().Save(); } @@ -1178,7 +1248,7 @@ void GeneralSettings2::OnAudioLatencyChanged(wxCommandEvent& event) void GeneralSettings2::OnVolumeChanged(wxCommandEvent& event) { - + if(event.GetEventObject() == m_input_volume) { std::shared_lock lock(g_audioInputMutex); @@ -1206,7 +1276,7 @@ void GeneralSettings2::OnVolumeChanged(wxCommandEvent& event) g_portalAudio->SetVolume(event.GetInt()); } } - + event.Skip(); } @@ -1299,7 
+1369,7 @@ void GeneralSettings2::UpdateAudioDeviceList() // todo reset global instance of audio device } -void GeneralSettings2::ResetAccountInformation() +void GeneralSettings2::ResetAccountInformation() { m_account_grid->SetSplitterPosition(100); m_active_account->SetSelection(0); @@ -1327,7 +1397,7 @@ void GeneralSettings2::OnAccountCreate(wxCommandEvent& event) Account account(dialog.GetPersistentId(), dialog.GetMiiName().ToStdWstring()); account.Save(); Account::RefreshAccounts(); - + const int index = m_active_account->Append(account.ToString(), new wxAccountData(account)); // update ui @@ -1336,7 +1406,7 @@ void GeneralSettings2::OnAccountCreate(wxCommandEvent& event) m_create_account->Enable(m_active_account->GetCount() < 0xC); m_delete_account->Enable(m_active_account->GetCount() > 1); - + // send main window event wxASSERT(GetParent()); wxCommandEvent refresh_event(wxEVT_ACCOUNTLIST_REFRESH); @@ -1366,7 +1436,7 @@ void GeneralSettings2::OnAccountDelete(wxCommandEvent& event) return; // todo: ask if saves should be deleted too? 
- + const fs::path path = account.GetFileName(); try { @@ -1384,7 +1454,7 @@ void GeneralSettings2::OnAccountDelete(wxCommandEvent& event) SystemException sys(ex); cemuLog_log(LogType::Force, sys.what()); } - + } void GeneralSettings2::OnAccountSettingsChanged(wxPropertyGridEvent& event) @@ -1439,7 +1509,7 @@ void GeneralSettings2::OnAccountSettingsChanged(wxPropertyGridEvent& event) else if (property->GetName() == kPropertyEmail) { account.SetEmail(value.As().ToStdString()); - + } else if (property->GetName() == kPropertyCountry) { @@ -1447,7 +1517,7 @@ void GeneralSettings2::OnAccountSettingsChanged(wxPropertyGridEvent& event) } else cemu_assert_debug(false); - + account.Save(); Account::RefreshAccounts(); // refresh internal account list UpdateAccountInformation(); // refresh on invalid values @@ -1487,7 +1557,7 @@ void GeneralSettings2::UpdateAccountInformation() gender_property->SetChoiceSelection(std::min(gender_property->GetChoices().GetCount() - 1, (uint32)account.GetGender())); m_account_grid->GetProperty(kPropertyEmail)->SetValueFromString(std::string{ account.GetEmail() }); - + auto* country_property = dynamic_cast(m_account_grid->GetProperty(kPropertyCountry)); wxASSERT(country_property); int index = (country_property)->GetIndexForValue(account.GetCountry()); @@ -1571,9 +1641,9 @@ void GeneralSettings2::HandleGraphicsApiSelection() int selection = m_vsync->GetSelection(); if(selection == wxNOT_FOUND) selection = GetConfig().vsync; - + m_vsync->Clear(); - if(m_graphic_api->GetSelection() == 0) + if (m_graphic_api->GetSelection() == 0) { // OpenGL m_vsync->AppendString(_("Off")); @@ -1588,12 +1658,14 @@ void GeneralSettings2::HandleGraphicsApiSelection() m_gx2drawdone_sync->Enable(); m_async_compile->Disable(); + m_force_mesh_shaders->Disable(); } - else + else if (m_graphic_api->GetSelection() == 1) { // Vulkan m_gx2drawdone_sync->Disable(); m_async_compile->Enable(); + m_force_mesh_shaders->Disable(); m_vsync->AppendString(_("Off")); 
m_vsync->AppendString(_("Double buffering")); @@ -1603,7 +1675,7 @@ void GeneralSettings2::HandleGraphicsApiSelection() #endif m_vsync->Select(selection); - + m_graphic_device->Enable(); auto devices = VulkanRenderer::GetDevices(); m_graphic_device->Clear(); @@ -1618,7 +1690,7 @@ void GeneralSettings2::HandleGraphicsApiSelection() const auto& config = GetConfig(); for(size_t i = 0; i < devices.size(); ++i) { - if(config.graphic_device_uuid == devices[i].uuid) + if(config.vk_graphic_device_uuid == devices[i].uuid) { m_graphic_device->SetSelection(i); break; @@ -1626,6 +1698,42 @@ void GeneralSettings2::HandleGraphicsApiSelection() } } } + else + { + // Metal + m_gx2drawdone_sync->Disable(); + m_async_compile->Enable(); + m_force_mesh_shaders->Enable(); + + m_vsync->AppendString(_("Off")); + m_vsync->AppendString(_("On")); + + m_vsync->Select(selection); + + m_graphic_device->Enable(); + m_graphic_device->Clear(); +#if ENABLE_METAL + auto devices = MetalRenderer::GetDevices(); + if(!devices.empty()) + { + for (const auto& device : devices) + { + m_graphic_device->Append(device.name, new wxMetalUUID(device)); + } + m_graphic_device->SetSelection(0); + + const auto& config = GetConfig(); + for (size_t i = 0; i < devices.size(); ++i) + { + if (config.mtl_graphic_device_uuid == devices[i].uuid) + { + m_graphic_device->SetSelection(i); + break; + } + } + } +#endif + } } void GeneralSettings2::ApplyConfig() @@ -1683,6 +1791,7 @@ void GeneralSettings2::ApplyConfig() m_vsync->SetSelection(config.vsync); m_async_compile->SetValue(config.async_compile); m_gx2drawdone_sync->SetValue(config.gx2drawdone_sync); + m_force_mesh_shaders->SetValue(config.force_mesh_shaders); m_upscale_filter->SetSelection(config.upscale_filter); m_downscale_filter->SetSelection(config.downscale_filter); m_fullscreen_scaling->SetSelection(config.fullscreen_scaling); @@ -1733,7 +1842,7 @@ void GeneralSettings2::ApplyConfig() m_pad_channels->SetSelection(0); 
//m_input_channels->SetSelection(config.pad_channels); m_input_channels->SetSelection(0); - + SendSliderEvent(m_tv_volume, config.tv_volume); if (!config.tv_device.empty() && m_tv_device->HasClientObjectData()) @@ -1750,7 +1859,7 @@ void GeneralSettings2::ApplyConfig() } else m_tv_device->SetSelection(0); - + SendSliderEvent(m_pad_volume, config.pad_volume); if (!config.pad_device.empty() && m_pad_device->HasClientObjectData()) { @@ -1819,6 +1928,8 @@ void GeneralSettings2::ApplyConfig() // debug m_crash_dump->SetSelection((int)config.crash_dump.GetValue()); m_gdb_port->SetValue(config.gdb_port.GetValue()); + m_gpu_capture_dir->SetValue(wxHelper::FromUtf8(config.gpu_capture_dir.GetValue())); + m_framebuffer_fetch->SetValue(config.framebuffer_fetch); } void GeneralSettings2::OnAudioAPISelected(wxCommandEvent& event) @@ -1886,7 +1997,7 @@ void GeneralSettings2::UpdateAudioDevice() } } } - + // pad audio device { const auto selection = m_pad_device->GetSelection(); @@ -1980,7 +2091,7 @@ void GeneralSettings2::UpdateAudioDevice() channels = g_portalAudio->GetChannels(); else channels = 1; - + try { g_portalAudio = IAudioAPI::CreateDevice((IAudioAPI::AudioAPI)config.audio_api, description->GetDescription(), 8000, 1, 32, 16); @@ -2013,14 +2124,14 @@ void GeneralSettings2::OnAudioChannelsSelected(wxCommandEvent& event) { if (config.tv_channels == (AudioChannels)obj->GetSelection()) return; - + config.tv_channels = (AudioChannels)obj->GetSelection(); } else if (obj == m_pad_channels) { if (config.pad_channels == (AudioChannels)obj->GetSelection()) return; - + config.pad_channels = (AudioChannels)obj->GetSelection(); } else @@ -2163,23 +2274,23 @@ void GeneralSettings2::OnShowOnlineValidator(wxCommandEvent& event) const auto selection = m_active_account->GetSelection(); if (selection == wxNOT_FOUND) return; - + const auto* obj = dynamic_cast(m_active_account->GetClientObject(selection)); wxASSERT(obj); const auto& account = obj->GetAccount(); - + const auto validator = 
account.ValidateOnlineFiles(); if (validator) // everything valid? shouldn't happen return; - + wxString err; err << _("The following error(s) have been found:") << '\n'; - + if (validator.otp == OnlineValidator::FileState::Missing) err << _("otp.bin missing in Cemu directory") << '\n'; else if(validator.otp == OnlineValidator::FileState::Corrupted) err << _("otp.bin is invalid") << '\n'; - + if (validator.seeprom == OnlineValidator::FileState::Missing) err << _("seeprom.bin missing in Cemu directory") << '\n'; else if(validator.seeprom == OnlineValidator::FileState::Corrupted) diff --git a/src/gui/wxgui/GeneralSettings2.h b/src/gui/wxgui/GeneralSettings2.h index fb0cfe872d..dbd2443360 100644 --- a/src/gui/wxgui/GeneralSettings2.h +++ b/src/gui/wxgui/GeneralSettings2.h @@ -28,7 +28,7 @@ class GeneralSettings2 : public wxDialog bool m_has_account_change = false; // keep track of dirty state of accounts - + wxPanel* AddGeneralPage(wxNotebook* notebook); wxPanel* AddGraphicsPage(wxNotebook* notebook); wxPanel* AddAudioPage(wxNotebook* notebook); @@ -56,7 +56,7 @@ class GeneralSettings2 : public wxDialog // Graphics wxChoice* m_graphic_api, * m_graphic_device; wxChoice* m_vsync; - wxCheckBox *m_async_compile, *m_gx2drawdone_sync; + wxCheckBox *m_async_compile, *m_gx2drawdone_sync, *m_force_mesh_shaders; wxRadioBox* m_upscale_filter, *m_downscale_filter, *m_fullscreen_scaling; wxChoice* m_overlay_position, *m_notification_position, *m_overlay_scale, *m_notification_scale; wxCheckBox* m_controller_profile_name, *m_controller_low_battery, *m_shader_compiling, *m_friends_data; @@ -82,6 +82,8 @@ class GeneralSettings2 : public wxDialog // Debug wxChoice* m_crash_dump; wxSpinCtrl* m_gdb_port; + wxTextCtrl* m_gpu_capture_dir; + wxCheckBox* m_framebuffer_fetch; void OnAccountCreate(wxCommandEvent& event); void OnAccountDelete(wxCommandEvent& event); @@ -110,11 +112,10 @@ class GeneralSettings2 : public wxDialog void UpdateAudioDevice(); // refreshes audio device list for 
dropdown void UpdateAudioDeviceList(); - + void ResetAccountInformation(); void UpdateAccountInformation(); void UpdateOnlineAccounts(); void HandleGraphicsApiSelection(); void ApplyConfig(); }; - diff --git a/src/gui/wxgui/LoggingWindow.cpp b/src/gui/wxgui/LoggingWindow.cpp index c4bfdccb6d..3fd68c43a2 100644 --- a/src/gui/wxgui/LoggingWindow.cpp +++ b/src/gui/wxgui/LoggingWindow.cpp @@ -18,7 +18,7 @@ LoggingWindow::LoggingWindow(wxFrame* parent) filter_row->Add(new wxStaticText( this, wxID_ANY, _("Filter")), 0, wxALIGN_CENTER_VERTICAL|wxALL, 5 ); - wxString choices[] = {"Unsupported APIs calls", "Coreinit Logging", "Coreinit File-Access", "Coreinit Thread-Synchronization", "Coreinit Memory", "Coreinit MP", "Coreinit Thread", "nn::nfp", "GX2", "Audio", "Input", "Socket", "Save", "H264", "Graphic pack patches", "Texture cache", "Texture readback", "OpenGL debug output", "Vulkan validation layer"}; + wxString choices[] = {"Unsupported APIs calls", "Coreinit Logging", "Coreinit File-Access", "Coreinit Thread-Synchronization", "Coreinit Memory", "Coreinit MP", "Coreinit Thread", "nn::nfp", "GX2", "Audio", "Input", "Socket", "Save", "H264", "Graphic pack patches", "Texture cache", "Texture readback", "OpenGL debug output", "Vulkan validation layer", "Metal debug output"}; m_filter = new wxComboBox( this, wxID_ANY, wxEmptyString, wxDefaultPosition, wxDefaultSize, std::size(choices), choices, 0 ); m_filter->Bind(wxEVT_COMBOBOX, &LoggingWindow::OnFilterChange, this); m_filter->Bind(wxEVT_TEXT, &LoggingWindow::OnFilterChange, this); @@ -69,7 +69,7 @@ void LoggingWindow::Log(std::string_view filter, std::wstring_view message) void LoggingWindow::OnLogMessage(wxLogEvent& event) { - m_log_list->PushEntry(event.GetFilter(), event.GetMessage()); + m_log_list->PushEntry(event.GetFilter(), event.GetMessage()); } void LoggingWindow::OnFilterChange(wxCommandEvent& event) @@ -83,4 +83,3 @@ void LoggingWindow::OnFilterMessageChange(wxCommandEvent& event) 
m_log_list->SetFilterMessage(m_filter_message->GetValue()); event.Skip(); } - diff --git a/src/gui/wxgui/MainWindow.cpp b/src/gui/wxgui/MainWindow.cpp index c4ace87fde..e0aa68ab22 100644 --- a/src/gui/wxgui/MainWindow.cpp +++ b/src/gui/wxgui/MainWindow.cpp @@ -14,6 +14,7 @@ #include "AudioDebuggerWindow.h" #include "wxgui/canvas/OpenGLCanvas.h" #include "wxgui/canvas/VulkanCanvas.h" +#include "wxgui/canvas/MetalCanvas.h" #include "Cafe/OS/libs/nfc/nfc.h" #include "Cafe/OS/libs/swkbd/swkbd.h" #include "wxgui/debugger/DebuggerWindow2.h" @@ -64,6 +65,10 @@ #include "gamemode_client.h" #endif +#if ENABLE_METAL +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#endif + #include "Cafe/TitleList/TitleInfo.h" #include "Cafe/TitleList/TitleList.h" #include "wxHelper.h" @@ -99,7 +104,7 @@ enum // options -> account MAINFRAME_MENU_ID_OPTIONS_ACCOUNT_1 = 20350, MAINFRAME_MENU_ID_OPTIONS_ACCOUNT_12 = 20350 + 11, - + // options -> system language MAINFRAME_MENU_ID_OPTIONS_LANGUAGE_JAPANESE = 20500, MAINFRAME_MENU_ID_OPTIONS_LANGUAGE_ENGLISH, @@ -142,6 +147,7 @@ enum MAINFRAME_MENU_ID_DEBUG_VIEW_TEXTURE_RELATIONS, MAINFRAME_MENU_ID_DEBUG_AUDIO_AUX_ONLY, MAINFRAME_MENU_ID_DEBUG_VK_ACCURATE_BARRIERS, + MAINFRAME_MENU_ID_DEBUG_GPU_CAPTURE, // debug->logging MAINFRAME_MENU_ID_DEBUG_LOGGING_MESSAGE = 21499, @@ -223,6 +229,7 @@ EVT_MENU(MAINFRAME_MENU_ID_DEBUG_DUMP_CURL_REQUESTS, MainWindow::OnDebugSetting) EVT_MENU(MAINFRAME_MENU_ID_DEBUG_RENDER_UPSIDE_DOWN, MainWindow::OnDebugSetting) EVT_MENU(MAINFRAME_MENU_ID_DEBUG_AUDIO_AUX_ONLY, MainWindow::OnDebugSetting) EVT_MENU(MAINFRAME_MENU_ID_DEBUG_VK_ACCURATE_BARRIERS, MainWindow::OnDebugSetting) +EVT_MENU(MAINFRAME_MENU_ID_DEBUG_GPU_CAPTURE, MainWindow::OnDebugSetting) EVT_MENU(MAINFRAME_MENU_ID_DEBUG_DUMP_RAM, MainWindow::OnDebugSetting) EVT_MENU(MAINFRAME_MENU_ID_DEBUG_DUMP_FST, MainWindow::OnDebugSetting) // debug -> View ... 
@@ -255,7 +262,7 @@ class wxGameDropTarget : public wxFileDropTarget { if(!m_window->IsGameLaunched() && filenames.GetCount() == 1) return m_window->FileLoad(_utf8ToPath(filenames[0].utf8_string()), wxLaunchGameEvent::INITIATED_BY::DRAG_AND_DROP); - + return false; } @@ -471,7 +478,7 @@ bool MainWindow::InstallUpdate(const fs::path& metaFilePath) { throw std::runtime_error(frame.GetExceptionMessage()); } - } + } } catch(const AbortException&) { @@ -655,13 +662,13 @@ void MainWindow::OnFileMenu(wxCommandEvent& event) _("Wii U executable (*.rpx, *.elf)"), _("All files (*.*)") ); - + wxFileDialog openFileDialog(this, _("Open file to launch"), wxEmptyString, wxEmptyString, wildcard, wxFD_OPEN | wxFD_FILE_MUST_EXIST); if (openFileDialog.ShowModal() == wxID_CANCEL || openFileDialog.GetPath().IsEmpty()) return; - const wxString wxStrFilePath = openFileDialog.GetPath(); + const wxString wxStrFilePath = openFileDialog.GetPath(); FileLoad(_utf8ToPath(wxStrFilePath.utf8_string()), wxLaunchGameEvent::INITIATED_BY::MENU); } else if (menuId >= MAINFRAME_MENU_ID_FILE_RECENT_0 && menuId <= MAINFRAME_MENU_ID_FILE_RECENT_LAST) @@ -805,7 +812,7 @@ void MainWindow::TogglePadView() { if (m_padView) return; - + m_padView = new PadViewFrame(this); m_padView->Bind(wxEVT_CLOSE_WINDOW, &MainWindow::OnPadClose, this); @@ -1020,7 +1027,7 @@ void MainWindow::OnConsoleLanguage(wxCommandEvent& event) // GetConfig().cpu_mode = CPUMode::TriplecoreRecompiler; // else // cemu_assert_debug(false); -// +// // GetConfigHandle().Save(); //} @@ -1034,6 +1041,14 @@ void MainWindow::OnDebugSetting(wxCommandEvent& event) if(!GetConfig().vk_accurate_barriers) wxMessageBox(_("Warning: Disabling the accurate barriers option will lead to flickering graphics but may improve performance. 
It is highly recommended to leave it turned on."), _("Accurate barriers are off"), wxOK); } + else if (event.GetId() == MAINFRAME_MENU_ID_DEBUG_GPU_CAPTURE) + { + cemu_assert_debug(g_renderer->GetType() == RendererAPI::Metal); + +#if ENABLE_METAL + static_cast(g_renderer.get())->CaptureFrame(); +#endif + } else if (event.GetId() == MAINFRAME_MENU_ID_DEBUG_AUDIO_AUX_ONLY) ActiveSettings::EnableAudioOnlyAux(event.IsChecked()); else if (event.GetId() == MAINFRAME_MENU_ID_DEBUG_DUMP_RAM) @@ -1084,7 +1099,7 @@ void MainWindow::OnDebugSetting(wxCommandEvent& event) ActiveSettings::SetTimerShiftFactor(6); else cemu_assert_debug(false); - + GetConfigHandle().Save(); } @@ -1156,7 +1171,7 @@ void MainWindow::OnLoggingWindow(wxCommandEvent& event) return; m_logging_window = new LoggingWindow(this); - m_logging_window->Bind(wxEVT_CLOSE_WINDOW, + m_logging_window->Bind(wxEVT_CLOSE_WINDOW, [this](wxCloseEvent& event) { m_logging_window = nullptr; event.Skip(); @@ -1331,7 +1346,7 @@ void MainWindow::SaveSettings() { auto lock = GetConfigHandle().Lock(); auto& config = GetWxGUIConfig(); - + if (config.window_position != Vector2i{ -1,-1 }) { config.window_position.x = m_restored_position.x; @@ -1368,7 +1383,7 @@ void MainWindow::SaveSettings() if(m_game_list) m_game_list->SaveConfig(); - + g_wxConfig.Save(); } @@ -1398,14 +1413,14 @@ void MainWindow::OnMouseMove(wxMouseEvent& event) void MainWindow::OnMouseLeft(wxMouseEvent& event) { auto& instance = InputManager::instance(); - + std::scoped_lock lock(instance.m_main_mouse.m_mutex); instance.m_main_mouse.left_down = event.ButtonDown(wxMOUSE_BTN_LEFT); auto physPos = ToPhys(event.GetPosition()); instance.m_main_mouse.position = { physPos.x, physPos.y }; if (event.ButtonDown(wxMOUSE_BTN_LEFT)) instance.m_main_mouse.left_down_toggle = true; - + event.Skip(); } @@ -1419,7 +1434,7 @@ void MainWindow::OnMouseRight(wxMouseEvent& event) instance.m_main_mouse.position = { physPos.x, physPos.y }; if(event.ButtonDown(wxMOUSE_BTN_RIGHT)) 
instance.m_main_mouse.right_down_toggle = true; - + event.Skip(); } @@ -1476,7 +1491,7 @@ void MainWindow::OnKeyDown(wxKeyEvent& event) #endif else { - event.Skip(); + event.Skip(); } } @@ -1484,7 +1499,7 @@ void MainWindow::OnChar(wxKeyEvent& event) { if (swkbd_hasKeyboardInputHook()) swkbd_keyInput(event.GetUnicodeKey()); - + // event.Skip(); } @@ -1509,7 +1524,7 @@ void MainWindow::OnToolsInput(wxCommandEvent& event) case MAINFRAME_MENU_ID_TOOLS_DOWNLOAD_MANAGER: { const auto default_tab = id == MAINFRAME_MENU_ID_TOOLS_TITLE_MANAGER ? TitleManagerPage::TitleManager : TitleManagerPage::DownloadManager; - + if (m_title_manager) m_title_manager->SetFocusAndTab(default_tab); else @@ -1559,7 +1574,7 @@ void MainWindow::OnGesturePan(wxPanGestureEvent& event) instance.m_main_touch.left_down = event.IsGestureStart() || !event.IsGestureEnd(); if (event.IsGestureStart() || !event.IsGestureEnd()) instance.m_main_touch.left_down_toggle = true; - + event.Skip(); } @@ -1593,8 +1608,12 @@ void MainWindow::CreateCanvas() // create canvas if (ActiveSettings::GetGraphicsAPI() == kVulkan) m_render_canvas = new VulkanCanvas(m_game_panel, wxSize(1280, 720), true); - else + else if (ActiveSettings::GetGraphicsAPI() == kOpenGL) m_render_canvas = GLCanvas_Create(m_game_panel, wxSize(1280, 720), true); +#if ENABLE_METAL + else + m_render_canvas = new MetalCanvas(m_game_panel, wxSize(1280, 720), true); +#endif // mouse events m_render_canvas->Bind(wxEVT_MOTION, &MainWindow::OnMouseMove, this); @@ -1774,10 +1793,10 @@ void MainWindow::UpdateNFCMenu() const auto& entry = config.recent_nfc_files[i]; if (entry.empty()) continue; - + if (!fs::exists(_utf8ToPath(entry))) continue; - + if (recentFileIndex == 0) m_nfcMenuSeparator0 = m_nfcMenu->AppendSeparator(); @@ -1828,7 +1847,7 @@ void MainWindow::OnTimer(wxTimerEvent& event) { ShowCursor(false); } - + } #define BUILD_DATE __DATE__ " " __TIME__ @@ -2097,9 +2116,9 @@ void MainWindow::RecreateMenu() m_menuBar->Destroy(); m_menuBar = nullptr; } 
- + auto& guiConfig = GetWxGUIConfig(); - + m_menuBar = new wxMenuBar(); // file submenu m_fileMenu = new wxMenu(); @@ -2152,7 +2171,7 @@ void MainWindow::RecreateMenu() item->Check(account_id == account.GetPersistentId()); if (m_game_launched || LaunchSettings::GetPersistentId().has_value()) item->Enable(false); - + ++index; } @@ -2184,8 +2203,8 @@ void MainWindow::RecreateMenu() // options submenu wxMenu* optionsMenu = new wxMenu(); m_fullscreenMenuItem = optionsMenu->AppendCheckItem(MAINFRAME_MENU_ID_OPTIONS_FULLSCREEN, _("&Fullscreen"), wxEmptyString); - m_fullscreenMenuItem->Check(FullscreenEnabled()); - + m_fullscreenMenuItem->Check(FullscreenEnabled()); + optionsMenu->Append(MAINFRAME_MENU_ID_OPTIONS_GRAPHIC_PACKS2, _("&Graphic packs")); m_padViewMenuItem = optionsMenu->AppendCheckItem(MAINFRAME_MENU_ID_OPTIONS_SECOND_WINDOW_PADVIEW, _("&Separate GamePad view"), wxEmptyString); m_padViewMenuItem->Check(wxConfig.pad_open); @@ -2284,7 +2303,7 @@ void MainWindow::RecreateMenu() debugMenu->AppendSubMenu(debugLoggingMenu, _("&Logging")); debugMenu->AppendSubMenu(debugDumpMenu, _("&Dump")); debugMenu->AppendSeparator(); - + auto upsidedownItem = debugMenu->AppendCheckItem(MAINFRAME_MENU_ID_DEBUG_RENDER_UPSIDE_DOWN, _("&Render upside-down"), wxEmptyString); upsidedownItem->Check(ActiveSettings::RenderUpsideDownEnabled()); if(LaunchSettings::RenderUpsideDownEnabled().has_value()) @@ -2293,6 +2312,9 @@ void MainWindow::RecreateMenu() auto accurateBarriers = debugMenu->AppendCheckItem(MAINFRAME_MENU_ID_DEBUG_VK_ACCURATE_BARRIERS, _("&Accurate barriers (Vulkan)"), wxEmptyString); accurateBarriers->Check(GetConfig().vk_accurate_barriers); + auto gpuCapture = debugMenu->Append(MAINFRAME_MENU_ID_DEBUG_GPU_CAPTURE, _("&GPU capture (Metal)")); + gpuCapture->Enable(m_game_launched && g_renderer->GetType() == RendererAPI::Metal); + debugMenu->AppendSeparator(); #ifdef CEMU_DEBUG_ASSERT diff --git a/src/gui/wxgui/PadViewFrame.cpp b/src/gui/wxgui/PadViewFrame.cpp index 
556db579ff..dcb5a65f84 100644 --- a/src/gui/wxgui/PadViewFrame.cpp +++ b/src/gui/wxgui/PadViewFrame.cpp @@ -8,6 +8,7 @@ #include "Cafe/OS/libs/swkbd/swkbd.h" #include "wxgui/canvas/OpenGLCanvas.h" #include "wxgui/canvas/VulkanCanvas.h" +#include "wxgui/canvas/MetalCanvas.h" #include "config/CemuConfig.h" #include "wxgui/MainWindow.h" #include "wxgui/helpers/wxHelpers.h" @@ -74,8 +75,12 @@ void PadViewFrame::InitializeRenderCanvas() { if (ActiveSettings::GetGraphicsAPI() == kVulkan) m_render_canvas = new VulkanCanvas(this, wxSize(854, 480), false); - else + else if (ActiveSettings::GetGraphicsAPI() == kOpenGL) m_render_canvas = GLCanvas_Create(this, wxSize(854, 480), false); +#if ENABLE_METAL + else + m_render_canvas = new MetalCanvas(this, wxSize(854, 480), false); +#endif sizer->Add(m_render_canvas, 1, wxEXPAND, 0, nullptr); } SetSizer(sizer); @@ -173,7 +178,7 @@ void PadViewFrame::OnChar(wxKeyEvent& event) { if (swkbd_hasKeyboardInputHook()) swkbd_keyInput(event.GetUnicodeKey()); - + event.Skip(); } @@ -198,7 +203,7 @@ void PadViewFrame::OnMouseLeft(wxMouseEvent& event) instance.m_pad_mouse.position = { physPos.x, physPos.y }; if (event.ButtonDown(wxMOUSE_BTN_LEFT)) instance.m_pad_mouse.left_down_toggle = true; - + } void PadViewFrame::OnMouseRight(wxMouseEvent& event) diff --git a/src/gui/wxgui/canvas/MetalCanvas.cpp b/src/gui/wxgui/canvas/MetalCanvas.cpp new file mode 100644 index 0000000000..ee58abcd5d --- /dev/null +++ b/src/gui/wxgui/canvas/MetalCanvas.cpp @@ -0,0 +1,61 @@ +#include "wxgui/canvas/MetalCanvas.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" + +#include +#include + +MetalCanvas::MetalCanvas(wxWindow* parent, const wxSize& size, bool is_main_window) + : IRenderCanvas(is_main_window), wxWindow(parent, wxID_ANY, wxDefaultPosition, size, wxNO_FULL_REPAINT_ON_RESIZE | wxWANTS_CHARS) +{ + Bind(wxEVT_PAINT, &MetalCanvas::OnPaint, this); + Bind(wxEVT_SIZE, &MetalCanvas::OnResize, this); + + auto& canvas = is_main_window ? 
WindowSystem::GetWindowInfo().canvas_main : WindowSystem::GetWindowInfo().canvas_pad; + canvas = initHandleContextFromWxWidgetsWindow(this); + + try + { + if (is_main_window) + g_renderer = std::make_unique(); + + auto metal_renderer = MetalRenderer::GetInstance(); + metal_renderer->InitializeLayer({size.x, size.y}, is_main_window); + } + catch(const std::exception& ex) + { + cemuLog_log(LogType::Force, "Error when initializing Metal renderer: {}", ex.what()); + auto msg = formatWxString(_("Error when initializing Metal renderer:\n{}"), ex.what()); + wxMessageDialog dialog(this, msg, _("Error"), wxOK | wxCENTRE | wxICON_ERROR); + dialog.ShowModal(); + exit(0); + } + + wxWindow::EnableTouchEvents(wxTOUCH_PAN_GESTURES); +} + +MetalCanvas::~MetalCanvas() +{ + Unbind(wxEVT_PAINT, &MetalCanvas::OnPaint, this); + Unbind(wxEVT_SIZE, &MetalCanvas::OnResize, this); + + MetalRenderer* mtlr = (MetalRenderer*)g_renderer.get(); + if (mtlr) + mtlr->ShutdownLayer(m_is_main_window); +} + +void MetalCanvas::OnPaint(wxPaintEvent& event) +{ +} + +void MetalCanvas::OnResize(wxSizeEvent& event) +{ + const wxSize size = GetSize(); + if (size.GetWidth() == 0 || size.GetHeight() == 0) + return; + + const wxRect refreshRect(size); + RefreshRect(refreshRect, false); + + auto metal_renderer = MetalRenderer::GetInstance(); + metal_renderer->ResizeLayer({size.x, size.y}, m_is_main_window); +} diff --git a/src/gui/wxgui/canvas/MetalCanvas.h b/src/gui/wxgui/canvas/MetalCanvas.h new file mode 100644 index 0000000000..a0a3a616d3 --- /dev/null +++ b/src/gui/wxgui/canvas/MetalCanvas.h @@ -0,0 +1,19 @@ +#pragma once + +#include "wxgui/canvas/IRenderCanvas.h" + +#include + +#include + +class MetalCanvas : public IRenderCanvas, public wxWindow +{ +public: + MetalCanvas(wxWindow* parent, const wxSize& size, bool is_main_window); + ~MetalCanvas(); + +private: + + void OnPaint(wxPaintEvent& event); + void OnResize(wxSizeEvent& event); +}; diff --git a/src/gui/wxgui/components/wxGameList.cpp 
b/src/gui/wxgui/components/wxGameList.cpp index f807042c19..db7d5bcea0 100644 --- a/src/gui/wxgui/components/wxGameList.cpp +++ b/src/gui/wxgui/components/wxGameList.cpp @@ -72,8 +72,11 @@ std::list _getCachesPaths(const TitleId& titleId) ActiveSettings::GetCachePath(L"shaderCache/driver/vk/{:016x}.bin", titleId), ActiveSettings::GetCachePath(L"shaderCache/precompiled/{:016x}_spirv.bin", titleId), ActiveSettings::GetCachePath(L"shaderCache/precompiled/{:016x}_gl.bin", titleId), + ActiveSettings::GetCachePath(L"shaderCache/precompiled/{:016x}_air.bin", titleId), ActiveSettings::GetCachePath(L"shaderCache/transferable/{:016x}_shaders.bin", titleId), - ActiveSettings::GetCachePath(L"shaderCache/transferable/{:016x}_vkpipeline.bin", titleId)}; + ActiveSettings::GetCachePath(L"shaderCache/transferable/{:016x}_mtlshaders.bin", titleId), + ActiveSettings::GetCachePath(L"shaderCache/transferable/{:016x}_vkpipeline.bin", titleId), + ActiveSettings::GetCachePath(L"shaderCache/transferable/{:016x}_mtlpipeline.bin", titleId)}; cachePaths.remove_if( [](const fs::path& cachePath) @@ -251,13 +254,13 @@ void wxGameList::OnGameListSize(wxSizeEvent &event) for(int i = GetColumnCount() - 1; i > 0; i--) { #ifdef wxHAS_LISTCTRL_COLUMN_ORDER - if(GetColumnWidth(GetColumnIndexFromOrder(i)) > 0) + if(GetColumnWidth(GetColumnIndexFromOrder(i)) > 0) { last_col_index = GetColumnIndexFromOrder(i); break; } #else - if(GetColumnWidth(i) > 0) + if(GetColumnWidth(i) > 0) { last_col_index = i; break; @@ -1021,13 +1024,13 @@ void wxGameList::OnColumnBeginResize(wxListEvent& event) for(int i = GetColumnCount() - 1; i > 0; i--) { #ifdef wxHAS_LISTCTRL_COLUMN_ORDER - if(GetColumnWidth(GetColumnIndexFromOrder(i)) > 0) + if(GetColumnWidth(GetColumnIndexFromOrder(i)) > 0) { last_col_index = GetColumnIndexFromOrder(i); break; } #else - if(GetColumnWidth(i) > 0) + if(GetColumnWidth(i) > 0) { last_col_index = i; break; @@ -1172,7 +1175,7 @@ void wxGameList::OnGameEntryUpdatedByTitleId(wxTitleIdEvent& event) 
wxString minutesText = formatWxString(wxPLURAL("{} minute", "{} minutes", minutes), minutes); SetItem(index, ColumnGameTime, hoursText + " " + minutesText); } - + // last played if (playTimeStat.last_played.year != 0) { @@ -1387,7 +1390,7 @@ bool wxGameList::QueryIconForTitle(TitleId titleId, int& icon, int& iconSmall) return true; } -void wxGameList::DeleteCachedStrings() +void wxGameList::DeleteCachedStrings() { m_name_cache.clear(); } diff --git a/src/imgui/CMakeLists.txt b/src/imgui/CMakeLists.txt index db7686bd8d..86aeb130fe 100644 --- a/src/imgui/CMakeLists.txt +++ b/src/imgui/CMakeLists.txt @@ -7,6 +7,15 @@ add_library(imguiImpl imgui_extension.h ) +if (ENABLE_METAL) + target_sources(imguiImpl PRIVATE + imgui_impl_metal.mm + imgui_impl_metal.h + ) + + target_compile_definitions(imguiImpl PRIVATE IMGUI_IMPL_METAL_CPP) +endif () + set_property(TARGET imguiImpl PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") target_include_directories(imguiImpl PUBLIC "../") diff --git a/src/imgui/imgui_impl_metal.h b/src/imgui/imgui_impl_metal.h new file mode 100644 index 0000000000..3aaacb9e0a --- /dev/null +++ b/src/imgui/imgui_impl_metal.h @@ -0,0 +1,64 @@ +// dear imgui: Renderer Backend for Metal +// This needs to be used along with a Platform Backend (e.g. OSX) + +// Implemented features: +// [X] Renderer: User texture binding. Use 'MTLTexture' as ImTextureID. Read the FAQ about ImTextureID! +// [X] Renderer: Large meshes support (64k+ vertices) with 16-bit indices. + +// You can use unmodified imgui_impl_* files in your project. See examples/ folder for examples of using this. +// Prefer including the entire imgui/ repository into your project (either as a copy or as a submodule), and only build the backends you need. +// If you are new to Dear ImGui, read documentation from the docs/ folder + read the top of imgui.cpp. 
+// Read online: https://github.com/ocornut/imgui/tree/master/docs
+
+#include "imgui.h"      // IMGUI_IMPL_API
+
+//-----------------------------------------------------------------------------
+// ObjC API
+//-----------------------------------------------------------------------------
+
+#ifdef __OBJC__
+
+@class MTLRenderPassDescriptor;
+@protocol MTLDevice, MTLCommandBuffer, MTLRenderCommandEncoder;
+
+IMGUI_IMPL_API bool ImGui_ImplMetal_Init(id<MTLDevice> device);
+IMGUI_IMPL_API void ImGui_ImplMetal_Shutdown();
+IMGUI_IMPL_API void ImGui_ImplMetal_NewFrame(MTLRenderPassDescriptor* renderPassDescriptor);
+IMGUI_IMPL_API void ImGui_ImplMetal_RenderDrawData(ImDrawData* drawData,
+                                                   id<MTLCommandBuffer> commandBuffer,
+                                                   id<MTLRenderCommandEncoder> commandEncoder);
+
+// Called by Init/NewFrame/Shutdown
+IMGUI_IMPL_API bool ImGui_ImplMetal_CreateFontsTexture(id<MTLDevice> device);
+IMGUI_IMPL_API void ImGui_ImplMetal_DestroyFontsTexture();
+IMGUI_IMPL_API bool ImGui_ImplMetal_CreateDeviceObjects(id<MTLDevice> device);
+IMGUI_IMPL_API void ImGui_ImplMetal_DestroyDeviceObjects();
+
+#endif
+
+//-----------------------------------------------------------------------------
+// C++ API
+//-----------------------------------------------------------------------------
+
+// Enable Metal C++ binding support with '#define IMGUI_IMPL_METAL_CPP' in your imconfig.h file
+// More info about using Metal from C++: https://developer.apple.com/metal/cpp/
+
+#ifdef IMGUI_IMPL_METAL_CPP
+#include <Metal/Metal.hpp>      // metal-cpp bindings providing the MTL:: types used below
+#ifndef __OBJC__                // ObjC++ translation units already see the ObjC declarations above
+
+IMGUI_IMPL_API bool ImGui_ImplMetal_Init(MTL::Device* device);
+IMGUI_IMPL_API void ImGui_ImplMetal_Shutdown();
+IMGUI_IMPL_API void ImGui_ImplMetal_NewFrame(MTL::RenderPassDescriptor* renderPassDescriptor);
+IMGUI_IMPL_API void ImGui_ImplMetal_RenderDrawData(ImDrawData* draw_data,
+                                                   MTL::CommandBuffer* commandBuffer,
+                                                   MTL::RenderCommandEncoder* commandEncoder);
+
+// Called by Init/NewFrame/Shutdown
+IMGUI_IMPL_API bool ImGui_ImplMetal_CreateFontsTexture(MTL::Device* device);
+IMGUI_IMPL_API void ImGui_ImplMetal_DestroyFontsTexture();
+IMGUI_IMPL_API bool ImGui_ImplMetal_CreateDeviceObjects(MTL::Device* device);
+IMGUI_IMPL_API void ImGui_ImplMetal_DestroyDeviceObjects();
+
+#endif
+#endif
diff --git a/src/imgui/imgui_impl_metal.mm b/src/imgui/imgui_impl_metal.mm
new file mode 100644
index 0000000000..5f0588573c
--- /dev/null
+++ b/src/imgui/imgui_impl_metal.mm
@@ -0,0 +1,575 @@
+// dear imgui: Renderer Backend for Metal
+// This needs to be used along with a Platform Backend (e.g. OSX)
+
+// Implemented features:
+//  [X] Renderer: User texture binding. Use 'MTLTexture' as ImTextureID. Read the FAQ about ImTextureID!
+//  [X] Renderer: Large meshes support (64k+ vertices) with 16-bit indices.
+
+// You can use unmodified imgui_impl_* files in your project. See examples/ folder for examples of using this.
+// Prefer including the entire imgui/ repository into your project (either as a copy or as a submodule), and only build the backends you need.
+// If you are new to Dear ImGui, read documentation from the docs/ folder + read the top of imgui.cpp.
+// Read online: https://github.com/ocornut/imgui/tree/master/docs
+
+// CHANGELOG
+// (minor and older changes stripped away, please see git history for details)
+//  2022-08-23: Metal: Update deprecated property 'sampleCount'->'rasterSampleCount'.
+//  2022-07-05: Metal: Add dispatch synchronization.
+//  2022-06-30: Metal: Use __bridge for ARC based systems.
+//  2022-06-01: Metal: Fixed null dereference on exit inside command buffer completion handler.
+//  2022-04-27: Misc: Store backend data in a per-context struct, allowing to use this backend with multiple contexts.
+//  2022-01-03: Metal: Ignore ImDrawCmd where ElemCount == 0 (very rare but can technically be manufactured by user code).
+//  2021-12-30: Metal: Added Metal C++ support. Enable with '#define IMGUI_IMPL_METAL_CPP' in your imconfig.h file.
+//  2021-08-24: Metal: Fixed a crash when clipping rect larger than framebuffer is submitted. (#4464)
+//  2021-05-19: Metal: Replaced direct access to ImDrawCmd::TextureId with a call to ImDrawCmd::GetTexID(). (will become a requirement)
+//  2021-02-18: Metal: Change blending equation to preserve alpha in output buffer.
+//  2021-01-25: Metal: Fixed texture storage mode when building on Mac Catalyst.
+//  2019-05-29: Metal: Added support for large mesh (64K+ vertices), enable ImGuiBackendFlags_RendererHasVtxOffset flag.
+//  2019-04-30: Metal: Added support for special ImDrawCallback_ResetRenderState callback to reset render state.
+//  2019-02-11: Metal: Projecting clipping rectangles correctly using draw_data->FramebufferScale to allow multi-viewports for retina display.
+//  2018-11-30: Misc: Setting up io.BackendRendererName so it can be displayed in the About Window.
+//  2018-07-05: Metal: Added new Metal backend implementation.
+
+#include "imgui.h"
+#include "imgui_impl_metal.h"
+#import <time.h>            // clock_gettime_nsec_np(), used by GetMachAbsoluteTimeInSeconds()
+#import <Metal/Metal.h>
+
+#pragma mark - Support classes
+
+// A wrapper around a MTLBuffer object that knows the last time it was reused
+@interface MetalBuffer : NSObject
+@property (nonatomic, strong) id<MTLBuffer> buffer;
+@property (nonatomic, assign) double        lastReuseTime;
+- (instancetype)initWithBuffer:(id<MTLBuffer>)buffer;
+@end
+
+// An object that encapsulates the data necessary to uniquely identify a
+// render pipeline state. These are used as cache keys.
+// (NSCopying is required because instances are used as NSDictionary keys.)
+@interface FramebufferDescriptor : NSObject<NSCopying>
+@property (nonatomic, assign) unsigned long  sampleCount;
+@property (nonatomic, assign) MTLPixelFormat colorPixelFormat;
+@property (nonatomic, assign) MTLPixelFormat depthPixelFormat;
+@property (nonatomic, assign) MTLPixelFormat stencilPixelFormat;
+- (instancetype)initWithRenderPassDescriptor:(MTLRenderPassDescriptor*)renderPassDescriptor;
+@end
+
+// A singleton that stores long-lived objects that are needed by the Metal
+// renderer backend. Stores the render pipeline state cache and the default
+// font texture, and manages the reusable buffer cache.
+@interface MetalContext : NSObject
+@property (nonatomic, strong) id<MTLDevice>                 device;
+@property (nonatomic, strong) id<MTLDepthStencilState>      depthStencilState;
+@property (nonatomic, strong) FramebufferDescriptor*        framebufferDescriptor; // framebuffer descriptor for current frame; transient
+@property (nonatomic, strong) NSMutableDictionary*          renderPipelineStateCache; // pipeline cache; keyed on framebuffer descriptors
+@property (nonatomic, strong, nullable) id<MTLTexture>      fontTexture;
+@property (nonatomic, strong) NSMutableArray<MetalBuffer*>* bufferCache;
+@property (nonatomic, assign) double                        lastBufferCachePurge;
+- (MetalBuffer*)dequeueReusableBufferOfLength:(NSUInteger)length device:(id<MTLDevice>)device;
+- (id<MTLRenderPipelineState>)renderPipelineStateForFramebufferDescriptor:(FramebufferDescriptor*)descriptor device:(id<MTLDevice>)device;
+@end
+
+// Per-ImGui-context backend state; a pointer to it is stored in io.BackendRendererUserData by Init.
+struct ImGui_ImplMetal_Data
+{
+    MetalContext*            SharedMetalContext;
+
+    ImGui_ImplMetal_Data()  { memset(this, 0, sizeof(*this)); }
+};
+
+// Backend data is looked up through the current ImGui context; GetBackendData() yields nullptr when no context is current.
+static ImGui_ImplMetal_Data*     ImGui_ImplMetal_CreateBackendData() { return IM_NEW(ImGui_ImplMetal_Data)(); }
+static ImGui_ImplMetal_Data*     ImGui_ImplMetal_GetBackendData()    { return ImGui::GetCurrentContext() ? (ImGui_ImplMetal_Data*)ImGui::GetIO().BackendRendererUserData : nullptr; }
+static void                      ImGui_ImplMetal_DestroyBackendData(){ IM_DELETE(ImGui_ImplMetal_GetBackendData()); }
+
+// CLOCK_UPTIME_RAW nanoseconds converted to (fractional) seconds.
+static inline CFTimeInterval     GetMachAbsoluteTimeInSeconds()      { return (CFTimeInterval)(double)(clock_gettime_nsec_np(CLOCK_UPTIME_RAW) / 1e9); }
+
+#ifdef IMGUI_IMPL_METAL_CPP
+
+#pragma mark - Dear ImGui Metal C++ Backend API
+
+// The metal-cpp overloads below only bridge the MTL:: pointers to their ObjC types
+// and forward to the ObjC API further down.
+
+bool ImGui_ImplMetal_Init(MTL::Device* device)
+{
+    return ImGui_ImplMetal_Init((__bridge id<MTLDevice>)(device));
+}
+
+void ImGui_ImplMetal_NewFrame(MTL::RenderPassDescriptor* renderPassDescriptor)
+{
+    ImGui_ImplMetal_NewFrame((__bridge MTLRenderPassDescriptor*)(renderPassDescriptor));
+}
+
+void ImGui_ImplMetal_RenderDrawData(ImDrawData* draw_data,
+                                    MTL::CommandBuffer* commandBuffer,
+                                    MTL::RenderCommandEncoder* commandEncoder)
+{
+    ImGui_ImplMetal_RenderDrawData(draw_data,
+                                   (__bridge id<MTLCommandBuffer>)(commandBuffer),
+                                   (__bridge id<MTLRenderCommandEncoder>)(commandEncoder));
+
+}
+
+bool ImGui_ImplMetal_CreateFontsTexture(MTL::Device* device)
+{
+    return ImGui_ImplMetal_CreateFontsTexture((__bridge id<MTLDevice>)(device));
+}
+
+bool ImGui_ImplMetal_CreateDeviceObjects(MTL::Device* device)
+{
+    return ImGui_ImplMetal_CreateDeviceObjects((__bridge id<MTLDevice>)(device));
+}
+
+#endif // #ifdef IMGUI_IMPL_METAL_CPP
+
+#pragma mark - Dear ImGui Metal Backend API
+
+bool ImGui_ImplMetal_Init(id<MTLDevice> device)
+{
+    ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_CreateBackendData();
+    ImGuiIO& io = ImGui::GetIO();
+    io.BackendRendererUserData = (void*)bd;
+    io.BackendRendererName = "imgui_impl_metal";
+    io.BackendFlags |= ImGuiBackendFlags_RendererHasVtxOffset;  // We can honor the ImDrawCmd::VtxOffset field, allowing for large meshes.
+
+    bd->SharedMetalContext = [[MetalContext alloc] init];
+    bd->SharedMetalContext.device = device;
+
+    return true;
+}
+
+// Releases the device objects and the backend data, then detaches the backend from the ImGuiIO.
+void ImGui_ImplMetal_Shutdown()
+{
+    ImGui_ImplMetal_DestroyDeviceObjects();
+    ImGui_ImplMetal_DestroyBackendData();
+
+    // The backend data was just freed: clear the io fields that Init set, so that
+    // ImGui_ImplMetal_GetBackendData() returns nullptr from now on. The command buffer
+    // completion handler relies on that null check and would otherwise touch freed memory.
+    ImGuiIO& io = ImGui::GetIO();
+    io.BackendRendererName = nullptr;
+    io.BackendRendererUserData = nullptr;
+    io.BackendFlags &= ~ImGuiBackendFlags_RendererHasVtxOffset;
+}
+
+// Records the framebuffer configuration of the upcoming pass (used as the pipeline cache key)
+// and creates the device objects on first use.
+void ImGui_ImplMetal_NewFrame(MTLRenderPassDescriptor* renderPassDescriptor)
+{
+    ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData();
+    // bd itself is null before Init / after Shutdown, so test it before dereferencing it.
+    IM_ASSERT(bd != nullptr && bd->SharedMetalContext != nil && "No Metal context. Did you call ImGui_ImplMetal_Init() ?");
+    bd->SharedMetalContext.framebufferDescriptor = [[FramebufferDescriptor alloc] initWithRenderPassDescriptor:renderPassDescriptor];
+
+    if (bd->SharedMetalContext.depthStencilState == nil)
+        ImGui_ImplMetal_CreateDeviceObjects(bd->SharedMetalContext.device);
+}
+
+static void ImGui_ImplMetal_SetupRenderState(ImDrawData* drawData, id<MTLCommandBuffer> commandBuffer,
+    id<MTLRenderCommandEncoder> commandEncoder, id<MTLRenderPipelineState> renderPipelineState,
+    MetalBuffer* vertexBuffer, size_t vertexBufferOffset)
+{
+    IM_UNUSED(commandBuffer);
+    ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData();
+    [commandEncoder setCullMode:MTLCullModeNone];
+    [commandEncoder setDepthStencilState:bd->SharedMetalContext.depthStencilState];
+
+    // Setup viewport, orthographic projection matrix
+    // Our visible imgui space lies from draw_data->DisplayPos (top left) to
+    // draw_data->DisplayPos+draw_data->DisplaySize (bottom right). DisplayMin is typically (0,0) for single viewport apps.
+    MTLViewport viewport =
+    {
+        .originX = 0.0,
+        .originY = 0.0,
+        .width = (double)(drawData->DisplaySize.x * drawData->FramebufferScale.x),
+        .height = (double)(drawData->DisplaySize.y * drawData->FramebufferScale.y),
+        .znear = 0.0,
+        .zfar = 1.0
+    };
+    [commandEncoder setViewport:viewport];
+
+    float L = drawData->DisplayPos.x;
+    float R = drawData->DisplayPos.x + drawData->DisplaySize.x;
+    float T = drawData->DisplayPos.y;
+    float B = drawData->DisplayPos.y + drawData->DisplaySize.y;
+    float N = (float)viewport.znear;
+    float F = (float)viewport.zfar;
+    const float ortho_projection[4][4] =
+    {
+        { 2.0f/(R-L),   0.0f,           0.0f,   0.0f },
+        { 0.0f,         2.0f/(T-B),     0.0f,   0.0f },
+        { 0.0f,         0.0f,        1/(F-N),   0.0f },
+        { (R+L)/(L-R),  (T+B)/(B-T), N/(F-N),   1.0f },
+    };
+    [commandEncoder setVertexBytes:&ortho_projection length:sizeof(ortho_projection) atIndex:1];
+
+    [commandEncoder setRenderPipelineState:renderPipelineState];
+
+    [commandEncoder setVertexBuffer:vertexBuffer.buffer offset:0 atIndex:0];
+    [commandEncoder setVertexBufferOffset:vertexBufferOffset atIndex:0];
+}
+
+// Metal Render function.
+// Copies every draw list into one vertex and one index buffer taken from the reusable buffer cache,
+// encodes the draw calls on commandEncoder, and hands both buffers back to the cache once
+// commandBuffer has completed.
+void ImGui_ImplMetal_RenderDrawData(ImDrawData* drawData, id<MTLCommandBuffer> commandBuffer, id<MTLRenderCommandEncoder> commandEncoder)
+{
+    ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData();
+    MetalContext* ctx = bd->SharedMetalContext;
+
+    // Avoid rendering when minimized, scale coordinates for retina displays (screen coordinates != framebuffer coordinates)
+    int fb_width = (int)(drawData->DisplaySize.x * drawData->FramebufferScale.x);
+    int fb_height = (int)(drawData->DisplaySize.y * drawData->FramebufferScale.y);
+    if (fb_width <= 0 || fb_height <= 0 || drawData->CmdListsCount == 0)
+        return;
+
+    // Try to retrieve a render pipeline state that is compatible with the framebuffer config for this frame
+    // The hit rate for this cache should be very near 100%.
+    id<MTLRenderPipelineState> renderPipelineState = ctx.renderPipelineStateCache[ctx.framebufferDescriptor];
+    if (renderPipelineState == nil)
+    {
+        // No luck; make a new render pipeline state
+        renderPipelineState = [ctx renderPipelineStateForFramebufferDescriptor:ctx.framebufferDescriptor device:commandBuffer.device];
+
+        // Creation can fail (the failure is logged by the method above). Encoding draws with a
+        // nil pipeline state is invalid, so skip drawing instead.
+        if (renderPipelineState == nil)
+            return;
+
+        // Cache render pipeline state for later reuse
+        ctx.renderPipelineStateCache[ctx.framebufferDescriptor] = renderPipelineState;
+    }
+
+    size_t vertexBufferLength = (size_t)drawData->TotalVtxCount * sizeof(ImDrawVert);
+    size_t indexBufferLength = (size_t)drawData->TotalIdxCount * sizeof(ImDrawIdx);
+    MetalBuffer* vertexBuffer = [ctx dequeueReusableBufferOfLength:vertexBufferLength device:commandBuffer.device];
+    MetalBuffer* indexBuffer = [ctx dequeueReusableBufferOfLength:indexBufferLength device:commandBuffer.device];
+
+    ImGui_ImplMetal_SetupRenderState(drawData, commandBuffer, commandEncoder, renderPipelineState, vertexBuffer, 0);
+
+    // Will project scissor/clipping rectangles into framebuffer space
+    ImVec2 clip_off = drawData->DisplayPos;         // (0,0) unless using multi-viewports
+    ImVec2 clip_scale = drawData->FramebufferScale; // (1,1) unless using retina display which are often (2,2)
+
+    // Render command lists
+    size_t vertexBufferOffset = 0;
+    size_t indexBufferOffset = 0;
+    for (int n = 0; n < drawData->CmdListsCount; n++)
+    {
+        const ImDrawList* cmd_list = drawData->CmdLists[n];
+
+        memcpy((char*)vertexBuffer.buffer.contents + vertexBufferOffset, cmd_list->VtxBuffer.Data, (size_t)cmd_list->VtxBuffer.Size * sizeof(ImDrawVert));
+        memcpy((char*)indexBuffer.buffer.contents + indexBufferOffset, cmd_list->IdxBuffer.Data, (size_t)cmd_list->IdxBuffer.Size * sizeof(ImDrawIdx));
+
+        for (int cmd_i = 0; cmd_i < cmd_list->CmdBuffer.Size; cmd_i++)
+        {
+            const ImDrawCmd* pcmd = &cmd_list->CmdBuffer[cmd_i];
+            if (pcmd->UserCallback)
+            {
+                // User callback, registered via ImDrawList::AddCallback()
+                // (ImDrawCallback_ResetRenderState is a special callback value used by the user to request the renderer to reset render state.)
+                if (pcmd->UserCallback == ImDrawCallback_ResetRenderState)
+                    ImGui_ImplMetal_SetupRenderState(drawData, commandBuffer, commandEncoder, renderPipelineState, vertexBuffer, vertexBufferOffset);
+                else
+                    pcmd->UserCallback(cmd_list, pcmd);
+            }
+            else
+            {
+                // Project scissor/clipping rectangles into framebuffer space
+                ImVec2 clip_min((pcmd->ClipRect.x - clip_off.x) * clip_scale.x, (pcmd->ClipRect.y - clip_off.y) * clip_scale.y);
+                ImVec2 clip_max((pcmd->ClipRect.z - clip_off.x) * clip_scale.x, (pcmd->ClipRect.w - clip_off.y) * clip_scale.y);
+
+                // Clamp to viewport as setScissorRect() won't accept values that are off bounds
+                if (clip_min.x < 0.0f) { clip_min.x = 0.0f; }
+                if (clip_min.y < 0.0f) { clip_min.y = 0.0f; }
+                if (clip_max.x > fb_width) { clip_max.x = (float)fb_width; }
+                if (clip_max.y > fb_height) { clip_max.y = (float)fb_height; }
+                if (clip_max.x <= clip_min.x || clip_max.y <= clip_min.y)
+                    continue;
+                if (pcmd->ElemCount == 0) // drawIndexedPrimitives() validation doesn't accept this
+                    continue;
+
+                // Apply scissor/clipping rectangle
+                MTLScissorRect scissorRect =
+                {
+                    .x = NSUInteger(clip_min.x),
+                    .y = NSUInteger(clip_min.y),
+                    .width = NSUInteger(clip_max.x - clip_min.x),
+                    .height = NSUInteger(clip_max.y - clip_min.y)
+                };
+                [commandEncoder setScissorRect:scissorRect];
+
+                // Bind texture, Draw
+                if (ImTextureID tex_id = pcmd->GetTexID())
+                    [commandEncoder setFragmentTexture:(__bridge id<MTLTexture>)(tex_id) atIndex:0];
+
+                [commandEncoder setVertexBufferOffset:(vertexBufferOffset + pcmd->VtxOffset * sizeof(ImDrawVert)) atIndex:0];
+                [commandEncoder drawIndexedPrimitives:MTLPrimitiveTypeTriangle
+                                           indexCount:pcmd->ElemCount
+                                            indexType:sizeof(ImDrawIdx) == 2 ? MTLIndexTypeUInt16 : MTLIndexTypeUInt32
+                                          indexBuffer:indexBuffer.buffer
+                                    indexBufferOffset:indexBufferOffset + pcmd->IdxOffset * sizeof(ImDrawIdx)];
+            }
+        }
+
+        vertexBufferOffset += (size_t)cmd_list->VtxBuffer.Size * sizeof(ImDrawVert);
+        indexBufferOffset += (size_t)cmd_list->IdxBuffer.Size * sizeof(ImDrawIdx);
+    }
+
+    // Return both buffers to the cache once the GPU is done with them. The backend data is
+    // re-fetched (and null-checked) inside the handler because it may be gone by then.
+    [commandBuffer addCompletedHandler:^(id<MTLCommandBuffer>)
+    {
+        dispatch_async(dispatch_get_main_queue(), ^{
+            ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData();
+            if (bd != nullptr)
+            {
+                @synchronized(bd->SharedMetalContext.bufferCache)
+                {
+                    [bd->SharedMetalContext.bufferCache addObject:vertexBuffer];
+                    [bd->SharedMetalContext.bufferCache addObject:indexBuffer];
+                }
+            }
+        });
+    }];
+}
+
+bool ImGui_ImplMetal_CreateFontsTexture(id<MTLDevice> device)
+{
+    ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData();
+    ImGuiIO& io = ImGui::GetIO();
+
+    // We are retrieving and uploading the font atlas as a 4-channels RGBA texture here.
+    // In theory we could call GetTexDataAsAlpha8() and upload a 1-channel texture to save on memory access bandwidth.
+    // However, using a shader designed for 1-channel texture would make it less obvious to use the ImTextureID facility to render users own textures.
+    // You can make that change in your implementation.
+    unsigned char* pixels;
+    int width, height;
+    io.Fonts->GetTexDataAsRGBA32(&pixels, &width, &height);
+    MTLTextureDescriptor* textureDescriptor = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA8Unorm
+                                                                                                 width:(NSUInteger)width
+                                                                                                height:(NSUInteger)height
+                                                                                             mipmapped:NO];
+    textureDescriptor.usage = MTLTextureUsageShaderRead;
+#if TARGET_OS_OSX || TARGET_OS_MACCATALYST
+    textureDescriptor.storageMode = MTLStorageModeManaged;
+#else
+    textureDescriptor.storageMode = MTLStorageModeShared;
+#endif
+    id<MTLTexture> texture = [device newTextureWithDescriptor:textureDescriptor];
+    [texture replaceRegion:MTLRegionMake2D(0, 0, (NSUInteger)width, (NSUInteger)height) mipmapLevel:0 withBytes:pixels bytesPerRow:(NSUInteger)width * 4];
+    bd->SharedMetalContext.fontTexture = texture;
+    io.Fonts->SetTexID((__bridge void*)bd->SharedMetalContext.fontTexture); // ImTextureID == void*
+
+    return (bd->SharedMetalContext.fontTexture != nil);
+}
+
+// Drops the font texture and clears the texture id stored in the font atlas.
+void ImGui_ImplMetal_DestroyFontsTexture()
+{
+    ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData();
+    ImGuiIO& io = ImGui::GetIO();
+    bd->SharedMetalContext.fontTexture = nil;
+    io.Fonts->SetTexID(nullptr);
+}
+
+// Creates the depth-stencil state (depth writes off, compare function "always") and the font texture.
+bool ImGui_ImplMetal_CreateDeviceObjects(id<MTLDevice> device)
+{
+    ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData();
+    MTLDepthStencilDescriptor* depthStencilDescriptor = [[MTLDepthStencilDescriptor alloc] init];
+    depthStencilDescriptor.depthWriteEnabled = NO;
+    depthStencilDescriptor.depthCompareFunction = MTLCompareFunctionAlways;
+    bd->SharedMetalContext.depthStencilState = [device newDepthStencilStateWithDescriptor:depthStencilDescriptor];
+    ImGui_ImplMetal_CreateFontsTexture(device);
+
+    return true;
+}
+
+// Destroys the font texture and empties the render pipeline state cache.
+void ImGui_ImplMetal_DestroyDeviceObjects()
+{
+    ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData();
+    ImGui_ImplMetal_DestroyFontsTexture();
+    [bd->SharedMetalContext.renderPipelineStateCache removeAllObjects];
+}
+
+#pragma mark - MetalBuffer implementation
+
+@implementation MetalBuffer
+// Wraps the given buffer and stamps it with the current time as its last reuse time.
+- (instancetype)initWithBuffer:(id<MTLBuffer>)buffer
+{
+    if ((self = [super init]))
+    {
+        _buffer = buffer;
+        _lastReuseTime = GetMachAbsoluteTimeInSeconds();
+    }
+    return self;
+}
+@end
+
+#pragma mark - FramebufferDescriptor implementation
+
+@implementation FramebufferDescriptor
+// Captures the sample count and the color/depth/stencil pixel formats of the pass attachments.
+- (instancetype)initWithRenderPassDescriptor:(MTLRenderPassDescriptor*)renderPassDescriptor
+{
+    if ((self = [super init]))
+    {
+        _sampleCount = renderPassDescriptor.colorAttachments[0].texture.sampleCount;
+        _colorPixelFormat = renderPassDescriptor.colorAttachments[0].texture.pixelFormat;
+        _depthPixelFormat = renderPassDescriptor.depthAttachment.texture.pixelFormat;
+        _stencilPixelFormat = renderPassDescriptor.stencilAttachment.texture.pixelFormat;
+    }
+    return self;
+}
+
+- (nonnull id)copyWithZone:(nullable NSZone*)zone
+{
+    FramebufferDescriptor* copy = [[FramebufferDescriptor allocWithZone:zone] init];
+    copy.sampleCount = self.sampleCount;
+    copy.colorPixelFormat = self.colorPixelFormat;
+    copy.depthPixelFormat = self.depthPixelFormat;
+    copy.stencilPixelFormat = self.stencilPixelFormat;
+    return copy;
+}
+
+// Packs the low bits of the four fields into one value; descriptors that collide are still
+// told apart by -isEqual:, which compares the full values.
+- (NSUInteger)hash
+{
+    NSUInteger sc = _sampleCount & 0x3;
+    NSUInteger cf = _colorPixelFormat & 0x3FF;
+    NSUInteger df = _depthPixelFormat & 0x3FF;
+    NSUInteger sf = _stencilPixelFormat & 0x3FF;
+    NSUInteger hash = (sf << 22) | (df << 12) | (cf << 2) | sc;
+    return hash;
+}
+
+- (BOOL)isEqual:(id)object
+{
+    FramebufferDescriptor* other = object;
+    if (![other isKindOfClass:[FramebufferDescriptor class]])
+        return NO;
+    return other.sampleCount == self.sampleCount &&
+    other.colorPixelFormat == self.colorPixelFormat &&
+    other.depthPixelFormat == self.depthPixelFormat &&
+    other.stencilPixelFormat == self.stencilPixelFormat;
+}
+
+@end
+
+#pragma mark - MetalContext implementation
+
+@implementation MetalContext
+- (instancetype)init
+{
+    if ((self = [super init]))
+    {
+        self.renderPipelineStateCache = [NSMutableDictionary dictionary];
+        self.bufferCache = [NSMutableArray array];
+        _lastBufferCachePurge = GetMachAbsoluteTimeInSeconds();
+    }
+    return self;
+}
+
+// Returns a buffer of at least `length` bytes: the least recently reused cached buffer that is
+// large enough, or a newly allocated shared-storage buffer when none fits. Roughly once per
+// second, cached buffers that were not reused since the previous purge are dropped.
+- (MetalBuffer*)dequeueReusableBufferOfLength:(NSUInteger)length device:(id<MTLDevice>)device
+{
+    // Keep the timestamp as a double: lastReuseTime and lastBufferCachePurge are doubles, and an
+    // integer here would truncate the fractional seconds the purge comparison depends on.
+    double now = GetMachAbsoluteTimeInSeconds();
+
+    @synchronized(self.bufferCache)
+    {
+        // Purge old buffers that haven't been useful for a while
+        if (now - self.lastBufferCachePurge > 1.0)
+        {
+            NSMutableArray* survivors = [NSMutableArray array];
+            for (MetalBuffer* candidate in self.bufferCache)
+                if (candidate.lastReuseTime > self.lastBufferCachePurge)
+                    [survivors addObject:candidate];
+            // Replace the contents in place: the array object is the @synchronized token used here
+            // and in the completion handler, so it must stay the same instance.
+            [self.bufferCache setArray:survivors];
+            self.lastBufferCachePurge = now;
+        }
+
+        // See if we have a buffer we can reuse
+        MetalBuffer* bestCandidate = nil;
+        for (MetalBuffer* candidate in self.bufferCache)
+            if (candidate.buffer.length >= length && (bestCandidate == nil || bestCandidate.lastReuseTime > candidate.lastReuseTime))
+                bestCandidate = candidate;
+
+        if (bestCandidate != nil)
+        {
+            [self.bufferCache removeObject:bestCandidate];
+            bestCandidate.lastReuseTime = now;
+            return bestCandidate;
+        }
+    }
+
+    // No luck; make a new buffer
+    id<MTLBuffer> backing = [device newBufferWithLength:length options:MTLResourceStorageModeShared];
+    return [[MetalBuffer alloc] initWithBuffer:backing];
+}
+
+// Bilinear sampling is required by default. Set 'io.Fonts->Flags |= ImFontAtlasFlags_NoBakedLines' or 'style.AntiAliasedLinesUseTex = false' to allow point/nearest sampling.
+// Compiles the embedded ImGui shaders and builds a render pipeline state matching `descriptor`
+// (sample count and color/depth/stencil pixel formats). Returns nil, after logging, when the
+// library, the shader functions or the pipeline state cannot be created.
+- (id<MTLRenderPipelineState>)renderPipelineStateForFramebufferDescriptor:(FramebufferDescriptor*)descriptor device:(id<MTLDevice>)device
+{
+    NSError* error = nil;
+
+    NSString* shaderSource = @""
+    "#include <metal_stdlib>\n"
+    "using namespace metal;\n"
+    "\n"
+    "struct Uniforms {\n"
+    "    float4x4 projectionMatrix;\n"
+    "};\n"
+    "\n"
+    "struct VertexIn {\n"
+    "    float2 position  [[attribute(0)]];\n"
+    "    float2 texCoords [[attribute(1)]];\n"
+    "    uchar4 color     [[attribute(2)]];\n"
+    "};\n"
+    "\n"
+    "struct VertexOut {\n"
+    "    float4 position [[position]];\n"
+    "    float2 texCoords;\n"
+    "    float4 color;\n"
+    "};\n"
+    "\n"
+    "vertex VertexOut vertex_main(VertexIn in                 [[stage_in]],\n"
+    "                             constant Uniforms &uniforms [[buffer(1)]]) {\n"
+    "    VertexOut out;\n"
+    "    out.position = uniforms.projectionMatrix * float4(in.position, 0, 1);\n"
+    "    out.texCoords = in.texCoords;\n"
+    "    out.color = float4(in.color) / float4(255.0);\n"
+    "    return out;\n"
+    "}\n"
+    "\n"
+    "fragment half4 fragment_main(VertexOut in [[stage_in]],\n"
+    "                             texture2d<half, access::sample> texture [[texture(0)]]) {\n"
+    "    constexpr sampler linearSampler(coord::normalized, min_filter::linear, mag_filter::linear, mip_filter::linear);\n"
+    "    half4 texColor = texture.sample(linearSampler, in.texCoords);\n"
+    "    return half4(in.color) * texColor;\n"
+    "}\n";
+
+    id<MTLLibrary> library = [device newLibraryWithSource:shaderSource options:nil error:&error];
+    if (library == nil)
+    {
+        NSLog(@"Error: failed to create Metal library: %@", error);
+        return nil;
+    }
+
+    id<MTLFunction> vertexFunction = [library newFunctionWithName:@"vertex_main"];
+    id<MTLFunction> fragmentFunction = [library newFunctionWithName:@"fragment_main"];
+
+    if (vertexFunction == nil || fragmentFunction == nil)
+    {
+        NSLog(@"Error: failed to find Metal shader functions in library: %@", error);
+        return nil;
+    }
+
+    // Vertex layout mirrors ImDrawVert: pos (float2), uv (float2), col (4 x uchar), all in buffer 0.
+    MTLVertexDescriptor* vertexDescriptor = [MTLVertexDescriptor vertexDescriptor];
+    vertexDescriptor.attributes[0].offset = IM_OFFSETOF(ImDrawVert, pos);
+    vertexDescriptor.attributes[0].format = MTLVertexFormatFloat2; // position
+    vertexDescriptor.attributes[0].bufferIndex = 0;
+    vertexDescriptor.attributes[1].offset = IM_OFFSETOF(ImDrawVert, uv);
+    vertexDescriptor.attributes[1].format = MTLVertexFormatFloat2; // texCoords
+    vertexDescriptor.attributes[1].bufferIndex = 0;
+    vertexDescriptor.attributes[2].offset = IM_OFFSETOF(ImDrawVert, col);
+    vertexDescriptor.attributes[2].format = MTLVertexFormatUChar4; // color
+    vertexDescriptor.attributes[2].bufferIndex = 0;
+    vertexDescriptor.layouts[0].stepRate = 1;
+    vertexDescriptor.layouts[0].stepFunction = MTLVertexStepFunctionPerVertex;
+    vertexDescriptor.layouts[0].stride = sizeof(ImDrawVert);
+
+    // Build the pipeline from the descriptor that was passed in (the same object the caller uses
+    // as the cache key), not from whatever self.framebufferDescriptor currently holds.
+    MTLRenderPipelineDescriptor* pipelineDescriptor = [[MTLRenderPipelineDescriptor alloc] init];
+    pipelineDescriptor.vertexFunction = vertexFunction;
+    pipelineDescriptor.fragmentFunction = fragmentFunction;
+    pipelineDescriptor.vertexDescriptor = vertexDescriptor;
+    pipelineDescriptor.rasterSampleCount = descriptor.sampleCount;
+    pipelineDescriptor.colorAttachments[0].pixelFormat = descriptor.colorPixelFormat;
+    pipelineDescriptor.colorAttachments[0].blendingEnabled = YES;
+    pipelineDescriptor.colorAttachments[0].rgbBlendOperation = MTLBlendOperationAdd;
+    pipelineDescriptor.colorAttachments[0].sourceRGBBlendFactor = MTLBlendFactorSourceAlpha;
+    pipelineDescriptor.colorAttachments[0].destinationRGBBlendFactor = MTLBlendFactorOneMinusSourceAlpha;
+    pipelineDescriptor.colorAttachments[0].alphaBlendOperation = MTLBlendOperationAdd;
+    pipelineDescriptor.colorAttachments[0].sourceAlphaBlendFactor = MTLBlendFactorOne;
+    pipelineDescriptor.colorAttachments[0].destinationAlphaBlendFactor = MTLBlendFactorOneMinusSourceAlpha;
+    pipelineDescriptor.depthAttachmentPixelFormat = descriptor.depthPixelFormat;
+    pipelineDescriptor.stencilAttachmentPixelFormat = descriptor.stencilPixelFormat;
+
+    // Failure is signalled by a nil result; the NSError is only meaningful in that case.
+    id<MTLRenderPipelineState> renderPipelineState = [device newRenderPipelineStateWithDescriptor:pipelineDescriptor error:&error];
+    if (renderPipelineState == nil)
+        NSLog(@"Error: failed to create Metal pipeline state: %@", error);
+
+    return renderPipelineState;
+}
+
+@end
diff --git a/src/tools/ShaderCacheMerger.cpp b/src/tools/ShaderCacheMerger.cpp
index 14a54252af..7a2727dd07 100644
--- a/src/tools/ShaderCacheMerger.cpp
+++ b/src/tools/ShaderCacheMerger.cpp
@@ -106,6 +106,8 @@ void MergeShaderAndPipelineCacheFiles()
 		auto filename = it.path().filename().generic_string();
 		if (std::regex_match(filename, std::regex("^[0-9a-fA-F]{16}(?:_shaders.bin)")))
 			MergeShaderCacheFile(filename);
+		if (std::regex_match(filename, std::regex("^[0-9a-fA-F]{16}(?:_mtlshaders.bin)")))
+			MergeShaderCacheFile(filename);
 	}
 	printf("\nScanning for pipeline cache files to merge...\n");
 	for (const auto& it : fs::directory_iterator("shaderCache/transferable/"))
@@ -115,6 +117,8 @@ void MergeShaderAndPipelineCacheFiles()
 		auto filename = it.path().filename().generic_string();
 		if (std::regex_match(filename, std::regex("^[0-9a-fA-F]{16}(?:_vkpipeline.bin)")))
 			MergePipelineCacheFile(filename);
+		if (std::regex_match(filename, std::regex("^[0-9a-fA-F]{16}(?:_mtlpipeline.bin)")))
+			MergePipelineCacheFile(filename);
 	}
 }