sketch out pipeline caching

devsh · devsh · commit 5b6b09e2df8f · 2024-10-28T15:44:59.000+01:00
diff --git a/include/nbl/video/utilities/CComputeBlit.h b/include/nbl/video/utilities/CComputeBlit.h
@@ -41,7 +41,21 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted
 		}
 
 		// ctor
-		inline CComputeBlit(core::smart_refctd_ptr<ILogicalDevice>&& logicalDevice) : m_device(std::move(logicalDevice)) {}
+		CComputeBlit(
+			core::smart_refctd_ptr<ILogicalDevice>&& logicalDevice,
+			core::smart_refctd_ptr<asset::IShaderCompiler::CCache>&& cache=nullptr,
+			core::smart_refctd_ptr<system::ILogger>&& logger=nullptr
+		);
+
+		// if you set the values too small, we'll correct them ourselves anyway
+		struct STask
+		{
+			uint32_t workgroupSizeLog2 : 4 = 0; // log2 of the workgroup size, so 4 bits are plenty
+			// the TRUE output format, not the storage view format you might manually encode into
+			hlsl::format::TexelBlockFormat outputFormat : 8 = hlsl::format::TexelBlockFormat::TBF_UNKNOWN;
+			uint32_t sharedMemoryPerInvocation : 6 = 0; // NOTE(review): presumably bytes, judging by the sizeof()-based minimum applied to it - confirm
+			uint32_t unused : 14 = 0; // padding, 4+8+6+14 bits add up to 32
+		};
 		
 		//! Returns the original format if supports STORAGE_IMAGE otherwise returns a format in its compat class which supports STORAGE_IMAGE.
 		inline asset::E_FORMAT getOutputViewFormat(const asset::E_FORMAT format)
@@ -66,22 +80,7 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted
 					return compatFormat;
 			}
 		}
-/*
-		struct STask
-		{
-			hlsl::vector<uint8_t,3> preloadWindow; 
-			asset::E_FORMAT inFormat;
-			asset::E_FORMAT outFormat;
-			// default no coverage adjustment
-			uint8_t alphaBinCountLog2 : 4 = 0;
-		};
-		inline void initializeTaskDefault(STask& task) const
-		{
-			auto physDev = m_device->getPhysicalDevice();
-			const auto formatTrait = hlsl::format::getTraits(static_cast<hlsl::format::TexelBlockFormat>(task.outFormat));
-			task.alphaBinCountLog2 = hlsl::max(,task.alphaBinCountLog2);
-		}
-*/
+
 #if 0
 		// @param `alphaBinCount` is only required to size the histogram present in the default nbl_glsl_blit_AlphaStatistics_t in default_compute_common.comp
 		core::smart_refctd_ptr<video::IGPUShader> createAlphaTestSpecializedShader(const asset::IImage::E_TYPE inImageType, const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount);
@@ -666,7 +665,11 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted
 			EBT_COUNT
 		};
 
+		void createAndCachePipelines(CAssetConverter* converter, core::smart_refctd_ptr<IGPUComputePipeline>* pipelines, const std::span<const STask> tasks);
+
 		core::smart_refctd_ptr<ILogicalDevice> m_device;
+		system::logger_opt_smart_ptr m_logger;
+		core::smart_refctd_ptr<asset::IShaderCompiler::CCache> m_shaderCache;
 
 		//! This calculates the inclusive upper bound on the preload region i.e. it will be reachable for some cases. For the rest it will be bigger
 		//! by a pixel in each dimension.
diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp
@@ -5,6 +5,79 @@ using namespace nbl::system;
 using namespace nbl::asset;
 using namespace nbl::video;
 
+// Takes over the device; a null `logger` falls back to the physical device's debug callback logger (if any), a null `cache` to a freshly made one.
+CComputeBlit::CComputeBlit(smart_refctd_ptr<ILogicalDevice>&& logicalDevice, smart_refctd_ptr<IShaderCompiler::CCache>&& cache, smart_refctd_ptr<ILogger>&& logger) : m_device(std::move(logicalDevice)), m_logger(nullptr)
+{
+	if (logger)
+		m_logger = std::move(logger);
+	else if (auto debugCb=m_device->getPhysicalDevice()->getDebugCallback(); debugCb && debugCb->getLogger()) // don't dereference a missing debug callback
+		m_logger = smart_refctd_ptr<system::ILogger>(debugCb->getLogger());
+	
+	if (cache)
+		m_shaderCache = std::move(cache);
+	else
+		m_shaderCache = make_smart_refctd_ptr<IShaderCompiler::CCache>();
+}
+
+// Sketch: queues two (for now null) CPU pipelines per task, runs them through `converter` (also used as the read cache) and writes `tasks.size()` GPU pipelines to `pipelines`.
+void CComputeBlit::createAndCachePipelines(CAssetConverter* converter, smart_refctd_ptr<IGPUComputePipeline>* pipelines, const std::span<const STask> tasks)
+{
+	core::vector<smart_refctd_ptr<ICPUComputePipeline>> cpuPplns;
+	cpuPplns.reserve(tasks.size()*2); // blit + optional coverage normalization for every task
+	const auto& limits = m_device->getPhysicalDevice()->getLimits();
+	uint32_t maxWorkgroupSizeLog2 = 0; // becomes floor(log2(maxComputeWorkGroupInvocations)), what undersized tasks get bumped to
+	for (auto n=limits.maxComputeWorkGroupInvocations; n>1; n>>=1) maxWorkgroupSizeLog2++;
+	for (auto task : tasks) // by value, the defaults get patched up on a local copy
+	{
+		// adjust task default values
+		{
+			if ((0x1u<<task.workgroupSizeLog2)<limits.maxSubgroupSize) // the field is a log2, the limit is not
+				task.workgroupSizeLog2 = maxWorkgroupSizeLog2;
+			bool useFloat16 = false;
+			uint16_t channels = 4;
+			using namespace hlsl::format;
+			if (task.outputFormat!=TexelBlockFormat::TBF_UNKNOWN)
+			{
+				channels = getTraits(task.outputFormat).Channels;
+				const auto precisionAt1 = getFormatPrecision(static_cast<E_FORMAT>(task.outputFormat),3,1.f);
+				const auto precisionAt0 = getFormatPrecision(static_cast<E_FORMAT>(task.outputFormat),3,0.f);
+				if (limits.workgroupMemoryExplicitLayout16BitAccess && limits.shaderFloat16 && precisionAt1>=std::exp2f(-11.f) && precisionAt0>=std::numeric_limits<hlsl::float16_t>::min())
+					useFloat16 = true;
+			}
+			// the absolute minimum needed to store a single pixel
+			const auto singlePixelStorage = channels*(useFloat16 ? sizeof(hlsl::float16_t):sizeof(hlsl::float32_t));
+			// also slightly more memory is needed
+			task.sharedMemoryPerInvocation = core::max(singlePixelStorage*2,task.sharedMemoryPerInvocation);
+		}
+		// create blit pipeline
+		cpuPplns.emplace_back(nullptr);
+		// create optional coverage normalization pipeline
+		cpuPplns.emplace_back(nullptr);
+	}
+
+	CAssetConverter::SInputs inputs = {};
+	inputs.readCache = converter;
+	inputs.logger = m_logger.getRaw();
+	std::get<CAssetConverter::SInputs::asset_span_t<ICPUComputePipeline>>(inputs.assets) = {&cpuPplns.data()->get(),cpuPplns.size()};
+	inputs.readShaderCache = m_shaderCache.get();
+	inputs.writeShaderCache = m_shaderCache.get();
+	// no pipeline cache, because we only make the same pipeline once, ever
+	auto reserveResults = converter->reserve(inputs);
+	assert(reserveResults.getRequiredQueueFlags().value==IQueue::FAMILY_FLAGS::NONE);
+	// copy over the results
+	{
+		auto rIt = reserveResults.getGPUObjects<ICPUComputePipeline>().data();
+		for (size_t i=0; i<tasks.size(); i++) // TODO: redo, two pipelines were queued per task but only tasks.size() get copied
+			*(pipelines++) = (rIt++)->value;
+	}
+	// this just inserts the pipelines into the cache
+	{
+		CAssetConverter::SConvertParams params = {};
+		auto convertResults = reserveResults.convert(params);
+		assert(!convertResults.blocking());
+	}
+}
+
 #if 0
 core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createAlphaTestSpecializedShader(const asset::IImage::E_TYPE imageType, const uint32_t alphaBinCount)
 {
@@ -39,21 +112,14 @@ core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createAlphaTestSpecializ
 		   "}\n";
 
 	auto cpuShader = core::make_smart_refctd_ptr<asset::ICPUShader>(shaderSourceStream.str().c_str(), IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, IGPUShader::E_CONTENT_TYPE::ECT_HLSL, "CComputeBlitGLSLGLSL::createAlphaTestSpecializedShader");
-
-	return  m_device->createShader(std::move(cpuShader.get()));
 }
 
-core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createNormalizationSpecializedShader(const asset::IImage::E_TYPE imageType, const asset::E_FORMAT outFormat,
-	const uint32_t alphaBinCount)
+core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createNormalizationSpecializedShader(const asset::IImage::E_TYPE imageType, const uint32_t alphaBinCount)
 {
 	const auto workgroupDims = getDefaultWorkgroupDims(imageType);
 	const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount);
 	const uint32_t blitDimCount = static_cast<uint32_t>(imageType) + 1;
 
-	const auto castedFormat = getOutImageViewFormat(outFormat);
-	assert(outFormat == castedFormat);
-	const char* formatQualifier = asset::CHLSLCompiler::getStorageImageFormatQualifier(castedFormat);
-
 	std::ostringstream shaderSourceStream;
 
 	shaderSourceStream
@@ -67,7 +133,7 @@ core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createNormalizationSpeci
 		   "[[vk::binding(0, 0)]]\n"
 		   "nbl::hlsl::blit::impl::dim_to_image_properties<ceval_params_t::BlitDimCount>::combined_sampler_t inCS;\n"
 
-		   "[[vk::image_format(\"" << formatQualifier << "\")]]\n"
+		   "[[vk::image_format(\"unknown\")]]\n"
 		   "[[vk::binding(1, 0)]]\n"
 		   "nbl::hlsl::blit::impl::dim_to_image_properties<ceval_params_t::BlitDimCount>::image_t outImg;\n"
 
@@ -90,7 +156,5 @@ core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createNormalizationSpeci
 		   "}\n";
 
 	auto cpuShader = core::make_smart_refctd_ptr<asset::ICPUShader>(shaderSourceStream.str().c_str(), IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, IGPUShader::E_CONTENT_TYPE::ECT_HLSL, "CComputeBlitGLSL::createNormalizationSpecializedShader");
-
-	return m_device->createShader(std::move(cpuShader.get()));
 }
 #endif