Devsh-Graphics-Programming · deprilula28 · Feb 16, 2024 · Feb 16, 2024 · Feb 19, 2024 · Feb 21, 2024
diff --git a/08_HelloSwapchain/main.cpp b/08_HelloSwapchain/main.cpp
@@ -17,6 +17,10 @@ class WindowedApplication : public virtual BasicMultiQueueApplication
 	public:
 		using base_t::base_t;
 
+		// We inherit from an application that tries to find Graphics and Compute queues
+		// because applications with presentable images often want to perform Graphics family operations
+		virtual bool isComputeOnly() const {return false;}
+
 		virtual video::IAPIConnection::SFeatures getAPIFeaturesToEnable() override
 		{
 			auto retval = base_t::getAPIFeaturesToEnable();
@@ -26,22 +30,23 @@ class WindowedApplication : public virtual BasicMultiQueueApplication
 		}
 
 		// New function, we neeed to know about surfaces to create ahead of time
-		virtual core::vector<const video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const = 0;
+		virtual core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const = 0;
 
-		virtual core::set<video::IPhysicalDevice*> filterDevices(const core::SRange<video::IPhysicalDevice* const>& physicalDevices) const
+		// We have a very simple heuristic, the device must be able to render to all windows!
+		// (want to make something more complex? you're on your own!)
+		virtual void filterDevices(core::set<video::IPhysicalDevice*>& physicalDevices) const
 		{
-			const auto firstFilter = base_t::filterDevices(physicalDevices);
+			base_t::filterDevices(physicalDevices);
 
 			video::SPhysicalDeviceFilter deviceFilter = {};
 
-			const auto surfaces = getSurfaces();
-			deviceFilter.requiredSurfaceCompatibilities = surfaces.data();
-			deviceFilter.requiredSurfaceCompatibilitiesCount = surfaces.size();
+			auto surfaces = getSurfaces();
+			deviceFilter.requiredSurfaceCompatibilities = {surfaces};
 
 			return deviceFilter(physicalDevices);
 		}
 
-		virtual bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+		virtual bool onAppInitialized(core::smart_refctd_ptr<system::ISystem>&& system) override
 		{
 			// Remember to call the base class initialization!
 			if (!base_t::onAppInitialized(std::move(system)))
@@ -52,6 +57,7 @@ class WindowedApplication : public virtual BasicMultiQueueApplication
 		#else
 			#error "Unimplemented!"
 		#endif
+			return true;
 		}
 
 		core::smart_refctd_ptr<ui::IWindowManager> m_winMgr;
@@ -87,7 +93,7 @@ class SingleNonResizableWindowApplication : public virtual WindowedApplication
 	public:
 		using base_t::base_t;
 
-		virtual bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+		virtual bool onAppInitialized(core::smart_refctd_ptr<system::ISystem>&& system) override
 		{
 			// Remember to call the base class initialization!
 			if (!base_t::onAppInitialized(std::move(system)))
@@ -98,7 +104,7 @@ class SingleNonResizableWindowApplication : public virtual WindowedApplication
 			return true;
 		}
 
-		virtual core::vector<const video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const
+		virtual core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const
 		{
 			return {{m_surface.get()/*,EQF_NONE*/}};
 		}
@@ -112,15 +118,15 @@ class SingleNonResizableWindowApplication : public virtual WindowedApplication
 		}
 
 	protected:
-		virtual IWindow::SCreationParams getWindowCreationParams() const
+		virtual ui::IWindow::SCreationParams getWindowCreationParams() const
 		{
-			IWindow::SCreationParams params = {};
-			params.callback = make_smart_refctd_ptr<IWindowClosedCallback>();
+			ui::IWindow::SCreationParams params = {};
+			params.callback = core::make_smart_refctd_ptr<IWindowClosedCallback>();
 			params.width = 640;
 			params.height = 480;
 			params.x = 32;
 			params.y = 32;
-			params.flags = IWindow::ECF_NONE;
+			params.flags = ui::IWindow::ECF_NONE;
 			params.windowCaption = "SingleNonResizableWindowApplication";
 			return params;
 		}
@@ -130,6 +136,7 @@ class SingleNonResizableWindowApplication : public virtual WindowedApplication
 };
 }
 
+#include "nbl/video/CVulkanSwapchain.h"
 
 using namespace nbl;
 using namespace core;

diff --git a/67_SubAllocatedDescriptorSet/CMakeLists.txt b/67_SubAllocatedDescriptorSet/CMakeLists.txt
@@ -0,0 +1,24 @@
+include(common RESULT_VARIABLE RES) # RES receives the path of the included file, or NOTFOUND
+if(NOT RES)
+	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+# Declare this example's executable through the project macro (defined outside this file, presumably in common.cmake)
+nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+# Optionally embed everything found under app_resources/ as builtin resources linked into the executable
+if(NBL_EMBED_BUILTIN_RESOURCES)
+	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+	set(RESOURCE_DIR "app_resources")
+	# Absolute paths: where resources are searched for and where the generated sources/headers go
+	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+	# Visit every file under the resource directory (paths relative to it), presumably appending each to RESOURCES_TO_EMBED
+    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+      LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+    endforeach()
+
+	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+
+	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+endif()
diff --git a/67_SubAllocatedDescriptorSet/config.json.template b/67_SubAllocatedDescriptorSet/config.json.template
@@ -0,0 +1,28 @@
+{
+  "enableParallelBuild": true,
+  "threadsPerBuildProcess" : 2,
+  "isExecuted": false,
+  "scriptPath": "",
+  "cmake": {
+    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
+    "buildModes": [],
+    "requiredOptions": []
+  }, 
+  "profiles": [
+    {
+      "backend": "vulkan", // should be none
+      "platform": "windows",
+      "buildModes": [],
+      "runConfiguration": "Release", // we also need to run in Debug and RWDI because foundational example
+      "gpuArchitectures": []
+    }
+  ],
+  "dependencies": [],
+  "data": [
+    {
+      "dependencies": [],
+      "command": [""],
+      "outputs": []
+    }
+  ]
+}
diff --git a/67_SubAllocatedDescriptorSet/main.cpp b/67_SubAllocatedDescriptorSet/main.cpp
@@ -0,0 +1,190 @@
+// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+
+#include "nbl/video/surface/CSurfaceVulkan.h"
+#include "nbl/video/alloc/SubAllocatedDescriptorSet.h"
+
+#include "../common/BasicMultiQueueApplication.hpp"
+#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+
+using namespace nbl;
+using namespace core;
+using namespace system;
+using namespace ui;
+using namespace asset;
+using namespace video;
+
+#include "nbl/builtin/hlsl/bit.hlsl"
+
+// In this application we'll exercise the SubAllocatedDescriptorSet: allocating and freeing ranges of descriptors within a single binding
+class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication
+{
+		using device_base_t = examples::MonoDeviceApplication;
+		using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication;
+
+		// The pool cache is just a formalized way of round-robining command pools and resetting + reusing them after their most recent submit signals finished.
+		// Its a little more ergonomic to use if you don't have a 1:1 mapping between frames and pools.
+		smart_refctd_ptr<nbl::video::ICommandPoolCache> m_poolCache;
+
+		smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet> m_subAllocDescriptorSet;
+
+		// This example really lets the advantages of a timeline semaphore shine through!
+		smart_refctd_ptr<ISemaphore> m_timeline;
+		uint64_t m_iteration = 0;
+		constexpr static inline uint64_t MaxIterations = 200;
+
+		constexpr static inline uint32_t MaxDescriptorSetAllocationAlignment = 64u*1024u; // if you need larger alignments then you're not right in the head
+		constexpr static inline uint32_t MinDescriptorSetAllocationSize = 1u;
+
+	public:
+		// Yay thanks to multiple inheritance we cannot inherit the ctors with a `using` anymore, so the four working directories get forwarded to `system::IApplicationFramework` by hand
+		SubAllocatedDescriptorSetApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+			system::IApplicationFramework(_localInputCWD,_localOutputCWD,_sharedInputCWD,_sharedOutputCWD) {}
+
+		// One-off setup: base class init, the command pool cache, the timeline semaphore and then
+		// a CPU-side smoke test of the descriptor sub-allocator (allocate, free every other range, allocate again).
+		// Returns false as soon as either base class fails to initialize, true otherwise.
+		bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+		{
+			using nbl::video::IGPUDescriptorSetLayout;
+			using address_allocator_t = core::GeneralpurposeAddressAllocator<uint32_t>;
+
+			// Remember to call the base class initialization!
+			// Both bases take the system pointer, so the first call gets a copy and only the last one may consume (move from) it.
+			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+				return false;
+			if (!asset_base_t::onAppInitialized(std::move(system)))
+				return false;
+
+			// We'll allow subsequent iterations to overlap each other on the GPU,
+			// the limiting factor is the number of commandpools we can use simultaneously.
+			constexpr auto MaxConcurrency = 64;
+
+			// Since this time we don't throw the Command Pools away and we'll reset them instead, we don't create the pools with the transient flag
+			m_poolCache = ICommandPoolCache::create(core::smart_refctd_ptr(m_device),getComputeQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::NONE,MaxConcurrency);
+
+			// In contrast to fences, we just need one semaphore to rule all dispatches
+			m_timeline = m_device->createSemaphore(m_iteration);
+
+			// Descriptor set sub allocator
+			// We sub-allocate out of a single binding of 65536 storage image descriptors used by the compute stage
+			video::IGPUDescriptorSetLayout::SBinding bindings[1];
+			{
+				bindings[0].binding = 0;
+				bindings[0].count = 65536u;
+				bindings[0].createFlags = core::bitflag(IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT) 
+					| IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT 
+					| IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT;
+				bindings[0].type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE;
+				bindings[0].stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE;
+			}
+
+			// TODO: I don't think these are needed for sub allocated descriptor sets (alignment isn't needed, and min size is 1)
+			auto subAllocatedDescriptorSet = core::make_smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet>(
+				bindings, MaxDescriptorSetAllocationAlignment, MinDescriptorSetAllocationSize
+			);
+
+			constexpr uint32_t AllocationCount = 512;
+
+			// First round: ask for 512 ranges of 4 descriptors each, every output address starts out invalid
+			std::vector<uint32_t> allocation(AllocationCount,address_allocator_t::invalid_address), size(AllocationCount,4);
+			{
+				subAllocatedDescriptorSet->multi_allocate(0, allocation.size(), allocation.data(), size.data());
+				// Log where every range ended up and make sure no request failed
+				for (uint32_t i = 0; i < allocation.size(); i++)
+				{
+					m_logger->log("allocation[%u]: %u", system::ILogger::ELL_INFO, i, allocation[i]);
+					assert(allocation[i] != address_allocator_t::invalid_address);
+				}
+			}
+
+			// Give back every other range from the first round
+			{
+				std::vector<uint32_t> addr, freeSize;
+				for (uint32_t i = 0; i < AllocationCount; i+=2)
+				{
+					addr.push_back(allocation[i]);
+					freeSize.push_back(4);
+				}
+				subAllocatedDescriptorSet->multi_deallocate(0, addr.size(), addr.data(), freeSize.data());
+			}
+
+			m_logger->log("Freed some allocations", system::ILogger::ELL_INFO);
+
+			// Second round: 512 fresh ranges of 2 descriptors each, requested after the frees above
+			allocation.assign(AllocationCount,address_allocator_t::invalid_address);
+			size.assign(AllocationCount,2);
+			{
+				subAllocatedDescriptorSet->multi_allocate(0, allocation.size(), allocation.data(), size.data());
+				// Log where every range ended up and make sure no request failed
+				for (uint32_t i = 0; i < allocation.size(); i++)
+				{
+					m_logger->log("allocation[%u]: %u", system::ILogger::ELL_INFO, i, allocation[i]);
+					assert(allocation[i] != address_allocator_t::invalid_address);
+				}
+			}
+
+			return true;
+		}
+
+		// We have an actual work loop (maybe just for the sake of future WASM so we don't timeout a Browser Tab with an unresponsive script), it keeps going until `m_iteration` reaches `MaxIterations`
+		bool keepRunning() override { return m_iteration<MaxIterations; }
+
+		// One work loop iteration: record an (empty for now) command buffer from a recycled pool, submit it to the compute queue and signal the timeline
+		void workLoopBody() override
+		{
+			IQueue* const queue = getComputeQueue();
+
+			// Obtain our command pool once one gets recycled
+			uint32_t poolIx;
+			do
+			{
+				poolIx = m_poolCache->acquirePool();
+			} while (poolIx==ICommandPoolCache::invalid_index);
+
+			smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
+			{
+				m_poolCache->getPool(poolIx)->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1},core::smart_refctd_ptr(m_logger));
+				// lets record, its a one time submit because we record a fresh command buffer every iteration
+				cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+				// COMMAND RECORDING
+
+				[[maybe_unused]] const auto recorded = cmdbuf->end();
+				assert(recorded);
+			}
+
+			m_iteration++; // the value signalled by this submit is the already incremented iteration counter
+			{
+				const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo =
+				{
+					.cmdbuf = cmdbuf.get()
+				};
+				const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo =
+				{
+					.semaphore = m_timeline.get(),
+					.value = m_iteration,
+					.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
+				};
+				// We don't wait on any semaphore, if we actually cared about this submit seeing the memory accesses of a previous dispatch we could add a semaphore wait
+				const IQueue::SSubmitInfo submitInfo = {
+					.waitSemaphores = {},
+					.commandBuffers = {&cmdbufInfo,1},
+					.signalSemaphores = {&signalInfo,1}
+				};
+
+				queue->startCapture();
+				[[maybe_unused]] const auto statusCode = queue->submit({ &submitInfo,1 });
+				queue->endCapture();
+				assert(statusCode == IQueue::RESULT::SUCCESS);
+			}
+
+			// Latch the Command Pool's reset and return to the cache onto the value this submit signals, otherwise no pool ever gets recycled and `acquirePool` above spins forever once all have been handed out
+			const ISemaphore::SWaitInfo futureWait = {m_timeline.get(),m_iteration};
+			m_poolCache->releasePool(futureWait,poolIx);
+		}
+};
+
+NBL_MAIN_FUNC(SubAllocatedDescriptorSetApp)
diff --git a/67_SubAllocatedDescriptorSet/pipeline.groovy b/67_SubAllocatedDescriptorSet/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CStreamingAndBufferDeviceAddressBuilder extends IBuilder // NOTE(review): the class name looks carried over from the streaming/BDA example — confirm it is intended here
+{
+	public CStreamingAndBufferDeviceAddressBuilder(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+	// Nothing to prepare, always reports success
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+	// Builds this example's target with `cmake --build`, for the build type and configuration picked by the axis mapping
+	@Override
+  	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+		// NOTE(review): the result of `execute` is not inspected, so this stage reports success regardless — confirm `execute` fails loudly on its own
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+
+		return true
+	}
+	// No tests are run, always reports success
+	@Override
+  	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+	// Nothing to install, always reports success
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info) // factory entry point of this script: wraps the given agent and info in the builder declared above
+{
+	return new CStreamingAndBufferDeviceAddressBuilder(_agent, _info)
+}
+
+return this