Skip to content

Commit e4a4388

Browse files
authored
Merge pull request #394 from Devsh-Graphics-Programming/glsl_define_injection
GLSL define injection update for limits & features
2 parents 631d872 + fe2fd21 commit e4a4388

File tree

12 files changed

+654
-108
lines changed

12 files changed

+654
-108
lines changed

include/nbl/builtin/glsl/property_pool/copy.comp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#version 440 core
2-
layout(local_size_x=NBL_LIMIT_MAX_OPTIMALLY_RESIDENT_WORKGROUP_INVOCATIONS) in;
2+
layout(local_size_x=NBL_GLSL_LIMIT_MAX_OPTIMALLY_RESIDENT_WORKGROUP_INVOCATIONS) in;
33

44
#include "nbl/builtin/glsl/property_pool/transfer.glsl"
55

@@ -68,7 +68,7 @@ void main()
6868
const uint dstIndexOffset = transfer.dstIndexOffset-indicesToSkip;
6969
// set up loop
7070
const uint DWORDs = min(transfer.elementCount*propDWORDs,pc.endDWORD);
71-
const uint dispatchSize = gl_NumWorkGroups[0]*NBL_LIMIT_MAX_OPTIMALLY_RESIDENT_WORKGROUP_INVOCATIONS;
71+
const uint dispatchSize = gl_NumWorkGroups[0]*NBL_GLSL_LIMIT_MAX_OPTIMALLY_RESIDENT_WORKGROUP_INVOCATIONS;
7272
//
7373
const bool fill = bool(transfer.propertyDWORDsize_flags&(NBL_BUILTIN_PROPERTY_POOL_TRANSFER_EF_SRC_FILL<<flagsBitOffset));
7474
if (fill)

include/nbl/builtin/glsl/skinning/cache_update.comp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#version 440 core
2-
layout(local_size_x=NBL_LIMIT_MAX_OPTIMALLY_RESIDENT_WORKGROUP_INVOCATIONS) in;
2+
layout(local_size_x=NBL_GLSL_LIMIT_MAX_OPTIMALLY_RESIDENT_WORKGROUP_INVOCATIONS) in;
33

44
#include <nbl/builtin/glsl/skinning/cache_descriptor_set.glsl>
55
#include <nbl/builtin/glsl/skinning/update_descriptor_set.glsl>
@@ -15,7 +15,7 @@ void main()
1515
return;
1616

1717
const uint totalJointCount = jointCountInclPrefixSum[skinCount-1u];
18-
const uint dispatchSize = NBL_LIMIT_MAX_OPTIMALLY_RESIDENT_WORKGROUP_INVOCATIONS*gl_NumWorkGroups[0];
18+
const uint dispatchSize = NBL_GLSL_LIMIT_MAX_OPTIMALLY_RESIDENT_WORKGROUP_INVOCATIONS*gl_NumWorkGroups[0];
1919
for (uint jointIndex=gl_GlobalInvocationID.x; jointIndex<totalJointCount; jointIndex+=dispatchSize)
2020
{
2121
// TODO: implement via https://moderngpu.github.io/sortedsearch.html, find the upper and lower bounds of the workgroup, then go to town with sorted search

include/nbl/builtin/glsl/subgroup/basic_portability.glsl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,9 @@
1717
#define nbl_glsl_MinSubgroupSizeLog2 2
1818
#define nbl_glsl_MinSubgroupSize (0x1<<nbl_glsl_MinSubgroupSizeLog2)
1919

20-
#ifdef NBL_IMPL_GL_NV_shader_thread_group
20+
#ifdef NBL_GLSL_IMPL_GL_NV_shader_thread_group
2121
#define nbl_glsl_MaxSubgroupSizeLog2 5
22-
#elif defined(NBL_IMPL_GL_AMD_gcn_shader)||defined(NBL_IMPL_GL_ARB_shader_ballot)
22+
#elif defined(NBL_GLSL_IMPL_GL_AMD_gcn_shader)||defined(NBL_GLSL_IMPL_GL_ARB_shader_ballot)
2323
#define nbl_glsl_MaxSubgroupSizeLog2 6
2424
#else
2525
#define nbl_glsl_MaxSubgroupSizeLog2 7

include/nbl/builtin/glsl/transform_tree/global_transform_update_common.glsl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
layout(local_size_x=NBL_LIMIT_MAX_OPTIMALLY_RESIDENT_WORKGROUP_INVOCATIONS) in;
1+
layout(local_size_x=NBL_GLSL_LIMIT_MAX_OPTIMALLY_RESIDENT_WORKGROUP_INVOCATIONS) in;
22

33
#define NBL_GLSL_TRANSFORM_TREE_POOL_NODE_RECOMPUTED_TIMESTAMP_DESCRIPTOR_QUALIFIERS coherent restrict
44
#define NBL_GLSL_TRANSFORM_TREE_POOL_NODE_GLOBAL_TRANSFORM_DESCRIPTOR_QUALIFIERS coherent restrict
@@ -66,7 +66,7 @@ void main()
6666
#define NBL_GLSL_TRANSFORM_TREE_STACK_SIZE (NBL_GLSL_TRANSFORM_TREE_MAX_DEPTH-1)
6767
uint stack[NBL_GLSL_TRANSFORM_TREE_STACK_SIZE];
6868

69-
const uint dispatchSize = NBL_LIMIT_MAX_OPTIMALLY_RESIDENT_WORKGROUP_INVOCATIONS*gl_NumWorkGroups[0];
69+
const uint dispatchSize = NBL_GLSL_LIMIT_MAX_OPTIMALLY_RESIDENT_WORKGROUP_INVOCATIONS*gl_NumWorkGroups[0];
7070
for (uint nodeID=gl_GlobalInvocationID.x; nodeID<nodesToUpdate.count; nodeID+=dispatchSize)
7171
{
7272
int stackPtr = 0;

include/nbl/builtin/glsl/transform_tree/relative_transform_update.comp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#version 440 core
2-
layout(local_size_x=NBL_LIMIT_MAX_OPTIMALLY_RESIDENT_WORKGROUP_INVOCATIONS) in;
2+
layout(local_size_x=NBL_GLSL_LIMIT_MAX_OPTIMALLY_RESIDENT_WORKGROUP_INVOCATIONS) in;
33

44
// disable descriptors we dont need
55
#define NBL_GLSL_TRANSFORM_TREE_POOL_NODE_PARENT_DESCRIPTOR_DECLARED
@@ -12,7 +12,7 @@ layout(local_size_x=NBL_LIMIT_MAX_OPTIMALLY_RESIDENT_WORKGROUP_INVOCATIONS) in;
1212

1313
void main()
1414
{
15-
const uint dispatchSize = NBL_LIMIT_MAX_OPTIMALLY_RESIDENT_WORKGROUP_INVOCATIONS*gl_NumWorkGroups[0];
15+
const uint dispatchSize = NBL_GLSL_LIMIT_MAX_OPTIMALLY_RESIDENT_WORKGROUP_INVOCATIONS*gl_NumWorkGroups[0];
1616
for (uint nodeID=gl_GlobalInvocationID.x; nodeID<relativeTransformModificationRequestRanges.rangeCount; nodeID+=dispatchSize)
1717
{
1818
const nbl_glsl_transform_tree_modification_request_range_t requestRange = relativeTransformModificationRequestRanges.data[nodeID];

include/nbl/builtin/glsl/utils/indirect_commands.glsl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ struct nbl_glsl_DispatchIndirectCommand_t
3333
uint nbl_glsl_utils_computeOptimalPersistentWorkgroupDispatchSize(in uint elementCount, in uint workgroupSize, in uint workgroupSpinningProtection)
3434
{
3535
const uint infinitelyWideDeviceWGCount = (elementCount-1u)/(workgroupSize*workgroupSpinningProtection)+1u;
36-
return min(infinitelyWideDeviceWGCount,NBL_LIMIT_MAX_RESIDENT_INVOCATIONS/NBL_LIMIT_MAX_OPTIMALLY_RESIDENT_WORKGROUP_INVOCATIONS);
36+
return min(infinitelyWideDeviceWGCount,NBL_GLSL_LIMIT_MAX_RESIDENT_INVOCATIONS/NBL_GLSL_LIMIT_MAX_OPTIMALLY_RESIDENT_WORKGROUP_INVOCATIONS);
3737
}
3838
uint nbl_glsl_utils_computeOptimalPersistentWorkgroupDispatchSize(in uint elementCount, in uint workgroupSize)
3939
{

include/nbl/video/SPhysicalDeviceFeatures.h

Lines changed: 95 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,75 @@ enum E_SWAPCHAIN_MODE : uint32_t
1212
/* TODO: KHR_swapchain if SURFACE or DISPLAY flag present & KHR_display_swapchain if DISPLAY flag present */
1313
};
1414

15+
//! [TODOS]:
16+
//!
17+
//! ## LogicalDevice creation enabled features shouldn't necessarily equal the ones it reports as enabled (superset)
18+
//!
19+
//! Basically what I'd imagine the usage of the API to be like.
20+
//!
21+
//! **RARE: Creating a logical device with all advertised features/extensions:**
22+
//! ```cpp
23+
//! auto features = physicalDevice->getFeatures();
24+
//!
25+
//! ILogicalDevice::SCreationParams params = {};
26+
//! params.queueParamsCount = ; // set queue stuff
27+
//! params.queueParams = ; // set queue stuff
28+
//! params.enabledFeatures = features;
29+
//! auto device = physicalDevice->createLogicalDevice(params);
30+
//! ```
31+
//!
32+
//! **FREQUENT: Choosing a physical device with the features**
33+
//! ```cpp
34+
//! IPhysicalDevice::SRequiredProperties props = {}; // default initializes to apiVersion=1.1, deviceType = ET_UNKNOWN, pipelineCacheUUID = '\0', device UUID=`\0`, driverUUID=`\0`, deviceLUID=`\0`, deviceNodeMask= ~0u, driverID=UNKNOWN
35+
//! // example of particular config
36+
//! props.apiVersion = 1.2;
37+
//! props.deviceTypeMask = ~IPhysicalDevice::ET_CPU; // would be good to turn the enum into a mask
38+
//! props.driverIDMask = ~(EDI_AMD_PROPRIETARY|EDI_INTEL_PROPRIETARY_WINDOWS); // would be good to turn the enum into a mask
39+
//! props.conformanceVersion = 1.2;
40+
//!
41+
//! SDeviceFeatures requiredFeatures = {};
42+
//! requiredFeatures.rayQuery = true;
43+
//!
44+
//! SDeviceLimits minimumLimits = {}; // would default initialize to worst possible values (small values for maximum sizes, large values for alignments, etc.)
45+
//!
46+
//! // TODO: later add some stuff for requiring queue families, formats and minimum memory heap sizes
47+
//!
48+
//! auto physicalDeviceCandidates = api->getCompatiblePhysicalDevices(props,requiredFeatures,minimumLimits,numSwapchains,supportedSwapchains,/*optional: would enforce tighter checks to actually accept compatibility, like formats, present modes and surface caps*/swapchainSupportDecider);
49+
//! if (physicalDeviceCandidates.empty())
50+
//! {
51+
//! logError();
52+
//! exit();
53+
//! }
54+
//!
55+
//! // TODO: later iterate through candidate devices (fulfilling all the required criteria) to find the "best" one
56+
//! // std::sort(physicalDeviceCandidates.begin(),physicalDeviceCandidates.end(),SDefaultPhysicalDeviceOrder());
57+
//! auto physicalDevice = physicalDeviceCandidates.begin();
58+
//! assert(requiredFeatures < physicalDevice->getFeatures());
59+
//! assert(minimumLimits < physicalDevice->getLimits());
60+
//!
61+
//! ILogicalDevice::SCreationParams params = {};
62+
//! params.queueParamsCount = ; // set queue stuff
63+
//! params.queueParams = ; // set queue stuff
64+
//! params.enabledFeatures = requiredFeatures;
65+
//! auto device = physicalDevice->createLogicalDevice(params);
66+
//! // this would be wrong, because during device creation we would enable additional features either due to:
67+
//! // - dependencies (like buffer address for raytracing)
68+
//! // - backend force-enabling them (like in OpenGL, where extensions are just enabled, you have no choice)
69+
//! // assert(requiredFeatures != device->getEnabledFeatures());
70+
//! assert(requiredFeatures < device->getEnabledFeatures());
71+
//! ```
72+
//!
73+
//! ### `SDeviceFeatures` and `SDeviceLimits` should have a `operator<`
74+
//!
75+
//! Basically to let us establish if features or limits are a superset of the requested.
76+
//!
77+
//! If you need a `!=` operator then define it as
78+
//! ```cpp
79+
//! inline bool operator!=(const& other) const
80+
//! {
81+
//! return *this < other || other < *this;
82+
//! }
83+
//! ```
1584
struct SPhysicalDeviceFeatures
1685
{
1786
/* Vulkan 1.0 Core */
@@ -212,9 +281,9 @@ struct SPhysicalDeviceFeatures
212281
/* Vulkan Extensions */
213282

214283
/* RasterizationOrderAttachmentAccessFeaturesARM *//* VK_ARM_rasterization_order_attachment_access */
215-
bool rasterizationOrderColorAttachmentAccess;
216-
bool rasterizationOrderDepthAttachmentAccess;
217-
bool rasterizationOrderStencilAttachmentAccess;
284+
bool rasterizationOrderColorAttachmentAccess = false;
285+
bool rasterizationOrderDepthAttachmentAccess = false;
286+
bool rasterizationOrderStencilAttachmentAccess = false;
218287

219288
// [DO NOT EXPOSE] Enables certain formats in Vulkan, we just enable them if available or else we need to make format support query functions in LogicalDevice as well
220289
/* 4444FormatsFeaturesEXT *//* VK_EXT_4444_formats */
@@ -421,7 +490,7 @@ struct SPhysicalDeviceFeatures
421490
bool rayTracingMotionBlurPipelineTraceRaysIndirect = false;
422491

423492
/* CoverageReductionModeFeaturesNV *//* VK_NV_coverage_reduction_mode */
424-
bool coverageReductionMode;
493+
bool coverageReductionMode = false;
425494

426495
/* DeviceGeneratedCommandsFeaturesNV *//* VK_NV_device_generated_commands */
427496
bool deviceGeneratedCommands = false;
@@ -643,7 +712,25 @@ struct SPhysicalDeviceFeatures
643712
// [TODO LATER] Won't expose for now, API changes necessary
644713
/* VK_AMD_texture_gather_bias_lod */
645714

646-
// [TODO LATER] when released in the SDK: https://github.com/Devsh-Graphics-Programming/Nabla/pull/357#discussion_r916899420
715+
// [TODO LATER] when released in the SDK:
716+
// - Support for `GLSL_EXT_ray_cull_mask`, let's call it `rayCullMask`
717+
// - new pipeline stage and access masks but only in `KHR_synchronization2` which we don't use
718+
// - two new acceleration structure query parameters
719+
// - `rayTracingPipelineTraceRaysIndirect2` feature, same as `rayTracingPipelineTraceRaysIndirect` but with indirect SBT and dispatch dimensions
720+
//
721+
// Let's have
722+
// ```cpp
723+
// bool accelerationStructureSizeAndBLASPointersQuery = false;
724+
//
725+
// // Do not expose, we don't use KHR_synchronization2 yet
726+
// //bool accelerationStructureCopyStageAndSBTAccessType;
727+
//
728+
// bool rayCullMask = false;
729+
//
730+
// bool rayTracingPipelineTraceRaysIndirectDimensionsAndSBT = false;
731+
// ```
732+
//
733+
// Let's enable `rayTracingMaintenance1` and `rayTracingPipelineTraceRaysIndirect2` whenever required by the above.
647734
/* VK_KHR_ray_tracing_maintenance1 *//* added in vk 1.3.213, the SDK isn't released yet at this moment :D */
648735

649736
// [TODO LATER] requires extra API work to use
@@ -714,6 +801,9 @@ struct SPhysicalDeviceFeatures
714801
// [TODO] Always enable, expose as limit
715802
/* VK_KHR_spirv_1_4 */
716803

804+
// [TODO] handle with a single num
805+
/* VK_KHR_display_swapchain */
806+
717807
// [TODO LATER] (When it has documentation): Always enable, expose as limit
718808
/* VK_AMD_gpu_shader_half_float_fetch */
719809

@@ -951,7 +1041,6 @@ struct SPhysicalDeviceFeatures
9511041
// [TODO] Triage leftover extensions below
9521042

9531043
/* VK_NV_present_barrier */
954-
/* VK_KHR_display_swapchain */
9551044
/* VK_EXT_queue_family_foreign */
9561045
/* VK_EXT_separate_stencil_usage */
9571046
/* VK_KHR_create_renderpass2 */

include/nbl/video/SPhysicalDeviceLimits.h

Lines changed: 16 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,11 @@ struct SPhysicalDeviceLimits
182182
E_TRI_BOOLEAN shaderRoundingModeRTZFloat32 = ETB_DONT_KNOW;
183183
E_TRI_BOOLEAN shaderRoundingModeRTZFloat64 = ETB_DONT_KNOW;
184184

185-
// or VK_EXT_descriptor_indexing:
185+
// expose in 2 phases
186+
// - Update After Bind and nonUniformEXT shader qualifier:
187+
// Descriptor Lifetime Tracking PR #345 will do this, cause I don't want to rewrite the tracking system again.
188+
// - Actual Descriptor Indexing:
189+
// The whole 512k descriptor limits, runtime desc arrays, etc. will come later
186190
uint32_t maxUpdateAfterBindDescriptorsInAllPools = ~0u;
187191
bool shaderUniformBufferArrayNonUniformIndexingNative = false;
188192
bool shaderSampledImageArrayNonUniformIndexingNative = false;
@@ -211,9 +215,6 @@ struct SPhysicalDeviceLimits
211215
bool filterMinmaxSingleComponentFormats = false;
212216
bool filterMinmaxImageComponentMapping = false;
213217

214-
core::bitflag<asset::IImage::E_SAMPLE_COUNT_FLAGS> framebufferIntegerColorSampleCounts = asset::IImage::E_SAMPLE_COUNT_FLAGS(0u);
215-
216-
217218
/* Vulkan 1.3 Core */
218219

219220
// or VK_EXT_subgroup_size_control:
@@ -380,26 +381,6 @@ struct SPhysicalDeviceLimits
380381
//uint32_t maxDescriptorSetInlineUniformBlocks;
381382
//uint32_t maxDescriptorSetUpdateAfterBindInlineUniformBlocks;
382383

383-
// [DO NOT EXPOSE] We will never expose this vendor specific meta-data (no new feature) to the user, but might use the extension to provide some cross platform meta-info in the Nabla section
384-
/* ShaderCoreProperties2AMD *//* provided by VK_AMD_shader_core_properties2 */
385-
//VkShaderCorePropertiesFlagsAMD shaderCoreFeatures;
386-
//uint32_t activeComputeUnitCount;
387-
/* ShaderCorePropertiesAMD *//* provided by VK_AMD_shader_core_properties */
388-
//uint32_t shaderEngineCount;
389-
//uint32_t shaderArraysPerEngineCount;
390-
//uint32_t computeUnitsPerShaderArray;
391-
//uint32_t simdPerComputeUnit;
392-
//uint32_t wavefrontsPerSimd;
393-
//uint32_t wavefrontSize;
394-
//uint32_t sgprsPerSimd;
395-
//uint32_t minSgprAllocation;
396-
//uint32_t maxSgprAllocation;
397-
//uint32_t sgprAllocationGranularity;
398-
//uint32_t vgprsPerSimd;
399-
//uint32_t minVgprAllocation;
400-
//uint32_t maxVgprAllocation;
401-
//uint32_t vgprAllocationGranularity;
402-
403384
// [DO NOT EXPOSE] right now, no idea if we'll ever expose and implement those but they'd all be false for OpenGL
404385
/* BlendOperationAdvancedPropertiesEXT *//* provided by VK_EXT_blend_operation_advanced */
405386
//uint32_t advancedBlendMaxColorAttachments;
@@ -553,6 +534,14 @@ struct SPhysicalDeviceLimits
553534
//! uint32_t maxVertexInputAttributeOffset;
554535
//! uint32_t maxVertexInputBindingStride;
555536

537+
/*
538+
- Spec states minimum supported value should be at least ESCF_1_BIT
539+
- it might be different for each integer format, best way is to query your integer format from physical device using vkGetPhysicalDeviceImageFormatProperties and get the sampleCounts
540+
https://www.khronos.org/registry/vulkan/specs/1.3-extensions/man/html/VkImageFormatProperties.html
541+
*/
542+
// [DO NOT EXPOSE] because it might be different for every texture format and usage
543+
// core::bitflag<asset::IImage::E_SAMPLE_COUNT_FLAGS> framebufferIntegerColorSampleCounts = asset::IImage::E_SAMPLE_COUNT_FLAGS(0u);
544+
556545
/* Always enabled, reported as limits */
557546
bool shaderOutputViewportIndex = false; // ALIAS: VK_EXT_shader_viewport_index_layer
558547
bool shaderOutputLayer = false; // ALIAS: VK_EXT_shader_viewport_index_layer
@@ -584,9 +573,9 @@ struct SPhysicalDeviceLimits
584573
bool postDepthCoverage = false; /* VK_EXT_post_depth_coverage */
585574
bool shaderStencilExport = false; /* VK_EXT_shader_stencil_export */
586575
bool decorateString = false; /* VK_GOOGLE_decorate_string */
587-
bool externalFence = false; /* VK_KHR_external_fence_fd */ /* VK_KHR_external_fence_win32 */
588-
bool externalMemory = false; /* VK_KHR_external_memory_fd */ /* VK_KHR_external_memory_win32 */
589-
bool externalSemaphore = false; /* VK_KHR_external_semaphore_fd */ /* VK_KHR_external_semaphore_win32 */
576+
bool externalFence = false; /* VK_KHR_external_fence_fd */ /* VK_KHR_external_fence_win32 */ // [TODO] requires instance extensions, add them
577+
bool externalMemory = false; /* VK_KHR_external_memory_fd */ /* VK_KHR_external_memory_win32 */ // [TODO] requires instance extensions, add them
578+
bool externalSemaphore = false; /* VK_KHR_external_semaphore_fd */ /* VK_KHR_external_semaphore_win32 */ // [TODO] requires instance extensions, add them
590579
bool shaderNonSemanticInfo = false; /* VK_KHR_shader_non_semantic_info */
591580
bool fragmentShaderBarycentric = false; /* VK_KHR_fragment_shader_barycentric */
592581
bool geometryShaderPassthrough = false; /* VK_NV_geometry_shader_passthrough */

0 commit comments

Comments
 (0)