diff --git a/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h b/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h
index a7fe07a..4aa432b 100644
--- a/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h
+++ b/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h
@@ -966,6 +966,17 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf
 	compileOptions->release();
 #endif
 
+	// Indirect dispatch is opt-in: its settings are copied only when the caller requests it.
+	if (inputLaunchConfiguration.indirectDispatch != 0) {
+		app->configuration.indirectDispatch = inputLaunchConfiguration.indirectDispatch;
+		// Copy the offset straight from the caller's configuration (an unset value stays 0).
+		app->configuration.indirectBufferOffset = inputLaunchConfiguration.indirectBufferOffset;
+#if(VKFFT_BACKEND==0)
+		// indirectBuffer and indirectHostPointer are declared only in the Vulkan section of VkFFTConfiguration.
+		app->configuration.indirectBuffer = inputLaunchConfiguration.indirectBuffer;
+		app->configuration.indirectHostPointer = inputLaunchConfiguration.indirectHostPointer;
+#endif
+	}
 	resFFT = initializeBluesteinAutoPadding(app);
 	if (resFFT != VKFFT_SUCCESS) {
 		deleteVkFFT(app);
diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_InputOutputLayout.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_InputOutputLayout.h
index 0cc75d1..620c137 100644
--- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_InputOutputLayout.h
+++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_InputOutputLayout.h
@@ -43,8 +43,10 @@ static inline void appendInputLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc
 		if (sc->inputBufferBlockNum == 1) {
 			sc->tempLen = sprintf(sc->tempStr, "\
 layout(std430, binding = %d) buffer DataIn{\n\
-	%s inputs[%" PRIu64 "];\n\
-};\n\n", id, inputMemoryType->name, sc->inputBufferBlockSize / typeSize);
+	%s inputs[];\n\
+};\n\n", id, inputMemoryType->name); // use runtime-sized arrays so that the same shader can be reused for different batch numbers
+
+
 			PfAppendLine(sc);
 		}
 		else {
@@ -70,9 +72,11 @@ static inline void appendOutputLayoutVkFFT(VkFFTSpecializationConstantsLayout* s
 		if (sc->inputBufferBlockNum == 1) {
 			sc->tempLen = sprintf(sc->tempStr, "\
 layout(std430, binding = %d) buffer DataOut{\n\
-	%s outputs[%" PRIu64 "];\n\
-};\n\n", id, outputMemoryType->name, sc->outputBufferBlockSize / typeSize);
-			PfAppendLine(sc);
+	%s outputs[];\n\
+};\n\n", id, outputMemoryType->name); // use runtime-sized arrays so that the same shader can be reused for different batch numbers
+
+
+			PfAppendLine(sc);
 		}
 		else {
 			sc->tempLen = sprintf(sc->tempStr, "\
diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_DispatchPlan.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_DispatchPlan.h
index 2d633d3..5116cf8 100644
--- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_DispatchPlan.h
+++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_DispatchPlan.h
@@ -153,7 +153,24 @@ static inline VkFFTResult VkFFT_DispatchPlan(VkFFTApplication* app, VkFFTAxis* a
 		if (axis->pushConstants.structSize > 0) {
 			vkCmdPushConstants(app->configuration.commandBuffer[0], axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, (uint32_t)axis->pushConstants.structSize, axis->pushConstants.data);
 		}
-		vkCmdDispatch(app->configuration.commandBuffer[0], (uint32_t)dispatchSize[0], (uint32_t)dispatchSize[1], (uint32_t)dispatchSize[2]);
+		// Bit 0 of indirectDispatch enables indirect dispatch for forward transforms, bit 1 for inverse ones.
+		pfUINT indirect_mask = (axis->specializationConstants.inverse) ? 0x2 : 0x1;
+		// Compare against 0 rather than nullptr, and use int rather than bool, so the header also compiles as C.
+		int indirect_dispatch = ((app->configuration.indirectDispatch & indirect_mask) != 0) && (app->configuration.indirectHostPointer != 0);
+		if (indirect_dispatch) {
+			// Records are 16 bytes apart: x/y/z group counts followed by one id word (batchWorkGroup, plus 10000 for an inverse transform).
+			pfUINT indirect_offset = app->configuration.indirectBufferOffset + 16 * app->indirectDispatchID;
+			unsigned int* host_indirect = (unsigned int*)((char*)app->configuration.indirectHostPointer + indirect_offset);
+			host_indirect[0] = (uint32_t)dispatchSize[0];
+			host_indirect[1] = (uint32_t)dispatchSize[1];
+			host_indirect[2] = (uint32_t)dispatchSize[2];
+			host_indirect[3] = (uint32_t)(axis->batchWorkGroup + 10000 * axis->specializationConstants.inverse);
+			vkCmdDispatchIndirect(app->configuration.commandBuffer[0], app->configuration.indirectBuffer, indirect_offset);
+			app->indirectDispatchID++; // NOTE(review): no reset of this counter is visible in this patch - confirm it is cleared before each recording
+		}
+		else {
+			vkCmdDispatch(app->configuration.commandBuffer[0], (uint32_t)dispatchSize[0], (uint32_t)dispatchSize[1], (uint32_t)dispatchSize[2]);
+		}
 #elif(VKFFT_BACKEND==1)
 		void* args[10];
 		CUresult result = CUDA_SUCCESS;
diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_UpdateBuffers.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_UpdateBuffers.h
index dcf7b4d..f005340 100644
--- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_UpdateBuffers.h
+++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_UpdateBuffers.h
@@ -563,6 +563,7 @@ static inline VkFFTResult VkFFTConfigureDescriptorsR2CMultiUploadDecomposition(V
 	descriptorPoolSize.descriptorCount = (uint32_t)(axis->specializationConstants.numBuffersBound[0] + axis->specializationConstants.numBuffersBound[1]);
 #endif
 	if ((axis_id == (app->configuration.FFTdim-1)) && (axis_upload_id == 0) && (app->configuration.performConvolution)) {
+		axis->specializationConstants.convolutionBindingID = (int)axis->numBindings;
 		axis->specializationConstants.numBuffersBound[axis->numBindings] = (int)axis->specializationConstants.kernelBlockNum;
 #if(VKFFT_BACKEND==0)
 		descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum;
@@ -571,6 +572,7 @@ static inline VkFFTResult VkFFTConfigureDescriptorsR2CMultiUploadDecomposition(V
 	}
 
 	if (app->configuration.useLUT == 1) {
+		axis->specializationConstants.LUTBindingID = (int)axis->numBindings;
 		axis->specializationConstants.numBuffersBound[axis->numBindings] = 1;
 #if(VKFFT_BACKEND==0)
 		descriptorPoolSize.descriptorCount++;
@@ -774,6 +776,7 @@ static inline VkFFTResult VkFFTCheckUpdateBufferSet(VkFFTApplication* app, VkFFT
 	return VKFFT_SUCCESS;
 }
 static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, pfUINT axis_id, pfUINT axis_upload_id, pfUINT inverse) {
+
 	if (axis->specializationConstants.performOffsetUpdate || axis->specializationConstants.performBufferSetUpdate) {
 		axis->specializationConstants.inputOffset.type = 31;
 		axis->specializationConstants.outputOffset.type = 31;
@@ -1198,6 +1201,7 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan*
 	return VKFFT_SUCCESS;
 }
 static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, pfUINT axis_id, pfUINT axis_upload_id, pfUINT inverse) {
+
 	if (axis->specializationConstants.performOffsetUpdate || axis->specializationConstants.performBufferSetUpdate) {
 #if(VKFFT_BACKEND==0)
 		const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
@@ -1518,7 +1522,7 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA
 						}
 					}
 				}
-				if ((i == 2) && (app->configuration.performConvolution)) {
+				if ((i == axis->specializationConstants.convolutionBindingID) && (app->configuration.performConvolution)) {
 					if (axis->specializationConstants.performBufferSetUpdate) {
 						pfUINT bufferId = 0;
 						pfUINT offset = j;
@@ -1544,7 +1548,7 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA
 						axis->specializationConstants.kernelOffset.data.i = app->configuration.kernelOffset;
 					}
 				}
-				if ((i == axis->numBindings - 1) && (app->configuration.useLUT == 1)) {
+				if ((i == axis->specializationConstants.LUTBindingID) && (app->configuration.useLUT == 1)) {
 #if(VKFFT_BACKEND==0)
 					if (axis->specializationConstants.performBufferSetUpdate) {
 						descriptorBufferInfo.buffer = axis->bufferLUT;
diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_FFT.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_FFT.h
index 47b4f7e..f61c308 100644
--- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_FFT.h
+++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_FFT.h
@@ -46,7 +46,7 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla
 #elif(VKFFT_BACKEND==5)
 #endif
 	VkFFTAxis* axis = (reverseBluesteinMultiUpload) ? &FFTPlan->inverseBluesteinAxes[axis_id][axis_upload_id] : &FFTPlan->axes[axis_id][axis_upload_id];
-
+	axis->batchWorkGroup = 1;
 	axis->specializationConstants.sourceFFTSize.type = 31;
 	axis->specializationConstants.sourceFFTSize.data.i = app->configuration.size[axis_id];
 	axis->specializationConstants.axis_id = (int)axis_id;
diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_R2C.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_R2C.h
index f5b5e8e..c060cc5 100644
--- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_R2C.h
+++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_R2C.h
@@ -42,6 +42,7 @@ static inline VkFFTResult VkFFTPlanR2CMultiUploadDecomposition(VkFFTApplication*
 #elif(VKFFT_BACKEND==5)
 #endif
 	VkFFTAxis* axis = &FFTPlan->R2Cdecomposition;
+	axis->batchWorkGroup = 2;
 	axis->specializationConstants.sourceFFTSize.type = 31;
 	axis->specializationConstants.sourceFFTSize.data.i = (pfINT)app->configuration.size[0];
 	axis->specializationConstants.numFFTdims = (int)app->configuration.FFTdim;
diff --git a/vkFFT/vkFFT/vkFFT_Structs/vkFFT_Structs.h b/vkFFT/vkFFT/vkFFT_Structs/vkFFT_Structs.h
index d30318f..57ebddc 100644
--- a/vkFFT/vkFFT/vkFFT_Structs/vkFFT_Structs.h
+++ b/vkFFT/vkFFT/vkFFT_Structs/vkFFT_Structs.h
@@ -150,6 +150,8 @@ typedef struct {
 	VkBuffer* inputBuffer;//pointer to array of input buffers (or one buffer) used to read data from if isInputFormatted is enabled
 	VkBuffer* outputBuffer;//pointer to array of output buffers (or one buffer) used for write data to if isOutputFormatted is enabled
 	VkBuffer* kernel;//pointer to array of kernel buffers (or one buffer) used for read kernel data from if performConvolution is enabled
+	VkBuffer indirectBuffer; //buffer that contains workgroup sizes for indirect dispatch. Size should be at least 4 x 4 bytes x the number of dispatches.
+	unsigned int* indirectHostPointer; // pointer to the array with the indirect workgroup sizes on the host side. During dispatch this array will be filled by VkFFT, which can later be updated by user. Format is uint[4] = {x_size, y_size, z_size, id}, with id the axis that contains the batch number (0=x, 1=y, 2=z), plus 10000 if the dispatch concerns an inverse FFT.
 #elif(VKFFT_BACKEND==1)
 	void** buffer;//pointer to device buffer used for computations
 	void** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. Setting to non zero value enables manual user allocation
@@ -186,6 +188,7 @@ typedef struct {
 	pfUINT inputBufferOffset;//specify if VkFFT has to offset the first element position inside the input buffer. In bytes. Default 0
 	pfUINT outputBufferOffset;//specify if VkFFT has to offset the first element position inside the output buffer. In bytes. Default 0
 	pfUINT kernelOffset;//specify if VkFFT has to offset the first element position inside the kernel. In bytes. Default 0
+	pfUINT indirectBufferOffset; //specify if VkFFT has to offset the first element position inside the indirectBuffer. In bytes. Default 0
 	pfUINT specifyOffsetsAtLaunch;//specify if offsets will be selected with launch parameters VkFFTLaunchParams (0 - off, 1 - on). Default 0
 
 	//optional: (default 0 if not stated otherwise)
@@ -321,6 +324,7 @@ typedef struct {
 	MTL::CommandBuffer* commandBuffer;//Filled at app execution
 	MTL::ComputeCommandEncoder* commandEncoder;//Filled at app execution
 #endif
+	pfUINT indirectDispatch; //0 for direct dispatch, 1 for fwd indirect, 2 for inv indirect, 3 for both indirect
 } VkFFTConfiguration;//parameters specified at plan creation
 
 typedef struct {
@@ -1113,6 +1117,7 @@ typedef struct {
 	pfUINT bufferLUTSize;
 	pfUINT bufferRaderUintLUTSize;
 	pfUINT referenceLUT;
+	pfUINT batchWorkGroup;
 } VkFFTAxis;
 
 typedef struct {
@@ -1177,7 +1182,7 @@ typedef struct {
 	pfUINT bufferBluesteinSize[VKFFT_MAX_FFT_DIMENSIONS];
 	void* applicationBluesteinString[VKFFT_MAX_FFT_DIMENSIONS];
 	pfUINT applicationBluesteinStringSize[VKFFT_MAX_FFT_DIMENSIONS];
-
+	pfUINT indirectDispatchID;
 	pfUINT numRaderFFTPrimes;
 	pfUINT rader_primes[30];
 	pfUINT rader_buffer_size[30];