|
| 1 | +#define _IRR_STATIC_LIB_ |
#include <algorithm>
#include <fstream>
#include <sstream>
#include <string>
#include <type_traits>
#include <vector>

#include <irrlicht.h>

#include "../../source/Irrlicht/COpenGLDriver.h"
| 5 | + |
| 6 | + |
| 7 | +using namespace irr; |
| 8 | +using namespace core; |
| 9 | +using namespace video; |
| 10 | +using namespace asset; |
| 11 | + |
| 12 | +//workgroup methods - since there are no subgroup methods for bit count |
| 13 | + |
// Returns the least-significant bit of `input`, i.e. 0 for even and 1 for odd values.
// NOTE(review): despite the name this is not a population count of `input`;
// presumably it models a single invocation's contribution to a ballot - confirm
// against the shaders before relying on it.
uint32_t bitCount(uint32_t input)
{
    constexpr uint32_t lowestBitMask = 0x1u;
    const uint32_t lowestBit = input & lowestBitMask;
    return lowestBit;
}
| 18 | + |
| 19 | +struct emulatedWorkgroupReduction |
| 20 | +{ |
| 21 | + inline void operator()(uint32_t* outputData, uint32_t workgroupSize, uint32_t subgroupSize) |
| 22 | + { |
| 23 | + uint32_t bitC = 0; |
| 24 | + /*for (auto i=1u; i<workgroupSize; i++) |
| 25 | + bitC += bitCount<uint32_t>(1);*/ |
| 26 | + std::fill(outputData,outputData+workgroupSize,bitC); |
| 27 | + } |
| 28 | + |
| 29 | + _IRR_STATIC_INLINE_CONSTEXPR const char* name = "workgroup reduction"; |
| 30 | +}; |
| 31 | +struct emulatedWorkgroupScanExclusive |
| 32 | +{ |
| 33 | + inline void operator()(uint32_t* outputData, uint32_t workgroupSize, uint32_t subgroupSize) |
| 34 | + { |
| 35 | + uint32_t bitC = 0; |
| 36 | + //outputData[0u] = OP::IdentityElement; |
| 37 | + //for (auto i=1u; i<workgroupSize; i++) |
| 38 | + // outputData[i] = OP()(outputData[i-1u],workgroupData[i-1u]); |
| 39 | + //uint32_t bitC = 0; |
| 40 | + //for (auto i = 1u; i < workgroupSize; i++) |
| 41 | + // bitC += bitCount<uint32_t>(1); |
| 42 | + std::fill(outputData, outputData + workgroupSize, bitC); |
| 43 | + |
| 44 | + |
| 45 | + } |
| 46 | + |
| 47 | + _IRR_STATIC_INLINE_CONSTEXPR const char* name = "workgroup exclusive scan"; |
| 48 | +}; |
| 49 | +struct emulatedWorkgroupScanInclusive |
| 50 | +{ |
| 51 | + |
| 52 | + inline void operator()(uint32_t* outputData, uint32_t workgroupSize, uint32_t subgroupSize) |
| 53 | + { |
| 54 | + uint32_t bitC = 0; |
| 55 | + /* outputData[0u] = workgroupData[0u]; |
| 56 | + for (auto i=1u; i<workgroupSize; i++) |
| 57 | + outputData[i] = OP()(outputData[i-1u],workgroupData[i]);*/ |
| 58 | + std::fill(outputData, outputData + workgroupSize, bitC); |
| 59 | + |
| 60 | + } |
| 61 | + |
| 62 | + _IRR_STATIC_INLINE_CONSTEXPR const char* name = "workgroup inclusive scan"; |
| 63 | +}; |
| 64 | + |
| 65 | + |
| 66 | +#include "common.glsl" |
| 67 | +constexpr uint32_t kBufferSize = BUFFER_DWORD_COUNT*sizeof(uint32_t); |
| 68 | + |
| 69 | +//returns true if result matches |
| 70 | +template<class Arithmetic> |
| 71 | +bool validateResults(video::IVideoDriver* driver, const uint32_t workgroupSize, const uint32_t workgroupCount, video::IGPUBuffer* bufferToDownload) |
| 72 | +{ |
| 73 | + constexpr uint64_t timeoutInNanoSeconds = 15000000000u; |
| 74 | + const uint32_t alignment = sizeof(uint32_t); |
| 75 | + auto downloadStagingArea = driver->getDefaultDownStreamingBuffer(); |
| 76 | + auto downBuffer = downloadStagingArea->getBuffer(); |
| 77 | + |
| 78 | + |
| 79 | + bool success = false; |
| 80 | + |
| 81 | + |
| 82 | + uint32_t address = std::remove_pointer<decltype(downloadStagingArea)>::type::invalid_address; |
| 83 | + auto unallocatedSize = downloadStagingArea->multi_alloc(1u, &address, &kBufferSize, &alignment); |
| 84 | + if (unallocatedSize) |
| 85 | + { |
| 86 | + os::Printer::log("Could not download the buffer from the GPU!", ELL_ERROR); |
| 87 | + return false; |
| 88 | + } |
| 89 | + driver->copyBuffer(bufferToDownload, downBuffer, 0, address, kBufferSize); |
| 90 | + |
| 91 | + auto downloadFence = driver->placeFence(true); |
| 92 | + auto result = downloadFence->waitCPU(timeoutInNanoSeconds, true); |
| 93 | + if (result != video::E_DRIVER_FENCE_RETVAL::EDFR_TIMEOUT_EXPIRED && result != video::E_DRIVER_FENCE_RETVAL::EDFR_FAIL) |
| 94 | + { |
| 95 | + success = true; |
| 96 | + |
| 97 | + if (downloadStagingArea->needsManualFlushOrInvalidate()) |
| 98 | + driver->invalidateMappedMemoryRanges({ {downloadStagingArea->getBuffer()->getBoundMemory(),address,kBufferSize} }); |
| 99 | + |
| 100 | + auto dataFromBuffer = reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(downloadStagingArea->getBufferPointer())+address); |
| 101 | + |
| 102 | + // now check if the data obtained has valid values |
| 103 | + constexpr uint32_t subgroupSize = 4u; |
| 104 | + uint32_t* tmp = new uint32_t[workgroupSize]; |
| 105 | + for (uint32_t workgroupID=0u; success&&workgroupID<workgroupCount; workgroupID++) |
| 106 | + { |
| 107 | + const auto workgroupOffset = workgroupID*workgroupSize; |
| 108 | + Arithmetic()(tmp,workgroupSize,subgroupSize); |
| 109 | + for (uint32_t localInvocationIndex=0u; localInvocationIndex<workgroupSize; localInvocationIndex++) |
| 110 | + if (tmp[localInvocationIndex]!=dataFromBuffer[workgroupOffset+localInvocationIndex]) |
| 111 | + { |
| 112 | + os::Printer::log("Failed test #" + std::to_string(workgroupSize) + " (" + Arithmetic::name + ")", ELL_ERROR); |
| 113 | + success = false; |
| 114 | + break; |
| 115 | + } |
| 116 | + } |
| 117 | + delete[] tmp; |
| 118 | + } |
| 119 | + else |
| 120 | + os::Printer::log("Could not download the buffer from the GPU, fence not signalled!", ELL_ERROR); |
| 121 | + |
| 122 | + downloadStagingArea->multi_free(1u, &address, &kBufferSize, nullptr); |
| 123 | + return success; |
| 124 | + |
| 125 | +} |
| 126 | +template<class Arithmetic> |
| 127 | +bool runTest(video::IVideoDriver* driver, video::IGPUComputePipeline* pipeline, const video::IGPUDescriptorSet* ds, const uint32_t workgroupSize, core::smart_refctd_ptr<IGPUBuffer> buffer) |
| 128 | +{ |
| 129 | + if (pipeline == nullptr) return false; //code could not be compiled |
| 130 | + driver->bindComputePipeline(pipeline); |
| 131 | + driver->bindDescriptorSets(video::EPBP_COMPUTE,pipeline->getLayout(),0u,1u,&ds,nullptr); |
| 132 | + const uint32_t workgroupCount = BUFFER_DWORD_COUNT/workgroupSize; |
| 133 | + driver->dispatch(workgroupCount, 1, 1); |
| 134 | + video::COpenGLExtensionHandler::extGlMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT|GL_SHADER_STORAGE_BARRIER_BIT); |
| 135 | + //check results |
| 136 | + bool passed = validateResults<Arithmetic>(driver, workgroupSize, workgroupCount, buffer.get()); |
| 137 | + return passed; |
| 138 | +} |
| 139 | + |
| 140 | +int main() |
| 141 | +{ |
| 142 | + irr::SIrrlichtCreationParameters params; |
| 143 | + params.Bits = 24; |
| 144 | + params.ZBufferBits = 24; //we'd like 32bit here |
| 145 | + params.DriverType = video::EDT_OPENGL; //! Only Well functioning driver, software renderer left for sake of 2D image drawing |
| 146 | + params.WindowSize = dimension2d<uint32_t>(1280, 720); |
| 147 | + params.Fullscreen = false; |
| 148 | + params.Vsync = true; //! If supported by target platform |
| 149 | + params.Doublebuffer = true; |
| 150 | + params.Stencilbuffer = false; //! This will not even be a choice soon |
| 151 | + params.StreamingDownloadBufferSize = kBufferSize; |
| 152 | + auto device = createDeviceEx(params); |
| 153 | + |
| 154 | + if (!device) |
| 155 | + return 1; // could not create selected driver. |
| 156 | + |
| 157 | + video::IVideoDriver* driver = device->getVideoDriver(); |
| 158 | + io::IFileSystem* filesystem = device->getFileSystem(); |
| 159 | + asset::IAssetManager* am = device->getAssetManager(); |
| 160 | + |
| 161 | + //buffer with results from the gpu |
| 162 | + core::smart_refctd_ptr<IGPUBuffer> buffer= driver->createDeviceLocalGPUBufferOnDedMem(kBufferSize); |
| 163 | + |
| 164 | + |
| 165 | + IGPUDescriptorSetLayout::SBinding binding = { 0u,EDT_STORAGE_BUFFER,1u,IGPUSpecializedShader::ESS_COMPUTE,nullptr }; |
| 166 | + auto gpuDSLayout = driver->createGPUDescriptorSetLayout(&binding, &binding + 1); |
| 167 | + constexpr uint32_t pushconstantSize = 12; |
| 168 | + SPushConstantRange pcRange[1] = { IGPUSpecializedShader::ESS_COMPUTE,0u,pushconstantSize }; |
| 169 | + auto pipelineLayout = driver->createGPUPipelineLayout(pcRange, pcRange + pushconstantSize, core::smart_refctd_ptr(gpuDSLayout)); |
| 170 | + |
| 171 | + auto descriptorSet = driver->createGPUDescriptorSet(core::smart_refctd_ptr(gpuDSLayout)); |
| 172 | + { |
| 173 | + IGPUDescriptorSet::SDescriptorInfo info; |
| 174 | + info.desc = buffer; |
| 175 | + info.buffer = { 0u,kBufferSize }; |
| 176 | + |
| 177 | + IGPUDescriptorSet::SWriteDescriptorSet write = { descriptorSet.get(),0u,0u,1u,EDT_STORAGE_BUFFER, &info }; |
| 178 | + |
| 179 | + driver->updateDescriptorSets(1, &write, 0u, nullptr); |
| 180 | + } |
| 181 | + struct GLSLCodeWithWorkgroup { |
| 182 | + uint32_t workgroup_definition_position; |
| 183 | + std::string glsl; |
| 184 | + }; |
| 185 | + constexpr const char* symbolsToReplace = "????"; |
| 186 | + auto getShaderGLSL = [&](const char* filePath) |
| 187 | + { |
| 188 | + std::ifstream file(filePath); |
| 189 | + std::stringstream buff; buff << file.rdbuf(); |
| 190 | + std::string shaderCode = buff.str(); |
| 191 | + uint32_t wgPos = shaderCode.find(symbolsToReplace, 0); |
| 192 | + GLSLCodeWithWorkgroup ret = { wgPos,shaderCode }; |
| 193 | + return ret; |
| 194 | + }; |
| 195 | + GLSLCodeWithWorkgroup shaderGLSL[] = |
| 196 | + { |
| 197 | + getShaderGLSL("../testWorkgroupReduce.comp"), |
| 198 | + getShaderGLSL("../testWorkgroupExclusive.comp"), |
| 199 | + getShaderGLSL("../testWorkgroupInclusive.comp") |
| 200 | + }; |
| 201 | + constexpr auto kTestTypeCount = sizeof(shaderGLSL)/sizeof(GLSLCodeWithWorkgroup); |
| 202 | + |
| 203 | + auto getGPUShader = [&](GLSLCodeWithWorkgroup glsl, uint32_t wg_count) |
| 204 | + { |
| 205 | + auto alteredGLSL = glsl.glsl.replace(glsl.workgroup_definition_position, 4, std::to_string(wg_count)); |
| 206 | + auto shaderUnspecialized = core::make_smart_refctd_ptr<asset::ICPUShader>(alteredGLSL.data()); |
| 207 | + asset::ISpecializedShader::SInfo specinfo(nullptr, nullptr, "main", IGPUSpecializedShader::ESS_COMPUTE, "../file.comp"); |
| 208 | + auto cs = core::make_smart_refctd_ptr<asset::ICPUSpecializedShader>(std::move(shaderUnspecialized), std::move(specinfo)); |
| 209 | + auto cs_rawptr = cs.get(); |
| 210 | + core::smart_refctd_ptr<IGPUSpecializedShader> shader = driver->getGPUObjectsFromAssets(&cs_rawptr, &cs_rawptr + 1)->front(); |
| 211 | + return shader; |
| 212 | + }; |
| 213 | + |
| 214 | + //max workgroup size is hardcoded to 1024 |
| 215 | + uint32_t totalFailCount = 0; |
| 216 | + const auto ds = descriptorSet.get(); |
| 217 | + for (uint32_t workgroupSize=1u; workgroupSize<=1024u; workgroupSize++) |
| 218 | + { |
| 219 | + core::smart_refctd_ptr<IGPUComputePipeline> pipelines[kTestTypeCount]; |
| 220 | + for (uint32_t i=0u; i<kTestTypeCount; i++) |
| 221 | + pipelines[i] = driver->createGPUComputePipeline(nullptr, core::smart_refctd_ptr(pipelineLayout), std::move(getGPUShader(shaderGLSL[i], workgroupSize))); |
| 222 | + |
| 223 | + bool passed = true; |
| 224 | + |
| 225 | + driver->beginScene(true); |
| 226 | + const video::IGPUDescriptorSet* ds = descriptorSet.get(); |
| 227 | + passed = runTest<emulatedWorkgroupReduction>(driver,pipelines[0u].get(),descriptorSet.get(),workgroupSize,buffer)&&passed; |
| 228 | + passed = runTest<emulatedWorkgroupScanExclusive>(driver,pipelines[1u].get(),descriptorSet.get(),workgroupSize, buffer)&&passed; |
| 229 | + passed = runTest<emulatedWorkgroupScanInclusive>(driver,pipelines[2u].get(),descriptorSet.get(),workgroupSize, buffer)&&passed; |
| 230 | + |
| 231 | + if (passed) |
| 232 | + os::Printer::log("Passed test #" + std::to_string(workgroupSize), ELL_INFORMATION); |
| 233 | + else |
| 234 | + { |
| 235 | + totalFailCount++; |
| 236 | + os::Printer::log("Failed test #" + std::to_string(workgroupSize), ELL_INFORMATION); |
| 237 | + } |
| 238 | + driver->endScene(); |
| 239 | + } |
| 240 | + os::Printer::log("==========Result==========", ELL_INFORMATION); |
| 241 | + os::Printer::log("Fail Count: " + std::to_string(totalFailCount), ELL_INFORMATION); |
| 242 | + |
| 243 | + return 0; |
| 244 | +} |
0 commit comments