Skip to content

Commit 5b6b09e

Browse files
author
devsh
committed
sketch out pipeline caching
1 parent b459895 commit 5b6b09e

File tree

2 files changed

+95
-28
lines changed

2 files changed

+95
-28
lines changed

include/nbl/video/utilities/CComputeBlit.h

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,21 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted
4141
}
4242

4343
// ctor
44-
inline CComputeBlit(core::smart_refctd_ptr<ILogicalDevice>&& logicalDevice) : m_device(std::move(logicalDevice)) {}
44+
CComputeBlit(
45+
core::smart_refctd_ptr<ILogicalDevice>&& logicalDevice,
46+
core::smart_refctd_ptr<asset::IShaderCompiler::CCache>&& cache=nullptr,
47+
core::smart_refctd_ptr<system::ILogger>&& logger=nullptr
48+
);
49+
50+
// if you set the values too small, we'll correct them ourselves anyway
51+
struct STask
52+
{
53+
uint32_t workgroupSizeLog2 : 4 = 0;
54+
// the TRUE output format, not the storage view format you might manually encode into
55+
hlsl::format::TexelBlockFormat outputFormat : 8 = hlsl::format::TexelBlockFormat::TBF_UNKNOWN;
56+
uint32_t sharedMemoryPerInvocation : 6 = 0;
57+
uint32_t unused : 14 = 0;
58+
};
4559

4660
//! Returns the original format if supports STORAGE_IMAGE otherwise returns a format in its compat class which supports STORAGE_IMAGE.
4761
inline asset::E_FORMAT getOutputViewFormat(const asset::E_FORMAT format)
@@ -66,22 +80,7 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted
6680
return compatFormat;
6781
}
6882
}
69-
/*
70-
struct STask
71-
{
72-
hlsl::vector<uint8_t,3> preloadWindow;
73-
asset::E_FORMAT inFormat;
74-
asset::E_FORMAT outFormat;
75-
// default no coverage adjustment
76-
uint8_t alphaBinCountLog2 : 4 = 0;
77-
};
78-
inline void initializeTaskDefault(STask& task) const
79-
{
80-
auto physDev = m_device->getPhysicalDevice();
81-
const auto formatTrait = hlsl::format::getTraits(static_cast<hlsl::format::TexelBlockFormat>(task.outFormat));
82-
task.alphaBinCountLog2 = hlsl::max(,task.alphaBinCountLog2);
83-
}
84-
*/
83+
8584
#if 0
8685
// @param `alphaBinCount` is only required to size the histogram present in the default nbl_glsl_blit_AlphaStatistics_t in default_compute_common.comp
8786
core::smart_refctd_ptr<video::IGPUShader> createAlphaTestSpecializedShader(const asset::IImage::E_TYPE inImageType, const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount);
@@ -666,7 +665,11 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted
666665
EBT_COUNT
667666
};
668667

668+
void createAndCachePipelines(CAssetConverter* converter, core::smart_refctd_ptr<IGPUComputePipeline>* pipelines, const std::span<const STask> tasks);
669+
669670
core::smart_refctd_ptr<ILogicalDevice> m_device;
671+
system::logger_opt_smart_ptr m_logger;
672+
core::smart_refctd_ptr<asset::IShaderCompiler::CCache> m_shaderCache;
670673

671674
//! This calculates the inclusive upper bound on the preload region i.e. it will be reachable for some cases. For the rest it will be bigger
672675
//! by a pixel in each dimension.

src/nbl/video/utilities/CComputeBlit.cpp

Lines changed: 75 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,79 @@ using namespace nbl::system;
55
using namespace nbl::asset;
66
using namespace nbl::video;
77

8+
9+
//! Constructs the blit utility.
//! @param logicalDevice device the utility operates on, moved into `m_device`.
//! @param cache shader compiler cache to use; when null, a fresh `IShaderCompiler::CCache` is created instead.
//! @param logger logger to use; when null, falls back to the logger of the physical device's debug callback (if there is one).
CComputeBlit::CComputeBlit(smart_refctd_ptr<ILogicalDevice>&& logicalDevice, smart_refctd_ptr<IShaderCompiler::CCache>&& cache, smart_refctd_ptr<ILogger>&& logger) : m_device(std::move(logicalDevice)), m_logger(nullptr)
{
	if (logger)
		m_logger = std::move(logger);
	// guard against a missing debug callback before asking it for its logger
	else if (auto debugCb=m_device->getPhysicalDevice()->getDebugCallback(); debugCb && debugCb->getLogger())
		m_logger = smart_refctd_ptr<system::ILogger>(debugCb->getLogger());

	if (cache)
		m_shaderCache = std::move(cache);
	else
		m_shaderCache = make_smart_refctd_ptr<IShaderCompiler::CCache>();
}
21+
22+
//! Prepares the blit (and optional coverage normalization) pipelines for every task and pushes them through the asset converter,
//! which inserts them into its cache.
//! @param converter asset converter used as the read cache and to reserve/convert the pipelines.
//! @param pipelines output array, `tasks.size()` entries are written.
//! @param tasks per-pipeline parameters; values that are too small get corrected on a local copy, the caller's data is untouched.
void CComputeBlit::createAndCachePipelines(CAssetConverter* converter, smart_refctd_ptr<IGPUComputePipeline>* pipelines, const std::span<const STask> tasks)
{
	// nothing to create, also `cpuPplns.data()` of an empty vector must not be dereferenced further down
	if (tasks.empty())
		return;

	core::vector<smart_refctd_ptr<ICPUComputePipeline>> cpuPplns;
	// two slots per task: the blit pipeline and the optional coverage normalization pipeline
	cpuPplns.reserve(tasks.size()*2);

	const auto& limits = m_device->getPhysicalDevice()->getLimits();
	// floor(log2(maxComputeWorkGroupInvocations)), clamped to the largest value the 4 bit `workgroupSizeLog2` field can hold
	uint32_t maxWorkgroupSizeLog2 = 0u;
	for (auto invocations=limits.maxComputeWorkGroupInvocations; invocations>1u; invocations>>=1)
		maxWorkgroupSizeLog2++;
	if (maxWorkgroupSizeLog2>15u)
		maxWorkgroupSizeLog2 = 15u;

	// deliberately iterate by value, the adjustments below must not leak into the caller's (const) tasks
	for (auto task : tasks)
	{
		// adjust task default values
		{
			// `workgroupSizeLog2` is an exponent, so compare the actual size against the subgroup size and store an exponent back
			if ((0x1u<<task.workgroupSizeLog2)<limits.maxSubgroupSize)
				task.workgroupSizeLog2 = maxWorkgroupSizeLog2;
			bool useFloat16 = false;
			uint16_t channels = 4;
			using namespace hlsl::format;
			if (task.outputFormat!=TexelBlockFormat::TBF_UNKNOWN)
			{
				channels = getTraits(task.outputFormat).Channels;
				const auto precisionAt1 = getFormatPrecision(static_cast<E_FORMAT>(task.outputFormat),3,1.f);
				const auto precisionAt0 = getFormatPrecision(static_cast<E_FORMAT>(task.outputFormat),3,0.f);
				// 16 bit floats are only used when the device supports them in shared memory and the output format is no finer than them
				if (limits.workgroupMemoryExplicitLayout16BitAccess && limits.shaderFloat16 && precisionAt1>=std::exp2f(-11.f) && precisionAt0>=std::numeric_limits<hlsl::float16_t>::min())
					useFloat16 = true;
			}
			// the absolute minimum needed to store a single pixel
			const auto singlePixelStorage = channels*(useFloat16 ? sizeof(hlsl::float16_t):sizeof(hlsl::float32_t));
			// also slightly more memory is needed, so the floor is twice the single pixel storage
			task.sharedMemoryPerInvocation = core::max(singlePixelStorage*2,task.sharedMemoryPerInvocation);
		}
		// create blit pipeline
		cpuPplns.emplace_back(nullptr);
		// create optional coverage normalization pipeline
		cpuPplns.emplace_back(nullptr);
	}

	CAssetConverter::SInputs inputs = {};
	inputs.readCache = converter;
	inputs.logger = m_logger.getRaw();
	std::get<CAssetConverter::SInputs::asset_span_t<ICPUComputePipeline>>(inputs.assets) = {&cpuPplns.data()->get(),cpuPplns.size()};
	inputs.readShaderCache = m_shaderCache.get();
	inputs.writeShaderCache = m_shaderCache.get();
	// no pipeline cache, because we only make the same pipeline once, ever
	auto reserveResults = converter->reserve(inputs);
	assert(reserveResults.getRequiredQueueFlags().value==IQueue::FAMILY_FLAGS::NONE);
	// copy over the results
	{
		auto rIt = reserveResults.getGPUObjects<ICPUComputePipeline>().data();
		// TODO: redo
		// NOTE(review): only the first `tasks.size()` results are copied although two pipelines are reserved per task - confirm the intended output layout
		for (size_t i=0; i<tasks.size(); i++)
			*(pipelines++) = (rIt++)->value;
	}

	// this just inserts the pipelines into the cache
	{
		CAssetConverter::SConvertParams params = {};
		auto convertResults = reserveResults.convert(params);
		assert(!convertResults.blocking());
	}
}
80+
881
#if 0
982
core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createAlphaTestSpecializedShader(const asset::IImage::E_TYPE imageType, const uint32_t alphaBinCount)
1083
{
@@ -39,21 +112,14 @@ core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createAlphaTestSpecializ
39112
"}\n";
40113

41114
auto cpuShader = core::make_smart_refctd_ptr<asset::ICPUShader>(shaderSourceStream.str().c_str(), IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, IGPUShader::E_CONTENT_TYPE::ECT_HLSL, "CComputeBlitGLSLGLSL::createAlphaTestSpecializedShader");
42-
43-
return m_device->createShader(std::move(cpuShader.get()));
44115
}
45116

46-
core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createNormalizationSpecializedShader(const asset::IImage::E_TYPE imageType, const asset::E_FORMAT outFormat,
47-
const uint32_t alphaBinCount)
117+
core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createNormalizationSpecializedShader(const asset::IImage::E_TYPE imageType, const uint32_t alphaBinCount)
48118
{
49119
const auto workgroupDims = getDefaultWorkgroupDims(imageType);
50120
const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount);
51121
const uint32_t blitDimCount = static_cast<uint32_t>(imageType) + 1;
52122

53-
const auto castedFormat = getOutImageViewFormat(outFormat);
54-
assert(outFormat == castedFormat);
55-
const char* formatQualifier = asset::CHLSLCompiler::getStorageImageFormatQualifier(castedFormat);
56-
57123
std::ostringstream shaderSourceStream;
58124

59125
shaderSourceStream
@@ -67,7 +133,7 @@ core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createNormalizationSpeci
67133
"[[vk::binding(0, 0)]]\n"
68134
"nbl::hlsl::blit::impl::dim_to_image_properties<ceval_params_t::BlitDimCount>::combined_sampler_t inCS;\n"
69135

70-
"[[vk::image_format(\"" << formatQualifier << "\")]]\n"
136+
"[[vk::image_format(\"unknown\")]]\n"
71137
"[[vk::binding(1, 0)]]\n"
72138
"nbl::hlsl::blit::impl::dim_to_image_properties<ceval_params_t::BlitDimCount>::image_t outImg;\n"
73139

@@ -90,7 +156,5 @@ core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createNormalizationSpeci
90156
"}\n";
91157

92158
auto cpuShader = core::make_smart_refctd_ptr<asset::ICPUShader>(shaderSourceStream.str().c_str(), IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, IGPUShader::E_CONTENT_TYPE::ECT_HLSL, "CComputeBlitGLSL::createNormalizationSpecializedShader");
93-
94-
return m_device->createShader(std::move(cpuShader.get()));
95159
}
96160
#endif

0 commit comments

Comments
 (0)