Skip to content

Commit ee5ce4b

Browse files
committed
Add a few allocation strategies for tests. I think the one with division is nice, though; I wrote my thoughts in the comments.
1 parent 83991ac commit ee5ce4b

File tree

1 file changed

+82
-46
lines changed

1 file changed

+82
-46
lines changed

src/nbl/ext/ImGui/ImGui.cpp

Lines changed: 82 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1143,24 +1143,60 @@ namespace nbl::ext::imgui
11431143

11441144
struct
11451145
{
1146-
mdi_size_t offset, size;
1146+
std::vector<mdi_size_t> offsets, sizes;
1147+
float memoryBlockFactor = 1.f;
11471148
} bigChunkRequestState;
11481149

11491150
//! we will try to upload the entire MDI buffer with all available indirect data to our streaming buffer, but we cannot guarantee that the allocation can be done in a single request, nor that all of totalIndirectDrawCount gets allocated at all - we can hit the timeout, in which case not all of totalIndirectDrawCount will be uploaded
11501151
for (mdi_size_t uploadedSize = 0ull; uploadedSize < mdiLimits.totalByteSizeRequest;)
11511152
{
1152-
bigChunkRequestState.offset = InvalidAddress;
1153-
bigChunkRequestState.size = streamingBuffer->max_size(); // TODO: divide by 2 request strategy on fail
1153+
// ok we cannot make it just
1154+
// min(streamingBuffer->max_size(), (mdiLimits.totalByteSizeRequest - uploadedSize))
1155+
// with bigChunkRequestState.memoryBlockFactor being divided by 2, because we will always have at least one offset which cannot be suballocated by the linear allocator - this would be too tight for the suballocator to respect alignments. To make it work, this delta would need a small extra factor added on top of the difference, I guess
1156+
// tests:
1157+
1158+
#define ALLOC_STRATEGY_1
1159+
//#define ALLOC_STRATEGY_2
1160+
//#define ALLOC_STRATEGY_3
1161+
1162+
#ifdef ALLOC_STRATEGY_1
1163+
mdi_size_t chunkOffset = InvalidAddress, chunkSize = min(streamingBuffer->max_size(), (mdiLimits.totalByteSizeRequest * bigChunkRequestState.memoryBlockFactor)); // we divide requests, delta has space for suballocator's padding - we trying to add another block with the fixed size, but if not posible we divide the block by 2
1164+
1165+
constexpr auto StreamingAllocationCount = 1u;
1166+
const size_t unallocatedSize = m_mdi.compose->multi_allocate(std::chrono::steady_clock::now() + std::chrono::microseconds(100u), StreamingAllocationCount, &chunkOffset, &chunkSize, &MdiMaxAlignment); //! (*) note we request single tight chunk of memory with fixed max alignment - big address space from which we fill try to suballocate to fill data
1167+
1168+
if (chunkOffset == InvalidAddress)
1169+
{
1170+
bigChunkRequestState.memoryBlockFactor *= 0.5f;
1171+
continue;
1172+
}
1173+
#endif
1174+
#ifdef ALLOC_STRATEGY_2
1175+
mdi_size_t chunkOffset = InvalidAddress, chunkSize = min(streamingBuffer->max_size(), (mdiLimits.totalByteSizeRequest - uploadedSize) * 2u /* we request twice the delta with respect to max_size UB */);
11541176

11551177
constexpr auto StreamingAllocationCount = 1u;
1156-
const size_t unallocatedSize = m_mdi.compose->multi_allocate(std::chrono::steady_clock::now() + std::chrono::microseconds(100u), StreamingAllocationCount, &bigChunkRequestState.offset, &bigChunkRequestState.size, &MdiMaxAlignment); //! (*) note we request single tight chunk of memory with fixed max alignment - big address space from which we fill try to suballocate to fill data
1178+
const size_t unallocatedSize = m_mdi.compose->multi_allocate(std::chrono::steady_clock::now() + std::chrono::microseconds(100u), StreamingAllocationCount, &chunkOffset, &chunkSize, &MdiMaxAlignment); //! (*) note we request single tight chunk of memory with fixed max alignment - big address space from which we fill try to suballocate to fill data
11571179

1158-
if (bigChunkRequestState.offset == InvalidAddress)
1180+
if (chunkOffset == InvalidAddress)
11591181
continue;
1182+
#endif
1183+
#ifdef ALLOC_STRATEGY_3
1184+
mdi_size_t chunkOffset = InvalidAddress, chunkSize = streamingBuffer->max_size(); // take all whats available <- dumbie I guess
1185+
1186+
constexpr auto StreamingAllocationCount = 1u;
1187+
const size_t unallocatedSize = m_mdi.compose->multi_allocate(std::chrono::steady_clock::now() + std::chrono::microseconds(100u), StreamingAllocationCount, &chunkOffset, &chunkSize, &MdiMaxAlignment); //! (*) note we request single tight chunk of memory with fixed max alignment - big address space from which we fill try to suballocate to fill data
1188+
1189+
if (chunkOffset == InvalidAddress)
1190+
continue;
1191+
#endif
11601192
else
11611193
{
1162-
const auto alignOffsetNeeded = MdiMaxSize - (bigChunkRequestState.offset % MdiMaxSize);
1163-
SMdiBuffer::suballocator_traits_t::allocator_type fillSubAllocator(mdiData, bigChunkRequestState.offset, alignOffsetNeeded, MdiMaxAlignment, bigChunkRequestState.size); //! (*) we create linear suballocator to fill the allocated chunk of memory (some of at least)
1194+
// chunk allocated? put state onto stack & keep alive for suballocator to fill it as required
1195+
bigChunkRequestState.offsets.emplace_back() = chunkOffset;
1196+
bigChunkRequestState.sizes.emplace_back() = chunkSize;
1197+
1198+
const auto alignOffsetNeeded = MdiMaxSize - (chunkOffset % MdiMaxSize);
1199+
SMdiBuffer::suballocator_traits_t::allocator_type fillSubAllocator(mdiData, chunkOffset, alignOffsetNeeded, MdiMaxAlignment, chunkSize); //! (*) we create linear suballocator to fill the allocated chunk of memory (some of at least)
11641200
SMdiBuffer::suballocator_traits_t::multi_alloc_addr(fillSubAllocator, allocation.offsets.size(), allocation.offsets.data(), mdiLimits.sizes.data(), mdiLimits.alignments.data()); //! (*) we suballocate memory regions from the allocated chunk with required alignments - multi request all with single traits call
11651201

11661202
auto upload = [&]() -> size_t
@@ -1181,16 +1217,18 @@ namespace nbl::ext::imgui
11811217
return 0u;
11821218
};
11831219

1184-
// they are very small & negligible in size compared to buffers, but this small pool which we will conditionally fill on successfull object buffer suballocations is required to not complicate things (if we cannot allocate all mdiLimits.totalIndirectDrawCount object buffers then simply those coresponding structures will be filled with dummy params making it an invocation with 0u indices, we treat both components as arrays)
1220+
// they are *very* small (<1% of the total request size) & negligible in size compared to buffers - at the end we must have them all anyway (explained in following comment)
11851221
const bool structuresSuballocated = allocation.offsets[(uint32_t)TightContent::INDIRECT_STRUCTURES] != InvalidAddress && allocation.offsets[(uint32_t)TightContent::ELEMENT_STRUCTURES] != InvalidAddress;
11861222

1187-
if (structuresSuballocated) // note that suballocated only means we have valid address(es) we can work on, it doesn't mean we filled anything
1223+
if (structuresSuballocated) // note that suballocated only means we have valid address(es) we can work on, it doesn't mean we filled anything (suballocated -> *can* fill)
11881224
{
11891225
auto* const indirectStructures = reinterpret_cast<VkDrawIndexedIndirectCommand*>(mdiData + allocation.offsets[(uint32_t)TightContent::INDIRECT_STRUCTURES]);
11901226
auto* const elementStructures = reinterpret_cast<PerObjectData*>(mdiData + allocation.offsets[(uint32_t)TightContent::ELEMENT_STRUCTURES]);
11911227
{
1192-
// I make a assumption here since I can access them later but I don't guarantee all of them will be present,
1193-
// we can fail other suballocations which are required for the struct, note that in reality we fill them below & conditionally
1228+
// I make an assumption here since I can access them later, but I don't guarantee all of them will be present at the first run - we can fail buffer
1229+
// suballocations from the current memory block chunk, which makes a command list invalid for the iteration! Because of that we fill them conditionally
1230+
// once buffers are correctly suballocated for the handled command list - at the end we must have them all filled regardless of which chunk their data comes from, due
1231+
// to the fact we cannot submit an overflow, we don't have dynamic rendering allowing us to stop recording the subpass, submit work to queue & start recording again
11941232
updateSuballocation((uint32_t)TightContent::INDIRECT_STRUCTURES);
11951233
updateSuballocation((uint32_t)TightContent::ELEMENT_STRUCTURES);
11961234
}
@@ -1237,52 +1275,50 @@ namespace nbl::ext::imgui
12371275

12381276
assert(validateObjectOffsets()); // debug check only
12391277

1240-
// we consider buffers valid if we suballocated them (under the hood filled) - if buffers are valid then subindirect call referencing them is too
1278+
// we consider buffers valid for command list if we suballocated them (under the hood filled at first time then skipped to not repeat memcpy) - if buffers are valid then command list with indirects is as well
12411279
const auto buffersSuballocated = fillBuffer(vertexBuffer.Data, vtxAllocationIx) && fillBuffer(indexBuffer.Data, idxAllocationIx);
12421280
const auto [vtxGlobalObjectOffset, idxGlobalObjectOffset] = buffersSuballocated ? std::make_tuple(allocation.offsets[vtxAllocationIx] / sizeof(ImDrawVert), allocation.offsets[idxAllocationIx] / sizeof(ImDrawIdx)) : std::make_tuple((size_t)0u, (size_t)0u);
12431281

1244-
for (uint32_t j = 0u; j < commandList->CmdBuffer.Size; j++)
1282+
if (buffersSuballocated)
12451283
{
1246-
const auto* cmd = &commandList->CmdBuffer[j];
1247-
auto* indirect = indirectStructures + drawID;
1248-
auto* element = elementStructures + drawID;
1284+
for (uint32_t j = 0u; j < commandList->CmdBuffer.Size; j++)
1285+
{
1286+
const auto* cmd = &commandList->CmdBuffer[j];
1287+
auto* indirect = indirectStructures + drawID;
1288+
auto* element = elementStructures + drawID;
12491289

1250-
// we make a trick to keep indirect & element structs in the mdi iteration but explicitly execute dummy null invocation if we don't have vertex or index buffer for the struct (suballocation failed for any of those 2 buffers).
1251-
// TODO: we could make the current structs pool "dynamic" in size and treat as simple stack instead (trying it first to make things easier)
1252-
indirect->indexCount = buffersSuballocated ? cmd->ElemCount /* valid invocation */ : 0u /* null invocation */;
1290+
indirect->indexCount = cmd->ElemCount;
12531291

1254-
indirect->firstInstance = drawID; // we use base instance as draw ID
1255-
indirect->instanceCount = 1u;
1292+
// we use base instance as draw ID
1293+
indirect->firstInstance = drawID;
1294+
indirect->instanceCount = 1u;
1295+
indirect->vertexOffset = vtxGlobalObjectOffset + cmd->VtxOffset;
1296+
indirect->firstIndex = idxGlobalObjectOffset + cmd->IdxOffset;
12561297

1257-
// starting to wonder, for some reason imgui decided to keep single vertex & index shared between cmds within cmd list
1258-
// but maybe we should cut current [vertexBuffer, indexBuffer] with respect to cmd->IdxOffset & cmd->VtxOffset (therefore we could have even smaller alloc requests, now a few structs can point to the same buffer but with different offsets [indirect])
1259-
// though not sure if I don't double some data then <- EDIT: YES, turns out we may double some data
1260-
indirect->vertexOffset = vtxGlobalObjectOffset + cmd->VtxOffset; // safe to assume due to indirect->indexCount depending on buffersSuballocated
1261-
indirect->firstIndex = idxGlobalObjectOffset + cmd->IdxOffset; // safe to assume due to indirect->indexCount depending on buffersSuballocated
1298+
const auto clipRectangle = clip.getClipRectangle(cmd);
1299+
const auto scissor = clip.getScissor(clipRectangle);
12621300

1263-
const auto clipRectangle = clip.getClipRectangle(cmd);
1264-
const auto scissor = clip.getScissor(clipRectangle);
1301+
auto packSnorm16 = [](float ndc) -> int16_t
1302+
{
1303+
return std::round<int16_t>(std::clamp(ndc, -1.0f, 1.0f) * 32767.0f); // TODO: ok encodePixels<EF_R16_SNORM, double>(void* _pix, const double* _input) but iirc we have issues with our encode/decode utils
1304+
};
12651305

1266-
auto packSnorm16 = [](float ndc) -> int16_t
1267-
{
1268-
return std::round<int16_t>(std::clamp(ndc, -1.0f, 1.0f) * 32767.0f); // TODO: ok encodePixels<EF_R16_SNORM, double>(void* _pix, const double* _input) but iirc we have issues with our encode/decode utils
1269-
};
1306+
const auto vMin = trs.toNDC(vector2df_SIMD(scissor.offset.x, scissor.offset.y));
1307+
const auto vMax = trs.toNDC(vector2df_SIMD(scissor.offset.x + scissor.extent.width, scissor.offset.y + scissor.extent.height));
12701308

1271-
const auto vMin = trs.toNDC(vector2df_SIMD(scissor.offset.x, scissor.offset.y));
1272-
const auto vMax = trs.toNDC(vector2df_SIMD(scissor.offset.x + scissor.extent.width, scissor.offset.y + scissor.extent.height));
1309+
struct snorm16_t2_packed
1310+
{
1311+
int16_t x, y;
1312+
};
12731313

1274-
struct snorm16_t2_packed
1275-
{
1276-
int16_t x, y;
1277-
};
1278-
1279-
reinterpret_cast<snorm16_t2_packed&>(element->aabbMin) = { .x = packSnorm16(vMin.x), .y = packSnorm16(vMin.y) };
1280-
reinterpret_cast<snorm16_t2_packed&>(element->aabbMax) = { .x = packSnorm16(vMax.x), .y = packSnorm16(vMax.y) };
1314+
reinterpret_cast<snorm16_t2_packed&>(element->aabbMin) = { .x = packSnorm16(vMin.x), .y = packSnorm16(vMin.y) };
1315+
reinterpret_cast<snorm16_t2_packed&>(element->aabbMax) = { .x = packSnorm16(vMax.x), .y = packSnorm16(vMax.y) };
12811316

1282-
element->texId = cmd->TextureId.textureID;
1283-
element->samplerIx = cmd->TextureId.samplerIx;
1317+
element->texId = cmd->TextureId.textureID;
1318+
element->samplerIx = cmd->TextureId.samplerIx;
12841319

1285-
++drawID;
1320+
++drawID;
1321+
}
12861322
}
12871323
}
12881324
}
@@ -1291,8 +1327,7 @@ namespace nbl::ext::imgui
12911327

12921328
uploadedSize += upload();
12931329
}
1294-
streamingBuffer->multi_deallocate(StreamingAllocationCount, &bigChunkRequestState.offset, &bigChunkRequestState.size, waitInfo); //! (*) block allocated, we just latch offsets deallocation to keep it alive as long as required
1295-
1330+
12961331
// we let it run at least once
12971332
const bool timeout = std::chrono::steady_clock::now() >= waitPoint;
12981333

@@ -1305,6 +1340,7 @@ namespace nbl::ext::imgui
13051340
return false;
13061341
}
13071342
}
1343+
streamingBuffer->multi_deallocate(bigChunkRequestState.offsets.size(), bigChunkRequestState.offsets.data(), bigChunkRequestState.sizes.data(), waitInfo); //! (*) blocks allocated, we just latch offsets deallocation to keep them alive as long as required
13081344
}
13091345

13101346
auto mdiBuffer = smart_refctd_ptr<IGPUBuffer>(m_mdi.compose->getBuffer());

0 commit comments

Comments
 (0)