Skip to content

Commit 56ea8bd

Browse files
committed
feat(dxmt, d3d11): implement more precise fence for render encoder
1 parent 24e16c7 commit 56ea8bd

File tree

5 files changed

+119
-36
lines changed

5 files changed

+119
-36
lines changed

src/d3d11/d3d11_context_impl.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1530,7 +1530,7 @@ template <typename ContextInternalState> class MTLD3D11DeviceContextImplBase : p
15301530
if (auto bindable = reinterpret_cast<D3D11ResourceCommon *>(pBufferForArgs)) {
15311531
EmitOP([IndexType, IndexBufferOffset, Primitive, ArgBuffer = bindable->buffer(),
15321532
AlignedByteOffsetForArgs](ArgumentEncodingContext &enc) {
1533-
auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
1533+
auto [buffer, buffer_offset] = enc.access<true>(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
15341534
enc.bumpVisibilityResultOffset();
15351535
auto [index_buffer, index_sub_offset] = enc.currentIndexBuffer();
15361536
auto &cmd = enc.encodeRenderCommand<wmtcmd_render_draw_indexed_indirect>();
@@ -1565,7 +1565,7 @@ template <typename ContextInternalState> class MTLD3D11DeviceContextImplBase : p
15651565
}
15661566
if (auto bindable = reinterpret_cast<D3D11ResourceCommon *>(pBufferForArgs)) {
15671567
EmitOP([Primitive, ArgBuffer = bindable->buffer(), AlignedByteOffsetForArgs](ArgumentEncodingContext &enc) {
1568-
auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
1568+
auto [buffer, buffer_offset] = enc.access<true>(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
15691569
enc.bumpVisibilityResultOffset();
15701570
auto &cmd = enc.encodeRenderCommand<wmtcmd_render_draw_indirect>();
15711571
cmd.type = WMTRenderCommandDrawIndirect;
@@ -1582,7 +1582,7 @@ template <typename ContextInternalState> class MTLD3D11DeviceContextImplBase : p
15821582
) {
15831583
if (auto bindable = reinterpret_cast<D3D11ResourceCommon *>(pBufferForArgs)) {
15841584
EmitOP([=, topo = state_.InputAssembler.Topology, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) {
1585-
auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
1585+
auto [buffer, buffer_offset] = enc.access<true>(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
15861586
auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4);
15871587

15881588
auto [vertex_per_warp, vertex_increment_per_wrap] = get_gs_vertex_count(topo);
@@ -1611,7 +1611,7 @@ template <typename ContextInternalState> class MTLD3D11DeviceContextImplBase : p
16111611

16121612
if (auto bindable = reinterpret_cast<D3D11ResourceCommon *>(pBufferForArgs)) {
16131613
EmitOP([=, topo = state_.InputAssembler.Topology, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) {
1614-
auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
1614+
auto [buffer, buffer_offset] = enc.access<true>(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
16151615
auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4);
16161616

16171617
auto [vertex_per_warp, vertex_increment_per_wrap] = get_gs_vertex_count(topo);
@@ -1641,7 +1641,7 @@ template <typename ContextInternalState> class MTLD3D11DeviceContextImplBase : p
16411641
) {
16421642
if (auto bindable = reinterpret_cast<D3D11ResourceCommon *>(pBufferForArgs)) {
16431643
EmitOP([=, topo = state_.InputAssembler.Topology, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) {
1644-
auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
1644+
auto [buffer, buffer_offset] = enc.access<true>(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
16451645
auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4);
16461646

16471647
auto PatchPerGroup = 32 / enc.tess_threads_per_patch;
@@ -1673,7 +1673,7 @@ template <typename ContextInternalState> class MTLD3D11DeviceContextImplBase : p
16731673

16741674
if (auto bindable = reinterpret_cast<D3D11ResourceCommon *>(pBufferForArgs)) {
16751675
EmitOP([=, topo = state_.InputAssembler.Topology, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) {
1676-
auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
1676+
auto [buffer, buffer_offset] = enc.access<true>(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
16771677
auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4);
16781678

16791679
auto PatchPerGroup = 32 / enc.tess_threads_per_patch;
@@ -4630,7 +4630,7 @@ template <typename ContextInternalState> class MTLD3D11DeviceContextImplBase : p
46304630
auto &so_slot0 = state_.StreamOutput.Targets[0];
46314631
if (so_slot0.Offset == 0xFFFFFFFF) {
46324632
EmitST([slot0 = so_slot0.Buffer->buffer()](ArgumentEncodingContext &enc) {
4633-
auto [buffer, buffer_offset] = enc.access(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE);
4633+
auto [buffer, buffer_offset] = enc.access<true>(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE);
46344634
auto &cmd = enc.encodeRenderCommand<wmtcmd_render_setbuffer>();
46354635
cmd.type = WMTRenderCommandSetVertexBuffer;
46364636
cmd.buffer = buffer->buffer();;
@@ -4640,7 +4640,7 @@ template <typename ContextInternalState> class MTLD3D11DeviceContextImplBase : p
46404640
});
46414641
} else {
46424642
EmitST([slot0 = so_slot0.Buffer->buffer(), offset = so_slot0.Offset](ArgumentEncodingContext &enc) {
4643-
auto [buffer, buffer_offset] = enc.access(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE);
4643+
auto [buffer, buffer_offset] = enc.access<true>(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE);
46444644
auto &cmd = enc.encodeRenderCommand<wmtcmd_render_setbuffer>();
46454645
cmd.type = WMTRenderCommandSetVertexBuffer;
46464646
cmd.buffer = buffer->buffer();;

src/dxmt/dxmt_context.cpp

Lines changed: 50 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ ArgumentEncodingContext::encodeVertexBuffers(uint32_t slot_mask, uint64_t offset
8181
continue;
8282
}
8383
auto length = buffer->length();
84-
auto [buffer_alloc, buffer_offset] = access(buffer, DXMT_ENCODER_RESOURCE_ACESS_READ);
84+
auto [buffer_alloc, buffer_offset] = access<true>(buffer, DXMT_ENCODER_RESOURCE_ACESS_READ);
8585
entries[index].buffer_handle = buffer_alloc->gpuAddress() + buffer_offset + state.offset;
8686
entries[index].stride = state.stride;
8787
entries[index++].length = length > state.offset ? length - state.offset : 0;
@@ -143,7 +143,10 @@ template void ArgumentEncodingContext::encodeConstantBuffers<PipelineStage::Pixe
143143
template <PipelineStage stage, PipelineKind kind>
144144
void
145145
ArgumentEncodingContext::encodeConstantBuffers(const MTL_SHADER_REFLECTION *reflection, const MTL_SM50_SHADER_ARGUMENT * constant_buffers, uint64_t offset) {
146-
uint64_t *encoded_buffer = getMappedArgumentBuffer<uint64_t, stage == PipelineStage::Compute>(offset);
146+
uint64_t *encoded_buffer = getMappedArgumentBuffer < uint64_t, stage == PipelineStage::Compute > (offset);
147+
148+
constexpr bool AtVertexStage = stage == PipelineStage::Vertex || stage == PipelineStage::Domain ||
149+
stage == PipelineStage::Hull || stage == PipelineStage::Geometry;
147150

148151
for (unsigned i = 0; i < reflection->NumConstantBuffers; i++) {
149152
auto &arg = constant_buffers[i];
@@ -158,7 +161,7 @@ ArgumentEncodingContext::encodeConstantBuffers(const MTL_SHADER_REFLECTION *refl
158161
}
159162
auto argbuf = cbuf.buffer;
160163
// FIXME: did we intend to use the whole buffer?
161-
auto [argbuf_alloc, argbuf_offset] = access(argbuf, DXMT_ENCODER_RESOURCE_ACESS_READ);
164+
auto [argbuf_alloc, argbuf_offset] = access<AtVertexStage>(argbuf, DXMT_ENCODER_RESOURCE_ACESS_READ);
162165
encoded_buffer[arg.StructurePtrOffset] = argbuf_alloc->gpuAddress() + argbuf_offset + cbuf.offset;
163166
makeResident<stage, kind>(argbuf.ptr());
164167
break;
@@ -246,6 +249,9 @@ ArgumentEncodingContext::encodeShaderResources(
246249

247250
auto &UAVBindingSet = stage == PipelineStage::Compute ? cs_uav_ : om_uav_;
248251

252+
constexpr bool AtVertexStage = stage == PipelineStage::Vertex || stage == PipelineStage::Domain ||
253+
stage == PipelineStage::Hull || stage == PipelineStage::Geometry;
254+
249255
for (unsigned i = 0; i < BindingCount; i++) {
250256
auto &arg = arguments[i];
251257
switch (arg.Type) {
@@ -272,7 +278,7 @@ ArgumentEncodingContext::encodeShaderResources(
272278

273279
if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_BUFFER) {
274280
if (srv.buffer.ptr()) {
275-
auto [srv_alloc, offset] = access(srv.buffer, srv.slice.byteOffset, srv.slice.byteLength, DXMT_ENCODER_RESOURCE_ACESS_READ);
281+
auto [srv_alloc, offset] = access<AtVertexStage>(srv.buffer, srv.slice.byteOffset, srv.slice.byteLength, DXMT_ENCODER_RESOURCE_ACESS_READ);
276282
encoded_buffer[arg.StructurePtrOffset] = srv_alloc->gpuAddress() + offset + srv.slice.byteOffset;
277283
encoded_buffer[arg.StructurePtrOffset + 1] = srv.slice.byteLength;
278284
makeResident<stage, kind>(srv.buffer.ptr());
@@ -283,7 +289,7 @@ ArgumentEncodingContext::encodeShaderResources(
283289
} else if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE) {
284290
if (srv.buffer.ptr()) {
285291
assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TBUFFER_OFFSET);
286-
auto [view, offset] = access(srv.buffer, srv.viewId, DXMT_ENCODER_RESOURCE_ACESS_READ);
292+
auto [view, offset] = access<AtVertexStage>(srv.buffer, srv.viewId, DXMT_ENCODER_RESOURCE_ACESS_READ);
287293
encoded_buffer[arg.StructurePtrOffset] = view.gpu_resource_id;
288294
encoded_buffer[arg.StructurePtrOffset + 1] =
289295
((uint64_t)srv.slice.elementCount << 32) | (uint64_t)(srv.slice.firstElement + offset);
@@ -292,7 +298,7 @@ ArgumentEncodingContext::encodeShaderResources(
292298
assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_MINLOD_CLAMP);
293299
auto viewIdChecked = srv.texture->checkViewUseArray(srv.viewId, arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_ARRAY);
294300
encoded_buffer[arg.StructurePtrOffset] =
295-
access(srv.texture, viewIdChecked, DXMT_ENCODER_RESOURCE_ACESS_READ).gpu_resource_id;
301+
access<AtVertexStage>(srv.texture, viewIdChecked, DXMT_ENCODER_RESOURCE_ACESS_READ).gpu_resource_id;
296302
encoded_buffer[arg.StructurePtrOffset + 1] = TextureMetadata(srv.texture->arrayLength(viewIdChecked), 0);
297303
makeResident<stage, kind>(srv.texture.ptr(), viewIdChecked);
298304
} else {
@@ -312,7 +318,7 @@ ArgumentEncodingContext::encodeShaderResources(
312318

313319
if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_BUFFER) {
314320
if (uav.buffer.ptr()) {
315-
auto [uav_alloc, offset] = access(uav.buffer, uav.slice.byteOffset, uav.slice.byteLength, access_flags);
321+
auto [uav_alloc, offset] = access<AtVertexStage>(uav.buffer, uav.slice.byteOffset, uav.slice.byteLength, access_flags);
316322
encoded_buffer[arg.StructurePtrOffset] = uav_alloc->gpuAddress() + offset + uav.slice.byteOffset;
317323
encoded_buffer[arg.StructurePtrOffset + 1] = uav.slice.byteLength;
318324
makeResident<stage, kind>(uav.buffer.ptr(), read, write);
@@ -323,15 +329,15 @@ ArgumentEncodingContext::encodeShaderResources(
323329
} else if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE) {
324330
if (uav.buffer.ptr()) {
325331
assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TBUFFER_OFFSET);
326-
auto [view, offset] = access(uav.buffer, uav.viewId, DXMT_ENCODER_RESOURCE_ACESS_READ);
332+
auto [view, offset] = access<AtVertexStage>(uav.buffer, uav.viewId, DXMT_ENCODER_RESOURCE_ACESS_READ);
327333
encoded_buffer[arg.StructurePtrOffset] = view.gpu_resource_id;
328334
encoded_buffer[arg.StructurePtrOffset + 1] =
329335
((uint64_t)uav.slice.elementCount << 32) | (uint64_t)(uav.slice.firstElement + offset);
330336
makeResident<stage, kind>(uav.buffer.ptr(), uav.viewId, read, write);
331337
} else if (uav.texture.ptr()) {
332338
assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_MINLOD_CLAMP);
333339
auto viewIdChecked = uav.texture->checkViewUseArray(uav.viewId, arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_ARRAY);
334-
encoded_buffer[arg.StructurePtrOffset] = access(uav.texture, viewIdChecked, access_flags).gpu_resource_id;
340+
encoded_buffer[arg.StructurePtrOffset] = access<AtVertexStage>(uav.texture, viewIdChecked, access_flags).gpu_resource_id;
335341
encoded_buffer[arg.StructurePtrOffset + 1] = TextureMetadata(uav.texture->arrayLength(viewIdChecked), 0);
336342
makeResident<stage, kind>(uav.texture.ptr(), viewIdChecked, read, write);
337343
} else {
@@ -341,7 +347,7 @@ ArgumentEncodingContext::encodeShaderResources(
341347
}
342348
if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_UAV_COUNTER) {
343349
if (uav.counter) {
344-
auto [counter_alloc, offset] = access(uav.counter, 0, 4, DXMT_ENCODER_RESOURCE_ACESS_READ | DXMT_ENCODER_RESOURCE_ACESS_WRITE);
350+
auto [counter_alloc, offset] = access<AtVertexStage>(uav.counter, 0, 4, DXMT_ENCODER_RESOURCE_ACESS_READ | DXMT_ENCODER_RESOURCE_ACESS_WRITE);
345351
encoded_buffer[arg.StructurePtrOffset + 2] = counter_alloc->gpuAddress() + offset;
346352
makeResident<stage, kind>(uav.counter.ptr(), true, true);
347353
} else {
@@ -572,6 +578,7 @@ ArgumentEncodingContext::startRenderPass(
572578
assert(!encoder_current);
573579
auto encoder_info = allocate<RenderEncoderData>();
574580
encoder_info->type = EncoderType::Render;
581+
encoder_info->encoder_id_vertex = nextEncoderId();
575582
encoder_info->id = nextEncoderId();
576583
WMT::InitializeRenderPassInfo(encoder_info->info);
577584
encoder_info->cmd_head.type = WMTRenderCommandNop;
@@ -587,6 +594,7 @@ ArgumentEncodingContext::startRenderPass(
587594
encoder_current = encoder_info;
588595

589596
fence_alias_map_.unalias(encoder_info->id & kFenceIdMask);
597+
fence_alias_map_.unalias(encoder_info->encoder_id_vertex & kFenceIdMask);
590598
currentFrameStatistics().render_pass_count++;
591599

592600
vro_state_.beginEncoder();
@@ -757,9 +765,13 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId
757765
}
758766
auto gpu_buffer_ = data->allocated_argbuf;
759767
auto encoder = cmdbuf.renderCommandEncoder(data->info);
760-
data->fence_wait.forEach(fence_alias_map_, [&](FenceId id) {
761-
encoder.waitForFence(fence_pool_[id], WMTRenderStageVertex | WMTRenderStageMesh | WMTRenderStageObject);
762-
});
768+
data->fence_wait.forEach(
769+
data->fence_wait_vertex, fence_alias_map_,
770+
[&](FenceId id) {
771+
encoder.waitForFence(fence_pool_[id], WMTRenderStageVertex | WMTRenderStageMesh | WMTRenderStageObject);
772+
},
773+
[&](FenceId id) { encoder.waitForFence(fence_pool_[id], WMTRenderStageFragment); }
774+
);
763775
encoder.setVertexBuffer(gpu_buffer_, 0, 16);
764776
encoder.setVertexBuffer(gpu_buffer_, 0, 29);
765777
encoder.setVertexBuffer(gpu_buffer_, 0, 30);
@@ -832,6 +844,10 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId
832844
);
833845
}
834846
encoder.encodeCommands(&data->cmd_head);
847+
encoder.updateFence(
848+
fence_pool_[data->encoder_id_vertex & kFenceIdMask],
849+
WMTRenderStageVertex | WMTRenderStageMesh | WMTRenderStageObject
850+
);
835851
encoder.updateFence(fence_pool_[data->id & kFenceIdMask], WMTRenderStageFragment);
836852
encoder.endEncoding();
837853
data->~RenderEncoderData();
@@ -1092,7 +1108,7 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData *
10921108
auto r1 = reinterpret_cast<RenderEncoderData *>(latter);
10931109
auto r0 = reinterpret_cast<RenderEncoderData *>(former);
10941110

1095-
if (isEncoderSignatureMatched(r0, r1)) {
1111+
if (isEncoderSignatureMatched(r0, r1) && !r1->fence_wait_vertex.contains(r0->id & kFenceIdMask)) {
10961112
for (unsigned i = 0; i < r0->render_target_count; i++) {
10971113
auto &a0 = r0->info.colors[i];
10981114
auto &a1 = r1->info.colors[i];
@@ -1133,6 +1149,9 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData *
11331149
r1->fence_wait.remove(r0->id & kFenceIdMask);
11341150
r1->fence_wait.merge(r0->fence_wait);
11351151
fence_alias_map_.alias(r0->id & kFenceIdMask, r1->id & kFenceIdMask);
1152+
r1->fence_wait_vertex.remove(r0->encoder_id_vertex & kFenceIdMask);
1153+
r1->fence_wait_vertex.merge(r0->fence_wait_vertex);
1154+
fence_alias_map_.alias(r0->encoder_id_vertex & kFenceIdMask, r1->encoder_id_vertex & kFenceIdMask);
11361155

11371156
currentFrameStatistics().render_pass_optimized++;
11381157
r0->~RenderEncoderData();
@@ -1151,6 +1170,23 @@ ArgumentEncodingContext::hasDataDependency(EncoderData *latter, EncoderData *for
11511170
/**
11521171
`former` is guaranteed unaliased
11531172
*/
1173+
if (latter->type == EncoderType::Render) {
1174+
auto render_latter = reinterpret_cast<RenderEncoderData *>(latter);
1175+
if (former->type == EncoderType::Render) {
1176+
auto render_former = reinterpret_cast<RenderEncoderData *>(former);
1177+
return render_latter->fence_wait.contains(render_former->id & kFenceIdMask) ||
1178+
render_latter->fence_wait_vertex.contains(render_former->id & kFenceIdMask) ||
1179+
render_latter->fence_wait.contains(render_former->encoder_id_vertex & kFenceIdMask) ||
1180+
render_latter->fence_wait_vertex.contains(render_former->encoder_id_vertex & kFenceIdMask);
1181+
}
1182+
return render_latter->fence_wait.contains(former->id & kFenceIdMask) ||
1183+
render_latter->fence_wait_vertex.contains(former->id & kFenceIdMask);
1184+
}
1185+
if (former->type == EncoderType::Render) {
1186+
auto render_former = reinterpret_cast<RenderEncoderData *>(former);
1187+
return latter->fence_wait.contains(render_former->id & kFenceIdMask) ||
1188+
latter->fence_wait.contains(render_former->encoder_id_vertex & kFenceIdMask);
1189+
}
11541190
return latter->fence_wait.contains(former->id & kFenceIdMask);
11551191
}
11561192

0 commit comments

Comments
 (0)