Skip to content

Commit 1e1a6e3

Browse files
committed
Adding iree_hal_device_queue_update.
As with all queue DMA operations it's best if things are batched into command buffers but it's bad to have a command buffer with a single DMA operation - this completes the set of fill/update/copy operations at the queue level to match the command buffer DMA operations. Practically this is useful when combined with reusable/indirect command buffers for uploading new parameters in queue order prior to issuing a command buffer that references them. The compiler will use this to turn push constants into uniform buffers. An emulated version is added but implementations are encouraged to do better... they currently don't.
1 parent 632bc11 commit 1e1a6e3

File tree

12 files changed

+177
-23
lines changed

12 files changed

+177
-23
lines changed

experimental/webgpu/webgpu_device.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,7 @@ const iree_hal_device_vtable_t iree_hal_webgpu_device_vtable = {
470470
.queue_alloca = iree_hal_webgpu_device_queue_alloca,
471471
.queue_dealloca = iree_hal_webgpu_device_queue_dealloca,
472472
.queue_fill = iree_hal_device_queue_emulated_fill,
473+
.queue_update = iree_hal_device_queue_emulated_update,
473474
.queue_copy = iree_hal_device_queue_emulated_copy,
474475
.queue_read = iree_hal_webgpu_device_queue_read,
475476
.queue_write = iree_hal_webgpu_device_queue_write,

runtime/src/iree/hal/command_buffer.c

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -662,6 +662,15 @@ IREE_API_EXPORT iree_status_t iree_hal_create_transfer_command_buffer(
662662
transfer_command->fill.pattern,
663663
transfer_command->fill.pattern_length, IREE_HAL_FILL_FLAG_NONE);
664664
break;
665+
case IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE:
666+
status = iree_hal_command_buffer_update_buffer(
667+
command_buffer, transfer_command->update.source_buffer,
668+
transfer_command->update.source_offset,
669+
iree_hal_make_buffer_ref(transfer_command->update.target_buffer,
670+
transfer_command->update.target_offset,
671+
transfer_command->update.length),
672+
IREE_HAL_UPDATE_FLAG_NONE);
673+
break;
665674
case IREE_HAL_TRANSFER_COMMAND_TYPE_COPY:
666675
status = iree_hal_command_buffer_copy_buffer(
667676
command_buffer,
@@ -673,15 +682,6 @@ IREE_API_EXPORT iree_status_t iree_hal_create_transfer_command_buffer(
673682
transfer_command->copy.length),
674683
IREE_HAL_COPY_FLAG_NONE);
675684
break;
676-
case IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE:
677-
status = iree_hal_command_buffer_update_buffer(
678-
command_buffer, transfer_command->update.source_buffer,
679-
transfer_command->update.source_offset,
680-
iree_hal_make_buffer_ref(transfer_command->update.target_buffer,
681-
transfer_command->update.target_offset,
682-
transfer_command->update.length),
683-
IREE_HAL_UPDATE_FLAG_NONE);
684-
break;
685685
default:
686686
status =
687687
iree_make_status(IREE_STATUS_INVALID_ARGUMENT,

runtime/src/iree/hal/command_buffer.h

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ enum iree_hal_fill_flag_bits_t {
230230
IREE_HAL_FILL_FLAG_NONE = 0,
231231
};
232232

233-
// Bitfield specifying flags controlling a update operation.
233+
// Bitfield specifying flags controlling an update operation.
234234
typedef uint64_t iree_hal_update_flags_t;
235235
enum iree_hal_update_flag_bits_t {
236236
IREE_HAL_UPDATE_FLAG_NONE = 0,
@@ -802,10 +802,10 @@ IREE_API_EXPORT iree_status_t iree_hal_command_buffer_validate_submission(
802802
typedef enum iree_hal_transfer_command_type_t {
803803
// iree_hal_command_buffer_fill_buffer
804804
IREE_HAL_TRANSFER_COMMAND_TYPE_FILL = 0u,
805-
// iree_hal_command_buffer_copy_buffer
806-
IREE_HAL_TRANSFER_COMMAND_TYPE_COPY = 1u,
807805
// iree_hal_command_buffer_update_buffer
808-
IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE = 2u,
806+
IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE = 1u,
807+
// iree_hal_command_buffer_copy_buffer
808+
IREE_HAL_TRANSFER_COMMAND_TYPE_COPY = 2u,
809809
} iree_hal_transfer_command_type_t;
810810

811811
// Represents a single transfer command within a batch of commands.
@@ -821,14 +821,6 @@ typedef struct iree_hal_transfer_command_t {
821821
const void* pattern;
822822
iree_host_size_t pattern_length;
823823
} fill;
824-
// IREE_HAL_TRANSFER_COMMAND_TYPE_COPY
825-
struct {
826-
iree_hal_buffer_t* source_buffer;
827-
iree_device_size_t source_offset;
828-
iree_hal_buffer_t* target_buffer;
829-
iree_device_size_t target_offset;
830-
iree_device_size_t length;
831-
} copy;
832824
// IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE
833825
struct {
834826
const void* source_buffer;
@@ -837,6 +829,14 @@ typedef struct iree_hal_transfer_command_t {
837829
iree_device_size_t target_offset;
838830
iree_device_size_t length;
839831
} update;
832+
// IREE_HAL_TRANSFER_COMMAND_TYPE_COPY
833+
struct {
834+
iree_hal_buffer_t* source_buffer;
835+
iree_device_size_t source_offset;
836+
iree_hal_buffer_t* target_buffer;
837+
iree_device_size_t target_offset;
838+
iree_device_size_t length;
839+
} copy;
840840
};
841841
} iree_hal_transfer_command_t;
842842

runtime/src/iree/hal/device.c

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,91 @@ IREE_API_EXPORT iree_status_t iree_hal_device_queue_fill(
196196
return status;
197197
}
198198

199+
// Emulates a queue-ordered buffer update by recording a transfer command
// buffer containing a single update command and executing it on |device| with
// the provided wait/signal semaphore lists.
//
// |source_buffer| + |source_offset| name the host memory to read from and
// |target_buffer| + |target_offset| name the destination range of |length|
// bytes.
//
// Returns IREE_STATUS_UNIMPLEMENTED if |length| exceeds UINT16_MAX as updates
// are not yet split into multiple chunks. Otherwise returns the status of
// creating or executing the transfer command buffer. |flags| is not consulted
// by this emulated path.
IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_update(
    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
    const iree_hal_semaphore_list_t wait_semaphore_list,
    const iree_hal_semaphore_list_t signal_semaphore_list,
    const void* source_buffer, iree_host_size_t source_offset,
    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
    iree_device_size_t length, iree_hal_update_flags_t flags) {
  IREE_ASSERT_ARGUMENT(device);
  IREE_ASSERT_ARGUMENT(source_buffer);
  IREE_ASSERT_ARGUMENT(target_buffer);
  IREE_TRACE_ZONE_BEGIN(z0);
  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)length);

  // If we are starting execution immediately then we can reduce latency by
  // allowing inline command buffer execution.
  iree_hal_command_buffer_mode_t command_buffer_mode =
      IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT;
  if (wait_semaphore_list.count == 0) {
    command_buffer_mode |= IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION;
  }

  // TODO(benvanik): support splitting the update into multiple chunks to fit
  // under the max command buffer update size limit. This provisional API is
  // intended only for updating dispatch parameters today.
  if (length > UINT16_MAX) {
    // The trace zone opened above must be closed on every exit path.
    IREE_TRACE_ZONE_END(z0);
    // |length| is an iree_device_size_t and must be formatted with the
    // device-size specifier; the host-size specifier may have a different
    // width.
    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                            "queue buffer updates currently limited to 64KB, "
                            "tried to update %" PRIdsz " bytes",
                            length);
  }

  iree_hal_transfer_command_t command = {
      .type = IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE,
      .update =
          {
              .source_buffer = source_buffer,
              .source_offset = source_offset,
              .target_buffer = target_buffer,
              .target_offset = target_offset,
              .length = length,
          },
  };

  iree_hal_command_buffer_t* command_buffer = NULL;
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hal_create_transfer_command_buffer(device, command_buffer_mode,
                                                  queue_affinity, 1, &command,
                                                  &command_buffer));

  iree_status_t status = iree_hal_device_queue_execute(
      device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
      command_buffer, iree_hal_buffer_binding_table_empty());

  // Drop our reference whether or not the submission succeeded.
  iree_hal_command_buffer_release(command_buffer);

  IREE_TRACE_ZONE_END(z0);
  return status;
}
257+
258+
// Public entry point for queue-ordered buffer updates: asserts the arguments,
// records |length| on a trace zone, and forwards every argument unchanged to
// the queue_update entry of the device vtable. The status produced by that
// entry is returned to the caller.
IREE_API_EXPORT iree_status_t iree_hal_device_queue_update(
    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
    const iree_hal_semaphore_list_t wait_semaphore_list,
    const iree_hal_semaphore_list_t signal_semaphore_list,
    const void* source_buffer, iree_host_size_t source_offset,
    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
    iree_device_size_t length, iree_hal_update_flags_t flags) {
  IREE_ASSERT_ARGUMENT(device);
  // A non-empty semaphore list must supply both of its backing arrays.
  IREE_ASSERT_ARGUMENT(wait_semaphore_list.count == 0 ||
                       (wait_semaphore_list.semaphores &&
                        wait_semaphore_list.payload_values));
  IREE_ASSERT_ARGUMENT(signal_semaphore_list.count == 0 ||
                       (signal_semaphore_list.semaphores &&
                        signal_semaphore_list.payload_values));
  IREE_ASSERT_ARGUMENT(source_buffer);
  IREE_ASSERT_ARGUMENT(target_buffer);

  IREE_TRACE_ZONE_BEGIN(z0);
  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)length);

  // Hand off to the device-specific implementation.
  iree_status_t dispatch_status = _VTABLE_DISPATCH(device, queue_update)(
      device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
      source_buffer, source_offset, target_buffer, target_offset, length,
      flags);

  IREE_TRACE_ZONE_END(z0);
  return dispatch_status;
}
283+
199284
IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_copy(
200285
iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
201286
const iree_hal_semaphore_list_t wait_semaphore_list,

runtime/src/iree/hal/device.h

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -312,11 +312,12 @@ IREE_API_EXPORT iree_status_t iree_hal_device_queue_dealloca(
312312
iree_hal_buffer_t* buffer);
313313

314314
// Enqueues a single queue-ordered fill operation.
315+
// The |target_buffer| must be visible to the device queue performing the fill.
315316
//
316317
// WARNING: individual fills have a high overhead and batching should be
317318
// performed by the caller instead of calling this multiple times. The
318319
// iree_hal_create_transfer_command_buffer utility makes it easy to create
319-
// batches of transfer operations (fill, copy, update) and is only a few lines
320+
// batches of transfer operations (fill, update, copy) and is only a few lines
320321
// more code.
321322
IREE_API_EXPORT iree_status_t iree_hal_device_queue_fill(
322323
iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
@@ -326,12 +327,36 @@ IREE_API_EXPORT iree_status_t iree_hal_device_queue_fill(
326327
iree_device_size_t length, const void* pattern,
327328
iree_host_size_t pattern_length, iree_hal_fill_flags_t flags);
328329

330+
// Enqueues a single queue-ordered buffer update operation.
331+
// The provided |source_buffer| will be captured and need not remain live or
332+
// unchanged while the operation is queued. The |target_buffer| must be visible
333+
// to the device queue performing the update.
334+
//
335+
// Some implementations may have limits on the size of the update or may perform
336+
// poorly if the size is larger than an implementation-defined limit. Updates
337+
// should be kept as small and infrequent as possible.
338+
//
339+
// WARNING: individual updates have a high overhead and batching should be
340+
// performed by the caller instead of calling this multiple times. The
341+
// iree_hal_create_transfer_command_buffer utility makes it easy to create
342+
// batches of transfer operations (fill, update, copy) and is only a few lines
343+
// more code.
344+
IREE_API_EXPORT iree_status_t iree_hal_device_queue_update(
345+
iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
346+
const iree_hal_semaphore_list_t wait_semaphore_list,
347+
const iree_hal_semaphore_list_t signal_semaphore_list,
348+
const void* source_buffer, iree_host_size_t source_offset,
349+
iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
350+
iree_device_size_t length, iree_hal_update_flags_t flags);
351+
329352
// Enqueues a single queue-ordered copy operation.
353+
// The |source_buffer| and |target_buffer| must both be visible to the device
354+
// queue performing the copy.
330355
//
331356
// WARNING: individual copies have a high overhead and batching should be
332357
// performed by the caller instead of calling this multiple times. The
333358
// iree_hal_create_transfer_command_buffer utility makes it easy to create
334-
// batches of transfer operations (fill, copy, update) and is only a few lines
359+
// batches of transfer operations (fill, update, copy) and is only a few lines
335360
// more code.
336361
IREE_API_EXPORT iree_status_t iree_hal_device_queue_copy(
337362
iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
@@ -578,6 +603,14 @@ typedef struct iree_hal_device_vtable_t {
578603
iree_device_size_t length, const void* pattern,
579604
iree_host_size_t pattern_length, iree_hal_fill_flags_t flags);
580605

606+
iree_status_t(IREE_API_PTR* queue_update)(
607+
iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
608+
const iree_hal_semaphore_list_t wait_semaphore_list,
609+
const iree_hal_semaphore_list_t signal_semaphore_list,
610+
const void* source_buffer, iree_host_size_t source_offset,
611+
iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
612+
iree_device_size_t length, iree_hal_update_flags_t flags);
613+
581614
iree_status_t(IREE_API_PTR* queue_copy)(
582615
iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
583616
const iree_hal_semaphore_list_t wait_semaphore_list,
@@ -634,6 +667,14 @@ IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_fill(
634667
iree_device_size_t length, const void* pattern,
635668
iree_host_size_t pattern_length, iree_hal_fill_flags_t flags);
636669

670+
IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_update(
671+
iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
672+
const iree_hal_semaphore_list_t wait_semaphore_list,
673+
const iree_hal_semaphore_list_t signal_semaphore_list,
674+
const void* source_buffer, iree_host_size_t source_offset,
675+
iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
676+
iree_device_size_t length, iree_hal_update_flags_t flags);
677+
637678
IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_copy(
638679
iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
639680
const iree_hal_semaphore_list_t wait_semaphore_list,

runtime/src/iree/hal/drivers/cuda/cuda_device.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1130,6 +1130,7 @@ static const iree_hal_device_vtable_t iree_hal_cuda_device_vtable = {
11301130
.queue_alloca = iree_hal_cuda_device_queue_alloca,
11311131
.queue_dealloca = iree_hal_cuda_device_queue_dealloca,
11321132
.queue_fill = iree_hal_device_queue_emulated_fill,
1133+
.queue_update = iree_hal_device_queue_emulated_update,
11331134
.queue_copy = iree_hal_device_queue_emulated_copy,
11341135
.queue_read = iree_hal_cuda_device_queue_read,
11351136
.queue_write = iree_hal_cuda_device_queue_write,

runtime/src/iree/hal/drivers/hip/hip_device.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1127,6 +1127,7 @@ static const iree_hal_device_vtable_t iree_hal_hip_device_vtable = {
11271127
.queue_alloca = iree_hal_hip_device_queue_alloca,
11281128
.queue_dealloca = iree_hal_hip_device_queue_dealloca,
11291129
.queue_fill = iree_hal_device_queue_emulated_fill,
1130+
.queue_update = iree_hal_device_queue_emulated_update,
11301131
.queue_copy = iree_hal_device_queue_emulated_copy,
11311132
.queue_read = iree_hal_hip_device_queue_read,
11321133
.queue_write = iree_hal_hip_device_queue_write,

runtime/src/iree/hal/drivers/local_sync/sync_device.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -504,6 +504,7 @@ static const iree_hal_device_vtable_t iree_hal_sync_device_vtable = {
504504
.queue_alloca = iree_hal_sync_device_queue_alloca,
505505
.queue_dealloca = iree_hal_sync_device_queue_dealloca,
506506
.queue_fill = iree_hal_device_queue_emulated_fill,
507+
.queue_update = iree_hal_device_queue_emulated_update,
507508
.queue_copy = iree_hal_device_queue_emulated_copy,
508509
.queue_read = iree_hal_sync_device_queue_read,
509510
.queue_write = iree_hal_sync_device_queue_write,

runtime/src/iree/hal/drivers/local_task/task_device.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,7 @@ static const iree_hal_device_vtable_t iree_hal_task_device_vtable = {
539539
.queue_alloca = iree_hal_task_device_queue_alloca,
540540
.queue_dealloca = iree_hal_task_device_queue_dealloca,
541541
.queue_fill = iree_hal_device_queue_emulated_fill,
542+
.queue_update = iree_hal_device_queue_emulated_update,
542543
.queue_copy = iree_hal_device_queue_emulated_copy,
543544
.queue_read = iree_hal_task_device_queue_read,
544545
.queue_write = iree_hal_task_device_queue_write,

runtime/src/iree/hal/drivers/metal/metal_device.m

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -618,6 +618,7 @@ static iree_status_t iree_hal_metal_device_profiling_end(iree_hal_device_t* base
618618
.queue_alloca = iree_hal_metal_device_queue_alloca,
619619
.queue_dealloca = iree_hal_metal_device_queue_dealloca,
620620
.queue_fill = iree_hal_device_queue_emulated_fill,
621+
.queue_update = iree_hal_device_queue_emulated_update,
621622
.queue_copy = iree_hal_device_queue_emulated_copy,
622623
.queue_read = iree_hal_metal_device_queue_read,
623624
.queue_write = iree_hal_metal_device_queue_write,

0 commit comments

Comments
 (0)