Skip to content

Commit 278e249

Browse files
authored
Adding AMDGPU command buffer implementation. (iree-org#21123)
This contains the host-side `iree_hal_amdgpu_command_buffer_t` responsible for recording and managing the replicas of the device-side `iree_hal_amdgpu_device_command_buffer_t` instances and the device-side implementation for issuing and executing command buffer programs. The comments for `iree_hal_amdgpu_device_command_buffer_t` and its related commands document how the command buffer is represented in device memory and what each command does. Some of the commands are not yet wired up to host recording code as the device-side implementation is skating ahead of the HAL command buffer API. Even though unconditional branches cannot be recorded directly, they are used for segmenting large command buffers to fit within resource limits. Conditional branches are nominally supported but need an API (similar to indirect dispatches) to be plumbed from the compiler to the command buffer vtable. IREE's HAL is intended to optimize for precompiled command buffers that can amortize the recording cost - this implementation leans in that direction and should be optimal for meaningful (non-benchmark) programs originating from anything targeting CUDA graphs. There are some provisions for small command buffers but the intent is that single-command buffers end up routed towards device queue operations much earlier. Since the device-side queue scheduler does not yet exist, the two methods required (`iree_hal_amdgpu_device_queue_scheduler_*`) are stubbed out to indicate where they'll be used in the future. The API may change in subsequent changes that implement the behavior.
1 parent 8f60ea8 commit 278e249

File tree

12 files changed

+4539
-19
lines changed

12 files changed

+4539
-19
lines changed

runtime/src/iree/hal/drivers/amdgpu/BUILD.bazel

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ iree_runtime_cc_library(
2424
"buffer_pool.h",
2525
"channel.c",
2626
"channel.h",
27-
# "command_buffer.c",
28-
# "command_buffer.h",
27+
"command_buffer.c",
28+
"command_buffer.h",
2929
"driver.c",
3030
"driver.h",
3131
"event.c",
@@ -83,7 +83,7 @@ iree_runtime_cc_library(
8383
"buffer.h",
8484
"buffer_pool.h",
8585
"channel.h",
86-
# "command_buffer.h",
86+
"command_buffer.h",
8787
"driver.h",
8888
"event.h",
8989
"executable.h",

runtime/src/iree/hal/drivers/amdgpu/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ iree_cc_library(
2424
"buffer_pool.h"
2525
"channel.c"
2626
"channel.h"
27+
"command_buffer.c"
28+
"command_buffer.h"
2729
"driver.c"
2830
"driver.h"
2931
"event.c"
@@ -68,6 +70,7 @@ iree_cc_library(
6870
"buffer.h"
6971
"buffer_pool.h"
7072
"channel.h"
73+
"command_buffer.h"
7174
"driver.h"
7275
"event.h"
7376
"executable.h"

runtime/src/iree/hal/drivers/amdgpu/command_buffer.c

Lines changed: 1875 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
// Copyright 2025 The IREE Authors
2+
//
3+
// Licensed under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
7+
#ifndef IREE_HAL_DRIVERS_AMDGPU_COMMAND_BUFFER_H_
8+
#define IREE_HAL_DRIVERS_AMDGPU_COMMAND_BUFFER_H_
9+
10+
#include "iree/base/api.h"
11+
#include "iree/base/internal/arena.h"
12+
#include "iree/hal/api.h"
13+
#include "iree/hal/drivers/amdgpu/device/command_buffer.h"
14+
#include "iree/hal/drivers/amdgpu/util/affinity.h"
15+
16+
typedef struct iree_hal_amdgpu_block_pools_t iree_hal_amdgpu_block_pools_t;
17+
18+
//===----------------------------------------------------------------------===//
19+
// iree_hal_amdgpu_command_buffer_options_t
20+
//===----------------------------------------------------------------------===//
21+
22+
// Determines where and how command buffers are recorded.
23+
typedef enum iree_hal_amdgpu_command_buffer_recording_flags_t {
24+
IREE_HAL_AMDGPU_COMMAND_BUFFER_RECORDING_FLAG_NONE = 0u,
25+
26+
// TODO(benvanik): support lead-physical-device storage. This would need the
27+
// block pool on the lead device to make its blocks accessible to all devices
28+
// - today the block pool is device-local only. Produced data is immutable and
29+
// PCIe atomics/coherency is not required across devices.
30+
//
31+
// Allocate embedded data on the lead physical device instead of on each
32+
// device the command buffer is recorded for. This reduces overall memory
33+
// consumption and recording time at the cost of cross-device transfers.
34+
// IREE_HAL_AMDGPU_COMMAND_BUFFER_RECORDING_FLAG_DATA_ON_LEAD_PHYSICAL_DEVICE
35+
// = 1u << 0,
36+
37+
// TODO(benvanik): support compaction. This would require changing the command
38+
// buffer to use relative offsets for embedded data and a data table for
39+
// indirecting so that we can move around base pointers. A fixup would be
40+
// possible as well by launching a kernel that rebased the embedded pointers
41+
// (though trickier). For now we assume the block pool block size is a big
42+
// enough lever and most programs only use a handful of command buffers so
43+
// the waste per command buffer is minimal (compared to a single layer weight
44+
// in an ML model).
45+
//
46+
// Compacts the command buffer when recording ends by reallocating it to the
47+
// precise size required and reuploads it to each device. This will return any
48+
// block pool blocks back to their respective pool for reuse and ensure
49+
// there's no unused device memory - the cost is extra host time to do the
50+
// reallocation/copies.
51+
// IREE_HAL_AMDGPU_COMMAND_BUFFER_RECORDING_FLAG_COMPACT_ON_FINALIZE
52+
// = 1u << 1,
53+
} iree_hal_amdgpu_command_buffer_recording_flags_t;
54+
55+
// TODO(benvanik): move this someplace common.
56+
//
57+
// Block pools for host memory blocks of various sizes.
58+
// Pair of host-memory arena block pools, one per allocation size class.
typedef struct iree_hal_amdgpu_host_block_pools_t {
59+
// Pool serving small allocations of around 1-4KB.
60+
iree_arena_block_pool_t small;
61+
// Pool serving large page-sized allocations of 32-64KB.
62+
iree_arena_block_pool_t large;
63+
} iree_hal_amdgpu_host_block_pools_t;
64+
65+
// Minimum number of AQL packets in a single command buffer block.
66+
// Any fewer and it's not guaranteed a command buffer can complete execution.
67+
#define IREE_HAL_AMDGPU_COMMAND_BUFFER_MIN_BLOCK_AQL_PACKET_COUNT (16)
68+
69+
// Maximum number of AQL packets in a single command buffer block.
70+
// This is currently limited by the `uint16_t packet_offset` in
71+
// iree_hal_amdgpu_device_cmd_header_t.
72+
//
73+
// TODO(benvanik): currently we also limit this by tracy's outstanding GPU event
74+
// limit. If we made our own timeline (which we really need to for concurrency)
75+
// then we could eliminate this artificial limit.
76+
// The second bound is 2^(bit width of packet_offset), i.e. one packet per
// value representable in the header's packet_offset field. The shift amount is
// grouped explicitly so the expression does not rely on `*` binding tighter
// than `<<`.
#define IREE_HAL_AMDGPU_COMMAND_BUFFER_MAX_BLOCK_AQL_PACKET_COUNT            \
  IREE_AMDGPU_MIN(                                                           \
      IREE_HAL_AMDGPU_DEVICE_QUERY_RINGBUFFER_CAPACITY,                      \
      (1u << (8 * sizeof(((iree_hal_amdgpu_device_cmd_header_t*)NULL)        \
                             ->packet_offset))))
81+
82+
// Recording options for a command buffer.
83+
// Referenced data structures such as block pools must remain live for the
84+
// lifetime of the command buffer but the options struct and its storage (such
85+
// as the device block pool list) need not.
86+
typedef struct iree_hal_amdgpu_command_buffer_options_t {
86+
// NOTE(review): the next five fields share their names and types with the
// parameters of iree_hal_amdgpu_command_buffer_options_initialize and are
// presumably populated from them - confirm against command_buffer.c.
87+
iree_hal_allocator_t* device_allocator;
88+
iree_hal_command_buffer_mode_t mode;
89+
iree_hal_command_category_t command_categories;
90+
iree_hal_queue_affinity_t queue_affinity;
91+
iree_host_size_t binding_capacity;
92+
93+
// Controls recording behavior (placement, optimization, debugging, etc).
94+
iree_hal_amdgpu_command_buffer_recording_flags_t recording_flags;
95+
96+
// Maximum number of AQL packets the command buffer is allowed to issue at
97+
// a time. Must be at or under the HSA queue capacity of any execution queue
98+
// the command buffer will be scheduled on. The command buffer may decide to
99+
// use fewer packets.
100+
iree_host_size_t block_aql_packet_count;
101+
102+
// Block pools for host memory blocks of various sizes.
// Held by pointer: the pools are referenced, not copied, and must remain
// live for the lifetime of the command buffer.
103+
iree_hal_amdgpu_host_block_pools_t* host_block_pools;
104+
105+
// Bitmap of physical devices that the command buffer will be recorded for.
106+
// The command buffer can only be issued on these devices.
107+
iree_hal_amdgpu_device_affinity_t device_affinity;
108+
109+
// Compact list of physical device block pools corresponding to the bits set
110+
// in the device_affinity bitmap. A device affinity of 0b110 would lead to two
111+
// device block pools in the list at [0] and [1].
// The list storage itself need not outlive the options struct, but the block
// pools it points to must remain live for the lifetime of the command buffer.
112+
iree_hal_amdgpu_block_pools_t* const* device_block_pools /*[device_count]*/;
113+
} iree_hal_amdgpu_command_buffer_options_t;
114+
115+
// Initializes |out_options| to its default values.
116+
void iree_hal_amdgpu_command_buffer_options_initialize(
117+
iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode,
118+
iree_hal_command_category_t command_categories,
119+
iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
120+
iree_hal_amdgpu_command_buffer_options_t* out_options);
121+
122+
// Verifies command buffer options to ensure they meet the requirements of the
123+
// devices the command buffer will be scheduled on.
124+
iree_status_t iree_hal_amdgpu_command_buffer_options_verify(
125+
const iree_hal_amdgpu_command_buffer_options_t* options);
126+
127+
//===----------------------------------------------------------------------===//
128+
// iree_hal_amdgpu_command_buffer_t
129+
//===----------------------------------------------------------------------===//
130+
131+
// Creates an AMDGPU command buffer with the given |options| controlling how
132+
// it is recorded and prepared for execution.
133+
//
134+
// Referenced data structures in the options such as block pools must remain
135+
// live for the lifetime of the command buffer.
136+
iree_status_t iree_hal_amdgpu_command_buffer_create(
137+
const iree_hal_amdgpu_command_buffer_options_t* options,
138+
iree_allocator_t host_allocator,
139+
iree_hal_command_buffer_t** out_command_buffer);
140+
141+
// Returns true if |command_buffer| is an AMDGPU command buffer.
142+
bool iree_hal_amdgpu_command_buffer_isa(
143+
iree_hal_command_buffer_t* command_buffer);
144+
145+
// Queries the device-side command buffer representation for the GPU device
146+
// agent with |device_ordinal| in the system topology.
147+
// |out_max_kernarg_capacity| will be set to the minimum required kernarg
148+
// reservation used by any block in the command buffer.
149+
iree_status_t iree_hal_amdgpu_command_buffer_query_execution_state(
150+
iree_hal_command_buffer_t* command_buffer, iree_host_size_t device_ordinal,
151+
IREE_AMDGPU_DEVICE_PTR iree_hal_amdgpu_device_command_buffer_t**
152+
out_device_command_buffer,
153+
iree_host_size_t* out_max_kernarg_capacity);
154+
155+
#endif // IREE_HAL_DRIVERS_AMDGPU_COMMAND_BUFFER_H_

runtime/src/iree/hal/drivers/amdgpu/device/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,15 @@
1111
set(_BITCODE_SRCS
1212
"blit.c"
1313
"buffer.c"
14+
"command_buffer.c"
1415
"host_client.c"
1516
"tracing.c"
1617
)
1718

1819
set(_BITCODE_HDRS
1920
"blit.h"
2021
"buffer.h"
22+
"command_buffer.h"
2123
"kernel_tables.h"
2224
"kernels.h"
2325
"host_client.h"

runtime/src/iree/hal/drivers/amdgpu/device/buffer.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,3 +63,26 @@ void* iree_hal_amdgpu_device_workgroup_count_buffer_ref_resolve(
6363
? (uint8_t*)buffer_ref.value.ptr + buffer_ref.offset
6464
: NULL;
6565
}
66+
67+
void* iree_hal_amdgpu_device_uint64_buffer_ref_resolve(
68+
iree_hal_amdgpu_device_uint64_buffer_ref_t buffer_ref,
69+
IREE_AMDGPU_ALIGNAS(64)
70+
const iree_hal_amdgpu_device_buffer_ref_t* IREE_AMDGPU_RESTRICT
71+
binding_table) {
72+
if (buffer_ref.type == IREE_HAL_AMDGPU_DEVICE_BUFFER_TYPE_SLOT) {
73+
const iree_hal_amdgpu_device_buffer_ref_t binding =
74+
binding_table[buffer_ref.value.slot];
75+
const uint64_t offset = buffer_ref.offset + binding.offset;
76+
buffer_ref = (iree_hal_amdgpu_device_uint64_buffer_ref_t){
77+
.type = binding.type,
78+
.offset = offset,
79+
.value.bits = binding.value.bits,
80+
};
81+
}
82+
if (buffer_ref.type == IREE_HAL_AMDGPU_DEVICE_BUFFER_TYPE_HANDLE) {
83+
buffer_ref.value.ptr = buffer_ref.value.handle->ptr;
84+
}
85+
return buffer_ref.value.ptr
86+
? (uint8_t*)buffer_ref.value.ptr + buffer_ref.offset
87+
: NULL;
88+
}

runtime/src/iree/hal/drivers/amdgpu/device/buffer.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,34 @@ static_assert(sizeof(iree_hal_amdgpu_device_workgroup_count_buffer_ref_t) == 16,
131131
#define iree_hal_amdgpu_device_workgroup_count_buffer_ref_length(buffer_ref) \
132132
(sizeof(uint32_t) * 3)
133133

134+
// Describes a buffer binding that contains a single uint64_t value.
135+
// This is a size-optimized version of iree_hal_amdgpu_device_buffer_ref_t so
136+
// that it will fit in our tiny packets. We know the length is a constant 8 and
137+
// only need the offset, type, and value.
138+
typedef struct iree_hal_amdgpu_device_uint64_buffer_ref_t {
139+
// Type of the buffer reference used to resolve the device pointer.
// Declared as a 2-bit field sharing a uint64_t with |offset| (2 + 62 bits).
140+
uint64_t type : 2; // iree_hal_amdgpu_device_buffer_type_t
141+
// Offset, in bytes, into the buffer that the binding starts at.
142+
// This will be added to the offset specified on each usage of the slot.
143+
uint64_t offset : 62;
144+
// Payload interpreted according to |type|; one member is active at a time.
union {
145+
// IREE_HAL_AMDGPU_DEVICE_BUFFER_TYPE_PTR: raw device pointer.
146+
void* ptr;
147+
// IREE_HAL_AMDGPU_DEVICE_BUFFER_TYPE_HANDLE: queue-ordered allocation
148+
// handle.
149+
iree_hal_amdgpu_device_allocation_handle_t* handle;
150+
// IREE_HAL_AMDGPU_DEVICE_BUFFER_TYPE_SLOT: binding table slot.
151+
iree_hal_amdgpu_device_buffer_ordinal_t slot;
152+
// Used for setting the value.
// Raw 64-bit view of the payload; lets the value be copied without knowing
// which typed member is active (as the resolve function does for slots).
153+
uint64_t bits;
154+
} value;
155+
} iree_hal_amdgpu_device_uint64_buffer_ref_t;
156+
static_assert(sizeof(iree_hal_amdgpu_device_uint64_buffer_ref_t) == 16,
157+
"binding table entries should be 8 byte aligned and tiny");
158+
159+
// Length, in bytes, of the value referenced by a uint64_t buffer reference.
// Always sizeof(uint64_t); |buffer_ref| is accepted only so the macro has the
// same shape as the other *_buffer_ref_length macros and is never evaluated.
// The expansion is fully parenthesized so it composes safely inside larger
// expressions, matching the workgroup_count variant.
#define iree_hal_amdgpu_device_uint64_buffer_ref_length(buffer_ref) \
  (sizeof(uint64_t))
161+
134162
#if defined(IREE_AMDGPU_TARGET_DEVICE)
135163

136164
// Resolves a buffer reference to an absolute device pointer.
@@ -155,6 +183,16 @@ void* iree_hal_amdgpu_device_workgroup_count_buffer_ref_resolve(
155183
const iree_hal_amdgpu_device_buffer_ref_t* IREE_AMDGPU_RESTRICT
156184
binding_table);
157185

186+
// Resolves a scalar uint64_t buffer reference to an absolute device pointer.
187+
// This is equivalent to iree_hal_amdgpu_device_buffer_ref_resolve but for a
188+
// fixed-size uint64_t value. The returned pointer should have 8-byte
189+
// alignment.
190+
void* iree_hal_amdgpu_device_uint64_buffer_ref_resolve(
191+
iree_hal_amdgpu_device_uint64_buffer_ref_t buffer_ref,
192+
IREE_AMDGPU_ALIGNAS(64)
193+
const iree_hal_amdgpu_device_buffer_ref_t* IREE_AMDGPU_RESTRICT
194+
binding_table);
195+
158196
#endif // IREE_AMDGPU_TARGET_DEVICE
159197

160198
#endif // IREE_HAL_DRIVERS_AMDGPU_DEVICE_BUFFER_H_

0 commit comments

Comments
 (0)