// Copyright 2025 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef IREE_HAL_DRIVERS_AMDGPU_COMMAND_BUFFER_H_
#define IREE_HAL_DRIVERS_AMDGPU_COMMAND_BUFFER_H_

#include "iree/base/api.h"
#include "iree/base/internal/arena.h"
#include "iree/hal/api.h"
#include "iree/hal/drivers/amdgpu/device/command_buffer.h"
#include "iree/hal/drivers/amdgpu/util/affinity.h"

typedef struct iree_hal_amdgpu_block_pools_t iree_hal_amdgpu_block_pools_t;

//===----------------------------------------------------------------------===//
// iree_hal_amdgpu_command_buffer_options_t
//===----------------------------------------------------------------------===//

// Determines where and how command buffers are recorded.
typedef enum iree_hal_amdgpu_command_buffer_recording_flags_t {
  IREE_HAL_AMDGPU_COMMAND_BUFFER_RECORDING_FLAG_NONE = 0u,

  // TODO(benvanik): support lead-physical-device storage. This would need the
  // block pool on the lead device to make its blocks accessible to all devices
  // - today the block pool is device-local only. Produced data is immutable,
  // so PCIe atomics/coherency are not required across devices.
  //
  // Allocate embedded data on the lead physical device instead of on each
  // device the command buffer is recorded for. This reduces overall memory
  // consumption and recording time at the cost of cross-device transfers.
  // IREE_HAL_AMDGPU_COMMAND_BUFFER_RECORDING_FLAG_DATA_ON_LEAD_PHYSICAL_DEVICE
  //     = 1u << 0,

  // TODO(benvanik): support compaction. This would require changing the
  // command buffer to use relative offsets for embedded data and a data table
  // for indirection so that we can move base pointers around. A fixup pass
  // would also be possible by launching a kernel that rebases the embedded
  // pointers (though that is trickier). For now we assume the block pool block
  // size is a big enough lever and that most programs only use a handful of
  // command buffers, so the waste per command buffer is minimal (compared to a
  // single layer weight in an ML model).
  //
  // Compacts the command buffer when recording ends by reallocating it to the
  // precise size required and reuploading it to each device. This returns any
  // block pool blocks to their respective pools for reuse and ensures there's
  // no unused device memory - the cost is extra host time spent on the
  // reallocation/copies.
  // IREE_HAL_AMDGPU_COMMAND_BUFFER_RECORDING_FLAG_COMPACT_ON_FINALIZE
  //     = 1u << 1,
} iree_hal_amdgpu_command_buffer_recording_flags_t;

// TODO(benvanik): move this someplace common.
//
// Block pools for host memory blocks of various sizes.
typedef struct iree_hal_amdgpu_host_block_pools_t {
  // Used for small allocations of around 1-4KB.
  iree_arena_block_pool_t small;
  // Used for large page-sized allocations of 32-64KB.
  iree_arena_block_pool_t large;
} iree_hal_amdgpu_host_block_pools_t;
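
// Example (illustrative sketch): populating the host block pools before
// recording. The block sizes and the host_allocator name are hypothetical and
// this assumes the iree_arena_block_pool_initialize() signature from
// iree/base/internal/arena.h:
//   iree_hal_amdgpu_host_block_pools_t host_block_pools;
//   iree_arena_block_pool_initialize(/*usable_block_size=*/4 * 1024,
//                                    host_allocator, &host_block_pools.small);
//   iree_arena_block_pool_initialize(/*usable_block_size=*/64 * 1024,
//                                    host_allocator, &host_block_pools.large);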

// Minimum number of AQL packets in a single command buffer block.
// With any fewer a command buffer is not guaranteed to complete execution.
#define IREE_HAL_AMDGPU_COMMAND_BUFFER_MIN_BLOCK_AQL_PACKET_COUNT (16)

// Maximum number of AQL packets in a single command buffer block.
// This is currently limited by the `uint16_t packet_offset` in
// iree_hal_amdgpu_device_cmd_header_t.
//
// TODO(benvanik): currently we also limit this by Tracy's outstanding GPU
// event limit. If we made our own timeline (which we really need to do for
// concurrency) then we could eliminate this artificial limit.
#define IREE_HAL_AMDGPU_COMMAND_BUFFER_MAX_BLOCK_AQL_PACKET_COUNT \
  IREE_AMDGPU_MIN(IREE_HAL_AMDGPU_DEVICE_QUERY_RINGBUFFER_CAPACITY, \
                  (1u << sizeof(((iree_hal_amdgpu_device_cmd_header_t*)NULL) \
                                    ->packet_offset) * \
                      8))
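
// For example: with a uint16_t packet_offset the sizeof-derived bound is
// 1u << (2 * 8) = 65536 packets, so the effective maximum is
// min(IREE_HAL_AMDGPU_DEVICE_QUERY_RINGBUFFER_CAPACITY, 65536).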

// Recording options for a command buffer.
// Referenced data structures such as block pools must remain live for the
// lifetime of the command buffer, but the options struct and its storage
// (such as the device block pool list) need not.
typedef struct iree_hal_amdgpu_command_buffer_options_t {
  iree_hal_allocator_t* device_allocator;
  iree_hal_command_buffer_mode_t mode;
  iree_hal_command_category_t command_categories;
  iree_hal_queue_affinity_t queue_affinity;
  iree_host_size_t binding_capacity;

  // Controls recording behavior (placement, optimization, debugging, etc).
  iree_hal_amdgpu_command_buffer_recording_flags_t recording_flags;

  // Maximum number of AQL packets the command buffer is allowed to issue at
  // a time. Must be at or under the HSA queue capacity of any execution queue
  // the command buffer will be scheduled on. The command buffer may decide to
  // use fewer packets.
  iree_host_size_t block_aql_packet_count;

  // Block pools for host memory blocks of various sizes.
  iree_hal_amdgpu_host_block_pools_t* host_block_pools;

  // Bitmap of physical devices that the command buffer will be recorded for.
  // The command buffer can only be issued on these devices.
  iree_hal_amdgpu_device_affinity_t device_affinity;

  // Compact list of physical device block pools corresponding to the bits set
  // in the device_affinity bitmap. A device affinity of 0b110 would lead to
  // two device block pools in the list at [0] and [1] (see the packing sketch
  // below this struct).
  iree_hal_amdgpu_block_pools_t* const* device_block_pools /*[device_count]*/;
} iree_hal_amdgpu_command_buffer_options_t;
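
// Example (illustrative sketch): packing the compact device_block_pools list
// from the affinity bitmap. This assumes the affinity type can be tested as a
// plain integer bitmap; all_block_pools and total_physical_device_count are
// hypothetical names for the caller's per-device state:
//   // device_affinity = 0b110 -> compact_pools[0] holds physical device 1's
//   // pools and compact_pools[1] holds physical device 2's pools.
//   iree_hal_amdgpu_block_pools_t* compact_pools[2] = {NULL, NULL};
//   iree_host_size_t pool_index = 0;
//   for (iree_host_size_t i = 0; i < total_physical_device_count; ++i) {
//     if (device_affinity & (1ull << i)) {
//       compact_pools[pool_index++] = all_block_pools[i];
//     }
//   }
//   options.device_block_pools = compact_pools;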

// Initializes |out_options| to its default values.
void iree_hal_amdgpu_command_buffer_options_initialize(
    iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode,
    iree_hal_command_category_t command_categories,
    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
    iree_hal_amdgpu_command_buffer_options_t* out_options);

// Verifies command buffer options to ensure they meet the requirements of the
// devices the command buffer will be scheduled on.
iree_status_t iree_hal_amdgpu_command_buffer_options_verify(
    const iree_hal_amdgpu_command_buffer_options_t* options);

//===----------------------------------------------------------------------===//
// iree_hal_amdgpu_command_buffer_t
//===----------------------------------------------------------------------===//

// Creates an AMDGPU command buffer with the given |options| controlling how
// it is recorded and prepared for execution.
//
// Referenced data structures in the options such as block pools must remain
// live for the lifetime of the command buffer.
iree_status_t iree_hal_amdgpu_command_buffer_create(
    const iree_hal_amdgpu_command_buffer_options_t* options,
    iree_allocator_t host_allocator,
    iree_hal_command_buffer_t** out_command_buffer);
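
// Example (illustrative sketch): a typical recording setup. The my_* values
// below are hypothetical placeholders supplied by the caller, not defaults
// provided by the initialize call:
//   iree_hal_amdgpu_command_buffer_options_t options;
//   iree_hal_amdgpu_command_buffer_options_initialize(
//       device_allocator, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
//       IREE_HAL_COMMAND_CATEGORY_ANY, queue_affinity, /*binding_capacity=*/0,
//       &options);
//   options.host_block_pools = &my_host_block_pools;
//   options.device_affinity = my_device_affinity;
//   options.device_block_pools = my_device_block_pools;
//   IREE_RETURN_IF_ERROR(
//       iree_hal_amdgpu_command_buffer_options_verify(&options));
//   iree_hal_command_buffer_t* command_buffer = NULL;
//   IREE_RETURN_IF_ERROR(iree_hal_amdgpu_command_buffer_create(
//       &options, host_allocator, &command_buffer));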

// Returns true if |command_buffer| is an AMDGPU command buffer.
bool iree_hal_amdgpu_command_buffer_isa(
    iree_hal_command_buffer_t* command_buffer);

// Queries the device-side command buffer representation for the GPU device
// agent with |device_ordinal| in the system topology.
// |out_max_kernarg_capacity| is set to the minimum kernarg reservation
// required to execute any block in the command buffer.
iree_status_t iree_hal_amdgpu_command_buffer_query_execution_state(
    iree_hal_command_buffer_t* command_buffer, iree_host_size_t device_ordinal,
    IREE_AMDGPU_DEVICE_PTR iree_hal_amdgpu_device_command_buffer_t**
        out_device_command_buffer,
    iree_host_size_t* out_max_kernarg_capacity);
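
// Example (illustrative sketch): resolving the device-side representation for
// one recorded physical device; choosing the ordinal is the caller's concern:
//   IREE_AMDGPU_DEVICE_PTR iree_hal_amdgpu_device_command_buffer_t*
//       device_command_buffer = NULL;
//   iree_host_size_t max_kernarg_capacity = 0;
//   IREE_RETURN_IF_ERROR(iree_hal_amdgpu_command_buffer_query_execution_state(
//       command_buffer, /*device_ordinal=*/0, &device_command_buffer,
//       &max_kernarg_capacity));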

#endif  // IREE_HAL_DRIVERS_AMDGPU_COMMAND_BUFFER_H_