// Copyright 2025 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/hal/drivers/amdgpu/allocator.h"

#include "iree/hal/drivers/amdgpu/buffer.h"
#include "iree/hal/drivers/amdgpu/util/topology.h"

//===----------------------------------------------------------------------===//
// iree_hal_amdgpu_allocator_t
//===----------------------------------------------------------------------===//

// TODO(benvanik): use one ID per address space or pool - each shows as a
// different track in tracing tools.
#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING
static const char* IREE_HAL_AMDGPU_ALLOCATOR_ID = "AMDGPU unpooled";
#endif  // IREE_TRACING_FEATURE_ALLOCATION_TRACKING

typedef struct iree_hal_amdgpu_allocator_t {
  iree_hal_resource_t resource;
  iree_allocator_t host_allocator;

  // Unowned libhsa handle. Must be retained by the owner.
  const iree_hal_amdgpu_libhsa_t* libhsa;
  // Topology with all CPU and GPU agents.
  const iree_hal_amdgpu_topology_t* topology;

  IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;)
} iree_hal_amdgpu_allocator_t;

static const iree_hal_allocator_vtable_t iree_hal_amdgpu_allocator_vtable;

static iree_hal_amdgpu_allocator_t* iree_hal_amdgpu_allocator_cast(
    iree_hal_allocator_t* base_value) {
  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_amdgpu_allocator_vtable);
  return (iree_hal_amdgpu_allocator_t*)base_value;
}

iree_status_t iree_hal_amdgpu_allocator_create(
    const iree_hal_amdgpu_libhsa_t* libhsa,
    const iree_hal_amdgpu_topology_t* topology,
    iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator) {
  IREE_ASSERT_ARGUMENT(libhsa);
  IREE_ASSERT_ARGUMENT(topology);
  IREE_ASSERT_ARGUMENT(out_allocator);
  IREE_TRACE_ZONE_BEGIN(z0);

  iree_hal_amdgpu_allocator_t* allocator = NULL;
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_allocator_malloc(host_allocator, sizeof(*allocator),
                                (void**)&allocator));
  iree_hal_resource_initialize(&iree_hal_amdgpu_allocator_vtable,
                               &allocator->resource);
  allocator->host_allocator = host_allocator;
  allocator->libhsa = libhsa;
  allocator->topology = topology;
  // TODO(benvanik): query device heaps, supported features (concurrent
  // access/etc), and prepare any pools that will be used during allocation.
  // It's expected that most failures that occur after creation are allocation
  // request-specific, so preparing here will help keep the errors more
  // localized. An illustrative sketch of what this could look like follows
  // this function.
  iree_status_t status = iree_ok_status();

  if (iree_status_is_ok(status)) {
    *out_allocator = (iree_hal_allocator_t*)allocator;
  } else {
    iree_hal_allocator_release((iree_hal_allocator_t*)allocator);
  }
  IREE_TRACE_ZONE_END(z0);
  return status;
}
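
// Illustrative sketch (not compiled): one way the create-time pool discovery
// described in the TODO above could look, using the HSA AMD extension to
// find a coarse-grained GLOBAL memory pool on each GPU agent. The hsa_amd_*
// calls are real ROCR entry points but the actual driver would route them
// through the retained |libhsa| table, and the |device_pool| field referenced
// below is an assumed addition to iree_hal_amdgpu_allocator_t, not part of
// this skeleton.
#if 0
static hsa_status_t iree_hal_amdgpu_allocator_find_coarse_pool(
    hsa_amd_memory_pool_t pool, void* user_data) {
  hsa_amd_segment_t segment = (hsa_amd_segment_t)0;
  hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
                               &segment);
  if (segment != HSA_AMD_SEGMENT_GLOBAL) return HSA_STATUS_SUCCESS;
  uint32_t global_flags = 0;
  hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS,
                               &global_flags);
  if (global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
    *(hsa_amd_memory_pool_t*)user_data = pool;  // keep the first match
  }
  return HSA_STATUS_SUCCESS;
}
// In iree_hal_amdgpu_allocator_create, per GPU agent in the topology:
//   hsa_amd_agent_iterate_memory_pools(
//       gpu_agent, iree_hal_amdgpu_allocator_find_coarse_pool,
//       &allocator->device_pool);
#endif  // 0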

static void iree_hal_amdgpu_allocator_destroy(
    iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
  IREE_ASSERT_ARGUMENT(base_allocator);
  iree_hal_amdgpu_allocator_t* allocator =
      iree_hal_amdgpu_allocator_cast(base_allocator);
  IREE_TRACE_ZONE_BEGIN(z0);

  iree_allocator_free(allocator->host_allocator, allocator);

  IREE_TRACE_ZONE_END(z0);
}

static iree_allocator_t iree_hal_amdgpu_allocator_host_allocator(
    const iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
  iree_hal_amdgpu_allocator_t* allocator =
      (iree_hal_amdgpu_allocator_t*)base_allocator;
  return allocator->host_allocator;
}

static iree_status_t iree_hal_amdgpu_allocator_trim(
    iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
  iree_hal_amdgpu_allocator_t* allocator =
      (iree_hal_amdgpu_allocator_t*)base_allocator;

  // TODO(benvanik): if the allocator is retaining any unused resources they
  // should be dropped here. If the underlying implementation has pools or
  // caches it should be notified that a trim is requested. This is called in
  // low-memory situations or when IREE is not going to be used for a while
  // (low power modes or suspension).
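
  // Illustrative sketch (not compiled): if the allocator retained a cache of
  // idle device blocks (the |block_cache| field below is hypothetical, not
  // part of this skeleton), trim would be the place to release them back to
  // the system via hsa_amd_memory_pool_free.
#if 0
  for (iree_host_size_t i = 0; i < allocator->block_cache.count; ++i) {
    hsa_amd_memory_pool_free(allocator->block_cache.blocks[i]);
  }
  allocator->block_cache.count = 0;
#endif  // 0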
  (void)allocator;

  return iree_ok_status();
}

static void iree_hal_amdgpu_allocator_query_statistics(
    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
    iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics) {
  IREE_STATISTICS({
    iree_hal_amdgpu_allocator_t* allocator =
        iree_hal_amdgpu_allocator_cast(base_allocator);
    memcpy(out_statistics, &allocator->statistics, sizeof(*out_statistics));
    // TODO(benvanik): update statistics (merge).
  });
}

static iree_status_t iree_hal_amdgpu_allocator_query_memory_heaps(
    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
    iree_host_size_t capacity,
    iree_hal_allocator_memory_heap_t* IREE_RESTRICT heaps,
    iree_host_size_t* IREE_RESTRICT out_count) {
  iree_hal_amdgpu_allocator_t* allocator =
      iree_hal_amdgpu_allocator_cast(base_allocator);

  // TODO(benvanik): return heap information. This is called at least once
  // with a capacity that may be 0 (indicating a query for the total count)
  // and the heaps should only be populated if capacity is sufficient to
  // store all of them.
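
  // Illustrative sketch (not compiled): the count/capacity protocol described
  // above, reporting a single device-local heap. The heap field names follow
  // iree_hal_allocator_memory_heap_t as used by other HAL drivers, the
  // placeholder limits/alignments are assumptions, and returning OUT_OF_RANGE
  // for an insufficient capacity mirrors the convention of other HAL queries.
#if 0
  const iree_host_size_t count = 1;
  if (capacity >= count) {
    heaps[0] = (iree_hal_allocator_memory_heap_t){
        .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
        .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER |
                         IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE,
        .max_allocation_size = ~(iree_device_size_t)0,
        .min_buffer_offset_alignment = 64,
        .max_buffer_range = ~(iree_device_size_t)0,
        .min_buffer_range_alignment = 64,
    };
  }
  *out_count = count;
  return capacity >= count
             ? iree_ok_status()
             : iree_status_from_code(IREE_STATUS_OUT_OF_RANGE);
#endif  // 0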
  (void)allocator;
  iree_status_t status =
      iree_make_status(IREE_STATUS_UNIMPLEMENTED, "heap query not implemented");

  return status;
}

static iree_hal_buffer_compatibility_t
iree_hal_amdgpu_allocator_query_buffer_compatibility(
    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
    iree_hal_buffer_params_t* IREE_RESTRICT params,
    iree_device_size_t* IREE_RESTRICT allocation_size) {
  iree_hal_amdgpu_allocator_t* allocator =
      iree_hal_amdgpu_allocator_cast(base_allocator);

  // TODO(benvanik): set compatibility rules based on the implementation.
  // Note that the user may have requested that the allocator place the
  // allocation based on whatever is optimal for the indicated usage by
  // including the IREE_HAL_MEMORY_TYPE_OPTIMAL flag. It's still required
  // that the implementation meet all of the requirements, but it is free to
  // place the allocation in either host or device memory so long as the
  // appropriate bits are updated to indicate where it landed.
  (void)allocator;
  iree_hal_buffer_compatibility_t compatibility =
      IREE_HAL_BUFFER_COMPATIBILITY_NONE;
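
  // Illustrative sketch (not compiled): a typical, implementation-specific
  // policy. Allocation and queue transfer are always allowed, dispatch usage
  // adds queue dispatch compatibility, and OPTIMAL requests are steered to
  // device-local memory. This is an example policy, not the actual rules for
  // this driver.
#if 0
  compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE |
                   IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER;
  if (iree_all_bits_set(params->usage,
                        IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE)) {
    compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH;
  }
  if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_OPTIMAL)) {
    params->type |= IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL;
  }
#endif  // 0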

  // We are now optimal.
  params->type &= ~IREE_HAL_MEMORY_TYPE_OPTIMAL;
  // Guard against the corner case where the requested buffer size is 0. The
  // application is unlikely to do anything useful with a 0-byte buffer, but
  // it can happen in real-world use cases, so at least don't crash.
  if (*allocation_size == 0) *allocation_size = 4;

  return compatibility;
}

static iree_status_t iree_hal_amdgpu_allocator_allocate_buffer(
    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
    const iree_hal_buffer_params_t* IREE_RESTRICT params,
    iree_device_size_t allocation_size,
    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
  iree_hal_amdgpu_allocator_t* allocator =
      iree_hal_amdgpu_allocator_cast(base_allocator);

  // Coerce options into those required by the current device.
  iree_hal_buffer_params_t compat_params = *params;
  iree_hal_buffer_compatibility_t compatibility =
      iree_hal_amdgpu_allocator_query_buffer_compatibility(
          base_allocator, &compat_params, &allocation_size);
  if (!iree_all_bits_set(compatibility,
                         IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) {
    // TODO(benvanik): make a helper for this.
#if IREE_STATUS_MODE
    iree_bitfield_string_temp_t temp0, temp1, temp2;
    iree_string_view_t memory_type_str =
        iree_hal_memory_type_format(params->type, &temp0);
    iree_string_view_t usage_str =
        iree_hal_buffer_usage_format(params->usage, &temp1);
    iree_string_view_t compatibility_str =
        iree_hal_buffer_compatibility_format(compatibility, &temp2);
    return iree_make_status(
        IREE_STATUS_INVALID_ARGUMENT,
        "allocator cannot allocate a buffer with the given parameters; "
        "memory_type=%.*s, usage=%.*s, compatibility=%.*s",
        (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size,
        usage_str.data, (int)compatibility_str.size, compatibility_str.data);
#else
    return iree_make_status(
        IREE_STATUS_INVALID_ARGUMENT,
        "allocator cannot allocate a buffer with the given parameters");
#endif  // IREE_STATUS_MODE
  }

  // TODO(benvanik): allocate the underlying device memory. The impl_ptr is
  // just used for accounting and can be an opaque value (handle/etc) so long
  // as it is consistent between the alloc and free and unique to the buffer
  // while it is live. An example iree_hal_amdgpu_external_buffer_wrap is
  // provided that can be used for implementations that are managing memory
  // using underlying allocators and just wrapping those device pointers in
  // the HAL buffer type. Other implementations that require more tracking
  // can provide their own buffer types that do such tracking for them.
  (void)allocator;
  void* impl_ptr = NULL;
  (void)impl_ptr;
  iree_hal_buffer_t* buffer = NULL;
  iree_status_t status = iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                                          "buffer allocation not implemented");
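
  // Illustrative sketch (not compiled): a minimal allocation path that would
  // replace the UNIMPLEMENTED status above. It assumes a |device_pool| cached
  // at create time, assumes topology field names, and assumes a signature for
  // the iree_hal_amdgpu_external_buffer_wrap helper mentioned in the TODO;
  // the hsa_amd_* calls would really be routed through |libhsa|.
#if 0
  void* device_ptr = NULL;
  hsa_status_t pool_status = hsa_amd_memory_pool_allocate(
      allocator->device_pool, (size_t)allocation_size, /*flags=*/0,
      &device_ptr);
  if (pool_status == HSA_STATUS_SUCCESS) {
    // Grant access to all GPU agents before handing the pointer out so the
    // buffer is usable for multi-device dispatch and transfer.
    hsa_amd_agents_allow_access(allocator->topology->gpu_agent_count,
                                allocator->topology->gpu_agents,
                                /*flags=*/NULL, device_ptr);
    impl_ptr = device_ptr;
    status = iree_hal_amdgpu_external_buffer_wrap(
        &compat_params, allocation_size, device_ptr,
        iree_hal_buffer_release_callback_null(), allocator->host_allocator,
        &buffer);
  }
#endif  // 0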

  if (iree_status_is_ok(status)) {
    // TODO(benvanik): ensure this accounting is balanced in deallocate_buffer.
    IREE_TRACE_ALLOC_NAMED(IREE_HAL_AMDGPU_ALLOCATOR_ID, impl_ptr,
                           allocation_size);
    IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc(
        &allocator->statistics, compat_params.type, allocation_size));
    *out_buffer = buffer;
  } else {
    iree_hal_buffer_release(buffer);
  }
  return status;
}

static void iree_hal_amdgpu_allocator_deallocate_buffer(
    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
    iree_hal_buffer_t* IREE_RESTRICT base_buffer) {
  iree_hal_amdgpu_allocator_t* allocator =
      iree_hal_amdgpu_allocator_cast(base_allocator);

  // TODO(benvanik): free the underlying device memory here. Buffers allocated
  // from this allocator will call this method to handle cleanup. Note that
  // because this method is responsible for calling the base
  // iree_hal_buffer_destroy and the caller assumes the memory has been freed,
  // an implementation could pool the buffer handle and reuse it in the
  // future.
  (void)allocator;
  void* impl_ptr = NULL;
  (void)impl_ptr;
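
  // Illustrative sketch (not compiled): releasing memory obtained through the
  // allocation sketch above. The accessor used to recover the device pointer
  // from the wrapping buffer is hypothetical; hsa_amd_memory_pool_free is the
  // matching ROCR call.
#if 0
  impl_ptr = iree_hal_amdgpu_external_buffer_device_pointer(base_buffer);
  hsa_amd_memory_pool_free(impl_ptr);
#endif  // 0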

  // TODO(benvanik): if the buffer was imported then this accounting may need
  // to be conditional depending on the implementation.
  bool was_imported = false;
  if (!was_imported) {
    IREE_TRACE_FREE_NAMED(IREE_HAL_AMDGPU_ALLOCATOR_ID, impl_ptr);
    IREE_STATISTICS(iree_hal_allocator_statistics_record_free(
        &allocator->statistics, iree_hal_buffer_memory_type(base_buffer),
        iree_hal_buffer_allocation_size(base_buffer)));
  }

  iree_hal_buffer_destroy(base_buffer);
}

static iree_status_t iree_hal_amdgpu_allocator_import_buffer(
    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
    const iree_hal_buffer_params_t* IREE_RESTRICT params,
    iree_hal_external_buffer_t* IREE_RESTRICT external_buffer,
    iree_hal_buffer_release_callback_t release_callback,
    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
  iree_hal_amdgpu_allocator_t* allocator =
      iree_hal_amdgpu_allocator_cast(base_allocator);

  // Coerce options into those required by the current device.
  iree_hal_buffer_params_t compat_params = *params;
  iree_device_size_t allocation_size = external_buffer->size;
  iree_hal_buffer_compatibility_t compatibility =
      iree_hal_amdgpu_allocator_query_buffer_compatibility(
          base_allocator, &compat_params, &allocation_size);
  if (!iree_all_bits_set(compatibility,
                         IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE)) {
    // TODO(benvanik): make a helper for this.
#if IREE_STATUS_MODE
    iree_bitfield_string_temp_t temp0, temp1, temp2;
    iree_string_view_t memory_type_str =
        iree_hal_memory_type_format(params->type, &temp0);
    iree_string_view_t usage_str =
        iree_hal_buffer_usage_format(params->usage, &temp1);
    iree_string_view_t compatibility_str =
        iree_hal_buffer_compatibility_format(compatibility, &temp2);
    return iree_make_status(
        IREE_STATUS_INVALID_ARGUMENT,
        "allocator cannot import a buffer with the given parameters; "
        "memory_type=%.*s, usage=%.*s, compatibility=%.*s",
        (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size,
        usage_str.data, (int)compatibility_str.size, compatibility_str.data);
#else
    return iree_make_status(
        IREE_STATUS_INVALID_ARGUMENT,
        "allocator cannot import a buffer with the given parameters");
#endif  // IREE_STATUS_MODE
  }

  // TODO(benvanik): switch on external_buffer->type and import the buffer.
  // See the headers for more information on semantics. Most implementations
  // can service IREE_HAL_EXTERNAL_BUFFER_TYPE_DEVICE_ALLOCATION by just
  // wrapping the underlying device pointer. Those that can service
  // IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION may be able to avoid a lot
  // of additional copies when moving data around between host and device or
  // across devices from different drivers.
  (void)allocator;
  iree_status_t status = iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                                          "external buffer type not supported");
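
  // Illustrative sketch (not compiled): servicing the device-allocation case
  // by wrapping the caller-provided pointer, replacing the UNIMPLEMENTED
  // status above. The handle field access follows iree_hal_external_buffer_t
  // in iree/hal/buffer.h as understood here, and the wrap helper signature is
  // the same assumption as in the allocation sketch.
#if 0
  switch (external_buffer->type) {
    case IREE_HAL_EXTERNAL_BUFFER_TYPE_DEVICE_ALLOCATION:
      status = iree_hal_amdgpu_external_buffer_wrap(
          &compat_params, allocation_size,
          (void*)(uintptr_t)external_buffer->handle.device_allocation.ptr,
          release_callback, allocator->host_allocator, out_buffer);
      break;
    default:
      // Host allocations would need pinning (e.g. hsa_amd_memory_lock)
      // before they are visible to the GPU agents.
      break;
  }
#endif  // 0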

  return status;
}

static iree_status_t iree_hal_amdgpu_allocator_export_buffer(
    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
    iree_hal_buffer_t* IREE_RESTRICT buffer,
    iree_hal_external_buffer_type_t requested_type,
    iree_hal_external_buffer_flags_t requested_flags,
    iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer) {
  iree_hal_amdgpu_allocator_t* allocator =
      iree_hal_amdgpu_allocator_cast(base_allocator);

  // TODO(benvanik): switch on requested_type and export as appropriate. Most
  // implementations can service
  // IREE_HAL_EXTERNAL_BUFFER_TYPE_DEVICE_ALLOCATION by just exposing the
  // underlying device pointer. Those that can service
  // IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION may be able to avoid a lot
  // of additional copies when moving data around between host and device or
  // across devices from different drivers.
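
  // Illustrative sketch (not compiled): exporting the wrapped device pointer.
  // The accessor used to recover the pointer is the same hypothetical helper
  // as in the deallocation sketch above.
#if 0
  if (requested_type == IREE_HAL_EXTERNAL_BUFFER_TYPE_DEVICE_ALLOCATION) {
    memset(out_external_buffer, 0, sizeof(*out_external_buffer));
    out_external_buffer->type =
        IREE_HAL_EXTERNAL_BUFFER_TYPE_DEVICE_ALLOCATION;
    out_external_buffer->flags = requested_flags;
    out_external_buffer->size = iree_hal_buffer_allocation_size(buffer);
    out_external_buffer->handle.device_allocation.ptr = (uint64_t)(uintptr_t)
        iree_hal_amdgpu_external_buffer_device_pointer(buffer);
    return iree_ok_status();
  }
#endif  // 0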
  (void)allocator;
  return iree_make_status(IREE_STATUS_UNAVAILABLE,
                          "external buffer type not supported");
}

static const iree_hal_allocator_vtable_t iree_hal_amdgpu_allocator_vtable = {
    .destroy = iree_hal_amdgpu_allocator_destroy,
    .host_allocator = iree_hal_amdgpu_allocator_host_allocator,
    .trim = iree_hal_amdgpu_allocator_trim,
    .query_statistics = iree_hal_amdgpu_allocator_query_statistics,
    .query_memory_heaps = iree_hal_amdgpu_allocator_query_memory_heaps,
    .query_buffer_compatibility =
        iree_hal_amdgpu_allocator_query_buffer_compatibility,
    .allocate_buffer = iree_hal_amdgpu_allocator_allocate_buffer,
    .deallocate_buffer = iree_hal_amdgpu_allocator_deallocate_buffer,
    .import_buffer = iree_hal_amdgpu_allocator_import_buffer,
    .export_buffer = iree_hal_amdgpu_allocator_export_buffer,
};