@@ -156,6 +156,7 @@ struct vk_device_struct {
156156 vk::PhysicalDeviceProperties properties;
157157 std::string name;
158158 uint64_t max_memory_allocation_size;
159+ uint64_t suballocation_block_size;
159160 bool fp16;
160161 bool pipeline_robustness;
161162 vk::Device device;
@@ -2269,6 +2270,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
22692270
22702271 device->physical_device .getProperties2 (&props2);
22712272 device->properties = props2.properties ;
2273+ device->vendor_id = device->properties .vendorID ;
22722274
22732275 const char * GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv (" GGML_VK_FORCE_MAX_ALLOCATION_SIZE" );
22742276
@@ -2280,7 +2282,20 @@ static vk_device ggml_vk_get_device(size_t idx) {
22802282 device->max_memory_allocation_size = props3.maxMemoryAllocationSize ;
22812283 }
22822284
2283- device->vendor_id = device->properties .vendorID ;
2285+ const char * GGML_VK_SUBALLOCATION_BLOCK_SIZE = getenv (" GGML_VK_SUBALLOCATION_BLOCK_SIZE" );
2286+
2287+ if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr ) {
2288+ device->suballocation_block_size = std::stoul (GGML_VK_SUBALLOCATION_BLOCK_SIZE);
2289+ #if defined(_WIN32)
2290+ } else if (device->vendor_id == VK_VENDOR_ID_NVIDIA) {
2291+ // Limit batching of allocations to 1GB by default to avoid fragmentation issues
2292+ device->suballocation_block_size = 1024 *1024 *1024 ;
2293+ #endif
2294+ } else {
2295+ device->suballocation_block_size = device->max_memory_allocation_size ;
2296+ }
2297+ device->suballocation_block_size = std::min (device->suballocation_block_size , device->max_memory_allocation_size );
2298+
22842299 device->subgroup_size = subgroup_props.subgroupSize ;
22852300 device->uma = device->properties .deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
22862301 if (sm_builtins) {
@@ -7561,7 +7576,7 @@ static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type
75617576
// Returns the size limit reported for this Vulkan buffer type.
//
// buft: buffer type whose `context` is cast to ggml_backend_vk_buffer_type_context.
//
// This change switches the reported limit from the device's
// max_memory_allocation_size (removed line) to its suballocation_block_size
// (added line). The latter is set in ggml_vk_get_device from the
// GGML_VK_SUBALLOCATION_BLOCK_SIZE environment variable when present, defaults
// to 1 GiB for NVIDIA devices on Windows, and otherwise equals
// max_memory_allocation_size; it is then clamped with std::min so it does not
// exceed max_memory_allocation_size.
//
// NOTE(review): the environment override is parsed with std::stoul; where
// unsigned long is 32 bits wide (e.g. Windows), values >= 4 GiB would throw
// std::out_of_range — consider std::stoull to match the uint64_t field.
75627577static size_t ggml_backend_vk_buffer_type_get_max_size (ggml_backend_buffer_type_t buft) {
// The buffer type's opaque context carries the owning device.
75637578 ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context ;
7564- return ctx->device ->max_memory_allocation_size ;
7579+ return ctx->device ->suballocation_block_size ;
75657580}
75667581
75677582static size_t ggml_backend_vk_buffer_type_get_alloc_size (ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
0 commit comments