Improve suballocator efficiency, add efficiency metrics

per-mathisen-arm · per-mathisen-arm · commit a00c53bc5367 · 2025-12-13T16:51:32.000+01:00
diff --git a/src/memory.cpp b/src/memory.cpp
@@ -81,6 +81,7 @@ memory_requirements get_trackedimage_memory_requirements(VkDevice device, const
 	wrap_vkGetDeviceImageMemoryRequirements(device, &info, &req);
 	reqs.requirements = req.memoryRequirements;
 	reqs.memory_flags = data.memory_flags;
+	assert(reqs.requirements.alignment != 0);
 	return reqs;
 }
 
diff --git a/src/read.cpp b/src/read.cpp
@@ -132,9 +132,16 @@ void lava_reader::finalize(bool terminate)
 	assert(stop_process_cpu_usage.tv_sec >= process_cpu_usage.tv_sec);
 	const uint64_t process_time = diff_timespec(&stop_process_cpu_usage, &process_cpu_usage);
 	ILOG("CPU time spent in ms - readhead workers %lu, API runners %lu, full process %lu", (long unsigned)worker, (long unsigned)runner, (long unsigned)process_time);
-	out["readahead_workers_time"] = (Json::Value::UInt64)worker;
-	out["api_runners_time"] = (Json::Value::UInt64)runner;
-	out["process_time"] = (Json::Value::UInt64)process_time;
+	out["readahead_workers_time"] = worker;
+	out["api_runners_time"] = runner;
+	out["process_time"] = process_time;
+	suballoc_metrics sm = allocator.performance();
+	out["suballocator_used"] = sm.used;
+	out["suballocator_allocated"] = sm.allocated;
+	out["suballocator_heaps"] = sm.heaps;
+	out["suballocator_objects"] = sm.objects;
+	out["suballocator_efficiency"] = sm.efficiency;
+	ILOG("Suballocator used=%lu allocated=%lu heaps=%u objects=%u efficiency=%g", (unsigned long)sm.used, (unsigned long)sm.allocated, (unsigned)sm.heaps, (unsigned)sm.objects, sm.efficiency);
 	if (terminate)
 	{
 		for (auto& v : *thread_call_numbers) v = 0; // stop waiting threads from progressing
diff --git a/src/suballocator.cpp b/src/suballocator.cpp
@@ -40,6 +40,7 @@ struct heap
 	VkDeviceMemory mem;
 	VkDeviceSize free;
 	VkDeviceSize total;
+	VkMemoryPropertyFlags flags;
 	/// This one does not need to be concurrent safe, since each thread owns its own heap
 	/// and only it may iterate over and modify the allocations list.
 	std::list<suballocation> subs;
@@ -76,6 +77,10 @@ struct suballocator_private
 	std::vector<lookup> tensor_lookup;
 	/// Does this device have the an annoying optimal-to-linear padding requirement? If so, put optimal and linear objects in different memory heaps
 	bool allow_mixed_tiling = true;
+	std::atomic_uint64_t used_bytes { 0 };
+	std::atomic_uint_least32_t used_count { 0 };
+	std::atomic_uint64_t allocated_bytes { 0 };
+	std::atomic_uint_least32_t allocated_heaps { 0 };
 
 	void print_memory_usage();
 	uint32_t get_device_memory_type(uint32_t type_filter, VkMemoryPropertyFlags properties);
@@ -87,6 +92,7 @@ struct suballocator_private
 		lava_tiling tiling, bool dedicated, VkMemoryAllocateFlags allocflags);
 	void self_test();
 	void bind(heap& h, const suballocation& s);
+	suballoc_metrics performance() const;
 
 	inline bool needs_flush(unsigned memoryTypeIndex) { return !(memory_properties.memoryTypes[memoryTypeIndex].propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); }
 };
@@ -102,6 +108,22 @@ static VkMemoryPropertyFlags prune_memory_flags(VkMemoryPropertyFlags flags)
 	return flags;
 }
 
+suballoc_metrics suballocator::performance() const // public entry point; forwards to suballocator_private::performance()
+{
+	return priv->performance();
+}
+
+suballoc_metrics suballocator_private::performance() const // copy the usage counters into a plain snapshot struct
+{
+	suballoc_metrics m;
+	m.used = used_bytes; // bytes requested through add_object()
+	m.objects = used_count; // number of add_object() calls
+	m.heaps = allocated_heaps; // number of heaps created by add_object_new()
+	m.allocated = allocated_bytes; // bytes of heap memory allocated by add_object_new()
+	m.efficiency = (m.allocated != 0) ? (double)m.used / (double)m.allocated : 0.0; // use the snapshot values; 0/0 would be NaN before any heap exists
+	return m;
+}
+
 void suballocator_private::print_memory_usage()
 {
 	printf("Suballocator memory usage:\n");
@@ -244,11 +266,14 @@ suballoc_location suballocator_private::add_object_new(VkDevice device, uint16_t
 	{
 		h.mem = (VkDeviceMemory)malloc(info.allocationSize);
 	}
+	allocated_bytes += info.allocationSize;
+	allocated_heaps++;
 	h.free = info.allocationSize - s.size;
 	h.total = info.allocationSize;
 	h.memoryTypeIndex = memoryTypeIndex;
 	h.tiling = tiling;
 	h.subs.push_back(s);
+	h.flags = flags;
 	DLOG2("allocating new memory pool with size = %lu, free = %lu (memoryTypeIndex=%u, tiling=%u)", (unsigned long)info.allocationSize,
 	      (unsigned long)h.free, (unsigned)memoryTypeIndex, (unsigned)tiling);
 	auto it = heaps.push_back(h);
@@ -260,13 +285,15 @@ suballoc_location suballocator_private::add_object_new(VkDevice device, uint16_t
 suballoc_location suballocator_private::add_object(VkDevice device, uint16_t tid, uint32_t memoryTypeIndex, suballocation &s, VkMemoryPropertyFlags flags,
 	lava_tiling tiling, bool dedicated, VkMemoryAllocateFlags allocflags)
 {
+	used_count++;
+	used_bytes += s.size;
+	assert(s.alignment != 0);
 	if (dedicated)
 	{
 		return add_object_new(device, tid, memoryTypeIndex, s, flags, tiling, dedicated, allocflags);
 	}
 	for (heap& h : heaps)
 	{
-		VkMemoryPropertyFlags f = memory_properties.memoryTypes[h.memoryTypeIndex].propertyFlags;
 		// this is a safe time to actually delete things
 		if (!h.deletes.empty())
 		{
@@ -287,7 +314,7 @@ suballoc_location suballocator_private::add_object(VkDevice device, uint16_t tid
 			h.deletes.clear();
 		}
 		// find suballocation
-		if (h.tid == tid && (flags & f) == flags && h.free >= s.size && h.memoryTypeIndex == memoryTypeIndex && (h.tiling == tiling || allow_mixed_tiling))
+		if (h.tid == tid && (flags & h.flags) == flags && h.free >= s.size && h.memoryTypeIndex == memoryTypeIndex && (h.tiling == tiling || allow_mixed_tiling))
 		{
 			// First case: nothing allocated in heap. In this case, we do not care about alignment, because according to the spec:
 			// "Allocations returned by vkAllocateMemory are guaranteed to meet any alignment requirement of the implementation."
@@ -398,6 +425,7 @@ suballoc_location suballocator::add_image(uint16_t tid, VkDevice device, VkImage
 
 suballoc_location suballocator::add_trackedobject(uint16_t tid, VkDevice device, const memory_requirements& reqs, uint64_t native, const trackedobject& data)
 {
+	assert(reqs.requirements.alignment != 0); // not properly initialized!
 	const VkMemoryPropertyFlags memory_flags = prune_memory_flags(data.memory_flags);
 	const uint32_t memoryTypeIndex = priv->get_device_memory_type(reqs.requirements.memoryTypeBits, memory_flags);
 	suballocation s;
diff --git a/src/suballocator.h b/src/suballocator.h
@@ -14,11 +14,23 @@ struct suballoc_location
 
 struct suballocator_private;
 
+struct suballoc_metrics // snapshot of the suballocator usage counters, as returned by performance()
+{
+	uint64_t used = 0; ///< sum in bytes of the sizes of all objects added to the suballocator
+	uint64_t allocated = 0; ///< sum in bytes of the sizes of all memory heaps allocated
+	uint32_t heaps = 0; ///< number of memory heaps allocated
+	uint32_t objects = 0; ///< number of objects added to the suballocator
+	double efficiency = 0.0; ///< ratio of used to allocated
+};
+
 struct suballocator
 {
 	/// Call as early as possible to set up internal data structures. Must be called before any other suballoc function.
 	void init(int num_images, int num_buffers, int tensors, int heap_size = -1, bool fake = false);
 
+	/// Get performance metrics for the suballocator. This is not thread safe.
+	suballoc_metrics performance() const;
+
 	suballocator();
 	~suballocator();
 

Original file line number	Diff line number	Diff line change
`@@ -81,6 +81,7 @@ memory_requirements get_trackedimage_memory_requirements(VkDevice device, const`
`81`	`81`	`wrap_vkGetDeviceImageMemoryRequirements(device, &info, &req);`
`82`	`82`	`reqs.requirements = req.memoryRequirements;`
`83`	`83`	`reqs.memory_flags = data.memory_flags;`
	`84`	`+ assert(reqs.requirements.alignment != 0);`
`84`	`85`	`return reqs;`
`85`	`86`	`}`
`86`	`87`