diff --git a/soc/nordic/common/Kconfig b/soc/nordic/common/Kconfig
index 782d9452b67..e1fcd713c77 100644
--- a/soc/nordic/common/Kconfig
+++ b/soc/nordic/common/Kconfig
@@ -48,5 +48,22 @@ source "subsys/logging/Kconfig.template.log_config"
 
 endif # MRAM_LATENCY
 
+if HAS_NORDIC_DMM
+
+config DMM_HEAP_CHUNKS
+	int "Number of chunks in the DMM heap"
+	default 32
+	help
+	  DMM uses a simplified heap that hands out buffers as runs of
+	  contiguous chunks tracked in 32-bit masks. If many small buffers
+	  are in use at the same time, an allocation may fail. The number
+	  of chunks is a trade-off between performance and granularity.
+	  Must be a multiple of 32.
+
+config DMM_STATS
+	bool "Usage statistics"
+
+endif # HAS_NORDIC_DMM
+
 rsource "vpr/Kconfig"
 rsource "uicr/Kconfig"
diff --git a/soc/nordic/common/dmm.c b/soc/nordic/common/dmm.c
index 0b4e42f8c6d..ac22f8ee430 100644
--- a/soc/nordic/common/dmm.c
+++ b/soc/nordic/common/dmm.c
@@ -6,7 +6,7 @@
 #include
 #include
 #include
-#include <zephyr/sys/sys_heap.h>
+#include <zephyr/sys/bitarray.h>
 #include
 #include "dmm.h"
@@ -26,6 +26,9 @@
 	.dt_align = DMM_REG_ALIGN_SIZE(node_id), \
 	.dt_allc = &_BUILD_LINKER_END_VAR(node_id)},
 
+#define HEAP_NUM_WORDS (CONFIG_DMM_HEAP_CHUNKS / 32)
+BUILD_ASSERT(IS_ALIGNED(CONFIG_DMM_HEAP_CHUNKS, 32));
+
 /* Generate declarations of linker variables used to determine size of preallocated variables
  * stored in memory sections spanning over memory regions.
  * These are used to determine memory left for dynamic bounce buffer allocator to work with.
@@ -42,9 +45,18 @@ struct dmm_region {
 };
 
 struct dmm_heap {
-	struct sys_heap heap;
+	uint32_t mask[HEAP_NUM_WORDS];
+	atomic_t tail_mask[HEAP_NUM_WORDS];
+	uintptr_t ptr;
+	uintptr_t ptr_end;
+	size_t blk_size;
 	const struct dmm_region *region;
+	sys_bitarray_t bitarray;
+#ifdef CONFIG_DMM_STATS
+	atomic_t curr_use;
+	uint32_t max_use;
 	struct k_spinlock lock;
+#endif
 };
 
 static const struct dmm_region dmm_regions[] = {
@@ -55,7 +67,6 @@ struct {
 	struct dmm_heap dmm_heaps[ARRAY_SIZE(dmm_regions)];
 } dmm_heaps_data;
 
-
 static struct dmm_heap *dmm_heap_find(void *region)
 {
 	struct dmm_heap *dh;
@@ -103,37 +114,154 @@ static bool is_user_buffer_correctly_preallocated(void const *user_buffer, size_
 	return false;
 }
 
-static size_t dmm_heap_start_get(struct dmm_heap *dh)
+/* Function updates the tail bits mask after an allocation. Tail bits are all bits of
+ * the allocated run except the head. The tail bits mask, together with the known index
+ * of the first chunk (the free operation knows the buffer address), makes it possible
+ * to determine the buffer size (how many chunks it spans). Because tail_mask is updated
+ * after the allocation, the bits representing the allocated buffer can be modified
+ * safely; atomics are needed only because other bits of the same word may change.
+ */
+static void tail_mask_set(atomic_t *tail_mask, size_t num_bits, size_t off)
 {
-	return ROUND_UP(dh->region->dt_allc, dh->region->dt_align);
+	size_t tail_bits = num_bits - 1;
+	size_t tail_off = off + 1;
+
+	if (tail_bits == 0) {
+		return;
+	}
+
+	if (HEAP_NUM_WORDS == 1) {
+		atomic_or(tail_mask, BIT_MASK(tail_bits) << tail_off);
+		return;
+	}
+
+	size_t idx = tail_off / 32;
+	atomic_t *t_mask = &tail_mask[idx];
+
+	tail_off = tail_off % 32;
+	while (tail_bits > 0) {
+		uint32_t bits = MIN(32 - tail_off, tail_bits);
+		uint32_t mask = (bits == 32) ? UINT32_MAX : (BIT_MASK(bits) << tail_off);
+
+		atomic_or(t_mask, mask);
+		t_mask++;
+		tail_off = 0;
+		tail_bits -= bits;
+	}
 }
 
-static size_t dmm_heap_size_get(struct dmm_heap *dh)
+/* Function determines how many chunks were used for the allocated buffer. It is
+ * determined from the tail bits mask and the index of the starting chunk (@p off).
+ * Function is called before bits are freed in the bitarray so we can safely modify
+ * bits that belong to that buffer.
+ *
+ * @param tail_mask Pointer to the tail_mask array.
+ * @param off Index of the start of the buffer.
+ *
+ * @return Number of chunks that form the buffer that will be freed.
+ */
+static uint32_t num_bits_get(atomic_t *tail_mask, size_t off)
 {
-	return (dh->region->dt_size - (dmm_heap_start_get(dh) - dh->region->dt_addr));
+	uint32_t num_bits = 1;
+	size_t tail_off = off + 1;
+	size_t idx = tail_off / 32;
+	atomic_t *t_mask = &tail_mask[idx];
+
+	tail_off = tail_off % 32;
+	do {
+		uint32_t mask = (uint32_t)*t_mask >> tail_off;
+
+		if (mask == UINT32_MAX) {
+			num_bits += 32;
+			atomic_set(t_mask, 0);
+		} else {
+			uint32_t bits = __builtin_ctz(~mask);
+
+			if (bits == 0) {
+				break;
+			}
+
+			num_bits += bits;
+			atomic_and(t_mask, ~(BIT_MASK(bits) << tail_off));
+
+			if (bits + tail_off < 32) {
+				break;
+			}
+
+			tail_off = 0;
+		}
+
+		t_mask++;
+	} while ((HEAP_NUM_WORDS > 1) && (t_mask != &tail_mask[HEAP_NUM_WORDS]));
+
+	return num_bits;
 }
 
 static void *dmm_buffer_alloc(struct dmm_heap *dh, size_t length)
 {
-	void *ret;
-	k_spinlock_key_t key;
+	size_t num_bits, off;
+	int rv;
+
+	if (dh->ptr == 0) {
+		/* Not initialized. */
+		return NULL;
+	}
 
 	length = ROUND_UP(length, dh->region->dt_align);
+	num_bits = DIV_ROUND_UP(length, dh->blk_size);
+
+	rv = sys_bitarray_alloc(&dh->bitarray, num_bits, &off);
+	if (rv < 0) {
+		return NULL;
+	}
+
+	tail_mask_set(dh->tail_mask, num_bits, off);
+
+#ifdef CONFIG_DMM_STATS
+	k_spinlock_key_t key;
 
 	key = k_spin_lock(&dh->lock);
-	ret = sys_heap_aligned_alloc(&dh->heap, dh->region->dt_align, length);
+	dh->curr_use += num_bits;
+	dh->max_use = MAX(dh->max_use, dh->curr_use);
 	k_spin_unlock(&dh->lock, key);
+#endif
 
-	return ret;
+	return (void *)(dh->ptr + dh->blk_size * off);
 }
 
 static void dmm_buffer_free(struct dmm_heap *dh, void *buffer)
 {
-	k_spinlock_key_t key;
+	size_t offset = ((uintptr_t)buffer - dh->ptr) / dh->blk_size;
+	size_t num_bits = num_bits_get(dh->tail_mask, offset);
+	int rv;
+
+#ifdef CONFIG_DMM_STATS
+	atomic_sub(&dh->curr_use, num_bits);
+#endif
+	rv = sys_bitarray_free(&dh->bitarray, num_bits, offset);
+	(void)rv;
+	__ASSERT_NO_MSG(rv == 0);
+}
 
-	key = k_spin_lock(&dh->lock);
-	sys_heap_free(&dh->heap, buffer);
-	k_spin_unlock(&dh->lock, key);
+static void dmm_memcpy(void *dst, const void *src, size_t len)
+{
+#define IS_ALIGNED32(x) IS_ALIGNED(x, sizeof(uint32_t))
+#define IS_ALIGNED64(x) IS_ALIGNED(x, sizeof(uint64_t))
+	if (IS_ALIGNED64(len) && IS_ALIGNED64(dst) && IS_ALIGNED64(src)) {
+		for (uint32_t i = 0; i < len / sizeof(uint64_t); i++) {
+			((uint64_t *)dst)[i] = ((uint64_t *)src)[i];
+		}
+		return;
+	}
+
+	if (IS_ALIGNED32(len) && IS_ALIGNED32(dst) && IS_ALIGNED32(src)) {
+		for (uint32_t i = 0; i < len / sizeof(uint32_t); i++) {
+			((uint32_t *)dst)[i] = ((uint32_t *)src)[i];
+		}
+		return;
+	}
+
+	memcpy(dst, src, len);
 }
 
 int dmm_buffer_out_prepare(void *region, void const *user_buffer, size_t user_length,
@@ -172,7 +300,7 @@ int dmm_buffer_out_prepare(void *region, void const *user_buffer, size_t user_le
 			return -ENOMEM;
 		}
 		/* - copy user buffer contents into allocated buffer */
-		memcpy(*buffer_out, user_buffer, user_length);
+		dmm_memcpy(*buffer_out, user_buffer, user_length);
 	}
 
 	/* Check if device memory region is cacheable
@@ -201,7 +329,7 @@ int dmm_buffer_out_release(void *region, void *buffer_out)
 	/* Check if output buffer is contained within memory area
 	 * managed by dynamic memory allocator
 	 */
-	if (is_buffer_within_region(addr, 0, dmm_heap_start_get(dh), dmm_heap_size_get(dh))) {
+	if (is_buffer_within_region(addr, 0, dh->ptr, dh->ptr_end)) {
 		/* If yes, free the buffer */
 		dmm_buffer_free(dh, buffer_out);
 	}
@@ -281,14 +409,14 @@ int dmm_buffer_in_release(void *region, void *user_buffer, size_t user_length, v
 	 * If no, copy allocated buffer to the user buffer
 	 */
 	if (buffer_in != user_buffer) {
-		memcpy(user_buffer, buffer_in, user_length);
+		dmm_memcpy(user_buffer, buffer_in, user_length);
 	}
 	/* If yes, no action is needed */
 
 	/* Check if input buffer is contained within memory area
 	 * managed by dynamic memory allocator
 	 */
-	if (is_buffer_within_region(addr, 0, dmm_heap_start_get(dh), dmm_heap_size_get(dh))) {
+	if (is_buffer_within_region(addr, user_length, dh->ptr, dh->ptr_end)) {
 		/* If yes, free the buffer */
 		dmm_buffer_free(dh, buffer_in);
 	}
@@ -297,14 +425,51 @@ int dmm_buffer_in_release(void *region, void *user_buffer, size_t user_length, v
 	return 0;
 }
 
+int dmm_stats_get(void *region, uintptr_t *start_addr, uint32_t *curr_use, uint32_t *max_use)
+{
+#ifdef CONFIG_DMM_STATS
+	struct dmm_heap *dh;
+
+	dh = dmm_heap_find(region);
+	if (dh == NULL) {
+		return -EINVAL;
+	}
+
+	if (start_addr) {
+		*start_addr = dh->ptr;
+	}
+
+	if (curr_use) {
+		*curr_use = (100 * dh->curr_use) / dh->bitarray.num_bits;
+	}
+
+	if (max_use) {
+		*max_use = (100 * dh->max_use) / dh->bitarray.num_bits;
+	}
+
+	return 0;
+#else
+	return -ENOTSUP;
+#endif
+}
+
 int dmm_init(void)
 {
 	struct dmm_heap *dh;
+	int blk_cnt;
+	int heap_space;
 
 	for (size_t idx = 0; idx < ARRAY_SIZE(dmm_regions); idx++) {
 		dh = &dmm_heaps_data.dmm_heaps[idx];
 		dh->region = &dmm_regions[idx];
-		sys_heap_init(&dh->heap, (void *)dmm_heap_start_get(dh), dmm_heap_size_get(dh));
+		dh->ptr = ROUND_UP(dh->region->dt_allc, dh->region->dt_align);
+		heap_space = dh->region->dt_size - (dh->ptr - dh->region->dt_addr);
+		dh->blk_size = ROUND_UP(heap_space / (32 * HEAP_NUM_WORDS), dh->region->dt_align);
+		blk_cnt = heap_space / dh->blk_size;
+		dh->ptr_end = dh->ptr + blk_cnt * dh->blk_size;
+		dh->bitarray.num_bits = blk_cnt;
+		dh->bitarray.num_bundles = HEAP_NUM_WORDS;
+		dh->bitarray.bundles = dh->mask;
 	}
 
 	return 0;
diff --git a/soc/nordic/common/dmm.h b/soc/nordic/common/dmm.h
index 34b517c92df..09486289aa6 100644
--- a/soc/nordic/common/dmm.h
+++ b/soc/nordic/common/dmm.h
@@ -35,12 +35,12 @@ extern "C" {
  * Cache line alignment is required if region is cacheable and data cache is enabled.
  */
 #define DMM_REG_ALIGN_SIZE(node_id) \
-	(DMM_IS_REG_CACHEABLE(node_id) ? CONFIG_DCACHE_LINE_SIZE : sizeof(uint8_t))
+	(DMM_IS_REG_CACHEABLE(node_id) ? CONFIG_DCACHE_LINE_SIZE : sizeof(uint32_t))
 
 #else
 
 #define DMM_IS_REG_CACHEABLE(node_id) 0
-#define DMM_REG_ALIGN_SIZE(node_id) (sizeof(uint8_t))
+#define DMM_REG_ALIGN_SIZE(node_id) (sizeof(uint32_t))
 
 #endif /* CONFIG_DCACHE */
@@ -163,6 +163,22 @@ int dmm_buffer_in_prepare(void *region, void *user_buffer, size_t user_length, v
  */
 int dmm_buffer_in_release(void *region, void *user_buffer, size_t user_length, void *buffer_in);
 
+/**
+ * @brief Get statistics.
+ *
+ * Must be enabled with CONFIG_DMM_STATS.
+ *
+ * @param[in] region DMM memory region.
+ * @param[out] start_addr Location where the start address of the memory region is written. Can be NULL.
+ * @param[out] curr_use Location where the current use (in percent) is written. Can be NULL.
+ * @param[out] max_use Location where the maximum use (in percent) is written. Can be NULL.
+ *
+ * @retval 0 on success.
+ * @retval -EINVAL Invalid region.
+ * @retval -ENOTSUP Feature is disabled.
+ */
+int dmm_stats_get(void *region, uintptr_t *start_addr, uint32_t *curr_use, uint32_t *max_use);
+
 /**
  * @brief Initialize DMM.
  *
@@ -210,6 +226,17 @@ static ALWAYS_INLINE int dmm_buffer_in_release(void *region, void *user_buffer,
 	return 0;
 }
 
+static ALWAYS_INLINE int dmm_stats_get(void *region, uintptr_t *start_addr,
+				       uint32_t *curr_use, uint32_t *max_use)
+{
+	ARG_UNUSED(region);
+	ARG_UNUSED(start_addr);
+	ARG_UNUSED(curr_use);
+	ARG_UNUSED(max_use);
+
+	return 0;
+}
+
 static ALWAYS_INLINE int dmm_init(void)
 {
 	return 0;
diff --git a/tests/boards/nrf/dmm/boards/nrf5340dk_nrf5340_cpuapp.overlay b/tests/boards/nrf/dmm/boards/nrf5340dk_nrf5340_cpuapp.overlay
index 3e0b1b4d535..48a4e8adc26 100644
--- a/tests/boards/nrf/dmm/boards/nrf5340dk_nrf5340_cpuapp.overlay
+++ b/tests/boards/nrf/dmm/boards/nrf5340dk_nrf5340_cpuapp.overlay
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2024 Nordic Semiconductor ASA
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
 / {
 	aliases {
 		dut-cache = &spi1;
@@ -52,3 +58,7 @@
 	pinctrl-1 = <&spi3_sleep_alt>;
 	pinctrl-names = "default", "sleep";
 };
+
+cycle_timer: &timer1 {
+	status = "okay";
+};
diff --git a/tests/boards/nrf/dmm/boards/nrf54h20dk_nrf54h20_cpuapp.overlay b/tests/boards/nrf/dmm/boards/nrf54h20dk_nrf54h20_cpuapp.overlay
index e3924657b86..2507dd83dfe 100644
--- a/tests/boards/nrf/dmm/boards/nrf54h20dk_nrf54h20_cpuapp.overlay
+++ b/tests/boards/nrf/dmm/boards/nrf54h20dk_nrf54h20_cpuapp.overlay
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2024 Nordic Semiconductor ASA
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
 / {
 	aliases {
 		dut-cache = &spi120;
@@ -58,3 +64,7 @@
 	pinctrl-names = "default", "sleep";
 	memory-regions = <&dma_fast_region>;
 };
+
+cycle_timer: &timer120 {
+	status = "okay";
+};
diff --git a/tests/boards/nrf/dmm/prj.conf b/tests/boards/nrf/dmm/prj.conf
index 9467c292689..c05afbb6ad3 100644
--- a/tests/boards/nrf/dmm/prj.conf
+++ b/tests/boards/nrf/dmm/prj.conf
@@ -1 +1,6 @@
 CONFIG_ZTEST=y
+CONFIG_ZTRESS=y
+CONFIG_ASSERT=n
+CONFIG_SPIN_VALIDATE=n
+CONFIG_TEST_EXTRA_STACK_SIZE=512
+CONFIG_COUNTER=y
diff --git a/tests/boards/nrf/dmm/src/main.c b/tests/boards/nrf/dmm/src/main.c
index 214a9069752..58b7e891c79 100644
--- a/tests/boards/nrf/dmm/src/main.c
+++ b/tests/boards/nrf/dmm/src/main.c
@@ -9,9 +9,14 @@
 #include
 #include
 #include
+#include
+#include
+#include
 #include
 
+#define IS_ALIGNED64(x) IS_ALIGNED(x, sizeof(uint64_t))
+
 #define DUT_CACHE DT_ALIAS(dut_cache)
 #define DUT_NOCACHE DT_ALIAS(dut_nocache)
@@ -25,7 +30,7 @@
 
 #if CONFIG_DCACHE
 BUILD_ASSERT(DMM_ALIGN_SIZE(DUT_CACHE) == CONFIG_DCACHE_LINE_SIZE);
-BUILD_ASSERT(DMM_ALIGN_SIZE(DUT_NOCACHE) == 1);
+BUILD_ASSERT(DMM_ALIGN_SIZE(DUT_NOCACHE) == sizeof(uint32_t));
 #endif
 
 struct dmm_test_region {
@@ -57,13 +62,49 @@ static const struct dmm_test_region dmm_test_regions[DMM_TEST_REGION_COUNT] = {
 		.size = DMM_TEST_GET_REG_SIZE(DUT_NOCACHE)
 	},
 };
 
+static const struct device *counter = DEVICE_DT_GET(DT_NODELABEL(cycle_timer));
+static uint32_t t_delta;
+
+static uint32_t ts_get(void)
+{
+	uint32_t t;
+
+	(void)counter_get_value(counter, &t);
+	return t;
+}
+
+static uint32_t ts_from_get(uint32_t from)
+{
+	return ts_get() - from;
+}
+
+static uint32_t cyc_to_us(uint32_t cyc)
+{
+	return counter_ticks_to_us(counter, cyc);
+}
+
+static uint32_t cyc_to_rem_ns(uint32_t cyc)
+{
+	uint32_t us = counter_ticks_to_us(counter, cyc);
+	uint32_t ns;
+
+	cyc = cyc - counter_us_to_ticks(counter, (uint64_t)us);
+	ns = counter_ticks_to_us(counter, 1000 * cyc);
+
+	return ns;
+}
+
 static void *test_setup(void)
 {
 	static struct dmm_fixture fixture;
+	uint32_t t;
 
+	counter_start(counter);
+	t = ts_get();
+	t_delta = ts_get() - t;
 	memcpy(fixture.regions, dmm_test_regions, sizeof(dmm_test_regions));
 	fixture.fill_value = 0x1;
+
 	return &fixture;
 }
@@ -79,13 +120,25 @@ static bool dmm_buffer_in_region_check(struct dmm_test_region *dtr, void *buf, s
 }
 
 static void dmm_check_output_buffer(struct dmm_test_region *dtr, uint32_t *fill_value,
-				    void *data, size_t size, bool was_prealloc, bool is_cached)
+				    void *data, size_t size, bool was_prealloc,
+				    bool is_cached, bool print_report)
 {
 	void *buf;
 	int retval;
+	uint32_t t;
+	bool aligned;
 
 	memset(data, (*fill_value)++, size);
+	t = ts_get();
 	retval = dmm_buffer_out_prepare(dtr->mem_reg, data, size, &buf);
+	t = ts_from_get(t);
+	aligned = IS_ALIGNED64(data) && IS_ALIGNED64(buf) && IS_ALIGNED64(size);
+
+	if (print_report) {
+		TC_PRINT("%saligned buffer out prepare size:%d buf:%p took %d.%dus (%d cycles)\n",
+			 aligned ? "" : "not ", size, buf, cyc_to_us(t), cyc_to_rem_ns(t), t);
+	}
+
 	zassert_ok(retval);
 	if (IS_ENABLED(CONFIG_DCACHE) && is_cached) {
 		zassert_true(IS_ALIGNED(buf, CONFIG_DCACHE_LINE_SIZE));
@@ -104,21 +157,37 @@ static void dmm_check_output_buffer(struct dmm_test_region *dtr, uint32_t *fill_
 	sys_cache_data_invd_range(buf, size);
 	zassert_mem_equal(buf, data, size);
 
+	t = ts_get();
 	retval = dmm_buffer_out_release(dtr->mem_reg, buf);
+	t = ts_from_get(t);
+	if (print_report) {
+		TC_PRINT("buffer out release buf:%p size:%d took %d.%dus (%d cycles)\n",
+			 buf, size, cyc_to_us(t), cyc_to_rem_ns(t), t);
+	}
 	zassert_ok(retval);
 }
 
 static void dmm_check_input_buffer(struct dmm_test_region *dtr, uint32_t *fill_value,
-				   void *data, size_t size, bool was_prealloc, bool is_cached)
+				   void *data, size_t size, bool was_prealloc,
+				   bool is_cached, bool print_report)
 {
 	void *buf;
 	int retval;
+	uint32_t t;
 	uint8_t intermediate_buf[128];
+	bool aligned;
 
-	zassert_true(size < sizeof(intermediate_buf));
+	zassert_true(size <= sizeof(intermediate_buf));
 
+	t = ts_get();
 	retval = dmm_buffer_in_prepare(dtr->mem_reg, data, size, &buf);
+	t = ts_from_get(t);
+	aligned = IS_ALIGNED64(data) && IS_ALIGNED64(buf) && IS_ALIGNED64(size);
 	zassert_ok(retval);
+	if (print_report) {
+		TC_PRINT("%saligned buffer in prepare buf:%p size:%d took %d.%dus (%d cycles)\n",
+			 aligned ? "" : "not ", buf, size, cyc_to_us(t), cyc_to_rem_ns(t), t);
"" : "not ", buf, size, cyc_to_us(t), cyc_to_rem_ns(t), t); + } if (IS_ENABLED(CONFIG_DCACHE) && is_cached) { zassert_true(IS_ALIGNED(buf, CONFIG_DCACHE_LINE_SIZE)); } @@ -144,7 +213,13 @@ static void dmm_check_input_buffer(struct dmm_test_region *dtr, uint32_t *fill_v memset(buf, (*fill_value)++, size); } + t = ts_get(); retval = dmm_buffer_in_release(dtr->mem_reg, data, size, buf); + t = ts_from_get(t); + if (print_report) { + TC_PRINT("buffer in release buf:%p size:%d took %d.%dus (%d cycles)\n", + buf, size, cyc_to_us(t), cyc_to_rem_ns(t), t); + } zassert_ok(retval); zassert_mem_equal(data, intermediate_buf, size); @@ -152,10 +227,14 @@ static void dmm_check_input_buffer(struct dmm_test_region *dtr, uint32_t *fill_v ZTEST_USER_F(dmm, test_check_dev_cache_in_allocate) { - uint8_t user_data[16]; + uint8_t user_data[128] __aligned(sizeof(uint64_t)); dmm_check_input_buffer(&fixture->regions[DMM_TEST_REGION_CACHE], &fixture->fill_value, - user_data, sizeof(user_data), false, true); + user_data, 16, false, true, false); + dmm_check_input_buffer(&fixture->regions[DMM_TEST_REGION_CACHE], &fixture->fill_value, + user_data, 16, false, true, true); + dmm_check_input_buffer(&fixture->regions[DMM_TEST_REGION_CACHE], &fixture->fill_value, + user_data, sizeof(user_data), false, true, true); } ZTEST_USER_F(dmm, test_check_dev_cache_in_preallocate) @@ -163,15 +242,30 @@ ZTEST_USER_F(dmm, test_check_dev_cache_in_preallocate) static uint8_t user_data[16] DMM_MEMORY_SECTION(DUT_CACHE); dmm_check_input_buffer(&fixture->regions[DMM_TEST_REGION_CACHE], &fixture->fill_value, - user_data, sizeof(user_data), true, true); + user_data, sizeof(user_data), true, true, true); } ZTEST_USER_F(dmm, test_check_dev_cache_out_allocate) { - uint8_t user_data[16]; + uint8_t user_data[129] __aligned(sizeof(uint64_t)); + + /* First run to get code into ICACHE so that following runs has consistent timing. */ + dmm_check_output_buffer(&fixture->regions[DMM_TEST_REGION_CACHE], &fixture->fill_value, + user_data, 16, false, true, false); + + /* Aligned user buffer. */ + dmm_check_output_buffer(&fixture->regions[DMM_TEST_REGION_CACHE], &fixture->fill_value, + user_data, 16, false, true, true); + /* Unaligned user buffer. */ + dmm_check_output_buffer(&fixture->regions[DMM_TEST_REGION_CACHE], &fixture->fill_value, + &user_data[1], 16, false, true, true); + /* Aligned user buffer. */ dmm_check_output_buffer(&fixture->regions[DMM_TEST_REGION_CACHE], &fixture->fill_value, - user_data, sizeof(user_data), false, true); + user_data, sizeof(user_data) - 1, false, true, true); + /* Unaligned user buffer. */ + dmm_check_output_buffer(&fixture->regions[DMM_TEST_REGION_CACHE], &fixture->fill_value, + &user_data[1], sizeof(user_data) - 1, false, true, true); } ZTEST_USER_F(dmm, test_check_dev_cache_out_preallocate) @@ -179,15 +273,31 @@ ZTEST_USER_F(dmm, test_check_dev_cache_out_preallocate) static uint8_t user_data[16] DMM_MEMORY_SECTION(DUT_CACHE); dmm_check_output_buffer(&fixture->regions[DMM_TEST_REGION_CACHE], &fixture->fill_value, - user_data, sizeof(user_data), true, true); + user_data, sizeof(user_data), true, true, true); } ZTEST_USER_F(dmm, test_check_dev_nocache_in_allocate) { - uint8_t user_data[16]; + uint8_t user_data[129] __aligned(sizeof(uint64_t)); + + dmm_check_input_buffer(&fixture->regions[DMM_TEST_REGION_NOCACHE], &fixture->fill_value, + user_data, 16, false, false, false); + + /* Aligned user buffer. 
+	dmm_check_input_buffer(&fixture->regions[DMM_TEST_REGION_NOCACHE], &fixture->fill_value,
+			       user_data, 16, false, false, true);
+
+	/* Unaligned user buffer. */
+	dmm_check_input_buffer(&fixture->regions[DMM_TEST_REGION_NOCACHE], &fixture->fill_value,
+			       &user_data[1], 16, false, false, true);
+	/* Aligned user buffer. */
 	dmm_check_input_buffer(&fixture->regions[DMM_TEST_REGION_NOCACHE], &fixture->fill_value,
-			       user_data, sizeof(user_data), false, false);
+			       user_data, sizeof(user_data) - 1, false, false, true);
+
+	/* Unaligned user buffer. */
+	dmm_check_input_buffer(&fixture->regions[DMM_TEST_REGION_NOCACHE], &fixture->fill_value,
+			       &user_data[1], sizeof(user_data) - 1, false, false, true);
 }
 
@@ -195,15 +305,30 @@ ZTEST_USER_F(dmm, test_check_dev_nocache_in_preallocate)
 	static uint8_t user_data[16] DMM_MEMORY_SECTION(DUT_NOCACHE);
 
 	dmm_check_input_buffer(&fixture->regions[DMM_TEST_REGION_NOCACHE], &fixture->fill_value,
-			       user_data, sizeof(user_data), true, false);
+			       user_data, sizeof(user_data), true, false, true);
 }
 
 ZTEST_USER_F(dmm, test_check_dev_nocache_out_allocate)
 {
-	uint8_t user_data[16];
+	uint8_t user_data[129] __aligned(sizeof(uint64_t));
+
+	/* First run to get code into ICACHE so that the following results are consistent. */
+	dmm_check_output_buffer(&fixture->regions[DMM_TEST_REGION_NOCACHE], &fixture->fill_value,
+				user_data, 16, false, false, false);
+	/* Aligned user buffer. */
+	dmm_check_output_buffer(&fixture->regions[DMM_TEST_REGION_NOCACHE], &fixture->fill_value,
+				user_data, 16, false, false, true);
+	/* Unaligned user buffer. */
+	dmm_check_output_buffer(&fixture->regions[DMM_TEST_REGION_NOCACHE], &fixture->fill_value,
+				&user_data[1], 16, false, false, true);
+
+	/* Aligned user buffer. */
+	dmm_check_output_buffer(&fixture->regions[DMM_TEST_REGION_NOCACHE], &fixture->fill_value,
+				user_data, sizeof(user_data) - 1, false, false, true);
+	/* Unaligned user buffer. */
 	dmm_check_output_buffer(&fixture->regions[DMM_TEST_REGION_NOCACHE], &fixture->fill_value,
-				user_data, sizeof(user_data), false, false);
+				&user_data[1], sizeof(user_data) - 1, false, false, true);
 }
 
@@ -211,7 +336,233 @@ ZTEST_USER_F(dmm, test_check_dev_nocache_out_preallocate)
 	static uint8_t user_data[16] DMM_MEMORY_SECTION(DUT_NOCACHE);
 
 	dmm_check_output_buffer(&fixture->regions[DMM_TEST_REGION_NOCACHE], &fixture->fill_value,
-				user_data, sizeof(user_data), true, false);
+				user_data, sizeof(user_data), true, false, true);
+}
+
+ZTEST_USER_F(dmm, test_check_multiple_alloc_and_free)
+{
+	int retval;
+	uint8_t buf[256];
+	uint8_t buf2[32];
+	void *dmm_buf;
+	void *dmm_buf2;
+	void *mem_reg = fixture->regions[DMM_TEST_REGION_NOCACHE].mem_reg;
+	uintptr_t start_address;
+	uint32_t curr_use, max_use;
+
+	if (IS_ENABLED(CONFIG_DMM_STATS)) {
+		retval = dmm_stats_get(mem_reg, &start_address, &curr_use, &max_use);
+		zassert_ok(retval);
+	}
+
+	memset(buf, 0, sizeof(buf));
+	memset(buf2, 0, sizeof(buf2));
+
+	retval = dmm_buffer_out_prepare(mem_reg, (void *)buf, sizeof(buf), &dmm_buf);
+	zassert_ok(retval);
+	zassert_true(dmm_buf != NULL);
+
+	retval = dmm_buffer_out_prepare(mem_reg, (void *)buf2, sizeof(buf2), &dmm_buf2);
+	zassert_ok(retval);
+	zassert_true(dmm_buf2 != NULL);
+
+	retval = dmm_buffer_out_release(mem_reg, dmm_buf2);
+	zassert_ok(retval);
+	zassert_true(dmm_buf != NULL);
+
+	retval = dmm_buffer_out_release(mem_reg, dmm_buf);
+	zassert_ok(retval);
+	zassert_true(dmm_buf != NULL);
+
+	if (IS_ENABLED(CONFIG_DMM_STATS)) {
+		uint32_t curr_use2;
+
+		retval = dmm_stats_get(mem_reg, &start_address, &curr_use2, &max_use);
+		zassert_ok(retval);
+		zassert_equal(curr_use, curr_use2);
+		TC_PRINT("Stats start_address:%p current use:%d%% max use:%d%%\n",
+			 (void *)start_address, curr_use2, max_use);
+	}
+}
+
+struct dmm_stress_data {
+	void *mem_reg;
+	void *alloc_ptr[32];
+	uint8_t alloc_token[32];
+	size_t alloc_len[32];
+	atomic_t alloc_mask;
+	atomic_t busy_mask;
+	atomic_t fails;
+	atomic_t cnt;
+	bool cached;
+};
+
+static void stress_free_op(struct dmm_stress_data *data, int prio, int id)
+{
+	/* Buffer is allocated. */
+	uint8_t token = data->alloc_token[id];
+	size_t len = data->alloc_len[id];
+	uint8_t *ptr = data->alloc_ptr[id];
+	int rv;
+
+	for (int j = 0; j < len; j++) {
+		uint8_t exp_val = (uint8_t)(token + j);
+
+		if (ptr[j] != exp_val) {
+			for (int k = 0; k < len; k++) {
+				printk("%02x ", ptr[k]);
+			}
+		}
+		zassert_equal(ptr[j], exp_val, "At %d got:%d exp:%d, len:%d id:%d, alloc_cnt:%d",
+			      j, ptr[j], exp_val, len, id, (uint32_t)data->cnt);
+	}
+
+	rv = dmm_buffer_in_release(data->mem_reg, ptr, len, ptr);
+	zassert_ok(rv);
+	/* Indicate that buffer is released. */
+	atomic_and(&data->alloc_mask, ~BIT(id));
+}
+
+static bool stress_alloc_op(struct dmm_stress_data *data, int prio, int id)
+{
+	uint32_t r32 = sys_rand32_get();
+	size_t len = r32 % 512;
+	uint8_t *ptr = data->alloc_ptr[id];
+	int rv;
+
+	/* Rarely allocate a bigger buffer. */
+	if ((r32 & 0x7) == 0) {
+		len += 512;
+	}
+
+	rv = dmm_buffer_in_prepare(data->mem_reg, &r32/*dummy*/, len, (void **)&ptr);
+	if (rv < 0) {
+		atomic_inc(&data->fails);
+		return true;
+	}
+
+	uint8_t token = r32 >> 24;
+
+	data->alloc_ptr[id] = ptr;
+	data->alloc_len[id] = len;
+	data->alloc_token[id] = token;
+	for (int j = 0; j < len; j++) {
+		ptr[j] = (uint8_t)(j + token);
+	}
+	if (data->cached) {
+		sys_cache_data_flush_range(ptr, len);
+	}
+	atomic_inc(&data->cnt);
+	return false;
+}
+
+bool stress_func(void *user_data, uint32_t cnt, bool last, int prio)
+{
+	struct dmm_stress_data *data = user_data;
+	uint32_t r = sys_rand32_get();
+	int rpt = r & 0x3;
+
+	r >>= 2;
+
+	for (int i = 0; i < rpt + 1; i++) {
+		int id = r % 32;
+		int key;
+		bool free_op;
+		bool clear_bit;
+
+		key = irq_lock();
+		if ((data->busy_mask & BIT(id)) == 0) {
+			data->busy_mask |= BIT(id);
+			if (data->alloc_mask & BIT(id)) {
+				free_op = true;
+			} else {
+				data->alloc_mask |= BIT(id);
+				free_op = false;
+			}
+		} else {
+			irq_unlock(key);
+			continue;
+		}
+
+		irq_unlock(key);
+		r >>= 5;
+
+		if (free_op) {
+			stress_free_op(data, prio, id);
+			clear_bit = true;
+		} else {
+			clear_bit = stress_alloc_op(data, prio, id);
+		}
+
+		key = irq_lock();
+		data->busy_mask &= ~BIT(id);
+		if (clear_bit) {
+			data->alloc_mask &= ~BIT(id);
+		}
+		irq_unlock(key);
+	}
+
+	return true;
+}
+
+static void free_all(struct dmm_stress_data *data)
+{
+	while (data->alloc_mask) {
+		int id = 31 - __builtin_clz(data->alloc_mask);
+
+		stress_free_op(data, 0, id);
+		data->alloc_mask &= ~BIT(id);
+	}
+}
+
+static void stress_allocator(void *mem_reg, bool cached)
+{
+	uint32_t timeout = 3000;
+	struct dmm_stress_data ctx;
+	int rv;
+	uint32_t curr_use;
+
+	if (mem_reg == NULL) {
+		ztest_test_skip();
+	}
+
+	memset(&ctx, 0, sizeof(ctx));
+	ctx.mem_reg = mem_reg;
+	ctx.cached = cached;
+
+	if (IS_ENABLED(CONFIG_DMM_STATS)) {
+		rv = dmm_stats_get(ctx.mem_reg, NULL, &curr_use, NULL);
+		zassert_ok(rv);
+	}
+
+	ztress_set_timeout(K_MSEC(timeout));
+
+	ZTRESS_EXECUTE(ZTRESS_THREAD(stress_func, &ctx, INT32_MAX, INT32_MAX, Z_TIMEOUT_TICKS(4)),
+		       ZTRESS_THREAD(stress_func, &ctx, INT32_MAX, INT32_MAX, Z_TIMEOUT_TICKS(4)),
+		       ZTRESS_THREAD(stress_func, &ctx, INT32_MAX, INT32_MAX, Z_TIMEOUT_TICKS(4)));
+
+	free_all(&ctx);
+	TC_PRINT("Executed %d allocation operations. Failed to allocate %d times.\n",
+		 (uint32_t)ctx.cnt, (uint32_t)ctx.fails);
+
+	if (IS_ENABLED(CONFIG_DMM_STATS)) {
+		uint32_t curr_use2;
+
+		rv = dmm_stats_get(ctx.mem_reg, NULL, &curr_use2, NULL);
+		zassert_ok(rv);
+		zassert_equal(curr_use, curr_use2, "Unexpected usage got:%d exp:%d",
+			      curr_use2, curr_use);
+	}
+}
+
+ZTEST_F(dmm, test_stress_allocator_nocache)
+{
+	stress_allocator(fixture->regions[DMM_TEST_REGION_NOCACHE].mem_reg, false);
+}
+
+ZTEST_F(dmm, test_stress_allocator_cache)
+{
+	stress_allocator(fixture->regions[DMM_TEST_REGION_CACHE].mem_reg, true);
 }
 
 ZTEST_SUITE(dmm, NULL, test_setup, NULL, test_cleanup, NULL);
diff --git a/tests/boards/nrf/dmm/testcase.yaml b/tests/boards/nrf/dmm/testcase.yaml
index b5f41f281a5..7fc991d4824 100644
--- a/tests/boards/nrf/dmm/testcase.yaml
+++ b/tests/boards/nrf/dmm/testcase.yaml
@@ -16,3 +16,14 @@ tests:
       - CONFIG_DCACHE=n
     platform_allow:
      - nrf54h20dk/nrf54h20/cpuapp
+  boards.nrf.dmm.stats:
+    extra_configs:
+      - CONFIG_DMM_STATS=y
+    platform_allow:
+      - nrf54h20dk/nrf54h20/cpuapp
+  boards.nrf.dmm.more_chunks:
+    extra_configs:
+      - CONFIG_DMM_STATS=y
+      - CONFIG_DMM_HEAP_CHUNKS=96
+    platform_allow:
+      - nrf54h20dk/nrf54h20/cpuapp
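
To make the CONFIG_DMM_HEAP_CHUNKS trade-off concrete, the sketch below mirrors the sizing arithmetic from dmm_init() in the patch. It is a host-buildable illustration only: the region size (4096 bytes) and 32-byte alignment are made-up example values, and ROUND_UP/DIV_ROUND_UP are redefined locally as stand-ins for the Zephyr macros.

#include <stdio.h>
#include <stddef.h>

/* Local stand-ins for Zephyr's ROUND_UP/DIV_ROUND_UP so this builds on a host. */
#define ROUND_UP(x, a)     ((((x) + (a) - 1) / (a)) * (a))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* Assumed example: 4096 bytes left for the allocator, 32-byte alignment. */
	const size_t heap_space = 4096;
	const size_t align = 32;

	for (size_t chunks = 32; chunks <= 128; chunks += 32) {
		/* Same formula as dmm_init(): 32 * HEAP_NUM_WORDS == CONFIG_DMM_HEAP_CHUNKS. */
		size_t blk_size = ROUND_UP(heap_space / chunks, align);
		size_t blk_cnt = heap_space / blk_size;
		/* Any request, however small, consumes at least one chunk. */
		size_t chunks_for_5 = DIV_ROUND_UP(5, blk_size);

		printf("CHUNKS=%zu: blk_size=%zu usable_chunks=%zu 5-byte-alloc=%zu chunk(s)\n",
		       chunks, blk_size, blk_cnt, chunks_for_5);
	}

	return 0;
}

More chunks give finer granularity (less memory consumed per small buffer) at the cost of more mask words to manage, which is the performance/granularity trade-off the Kconfig help text refers to.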
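A minimal usage sketch of the dmm_stats_get() API added in this patch follows; the helper name log_dmm_usage() and the log message are illustrative only, not part of the patch. The error values match the doxygen comment above: -EINVAL for an unknown region and -ENOTSUP when CONFIG_DMM_STATS is disabled.

#include <zephyr/kernel.h>
#include <zephyr/sys/printk.h>
#include "dmm.h"

/* Illustrative helper (not part of the patch): print how full a DMM region is.
 * mem_reg is the same region pointer passed to the other dmm_buffer_* calls.
 */
static void log_dmm_usage(void *mem_reg)
{
	uintptr_t start;
	uint32_t curr, max;
	int err = dmm_stats_get(mem_reg, &start, &curr, &max);

	if (err == -ENOTSUP) {
		return; /* CONFIG_DMM_STATS is disabled. */
	} else if (err < 0) {
		return; /* e.g. -EINVAL for an unknown region. */
	}

	printk("DMM region %p: current %u%%, peak %u%%\n", (void *)start, curr, max);
}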