diff --git a/sw/cheri/checks/CMakeLists.txt b/sw/cheri/checks/CMakeLists.txt
index fe4366bc4..a0de9367a 100644
--- a/sw/cheri/checks/CMakeLists.txt
+++ b/sw/cheri/checks/CMakeLists.txt
@@ -51,9 +51,9 @@ add_custom_command(
 install(TARGETS ${NAME})
 
 set(NAME hyperram_test)
-add_executable(${NAME}  hyperram_test.cc)
+add_executable(${NAME} hyperram_test.cc hyperram_memset.S)
 target_include_directories(${NAME} PRIVATE ${CHERIOT_SDK_INCLUDES} "${reisfmt_SOURCE_DIR}/include")
-target_link_libraries(${NAME} common)
+target_link_libraries(${NAME} common block_tests)
 
 add_custom_command(
   TARGET ${NAME} POST_BUILD
diff --git a/sw/cheri/checks/hyperram_memset.S b/sw/cheri/checks/hyperram_memset.S
new file mode 100644
index 000000000..f64b8a1f8
--- /dev/null
+++ b/sw/cheri/checks/hyperram_memset.S
@@ -0,0 +1,251 @@
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+// A set of memory initialisation functions using different access sizes, to check that
+// the write transactions are properly coalesced into write bursts to the HyperBus
+// Memory Controller.
+//
+// Each routine is expected to be writing a single, defined byte to each address that is
+// modified. By initialising the entirety of a target buffer to a different value first,
+// the set of modified addresses may be ascertained.
+
+  .section .text, "ax", @progbits
+
+  .option norvc
+
+// Byte-based memory writing; the main loop issues 8 consecutive byte stores per iteration.
+//
+// entry ca0 -> byte-aligned destination buffer
+//       a1   = byte to be stored
+//       a2   = number of bytes
+// exit  - (no result; ca0 and a2 are modified)
+  .globl hyperram_memset_b
+  .p2align 5  // 32-byte alignment of the entry point.
+hyperram_memset_b:
+  addi  a2, a2, -8  // Bias the count by one iteration; its sign says whether 8 bytes remain.
+  bltz  a2, memset_b_8fix  // Fewer than 8 bytes requested; skip the unrolled loop.
+memset_b_8:
+  csb   a1,  (ca0)
+  csb   a1, 1(ca0)
+  csb   a1, 2(ca0)
+  csb   a1, 3(ca0)
+  csb   a1, 4(ca0)
+  csb   a1, 5(ca0)
+  csb   a1, 6(ca0)
+  csb   a1, 7(ca0)
+  cincoffset ca0, ca0, 8
+  addi  a2, a2, -8
+  bgez  a2, memset_b_8  // At least another 8 bytes remain.
+memset_b_8fix:
+  addi  a2, a2, 8  // Undo the bias to recover the count of bytes still to be written.
+  bgtz  a2, memset_b_tail  // Branch (not call) to the shared byte loop, which returns to our caller.
+  cret
+
+// Just complete the request using byte stores; this is shared among all _ascending_ routines.
+// Performance is not very important, but it would be better to keep the byte writes temporally
+// close together to try to provoke races.
+//
+// ca0 -> byte-aligned pointer into destination buffer
+// a1  =  byte to be stored
+// a2  =  non-zero count of bytes remaining
+memset_b_tail:
+  add   a2, a0, a2  // a2 = address just beyond the final byte (integer address arithmetic).
+memset_b_1:
+  csb   a1, (ca0)
+  cincoffset ca0, ca0, 1
+  bltu  a0, a2, memset_b_1  // Continue until the address in ca0 reaches the end address.
+  cret
+
+// Descending transfer, pre-decrementing address; shared among the two _descending_ routines.
+//
+// ca0 -> just beyond next address to be written, decrement before use.
+// a1  =  byte to be stored
+// a2  =  non-zero count of bytes remaining
+memset_b_desc_tail:
+  sub   a2, a0, a2  // a2 = lowest address that is to be written (integer address arithmetic).
+memset_b_desc_1:
+  csb   a1, -1(ca0)  // Store to the byte just below the current pointer...
+  cincoffset ca0, ca0, -1  // ...and then move the pointer down onto it.
+  bgtu  a0, a2, memset_b_desc_1  // Continue until the address in ca0 reaches the lowest address.
+  cret
+
+// Byte and half word-based memory writing; each word is written using
+// 2 byte stores and a half-word store.
+//
+// entry ca0 -> half-word aligned destination buffer (the widest store used is a half-word)
+//       a1   = byte to be stored, replicated throughout word
+//       a2   = number of bytes
+// exit  - (no result; ca0 and a2 are modified)
+  .globl hyperram_memset_hb
+  .p2align 5  // 32-byte alignment of the entry point.
+hyperram_memset_hb:
+  addi  a2, a2, -4  // Bias the count by one iteration; its sign says whether 4 bytes remain.
+  bltz  a2, memset_hb_4fix  // Fewer than 4 bytes requested; skip the main loop.
+memset_hb_4:
+  csb   a1,  (ca0)
+  csb   a1, 1(ca0)
+  csh   a1, 2(ca0)  // Upper half of the 4-byte group is written with a single half-word store.
+  cincoffset ca0, ca0, 4
+  addi  a2, a2, -4
+  bgez  a2, memset_hb_4  // At least another 4 bytes remain.
+memset_hb_4fix:
+  addi  a2, a2, 4  // Undo the bias to recover the count of bytes still to be written.
+  bgtz  a2, memset_b_tail  // Remaining bytes are written by the shared ascending byte loop.
+  cret
+
+// Half word-based memory writing; the main loop issues 8 consecutive half-word stores per iteration.
+//
+// entry ca0 -> half-word aligned destination buffer
+//       a1   = byte to be stored, replicated throughout half-word
+//       a2   = number of bytes
+// exit  - (no result; ca0 and a2 are modified)
+  .globl hyperram_memset_h
+  .p2align 5  // 32-byte alignment of the entry point.
+hyperram_memset_h:
+  addi  a2, a2, -16  // Bias the count by one iteration; its sign says whether 16 bytes remain.
+  bltz  a2, memset_h_16fix  // Fewer than 16 bytes requested; skip the unrolled loop.
+memset_h_16:
+  csh   a1,   (ca0)
+  csh   a1,  2(ca0)
+  csh   a1,  4(ca0)
+  csh   a1,  6(ca0)
+  csh   a1,  8(ca0)
+  csh   a1, 10(ca0)
+  csh   a1, 12(ca0)
+  csh   a1, 14(ca0)
+  cincoffset ca0, ca0, 16
+  addi  a2, a2, -16
+  bgez a2, memset_h_16  // At least another 16 bytes remain.
+memset_h_16fix:
+  addi  a2, a2, 16  // Undo the bias to recover the count of bytes still to be written.
+  bgtz  a2, memset_b_tail  // Remaining bytes are written by the shared ascending byte loop.
+  cret
+
+// Word-based memory writing; the main loop issues 8 consecutive word stores per iteration.
+//
+// entry ca0 -> word-aligned destination buffer
+//       a1   = byte to be stored, replicated throughout word
+//       a2   = number of bytes
+  .globl hyperram_memset_w
+  .p2align 5  // 32-byte alignment of the entry point.
+hyperram_memset_w:
+  addi  a2, a2, -32  // Bias the count by one iteration; its sign says whether 32 bytes remain.
+  bltz  a2, memset_w_32fix  // Fewer than 32 bytes requested; skip the unrolled loop.
+memset_w_32:
+  csw   a1,   (ca0)
+  csw   a1,  4(ca0)
+  csw   a1,  8(ca0)
+  csw   a1, 12(ca0)
+  csw   a1, 16(ca0)
+  csw   a1, 20(ca0)
+  csw   a1, 24(ca0)
+  csw   a1, 28(ca0)
+  cincoffset ca0, ca0, 32
+  addi  a2, a2, -32
+  bgez  a2, memset_w_32  // At least another 32 bytes remain.
+memset_w_32fix:
+  addi  a2, a2, 32  // Undo the bias to recover the count of bytes still to be written.
+  bgtz  a2, memset_b_tail  // Remaining bytes are written by the shared ascending byte loop.
+  cret
+
+// Repeated words memory writing; the performance of this code is of no consequence.
+// It is concerned purely with ensuring the correctness of the written data.
+//
+// entry ca0 -> word-aligned destination buffer
+//       a1   = byte to be stored, replicated throughout word
+//       a2   = number of bytes
+  .globl hyperram_memset_wr
+  .p2align 5  // 32-byte alignment of the entry point.
+hyperram_memset_wr:
+  addi  a2, a2, -4  // Bias the count by one iteration; its sign says whether 4 bytes remain.
+  bltz  a2, memset_wr_4fix  // Fewer than 4 bytes requested; skip the main loop.
+  xori  a3, a1, -1  // a3 = bitwise inverse of the data word; only used as the decoy value below.
+memset_wr_4:
+  csw   a3, (ca0)  // This word should be overwritten...
+  csw   a1, (ca0)  // ...by the original value.
+  cincoffset ca0, ca0, 4
+  addi  a2, a2, -4
+  bgez  a2, memset_wr_4  // At least another 4 bytes remain.
+memset_wr_4fix:
+  addi  a2, a2, 4  // Undo the bias to recover the count of bytes still to be written.
+  bgtz  a2, memset_b_tail  // Remaining bytes are written once each by the shared byte loop.
+  cret
+
+// Word-based memory writing to descending addresses; two word stores per iteration.
+//
+// entry ca0 -> word-aligned end of destination buffer, exclusive
+//       a1   = byte to be stored, replicated throughout word
+//       a2   = number of bytes
+  .globl hyperram_memset_wd
+  .p2align 5  // 32-byte alignment of the entry point.
+hyperram_memset_wd:
+  addi  a2, a2, -8  // Bias the count by one iteration; its sign says whether 8 bytes remain.
+  bltz  a2, memset_wd_8fix  // Fewer than 8 bytes requested; skip the main loop.
+memset_wd_8:
+  csw   a1, -4(ca0)  // Highest word first...
+  csw   a1, -8(ca0)  // ...then the word below it.
+  cincoffset ca0, ca0, -8
+  addi  a2, a2, -8
+  bgez  a2, memset_wd_8  // At least another 8 bytes remain.
+memset_wd_8fix:
+  addi  a2, a2, 8  // Undo the bias to recover the count of bytes still to be written.
+  bgtz  a2, memset_b_desc_tail  // Remaining bytes are written by the shared descending byte loop.
+  cret
+
+// Capability stores to ascending addresses.
+//
+// These are issued as two back-to-back word writes and we're just using
+// this as a way to issue 64-bit writes rather than trying to create
+// sensible/valid capabilities.
+//
+// entry ca0 -> double-word aligned destination buffer
+//       a1   = byte to be stored, replicated throughout word
+//       a2   = number of bytes
+  .globl hyperram_memset_c
+  .p2align 5  // 32-byte alignment of the entry point.
+hyperram_memset_c:
+  // Replicate the data word to yield a double word.
+  cincoffset csp, csp, -8  // Borrow 8 bytes of stack as scratch space.
+  csw   a1,  (csp)
+  csw   a1, 4(csp)
+  clc   ca1,(csp)  // Reload both words into ca1 as a single double-word value.
+  cincoffset csp, csp, 8  // Release the scratch space.
+  addi  a2, a2, -8  // Bias the count by one iteration; its sign says whether 8 bytes remain.
+  bltz  a2, memset_c_8fix  // Fewer than 8 bytes requested; skip the main loop.
+memset_c_8:
+  csc   ca1, (ca0)
+  cincoffset ca0, ca0, 8
+  addi  a2, a2, -8
+  bgez  a2, memset_c_8  // At least another 8 bytes remain.
+memset_c_8fix:
+  addi  a2, a2, 8  // Undo the bias to recover the count of bytes still to be written.
+  bgtz  a2, memset_b_tail  // The byte loop stores the low byte of a1, i.e. the same data byte.
+  cret
+
+// Capability stores to descending addresses; used only to issue 64-bit writes (see hyperram_memset_c).
+//
+// entry ca0 -> double-word aligned end of destination buffer, exclusive
+//       a1   = byte to be stored, replicated throughout word
+//       a2   = number of bytes
+  .globl hyperram_memset_cd
+  .p2align 5  // 32-byte alignment of the entry point.
+hyperram_memset_cd:
+  // Replicate the data word to yield a double word.
+  cincoffset csp, csp, -8  // Borrow 8 bytes of stack as scratch space.
+  csw   a1,  (csp)
+  csw   a1, 4(csp)
+  clc   ca1, (csp)  // Reload both words into ca1 as a single double-word value.
+  cincoffset csp, csp, 8  // Release the scratch space.
+  addi  a2, a2, -8  // Bias the count by one iteration; its sign says whether 8 bytes remain.
+  bltz  a2, memset_cd_8fix  // Fewer than 8 bytes requested; skip the main loop.
+memset_cd_8:
+  csc   ca1, -8(ca0)  // Store to the double word just below the current pointer.
+  cincoffset ca0, ca0, -8
+  addi  a2, a2, -8
+  bgez  a2, memset_cd_8  // At least another 8 bytes remain.
+memset_cd_8fix:
+  addi  a2, a2, 8  // Undo the bias to recover the count of bytes still to be written.
+  bgtz  a2, memset_b_desc_tail  // Remaining bytes are written by the shared descending byte loop.
+  cret
+
diff --git a/sw/cheri/checks/hyperram_memset.h b/sw/cheri/checks/hyperram_memset.h
new file mode 100644
index 000000000..5063620be
--- /dev/null
+++ b/sw/cheri/checks/hyperram_memset.h
@@ -0,0 +1,32 @@
+/**
+ * Copyright lowRISC contributors.
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+enum WriteTestType {
+  WriteTestType_B = 0,  // Byte stores (`hyperram_memset_b`).
+  WriteTestType_H,      // Half-word stores (`hyperram_memset_h`).
+  WriteTestType_HB,     // Two byte stores and a half-word store per word (`hyperram_memset_hb`).
+  WriteTestType_W,      // Word stores (`hyperram_memset_w`).
+  WriteTestType_WR,     // Each word stored twice, inverted then actual (`hyperram_memset_wr`).
+  WriteTestType_WD,     // Word stores to descending addresses (`hyperram_memset_wd`).
+  WriteTestType_C,      // Capability-width stores, ascending (`hyperram_memset_c`).
+  WriteTestType_CD      // Capability-width stores, descending (`hyperram_memset_cd`).
+};
+
+// Memory-writing routines; these mimic the ISO C function `memset` except that they return nothing,
+// require 'natural alignment' of the buffer address, expect `c` to hold the byte replicated across
+// a word for the wider stores, and have very specific implementations to ensure defined traffic for
+// testing the write coalescing/buffering logic of the HyperRAM controller interface.
+extern "C" void hyperram_memset_b(volatile uint8_t *dst, int c, size_t n);
+extern "C" void hyperram_memset_h(volatile uint16_t *dst, int c, size_t n);
+extern "C" void hyperram_memset_hb(volatile uint16_t *dst, int c, size_t n);
+extern "C" void hyperram_memset_w(volatile uint32_t *dst, int c, size_t n);
+extern "C" void hyperram_memset_wr(volatile uint32_t *dst, int c, size_t n);
+extern "C" void hyperram_memset_wd(volatile uint32_t *dst, int c, size_t n);  // `dst` = exclusive END.
+extern "C" void hyperram_memset_c(volatile uint64_t *dst, int c, size_t n);
+extern "C" void hyperram_memset_cd(volatile uint64_t *dst, int c, size_t n);  // `dst` = exclusive END.
diff --git a/sw/cheri/checks/hyperram_test.cc b/sw/cheri/checks/hyperram_test.cc
index 999002dd8..f874e68ac 100644
--- a/sw/cheri/checks/hyperram_test.cc
+++ b/sw/cheri/checks/hyperram_test.cc
@@ -17,8 +17,12 @@
 #include <platform-uart.hh>
 
 #include "../common/console.hh"
+#include "../common/hyperram_perf_test.h"
+#include "../common/timer-utils.hh"
 #include "../common/uart-utils.hh"
 
+#include "hyperram_memset.h"
+
 using namespace CHERI;
 
 const int RandTestBlockSize = 256;
@@ -141,9 +145,10 @@ int rand_cap_test(Capability<volatile uint32_t> hyperram_area,
 // Writes a 32-bit value in every location in hyperram and then reads back to
 // check read values match written values. The values written alternate between
 // 'initial_val' and the inversion of 'initial_val'.
-int stripe_test(Capability<volatile uint32_t> hyperram_area, uint32_t initial_val) {
+int stripe_test(Capability<volatile uint32_t> hyperram_area, uint32_t initial_val, Log &log, bool report_time = false) {
   uint32_t failures      = 0;
   uint32_t cur_write_val = initial_val;
+  uint32_t start_time    = get_mcycle();
 
   for (uint32_t addr = 0; addr < HyperramSize; addr++) {
     hyperram_area[addr] = cur_write_val;
@@ -161,6 +166,11 @@ int stripe_test(Capability<volatile uint32_t> hyperram_area, uint32_t initial_va
     cur_expected_val = ~cur_expected_val;
   }
 
+  if (report_time) {
+    uint32_t end_time = get_mcycle();
+    log.print(" ({} cycles)...", end_time - start_time);
+  }
+
   return failures;
 }
 
@@ -190,13 +200,20 @@ void write_prog(Capability<volatile uint32_t> &hyperram_area, uint32_t addr) {
   hyperram_area[addr + 4] = 0x8082;
 
   asm volatile("fence.i" : : : "memory");
+
+  // By writing the first word of the code again we can ensure that the code is
+  // flushed out to the HyperRAM and will thus be coherent with instruction
+  // fetching when the code is executed.
+  hyperram_area[addr] = hyperram_area[addr];
 }
 
 // Writes a short function to a random area of hyperram and executes it checking
 // for successful execution (see 'write_prog' for details on the function
 // written).
-int execute_test(Capability<volatile uint32_t> &hyperram_area, ds::xoroshiro::P64R32 &prng, int iterations) {
-  int failures = 0;
+int execute_test(Capability<volatile uint32_t> hyperram_area, ds::xoroshiro::P64R32 &prng, int iterations, Log &log,
+                 bool report_time = false) {
+  uint32_t start_time = get_mcycle();
+  int failures        = 0;
 
   for (int i = 0; i < iterations; ++i) {
     uint32_t prog_addr = prng() % (HyperramSize - 5);
@@ -225,6 +242,376 @@ int execute_test(Capability<volatile uint32_t> &hyperram_area, ds::xoroshiro::P6
     }
   }
 
+  if (report_time) {
+    uint32_t end_time = get_mcycle();
+    log.print(" ({} cycles)...", end_time - start_time);
+  }
+
+  return failures;
+}
+
+// Perform partial (byte, half-word or word) writes to addresses that may collide with the read buffer,
+// to check that read and write accesses are coherent. Returns the number of mismatching words seen.
+int buffering_test(Capability<volatile uint32_t> hyperram_area, ds::xoroshiro::P64R32 &prng, int iterations) {
+  const uint32_t burst_len = 32u;  // Size in bytes of the buffer exercised in each iteration.
+  int failures             = 0;
+
+  // Create an expectation buffer that we may update ourselves; this is a local buffer on the
+  // stack and thus stored in the main system RAM. Its contents carry over between iterations.
+  uint32_t exp_data[burst_len / 4];
+  for (unsigned idx = 0u; idx < burst_len / 4; ++idx) {
+    exp_data[idx] = prng();
+  }
+
+  for (int i = 0; i < iterations; ++i) {
+    // Leave a small gap at the end of the test area, so that we may advance by half a burst length
+    // and still perform a full length burst.
+    uint32_t burst_addr = prng() % ((HyperramSize * 4) - 2 * burst_len);
+    // Align to the start of a burst.
+    burst_addr &= ~(burst_len - 1u);
+    // With 50% probability, ensure that the buffer crosses a burst boundary, to make
+    // the interaction of write and read less predictable.
+    if (prng() & 1) {
+      burst_addr += burst_len / 2;
+    }
+
+    // Decide upon a list of actions to be performed; the number of actions/iteration is pretty
+    // arbitrary.
+    const unsigned num_actions = 7u;
+    uint32_t act_addr[num_actions];
+    uint32_t act_data[num_actions];
+    for (unsigned act = 0u; act < num_actions; ++act) {
+      uint32_t wr_offset = prng() % burst_len;
+      uint32_t wr_type   = prng() % 3;  // 0 = byte, 1 = half-word, 2 = word (log2 of the size).
+      // Ensure that the write offset has natural alignment.
+      wr_offset &= ~((1u << (wr_type)) - 1u);
+      // Store the action type, offset and data compactly to keep the write operations
+      // close together. We want to ensure that writes and reads occur simultaneously/close
+      // together.
+      act_addr[act] = wr_offset | (wr_type << 24);  // Type in bits 31:24, byte offset in the rest.
+      act_data[act] = prng();
+    }
+
+    // Randomise the word offset from which we read; read bursts are wrapping.
+    uint32_t rd_off = prng() % (burst_len / 4);
+
+    // ----- Avoid the use of randomisation after this point; timing would become predictable. -----
+
+    // We need pointers to write to data of different sizes.
+    volatile uint32_t *burst_wp = &hyperram_area[burst_addr / 4];
+    volatile uint16_t *burst_hp = reinterpret_cast<volatile uint16_t *>(burst_wp);
+    volatile uint8_t *burst_bp  = reinterpret_cast<volatile uint8_t *>(burst_wp);
+
+    // Initialise the data at the target address.
+    hyperram_copy_block(burst_wp, exp_data, burst_len);
+
+    // Ensure that we have read data from an address, to provoke fetching of a burst of
+    // read data from the HyperRAM. Check that the word read (at `rd_off`) is as expected.
+    volatile uint32_t rd_data = burst_wp[rd_off];
+    failures += (rd_data != exp_data[rd_off]);
+
+    // Update the expectation according to the actions that we're about to perform.
+    for (unsigned act = 0u; act < num_actions; ++act) {
+      uint32_t wr_offset = act_addr[act] & ~0xff000000u;
+      // Modify the affected word in each case. Note that `wr_offset` has natural alignment.
+      unsigned sh = 8u * (wr_offset & 3u);  // Bit position of the written data within its word.
+      uint32_t mask;
+      switch (act_addr[act] >> 24) {
+        case 0u:
+          mask = 0xffu << sh;
+          break;
+        case 1u:
+          mask = 0xffffu << sh;
+          break;
+        default:
+          mask = ~0u;
+          break;
+      }
+      exp_data[wr_offset >> 2] = (exp_data[wr_offset >> 2] & ~mask) | ((act_data[act] << sh) & mask);
+    }
+
+    // Perform the actions; keep this code short and fast to ensure that the write traffic
+    // is still held within the controller interface. (Write traffic is flushed out to the
+    // HyperRAM after a short while, rather than being held indefinitely.)
+    for (unsigned act = 0u; act < num_actions; ++act) {
+      uint32_t wr_offset = act_addr[act] & ~0xff000000u;
+      uint32_t d         = act_data[act];
+      switch (act_addr[act] >> 24) {
+        case 0u:
+          burst_bp[wr_offset] = (uint8_t)d;
+          break;
+        case 1u:
+          burst_hp[wr_offset >> 1] = (uint16_t)d;
+          break;
+        default:
+          burst_wp[wr_offset >> 2] = d;
+          break;
+      }
+    }
+
+    // ----- Avoid the use of randomisation before this point -----
+
+    // Check the entire contents of the target buffer against our expectations.
+    for (unsigned idx = 0u; idx < burst_len / 4; ++idx) {
+      failures += (exp_data[idx] != burst_wp[idx]);
+    }
+  }
+
+  return failures;
+}
+
+// Performance test to exercise burst transfers from/to the HyperRAM.
+// Returns the number of runs (out of two) in which the destination buffer did not match the source.
+// - read buffers may be 'cleaned' before commencing the performance test, so that they have no
+//   history of earlier read accesses.
+// - source and/or destination address (word offsets into `hyperram_area`) are chosen here if UINT32_MAX.
+// - run time of each of the 'copy' and 'compare' operations in `mcycle` ticks may optionally be reported.
+int perf_burst_test(Capability<volatile uint32_t> hyperram_area, Log &log, ds::xoroshiro::P64R32 &prng, size_t nbytes,
+                    bool clean_first = true, bool report_times = true, uint32_t dst_addr = UINT32_MAX,
+                    uint32_t src_addr = UINT32_MAX) {
+  typedef volatile void *(*hr_copy_fn_t)(volatile uint32_t *, const volatile uint32_t *, size_t);
+  int failures = 0;
+
+  // Randomised word offsets.
+  if (dst_addr == UINT32_MAX) {
+    dst_addr = prng() & 0x3ffu;
+  }
+  if (src_addr == UINT32_MAX) {
+    // Ensuring that the source and destination buffers cannot overlap.
+    src_addr = 0x1000u - dst_addr;
+  }
+
+  volatile uint32_t *d = &hyperram_area[dst_addr];
+  volatile uint32_t *s = &hyperram_area[src_addr];
+
+  // Complete the source buffer with complete words; it doesn't matter that we may write a few extra bytes.
+  const uint32_t whole_words = nbytes / 4u;
+  for (unsigned idx = 0u; idx <= whole_words; idx++) {  // `<=` deliberately writes one word extra.
+    hyperram_area[src_addr + idx] = prng();
+  }
+
+  // Copy the code into the HyperRAM, using itself.
+  const uint32_t prog_addr = 0x903u;  // Word offset within `hyperram_area` at which the code is placed.
+  hyperram_copy_block(&hyperram_area[prog_addr], (volatile uint32_t *)hyperram_copy_block, hyperram_copy_size);
+  hr_copy_fn_t hr_copy_ptr = (hr_copy_fn_t)get_hyperram_fn_ptr(HYPERRAM_ADDRESS + (prog_addr * 4));
+
+  // Pass 0 calls `hyperram_copy_block` directly; pass 1 calls the copy of it placed in the HyperRAM.
+  for (unsigned code_in_hr = 0; code_in_hr < 2; ++code_in_hr) {
+    // Do we need to clean all buffered read data out of the HyperRAM controller interface first?
+    if (clean_first) {
+      // Use 8KiB of data to ensure that 128 buffers of 64 bytes/burst can be cleaned out; this
+      // should be more than enough for any current/future implementation.
+      hyperram_cleaner(&hyperram_area[0x2000u], &hyperram_area[0x4000u]);
+    }
+
+    // Time the memory copy operation.
+    uint32_t start_time = get_mcycle();
+    if (code_in_hr) {
+      hr_copy_ptr(d, s, nbytes);
+    } else {
+      hyperram_copy_block(d, s, nbytes);
+    }
+
+    // Read back and check the destination buffer; this becomes a significant part of the memory
+    // traffic/execution time because presently there is no prefetching of read data, so we time
+    // it separately.
+    uint32_t cmp_start_time = get_mcycle();
+    failures += (0 != hyperram_cmp_block(d, s, nbytes));
+
+    if (report_times) {
+      // Report the duration of the copy and compare operations separately.
+      uint32_t end_time = get_mcycle();
+      log.print("    copy:  {:#6d} - cmp: {:#6d} - total: {:#6d}...", cmp_start_time - start_time,
+                end_time - cmp_start_time, end_time - start_time);
+    }
+  }
+
+  return failures;
+}
+
+// Memory-writing tests to exercise the write coalescing behaviour; returns the number of failures.
+// The four capabilities are expected to alias one HyperRAM area; `dst_addr` is a byte offset into it.
+// - tests all transfer sizes (byte, half-word, word and double-word)
+// - exercises coalescing of partial word writes to form complete words
+// - performs descending writes as well as ascending
+int write_tests(Capability<volatile uint8_t> hyperram_b_area, Capability<volatile uint16_t> hyperram_h_area,
+                Capability<volatile uint32_t> hyperram_w_area, Capability<volatile uint64_t> hyperram_d_area,
+                ds::xoroshiro::P64R32 &prng, Log &log, WriteTestType test_type, int iterations = 1,
+                uint32_t dst_addr = UINT32_MAX, uint32_t src_addr = UINT32_MAX) {
+  int failures = 0;
+
+  log.println("  Test type {}: {} iteration(s)", (int)test_type, iterations);
+
+  for (int iter = 0; iter < iterations; ++iter) {
+    // Choose a random start address and whether we are to intersperse reads.
+    bool intersperse_reads = ((prng() & 1u) != 0u);
+    uint32_t dst_off       = dst_addr;
+    uint32_t src_off       = src_addr;
+    // Choose source and destination addresses if not supplied.
+    if (UINT32_MAX == dst_off) {
+      dst_off = prng() & 0x3ffu;
+    }
+    if (UINT32_MAX == src_off) {
+      src_off = prng() & 0x3ffu;  // NOTE(review): never read afterwards, but the draw advances the PRNG.
+    }
+    // Control area must not be overlapped by any write operation.
+    const uint32_t ctrl_off = 0x800u;
+
+    // Choose a random byte value to store, and a constrained length.
+    uint32_t data = (uint8_t)prng();
+    data |= data << 8;
+    data |= data << 16;
+    // Write at least 32 bytes and up to 95 into a buffer of 128.
+    size_t len              = 0x20u + (prng() & 0x3fu);
+    const uint32_t init_len = 0x80u;
+
+    // Dword, word, hword and bytes aliases to the chosen target buffer.
+    volatile uint64_t *dstd = &hyperram_d_area[(dst_off + 7u) >> 3];
+    volatile uint32_t *dstw = &hyperram_w_area[(dst_off + 3u) >> 2];
+    volatile uint16_t *dsth = &hyperram_h_area[(dst_off + 1u) >> 1];
+    volatile uint8_t *dstb  = &hyperram_b_area[dst_off];
+    // End of target buffer; these addresses are exclusive, i.e. the pointers reference
+    // the first value _above_ the range to be modified.
+    volatile uint64_t *edstd = &hyperram_d_area[(dst_off + len + 7u) >> 3];
+    volatile uint32_t *edstw = &hyperram_w_area[(dst_off + len + 3u) >> 2];
+
+    // The different tests have different alignment requirements so determine the offset
+    // of the lowest address at which writing should be expected to occur.
+    uint32_t align_mask;
+    switch (test_type) {
+      // Half word-aligned test cases.
+      case WriteTestType_H:
+      case WriteTestType_HB:
+        align_mask = 1u;
+        break;
+      // Word-aligned test cases.
+      case WriteTestType_WR:
+      case WriteTestType_W:
+      case WriteTestType_WD:
+        align_mask = 3u;
+        break;
+      // Double word-aligned test cases.
+      case WriteTestType_C:
+      case WriteTestType_CD:
+        align_mask = 7u;
+        break;
+      // Byte-aligned test cases.
+      default:
+        align_mask = 0u;
+        break;
+    }
+
+    // Calculate the number of bytes that we have skipped to achieve natural alignment.
+    // This works for most cases...
+    uint32_t exp_start     = ((align_mask + 1u) - (dst_off & align_mask)) & align_mask;
+    uint32_t written_start = exp_start;
+    // ... but the descending transfers require special treatment.
+    if (test_type == WriteTestType_WD) {
+      exp_start     = ((dst_off + len + 3u) & ~3u) - (dst_off + len);
+      written_start = exp_start + len;
+    } else if (test_type == WriteTestType_CD) {
+      exp_start     = ((dst_off + len + 7u) & ~7u) - (dst_off + len);
+      written_start = exp_start + len;
+    }
+
+    // Initialise the control area.
+    hyperram_memset_w(&hyperram_w_area[ctrl_off >> 2], 0u, init_len);
+
+    // Initialise the target area, using data that we know should never be stored in the
+    // ensuing memory writing code. Thus we maximise the chance of detecting any bytes
+    // that are erroneously overwritten.
+    hyperram_memset_b(dstb, ~data, init_len);
+
+    uint32_t written_end = written_start;
+    uint32_t bytes_left  = len;
+    while (bytes_left > 0u) {
+      // Decide upon a byte offset from which to read; do this before the
+      // memory writing because the random number generation is time-consuming and we want
+      // to test the interaction of the read with the under-construction write burst.
+      uint32_t rd_off  = prng() % init_len;
+      size_t chunk_len = intersperse_reads ? (1u + (prng() % bytes_left)) : bytes_left;
+      if (bytes_left <= align_mask) {  // Ensure the loop terminates.
+        chunk_len = bytes_left;
+      } else if (chunk_len < bytes_left) {  // Non-final chunk?
+        // The next chunk must have natural alignment too; note that we could end up
+        // with a chunk size of zero; memset routines accept that.
+        chunk_len &= ~align_mask;
+      }
+      // Adjust our expectations of the area that should have been written after this chunk.
+      if (test_type == WriteTestType_WD || test_type == WriteTestType_CD) {
+        written_start -= chunk_len;
+      } else {
+        written_end += chunk_len;
+      }
+
+      // Perform the write operation.
+      switch (test_type) {
+        case WriteTestType_B:
+          hyperram_memset_b(dstb, data, chunk_len);
+          break;
+        case WriteTestType_H:
+          hyperram_memset_h(dsth, data, chunk_len);
+          break;
+        // HB test writes two bytes and a half-word, so it requires half-word alignment.
+        case WriteTestType_HB:
+          hyperram_memset_hb(dsth, data, chunk_len);
+          break;
+        case WriteTestType_W:
+          hyperram_memset_w(dstw, data, chunk_len);
+          break;
+        case WriteTestType_WR:
+          hyperram_memset_wr(dstw, data, chunk_len);
+          break;
+        case WriteTestType_WD:
+          hyperram_memset_wd(edstw, data, chunk_len);
+          break;
+        case WriteTestType_C:
+          hyperram_memset_c(dstd, data, chunk_len);
+          break;
+        default:
+          assert(WriteTestType_CD == test_type);
+          hyperram_memset_cd(edstd, data, chunk_len);
+          break;
+      }
+      // Interject a read at this point, to test its interaction with the coalescing of
+      // writes into bursts.
+      if (intersperse_reads) {
+        volatile uint8_t rd_data = hyperram_b_area[dst_off + rd_off];
+        // This is the default value with which the target area was initialised.
+        uint8_t exp_data = (uint8_t)~data;
+        // Does the byte that we're checking lie within the range that should have been overwritten
+        // at this point in the test?
+        if (rd_off >= written_start && rd_off < written_end) exp_data = ~exp_data;
+        failures += (rd_data != exp_data);
+      }
+
+      // Advance the destination pointers for the next chunk, maintaining natural alignment
+      // in case this is not the final chunk of the transfer.
+      dstb += chunk_len;
+      dsth += chunk_len >> 1;
+      dstw += chunk_len >> 2;
+      dstd += chunk_len >> 3;
+      edstw -= chunk_len >> 2;
+      edstd -= chunk_len >> 3;
+      bytes_left -= chunk_len;
+    }
+
+    // Read and check the control area; this primarily serves to ensure that we are checking
+    // what was written to the HyperRAM itself, and not merely the contents of the internal
+    // read buffer within the controller interface.
+    for (uint32_t i = 0u; i < init_len / 4; i++) {
+      failures += (hyperram_w_area[i + (ctrl_off >> 2)] != 0u);
+    }
+
+    // Read and check the entire target area, i.e. all `init_len` initialised bytes and not just `len`.
+    for (uint32_t i = 0u; i < init_len; i++) {
+      // This is the default value with which the target area was initialised.
+      uint8_t exp_data = (uint8_t)~data;
+      // Does the byte that we're checking lie within the range that should have been overwritten?
+      if (i >= exp_start && (i - exp_start) < len) exp_data = ~exp_data;
+      failures += (hyperram_b_area[i + dst_off] != exp_data);
+    }
+  }
+
   return failures;
 }
 
@@ -251,6 +638,7 @@ extern "C" [[noreturn]] void entry_point(void *rwRoot) {
   ds::xoroshiro::P64R32 prng;
   prng.set_state(0xDEADBEEF, 0xBAADCAFE);
 
+  // Default is word-based accesses, which is sufficient for most tests.
   Capability<volatile uint32_t> hyperram_area = root.cast<volatile uint32_t>();
   hyperram_area.address()                     = HYPERRAM_ADDRESS;
   hyperram_area.bounds()                      = HYPERRAM_BOUNDS;
@@ -259,34 +647,109 @@ extern "C" [[noreturn]] void entry_point(void *rwRoot) {
   hyperram_cap_area.address()                                 = HYPERRAM_ADDRESS;
   hyperram_cap_area.bounds()                                  = HYPERRAM_BOUNDS;
 
-  while (true) {
-    int failures = 0;
+  // We also want byte, hword and dword access for some tests.
+  Capability<volatile uint8_t> hyperram_b_area  = root.cast<volatile uint8_t>();
+  hyperram_b_area.address()                     = HYPERRAM_ADDRESS;
+  hyperram_b_area.bounds()                      = HYPERRAM_BOUNDS;
+  Capability<volatile uint16_t> hyperram_h_area = root.cast<volatile uint16_t>();
+  hyperram_h_area.address()                     = HYPERRAM_ADDRESS;
+  hyperram_h_area.bounds()                      = HYPERRAM_BOUNDS;
+  Capability<volatile uint64_t> hyperram_d_area = root.cast<volatile uint64_t>();
+  hyperram_d_area.address()                     = HYPERRAM_ADDRESS;
+  hyperram_d_area.bounds()                      = HYPERRAM_BOUNDS;
+
+  // Run indefinitely, soak testing until we observe one or more failures.
+  int failures = 0;
+  while (!failures) {
+    const uint32_t burst_len = 32u;
+    const bool report_times  = true;
+
     log.print("Running RND cap test...");
     failures += rand_cap_test(hyperram_area, hyperram_cap_area, prng, HyperramSize / 4);
     write_test_result(log, failures);
 
     log.print("Running RND data test...");
-    failures = rand_data_test_full(hyperram_area, prng);
+    failures += rand_data_test_full(hyperram_area, prng);
     write_test_result(log, failures);
 
     log.print("Running RND data & address test...");
-    failures = rand_data_addr_test(hyperram_area, prng, HyperramSize / 4);
+    failures += rand_data_addr_test(hyperram_area, prng, HyperramSize / 4);
     write_test_result(log, failures);
 
     log.print("Running 0101 stripe test...");
-    failures = stripe_test(hyperram_area, 0x55555555);
+    failures += stripe_test(hyperram_area, 0x55555555, log, report_times);
     write_test_result(log, failures);
 
     log.print("Running 1001 stripe test...");
-    failures = stripe_test(hyperram_area, 0x99999999);
+    failures += stripe_test(hyperram_area, 0x99999999, log, report_times);
     write_test_result(log, failures);
 
     log.print("Running 0000_1111 stripe test...");
-    failures = stripe_test(hyperram_area, 0x0F0F0F0F);
+    failures += stripe_test(hyperram_area, 0x0F0F0F0F, log, report_times);
     write_test_result(log, failures);
 
     log.print("Running Execution test...");
-    failures = execute_test(hyperram_area, prng, HyperramSize / 4);
+    failures += execute_test(hyperram_area, prng, HyperramSize / 4, log, report_times);
+    write_test_result(log, failures);
+
+    // Performance test copies a significant chunk of data from one buffer to another.
+    icache_invalidate();
+    // Ensure that the icache is enabled.
+    icache_enabled_set(true);
+    log.println("Running performance test with icache enabled...");
+    failures += perf_burst_test(hyperram_area, log, prng, 0x1000u);
+    write_test_result(log, failures);
+
+    // Executing with the icache disabled places more strain on the HyperRAM controller because
+    // it will receive many more instruction fetches.
+    icache_enabled_set(false);
+    icache_invalidate();
+    log.println("Running performance test with icache disabled...");
+    failures += perf_burst_test(hyperram_area, log, prng, 0x1000u);
+    write_test_result(log, failures);
+    // Reinstate the normal icache operation.
+    icache_enabled_set(true);
+
+    // Run the same burst performance tests again, with all possible alignments, but also checking
+    // the completed destination buffer against the source.
+    log.print("Running alignment tests ");
+    bool clean_first = false;
+    do {
+      clean_first = !clean_first;
+      log.println(clean_first ? "with cleaning..." : "without cleaning...");
+      for (uint32_t src_addr = 0x1000u; src_addr < 0x1000u + burst_len; src_addr += 4u) {
+        for (uint32_t dst_addr = 0u; dst_addr < burst_len; dst_addr += 4u) {
+          // We may want to investigate the impact of alignment upon performance; this is useful
+          // in development/analysis...
+          const bool report_times = false;  // ... but not required for regression testing.
+          if (report_times) {
+            log.print("  dst: {:#04x} src: {:#04x}...", dst_addr, src_addr & (burst_len - 1u));
+          }
+          failures += perf_burst_test(hyperram_area, log, prng, 0x1000u, clean_first, report_times, dst_addr, src_addr);
+          if (report_times) {
+            write_test_result(log, failures);
+          }
+        }
+      }
+    } while (clean_first);
+    write_test_result(log, failures);
+
+    // Buffering test checks the interaction of write traffic with buffered read data.
+    log.print("Running buffering test...");
+    failures += buffering_test(hyperram_area, prng, 0x1000u);
+    write_test_result(log, failures);
+
+    // Write tests exercise the write coalescing logic of the HyperRAM controller interface.
+    log.println("Running write tests...");
+    for (int test_type = WriteTestType_B; test_type <= WriteTestType_CD; ++test_type) {
+      failures += write_tests(hyperram_b_area, hyperram_h_area, hyperram_area, hyperram_d_area, prng, log,
+                              (WriteTestType)test_type, 0x400u);
+    }
+    log.print("  result...");
     write_test_result(log, failures);
   }
+
+  // Report test failure.
+  log.println("Test(s) failed: {}", failures);
+  while (true) asm volatile("wfi");
 }
diff --git a/sw/cheri/common/CMakeLists.txt b/sw/cheri/common/CMakeLists.txt
index 3068a2846..1988c2b96 100644
--- a/sw/cheri/common/CMakeLists.txt
+++ b/sw/cheri/common/CMakeLists.txt
@@ -7,5 +7,5 @@ add_library(${NAME} OBJECT hyperram_exec_test.S boot.S default-handlers.cc)
 target_include_directories(${NAME} PRIVATE ${CHERIOT_SDK_INCLUDES})
 
 set(NAME block_tests)
-add_library(${NAME} OBJECT block_tests.cc)
+add_library(${NAME} OBJECT hyperram_perf_test.S block_tests.cc)
 target_include_directories(${NAME} PRIVATE ${CHERIOT_SDK_INCLUDES})
diff --git a/sw/cheri/common/hyperram_perf_test.S b/sw/cheri/common/hyperram_perf_test.S
new file mode 100644
index 000000000..8f125d445
--- /dev/null
+++ b/sw/cheri/common/hyperram_perf_test.S
@@ -0,0 +1,211 @@
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+// This is a memory copying function intended to exercise both the instruction memory system
+// and the load/store data performance by performing a number of back-to-back transactions.
+//
+// Any permutation of the following may be in the HyperRAM:
+// (i)   source data buffer
+// (ii)  destination data buffer
+// (iii) instruction sequence
+//
+// To ensure that data loads and stores are back-to-back, and to facilitate copying of this
+// code into HyperRAM for execution, we employ handwritten assembler.
+//
+// Functions are aligned to 32-byte boundaries on the expectation that this is a multiple of
+// the burst length and thus the code starts at the beginning of a read buffer. This just makes
+// the performance measurements more robust against changes elsewhere in the code. All
+// alignments of source and destination buffer are checked in other tests.
+
+  .section .text, "ax", @progbits
+
+  .option norvc
+  .globl hyperram_copy_block
+  .p2align 5
+  .type hyperram_copy_block,@function
+
+// Controlled, performant copying routine that is position-independent and may be copied into
+// the HyperRAM for in-place execution.
+//
+// ca0 -> destination (word-aligned)
+// ca1 -> source (word-aligned)
+// a2  =  number of bytes to copy
+// return ca0 -> beyond the last whole word written; a final 1-3 byte tail is not stepped over.
+hyperram_copy_block:
+  srl   a3, a2, 5   // a3 = number of whole 32-byte chunks.
+  andi  a2, a2, 31  // 0-31 bytes remaining after 32-byte loop.
+  sll   a3, a3, 5   // a3 = number of bytes handled by the 32-byte loop.
+  add   a3, a1, a3  // a3 = source address at which the 32-byte loop stops.
+  bgeu  a1, a3, copy32fix  // No whole 32-byte chunk; skip the loop.
+
+  // 32 bytes/iteration.
+copy32:
+  clw   a5,   (ca1)
+  clw   t0,  4(ca1)
+  clw   t1,  8(ca1)
+  clw   t2, 12(ca1)
+  csw   a5,   (ca0)
+  csw   t0,  4(ca0)
+  csw   t1,  8(ca0)
+  csw   t2, 12(ca0)
+  clw   a5, 16(ca1)
+  clw   t0, 20(ca1)
+  clw   t1, 24(ca1)
+  clw   t2, 28(ca1)
+  cincoffset ca1, ca1, 32
+  csw   a5, 16(ca0)
+  csw   t0, 20(ca0)
+  csw   t1, 24(ca0)
+  csw   t2, 28(ca0)
+  cincoffset ca0, ca0, 32
+  bltu  a1, a3, copy32
+copy32fix:
+  srl   a3, a2, 2  // a3 = number of whole words among the remaining 0-31 bytes.
+  andi  a2, a2, 3  // 0-3 bytes remaining after 4-byte loop.
+  sll   a3, a3, 2  // a3 = number of bytes handled by the 4-byte loop.
+  add   a3, a1, a3 // a3 = source address at which the 4-byte loop stops.
+  bgeu  a1, a3, copy4fix  // No whole word left; skip the loop.
+
+  // 4 bytes/iteration.
+copy4:
+  clw   a5, (ca1)
+  cincoffset ca1, ca1, 4
+  csw   a5, (ca0)
+  cincoffset ca0, ca0, 4
+  bltu  a1, a3, copy4
+copy4fix:
+  beqz  a2, copyret
+  li    a3, 2  // This constant allows conditional branching on 1, 2 or > 2
+
+  // 1 byte/iteration; 0-3 bytes left, so fully unrolled.
+copy1:
+  clbu  a5,  (ca1)
+  csb   a5,  (ca0)
+  bltu  a2, a3, copyret  // Only 1 byte remained.
+  clbu  a5, 1(ca1)
+  csb   a5, 1(ca0)
+  beq   a2, a3, copyret  // Exactly 2 bytes remained.
+  clbu  a5, 2(ca1)
+  csb   a5, 2(ca0)
+copyret:
+  cret
+hyperram_copy_end:
+
+// Size of the copying code, in bytes; the copying routine is itself copied into HyperRAM in
+// order to exercise instruction and load/store traffic simultaneously.
+  .globl hyperram_copy_size
+hyperram_copy_size:
+  .long hyperram_copy_end-hyperram_copy_block
+
+
+  .globl hyperram_cmp_block
+  .p2align 5
+  .type hyperram_cmp_block,@function
+
+// Compare the contents of two word-aligned buffers; returns a0 = 0 if equal, 1 if they differ.
+//
+// ca0 -> first buffer (word-aligned)
+// ca1 -> second buffer (word-aligned)
+// a2  =  number of bytes to compare
+hyperram_cmp_block:
+  cincoffset csp, csp, -16
+  csc   cs0,   (csp)  // Preserve s0/s1; both are used as scratch registers below.
+  csc   cs1,  8(csp)
+  andi  a3, a2, 15  // a3 = 0-15 bytes remaining after the 16-byte loop.
+  xor   a2, a2, a3  // a2 = byte count rounded down to a multiple of 16.
+  add   a2, a1, a2  // a2 = second-buffer address at which the 16-byte loop stops.
+cmp16:
+  bgeu a1, a2, cmp16done
+  // This implementation is rather specific to the current buffering scheme
+  // of the HyperRAM controller and is designed to minimise thrashing on the
+  // LSU read buffer; read 16 bytes of data from one buffer into the register
+  // file and only then start reading from the other and performing comparisons.
+  //
+  // Note: We could replace this with a more conventional implementation now
+  // and compare a larger number of bytes per iteration.
+  clw   a4,   (ca1)
+  clw   a5,  4(ca1)
+  clw   t0,  8(ca1)
+  clw   t1, 12(ca1)  // End of 16 bytes from second buffer.
+  clw   s0,   (ca0)
+  clw   s1,  4(ca0)
+  cincoffset ca1, ca1, 16
+  bne   a4, s0, cmp_mismatch
+  clw   a4,  8(ca0)
+  bne   a5, s1, cmp_mismatch
+  clw   a5, 12(ca0)
+  cincoffset ca0, ca0, 16
+  bne   t0, a4, cmp_mismatch
+  beq   t1, a5, cmp16
+cmp_mismatch:
+  li    a0, 1
+  j     cmp_leave
+cmp16done:
+  // We're not too concerned with performance here, so just do the remainder
+  // as individual bytes, but unroll and order instructions to minimise the
+  // loop overhead and limit load-use stalls.
+  //
+  // a3 = number of bytes remaining.
+  beqz  a3, cmp_match
+  li    t0, 2 // This constant allows conditional branch on 1, 2 or > 2
+cmp1:
+  clbu  a4, (ca1)
+  clbu  s0, (ca0)
+  bltu  a3, t0, cmp_last0  // 1 byte left: byte 0 (a4/s0) is still to be compared.
+  clbu  a5, 1(ca1)  // Safe to read the next byte pair.
+  clbu  s1, 1(ca0)
+  bne   a4, s0, cmp_mismatch
+  beq   a3, t0, cmp_last1  // 2 bytes left: byte 1 (a5/s1) is still to be compared.
+  bne   a5, s1, cmp_mismatch
+  clbu  a4, 2(ca1)
+  clbu  s0, 2(ca0)
+  cincoffset ca1, ca1, 3
+  cincoffset ca0, ca0, 3
+  bne   a4, s0, cmp_mismatch
+  addi  a3, a3, -3
+  bnez  a3, cmp1
+  j     cmp_match
+cmp_last0:
+  bne   a4, s0, cmp_mismatch  // Final byte is byte 0; a5/s1 were not loaded on this path.
+  j     cmp_match
+cmp_last1:
+  bne   a5, s1, cmp_mismatch  // Final byte is byte 1; byte 0 has already been compared.
+cmp_match:
+  li    a0, 0
+cmp_leave:
+  clc   cs0,   (csp)
+  clc   cs1,  8(csp)
+  cincoffset csp, csp, 16
+  cret
+
+
+  .globl hyperram_cleaner
+  .p2align 5
+  .type hyperram_cleaner,@function
+
+// Utility function to clean any buffered read data out of the HyperRAM controller interface.
+//
+// entry  ca0 -> start of an unused memory area in the HyperRAM
+//        ca1 -> end of the unused memory area, exclusive
+hyperram_cleaner:
+  bgeu  a0, a1, clean_leave  // Stop once the read address reaches the end of the area.
+  clw   a2, (ca0)            // Read one word; the loaded value in a2 is not used.
+  cincoffset ca0, ca0, 64  // Should be larger than or equal to the burst length
+  j     hyperram_cleaner     // Back to the bounds check for the next read.
+clean_leave:
+  cret
+
+
+  .globl icache_enabled_set
+  .p2align 5
+  .type icache_enabled_set,@function
+
+// Set the enabled/disabled state of the icache.
+// NOTE(review): 0x7c0 is presumably the Ibex `cpuctrlsts` CSR with bit 0 = `icache_enable`; confirm.
+// a0 = 1 to enable, 0 to disable.
+icache_enabled_set:
+  andi  a0, a0, 1    // Use only bit 0 of the argument.
+  csrw  0x7c0, a0    // Whole-CSR write: every other bit of CSR 0x7c0 is written as zero.
+  cret
+
diff --git a/sw/cheri/common/hyperram_perf_test.h b/sw/cheri/common/hyperram_perf_test.h
new file mode 100644
index 000000000..1ef8b48b8
--- /dev/null
+++ b/sw/cheri/common/hyperram_perf_test.h
@@ -0,0 +1,39 @@
+/**
+ * Copyright lowRISC contributors.
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+#include <stdint.h>
+
+// Enable (true) or disable (false) the instruction cache; implemented in hyperram_perf_test.S.
+extern "C" void icache_enabled_set(bool enabled);
+// Helper code to ensure that there is no stale code in the instruction cache after writing
+// new code into memory; the "memory" clobber also makes it a compiler barrier.
+//
+// void icache_invalidate(void);
+#define icache_invalidate() asm volatile("fence.i" : : : "memory");
+
+// Size in bytes of the `hyperram_copy_block` code. This function is used to copy itself into
+// the HyperRAM memory before execution from there, so the size of the code is required.
+extern uint32_t hyperram_copy_size;
+
+// Memory-copying routine in the manner of `memcpy` but with a word alignment constraint on the
+// base addresses of the source and destination buffers, and handwritten in a manner that exercises
+// the burst read/write behaviour of the HyperRAM controller interface and permits the code to be
+// copied into the HyperRAM itself for execution.
+extern "C" void hyperram_copy_block(volatile uint32_t *d, const volatile uint32_t *s, size_t nbytes);
+
+// Utility function to clean any buffered read data out of the HyperRAM controller interface.
+// This may be important for performance measurements or for ensuring that read requests actually
+// access the HyperRAM device rather than returning buffered data.
+//
+// For the current implementation it is sufficient to perform just 4 reads that are separated by
+// 32 bytes, but for future-proofing this routine should be called with at least
+// 128 x 64 bytes = 8192 bytes of data.
+extern "C" void hyperram_cleaner(const volatile void *start, const volatile void *end);
+
+// Memory comparison function in the manner of `memcmp` but with the same specific requirements
+// and constraints as `hyperram_copy_block`; returns 0 if the buffers match, 1 otherwise.
+extern "C" int hyperram_cmp_block(const volatile uint32_t *d, const volatile uint32_t *s, size_t nbytes);
diff --git a/sw/cheri/common/sonata-devices.hh b/sw/cheri/common/sonata-devices.hh
index 7e68f700c..828dd5490 100644
--- a/sw/cheri/common/sonata-devices.hh
+++ b/sw/cheri/common/sonata-devices.hh
@@ -83,7 +83,7 @@ using PinmuxPtrs = std::pair<PinSinksPtr, BlockSinksPtr>;
   return hyperram;
 }
 
-[[maybe_unnused]] static TimerPtr timer_ptr(CapRoot root) {
+[[maybe_unused]] static TimerPtr timer_ptr(CapRoot root) {
   CHERI::Capability<volatile uint32_t> timer = root.cast<volatile uint32_t>();
   timer.address()                            = CLINT_ADDRESS;
   timer.bounds()                             = CLINT_BOUNDS;
diff --git a/sw/cheri/tests/hyperram_exec_test.S b/sw/cheri/tests/hyperram_exec_test.S
deleted file mode 100644
index dbff7f781..000000000
--- a/sw/cheri/tests/hyperram_exec_test.S
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright lowRISC contributors.
-# Licensed under the Apache License, Version 2.0, see LICENSE for details.
-# SPDX-License-Identifier: Apache-2.0
-
-.include "assembly-helpers.s"
-
-	.section .text, "ax", @progbits
-
-	.globl get_hyperram_fn_ptr
-	.p2align 2
-    .type get_hyperram_fn_ptr,@function
-get_hyperram_fn_ptr:
-  auipcc ct0, 0
-  csetaddr ca0, ct0, a0
-  cret
diff --git a/sw/cheri/tests/hyperram_tests.hh b/sw/cheri/tests/hyperram_tests.hh
index a2f87f575..930a21461 100644
--- a/sw/cheri/tests/hyperram_tests.hh
+++ b/sw/cheri/tests/hyperram_tests.hh
@@ -1,5 +1,5 @@
-
 // Copyright lowRISC Contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 #pragma once
 
@@ -12,6 +12,7 @@
 #include <platform-uart.hh>
 
 #include "../common/console.hh"
+#include "../common/hyperram_perf_test.h"
 #include "../common/sonata-devices.hh"
 #include "../common/uart-utils.hh"
 #include "test_runner.hh"
@@ -32,7 +33,7 @@ using namespace CHERI;
  * It can be overwride with a compilation flag
  */
 #ifndef TEST_COVERAGE_AREA
-// Test only 1% of the total memory to be fast enough for varilator.
+// Test only 1% of the total memory to be fast enough for Verilator.
 #define TEST_COVERAGE_AREA 1
 #endif
 _Static_assert(TEST_COVERAGE_AREA <= 100, "TEST_COVERAGE_AREA Should be less than 100");
@@ -191,7 +192,7 @@ int stripe_test(Capability<volatile uint32_t> hyperram_area, uint32_t initial_va
 
 typedef void *(*test_fn_t)(uint32_t *);
 /*
- * Gets a function pointer to an address in hyperram, expectes to be called with
+ * Gets a function pointer to an address in hyperram, expects to be called with
  * a PC capability that provides execution in hyperram. 'addr' is relative to
  * the base of hyperram.
  */
@@ -217,6 +218,11 @@ void write_prog(Capability<volatile uint32_t> hyperram_area, uint32_t addr) {
   hyperram_area[addr + 4] = 0x8082;
 
   asm volatile("fence.i" : : : "memory");
+
+  // By writing the first word of the code again we can ensure that the code is
+  // flushed out to the HyperRAM and will thus be coherent with instruction
+  // fetching when the code is executed.
+  hyperram_area[addr] = hyperram_area[addr];
 }
 
 /*
@@ -257,6 +263,43 @@ int execute_test(Capability<volatile uint32_t> hyperram_area, ds::xoroshiro::P64
   return failures;
 }
 
+// Performance test to exercise burst transfers from/to the HyperRAM; returns the failure count.
+int perf_burst_test(Capability<volatile uint32_t> hyperram_area, ds::xoroshiro::P64R32 &prng, size_t nbytes) {
+  typedef volatile void *(*hr_copy_fn_t)(volatile uint32_t *, const volatile uint32_t *, size_t);
+  int failures = 0;
+
+  // Randomised word offsets.
+  uint32_t dst_addr = prng() & 0x3ffu;
+  uint32_t src_addr = 0x1000u - dst_addr;
+
+  volatile uint32_t *d = &hyperram_area[dst_addr];
+  volatile uint32_t *s = &hyperram_area[src_addr];
+
+  // Complete the source buffer with complete words; it doesn't matter that we may write a few extra bytes.
+  const uint32_t whole_words = nbytes / 4u;
+  for (unsigned idx = 0u; idx <= whole_words; idx++) {
+    hyperram_area[src_addr + idx] = prng();
+  }
+
+  // Copy the code into the HyperRAM, using itself.
+  const uint32_t prog_addr = 0x903u;
+  hyperram_copy_block(&hyperram_area[prog_addr], (volatile uint32_t *)hyperram_copy_block, hyperram_copy_size);
+  hr_copy_fn_t hr_copy_ptr = (hr_copy_fn_t)get_hyperram_fn_ptr(HYPERRAM_ADDRESS + (prog_addr * 4));
+  asm volatile("fence.i" : : : "memory");  // New code was written: drop any stale fetched instructions.
+  for (unsigned code_in_hr = 0; code_in_hr < 2; ++code_in_hr) {
+    if (code_in_hr) {
+      hr_copy_ptr(d, s, nbytes);  // Second pass: run the copy of the routine placed in HyperRAM.
+    } else {
+      hyperram_copy_block(d, s, nbytes);  // First pass: run the routine from its original location.
+    }
+
+    // Read back and check the destination buffer.
+    failures += (0 != hyperram_cmp_block(d, s, nbytes));
+  }
+
+  return failures;
+}
+
 void hyperram_tests(CapRoot root, Log &log) {
   auto hyperram_area = hyperram_ptr(root);
 
@@ -310,6 +353,11 @@ void hyperram_tests(CapRoot root, Log &log) {
     test_failed |= (failures > 0);
     write_test_result(log, failures);
 
+    log.print("  Running Performance test...");
+    failures = perf_burst_test(hyperram_area, prng, 0x1000u);
+    test_failed |= (failures > 0);
+    write_test_result(log, failures);
+
     check_result(log, !test_failed);
   }
 }