diff --git a/sw/cheri/checks/CMakeLists.txt b/sw/cheri/checks/CMakeLists.txt index fe4366bc4..a0de9367a 100644 --- a/sw/cheri/checks/CMakeLists.txt +++ b/sw/cheri/checks/CMakeLists.txt @@ -51,9 +51,9 @@ add_custom_command( install(TARGETS ${NAME}) set(NAME hyperram_test) -add_executable(${NAME} hyperram_test.cc) +add_executable(${NAME} hyperram_test.cc hyperram_memset.S) target_include_directories(${NAME} PRIVATE ${CHERIOT_SDK_INCLUDES} "${reisfmt_SOURCE_DIR}/include") -target_link_libraries(${NAME} common) +target_link_libraries(${NAME} common block_tests) add_custom_command( TARGET ${NAME} POST_BUILD diff --git a/sw/cheri/checks/hyperram_memset.S b/sw/cheri/checks/hyperram_memset.S new file mode 100644 index 000000000..f64b8a1f8 --- /dev/null +++ b/sw/cheri/checks/hyperram_memset.S @@ -0,0 +1,251 @@ +# Copyright lowRISC contributors. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +// A set of memory initialisation functions using different access sizes, to check that +// the write transactions are properly coalesced into write bursts to the HyperBus +// Memory Controller. +// +// Each routine is expected to be writing a single, defined byte to each address that is +// modified. By initialising the entirety of a target buffer to a different value first, +// the set of modified addresses may be ascertained. + + .section .text, "ax", @progbits + + .option norvc + +// Byte-based memory writing. 
+//
+// entry  ca0 -> byte-aligned destination buffer
+//        a1  = byte to be stored
+//        a2  = number of bytes
+// exit   -
+  .globl hyperram_memset_b
+  .p2align 5
+hyperram_memset_b:
+  addi a2, a2, -8                // Bias the count; a negative result means fewer than 8 bytes remain.
+  bltz a2, memset_b_8fix
+memset_b_8:                      // 8 bytes/iteration, issued as individual byte stores.
+  csb a1, (ca0)
+  csb a1, 1(ca0)
+  csb a1, 2(ca0)
+  csb a1, 3(ca0)
+  csb a1, 4(ca0)
+  csb a1, 5(ca0)
+  csb a1, 6(ca0)
+  csb a1, 7(ca0)
+  cincoffset ca0, ca0, 8
+  addi a2, a2, -8
+  bgez a2, memset_b_8
+memset_b_8fix:
+  addi a2, a2, 8                 // Undo the bias; a2 = 0-7 bytes still to be written.
+  bgtz a2, memset_b_tail
+  cret
+
+// Just complete the request using byte stores; this is shared among all _ascending_ routines.
+// Performance is not very important, but it would be better to keep the byte writes temporally
+// close together to try to provoke races.
+//
+//        ca0 -> byte-aligned pointer into destination buffer
+//        a1  = byte to be stored
+//        a2  = non-zero count of bytes remaining
+memset_b_tail:
+  add a2, a0, a2                 // a2 = address just beyond the final byte to be written.
+memset_b_1:
+  csb a1, (ca0)
+  cincoffset ca0, ca0, 1
+  bltu a0, a2, memset_b_1
+  cret
+
+// Descending transfer, pre-decrementing address; shared among the two _descending_ routines.
+//
+//        ca0 -> just beyond next address to be written, decrement before use.
+//        a1  = byte to be stored
+//        a2  = non-zero count of bytes remaining
+memset_b_desc_tail:
+  sub a2, a0, a2                 // a2 = lowest address that is to be written.
+memset_b_desc_1:
+  csb a1, -1(ca0)
+  cincoffset ca0, ca0, -1
+  bgtu a0, a2, memset_b_desc_1
+  cret
+
+// Byte and half word-based memory writing; each group of 4 bytes is written using
+// 2 byte stores and a half-word store.
+//
+// entry  ca0 -> half-word aligned destination buffer (the only wider store is a `csh` at offset 2)
+//        a1  = byte to be stored, replicated throughout word
+//        a2  = number of bytes
+// exit   -
+  .globl hyperram_memset_hb
+  .p2align 5
+hyperram_memset_hb:
+  addi a2, a2, -4                // Bias the count; a negative result means fewer than 4 bytes remain.
+  bltz a2, memset_hb_4fix
+memset_hb_4:
+  csb a1, (ca0)
+  csb a1, 1(ca0)
+  csh a1, 2(ca0)
+  cincoffset ca0, ca0, 4
+  addi a2, a2, -4
+  bgez a2, memset_hb_4
+memset_hb_4fix:
+  addi a2, a2, 4                 // Undo the bias; a2 = 0-3 bytes left for the shared byte tail.
+  bgtz a2, memset_b_tail
+  cret
+
+// Half word-based memory writing.
+// +// entry ca0 -> half-word aligned destination buffer +// a1 = byte to be stored, replicated throughout half-word +// a2 = number of bytes +// exit - + .globl hyperram_memset_h + .p2align 5 +hyperram_memset_h: + addi a2, a2, -16 + bltz a2, memset_h_16fix +memset_h_16: + csh a1, (ca0) + csh a1, 2(ca0) + csh a1, 4(ca0) + csh a1, 6(ca0) + csh a1, 8(ca0) + csh a1, 10(ca0) + csh a1, 12(ca0) + csh a1, 14(ca0) + cincoffset ca0, ca0, 16 + addi a2, a2, -16 + bgez a2, memset_h_16 +memset_h_16fix: + addi a2, a2, 16 + bgtz a2, memset_b_tail + cret + +// Word-based memory writing. +// +// entry ca0 -> word-aligned destination buffer +// a1 = byte to be stored, replicated throughout word +// a2 = number of bytes + .globl hyperram_memset_w + .p2align 5 +hyperram_memset_w: + addi a2, a2, -32 + bltz a2, memset_w_32fix +memset_w_32: + csw a1, (ca0) + csw a1, 4(ca0) + csw a1, 8(ca0) + csw a1, 12(ca0) + csw a1, 16(ca0) + csw a1, 20(ca0) + csw a1, 24(ca0) + csw a1, 28(ca0) + cincoffset ca0, ca0, 32 + addi a2, a2, -32 + bgez a2, memset_w_32 +memset_w_32fix: + addi a2, a2, 32 + bgtz a2, memset_b_tail + cret + +// Repeated words memory writing; the performance of this code is of no consequence. +// It is concerned purely with ensuring the correctness of the written data. +// +// entry ca0 -> word-aligned destination buffer +// a1 = byte to be stored, replicated throughout word +// a2 = number of bytes + .globl hyperram_memset_wr + .p2align 5 +hyperram_memset_wr: + addi a2, a2, -4 + bltz a2, memset_wr_4fix + xori a3, a1, -1 +memset_wr_4: + csw a3, (ca0) // This word should be overwritten... + csw a1, (ca0) // ...by the original value. + cincoffset ca0, ca0, 4 + addi a2, a2, -4 + bgez a2, memset_wr_4 +memset_wr_4fix: + addi a2, a2, 4 + bgtz a2, memset_b_tail + cret + +// Word-based memory writing to descending addresses. 
+// +// entry ca0 -> word-aligned end of destination buffer, exclusive +// a1 = byte to be stored, replicated throughout word +// a2 = number of bytes + .globl hyperram_memset_wd + .p2align 5 +hyperram_memset_wd: + addi a2, a2, -8 + bltz a2, memset_wd_8fix +memset_wd_8: + csw a1, -4(ca0) + csw a1, -8(ca0) + cincoffset ca0, ca0, -8 + addi a2, a2, -8 + bgez a2, memset_wd_8 +memset_wd_8fix: + addi a2, a2, 8 + bgtz a2, memset_b_desc_tail + cret + +// Capability stores to ascending addresses. +// +// These are issued as two back-to-back word writes and we're just using +// this as a way to issue 64-bit writes rather than trying to create +// sensible/valid capabilities. +// +// entry ca0 -> double-word aligned destination buffer +// a1 = byte to be stored, replicated throughout word +// a2 = number of bytes + .globl hyperram_memset_c + .p2align 5 +hyperram_memset_c: + // Replicate the data word to yield a double word. + cincoffset csp, csp, -8 + csw a1, (csp) + csw a1, 4(csp) + clc ca1,(csp) + cincoffset csp, csp, 8 + addi a2, a2, -8 + bltz a2, memset_c_8fix +memset_c_8: + csc ca1, (ca0) + cincoffset ca0, ca0, 8 + addi a2, a2, -8 + bgez a2, memset_c_8 +memset_c_8fix: + addi a2, a2, 8 + bgtz a2, memset_b_tail + cret + +// Capability stores to descending addresses. See above. +// +// entry ca0 -> double-word aligned end of destination buffer, exclusive +// a1 = byte to be stored, replicated throughout word +// a2 = number of bytes + .globl hyperram_memset_cd + .p2align 5 +hyperram_memset_cd: + // Replicate the data word to yield a double word. 
+ cincoffset csp, csp, -8 + csw a1, (csp) + csw a1, 4(csp) + clc ca1, (csp) + cincoffset csp, csp, 8 + addi a2, a2, -8 + bltz a2, memset_cd_8fix +memset_cd_8: + csc ca1, -8(ca0) + cincoffset ca0, ca0, -8 + addi a2, a2, -8 + bgez a2, memset_cd_8 +memset_cd_8fix: + addi a2, a2, 8 + bgtz a2, memset_b_desc_tail + cret + diff --git a/sw/cheri/checks/hyperram_memset.h b/sw/cheri/checks/hyperram_memset.h new file mode 100644 index 000000000..5063620be --- /dev/null +++ b/sw/cheri/checks/hyperram_memset.h @@ -0,0 +1,32 @@ +/** + * Copyright lowRISC contributors. + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include + +enum WriteTestType { + WriteTestType_B = 0, + WriteTestType_H, + WriteTestType_HB, + WriteTestType_W, + WriteTestType_WR, + WriteTestType_WD, + WriteTestType_C, + WriteTestType_CD +}; + +// Memory-writing routines; these all mimic the ISO C function `memset` except that they have +// a constraint of 'natural alignment' upon the buffer address and have very specific +// implementations to ensure defined traffic for testing the write coalescing/buffering logic +// of the HyperRAM controller interface. 
+extern "C" void hyperram_memset_b(volatile uint8_t *dst, int c, size_t n); +extern "C" void hyperram_memset_h(volatile uint16_t *dst, int c, size_t n); +extern "C" void hyperram_memset_hb(volatile uint16_t *dst, int c, size_t n); +extern "C" void hyperram_memset_w(volatile uint32_t *dst, int c, size_t n); +extern "C" void hyperram_memset_wr(volatile uint32_t *dst, int c, size_t n); +extern "C" void hyperram_memset_wd(volatile uint32_t *dst, int c, size_t n); +extern "C" void hyperram_memset_c(volatile uint64_t *dst, int c, size_t n); +extern "C" void hyperram_memset_cd(volatile uint64_t *dst, int c, size_t n); diff --git a/sw/cheri/checks/hyperram_test.cc b/sw/cheri/checks/hyperram_test.cc index 999002dd8..f874e68ac 100644 --- a/sw/cheri/checks/hyperram_test.cc +++ b/sw/cheri/checks/hyperram_test.cc @@ -17,8 +17,12 @@ #include #include "../common/console.hh" +#include "../common/hyperram_perf_test.h" +#include "../common/timer-utils.hh" #include "../common/uart-utils.hh" +#include "hyperram_memset.h" + using namespace CHERI; const int RandTestBlockSize = 256; @@ -141,9 +145,10 @@ int rand_cap_test(Capability hyperram_area, // Writes a 32-bit value in every location in hyperram and then reads back to // check read values match written values. The values written alternate between // 'initial_val' and the inversion of 'initial_val'. 
-int stripe_test(Capability hyperram_area, uint32_t initial_val) { +int stripe_test(Capability hyperram_area, uint32_t initial_val, Log &log, bool report_time = false) { uint32_t failures = 0; uint32_t cur_write_val = initial_val; + uint32_t start_time = get_mcycle(); for (uint32_t addr = 0; addr < HyperramSize; addr++) { hyperram_area[addr] = cur_write_val; @@ -161,6 +166,11 @@ int stripe_test(Capability hyperram_area, uint32_t initial_va cur_expected_val = ~cur_expected_val; } + if (report_time) { + uint32_t end_time = get_mcycle(); + log.print(" ({} cycles)...", end_time - start_time); + } + return failures; } @@ -190,13 +200,20 @@ void write_prog(Capability &hyperram_area, uint32_t addr) { hyperram_area[addr + 4] = 0x8082; asm volatile("fence.i" : : : "memory"); + + // By writing the first word of the code again we can ensure that the code is + // flushed out to the HyperRAM and will thus be coherent with instruction + // fetching when the code is executed. + hyperram_area[addr] = hyperram_area[addr]; } // Writes a short function to a random area of hyperram and executes it checking // for successful execution (see 'write_prog' for details on the function // written). -int execute_test(Capability &hyperram_area, ds::xoroshiro::P64R32 &prng, int iterations) { - int failures = 0; +int execute_test(Capability hyperram_area, ds::xoroshiro::P64R32 &prng, int iterations, Log &log, + bool report_time = false) { + uint32_t start_time = get_mcycle(); + int failures = 0; for (int i = 0; i < iterations; ++i) { uint32_t prog_addr = prng() % (HyperramSize - 5); @@ -225,6 +242,376 @@ int execute_test(Capability &hyperram_area, ds::xoroshiro::P6 } } + if (report_time) { + uint32_t end_time = get_mcycle(); + log.print(" ({} cycles)...", end_time - start_time); + } + + return failures; +} + +// Perform partial writes to addresses that may collide with the read buffer to check that +// read and write accesses are coherent. 
+int buffering_test(Capability hyperram_area, ds::xoroshiro::P64R32 &prng, int iterations) { + const uint32_t burst_len = 32u; + int failures = 0; + + // Create an expectation buffer that we may update ourselves; this is a local buffer on the + // stack and thus stored in the main system RAM. + uint32_t exp_data[burst_len / 4]; + for (unsigned idx = 0u; idx < burst_len / 4; ++idx) { + exp_data[idx] = prng(); + } + + for (int i = 0; i < iterations; ++i) { + // Leave a small gap at the end of the test area, so that we may advance by half a burst length + // and still perform a full length burst. + uint32_t burst_addr = prng() % ((HyperramSize * 4) - 2 * burst_len); + // Align to the start of a burst. + burst_addr &= ~(burst_len - 1u); + // With 50% probability, ensure that the buffer crosses a burst boundary, to make + // the interaction of write and read less predictable. + if (prng() & 1) { + burst_addr += burst_len / 2; + } + + // Decide upon a list of actions to be performed; the number of actions/iteration is pretty + // arbitrary. + const unsigned num_actions = 7u; + uint32_t act_addr[num_actions]; + uint32_t act_data[num_actions]; + for (unsigned act = 0u; act < num_actions; ++act) { + uint32_t wr_offset = prng() % burst_len; + uint32_t wr_type = prng() % 3; + // Ensure that the write offset has natural alignment. + wr_offset &= ~((1u << (wr_type)) - 1u); + // Store the action type, offset and data compactly to keep the write operations + // close together. We want to ensure that writes and reads occur simultaneously/close + // together. + act_addr[act] = wr_offset | (wr_type << 24); + act_data[act] = prng(); + } + + // Randomise the word offset from which we read; read bursts are wrapping. + uint32_t rd_off = prng() % (burst_len / 4); + + // ----- Avoid the use of randomisation after this point; timing would become predictable. ----- + + // We need pointers to write to data of different sizes. 
+ volatile uint32_t *burst_wp = &hyperram_area[burst_addr / 4]; + volatile uint16_t *burst_hp = reinterpret_cast(burst_wp); + volatile uint8_t *burst_bp = reinterpret_cast(burst_wp); + + // Initialise the data at the target address. + hyperram_copy_block(burst_wp, exp_data, burst_len); + + // Ensure that we have read data from an address, to provoke fetching of a burst of + // read data from the HyperRAM. Check the first word is as expected. + volatile uint32_t rd_data = burst_wp[rd_off]; + failures += (rd_data != exp_data[rd_off]); + + // Update the expectation according to the actions that we're about to perform. + for (unsigned act = 0u; act < num_actions; ++act) { + uint32_t wr_offset = act_addr[act] & ~0xff000000u; + // Modify the affected word in each case. Note that `wr_offset` has natural alignment. + unsigned sh = 8u * (wr_offset & 3u); + uint32_t mask; + switch (act_addr[act] >> 24) { + case 0u: + mask = 0xffu << sh; + break; + case 1u: + mask = 0xffffu << sh; + break; + default: + mask = ~0u; + break; + } + exp_data[wr_offset >> 2] = (exp_data[wr_offset >> 2] & ~mask) | ((act_data[act] << sh) & mask); + } + + // Perform the actions; keep this code short and fast to ensure that the write traffic + // is still held within the controller interface. (Write traffic is flushed out to the + // HyperRAM after a short while, rather than being held indefinitely.) + for (unsigned act = 0u; act < num_actions; ++act) { + uint32_t wr_offset = act_addr[act] & ~0xff000000u; + uint32_t d = act_data[act]; + switch (act_addr[act] >> 24) { + case 0u: + burst_bp[wr_offset] = (uint8_t)d; + break; + case 1u: + burst_hp[wr_offset >> 1] = (uint16_t)d; + break; + default: + burst_wp[wr_offset >> 2] = d; + break; + } + } + + // ----- Avoid the use of randomisation before this point ----- + + // Check the entire contents of the target buffer against our expectations. 
+ for (unsigned idx = 0u; idx < burst_len / 4; ++idx) { + failures += (exp_data[idx] != burst_wp[idx]); + } + } + + return failures; +} + +// Performance test to exercise burst transfers from/to the HyperRAM. +// +// - read buffers may be 'cleaned' before commencing the performance test, so that they have no +// history of earlier read accesses. +// - source address and/or destination address will be randomised if not already set. +// - run time of each of the 'copy' and 'compare' operations in `mcycle` ticks may optionally be reported. +int perf_burst_test(Capability hyperram_area, Log &log, ds::xoroshiro::P64R32 &prng, size_t nbytes, + bool clean_first = true, bool report_times = true, uint32_t dst_addr = UINT32_MAX, + uint32_t src_addr = UINT32_MAX) { + typedef volatile void *(*hr_copy_fn_t)(volatile uint32_t *, const volatile uint32_t *, size_t); + int failures = 0; + + // Randomised word offsets. + if (dst_addr == UINT32_MAX) { + dst_addr = prng() & 0x3ffu; + } + if (src_addr == UINT32_MAX) { + // Ensuring that the source and destination buffers cannot overlap. + src_addr = 0x1000u - dst_addr; + } + + volatile uint32_t *d = &hyperram_area[dst_addr]; + volatile uint32_t *s = &hyperram_area[src_addr]; + + // Complete the source buffer with complete words; it doesn't matter that we may write a few extra bytes. + const uint32_t whole_words = nbytes / 4u; + for (unsigned idx = 0u; idx <= whole_words; idx++) { + hyperram_area[src_addr + idx] = prng(); + } + + // Copy the code into the HyperRAM, using itself. + const uint32_t prog_addr = 0x903u; + hyperram_copy_block(&hyperram_area[prog_addr], (volatile uint32_t *)hyperram_copy_block, hyperram_copy_size); + hr_copy_fn_t hr_copy_ptr = (hr_copy_fn_t)get_hyperram_fn_ptr(HYPERRAM_ADDRESS + (prog_addr * 4)); + + for (unsigned code_in_hr = 0; code_in_hr < 2; ++code_in_hr) { + // Do we need to clean all buffered read data out of the HyperRAM controller interface first? 
+ if (clean_first) { + // Use 8KiB of data to ensure that 128 buffers of 64 bytes/burst can be cleaned out; this + // should be more than enough for any current/future implementation. + hyperram_cleaner(&hyperram_area[0x2000u], &hyperram_area[0x4000u]); + } + + // Time the memory copy operation. + uint32_t start_time = get_mcycle(); + if (code_in_hr) { + hr_copy_ptr(d, s, nbytes); + } else { + hyperram_copy_block(d, s, nbytes); + } + + // Read back and check the destination buffer; this becomes a significant part of the memory + // traffic/execution time because presently there is no prefetching of read data, so we time + // it separately. + uint32_t cmp_start_time = get_mcycle(); + failures += (0 != hyperram_cmp_block(d, s, nbytes)); + + if (report_times) { + // Report the duration of the copy and compare operations separately. + uint32_t end_time = get_mcycle(); + log.print(" copy: {:#6d} - cmp: {:#6d} - total: {:#6d}...", cmp_start_time - start_time, + end_time - cmp_start_time, end_time - start_time); + } + } + + return failures; +} + +// Memory-writing tests to exercise the write coalescing behaviour. +// +// - tests all transfer sizes (byte, half-word, word and double-word) +// - exercises coalescing of partial word writes to form complete words +// - performs descending writes as well as ascending +int write_tests(Capability hyperram_b_area, Capability hyperram_h_area, + Capability hyperram_w_area, Capability hyperram_d_area, + ds::xoroshiro::P64R32 &prng, Log &log, WriteTestType test_type, int iterations = 1, + uint32_t dst_addr = UINT32_MAX, uint32_t src_addr = UINT32_MAX) { + int failures = 0; + + log.println(" Test type {}: {} iteration(s)", (int)test_type, iterations); + + for (int iter = 0; iter < iterations; ++iter) { + // Choose a random start address and whether we are to intersperse reads. 
+ bool intersperse_reads = ((prng() & 1u) != 0u); + uint32_t dst_off = dst_addr; + uint32_t src_off = src_addr; + // Choose source and destination addresses if not supplied. + if (UINT32_MAX == dst_off) { + dst_off = prng() & 0x3ffu; + } + if (UINT32_MAX == src_off) { + src_off = prng() & 0x3ffu; + } + // Control area must not be overlapped by any write operation. + const uint32_t ctrl_off = 0x800u; + + // Choose a random byte value to store, and a constrained length. + uint32_t data = (uint8_t)prng(); + data |= data << 8; + data |= data << 16; + // Write at least 32 bytes and up to 95 into a buffer of 128. + size_t len = 0x20u + (prng() & 0x3fu); + const uint32_t init_len = 0x80u; + + // Dword, word, hword and bytes aliases to the chosen target buffer. + volatile uint64_t *dstd = &hyperram_d_area[(dst_off + 7u) >> 3]; + volatile uint32_t *dstw = &hyperram_w_area[(dst_off + 3u) >> 2]; + volatile uint16_t *dsth = &hyperram_h_area[(dst_off + 1u) >> 1]; + volatile uint8_t *dstb = &hyperram_b_area[dst_off]; + // End of target buffer; these addresses are exclusive, i.e. the pointers reference + // the first value _above_ the range to be modified. + volatile uint64_t *edstd = &hyperram_d_area[(dst_off + len + 7u) >> 3]; + volatile uint32_t *edstw = &hyperram_w_area[(dst_off + len + 3u) >> 2]; + + // The different tests have different alignment requirements so determine the offset + // of the lowest address at which writing should be expected to occur. + uint32_t align_mask; + switch (test_type) { + // Half word-aligned test cases. + case WriteTestType_H: + case WriteTestType_HB: + align_mask = 1u; + break; + // Word-aligned test cases. + case WriteTestType_WR: + case WriteTestType_W: + case WriteTestType_WD: + align_mask = 3u; + break; + // Double word-aligned test cases. + case WriteTestType_C: + case WriteTestType_CD: + align_mask = 7u; + break; + // Byte-aligned test cases. 
+ default: + align_mask = 0u; + break; + } + + // Calculate the number of bytes that we have skipped to achieve natural alignment. + // This works for most cases... + uint32_t exp_start = ((align_mask + 1u) - (dst_off & align_mask)) & align_mask; + uint32_t written_start = exp_start; + // ... but the descending transfers require special treatment. + if (test_type == WriteTestType_WD) { + exp_start = ((dst_off + len + 3u) & ~3u) - (dst_off + len); + written_start = exp_start + len; + } else if (test_type == WriteTestType_CD) { + exp_start = ((dst_off + len + 7u) & ~7u) - (dst_off + len); + written_start = exp_start + len; + } + + // Initialise the control area. + hyperram_memset_w(&hyperram_w_area[ctrl_off >> 2], 0u, init_len); + + // Initialise the target area, using data that we know should never be stored in the + // ensuing memory writing code. Thus we maximise the chance of detecting any bytes + // that are erroneously overwritten. + hyperram_memset_b(dstb, ~data, init_len); + + uint32_t written_end = written_start; + uint32_t bytes_left = len; + while (bytes_left > 0u) { + // Decide upon a word-aligned offset from which to read; do this before the + // memory writing because the random number generation is time-consuming and we want + // to test the interaction of the read with the under-construction write burst. + uint32_t rd_off = prng() % init_len; + size_t chunk_len = intersperse_reads ? (1u + (prng() % bytes_left)) : bytes_left; + if (bytes_left <= align_mask) { // Ensure the loop terminates. + chunk_len = bytes_left; + } else if (chunk_len < bytes_left) { // Non-final chunk? + // The next chunk must have natural alignment too; note that we could end up + // with a chunk size of zero; memset routines accept that. + chunk_len &= ~align_mask; + } + // Adjust our expectations of the area that should have been written after this chunk. 
+ if (test_type == WriteTestType_WD || test_type == WriteTestType_CD) { + written_start -= chunk_len; + } else { + written_end += chunk_len; + } + + // Perform the write operation. + switch (test_type) { + case WriteTestType_B: + hyperram_memset_b(dstb, data, chunk_len); + break; + case WriteTestType_H: + hyperram_memset_h(dsth, data, chunk_len); + break; + // HB test writes two bytes and a half-word, so it requires half-word alignment. + case WriteTestType_HB: + hyperram_memset_hb(dsth, data, chunk_len); + break; + case WriteTestType_W: + hyperram_memset_w(dstw, data, chunk_len); + break; + case WriteTestType_WR: + hyperram_memset_wr(dstw, data, chunk_len); + break; + case WriteTestType_WD: + hyperram_memset_wd(edstw, data, chunk_len); + break; + case WriteTestType_C: + hyperram_memset_c(dstd, data, chunk_len); + break; + default: + assert(WriteTestType_CD == test_type); + hyperram_memset_cd(edstd, data, chunk_len); + break; + } + // Interject a read at this point, to test its interaction with the coalescing of + // writes into bursts. + if (intersperse_reads) { + volatile uint8_t rd_data = hyperram_b_area[dst_off + rd_off]; + // This is the default value with which the target area was initialised. + uint8_t exp_data = (uint8_t)~data; + // Does the byte that we're checking lie within the range that should have been overwritten + // at this point in the test? + if (rd_off >= written_start && rd_off < written_end) exp_data = ~exp_data; + failures += (rd_data != exp_data); + } + + // Advance the destination pointers for the next chunk, maintaining natural alignment + // in case this is not the final chunk of the transfer. 
+      dstb += chunk_len;
+      dsth += chunk_len >> 1;
+      dstw += chunk_len >> 2;
+      dstd += chunk_len >> 3;
+      edstw -= chunk_len >> 2;
+      edstd -= chunk_len >> 3;
+      bytes_left -= chunk_len;
+    }
+
+    // Read and check the control area; this primarily serves to ensure that we are checking
+    // what was written to the HyperRAM itself, and not merely the contents of the internal
+    // read buffer within the controller interface.
+    for (uint32_t i = 0u; i < init_len / 4; i++) {
+      failures += (hyperram_w_area[i + (ctrl_off >> 2)] != 0u);
+    }
+
+    // Read and check the whole initialised target area, not just the bytes that were written.
+    for (uint32_t i = 0u; i < init_len; i++) {
+      // This is the default value with which the target area was initialised.
+      uint8_t exp_data = (uint8_t)~data;
+      // Does the byte that we're checking lie within the range that should have been overwritten?
+      if (i >= exp_start && (i - exp_start) < len) exp_data = ~exp_data;
+      failures += (hyperram_b_area[i + dst_off] != exp_data);
+    }
+  }
+
   return failures;
 }
@@ -251,6 +638,7 @@ extern "C" [[noreturn]] void entry_point(void *rwRoot) {
   ds::xoroshiro::P64R32 prng;
   prng.set_state(0xDEADBEEF, 0xBAADCAFE);
 
+  // Default is word-based accesses, which is sufficient for most tests.
   Capability hyperram_area = root.cast();
   hyperram_area.address() = HYPERRAM_ADDRESS;
   hyperram_area.bounds() = HYPERRAM_BOUNDS;
@@ -259,34 +647,109 @@ extern "C" [[noreturn]] void entry_point(void *rwRoot) {
   hyperram_cap_area.address() = HYPERRAM_ADDRESS;
   hyperram_cap_area.bounds() = HYPERRAM_BOUNDS;
 
-  while (true) {
-    int failures = 0;
+  // We also want byte, hword and dword access for some tests.
+ Capability hyperram_b_area = root.cast(); + hyperram_b_area.address() = HYPERRAM_ADDRESS; + hyperram_b_area.bounds() = HYPERRAM_BOUNDS; + Capability hyperram_h_area = root.cast(); + hyperram_h_area.address() = HYPERRAM_ADDRESS; + hyperram_h_area.bounds() = HYPERRAM_BOUNDS; + Capability hyperram_d_area = root.cast(); + hyperram_d_area.address() = HYPERRAM_ADDRESS; + hyperram_d_area.bounds() = HYPERRAM_BOUNDS; + + // Run indefinitely, soak testing until we observe one or more failures. + int failures = 0; + while (!failures) { + const uint32_t burst_len = 32u; + const bool report_times = true; + log.print("Running RND cap test..."); failures += rand_cap_test(hyperram_area, hyperram_cap_area, prng, HyperramSize / 4); write_test_result(log, failures); log.print("Running RND data test..."); - failures = rand_data_test_full(hyperram_area, prng); + failures += rand_data_test_full(hyperram_area, prng); write_test_result(log, failures); log.print("Running RND data & address test..."); - failures = rand_data_addr_test(hyperram_area, prng, HyperramSize / 4); + failures += rand_data_addr_test(hyperram_area, prng, HyperramSize / 4); write_test_result(log, failures); log.print("Running 0101 stripe test..."); - failures = stripe_test(hyperram_area, 0x55555555); + failures += stripe_test(hyperram_area, 0x55555555, log, report_times); write_test_result(log, failures); log.print("Running 1001 stripe test..."); - failures = stripe_test(hyperram_area, 0x99999999); + failures += stripe_test(hyperram_area, 0x99999999, log, report_times); write_test_result(log, failures); log.print("Running 0000_1111 stripe test..."); - failures = stripe_test(hyperram_area, 0x0F0F0F0F); + failures += stripe_test(hyperram_area, 0x0F0F0F0F, log, report_times); write_test_result(log, failures); log.print("Running Execution test..."); - failures = execute_test(hyperram_area, prng, HyperramSize / 4); + failures += execute_test(hyperram_area, prng, HyperramSize / 4, log, report_times); + 
write_test_result(log, failures); + + // Performance test copies a significant chunk of data from one buffer to another. + icache_invalidate(); + // Ensure that the icache is enabled. + icache_enabled_set(true); + log.println("Running performance test with icache enabled..."); + failures += perf_burst_test(hyperram_area, log, prng, 0x1000u); + write_test_result(log, failures); + + // Executing with the icache disabled places more strain on the HyperRAM controller because + // it will receive many more instruction fetches. + icache_enabled_set(false); + icache_invalidate(); + log.println("Running performance test with icache disabled..."); + failures += perf_burst_test(hyperram_area, log, prng, 0x1000u); + write_test_result(log, failures); + // Reinstate the normal icache operation. + icache_enabled_set(true); + + // Run the same burst performance tests again, with all possible alignments, but also checking + // the completed destination buffer against the source. + log.print("Running alignment tests "); + bool clean_first = false; + do { + clean_first = !clean_first; + log.println(clean_first ? "with cleaning..." : "without cleaning..."); + for (uint32_t src_addr = 0x1000u; src_addr < 0x1000u + burst_len; src_addr += 4u) { + for (uint32_t dst_addr = 0u; dst_addr < burst_len; dst_addr += 4u) { + // We may want to investigate the impact of alignment upon performance; this is useful + // in development/analysis... + const bool report_times = false; // ... but not required for regression testing. + if (report_times) { + log.print(" dst: {:#04x} src: {:#04x}...", dst_addr, src_addr & (burst_len - 1u)); + } + failures += perf_burst_test(hyperram_area, log, prng, 0x1000u, clean_first, report_times, dst_addr, src_addr); + if (report_times) { + write_test_result(log, failures); + } + } + } + } while (clean_first); + write_test_result(log, failures); + + // Buffering test checks the interaction of write traffic with buffered read data. 
+ log.print("Running buffering test..."); + failures += buffering_test(hyperram_area, prng, 0x1000u); + write_test_result(log, failures); + + // Write tests exercise the write coalescing logic of the HyperRAM controller interface. + log.println("Running write tests..."); + for (int test_type = WriteTestType_B; test_type <= WriteTestType_CD; ++test_type) { + failures += write_tests(hyperram_b_area, hyperram_h_area, hyperram_area, hyperram_d_area, prng, log, + (WriteTestType)test_type, 0x400u); + } + log.print(" result..."); write_test_result(log, failures); } + + // Report test failure. + log.println("Test(s) failed: {}", failures); + while (true) asm volatile("wfi"); } diff --git a/sw/cheri/common/CMakeLists.txt b/sw/cheri/common/CMakeLists.txt index 3068a2846..1988c2b96 100644 --- a/sw/cheri/common/CMakeLists.txt +++ b/sw/cheri/common/CMakeLists.txt @@ -7,5 +7,5 @@ add_library(${NAME} OBJECT hyperram_exec_test.S boot.S default-handlers.cc) target_include_directories(${NAME} PRIVATE ${CHERIOT_SDK_INCLUDES}) set(NAME block_tests) -add_library(${NAME} OBJECT block_tests.cc) +add_library(${NAME} OBJECT hyperram_perf_test.S block_tests.cc) target_include_directories(${NAME} PRIVATE ${CHERIOT_SDK_INCLUDES}) diff --git a/sw/cheri/common/hyperram_perf_test.S b/sw/cheri/common/hyperram_perf_test.S new file mode 100644 index 000000000..8f125d445 --- /dev/null +++ b/sw/cheri/common/hyperram_perf_test.S @@ -0,0 +1,211 @@ +# Copyright lowRISC contributors. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +// This is a memory copying function intended to exercise both the instruction memory system +// and the load/store data performance by performing a number of back-to-back transactions. 
+// +// Any permutation of the following may be in the HyperRAM: +// (i) source data buffer +// (ii) destination data buffer +// (iii) instruction sequence +// +// To ensure that data loads and stores are back-to-back, and to facilitate copying of this +// code into HyperRAM for execution, we employ handwritten assembler. +// +// Functions are aligned to 32-byte boundaries on the expectation that this is a multiple of +// the burst length and thus the code starts at the beginning of a read buffer. This just makes +// the performance measurements more robust against changes elsewhere in the code. All +// alignments of source and destination buffer are checked in other tests. + + .section .text, "ax", @progbits + + .option norvc + .globl hyperram_copy_block + .p2align 5 + .type hyperram_copy_block,@function + +// Controlled, performant copying routine that is position-independent and may be copied into +// the HyperRAM for in-place execution. +// +// ca0 -> destination (word-aligned) +// ca1 -> source (word-aligned) +// a2 = number of bytes to copy +// return -> beyond destination data. +hyperram_copy_block: + srl a3, a2, 5 + andi a2, a2, 31 // 0-31 bytes remaining after 32-byte loop. + sll a3, a3, 5 + add a3, a1, a3 + bgeu a1, a3, copy32fix + + // 32 bytes/iteration. +copy32: + clw a5, (ca1) + clw t0, 4(ca1) + clw t1, 8(ca1) + clw t2, 12(ca1) + csw a5, (ca0) + csw t0, 4(ca0) + csw t1, 8(ca0) + csw t2, 12(ca0) + clw a5, 16(ca1) + clw t0, 20(ca1) + clw t1, 24(ca1) + clw t2, 28(ca1) + cincoffset ca1, ca1, 32 + csw a5, 16(ca0) + csw t0, 20(ca0) + csw t1, 24(ca0) + csw t2, 28(ca0) + cincoffset ca0, ca0, 32 + bltu a1, a3, copy32 +copy32fix: + srl a3, a2, 2 + andi a2, a2, 3 // 0-3 bytes remaining after 4-byte loop. + sll a3, a3, 2 + add a3, a1, a3 + bgeu a1, a3, copy4fix + + // 4 bytes/iteration. 
+copy4: + clw a5, (ca1) + cincoffset ca1, ca1, 4 + csw a5, (ca0) + cincoffset ca0, ca0, 4 + bltu a1, a3, copy4 +copy4fix: + beqz a2, copyret + li a3, 2 // This constant allows conditional branching on 1, 2 or > 2 + + // 1 byte/iteration; 0-3 bytes left, so fully unrolled. +copy1: + clbu a5, (ca1) + csb a5, (ca0) + bltu a2, a3, copyret + clbu a5, 1(ca1) + csb a5, 1(ca0) + beq a2, a3, copyret + clbu a5, 2(ca1) + csb a5, 2(ca0) +copyret: + cret +hyperram_copy_end: + +// Size of the copying code, in bytes; the copying routine is itself copied into HyperRAM in +// order to exercise instruction and load/store traffic simultaneously. + .globl hyperram_copy_size +hyperram_copy_size: + .long hyperram_copy_end-hyperram_copy_block + + + .globl hyperram_cmp_block + .p2align 5 + .type hyperram_cmp_block,@function + +// Compare the contents of two word-aligned buffers. +// +// ca0 -> first buffer (word-aligned) +// ca1 -> second buffer (word-aligned) +// a2 = number of bytes to compare +hyperram_cmp_block: + cincoffset csp, csp, -16 + csc cs0, (csp) + csc cs1, 8(csp) + andi a3, a2, 15 + xor a2, a2, a3 + add a2, a1, a2 +cmp16: + bgeu a1, a2, cmp16done + // This implementation is rather specific to the current buffering scheme + // of the HyperRAM controller and is designed to minimise thrashing on the + // LSU read buffer; read 16 bytes of data from one buffer into the register + // file and only then start reading from the other and performing comparisons. + // + // Note: We could replace this with a more conventional implementation now + // and compare a larger number of bytes per iteration. + clw a4, (ca1) + clw a5, 4(ca1) + clw t0, 8(ca1) + clw t1, 12(ca1) // End of 16 bytes from second buffer. 
+ clw s0, (ca0) + clw s1, 4(ca0) + cincoffset ca1, ca1, 16 + bne a4, s0, cmp_mismatch + clw a4, 8(ca0) + bne a5, s1, cmp_mismatch + clw a5, 12(ca0) + cincoffset ca0, ca0, 16 + bne t0, a4, cmp_mismatch + beq t1, a5, cmp16 +cmp_mismatch: + li a0, 1 + j cmp_leave +cmp16done: + // We're not too concerned with performance here, so just do the remainder + // as individual bytes, but unroll and order instructions to minimise the + // loop overhead and limit load-use stalls. + // + // a3 = number of bytes remaining. + beqz a3, cmp_match + li t0, 2 // This constant allows conditional branch on 1, 2 or > 2 +cmp1: + clbu a4, (ca1) + clbu s0, (ca0) + bltu a3, t0, cmp_last0 + clbu a5, 1(ca1) // Safe to read the next byte pair. + clbu s1, 1(ca0) + bne a4, s0, cmp_mismatch + beq a3, t0, cmp_last1 + bne a5, s1, cmp_mismatch + clbu a4, 2(ca1) + clbu s0, 2(ca0) + cincoffset ca1, ca1, 3 + cincoffset ca0, ca0, 3 + bne a4, s0, cmp_mismatch + addi a3, a3, -3 + bnez a3, cmp1 + j cmp_match +cmp_last0: + bne a4, s0, cmp_mismatch // One byte left: only a4/s0 (byte 0) were loaded. + j cmp_match +cmp_last1: + bne a5, s1, cmp_mismatch // Two bytes left: byte 0 already checked; check byte 1. +cmp_match: + li a0, 0 +cmp_leave: + clc cs0, (csp) + clc cs1, 8(csp) + cincoffset csp, csp, 16 + cret + + + .globl hyperram_cleaner + .p2align 5 + .type hyperram_cleaner,@function + +// Utility function to clean any buffered read data out of the HyperRAM controller interface. +// +// entry ca0 -> start of an unused memory area in the HyperRAM +// ca1 -> end of the unused memory area, exclusive +hyperram_cleaner: + bgeu a0, a1, clean_leave + clw a2, (ca0) + cincoffset ca0, ca0, 64 // Should be larger than or equal to the burst length + j hyperram_cleaner +clean_leave: + cret + + + .globl icache_enabled_set + .p2align 5 + .type icache_enabled_set,@function + +// Set the enabled/disabled state of the icache. +// +// a0 = 1 to enable, 0 to disable.
+icache_enabled_set: + andi a0, a0, 1 + csrw 0x7c0, a0 // NOTE(review): writes the whole CSR, so every bit other than bit 0 is cleared - confirm intended. + cret + diff --git a/sw/cheri/common/hyperram_perf_test.h b/sw/cheri/common/hyperram_perf_test.h new file mode 100644 index 000000000..1ef8b48b8 --- /dev/null +++ b/sw/cheri/common/hyperram_perf_test.h @@ -0,0 +1,39 @@ +/** + * Copyright lowRISC contributors. + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once +#include + +// Helper code for manipulation of the instruction cache. +extern "C" void icache_enabled_set(bool enabled); +// Helper code to ensure that there is no stale code in the instruction cache after writing +// new code into memory. +// +// void icache_invalidate(void); +#define icache_invalidate() asm volatile("fence.i"); + +// Size in bytes of the `hyperram_copy_block` code. This function is used to copy itself into +// the HyperRAM memory before execution from there, so the size of the code is required. +extern uint32_t hyperram_copy_size; + +// Memory-copying routine in the manner of `memcpy` but with a word alignment constraint on the +// base addresses of the source and destination buffers, and handwritten in a manner that exercises +// the burst read/write behaviour of the HyperRAM controller interface and permits the code to be +// copied into the HyperRAM itself for execution. +extern "C" void hyperram_copy_block(volatile uint32_t *d, const volatile uint32_t *s, size_t nbytes); + +// Utility function to clean any buffered read data out of the HyperRAM controller interface. +// This may be important for performance measurements or for ensuring that read requests actually +// access the HyperRAM device rather than returning buffered data. +// +// For the current implementation it is sufficient to perform just 4 reads that are separated by +// 32 bytes, but for future-proofing this routine should be called with at least +// 128 x 64 bytes = 8192 bytes of data.
+extern "C" void hyperram_cleaner(const volatile void *start, const volatile void *end); + +// Memory comparison function in the manner of `memcmp` but with the same specific requirements +// and constraints as `hyperram_copy_block.` +extern "C" int hyperram_cmp_block(const volatile uint32_t *d, const volatile uint32_t *s, size_t nbytes); diff --git a/sw/cheri/common/sonata-devices.hh b/sw/cheri/common/sonata-devices.hh index 7e68f700c..828dd5490 100644 --- a/sw/cheri/common/sonata-devices.hh +++ b/sw/cheri/common/sonata-devices.hh @@ -83,7 +83,7 @@ using PinmuxPtrs = std::pair; return hyperram; } -[[maybe_unnused]] static TimerPtr timer_ptr(CapRoot root) { +[[maybe_unused]] static TimerPtr timer_ptr(CapRoot root) { CHERI::Capability timer = root.cast(); timer.address() = CLINT_ADDRESS; timer.bounds() = CLINT_BOUNDS; diff --git a/sw/cheri/tests/hyperram_exec_test.S b/sw/cheri/tests/hyperram_exec_test.S deleted file mode 100644 index dbff7f781..000000000 --- a/sw/cheri/tests/hyperram_exec_test.S +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright lowRISC contributors. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 - -.include "assembly-helpers.s" - - .section .text, "ax", @progbits - - .globl get_hyperram_fn_ptr - .p2align 2 - .type get_hyperram_fn_ptr,@function -get_hyperram_fn_ptr: - auipcc ct0, 0 - csetaddr ca0, ct0, a0 - cret diff --git a/sw/cheri/tests/hyperram_tests.hh b/sw/cheri/tests/hyperram_tests.hh index a2f87f575..930a21461 100644 --- a/sw/cheri/tests/hyperram_tests.hh +++ b/sw/cheri/tests/hyperram_tests.hh @@ -1,5 +1,5 @@ - // Copyright lowRISC Contributors. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
// SPDX-License-Identifier: Apache-2.0 #pragma once @@ -12,6 +12,7 @@ #include #include "../common/console.hh" +#include "../common/hyperram_perf_test.h" #include "../common/sonata-devices.hh" #include "../common/uart-utils.hh" #include "test_runner.hh" @@ -32,7 +33,7 @@ using namespace CHERI; * It can be overwride with a compilation flag */ #ifndef TEST_COVERAGE_AREA -// Test only 1% of the total memory to be fast enough for varilator. +// Test only 1% of the total memory to be fast enough for Verilator. #define TEST_COVERAGE_AREA 1 #endif _Static_assert(TEST_COVERAGE_AREA <= 100, "TEST_COVERAGE_AREA Should be less than 100"); @@ -191,7 +192,7 @@ int stripe_test(Capability hyperram_area, uint32_t initial_va typedef void *(*test_fn_t)(uint32_t *); /* - * Gets a function pointer to an address in hyperram, expectes to be called with + * Gets a function pointer to an address in hyperram, expects to be called with * a PC capability that provides execution in hyperram. 'addr' is relative to * the base of hyperram. */ @@ -217,6 +218,11 @@ void write_prog(Capability hyperram_area, uint32_t addr) { hyperram_area[addr + 4] = 0x8082; asm volatile("fence.i" : : : "memory"); + + // By writing the first word of the code again we can ensure that the code is + // flushed out to the HyperRAM and will thus be coherent with instruction + // fetching when the code is executed. + hyperram_area[addr] = hyperram_area[addr]; } /* @@ -257,6 +263,43 @@ int execute_test(Capability hyperram_area, ds::xoroshiro::P64 return failures; } +// Performance test to exercise burst transfers from/to the HyperRAM. +int perf_burst_test(Capability hyperram_area, ds::xoroshiro::P64R32 &prng, size_t nbytes) { + typedef volatile void *(*hr_copy_fn_t)(volatile uint32_t *, const volatile uint32_t *, size_t); + int failures = 0; + + // Randomised word offsets. 
+ uint32_t dst_addr = prng() & 0x3ffu; + uint32_t src_addr = 0x1000u - dst_addr; + + volatile uint32_t *d = &hyperram_area[dst_addr]; + volatile uint32_t *s = &hyperram_area[src_addr]; + + // Complete the source buffer with complete words; it doesn't matter that we may write a few extra bytes. + const uint32_t whole_words = nbytes / 4u; + for (unsigned idx = 0u; idx <= whole_words; idx++) { + hyperram_area[src_addr + idx] = prng(); + } + + // Copy the code into the HyperRAM, using itself. + const uint32_t prog_addr = 0x903u; + hyperram_copy_block(&hyperram_area[prog_addr], (volatile uint32_t *)hyperram_copy_block, hyperram_copy_size); + hr_copy_fn_t hr_copy_ptr = (hr_copy_fn_t)get_hyperram_fn_ptr(HYPERRAM_ADDRESS + (prog_addr * 4)); + + for (unsigned code_in_hr = 0; code_in_hr < 2; ++code_in_hr) { + if (code_in_hr) { + hr_copy_ptr(d, s, nbytes); + } else { + hyperram_copy_block(d, s, nbytes); + } + + // Read back and check the destination buffer. + failures += (0 != hyperram_cmp_block(d, s, nbytes)); + } + + return failures; +} + void hyperram_tests(CapRoot root, Log &log) { auto hyperram_area = hyperram_ptr(root); @@ -310,6 +353,11 @@ void hyperram_tests(CapRoot root, Log &log) { test_failed |= (failures > 0); write_test_result(log, failures); + log.print(" Running Performance test..."); + failures = perf_burst_test(hyperram_area, prng, 0x1000u); + test_failed |= (failures > 0); + write_test_result(log, failures); + check_result(log, !test_failed); } }