
Commit 69d95b5

Author: Jamie Smith
Make DMA SPI driver aware of CPU cache, fix data corruption and other SPI issues on STM32H7 (#199)
* Handle cache alignment in DMA SPI driver
* Fix build on cache-less devices
* Fix a couple things I missed
* Run formatter, improve CacheAlignedBuffer docs
* Add missing license identifiers to source files
* Make CacheAlignedBuffer heap-allocatable, try and add exclusion for Nordic license in scancode_evaluate.py
* Formatting, docs, revert accidental change
* Update code blocks to pass spell checker
1 parent d9676cc commit 69d95b5

File tree

28 files changed: 614 additions & 104 deletions

28 files changed

+614
-104
lines changed

drivers/include/drivers/SPI.h

Lines changed: 41 additions & 14 deletions
```diff
@@ -26,6 +26,7 @@
 #include "drivers/DigitalOut.h"
 #include "platform/SingletonPtr.h"
 #include "platform/NonCopyable.h"
+#include "platform/CacheAlignedBuffer.h"
 
 #if defined MBED_CONF_DRIVERS_SPI_COUNT_MAX && DEVICE_SPI_COUNT > MBED_CONF_DRIVERS_SPI_COUNT_MAX
 #define SPI_PERIPHERALS_USED MBED_CONF_DRIVERS_SPI_COUNT_MAX
```
```diff
@@ -194,6 +195,11 @@ const use_gpio_ssel_t use_gpio_ssel;
  * the transfer but others can execute). Here's a sample of how to send the same data as above
  * using the blocking async API:</p>
  *
+ * <p>Note that when using the asynchronous API, you must use the CacheAlignedBuffer class when declaring the
+ * receive buffer. This is because some processors' async SPI implementations require the receive buffer to
+ * be at an address which is aligned to the processor cache line size. CacheAlignedBuffer takes care of this
+ * for you and provides functions (data(), begin(), end()) to access the underlying data in the buffer.</p>
+ *
  * @code
  * #include "mbed.h"
  *
@@ -203,8 +209,8 @@ const use_gpio_ssel_t use_gpio_ssel;
  *     device.format(8, 0);
  *
  *     uint8_t command[2] = {0x0A, 0x0B};
- *     uint8_t response[2];
- *     int result = device.transfer_and_wait(command, sizeof(command), response, sizeof(response));
+ *     CacheAlignedBuffer<uint8_t, 2> response;
+ *     int result = device.transfer_and_wait(command, sizeof(command), response, sizeof(command));
  * }
  * @endcode
  *
```
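The same requirement applies to the callback-based transfer() API, not just the blocking call shown in the doc comment above. The following sketch illustrates that usage; it is not part of the commit, and the pin names, flag, and callback body are placeholder assumptions:

```cpp
#include "mbed.h"

// Placeholder pins -- substitute the SPI pin names for your target.
SPI device(PA_7, PA_6, PA_5); // MOSI, MISO, SCLK

volatile bool transfer_done = false;

// event_callback_t signature: receives the event mask, runs in IRQ context.
void on_transfer(int event)
{
    transfer_done = (event & SPI_EVENT_COMPLETE) != 0;
}

int main()
{
    device.format(8, 0);

    uint8_t command[2] = {0x0A, 0x0B};

    // The receive buffer must be a CacheAlignedBuffer so the driver can
    // invalidate it after DMA without touching unrelated data that would
    // otherwise share a cache line with it.
    CacheAlignedBuffer<uint8_t, 2> response;

    device.transfer(command, sizeof(command), response, sizeof(command),
                    callback(on_transfer), SPI_EVENT_COMPLETE);

    while (!transfer_done) {
        // The CPU is free to do other work while DMA/interrupts run the transfer.
    }

    // Read the results through the accessors the new docs describe.
    printf("0x%02X 0x%02X\r\n", response.data()[0], response.data()[1]);
}
```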
```diff
@@ -458,8 +464,9 @@ class SPI : private NonCopyable<SPI> {
      * @param tx_buffer The TX buffer with data to be transferred. If NULL is passed,
      *                  the default %SPI value is sent.
      * @param tx_length The length of TX buffer in bytes.
-     * @param rx_buffer The RX buffer which is used for received data. If NULL is passed,
-     *                  received data are ignored.
+     * @param rx_buffer The RX buffer which is used for received data. Rather than a C array, a CacheAlignedBuffer
+     *                  structure must be passed so that cache alignment can be handled for data received from DMA.
+     *                  May be nullptr if rx_length is 0.
      * @param rx_length The length of RX buffer in bytes.
      * @param callback The event callback function.
      * @param event The logical OR of events to subscribe to. May be #SPI_EVENT_ALL, or some combination
@@ -471,16 +478,18 @@ class SPI : private NonCopyable<SPI> {
      */
    template<typename WordT>
    typename std::enable_if<std::is_integral<WordT>::value, int>::type
-    transfer(const WordT *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE)
+    transfer(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer<WordT> &rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE)
    {
-        return transfer_internal(tx_buffer, tx_length, rx_buffer, rx_length, callback, event);
+        MBED_ASSERT(rx_length <= static_cast<int>(rx_buffer.capacity()));
+        return transfer_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, callback, event);
    }
 
    // Overloads of the above to support passing nullptr
    template<typename WordT>
    typename std::enable_if<std::is_integral<WordT>::value, int>::type
-    transfer(const std::nullptr_t *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE)
+    transfer(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer<WordT> &rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE)
    {
+        MBED_ASSERT(rx_length <= static_cast<int>(rx_buffer.capacity()));
        return transfer_internal(tx_buffer, tx_length, rx_buffer, rx_length, callback, event);
    }
    template<typename WordT>
```
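The new MBED_ASSERT makes the size contract explicit: rx_length may be smaller than the buffer, but can never exceed its capacity(). A hypothetical illustration (assumes a configured `device` and an event callback `cb`, neither of which comes from the diff):

```cpp
CacheAlignedBuffer<uint8_t, 32> rx;
uint8_t tx[8] = {0};

device.transfer(tx, 8, rx, 8, cb);   // OK: 8 bytes received into a 32-byte buffer
device.transfer(tx, 8, rx, 64, cb);  // Trips the assert: 64 > rx.capacity()
```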
```diff
@@ -502,8 +511,10 @@ class SPI : private NonCopyable<SPI> {
      * Internally, the chip vendor may implement this function using either DMA or interrupts.
      *
      * @param tx_buffer The TX buffer with data to be transferred. May be nullptr if tx_length is 0.
-     * @param tx_length The length of TX buffer in bytes. If 0, no transmission is done.
-     * @param rx_buffer The RX buffer, which is used for received data. May be nullptr if tx_length is 0.
+     * @param tx_length The length of TX buffer in bytes. If 0, the default %SPI data value is sent when receiving data.
+     * @param rx_buffer The RX buffer which is used for received data. Rather than a C array, a CacheAlignedBuffer
+     *                  structure must be passed so that cache alignment can be handled for data received from DMA.
+     *                  May be nullptr if rx_length is 0.
      * @param rx_length The length of RX buffer in bytes. If 0, no reception is done.
      * @param timeout timeout value. Use #rtos::Kernel::wait_for_u32_forever to wait forever (the default).
      *
@@ -515,17 +526,19 @@ class SPI : private NonCopyable<SPI> {
      */
    template<typename WordT>
    typename std::enable_if<std::is_integral<WordT>::value, int>::type
-    transfer_and_wait(const WordT *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever)
+    transfer_and_wait(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer<WordT> &rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever)
    {
-        return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer, rx_length, timeout);
+        MBED_ASSERT(rx_length <= static_cast<int>(rx_buffer.capacity()));
+        return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, timeout);
    }
 
    // Overloads of the above to support passing nullptr
    template<typename WordT>
    typename std::enable_if<std::is_integral<WordT>::value, int>::type
-    transfer_and_wait(const std::nullptr_t *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever)
+    transfer_and_wait(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer<WordT> &rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever)
    {
-        return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer, rx_length, timeout);
+        MBED_ASSERT(rx_length <= static_cast<int>(rx_buffer.capacity()));
+        return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, timeout);
    }
    template<typename WordT>
    typename std::enable_if<std::is_integral<WordT>::value, int>::type
```
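Because tx_buffer in the overload is typed as const std::nullptr_t *, passing a literal nullptr still lets WordT be deduced from the Rx buffer, which enables receive-only transactions. A short sketch (assumes a configured `device`, and assumes the usual zero-on-success return convention):

```cpp
// Clock in 16 bytes while transmitting only the default fill value.
CacheAlignedBuffer<uint8_t, 16> rx_data;
int result = device.transfer_and_wait(nullptr, 0, rx_data, 16);
if (result == 0) {
    // begin()/end() come from CacheAlignedBuffer, so range-for works.
    for (uint8_t octet : rx_data) {
        printf("%02X ", octet);
    }
}
```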
```diff
@@ -574,7 +587,8 @@ class SPI : private NonCopyable<SPI> {
      *                  the default SPI value is sent
      * @param tx_length The length of TX buffer in bytes.
      * @param rx_buffer The RX buffer which is used for received data. If NULL is passed,
-     *                  received data are ignored.
+     *                  received data are ignored. This buffer is guaranteed to be cache aligned
+     *                  if the MCU has a cache.
      * @param rx_length The length of RX buffer in bytes.
      * @param callback The event callback function.
      * @param event The event mask of events to modify.
@@ -599,6 +613,7 @@ class SPI : private NonCopyable<SPI> {
      * @param tx_buffer The TX buffer with data to be transferred. May be nullptr if tx_length is 0.
      * @param tx_length The length of TX buffer in bytes. If 0, no transmission is done.
      * @param rx_buffer The RX buffer, which is used for received data. May be nullptr if tx_length is 0.
+     *                  This buffer is guaranteed to be cache aligned if the MCU has a cache.
      * @param rx_length The length of RX buffer in bytes. If 0, no reception is done.
      * @param timeout timeout value. Use #rtos::Kernel::wait_for_u32_forever to wait forever (the default).
      *
```
```diff
@@ -722,6 +737,18 @@ class SPI : private NonCopyable<SPI> {
      * iff start_transfer() has been called and the chip has been selected but irq_handler_asynch()
      * has NOT been called yet. */
    volatile bool _transfer_in_progress = false;
+
+    // If there is a transfer in progress, this indicates whether it used DMA and therefore requires a cache
+    // flush at the end
+    bool _transfer_in_progress_uses_dma;
+
+#if __DCACHE_PRESENT
+    // These variables store the location and length in bytes of the Rx buffer if an async transfer
+    // is in progress. They are used for invalidating the cache after the transfer completes.
+    void *_transfer_in_progress_rx_buffer;
+    size_t _transfer_in_progress_rx_len;
+#endif
+
    /* Event flags used for transfer_and_wait() */
    rtos::EventFlags _transfer_and_wait_flags;
 #endif // DEVICE_SPI_ASYNCH
```

drivers/source/SPI.cpp

Lines changed: 49 additions & 3 deletions
```diff
@@ -396,6 +396,16 @@ void SPI::abort_transfer()
        _set_ssel(1);
    }
 
+#if __DCACHE_PRESENT
+    if (_transfer_in_progress_uses_dma && _transfer_in_progress_rx_len > 0) {
+        // If the cache is present, invalidate the Rx data so it's loaded from main RAM.
+        // We only want to do this if DMA actually got used for the transfer because, if interrupts
+        // were used instead, the cache might have the correct data and NOT the main memory.
+        SCB_InvalidateDCache_by_Addr(_transfer_in_progress_rx_buffer, _transfer_in_progress_rx_len);
+    }
+#endif
+    _transfer_in_progress = false;
+
 #if MBED_CONF_DRIVERS_SPI_TRANSACTION_QUEUE_LEN
    dequeue_transaction();
 #endif
```
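Background for these invalidation calls (context, not from the diff): on Cortex-M7, SCB_InvalidateDCache_by_Addr() operates on whole 32-byte cache lines (__SCB_DCACHE_LINE_SIZE). That granularity is exactly why the receive buffer must be cache-line aligned and padded; otherwise the invalidation would discard innocent neighbors:

```cpp
// Illustration only. Suppose these two variables happen to share a 32-byte line:
uint32_t counter;   // updated by the CPU, still dirty in the cache
uint8_t  rx[2];     // filled by DMA behind the cache's back

// Invalidating rx's line(s) after the DMA transfer would also discard the
// cached, not-yet-written-back update to 'counter':
// SCB_InvalidateDCache_by_Addr(rx, sizeof(rx));   // would corrupt 'counter'

// CacheAlignedBuffer sidesteps this by aligning and padding its storage so
// the invalidated lines contain nothing but the buffer itself:
CacheAlignedBuffer<uint8_t, 2> rx_safe;
```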
```diff
@@ -466,8 +476,31 @@ void SPI::start_transfer(const void *tx_buffer, int tx_length, void *rx_buffer,
 
    _callback = callback;
    _irq.callback(&SPI::irq_handler_asynch);
+
+#if __DCACHE_PRESENT
+    // On devices with a cache, we need to carefully manage the Tx and Rx buffer cache invalidation.
+    // We can assume that asynchronous SPI implementations might rely on DMA, and that DMA will
+    // not interoperate with the CPU cache. So, manual flushing/invalidation will be required.
+    // This page is very useful for how to do this correctly:
+    // https://community.st.com/t5/stm32-mcus-products/maintaining-cpu-data-cache-coherence-for-dma-buffers/td-p/95746
+    if (tx_length > 0) {
+        // For chips with a cache, we need to evict the Tx data from cache to main memory.
+        // This ensures that the DMA controller can see the most up-to-date copy of the data.
+        SCB_CleanDCache_by_Addr(const_cast<void *>(tx_buffer), tx_length);
+    }
+
+    // Additionally, we have to make sure that there aren't any pending changes which could be written back
+    // to the Rx buffer memory by the cache at a later date, corrupting the DMA results.
+    if (rx_length > 0) {
+        SCB_InvalidateDCache_by_Addr(rx_buffer, rx_length);
+    }
+    _transfer_in_progress_rx_buffer = rx_buffer;
+    _transfer_in_progress_rx_len = rx_length;
+#endif
+
    _transfer_in_progress = true;
-    spi_master_transfer(&_peripheral->spi, tx_buffer, tx_length, rx_buffer, rx_length, bit_width, _irq.entry(), event, _usage);
+
+    _transfer_in_progress_uses_dma = spi_master_transfer(&_peripheral->spi, tx_buffer, tx_length, rx_buffer, rx_length, bit_width, _irq.entry(), event, _usage);
 }
 
 void SPI::lock_deep_sleep()
```
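Distilled, this is the standard three-step coherence protocol described in the ST community article linked from the new comment. A condensed sketch of the whole pattern (start_dma() and wait_dma() are hypothetical stand-ins for the HAL machinery, not functions from this commit):

```cpp
// Hypothetical helpers standing in for the vendor HAL plumbing:
void start_dma(const void *tx, int32_t tx_len, void *rx, int32_t rx_len);
void wait_dma();

void dma_transfer_with_cache(const void *tx, int32_t tx_len, void *rx, int32_t rx_len)
{
    // 1. Clean (write back) the Tx range so the DMA engine reads what the CPU wrote.
    SCB_CleanDCache_by_Addr(const_cast<void *>(tx), tx_len);

    // 2. Invalidate the Rx range up front so no dirty line can later be evicted
    //    on top of DMA-written bytes.
    SCB_InvalidateDCache_by_Addr(rx, rx_len);

    start_dma(tx, tx_len, rx, rx_len);
    wait_dma();

    // 3. Invalidate the Rx range again: the cache may have speculatively refilled
    //    those lines mid-transfer, so force the next reads to come from main RAM.
    SCB_InvalidateDCache_by_Addr(rx, rx_len);
}
```

Step 3 is what this commit adds to irq_handler_asynch() and abort_transfer(), gated on the new return value of spi_master_transfer() so that interrupt-driven transfers (where the cache, not RAM, holds the freshest data) are not corrupted.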
```diff
@@ -508,7 +541,17 @@ void SPI::dequeue_transaction()
 void SPI::irq_handler_asynch(void)
 {
    int event = spi_irq_handler_asynch(&_peripheral->spi);
-    if (_callback && (event & SPI_EVENT_ALL)) {
+    if ((event & SPI_EVENT_ALL)) {
+
+#if __DCACHE_PRESENT
+        if (_transfer_in_progress_uses_dma && _transfer_in_progress_rx_len > 0) {
+            // If the cache is present, invalidate the Rx data so it's loaded from main RAM.
+            // We only want to do this if DMA actually got used for the transfer because, if interrupts
+            // were used instead, the cache might have the correct data and NOT the main memory.
+            SCB_InvalidateDCache_by_Addr(_transfer_in_progress_rx_buffer, _transfer_in_progress_rx_len);
+        }
+#endif
+
        // If using GPIO CS mode, unless we were asked to keep the peripheral selected, deselect it.
        // If there's another transfer queued, we *do* want to deselect the peripheral now.
        // It will be reselected in start_transfer() which is called by dequeue_transaction() below.
@@ -518,7 +561,10 @@ void SPI::irq_handler_asynch(void)
        _transfer_in_progress = false;
 
        unlock_deep_sleep();
-        _callback.call(event & SPI_EVENT_ALL);
+
+        if (_callback) {
+            _callback.call(event & SPI_EVENT_ALL);
+        }
    }
 #if MBED_CONF_DRIVERS_SPI_TRANSACTION_QUEUE_LEN
    if (event & (SPI_EVENT_ALL | SPI_EVENT_INTERNAL_TRANSFER_COMPLETE)) {
```

Beyond the cache handling, note the restructured condition: previously everything in this block — CS deselection, clearing _transfer_in_progress, unlock_deep_sleep() — was skipped whenever no user callback was registered; now only the callback invocation itself is guarded.

hal/include/hal/spi_api.h

Lines changed: 16 additions & 2 deletions
```diff
@@ -25,6 +25,8 @@
 #include "hal/dma_api.h"
 #include "hal/buffer.h"
 
+#include <stdbool.h>
+
 #if DEVICE_SPI
 
 /**
```

(The new <stdbool.h> include is needed because this is a C header and spi_master_transfer() now returns bool, as the next hunks show.)
```diff
@@ -393,7 +395,11 @@ const PinMap *spi_slave_cs_pinmap(void);
  * @{
  */
 
-/** Begin the SPI transfer. Buffer pointers and lengths are specified in tx_buff and rx_buff
+/** Begin the asynchronous SPI transfer. Buffer pointers and lengths are specified in tx_buff and rx_buff.
+ *
+ * If the device has a data cache, the Tx data is guaranteed to have been flushed from the cache
+ * to main memory already. Additionally, the Rx buffer is guaranteed to be cache aligned, and will
+ * be invalidated by the SPI layer after the transfer is complete.
  *
  * @param[in] obj The SPI object that holds the transfer information
  * @param[in] tx The transmit buffer
```
```diff
@@ -404,8 +410,16 @@ const PinMap *spi_slave_cs_pinmap(void);
  * @param[in] event The logical OR of events to be registered
  * @param[in] handler SPI interrupt handler
  * @param[in] hint A suggestion for how to use DMA with this transfer
+ *
+ * @return True if DMA was actually used for the transfer, false otherwise (if interrupts or another CPU-based
+ * method is used to do the transfer).
+ *
+ * @note On MCUs with a data cache, the return value is used to determine if a cache invalidation needs to be done
+ * after the transfer is complete. If this function returns true, the driver layer will cache invalidate the Rx buffer under
+ * the assumption that the data needs to be re-read from main memory. Be careful, because if the read was not actually
+ * done by DMA, and the rx data is in the CPU cache, this invalidation will corrupt it.
  */
-void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint);
+bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint);
 
 /** The asynchronous IRQ handler
  *
```
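For HAL porters, the new contract implies an implementation shaped roughly like the following. This is a hypothetical vendor-side sketch, not code from this commit; the helper names (dma_channel_available(), setup_dma_transfer(), setup_interrupt_transfer()) are invented placeholders:

```cpp
// Hypothetical helpers -- stand-ins for a real target's driver plumbing:
bool dma_channel_available(spi_t *obj);
void setup_dma_transfer(spi_t *obj, const void *tx, size_t tx_length,
                        void *rx, size_t rx_length, uint8_t bit_width,
                        uint32_t handler, uint32_t event);
void setup_interrupt_transfer(spi_t *obj, const void *tx, size_t tx_length,
                              void *rx, size_t rx_length, uint8_t bit_width,
                              uint32_t handler, uint32_t event);

bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length,
                         void *rx, size_t rx_length, uint8_t bit_width,
                         uint32_t handler, uint32_t event, DMAUsage hint)
{
    // Honor the hint, but fall back to interrupts when DMA cannot service the
    // transfer (placeholder policy; real ports check channels, sizes, alignment).
    if (hint != DMA_USAGE_NEVER && dma_channel_available(obj)) {
        setup_dma_transfer(obj, tx, tx_length, rx, rx_length, bit_width, handler, event);
        return true;  // driver layer will invalidate the Rx buffer when done
    }

    setup_interrupt_transfer(obj, tx, tx_length, rx, rx_length, bit_width, handler, event);
    return false;     // CPU-driven: cached Rx data is authoritative; must NOT be invalidated
}
```

The key point is that the returned bool must reflect what actually happened, not what the hint requested — returning true for an interrupt-driven transfer would make the driver invalidate live cached data, which is exactly the corruption the @note warns about.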
