ESP32-S3: Support execute in place from PSRAM

EliteTK · EliteTK · commit 4697f616b124 · 2025-01-23T20:19:26.000Z
This implementation mirrors how the ESP-IDF implementation of this feature (which is based on the `Cache_Flash_To_SPIRAM_Copy` ROM function) works, except that it differs in a few key ways: The ESP-IDF seems to map `.text` and `.rodata` into the first and second 128 cache pages respectively (although looking at the linker scripts, I'm not sure how, but a runtime check confirmed this seemed to be the case). This is reflected in how the `Cache_Count_Flash_Pages` and `Cache_Flash_To_SPIRAM_Copy` ROM functions, and the ESP-IDF code executing them, work. The count function can only be made to count flash pages within the first 256 pages (of which there are 512 on the ESP32-S3). Likewise, the copy function will only copy flash pages which are mapped within the first 256 entries (across two calls). As the esp-hal handles mapping `.text` and `.rodata` differently, these ROM functions are technically not appropriate if more than 256 pages of flash (`.text` and `.rodata` combined) are in use by the application. Additionally, both functions contain bugs, one of which the IDF attempts to work around incorrectly, and the other of which the IDF does not appear to be aware of. Details of these bugs can be found on the IDF issue/PR tracker[0][1]. As a result, this commit contains a heavily modified/adjusted Rust rewrite of the reverse-engineered ROM code combined with a vague port of the ESP-IDF code. There are three additional noteworthy differences from the ESP-IDF version of the code: 1. The ESP-IDF allows the `.text` and `.rodata` segments to be mapped independently and separately, allowing only one to be mapped, but the current version of this code does not allow this flexibility. This could be implemented by checking the address of each page entry against the segment locations to determine which segment each address belongs to. 2. 
The ESP-IDF calls `cache_ll_l1_enable_bus(..., cache_ll_l1_get_bus(..., SOC_EXTRAM_DATA_HIGH, 0));` (functions from the ESP-IDF) in order to "Enable the most high bus, which is used for copying FLASH `.text` to PSRAM", but on the ESP32-S3, after careful inspection, these calls result in a no-op, as the address passed to `cache_ll_l1_get_bus` will result in an empty cache bus mask. It's currently unclear to me if this is a bug in the ESP-IDF code, or if this code (which from cursory investigation is probably not a no-op on the -S2) is solely targeting the ESP32-S3. 3. The ESP-IDF calls `Cache_Flash_To_SPIRAM_Copy` with an icache address when copying `.text` and a dcache address when copying `.rodata`. This affects which cache the reads will occur through, but the writes always go through a "spare page" (a name I came up with during reverse engineering) via the dcache. This code performs all reads through the dcache. I don't know if there's a proper reason to read through the correct cache when doing the copy, and this doesn't appear to have any negative impact. [0]: espressif/esp-idf#15262 [1]: espressif/esp-idf#15263
diff --git a/esp-hal/CHANGELOG.md b/esp-hal/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - SPI: Added support for 3-wire SPI (#2919)
 - Add separate config for Rx and Tx (UART) #2965
+- ESP32-S3: Support execute in place from PSRAM
 
 ### Changed
 
diff --git a/esp-hal/src/soc/esp32s3/psram.rs b/esp-hal/src/soc/esp32s3/psram.rs
@@ -109,6 +109,8 @@ pub struct PsramConfig {
     pub flash_frequency: FlashFreq,
     /// Frequency of PSRAM memory
     pub ram_frequency: SpiRamFreq,
+    /// Copy Flash to PSRAM
+    pub copy_flash: bool,
 }
 
 /// Initialize PSRAM to be used for data.
@@ -119,14 +121,21 @@ pub(crate) fn init_psram(config: PsramConfig) {
     let mut config = config;
     utils::psram_init(&mut config);
 
+    const MMU_PAGE_SIZE: u32 = 0x10000;
     const CONFIG_ESP32S3_INSTRUCTION_CACHE_SIZE: u32 = 0x4000;
     const CONFIG_ESP32S3_ICACHE_ASSOCIATED_WAYS: u8 = 8;
     const CONFIG_ESP32S3_INSTRUCTION_CACHE_LINE_SIZE: u8 = 32;
     const CONFIG_ESP32S3_DATA_CACHE_SIZE: u32 = 0x8000;
     const CONFIG_ESP32S3_DCACHE_ASSOCIATED_WAYS: u8 = 8;
     const CONFIG_ESP32S3_DATA_CACHE_LINE_SIZE: u8 = 32;
+    const MMU_INVALID: u32 = 1 << 14;
     const MMU_ACCESS_SPIRAM: u32 = 1 << 15;
-    const START_PAGE: u32 = 0;
+    const ICACHE_MMU_SIZE: usize = 0x800;
+    const FLASH_MMU_TABLE_SIZE: usize = ICACHE_MMU_SIZE / core::mem::size_of::<u32>();
+    const DR_REG_MMU_TABLE: u32 = 0x600C5000;
+
+    let mut free_page = 0;
+    let mut psram_size = config.size.get();
 
     extern "C" {
         fn rom_config_instruction_cache_mode(
@@ -161,14 +170,122 @@ pub(crate) fn init_psram(config: PsramConfig) {
             num: u32,
             fixed: u32,
         ) -> i32;
+
+        fn Cache_WriteBack_All();
+        fn Cache_Invalidate_Addr(addr: u32, size: u32);
+        fn rom_Cache_WriteBack_Addr(addr: u32, size: u32);
+    }
+
+    // Vaguely based off of the ESP-IDF equivalent code:
+    // https://github.com/espressif/esp-idf/blob/3c99557eeea4e0945e77aabac672fbef52294d54/components/esp_psram/mmu_psram_flash.c#L46-L134
+    if config.copy_flash {
+        const MMU_VALID: u32 = 0;
+        const MMU_TYPE: u32 = 1 << 15;
+        const MMU_ACCESS_FLASH: u32 = 0;
+        const MMU_VALID_VAL_MASK: u32 = 0x3fff;
+        const MMU_DBUS_VADDR_BASE: u32 = 0x3C000000;
+        const SPARE_PAGE: usize = FLASH_MMU_TABLE_SIZE - 1;
+        const SPARE_PAGE_DCACHE_ADDR: u32 = MMU_DBUS_VADDR_BASE + SPARE_PAGE as u32 * MMU_PAGE_SIZE;
+
+        let mmu_table_ptr = DR_REG_MMU_TABLE as *mut u32;
+
+        unsafe fn move_flash_to_psram_with_spare(mmu_table_ptr: *mut u32, target_entry: usize, psram_page: u32, spare_entry: usize) { // Copies the page visible through MMU entry `target_entry` into PSRAM page `psram_page`, using `spare_entry` as a temporary write window, then points `target_entry` at the PSRAM copy.
+            let target_entry_addr = MMU_DBUS_VADDR_BASE + target_entry as u32 * MMU_PAGE_SIZE; // data-bus address through which the target entry's page is read
+            let spare_entry_addr = MMU_DBUS_VADDR_BASE + spare_entry as u32 * MMU_PAGE_SIZE; // data-bus address through which the spare entry's page is written
+            unsafe { // SAFETY: the caller must pass a valid MMU table pointer and in-range entry indices; the spare entry's mapping is overwritten here and is NOT restored by this function.
+                mmu_table_ptr
+                    .add(spare_entry)
+                    .write_volatile(psram_page | MMU_ACCESS_SPIRAM); // map the destination PSRAM page at the spare entry
+                Cache_Invalidate_Addr(spare_entry_addr, MMU_PAGE_SIZE); // ROM call; presumably drops cache lines left over from the spare entry's previous mapping
+                core::ptr::copy_nonoverlapping(
+                    target_entry_addr as *const u8, // source: the page as currently mapped by the target entry
+                    spare_entry_addr as *mut u8, // destination: the PSRAM page, seen through the spare entry
+                    MMU_PAGE_SIZE as usize, // one whole MMU page
+                );
+                rom_Cache_WriteBack_Addr(spare_entry_addr, MMU_PAGE_SIZE); // ROM call; presumably flushes the copied data from the cache out to PSRAM
+                mmu_table_ptr
+                    .add(target_entry)
+                    .write_volatile(psram_page | MMU_ACCESS_SPIRAM); // retarget the original entry at the PSRAM copy
+            }
+        }
+
+        let spare_page_mapping = unsafe { mmu_table_ptr.add(SPARE_PAGE).read_volatile() };
+
+        // All entries mapping flash page 0 will be mapped to the same page later so are only
+        // counted once
+        let mut page0_seen = false;
+        let mut flash_pages = 0;
+        for i in 0..(FLASH_MMU_TABLE_SIZE - 1) {
+            let mapping = unsafe { mmu_table_ptr.add(i).read_volatile() };
+            if mapping & (MMU_INVALID | MMU_TYPE) == MMU_VALID | MMU_ACCESS_FLASH {
+                if mapping & MMU_VALID_VAL_MASK == 0 {
+                    if page0_seen {
+                        continue;
+                    }
+                    page0_seen = true;
+                }
+                flash_pages += 1;
+            }
+        }
+
+        if flash_pages > (psram_size / MMU_PAGE_SIZE as usize) as u32 {
+            panic!("PSRAM is too small to fit a copy of flash");
+        }
+
+        let mut page0_page = None;
+
+        unsafe { Cache_WriteBack_All() };
+        for i in 0..(FLASH_MMU_TABLE_SIZE - 1) {
+            let mapping = unsafe { mmu_table_ptr.add(i).read_volatile() };
+            if mapping & (MMU_INVALID | MMU_TYPE) != MMU_VALID | MMU_ACCESS_FLASH {
+                continue;
+            }
+            if mapping & MMU_VALID_VAL_MASK == 0 {
+                match page0_page {
+                    Some(page) => {
+                        unsafe {
+                            mmu_table_ptr
+                                .add(i)
+                                .write_volatile(page | MMU_ACCESS_SPIRAM)
+                        };
+                        continue;
+                    }
+                    None => page0_page = Some(free_page),
+                }
+            }
+            unsafe { move_flash_to_psram_with_spare(mmu_table_ptr, i, free_page, SPARE_PAGE) };
+            free_page += 1;
+        }
+
+        // Restore spare page mapping
+        unsafe {
+            mmu_table_ptr
+                .add(SPARE_PAGE)
+                .write_volatile(spare_page_mapping);
+            Cache_Invalidate_Addr(SPARE_PAGE_DCACHE_ADDR, MMU_PAGE_SIZE);
+        }
+
+        // Special handling if the spare page was mapped to flash
+        if spare_page_mapping & (MMU_INVALID | MMU_TYPE) == MMU_VALID | MMU_ACCESS_FLASH {
+            unsafe {
+                // We're running from ram so using the first page should not cause issues
+                const SECOND_SPARE: usize = 0;
+                let second_spare_mapping = mmu_table_ptr.add(SECOND_SPARE).read_volatile();
+
+                move_flash_to_psram_with_spare(mmu_table_ptr, SPARE_PAGE, free_page, SECOND_SPARE);
+
+                // Restore spare page mapping
+                mmu_table_ptr.add(0).write_volatile(second_spare_mapping);
+                Cache_Invalidate_Addr(MMU_DBUS_VADDR_BASE + SECOND_SPARE as u32 * MMU_PAGE_SIZE, MMU_PAGE_SIZE);
+            }
+            free_page += 1;
+        }
+
+        psram_size -= free_page as usize * MMU_PAGE_SIZE as usize;
     }
 
     let start = unsafe {
-        const MMU_PAGE_SIZE: u32 = 0x10000;
-        const ICACHE_MMU_SIZE: usize = 0x800;
-        const FLASH_MMU_TABLE_SIZE: usize = ICACHE_MMU_SIZE / core::mem::size_of::<u32>();
-        const MMU_INVALID: u32 = 1 << 14;
-        const DR_REG_MMU_TABLE: u32 = 0x600C5000;
+        let mmu_table_ptr = DR_REG_MMU_TABLE as *const u32;
 
         // calculate the PSRAM start address to map
         // the linker scripts can produce a gap between mapped IROM and DROM segments
@@ -177,7 +294,6 @@ pub(crate) fn init_psram(config: PsramConfig) {
         //
         // More general information about the MMU can be found here:
         // https://docs.espressif.com/projects/esp-idf/en/stable/esp32s3/api-reference/system/mm.html#introduction
-        let mmu_table_ptr = DR_REG_MMU_TABLE as *const u32;
         let mut mapped_pages = 0;
         for i in (0..FLASH_MMU_TABLE_SIZE).rev() {
             if mmu_table_ptr.add(i).read_volatile() != MMU_INVALID {
@@ -208,9 +324,9 @@ pub(crate) fn init_psram(config: PsramConfig) {
         if cache_dbus_mmu_set(
             MMU_ACCESS_SPIRAM,
             start,
-            START_PAGE << 16,
+            free_page << 16,
             64,
-            config.size.get() as u32 / 1024 / 64, // number of pages to map
+            (psram_size / MMU_PAGE_SIZE as usize) as u32, // number of pages to map
             0,
         ) != 0
         {