@@ -272,6 +272,21 @@ void hal_delay_us(uint32_t us)
272272 ;
273273}
274274
275+ /**
276+ * Get current time in microseconds (for benchmarking)
277+ */
278+ uint64_t hal_get_timer_us (void )
279+ {
280+ uint64_t cntpct = timer_get_count ();
281+ uint64_t cntfrq = timer_get_freq ();
282+
283+ if (cntfrq == 0 )
284+ cntfrq = TIMER_CLK_FREQ ;
285+
286+ /* Convert to microseconds: (count * 1000000) / freq */
287+ return (cntpct * 1000000ULL ) / cntfrq ;
288+ }
289+
275290
276291/* ============================================================================
277292 * QSPI Flash Driver (GQSPI)
@@ -473,7 +488,9 @@ static void flush_dcache_range(uintptr_t start, uintptr_t end)
473488 __asm__ volatile ("dsb sy" : : : "memory" );
474489}
475490
476- /* Wait for DMA completion */
491+ /* Wait for DMA completion
492+ * Returns: 0 on success, -1 on timeout
493+ */
477494static int qspi_dma_wait (void )
478495{
479496 uint32_t timeout = GQSPIDMA_TIMEOUT_TRIES ;
@@ -483,6 +500,8 @@ static int qspi_dma_wait(void)
483500
484501 if (timeout == 0 ) {
485502 QSPI_DEBUG_PRINTF ("QSPI: DMA timeout\n" );
503+ /* Clear any pending interrupts */
504+ GQSPIDMA_ISR = GQSPIDMA_ISR_ALL_MASK ;
486505 return -1 ;
487506 }
488507
@@ -782,10 +801,12 @@ static int qspi_transfer_qread_dma(QspiDev_t *dev, const uint8_t *cmd, uint32_t
782801
783802 /* DMA RX Phase */
784803 if (ret == 0 && rxLen > 0 ) {
785- uint32_t remaining = rxLen ;
804+ uint32_t remaining ;
786805 uint32_t xferSz ;
787806
788- /* Check alignment - DMA requires cache-line aligned buffer */
807+ /* Check alignment - DMA requires cache-line aligned buffer.
808+ * If unaligned or not a multiple of 4 bytes, use temp buffer.
809+ * CRITICAL: GenFIFO transfer size must match DMA size! */
789810 if (((uintptr_t )rxData & (GQSPI_DMA_ALIGN - 1 )) || (rxLen & 3 )) {
790811 /* Use temp buffer for unaligned data */
791812 dmaPtr = dma_tmpbuf ;
@@ -799,6 +820,9 @@ static int qspi_transfer_qread_dma(QspiDev_t *dev, const uint8_t *cmd, uint32_t
799820 dmaLen = rxLen ;
800821 }
801822
823+ /* GenFIFO must request the same number of bytes as DMA expects */
824+ remaining = dmaLen ;
825+
802826 /* Setup DMA destination */
803827 GQSPIDMA_DST = ((uintptr_t )dmaPtr & 0xFFFFFFFFUL );
804828 GQSPIDMA_DST_MSB = ((uintptr_t )dmaPtr >> 32 );
@@ -839,7 +863,7 @@ static int qspi_transfer_qread_dma(QspiDev_t *dev, const uint8_t *cmd, uint32_t
839863 /* Invalidate cache after DMA */
840864 flush_dcache_range ((uintptr_t )dmaPtr , (uintptr_t )dmaPtr + dmaLen );
841865
842- /* Copy from temp buffer if needed */
866+ /* Copy from temp buffer if needed (only copy requested bytes) */
843867 if (ret == 0 && useTemp ) {
844868 memcpy (rxData , dmaPtr , rxLen );
845869 }
@@ -1224,11 +1248,12 @@ static void qspi_init(void)
12241248 GQSPI_ISR = GQSPI_IXR_ALL_MASK ;
12251249 dsb ();
12261250
1227- /* Preserve PLM's CFG but switch to IO mode for our transfers
1251+ /* Preserve PLM's CFG but set IO mode for initial commands (ID read, etc.)
12281252 * PLM: 0xA0080010 = DMA mode | manual start | WP_HOLD | CLK_POL
1229- * Key: Keep manual start mode (bit 29) and clock settings */
1253+ * Key: Keep manual start mode (bit 29) and clock settings
1254+ * Note: qspi_transfer_qread_dma() will switch to DMA mode for reads */
12301255 cfg = (cfg & ~GQSPI_CFG_MODE_EN_MASK ); /* Clear mode bits */
1231- cfg |= GQSPI_CFG_MODE_EN_IO ; /* Set IO mode */
1256+ cfg |= GQSPI_CFG_MODE_EN_IO ; /* Set IO mode for init */
12321257 GQSPI_CFG = cfg ;
12331258 dsb ();
12341259
@@ -1237,6 +1262,18 @@ static void qspi_init(void)
12371262 GQSPI_RX_THRESH = 1 ;
12381263 GQSPI_GF_THRESH = 16 ;
12391264
1265+ #ifndef GQSPI_MODE_IO
1266+ /* Initialize DMA controller - this was missing compared to zynq.c!
1267+ * Without this, DMA transfers can hang or timeout because the DMA
1268+ * controller is in an undefined state after PLM handoff.
1269+ */
1270+ GQSPIDMA_CTRL = GQSPIDMA_CTRL_DEF ;
1271+ GQSPIDMA_CTRL2 = GQSPIDMA_CTRL2_DEF ;
1272+ GQSPIDMA_ISR = GQSPIDMA_ISR_ALL_MASK ; /* Clear all pending interrupts */
1273+ GQSPIDMA_IER = GQSPIDMA_ISR_ALL_MASK ; /* Enable all interrupts */
1274+ dsb ();
1275+ #endif
1276+
12401277 QSPI_DEBUG_PRINTF ("QSPI: After config - CFG=0x%08x\n" , GQSPI_CFG );
12411278
12421279 /* Configure device for single flash (lower) first */
@@ -1353,9 +1390,35 @@ void hal_prepare_boot(void)
13531390 }
13541391#endif
13551392
1356- /* Memory barriers before jumping to application */
1357- dsb ();
1358- isb ();
1393+ /* Clean and invalidate caches for the loaded application.
1394+ * The application was written to RAM via D-cache, but the CPU will
1395+ * fetch instructions via I-cache from main memory. We must:
1396+ * 1. Clean D-cache (flush dirty data to memory)
1397+ * 2. Invalidate I-cache (ensure fresh instruction fetch)
1398+ */
1399+
1400+ /* Clean entire D-cache to Point of Coherency */
1401+ __asm__ volatile ("dsb sy" );
1402+
1403+ /* Clean D-cache for application region (0x10000000, 1MB should be enough) */
1404+ {
1405+ uintptr_t addr ;
1406+ uintptr_t end = 0x10000000 + (1 * 1024 * 1024 );
1407+ for (addr = 0x10000000 ; addr < end ; addr += 64 ) {
1408+ /* DC CVAC - Clean data cache line by VA to PoC */
1409+ __asm__ volatile ("dc cvac, %0" : : "r" (addr ));
1410+ }
1411+ }
1412+
1413+ /* Data synchronization barrier - ensure clean completes */
1414+ __asm__ volatile ("dsb sy" );
1415+
1416+ /* Invalidate instruction cache to ensure fresh code is fetched */
1417+ __asm__ volatile ("ic iallu" );
1418+
1419+ /* Ensure cache invalidation completes before jumping */
1420+ __asm__ volatile ("dsb sy" );
1421+ __asm__ volatile ("isb" );
13591422}
13601423
13611424#ifdef MMU
@@ -1504,9 +1567,14 @@ int ext_flash_read(uintptr_t address, uint8_t *data, int len)
15041567 return -1 ;
15051568 }
15061569
1570+ QSPI_DEBUG_PRINTF ("ext_flash_read: addr=0x%lx len=%d\n" ,
1571+ (unsigned long )address , len );
1572+
15071573 if (qspiDev .stripe ) {
15081574 /* For dual parallel the address is divided by 2 */
15091575 addr /= 2 ;
1576+ QSPI_DEBUG_PRINTF (" stripe mode: flash_addr=0x%lx\n" ,
1577+ (unsigned long )addr );
15101578 }
15111579
15121580 /* Use Quad Read command (0x6C) with 4-byte address */
@@ -1523,7 +1591,22 @@ int ext_flash_read(uintptr_t address, uint8_t *data, int len)
15231591 ret = qspi_transfer_qread_dma (& qspiDev , cmd , 5 , data , len , GQSPI_DUMMY_READ );
15241592#endif
15251593
1526- return ret ;
1594+ /* On DMA timeout, fill buffer with 0xFF to simulate unwritten flash.
1595+ * This handles reads to partition trailer areas that haven't been written.
1596+ * wolfBoot will see 0xFF (not magic) and handle appropriately. */
1597+ if (ret != 0 ) {
1598+ memset (data , 0xFF , len );
1599+ }
1600+
1601+ QSPI_DEBUG_PRINTF ("ext_flash_read: ret=%d data[0-7]=%02x %02x %02x %02x %02x %02x %02x %02x\n" ,
1602+ ret ,
1603+ len > 0 ? data [0 ] : 0 , len > 1 ? data [1 ] : 0 ,
1604+ len > 2 ? data [2 ] : 0 , len > 3 ? data [3 ] : 0 ,
1605+ len > 4 ? data [4 ] : 0 , len > 5 ? data [5 ] : 0 ,
1606+ len > 6 ? data [6 ] : 0 , len > 7 ? data [7 ] : 0 );
1607+
1608+ /* Return bytes read on success (like zynq.c) */
1609+ return (ret == 0 ) ? len : ret ;
15271610}
15281611
15291612int ext_flash_erase (uintptr_t address , int len )
0 commit comments