Skip to content

Commit 28ea70b

Browse files
committed
GX2+TCL: Reimplement command buffer submission
- GX2 utilizes TCL(.rpl) API for command submission instead of directly writing to an internal GPU fifo
- Submission & retire timestamps are correctly implemented as incremental counters
- Command buffering behaviour matches console
- Fixes race conditions on aarch64
1 parent 96765e4 commit 28ea70b

21 files changed

+757
-468
lines changed

src/Cafe/HW/Latte/Core/Latte.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,6 @@ struct LatteGPUState_t
4747
gx2GPUSharedArea_t* sharedArea; // quick reference to shared area
4848
MPTR sharedAreaAddr;
4949
// other
50-
// todo: Currently we have the command buffer logic implemented as a FIFO ringbuffer. On real HW it's handled as a series of command buffers that are pushed individually.
51-
std::atomic<uint64> lastSubmittedCommandBufferTimestamp;
5250
uint32 gx2InitCalled; // incremented every time GX2Init() is called
5351
// OpenGL control
5452
uint32 glVendor; // GLVENDOR_*
@@ -75,8 +73,6 @@ struct LatteGPUState_t
7573

7674
extern LatteGPUState_t LatteGPUState;
7775

78-
extern uint8* gxRingBufferReadPtr; // currently active read pointer (gx2 ring buffer or display list)
79-
8076
// texture
8177

8278
#include "Cafe/HW/Latte/Core/LatteTexture.h"

src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp

Lines changed: 74 additions & 134 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "Cafe/HW/Latte/Core/LattePM4.h"
1414

1515
#include "Cafe/OS/libs/coreinit/coreinit_Time.h"
16+
#include "Cafe/OS/libs/TCL/TCL.h" // TCL currently handles the GPU command ringbuffer
1617

1718
#include "Cafe/CafeSystem.h"
1819

@@ -28,11 +29,6 @@ typedef uint32be* LatteCMDPtr;
2829
#define LatteReadCMD() ((uint32)*(cmd++))
2930
#define LatteSkipCMD(_nWords) cmd += (_nWords)
3031

31-
uint8* gxRingBufferReadPtr; // currently active read pointer (gx2 ring buffer or display list)
32-
uint8* gx2CPParserDisplayListPtr;
33-
uint8* gx2CPParserDisplayListStart; // used for debugging
34-
uint8* gx2CPParserDisplayListEnd;
35-
3632
void LatteThread_HandleOSScreen();
3733

3834
void LatteThread_Exit();
@@ -155,16 +151,12 @@ void LatteCP_signalEnterWait()
155151
*/
156152
uint32 LatteCP_readU32Deprc()
157153
{
158-
uint32 v;
159-
uint8* gxRingBufferWritePtr;
160-
sint32 readDistance;
161154
// no display list active
162155
while (true)
163156
{
164-
gxRingBufferWritePtr = gx2WriteGatherPipe.writeGatherPtrGxBuffer[GX2::sGX2MainCoreIndex];
165-
readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
166-
if (readDistance != 0)
167-
break;
157+
uint32 cmdWord;
158+
if ( TCL::TCLGPUReadRBWord(cmdWord) )
159+
return cmdWord;
168160

169161
g_renderer->NotifyLatteCommandProcessorIdle(); // let the renderer know in case it wants to flush any commands
170162
performanceMonitor.gpuTime_idleTime.beginMeasuring();
@@ -175,56 +167,8 @@ uint32 LatteCP_readU32Deprc()
175167
}
176168
LatteThread_HandleOSScreen(); // check if new frame was presented via OSScreen API
177169

178-
readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
179-
if (readDistance != 0)
180-
break;
181-
if (Latte_GetStopSignal())
182-
LatteThread_Exit();
183-
184-
// still no command data available, do some other tasks
185-
LatteTiming_HandleTimedVsync();
186-
LatteAsyncCommands_checkAndExecute();
187-
std::this_thread::yield();
188-
performanceMonitor.gpuTime_idleTime.endMeasuring();
189-
}
190-
v = *(uint32*)gxRingBufferReadPtr;
191-
gxRingBufferReadPtr += 4;
192-
#ifdef CEMU_DEBUG_ASSERT
193-
if (v == 0xcdcdcdcd)
194-
assert_dbg();
195-
#endif
196-
v = _swapEndianU32(v);
197-
return v;
198-
}
199-
200-
void LatteCP_waitForNWords(uint32 numWords)
201-
{
202-
uint8* gxRingBufferWritePtr;
203-
sint32 readDistance;
204-
bool isFlushed = false;
205-
sint32 waitDistance = numWords * sizeof(uint32be);
206-
// no display list active
207-
while (true)
208-
{
209-
gxRingBufferWritePtr = gx2WriteGatherPipe.writeGatherPtrGxBuffer[GX2::sGX2MainCoreIndex];
210-
readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
211-
if (readDistance < 0)
212-
return; // wrap around means there is at least one full command queued after this
213-
if (readDistance >= waitDistance)
214-
break;
215-
g_renderer->NotifyLatteCommandProcessorIdle(); // let the renderer know in case it wants to flush any commands
216-
performanceMonitor.gpuTime_idleTime.beginMeasuring();
217-
// no command data available, spin in a busy loop for a while then check again
218-
for (sint32 busy = 0; busy < 80; busy++)
219-
{
220-
_mm_pause();
221-
}
222-
readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
223-
if (readDistance < 0)
224-
return; // wrap around means there is at least one full command queued after this
225-
if (readDistance >= waitDistance)
226-
break;
227-
170+
if ( TCL::TCLGPUReadRBWord(cmdWord) )
171+
return cmdWord;
228172
if (Latte_GetStopSignal())
229173
LatteThread_Exit();
230174

@@ -234,6 +178,7 @@ void LatteCP_waitForNWords(uint32 numWords)
234178
std::this_thread::yield();
235179
performanceMonitor.gpuTime_idleTime.endMeasuring();
236180
}
181+
UNREACHABLE;
237182
}
238183

239184
template<uint32 readU32()>
@@ -270,21 +215,23 @@ void LatteCP_itIndirectBufferDepr(LatteCMDPtr cmd, uint32 nWords)
270215
cemu_assert_debug(nWords == 3);
271216
uint32 physicalAddress = LatteReadCMD();
272217
uint32 physicalAddressHigh = LatteReadCMD(); // unused
273-
uint32 sizeInDWords = LatteReadCMD();
274-
uint32 displayListSize = sizeInDWords * 4;
275-
DrawPassContext drawPassCtx;
218+
uint32 sizeInU32s = LatteReadCMD();
276219

277220
#ifdef LATTE_CP_LOGGING
278221
if (GetAsyncKeyState('A'))
279222
LatteCP_DebugPrintCmdBuffer(MEMPTR<uint32be>(physicalAddress), displayListSize);
280223
#endif
281224

282-
uint32be* buf = MEMPTR<uint32be>(physicalAddress).GetPtr();
283-
drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInDWords);
225+
if (sizeInU32s > 0)
226+
{
227+
DrawPassContext drawPassCtx;
228+
uint32be* buf = MEMPTR<uint32be>(physicalAddress).GetPtr();
229+
drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInU32s);
284230

285-
LatteCP_processCommandBuffer(drawPassCtx);
286-
if (drawPassCtx.isWithinDrawPass())
287-
drawPassCtx.endDrawPass();
231+
LatteCP_processCommandBuffer(drawPassCtx);
232+
if (drawPassCtx.isWithinDrawPass())
233+
drawPassCtx.endDrawPass();
234+
}
288235
}
289236

290237
// pushes the command buffer to the stack
@@ -294,11 +241,12 @@ void LatteCP_itIndirectBuffer(LatteCMDPtr cmd, uint32 nWords, DrawPassContext& d
294241
uint32 physicalAddress = LatteReadCMD();
295242
uint32 physicalAddressHigh = LatteReadCMD(); // unused
296243
uint32 sizeInDWords = LatteReadCMD();
297-
uint32 displayListSize = sizeInDWords * 4;
298-
cemu_assert_debug(displayListSize >= 4);
299-
300-
uint32be* buf = MEMPTR<uint32be>(physicalAddress).GetPtr();
301-
drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInDWords);
244+
if (sizeInDWords > 0)
245+
{
246+
uint32 displayListSize = sizeInDWords * 4;
247+
uint32be* buf = MEMPTR<uint32be>(physicalAddress).GetPtr();
248+
drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInDWords);
249+
}
302250
}
303251

304252
LatteCMDPtr LatteCP_itStreamoutBufferUpdate(LatteCMDPtr cmd, uint32 nWords)
@@ -565,26 +513,55 @@ LatteCMDPtr LatteCP_itMemWrite(LatteCMDPtr cmd, uint32 nWords)
565513
if (word1 == 0x40000)
566514
{
567515
// write U32
568-
*memPtr = word2;
516+
stdx::atomic_ref<uint32be> atomicRef(*memPtr);
517+
atomicRef.store(word2);
569518
}
570519
else if (word1 == 0x00000)
571520
{
572-
// write U64 (as two U32)
573-
// note: The U32s are swapped
574-
memPtr[0] = word2;
575-
memPtr[1] = word3;
521+
// write U64
522+
// note: The U32s are swapped here, but needs verification. Also, it seems like the two U32 halves are written independently and the U64 as a whole is not atomic -> investigate
523+
stdx::atomic_ref<uint64be> atomicRef(*(uint64be*)memPtr);
524+
atomicRef.store(((uint64le)word2 << 32) | word3);
576525
}
577526
else if (word1 == 0x20000)
578527
{
579528
// write U64 (little endian)
580-
memPtr[0] = _swapEndianU32(word2);
581-
memPtr[1] = _swapEndianU32(word3);
529+
stdx::atomic_ref<uint64le> atomicRef(*(uint64le*)memPtr);
530+
atomicRef.store(((uint64le)word3 << 32) | word2);
582531
}
583532
else
584533
cemu_assert_unimplemented();
585534
return cmd;
586535
}
587536

537+
LatteCMDPtr LatteCP_itEventWriteEOP(LatteCMDPtr cmd, uint32 nWords)
538+
{
539+
cemu_assert_debug(nWords == 5);
540+
uint32 word0 = LatteReadCMD();
541+
uint32 word1 = LatteReadCMD();
542+
uint32 word2 = LatteReadCMD();
543+
uint32 word3 = LatteReadCMD(); // value low bits
544+
uint32 word4 = LatteReadCMD(); // value high bits
545+
546+
cemu_assert_debug(word2 == 0x40000000 || word2 == 0x42000000);
547+
548+
if (word0 == 0x504 && (word2&0x40000000)) // todo - figure out the flags
549+
{
550+
stdx::atomic_ref<uint64be> atomicRef(*(uint64be*)memory_getPointerFromPhysicalOffset(word1));
551+
uint64 val = ((uint64)word4 << 32) | word3;
552+
atomicRef.store(val);
553+
}
554+
else
555+
{ cemu_assert_unimplemented();
556+
}
557+
bool triggerInterrupt = (word2 & 0x2000000) != 0;
558+
if (triggerInterrupt)
559+
{
560+
// todo - timestamp interrupt
561+
}
562+
TCL::TCLGPUNotifyNewRetirementTimestamp();
563+
return cmd;
564+
}
588565

589566
LatteCMDPtr LatteCP_itMemSemaphore(LatteCMDPtr cmd, uint32 nWords)
590567
{
@@ -783,16 +760,6 @@ LatteCMDPtr LatteCP_itDrawImmediate(LatteCMDPtr cmd, uint32 nWords, DrawPassCont
783760

784761
drawPassCtx.executeDraw(count, false, _tempIndexArrayMPTR);
785762
return cmd;
786-
787-
}
788-
789-
LatteCMDPtr LatteCP_itHLEFifoWrapAround(LatteCMDPtr cmd, uint32 nWords)
790-
{
791-
cemu_assert_debug(nWords == 1);
792-
uint32 unused = LatteReadCMD();
793-
gxRingBufferReadPtr = gx2WriteGatherPipe.gxRingBuffer;
794-
cmd = (LatteCMDPtr)gxRingBufferReadPtr;
795-
return cmd;
796763
}
797764

798765
LatteCMDPtr LatteCP_itHLESampleTimer(LatteCMDPtr cmd, uint32 nWords)
@@ -819,16 +786,6 @@ LatteCMDPtr LatteCP_itHLESpecialState(LatteCMDPtr cmd, uint32 nWords)
819786
return cmd;
820787
}
821788

822-
LatteCMDPtr LatteCP_itHLESetRetirementTimestamp(LatteCMDPtr cmd, uint32 nWords)
823-
{
824-
cemu_assert_debug(nWords == 2);
825-
uint32 timestampHigh = (uint32)LatteReadCMD();
826-
uint32 timestampLow = (uint32)LatteReadCMD();
827-
uint64 timestamp = ((uint64)timestampHigh << 32ULL) | (uint64)timestampLow;
828-
GX2::__GX2NotifyNewRetirementTimestamp(timestamp);
829-
return cmd;
830-
}
831-
832789
LatteCMDPtr LatteCP_itHLEBeginOcclusionQuery(LatteCMDPtr cmd, uint32 nWords)
833790
{
834791
cemu_assert_debug(nWords == 1);
@@ -1145,9 +1102,10 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
11451102
LatteCMDPtr cmd, cmdStart, cmdEnd;
11461103
if (!drawPassCtx.PopCurrentCommandQueuePos(cmd, cmdStart, cmdEnd))
11471104
break;
1105+
uint32 itHeader;
11481106
while (cmd < cmdEnd)
11491107
{
1150-
uint32 itHeader = LatteReadCMD();
1108+
itHeader = LatteReadCMD();
11511109
uint32 itHeaderType = (itHeader >> 30) & 3;
11521110
if (itHeaderType == 3)
11531111
{
@@ -1361,11 +1319,6 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
13611319
LatteCP_itHLEEndOcclusionQuery(cmdData, nWords);
13621320
break;
13631321
}
1364-
case IT_HLE_SET_CB_RETIREMENT_TIMESTAMP:
1365-
{
1366-
LatteCP_itHLESetRetirementTimestamp(cmdData, nWords);
1367-
break;
1368-
}
13691322
case IT_HLE_BOTTOM_OF_PIPE_CB:
13701323
{
13711324
LatteCP_itHLEBottomOfPipeCB(cmdData, nWords);
@@ -1421,6 +1374,7 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
14211374
void LatteCP_ProcessRingbuffer()
14221375
{
14231376
sint32 timerRecheck = 0; // estimates how much CP processing time has elapsed based on the executed commands, if the value exceeds CP_TIMER_RECHECK then _handleTimers() is called
1377+
uint32be tmpBuffer[128];
14241378
while (true)
14251379
{
14261380
uint32 itHeader = LatteCP_readU32Deprc();
@@ -1429,10 +1383,13 @@ void LatteCP_ProcessRingbuffer()
14291383
{
14301384
uint32 itCode = (itHeader >> 8) & 0xFF;
14311385
uint32 nWords = ((itHeader >> 16) & 0x3FFF) + 1;
1432-
LatteCP_waitForNWords(nWords);
1433-
LatteCMDPtr cmd = (LatteCMDPtr)gxRingBufferReadPtr;
1434-
uint8* cmdEnd = gxRingBufferReadPtr + nWords * 4;
1435-
gxRingBufferReadPtr = cmdEnd;
1386+
cemu_assert(nWords < 128);
1387+
for (sint32 i=0; i<nWords; i++)
1388+
{
1389+
uint32 word = LatteCP_readU32Deprc();
1390+
tmpBuffer[i] = word;
1391+
}
1392+
LatteCMDPtr cmd = (LatteCMDPtr)tmpBuffer;
14361393
switch (itCode)
14371394
{
14381395
case IT_SURFACE_SYNC:
@@ -1599,6 +1556,11 @@ void LatteCP_ProcessRingbuffer()
15991556
timerRecheck += CP_TIMER_RECHECK / 512;
16001557
break;
16011558
}
1559+
case IT_EVENT_WRITE_EOP:
1560+
{
1561+
LatteCP_itEventWriteEOP(cmd, nWords);
1562+
break;
1563+
}
16021564
case IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER:
16031565
{
16041566
LatteCP_itHLECopyColorBufferToScanBuffer(cmd, nWords);
@@ -1637,12 +1599,6 @@ void LatteCP_ProcessRingbuffer()
16371599
timerRecheck += CP_TIMER_RECHECK / 128;
16381600
break;
16391601
}
1640-
case IT_HLE_FIFO_WRAP_AROUND:
1641-
{
1642-
LatteCP_itHLEFifoWrapAround(cmd, nWords);
1643-
timerRecheck += CP_TIMER_RECHECK / 512;
1644-
break;
1645-
}
16461602
case IT_HLE_SAMPLE_TIMER:
16471603
{
16481604
LatteCP_itHLESampleTimer(cmd, nWords);
@@ -1667,12 +1623,6 @@ void LatteCP_ProcessRingbuffer()
16671623
timerRecheck += CP_TIMER_RECHECK / 512;
16681624
break;
16691625
}
1670-
case IT_HLE_SET_CB_RETIREMENT_TIMESTAMP:
1671-
{
1672-
LatteCP_itHLESetRetirementTimestamp(cmd, nWords);
1673-
timerRecheck += CP_TIMER_RECHECK / 512;
1674-
break;
1675-
}
16761626
case IT_HLE_BOTTOM_OF_PIPE_CB:
16771627
{
16781628
LatteCP_itHLEBottomOfPipeCB(cmd, nWords);
@@ -1933,11 +1883,6 @@ void LatteCP_DebugPrintCmdBuffer(uint32be* bufferPtr, uint32 size)
19331883
cemuLog_log(LogType::Force, "{} IT_HLE_COPY_SURFACE_NEW", strPrefix);
19341884
break;
19351885
}
1936-
case IT_HLE_FIFO_WRAP_AROUND:
1937-
{
1938-
cemuLog_log(LogType::Force, "{} IT_HLE_FIFO_WRAP_AROUND", strPrefix);
1939-
break;
1940-
}
19411886
case IT_HLE_SAMPLE_TIMER:
19421887
{
19431888
cemuLog_log(LogType::Force, "{} IT_HLE_SAMPLE_TIMER", strPrefix);
@@ -1958,11 +1903,6 @@ void LatteCP_DebugPrintCmdBuffer(uint32be* bufferPtr, uint32 size)
19581903
cemuLog_log(LogType::Force, "{} IT_HLE_END_OCCLUSION_QUERY", strPrefix);
19591904
break;
19601905
}
1961-
case IT_HLE_SET_CB_RETIREMENT_TIMESTAMP:
1962-
{
1963-
cemuLog_log(LogType::Force, "{} IT_HLE_SET_CB_RETIREMENT_TIMESTAMP", strPrefix);
1964-
break;
1965-
}
19661906
case IT_HLE_BOTTOM_OF_PIPE_CB:
19671907
{
19681908
cemuLog_log(LogType::Force, "{} IT_HLE_BOTTOM_OF_PIPE_CB", strPrefix);

0 commit comments

Comments
 (0)