Skip to content

Commit ba09daf

Browse files
committed
PPCRec: Reenable float copy optimization
1 parent 557aff4 commit ba09daf

File tree

4 files changed

+84
-67
lines changed

4 files changed

+84
-67
lines changed

src/Cafe/HW/Espresso/Recompiler/IML/IMLDebug.cpp

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,30 @@ const char* IMLDebug_GetOpcodeName(const IMLInstruction* iml)
3636
return "MULS";
3737
else if (op == PPCREC_IML_OP_DIVIDE_SIGNED)
3838
return "DIVS";
39+
else if (op == PPCREC_IML_OP_FPR_ASSIGN)
40+
return "FMOV";
41+
else if (op == PPCREC_IML_OP_FPR_ADD)
42+
return "FADD";
43+
else if (op == PPCREC_IML_OP_FPR_SUB)
44+
return "FSUB";
45+
else if (op == PPCREC_IML_OP_FPR_MULTIPLY)
46+
return "FMUL";
47+
else if (op == PPCREC_IML_OP_FPR_DIVIDE)
48+
return "FDIV";
49+
else if (op == PPCREC_IML_OP_FPR_EXPAND_F32_TO_F64)
50+
return "F32TOF64";
51+
else if (op == PPCREC_IML_OP_FPR_ABS)
52+
return "FABS";
53+
else if (op == PPCREC_IML_OP_FPR_NEGATE)
54+
return "FNEG";
55+
else if (op == PPCREC_IML_OP_FPR_NEGATIVE_ABS)
56+
return "FNABS";
57+
else if (op == PPCREC_IML_OP_FPR_FLOAT_TO_INT)
58+
return "F2I";
59+
else if (op == PPCREC_IML_OP_FPR_INT_TO_FLOAT)
60+
return "I2F";
61+
else if (op == PPCREC_IML_OP_FPR_BITCAST_INT_TO_FLOAT)
62+
return "BITMOVE";
3963

4064
sprintf(_tempOpcodename, "OP0%02x_T%d", iml->operation, iml->type);
4165
return _tempOpcodename;
@@ -409,19 +433,24 @@ void IMLDebug_DisassembleInstruction(const IMLInstruction& inst, std::string& di
409433
strOutput.addFmt("{} [t{}+{}]", inst.op_storeLoad.copyWidth / 8, inst.op_storeLoad.registerMem.GetRegID(), inst.op_storeLoad.immS32);
410434
strOutput.addFmt(" = {} mode {}", IMLDebug_GetRegName(inst.op_storeLoad.registerData), inst.op_storeLoad.mode);
411435
}
436+
else if (inst.type == PPCREC_IML_TYPE_FPR_R)
437+
{
438+
strOutput.addFmt("{:<6} ", IMLDebug_GetOpcodeName(&inst));
439+
strOutput.addFmt("{}", IMLDebug_GetRegName(inst.op_fpr_r.regR));
440+
}
412441
else if (inst.type == PPCREC_IML_TYPE_FPR_R_R)
413442
{
414-
strOutput.addFmt("{:>6} ", IMLDebug_GetOpcodeName(&inst));
443+
strOutput.addFmt("{:<6} ", IMLDebug_GetOpcodeName(&inst));
415444
strOutput.addFmt("{}, {}", IMLDebug_GetRegName(inst.op_fpr_r_r.regR), IMLDebug_GetRegName(inst.op_fpr_r_r.regA));
416445
}
417446
else if (inst.type == PPCREC_IML_TYPE_FPR_R_R_R_R)
418447
{
419-
strOutput.addFmt("{:>6} ", IMLDebug_GetOpcodeName(&inst));
448+
strOutput.addFmt("{:<6} ", IMLDebug_GetOpcodeName(&inst));
420449
strOutput.addFmt("{}, {}, {}, {}", IMLDebug_GetRegName(inst.op_fpr_r_r_r_r.regR), IMLDebug_GetRegName(inst.op_fpr_r_r_r_r.regA), IMLDebug_GetRegName(inst.op_fpr_r_r_r_r.regB), IMLDebug_GetRegName(inst.op_fpr_r_r_r_r.regC));
421450
}
422451
else if (inst.type == PPCREC_IML_TYPE_FPR_R_R_R)
423452
{
424-
strOutput.addFmt("{:>6} ", IMLDebug_GetOpcodeName(&inst));
453+
strOutput.addFmt("{:<6} ", IMLDebug_GetOpcodeName(&inst));
425454
strOutput.addFmt("{}, {}, {}", IMLDebug_GetRegName(inst.op_fpr_r_r_r.regR), IMLDebug_GetRegName(inst.op_fpr_r_r_r.regA), IMLDebug_GetRegName(inst.op_fpr_r_r_r.regB));
426455
}
427456
else if (inst.type == PPCREC_IML_TYPE_CJUMP_CYCLE_CHECK)

src/Cafe/HW/Espresso/Recompiler/IML/IMLOptimizer.cpp

Lines changed: 52 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ void PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext_t* ppcI
2323
IMLInstruction* imlInstructionLoad = imlSegment->imlList.data() + imlIndexLoad;
2424
if (imlInstructionLoad->op_storeLoad.flags2.notExpanded)
2525
return;
26-
26+
boost::container::static_vector<sint32, 4> trackedMoves; // only track up to 4 copies
2727
IMLUsedRegisters registersUsed;
2828
sint32 scanRangeEnd = std::min<sint32>(imlIndexLoad + 25, imlSegment->imlList.size()); // don't scan too far (saves time, and the chance that the load+store can be merged becomes low at large distances)
2929
bool foundMatch = false;
@@ -54,8 +54,24 @@ void PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext_t* ppcI
5454
continue;
5555
}
5656
}
57-
58-
// check if FPR is overwritten (we can actually ignore read operations?)
57+
// if the FPR is copied then keep track of it. We can expand the copies instead of the original
58+
if (imlInstruction->type == PPCREC_IML_TYPE_FPR_R_R && imlInstruction->operation == PPCREC_IML_OP_FPR_ASSIGN && imlInstruction->op_fpr_r_r.regA.GetRegID() == fprIndex)
59+
{
60+
if (imlInstruction->op_fpr_r_r.regR.GetRegID() == fprIndex)
61+
{
62+
// unexpected no-op
63+
break;
64+
}
65+
if (trackedMoves.size() >= trackedMoves.capacity())
66+
{
67+
// we can't track any more moves, expand here
68+
lastStore = i;
69+
break;
70+
}
71+
trackedMoves.push_back(i);
72+
continue;
73+
}
74+
// check if FPR is overwritten
5975
imlInstruction->CheckRegisterUsage(&registersUsed);
6076
if (registersUsed.writtenGPR1.IsValidAndSameRegID(fprIndex) || registersUsed.writtenGPR2.IsValidAndSameRegID(fprIndex))
6177
break;
@@ -71,6 +87,24 @@ void PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext_t* ppcI
7187

7288
if (foundMatch)
7389
{
90+
// insert expand instructions for each target register of a move
91+
sint32 positionBias = 0;
92+
for (auto& trackedMove : trackedMoves)
93+
{
94+
sint32 realPosition = trackedMove + positionBias;
95+
IMLInstruction* imlMoveInstruction = imlSegment->imlList.data() + realPosition;
96+
if (realPosition >= lastStore)
97+
break; // expand is inserted before this move
98+
else
99+
lastStore++;
100+
101+
cemu_assert_debug(imlMoveInstruction->type == PPCREC_IML_TYPE_FPR_R_R && imlMoveInstruction->op_fpr_r_r.regA.GetRegID() == fprIndex);
102+
cemu_assert_debug(imlMoveInstruction->op_fpr_r_r.regA.GetRegFormat() == IMLRegFormat::F64);
103+
auto dstReg = imlMoveInstruction->op_fpr_r_r.regR;
104+
IMLInstruction* newExpand = PPCRecompiler_insertInstruction(imlSegment, realPosition+1); // one after the move
105+
newExpand->make_fpr_r(PPCREC_IML_OP_FPR_EXPAND_F32_TO_F64, dstReg);
106+
positionBias++;
107+
}
74108
// insert expand instruction after store
75109
IMLInstruction* newExpand = PPCRecompiler_insertInstruction(imlSegment, lastStore);
76110
newExpand->make_fpr_r(PPCREC_IML_OP_FPR_EXPAND_F32_TO_F64, _FPRRegFromID(fprIndex));
@@ -90,23 +124,21 @@ void PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext_t* ppcI
90124
*/
91125
void IMLOptimizer_OptimizeDirectFloatCopies(ppcImlGenContext_t* ppcImlGenContext)
92126
{
93-
cemuLog_logDebugOnce(LogType::Force, "IMLOptimizer_OptimizeDirectFloatCopies(): Currently disabled\n");
94-
return;
95-
// for (IMLSegment* segIt : ppcImlGenContext->segmentList2)
96-
// {
97-
// for (sint32 i = 0; i < segIt->imlList.size(); i++)
98-
// {
99-
// IMLInstruction* imlInstruction = segIt->imlList.data() + i;
100-
// if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1)
101-
// {
102-
// PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData);
103-
// }
104-
// else if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD_INDEXED && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1)
105-
// {
106-
// PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData);
107-
// }
108-
// }
109-
// }
127+
for (IMLSegment* segIt : ppcImlGenContext->segmentList2)
128+
{
129+
for (sint32 i = 0; i < segIt->imlList.size(); i++)
130+
{
131+
IMLInstruction* imlInstruction = segIt->imlList.data() + i;
132+
if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE)
133+
{
134+
PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData);
135+
}
136+
else if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD_INDEXED && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE)
137+
{
138+
PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData);
139+
}
140+
}
141+
}
110142
}
111143

112144
void PPCRecompiler_optimizeDirectIntegerCopiesScanForward(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment, sint32 imlIndexLoad, IMLReg gprReg)

src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.cpp

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -685,45 +685,6 @@ void PPCRecompiler_init()
685685
PPCRecompiler_allocateRange(mmuRange_TRAMPOLINE_AREA.getBase(), mmuRange_TRAMPOLINE_AREA.getSize());
686686
PPCRecompiler_allocateRange(mmuRange_CODECAVE.getBase(), mmuRange_CODECAVE.getSize());
687687

688-
// setup GQR scale tables
689-
690-
for (uint32 i = 0; i < 32; i++)
691-
{
692-
float a = 1.0f / (float)(1u << i);
693-
float b = 0;
694-
if (i == 0)
695-
b = 4294967296.0f;
696-
else
697-
b = (float)(1u << (32u - i));
698-
699-
float ar = (float)(1u << i);
700-
float br = 0;
701-
if (i == 0)
702-
br = 1.0f / 4294967296.0f;
703-
else
704-
br = 1.0f / (float)(1u << (32u - i));
705-
706-
ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[i * 2 + 0] = a;
707-
ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[i * 2 + 1] = 1.0f;
708-
ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[(i + 32) * 2 + 0] = b;
709-
ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[(i + 32) * 2 + 1] = 1.0f;
710-
711-
ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[i * 2 + 0] = a;
712-
ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[i * 2 + 1] = a;
713-
ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[(i + 32) * 2 + 0] = b;
714-
ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[(i + 32) * 2 + 1] = b;
715-
716-
ppcRecompilerInstanceData->_psq_st_scale_ps0_1[i * 2 + 0] = ar;
717-
ppcRecompilerInstanceData->_psq_st_scale_ps0_1[i * 2 + 1] = 1.0f;
718-
ppcRecompilerInstanceData->_psq_st_scale_ps0_1[(i + 32) * 2 + 0] = br;
719-
ppcRecompilerInstanceData->_psq_st_scale_ps0_1[(i + 32) * 2 + 1] = 1.0f;
720-
721-
ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[i * 2 + 0] = ar;
722-
ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[i * 2 + 1] = ar;
723-
ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[(i + 32) * 2 + 0] = br;
724-
ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[(i + 32) * 2 + 1] = br;
725-
}
726-
727688
PPCRecompiler_initPlatform();
728689

729690
cemuLog_log(LogType::Force, "Recompiler initialized");

src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -136,11 +136,6 @@ typedef struct
136136
alignas(16) float _x64XMM_constFloatMin[2];
137137
alignas(16) uint32 _x64XMM_flushDenormalMask1[4];
138138
alignas(16) uint32 _x64XMM_flushDenormalMaskResetSignBits[4];
139-
// PSQ load/store scale tables
140-
double _psq_ld_scale_ps0_ps1[64 * 2];
141-
double _psq_ld_scale_ps0_1[64 * 2];
142-
double _psq_st_scale_ps0_ps1[64 * 2];
143-
double _psq_st_scale_ps0_1[64 * 2];
144139
// MXCSR
145140
uint32 _x64XMM_mxCsr_ftzOn;
146141
uint32 _x64XMM_mxCsr_ftzOff;

0 commit comments

Comments
 (0)