@@ -5356,7 +5356,11 @@ static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
     return std::numeric_limits<int64_t>::min();
   case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_U64_PSEUDO:
+  case AMDGPU::S_OR_B64:
+  case AMDGPU::S_XOR_B64:
     return std::numeric_limits<uint64_t>::min();
+  case AMDGPU::S_AND_B64:
+    return std::numeric_limits<uint64_t>::max();
   default:
     llvm_unreachable(
         "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
@@ -5398,16 +5402,19 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
       RetBB = &BB;
       break;
     }
-    case AMDGPU::V_CMP_LT_U64_e64: // umin
-    case AMDGPU::V_CMP_LT_I64_e64: // min
-    case AMDGPU::V_CMP_GT_U64_e64: // umax
-    case AMDGPU::V_CMP_GT_I64_e64: { // max
+    case AMDGPU::V_CMP_LT_U64_e64: // umin
+    case AMDGPU::V_CMP_LT_I64_e64: // min
+    case AMDGPU::V_CMP_GT_U64_e64: // umax
+    case AMDGPU::V_CMP_GT_I64_e64: // max
+    case AMDGPU::S_AND_B64:
+    case AMDGPU::S_OR_B64: {
       // Idempotent operations.
       BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
       RetBB = &BB;
       break;
     }
     case AMDGPU::S_XOR_B32:
+    case AMDGPU::S_XOR_B64:
     case AMDGPU::S_ADD_I32:
     case AMDGPU::S_ADD_U64_PSEUDO:
     case AMDGPU::S_SUB_I32:
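For a uniform (SGPR) input, AND and OR behave like the existing min/max cases: folding the same value into itself any number of times returns that value, so the whole reduction lowers to a single S_MOV_B64 copy. XOR is not idempotent and falls through to the parity-based path handled below. A small standalone sketch of this behavior (the helper name is made up for illustration):

```cpp
// Standalone sketch: reducing a uniform value V over N active lanes.
// AND and OR are idempotent, so the result is V for any N >= 1 (a plain copy);
// XOR depends only on the parity of N.
#include <cassert>
#include <cstdint>

static uint64_t reduceUniform(uint64_t V, unsigned NumActiveLanes, char Op) {
  uint64_t Acc = (Op == '&') ? ~0ull : 0ull; // identity element
  for (unsigned I = 0; I != NumActiveLanes; ++I)
    Acc = (Op == '&') ? (Acc & V) : (Op == '|') ? (Acc | V) : (Acc ^ V);
  return Acc;
}

int main() {
  const uint64_t V = 0xdeadbeefcafef00dull;
  for (unsigned N = 1; N <= 64; ++N) {
    assert(reduceUniform(V, N, '&') == V);                  // copy
    assert(reduceUniform(V, N, '|') == V);                  // copy
    assert(reduceUniform(V, N, '^') == ((N & 1) ? V : 0));  // parity rule
  }
  return 0;
}
```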
@@ -5431,7 +5438,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
               .addReg(ExecMask);
 
       switch (Opc) {
-      case AMDGPU::S_XOR_B32: {
+      case AMDGPU::S_XOR_B32:
+      case AMDGPU::S_XOR_B64: {
         // Performing an XOR operation on a uniform value
         // depends on the parity of the number of active lanes.
         // For even parity, the result will be 0, for odd
@@ -5443,9 +5451,39 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             .addReg(NewAccumulator->getOperand(0).getReg())
             .addImm(1)
             .setOperandDead(3); // Dead scc
-        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(SrcReg)
-            .addReg(ParityRegister);
+        if (Opc == AMDGPU::S_XOR_B32) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+              .addReg(SrcReg)
+              .addReg(ParityRegister);
+        } else {
+          Register DestSub0 =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          Register DestSub1 =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+          const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+          const TargetRegisterClass *SrcSubRC =
+              TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+
+          MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+          MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+              .add(Op1L)
+              .addReg(ParityRegister);
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
+              .add(Op1H)
+              .addReg(ParityRegister);
+
+          BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+              .addReg(DestSub0)
+              .addImm(AMDGPU::sub0)
+              .addReg(DestSub1)
+              .addImm(AMDGPU::sub1);
+        }
         break;
       }
       case AMDGPU::S_SUB_I32: {
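For S_XOR_B64 on a uniform input, the result is the source value multiplied by the parity bit (0 or 1) of the active-lane count. Because the multiplier is at most 1, no carry can ever cross the 32-bit boundary, which is why two S_MUL_I32 on the sub0/sub1 halves plus a REG_SEQUENCE are enough instead of a full 64-bit multiply. A standalone check of that identity (not tied to any LLVM API):

```cpp
// Standalone sketch: multiplying a 64-bit value by a parity bit (0 or 1) can
// be done on each 32-bit half independently, since no carry is produced.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Src = 0x0123456789abcdefull;
  const uint32_t Lo = static_cast<uint32_t>(Src);
  const uint32_t Hi = static_cast<uint32_t>(Src >> 32);
  for (uint32_t Parity = 0; Parity <= 1; ++Parity) {
    const uint64_t Split =
        (static_cast<uint64_t>(Hi * Parity) << 32) | (Lo * Parity);
    assert(Split == Src * Parity);
  }
  return 0;
}
```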
@@ -5643,6 +5681,15 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                            .addReg(LaneValueHiReg)
                            .addImm(AMDGPU::sub1);
       switch (Opc) {
+      case AMDGPU::S_OR_B64:
+      case AMDGPU::S_AND_B64:
+      case AMDGPU::S_XOR_B64: {
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                             .addReg(Accumulator->getOperand(0).getReg())
+                             .addReg(LaneValue->getOperand(0).getReg())
+                             .setOperandDead(3); // Dead scc
+        break;
+      }
       case AMDGPU::V_CMP_GT_I64_e64:
       case AMDGPU::V_CMP_GT_U64_e64:
       case AMDGPU::V_CMP_LT_I64_e64:
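In the divergent (VGPR) case, the value is reduced lane by lane inside ComputeLoop: each iteration reads one active lane's value and folds it into the scalar accumulator, and for the new 64-bit bitwise ops that fold is a single S_AND_B64/S_OR_B64/S_XOR_B64 with scc marked dead. A rough standalone model of that loop (a ctz-based scan stands in for the loop's actual lane-selection logic):

```cpp
// Rough standalone model of the divergent path: walk the set bits of an
// exec-style mask, read the corresponding lane's value, and fold it into an
// accumulator that starts at the operation's identity value.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t LaneValues[4] = {0xf0, 0x0f, 0x33, 0xcc};
  uint64_t ActiveLanes = 0b1011; // lanes 0, 1 and 3 are active
  uint64_t Accumulator = 0;      // identity for OR (and XOR)
  while (ActiveLanes) {
    // Lowest set bit stands in for the loop's "next active lane" selection.
    unsigned Lane = static_cast<unsigned>(__builtin_ctzll(ActiveLanes));
    Accumulator |= LaneValues[Lane]; // one S_OR_B64 per iteration
    ActiveLanes &= ActiveLanes - 1;  // mark the lane as processed
  }
  assert(Accumulator == (0xf0ull | 0x0full | 0xccull));
  return 0;
}
```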
@@ -5751,10 +5798,16 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
   case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
   case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
+  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
     const DebugLoc &DL = MI.getDebugLoc();