@@ -5270,6 +5270,57 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
   return LoopBB;
 }
 
+static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
+                                                      MachineBasicBlock *BB) {
+  // For targets older than GFX12, we emit a sequence of 32-bit operations.
+  // For GFX12, we emit s_add_u64 and s_sub_u64.
+  MachineFunction *MF = BB->getParent();
+  const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
+  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineOperand &Dest = MI.getOperand(0);
+  MachineOperand &Src0 = MI.getOperand(1);
+  MachineOperand &Src1 = MI.getOperand(2);
+  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
+  if (ST.hasScalarAddSub64()) {
+    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
+    // clang-format off
+    BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
+        .add(Src0)
+        .add(Src1);
+    // clang-format on
+  } else {
+    const SIRegisterInfo *TRI = ST.getRegisterInfo();
+    const TargetRegisterClass *BoolRC = TRI->getBoolRC();
+
+    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
+
+    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
+
+    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+    BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
+    BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
+    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+        .addReg(DestSub0)
+        .addImm(AMDGPU::sub0)
+        .addReg(DestSub1)
+        .addImm(AMDGPU::sub1);
+  }
+  MI.eraseFromParent();
+  return BB;
+}
+
 static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
   switch (Opc) {
   case AMDGPU::S_MIN_U32:
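For reference, here is a minimal host-side model of what the pre-GFX12 carry-chain expansion above computes. This is a sketch for illustration only; `addU64Via32` is a hypothetical name, not anything in the LLVM tree:

```cpp
#include <cstdint>

// Models the S_ADD_U32 / S_ADDC_U32 / REG_SEQUENCE sequence on a 64-bit value.
uint64_t addU64Via32(uint64_t A, uint64_t B) {
  uint32_t Lo = uint32_t(A) + uint32_t(B); // S_ADD_U32: low half, carry-out in SCC
  uint32_t Carry = Lo < uint32_t(A);       // carry-out of the low addition
  uint32_t Hi = uint32_t(A >> 32) + uint32_t(B >> 32) + Carry; // S_ADDC_U32
  return (uint64_t(Hi) << 32) | Lo;        // REG_SEQUENCE of sub0/sub1
}
```

The subtract path is symmetric, with S_SUB_U32 / S_SUBB_U32 borrowing through SCC instead of carrying.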
@@ -5303,6 +5354,9 @@ static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
     return std::numeric_limits<uint64_t>::min();
   case AMDGPU::V_CMP_GT_I64_e64: // max.i64
     return std::numeric_limits<int64_t>::min();
+  case AMDGPU::S_ADD_U64_PSEUDO:
+  case AMDGPU::S_SUB_U64_PSEUDO:
+    return std::numeric_limits<uint64_t>::min();
   default:
     llvm_unreachable(
         "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
@@ -5355,51 +5409,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     }
     case AMDGPU::S_XOR_B32:
     case AMDGPU::S_ADD_I32:
-    case AMDGPU::S_SUB_I32: {
+    case AMDGPU::S_ADD_U64_PSEUDO:
+    case AMDGPU::S_SUB_I32:
+    case AMDGPU::S_SUB_U64_PSEUDO: {
       const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
       const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
       Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
-      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+      Register NumActiveLanes =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
 
       bool IsWave32 = ST.isWave32();
       unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
       MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-      unsigned CountReg =
+      unsigned BitCountOpc =
           IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
 
-      auto Exec =
-          BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+      BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
 
-      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
-                                .addReg(Exec->getOperand(0).getReg());
+      auto NewAccumulator =
+          BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
+              .addReg(ExecMask);
 
       switch (Opc) {
       case AMDGPU::S_XOR_B32: {
         // Performing an XOR operation on a uniform value
         // depends on the parity of the number of active lanes.
        // For even parity, the result will be 0, for odd
        // parity the result will be the same as the input value.
-        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
+        Register ParityRegister =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
 
-        auto ParityReg =
-            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
-                .addReg(NewAccumulator->getOperand(0).getReg())
-                .addImm(1);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+            .addReg(NewAccumulator->getOperand(0).getReg())
+            .addImm(1)
+            .setOperandDead(3); // Dead scc
         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
             .addReg(SrcReg)
-            .addReg(ParityReg->getOperand(0).getReg());
+            .addReg(ParityRegister);
         break;
       }
       case AMDGPU::S_SUB_I32: {
         Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
 
         // Take the negation of the source operand.
-        auto InvertedValReg =
-            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
-                .addImm(-1)
-                .addReg(SrcReg);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
+            .addImm(0)
+            .addReg(SrcReg);
         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(InvertedValReg->getOperand(0).getReg())
+            .addReg(NegatedVal)
             .addReg(NewAccumulator->getOperand(0).getReg());
         break;
       }
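This branch handles a uniform (scalar) source, so the whole wave reduction collapses to arithmetic on the number of active lanes. A host-side model of that fast path, as a sketch with stand-in opcode names rather than LLVM enums:

```cpp
#include <cstdint>

enum class ReduceOp { Xor, Add, Sub };

// With a uniform input, the reduction depends only on the active-lane count.
uint32_t reduceUniform(ReduceOp Op, uint32_t Src, uint32_t NumActiveLanes) {
  switch (Op) {
  case ReduceOp::Xor: // S_AND_B32 parity bit, then S_MUL_I32
    return Src * (NumActiveLanes & 1);
  case ReduceOp::Add: // S_MUL_I32
    return Src * NumActiveLanes;
  case ReduceOp::Sub: // S_SUB_I32 (0 - Src), then S_MUL_I32
    return (0u - Src) * NumActiveLanes;
  }
  return 0; // unreachable
}
```

Note the patch also swaps the old `S_MUL_I32 by -1` negation for `S_SUB_I32 0, Src`, which computes the same two's-complement negation with a cheaper scalar op.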
@@ -5409,6 +5466,75 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             .addReg(NewAccumulator->getOperand(0).getReg());
         break;
       }
+      case AMDGPU::S_ADD_U64_PSEUDO:
+      case AMDGPU::S_SUB_U64_PSEUDO: {
+        Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register Op1H_Op0L_Reg =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register Op1L_Op0H_Reg =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register NegatedValLo =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register NegatedValHi =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+        const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
+        const TargetRegisterClass *Src1SubRC =
+            TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
+
+        MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
+        MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
+
+        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
+              .addImm(0)
+              .addReg(NewAccumulator->getOperand(0).getReg())
+              .setOperandDead(3); // Dead scc
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
+              .addReg(NegatedValLo)
+              .addImm(31)
+              .setOperandDead(3); // Dead scc
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
+              .add(Op1L)
+              .addReg(NegatedValHi);
+        }
+        Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
+                                 ? NegatedValLo
+                                 : NewAccumulator->getOperand(0).getReg();
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+            .add(Op1L)
+            .addReg(LowOpcode);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
+            .add(Op1L)
+            .addReg(LowOpcode);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
+            .add(Op1H)
+            .addReg(LowOpcode);
+
+        Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
+            .addReg(CarryReg)
+            .addReg(Op1H_Op0L_Reg)
+            .setOperandDead(3); // Dead scc
+
+        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
+              .addReg(HiVal)
+              .addReg(Op1L_Op0H_Reg)
+              .setOperandDead(3); // Dead scc
+        }
+        BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+            .addReg(DestSub0)
+            .addImm(AMDGPU::sub0)
+            .addReg(DestSub1)
+            .addImm(AMDGPU::sub1);
+        break;
+      }
     }
     RetBB = &BB;
   }
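The 64-bit uniform path above computes `Dst = Src * Count` (and, for the sub reduction, `Src * -Count`) out of 32-bit scalar multiplies. A host-side model of the partial-product construction; this is a sketch, and `mulU64xU32` is a hypothetical name:

```cpp
#include <cstdint>

// Models the partial products built above: S_MUL_I32 for the low product,
// S_MUL_HI_U32 for the carry into the high half, S_ADD_U32 to combine.
uint64_t mulU64xU32(uint64_t Src, uint32_t Count) { // == Src * Count (mod 2^64)
  uint32_t Op1L = uint32_t(Src);       // sub0
  uint32_t Op1H = uint32_t(Src >> 32); // sub1
  uint32_t DestSub0 = Op1L * Count;                          // S_MUL_I32
  uint32_t Carry = uint32_t((uint64_t(Op1L) * Count) >> 32); // S_MUL_HI_U32
  uint32_t DestSub1 = Op1H * Count + Carry;                  // S_MUL_I32 + S_ADD_U32
  return (uint64_t(DestSub1) << 32) | DestSub0;              // REG_SEQUENCE
}
```

For S_SUB_U64_PSEUDO the lane count is first negated and sign-extended to 64 bits (`NegatedValLo`/`NegatedValHi`, via S_SUB_I32 and S_ASHR_I32), so a third partial product `Op1L * NegatedValHi` is folded into the high half; the low half is unchanged.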
@@ -5555,6 +5681,14 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             .addReg(Accumulator->getOperand(0).getReg());
         break;
       }
+      case AMDGPU::S_ADD_U64_PSEUDO:
+      case AMDGPU::S_SUB_U64_PSEUDO: {
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                             .addReg(Accumulator->getOperand(0).getReg())
+                             .addReg(LaneValue->getOperand(0).getReg());
+        ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
+        break;
+      }
       }
     }
     // Manipulate the iterator to get the next active lane
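For divergent inputs, the reduction instead runs a lane-serial compute loop: each iteration reads one active lane's value and folds it into the accumulator, with the 64-bit add/sub pseudo expanded in place by Expand64BitScalarArithmetic. A scalar model of the loop's conceptual shape (a sketch, not the emitted MIR; the GCC/Clang builtin stands in for the scalar find-first-set instruction):

```cpp
#include <cstdint>

// Peel off the lowest set EXEC bit, accumulate that lane's value, repeat.
uint64_t reduceDivergent(const uint64_t LaneValues[], uint64_t ExecMask,
                         bool IsAdd) {
  uint64_t Acc = 0; // identity for both add and sub
  while (ExecMask) {
    unsigned Lane = __builtin_ctzll(ExecMask); // find first active lane
    Acc = IsAdd ? Acc + LaneValues[Lane]       // S_ADD_U64_PSEUDO expansion
                : Acc - LaneValues[Lane];      // S_SUB_U64_PSEUDO expansion
    ExecMask &= ExecMask - 1;                  // clear the processed lane
  }
  return Acc;
}
```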
@@ -5565,8 +5699,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             .addReg(ActiveBitsReg);
 
     // Add phi nodes
-    Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
-        .addMBB(ComputeLoop);
+    Accumulator.addReg(DstReg).addMBB(ComputeLoop);
     ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
 
     // Creating branching
@@ -5610,8 +5743,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
   case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
@@ -5644,55 +5781,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   }
   case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_U64_PSEUDO: {
-    // For targets older than GFX12, we emit a sequence of 32-bit operations.
-    // For GFX12, we emit s_add_u64 and s_sub_u64.
-    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-    const DebugLoc &DL = MI.getDebugLoc();
-    MachineOperand &Dest = MI.getOperand(0);
-    MachineOperand &Src0 = MI.getOperand(1);
-    MachineOperand &Src1 = MI.getOperand(2);
-    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
-    if (Subtarget->hasScalarAddSub64()) {
-      unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
-      // clang-format off
-      BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
-          .add(Src0)
-          .add(Src1);
-      // clang-format on
-    } else {
-      const SIRegisterInfo *TRI = ST.getRegisterInfo();
-      const TargetRegisterClass *BoolRC = TRI->getBoolRC();
-
-      Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-
-      MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
-      MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-
-      MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
-      MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-
-      unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
-      unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
-      BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
-          .add(Src0Sub0)
-          .add(Src1Sub0);
-      BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
-          .add(Src0Sub1)
-          .add(Src1Sub1);
-      BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
-          .addReg(DestSub0)
-          .addImm(AMDGPU::sub0)
-          .addReg(DestSub1)
-          .addImm(AMDGPU::sub1);
-    }
-    MI.eraseFromParent();
-    return BB;
+    return Expand64BitScalarArithmetic(MI, BB);
   }
   case AMDGPU::V_ADD_U64_PSEUDO:
   case AMDGPU::V_SUB_U64_PSEUDO: {