@@ -5404,19 +5404,58 @@ void EmitPass::emitSimdShuffle(llvm::Instruction* inst)
5404
5404
else
5405
5405
{
5406
5406
// Emits the instructions below when simdChannel isn't an immediate.
5407
- //shl (16) r8.0<1>:ud r6.0<0;1,0>:d 0x2:uw {Align1, H1, NoMask}
5408
- //add (16) a0.0<1>:uw r8.0<16;8,2>:uw 0x80:uw {Align1, H1, NoMask}
5409
- //mov (16) r10.0<1>:d r[a0.0, 0]<1,0>:d {Align1, H1}
5410
- // For SIMD32:
5411
- // shl(M1, 32) V465(0, 0)<1> V464(0, 0)<16; 8, 2> 0x2:uw /// $592
5412
- // mov(M1, 32) V466(0, 0)<1> V70(0, 0)<1; 1, 0> /// $593
5413
- // addr_add(M1, 16) A0(0)<1> &V466 + 0 V465(0, 0)<1; 1, 0> /// $594
5414
- // mov(M1, 16) V463(0, 0)<1> r[A0(0), 0]<1, 0> : f /// $595
5415
- // addr_add(M5, 16) A0(0)<1> &V466 + 0 V465(0, 16)<1; 1, 0> /// $596
5416
- // mov(M5, 16) V463(1, 0)<1> r[A0(0), 0]<1, 0> : f /// $597
5407
+ //
5408
+ // 1) GenISA_WaveShuffleIndex:
5409
+ //
5410
+ // a) SIMD16
5411
+ // shl (M1, 16) ShuffleTmp(0,0)<1> {{.+}}(0,0)<16;8,2> 0x2:uw
5412
+ // addr_add (M1, 16) A0(0)<1> &{{V[0-9]+}} ShuffleTmp(0,0)<1;1,0>
5413
+ // mov (M1, 16) simdShuffle(0,0)<1> r[A0(0),0]<1,0>:d
5414
+ //
5415
+ // b) SIMD32 (two SIMD16 ADDR_ADD instructions must be generated,
5416
+ // because address register has only 16 elements):
5417
+ // shl(M1, 32) V465(0,0)<1> V464(0,0)<16;8,2> 0x2:uw
5418
+ // mov(M1, 32) V466(0,0)<1> V70(0,0)<1;1,0>
5419
+ // addr_add(M1, 16) A0(0)<1> &V466 + 0 V465(0, 0)<1;1,0>
5420
+ // mov(M1, 16) V463(0,0)<1> r[A0(0),0]<1, 0>:f
5421
+ // addr_add(M5, 16) A0(0)<1> &V466 + 0 V465(0,16)<1;1,0>
5422
+ // mov(M5, 16) V463(1,0)<1> r[A0(0),0]<1,0>:f
5423
+ //
5424
+ // 2) GenISA_WaveBroadcast:
5425
+ //
5426
+ // shl (M1_NM, 1) ShuffleTmp(0,0)<1> {{.+}}(0,0)<0;1,0> 0x2:uw
5427
+ // addr_add(M1_NM, 1) A0(0)<1> &{{V[0-9]+}} ShuffleTmp(0,0)<0;1,0>
5428
+ // a) SIMD16:
5429
+ // mov(M1, 16) simdBroadcast(0,0) <1> r[A0(0),0] <0;1,0>:d
5430
+ // b) SIMD32 (no need for two SIMD16 instructions, because offset in A0 is uniform):
5431
+ // mov(M1, 32) simdBroadcast(0,0) <1> r[A0(0),0] <0;1,0>:d
5417
5432
5418
5433
bool channelUniform = simdChannel->IsUniform();
5419
5434
5435
+ auto* GII = dyn_cast<GenIntrinsicInst>(inst);
5436
+ if (GII && GII->getIntrinsicID() == GenISAIntrinsic::GenISA_WaveBroadcast &&
5437
+ !channelUniform)
5438
+ {
5439
+ // OpGroupBroadcast guarantees that all channels must be enabled and the
5440
+ // simdChannel value must be the same for all of them. Therefore, even though
5441
+ // it was not possible to deduce, during compilation time, that simdChannel is
5442
+ // uniform, let's force it to be uniform by taking it always from the first channel.
5443
+ CVariable* valueFromFirstChannel = m_currShader->GetNewVariable(
5444
+ numLanes(SIMDMode::SIMD1),
5445
+ simdChannel->GetType(),
5446
+ simdChannel->GetAlign(), true, CName::NONE);
5447
+
5448
+ m_encoder->SetSimdSize(SIMDMode::SIMD1);
5449
+ m_encoder->SetNoMask();
5450
+ m_encoder->SetSrcRegion(0, 0, 1, 0);
5451
+
5452
+ m_encoder->Copy(valueFromFirstChannel, simdChannel);
5453
+ m_encoder->Push();
5454
+
5455
+ simdChannel = valueFromFirstChannel;
5456
+ channelUniform = true;
5457
+ }
5458
+
5420
5459
IGC_ASSERT_MESSAGE(m_encoder->GetCISADataTypeSize(simdChannel->GetType()) == 4,
5421
5460
"simdChannel size of simdShuffle should be 4 bytes!");
5422
5461
@@ -5450,7 +5489,6 @@ void EmitPass::emitSimdShuffle(llvm::Instruction* inst)
5450
5489
CVariable* src = data;
5451
5490
if (m_currShader->m_numberInstance == 1 && m_currShader->m_SIMDSize == SIMDMode::SIMD32)
5452
5491
{
5453
-
5454
5492
uint16_t addrSize = channelUniform ? 1 : numLanes(SIMDMode::SIMD16);
5455
5493
5456
5494
// VectorUniform for shuffle is true as all simd lanes will
@@ -5462,34 +5500,43 @@ void EmitPass::emitSimdShuffle(llvm::Instruction* inst)
5462
5500
true,
5463
5501
m_destination->getName());
5464
5502
5465
- m_encoder->SetSimdSize(SIMDMode::SIMD16);
5466
-
5467
- m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
5468
- m_encoder->Push();
5469
-
5470
- m_encoder->SetSimdSize(SIMDMode::SIMD16);
5471
-
5472
- m_encoder->Copy(m_destination, pDstArrElm);
5473
- m_encoder->Push();
5474
-
5475
- // If destination is uniform, don't execute second half.
5476
- if (!channelUniform && !m_destination->IsUniform())
5503
+ if (GII && GII->getIntrinsicID() == GenISAIntrinsic::GenISA_WaveBroadcast)
5504
+ {
5505
+ m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
5506
+ m_encoder->Push();
5507
+ m_encoder->Copy(m_destination, pDstArrElm);
5508
+ m_encoder->Push();
5509
+ }
5510
+ else if(GII && GII->getIntrinsicID() == GenISAIntrinsic::GenISA_WaveShuffleIndex)
5477
5511
{
5478
-
5479
5512
m_encoder->SetSimdSize(SIMDMode::SIMD16);
5480
- m_encoder->SetMask(EMASK_H2);
5481
- m_encoder->SetSrcSubReg(0, 16);
5482
- m_encoder->SetSrcSubReg(1, 16);
5513
+
5483
5514
m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
5484
5515
m_encoder->Push();
5485
5516
5486
5517
m_encoder->SetSimdSize(SIMDMode::SIMD16);
5487
5518
5488
- m_encoder->SetMask(EMASK_H2);
5489
- m_encoder->SetDstSubReg(16);
5490
5519
m_encoder->Copy(m_destination, pDstArrElm);
5491
5520
m_encoder->Push();
5492
- m_encoder->SetSecondHalf(false);
5521
+
5522
+ if (!channelUniform)
5523
+ {
5524
+
5525
+ m_encoder->SetSimdSize(SIMDMode::SIMD16);
5526
+ m_encoder->SetMask(EMASK_H2);
5527
+ m_encoder->SetSrcSubReg(0, 16);
5528
+ m_encoder->SetSrcSubReg(1, 16);
5529
+ m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
5530
+ m_encoder->Push();
5531
+
5532
+ m_encoder->SetSimdSize(SIMDMode::SIMD16);
5533
+
5534
+ m_encoder->SetMask(EMASK_H2);
5535
+ m_encoder->SetDstSubReg(16);
5536
+ m_encoder->Copy(m_destination, pDstArrElm);
5537
+ m_encoder->Push();
5538
+ m_encoder->SetSecondHalf(false);
5539
+ }
5493
5540
}
5494
5541
if (disableHelperLanes)
5495
5542
{
0 commit comments