@@ -8905,6 +8905,9 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst)
     case GenISAIntrinsic::GenISA_WaveAll:
         emitWaveAll(inst);
         break;
+    case GenISAIntrinsic::GenISA_WaveInterleave:
+        emitWaveInterleave(inst);
+        break;
     case GenISAIntrinsic::GenISA_WaveClustered:
         emitWaveClustered(inst);
         break;
@@ -13167,8 +13170,45 @@ CVariable* EmitPass::ScanReducePrepareSrc(VISA_Type type, uint64_t identityValue
 }

 // Reduction all reduce helper: dst_lane{k} = src_lane{simd + k} OP src_lane{k}, k = 0..(simd-1)
-CVariable* EmitPass::ReductionReduceHelper(e_opcode op, VISA_Type type, SIMDMode simd, CVariable* src)
+CVariable* EmitPass::ReductionReduceHelper(e_opcode op, VISA_Type type, SIMDMode simd, CVariable* src, CVariable* srcSecondHalf)
 {
+    const bool isInt64Mul = ScanReduceIsInt64Mul(op, type);
+    const bool int64EmulationNeeded = ScanReduceIsInt64EmulationNeeded(op, type);
+
+    if (simd == SIMDMode::SIMD16 && m_currShader->m_numberInstance > 1)
+    {
+        IGC_ASSERT(srcSecondHalf);
+
+        CVariable* temp = m_currShader->GetNewVariable(
+            numLanes(simd),
+            type,
+            EALIGN_GRF,
+            false,
+            CName("reduceDstSecondHalf"));
+
+        if (!int64EmulationNeeded)
+        {
+            m_encoder->SetNoMask();
+            m_encoder->SetSimdSize(simd);
+            m_encoder->GenericAlu(op, temp, src, srcSecondHalf);
+            m_encoder->Push();
+        }
+        else
+        {
+            if (isInt64Mul)
+            {
+                CVariable* tmpMulSrc[2] = { src, srcSecondHalf };
+                Mul64(temp, tmpMulSrc, simd, true /* noMask */);
+            }
+            else
+            {
+                IGC_ASSERT_MESSAGE(0, "Unsupported");
+            }
+        }
+
+        return temp;
+    }
+
     const bool is64bitType = ScanReduceIs64BitType(type);
     const auto alignment = is64bitType ? IGC::EALIGN_QWORD : IGC::EALIGN_DWORD;
     CVariable* temp = m_currShader->GetNewVariable(
@@ -13178,9 +13218,6 @@ CVariable* EmitPass::ReductionReduceHelper(e_opcode op, VISA_Type type, SIMDMode
         false,
         CName("reduceDst_SIMD", std::to_string(numLanes(simd)).c_str()));

-    const bool isInt64Mul = ScanReduceIsInt64Mul(op, type);
-    const bool int64EmulationNeeded = ScanReduceIsInt64EmulationNeeded(op, type);
-
     if (!int64EmulationNeeded)
     {
         m_encoder->SetNoMask();
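
For reference, the lane arithmetic performed by ReductionReduceHelper can be described with a minimal scalar sketch. This is not IGC code: the vector type and the std::function operator are placeholders for CVariable lanes and the e_opcode ALU operation.

#include <cstdint>
#include <functional>
#include <vector>

// One reduction step: fold the upper half of the active lanes onto the lower half,
// dst_lane{k} = src_lane{simd + k} OP src_lane{k}, k = 0..(simd-1).
std::vector<int64_t> reduceStepModel(const std::vector<int64_t>& src, size_t simd,
                                     const std::function<int64_t(int64_t, int64_t)>& op)
{
    std::vector<int64_t> dst(simd);
    for (size_t k = 0; k < simd; ++k)
        dst[k] = op(src[simd + k], src[k]);
    return dst;
}

// With two SIMD16 instances (the new srcSecondHalf path), the first step instead combines
// the corresponding lanes of both instances: dst_lane{k} = srcSecondHalf_lane{k} OP src_lane{k}.
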
@@ -13546,34 +13583,7 @@ void EmitPass::emitReductionAll(
             CVariable* srcH2 = ScanReducePrepareSrc(type, identityValue, negate, true /* secondHalf */,
                 src, nullptr /* dst */);

-            temp = m_currShader->GetNewVariable(
-                numLanes(simd),
-                type,
-                EALIGN_GRF,
-                false,
-                CName("reduceDstSecondHalf"));
-
-            const bool isInt64Mul = ScanReduceIsInt64Mul(op, type);
-            const bool int64EmulationNeeded = ScanReduceIsInt64EmulationNeeded(op, type);
-            if (!int64EmulationNeeded)
-            {
-                m_encoder->SetNoMask();
-                m_encoder->SetSimdSize(simd);
-                m_encoder->GenericAlu(op, temp, srcH1, srcH2);
-                m_encoder->Push();
-            }
-            else
-            {
-                if (isInt64Mul)
-                {
-                    CVariable* tmpMulSrc[2] = { srcH1, srcH2 };
-                    Mul64(temp, tmpMulSrc, simd, true /* noMask */);
-                }
-                else
-                {
-                    IGC_ASSERT_MESSAGE(0, "Unsupported");
-                }
-            }
+            temp = ReductionReduceHelper(op, type, SIMDMode::SIMD16, temp, srcH2);
         }
     }
     if (m_currShader->m_dispatchSize >= SIMDMode::SIMD16)
@@ -13723,6 +13733,54 @@ void EmitPass::emitReductionClustered(const e_opcode op, const uint64_t identity
     }
 }

+void EmitPass::emitReductionInterleave(const e_opcode op, const uint64_t identityValue, const VISA_Type type,
+    const bool negate, const unsigned int step, CVariable* const src, CVariable* const dst)
+{
+    if (step == 1)
+    {
+        // TODO: consider if it is possible to detect and handle this case in frontends
+        // and emit GenISA_WaveAll there, to enable optimizations specific to the ReduceAll intrinsic.
+        return emitReductionAll(op, identityValue, type, negate, src, dst);
+    }
+
+    const uint16_t firstStep = numLanes(m_currShader->m_dispatchSize) / 2;
+
+    IGC_ASSERT_MESSAGE(!dst->IsUniform(), "Unsupported: dst must be non-uniform");
+    IGC_ASSERT_MESSAGE(step % 2 == 0 && step <= firstStep, "Invalid reduction interleave step");
+
+    CVariable* srcH1 = ScanReducePrepareSrc(type, identityValue, negate, false /* secondHalf */,
+        src, nullptr /* dst */);
+    CVariable* temp = srcH1;
+
+    // Implementation is similar to emitReductionAll(), but we stop reduction before reaching SIMD1.
+    for (unsigned int currentStep = firstStep; currentStep >= step; currentStep >>= 1)
+    {
+        if (currentStep == 16 && m_currShader->m_numberInstance > 1)
+        {
+            CVariable* srcH2 = ScanReducePrepareSrc(type, identityValue, negate, true /* secondHalf */,
+                src, nullptr /* dst */);
+
+            temp = ReductionReduceHelper(op, type, SIMDMode::SIMD16, temp, srcH2);
+        }
+        else
+        {
+            temp = ReductionReduceHelper(op, type, lanesToSIMDMode(currentStep), temp);
+        }
+    }
+
+    // Broadcast result
+    m_encoder->SetSimdSize(m_currShader->m_SIMDSize);
+    m_encoder->SetSrcRegion(0, 0, step, 1);
+    m_encoder->Copy(dst, temp);
+    if (m_currShader->m_numberInstance > 1)
+    {
+        m_encoder->SetSecondHalf(true);
+        m_encoder->Copy(dst, temp);
+        m_encoder->SetSecondHalf(false);
+    }
+    m_encoder->Push();
+}
+
 // do prefix op across all activate channels
 void EmitPass::emitPreOrPostFixOp(
     e_opcode op, uint64_t identityValue, VISA_Type type, bool negateSrc,
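
The net effect of emitReductionInterleave() can be summarized with a minimal scalar sketch: for an interleave step S, every lane ends up holding the reduction of all lanes whose index is congruent to its own modulo S, and the final <0;step,1> region broadcast replicates those S partial results across the wave. This is not IGC code; it assumes inactive lanes already hold the identity value (which ScanReducePrepareSrc arranges), and the std::function operator stands in for the e_opcode ALU operation.

#include <cstdint>
#include <functional>
#include <vector>

std::vector<int64_t> interleaveReduceModel(const std::vector<int64_t>& src, size_t S, int64_t identity,
                                           const std::function<int64_t(int64_t, int64_t)>& op)
{
    const size_t N = src.size();                     // dispatch width, e.g. 16 or 32 lanes
    std::vector<int64_t> groups(S, identity);
    for (size_t l = 0; l < N; ++l)
        groups[l % S] = op(groups[l % S], src[l]);   // reduce each residue class mod S
    std::vector<int64_t> dst(N);
    for (size_t k = 0; k < N; ++k)
        dst[k] = groups[k % S];                      // broadcast via <0;step,1> replication
    return dst;
}

For example, with N = 16 and S = 4, dst[0] = dst[4] = dst[8] = dst[12] = src[0] OP src[4] OP src[8] OP src[12].
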
@@ -21141,6 +21199,29 @@ void EmitPass::emitWaveClustered(llvm::GenIntrinsicInst* inst)
     }
 }

+void EmitPass::emitWaveInterleave(llvm::GenIntrinsicInst* inst)
+{
+    bool disableHelperLanes = int_cast<int>(cast<ConstantInt>(inst->getArgOperand(3))->getSExtValue()) == 2;
+    if (disableHelperLanes)
+    {
+        ForceDMask();
+    }
+    CVariable* src = GetSymbol(inst->getOperand(0));
+    const WaveOps op = static_cast<WaveOps>(cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue());
+    const unsigned int step = int_cast<uint32_t>(cast<llvm::ConstantInt>(inst->getOperand(2))->getZExtValue());
+    VISA_Type type;
+    e_opcode opCode;
+    uint64_t identity = 0;
+    GetReductionOp(op, inst->getOperand(0)->getType(), identity, opCode, type);
+    CVariable* dst = m_destination;
+    m_encoder->SetSubSpanDestination(false);
+    emitReductionInterleave(opCode, identity, type, false, step, src, dst);
+    if (disableHelperLanes)
+    {
+        ResetVMask();
+    }
+}
+
 void EmitPass::emitDP4A(GenIntrinsicInst* GII, const SSource* Sources, const DstModifier& modifier, bool isAccSigned) {
     GenISAIntrinsic::ID GIID = GII->getIntrinsicID();
     CVariable* dst = m_destination;