intel
diff --git a/‎IGC/BiFModule/Implementation/IGCBiF_Intrinsics.cl‎
Lines changed: 20 additions & 0 deletions b/‎IGC/BiFModule/Implementation/IGCBiF_Intrinsics.cl‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎IGC/Compiler/CISACodeGen/CheckInstrTypes.cpp‎
Lines changed: 1 addition & 0 deletions b/‎IGC/Compiler/CISACodeGen/CheckInstrTypes.cpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎IGC/Compiler/CISACodeGen/CodeSinking.cpp‎
Lines changed: 1 addition & 0 deletions b/‎IGC/Compiler/CISACodeGen/CodeSinking.cpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎IGC/Compiler/CISACodeGen/EmitVISAPass.cpp‎
Lines changed: 146 additions & 14 deletions b/‎IGC/Compiler/CISACodeGen/EmitVISAPass.cpp‎
Lines changed: 146 additions & 14 deletions
diff --git a/‎IGC/Compiler/CISACodeGen/EmitVISAPass.hpp‎
Lines changed: 5 additions & 2 deletions b/‎IGC/Compiler/CISACodeGen/EmitVISAPass.hpp‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎IGC/Compiler/CISACodeGen/HalfPromotion.cpp‎
Lines changed: 1 addition & 0 deletions b/‎IGC/Compiler/CISACodeGen/HalfPromotion.cpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎IGC/Compiler/CISACodeGen/PatternMatchPass.cpp‎
Lines changed: 2 additions & 0 deletions b/‎IGC/Compiler/CISACodeGen/PatternMatchPass.cpp‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎IGC/Compiler/CISACodeGen/PromoteInt8Type.cpp‎
Lines changed: 4 additions & 1 deletion b/‎IGC/Compiler/CISACodeGen/PromoteInt8Type.cpp‎
Lines changed: 4 additions & 1 deletion
@@ -903,6 +903,26 @@ DECL_SUB_GROUP_ALL_GROUPS(type, type_abbr, LogicalXorKHR)
 
 DECL_LOGICAL_OPERATIONS(bool, i1)
 
+// __builtin_IB_sub_group_clustered_scan_IAdd/FAdd
+// Each use of the macro declares: type __builtin_IB_sub_group_clustered_scan_<type_sign>Add_<type_abbr>(type x, uint cluster_size), marked __attribute__((const)).
+// At the moment only Add operation is supported for clustered scan.
+// If functionality is extended to match (non-clustered) scan, the macro
+// should be moved to DECL_SUB_GROUP_ALL_GROUPS.
+#define DECL_SUB_GROUP_CLUSTERED_ADD(type, type_abbr, type_sign) \
+type __builtin_IB_sub_group_clustered_scan_##type_sign##Add_##type_abbr(type x, uint cluster_size) __attribute__((const));
+// Integer element types are instantiated with type_sign = I (..._IAdd_<abbr>), floating-point ones with type_sign = F (..._FAdd_<abbr>).
+DECL_SUB_GROUP_CLUSTERED_ADD(char,   i8,  I)
+DECL_SUB_GROUP_CLUSTERED_ADD(short,  i16, I)
+DECL_SUB_GROUP_CLUSTERED_ADD(int,    i32, I)
+DECL_SUB_GROUP_CLUSTERED_ADD(long,   i64, I)
+DECL_SUB_GROUP_CLUSTERED_ADD(float,  f32, F)
+#if defined(cl_khr_fp64) // the double variant is declared only when cl_khr_fp64 is defined
+DECL_SUB_GROUP_CLUSTERED_ADD(double, f64, F)
+#endif // defined(cl_khr_fp64)
+#if defined(cl_khr_fp16)
+DECL_SUB_GROUP_CLUSTERED_ADD(half,   f16, F)
+#endif // defined(cl_khr_fp16)
+
 // The following mul/fma with rtz is used internally for int div/rem emulation
 // x * y, using round-to-zero
 double __builtin_IB_mul_rtz_f64(double x, double y) __attribute__((const));
 
@@ -344,6 +344,7 @@ void CheckInstrTypes::visitCallInst(CallInst& C)
         case GenISAIntrinsic::GenISA_WaveInverseBallot:
         case GenISAIntrinsic::GenISA_WaveClusteredBallot:
         case GenISAIntrinsic::GenISA_WavePrefix:
+        case GenISAIntrinsic::GenISA_WaveClusteredPrefix:
         case GenISAIntrinsic::GenISA_WaveClustered:
         case GenISAIntrinsic::GenISA_WaveInterleave:
         case GenISAIntrinsic::GenISA_WaveClusteredInterleave:
 
@@ -2497,6 +2497,7 @@ namespace IGC {
                         case GenISAIntrinsic::GenISA_WaveClustered:
                         case GenISAIntrinsic::GenISA_WaveInterleave:
                         case GenISAIntrinsic::GenISA_WavePrefix:
+                        case GenISAIntrinsic::GenISA_WaveClusteredPrefix:
                             PrintDump(VerbosityLevel::High, "Not a real store instruction, may not alias\n");
                             continue;
 
 
@@ -9194,6 +9194,9 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst)
     case GenISAIntrinsic::GenISA_QuadPrefix:
         emitQuadPrefix(cast<QuadPrefixIntrinsic>(inst));
         break;
+    case GenISAIntrinsic::GenISA_WaveClusteredPrefix:
+        emitWaveClusteredPrefix(inst);
+        break;
     case GenISAIntrinsic::GenISA_WaveAll:
         emitWaveAll(inst);
         break;
@@ -14431,8 +14434,18 @@ void EmitPass::emitReductionClusteredInterleave(const e_opcode op, const uint64_
 void EmitPass::emitPreOrPostFixOp(
     e_opcode op, uint64_t identityValue, VISA_Type type, bool negateSrc,
     CVariable* pSrc, CVariable* pSrcsArr[2], CVariable* Flag,
-    bool isPrefix, bool isQuad)
-{
+    bool isPrefix, bool isQuad, int clusterSize)
+{
+    // TODO: Arguments isQuad and clusterSize serve a similar purpose: both split the subgroup into
+    // smaller sets of lanes that are processed separately. isQuad could be considered clusterSize == 4,
+    // but there is a significant difference in implementation: when shifting the input by one lane
+    // to the right for exclusive scan (isPrefix == true), isQuad inserts the identity value only
+    // into the first lane of the subgroup, whereas clusterSize == 8/16 inserts the identity value
+    // into the first lane of each cluster.
+    //
+    // isQuad/clusterSize could be replaced with one argument, but the code must be refactored
+    // so as not to break the QuadPrefix intrinsic.
+
     const bool isInt64Mul = ScanReduceIsInt64Mul(op, type);
     const bool int64EmulationNeeded = ScanReduceIsInt64EmulationNeeded(op, type);
 
@@ -14442,12 +14455,12 @@ void EmitPass::emitPreOrPostFixOp(
         emitPreOrPostFixOpScalar(
             op, identityValue, type, negateSrc,
             pSrc, pSrcsArr, Flag,
-            isPrefix);
+            isPrefix, clusterSize);
         return;
     }
 
-    bool isSimd32 = m_currShader->m_numberInstance == 2;
-    int counter = isSimd32 ? 2 : 1;
+    bool isSimd32AsTwoInstances = m_currShader->m_numberInstance == 2;
+    int counter = isSimd32AsTwoInstances ? 2 : 1;
 
     CVariable* maskedSrc[2] = { 0 };
     for (int i = 0; i < counter; ++i)
@@ -14466,7 +14479,9 @@ void EmitPass::emitPreOrPostFixOp(
             // Copy identity
             m_encoder->SetSimdSize(SIMDMode::SIMD1);
             m_encoder->SetNoMask();
-            if (i == 0)
+            // Before shift, insert identity value to the first lane
+            // in subgroup (or cluster).
+            if (i == 0 || clusterSize > 0)
             {
                 CVariable* pIdentityValue = m_currShader->ImmToVariable(identityValue, type);
                 m_encoder->Copy(pSrcCopy, pIdentityValue);
@@ -14496,7 +14511,25 @@ void EmitPass::emitPreOrPostFixOp(
                 }
                 offset += simdsize;
             }
+
+            // After shifting the input by one lane, in each cluster that starts in
+            // the middle of GRF, set the first lane to the identity value.
+            if (clusterSize > 0)
+            {
+                m_encoder->SetSimdSize(SIMDMode::SIMD1);
+                m_encoder->SetNoMask();
+                CVariable* pIdentityValue = m_currShader->ImmToVariable(identityValue, type);
+
+                for (int i = clusterSize; i < pSrcCopy->GetNumberElement(); i += clusterSize)
+                {
+                    m_encoder->SetDstSubReg(i);
+                    m_encoder->Copy(pSrcCopy, pIdentityValue);
+                }
+
+                m_encoder->Push();
+            }
         }
+
         pSrcsArr[i] = pSrcCopy;
     }
 
@@ -14593,7 +14626,7 @@ void EmitPass::emitPreOrPostFixOp(
         }
     };
 
-    if (m_currShader->m_dispatchSize == SIMDMode::SIMD32 && !isSimd32)
+    if (m_currShader->m_dispatchSize == SIMDMode::SIMD32 && !isSimd32AsTwoInstances)
     {
         // handling the single SIMD32 size case in PVC
         // the logic is mostly similar to the legacy code sequence below, except that
@@ -14647,6 +14680,12 @@ void EmitPass::emitPreOrPostFixOp(
                 (loop_counter * 8 + 4) /*dst subreg*/, 1 /*dst region*/);
         }
 
+        if (clusterSize == 8)
+        {
+            // With SIMD8 clusters, stop at SIMD8 prefix.
+            return;
+        }
+
         // Merge: 2 SIMD8's to get 2 SIMD16 prefix sequence
         for (uint loop_counter = 0; loop_counter < 2; ++loop_counter)
         {
@@ -14659,6 +14698,12 @@ void EmitPass::emitPreOrPostFixOp(
                 loop_counter * 16 + 8 /*dst subreg*/, 1 /*dst region*/);
         }
 
+        if (clusterSize == 16)
+        {
+            // With SIMD16 clusters, stop at SIMD16 prefix.
+            return;
+        }
+
         // final merge to get 1 SIMD32 prefix sequence and viola!
         {
             const uint src0Region[3] = { 0, 1, 0 };
@@ -14783,7 +14828,13 @@ void EmitPass::emitPreOrPostFixOp(
                 (loop_counter * 8 + 4) /*dst subreg*/, 1 /*dst region*/);
         }
 
-        if (m_currShader->m_SIMDSize == SIMDMode::SIMD16 || isSimd32)
+        if (clusterSize == 8)
+        {
+            // Stop ALU ops at SIMD8 lanes.
+            continue;
+        }
+
+        if (m_currShader->m_SIMDSize == SIMDMode::SIMD16 || isSimd32AsTwoInstances)
         {
             // Add the last element of the 1st GRF to all the elements of the 2nd GRF
             const uint src0Region[3] = { 0, 1, 0 };
@@ -14796,7 +14847,8 @@ void EmitPass::emitPreOrPostFixOp(
         }
     }
 
-    if (isSimd32 && !isQuad)
+    bool hasClusters = isQuad || clusterSize > 0;
+    if (isSimd32AsTwoInstances && !hasClusters)
     {
         // For SIMD32 we need to write the last element of the prev element to the next 16 elements
         const uint src0Region[3] = { 0, 1, 0 };
@@ -14820,13 +14872,14 @@ void EmitPass::emitPreOrPostFixOpScalar(
     CVariable* src,
     CVariable* result[2],
     CVariable* Flag,
-    bool isPrefix)
+    bool isPrefix,
+    int clusterSize)
 {
     const bool isInt64Mul = ScanReduceIsInt64Mul(op, type);
     const bool int64EmulationNeeded = ScanReduceIsInt64EmulationNeeded(op, type);
 
-    bool isSimd32 = m_currShader->m_numberInstance == 2;
-    int counter = isSimd32 ? 2 : 1;
+    bool isSimd32AsTwoInstances = m_currShader->m_numberInstance == 2;
+    int counter = isSimd32AsTwoInstances ? 2 : 1;
     CVariable* pSrcCopy[2] = {};
     for (int i = 0; i < counter; ++i)
     {
@@ -14849,7 +14902,7 @@ void EmitPass::emitPreOrPostFixOpScalar(
         if (isPrefix)
         {
             // For case where we need the prefix shift the source by 1 lane.
-            if (i == 0)
+            if (i == 0 || clusterSize == 8 || clusterSize == 16)
             {
                 // (W) mov (1) result[0] identity
                 CVariable* pIdentityValue = m_currShader->ImmToVariable(identityValue, type);
@@ -14884,6 +14937,23 @@ void EmitPass::emitPreOrPostFixOpScalar(
 
         for (int dstIdx = 1; dstIdx < numLanes(m_currShader->m_SIMDSize); ++dstIdx, ++srcIdx)
         {
+            // Scan is done one by one. With clusters, start each cluster with
+            // initial value.
+            if ((clusterSize == 8 || clusterSize == 16) && dstIdx % clusterSize == 0)
+            {
+                // For case where we need the prefix, start cluster with
+                // identity value.
+                if (isPrefix)
+                {
+                    m_encoder->SetSimdSize(SIMDMode::SIMD1);
+                    m_encoder->SetNoMask();
+                    m_encoder->SetDstSubReg(dstIdx);
+                    CVariable* pIdentityValue = m_currShader->ImmToVariable(identityValue, type);
+                    m_encoder->Copy(result[i], pIdentityValue);
+                    continue;
+                }
+            }
+
             // do the scan one by one
             // (W) op (1) result[dstIdx] srcCopy[srcIdx] result[dstIdx-1]
             if (!int64EmulationNeeded)
@@ -14924,7 +14994,7 @@ void EmitPass::emitPreOrPostFixOpScalar(
         m_encoder->SetSecondHalf(false);
     }
 
-    if (isSimd32)
+    if (isSimd32AsTwoInstances && !clusterSize)
     {
         const SIMDMode simd = SIMDMode::SIMD16;
 
@@ -22157,6 +22227,68 @@ void EmitPass::emitScan(
     m_encoder->Push();
 }
 
+void EmitPass::emitWaveClusteredPrefix(GenIntrinsicInst* I) // Emits code for a GenISA_WaveClusteredPrefix call: an exclusive (isPrefix == true) scan computed per cluster of lanes.
+{
+    auto helperLanes = int_cast<int>(cast<ConstantInt>(I->getArgOperand(3))->getSExtValue()); // operand 3: helper-lane mode (constant)
+    bool disableHelperLanes = (helperLanes == 2); // mode 2 enables the ForceDMask()/ResetVMask() bracket below
+    // Operand 2 is the cluster size; only a constant value is supported.
+    IGC_ASSERT_MESSAGE(isa<llvm::ConstantInt>(I->getOperand(2)), "Unsupported: cluster size must be constant");
+    const unsigned int clusterSize = int_cast<uint32_t>(cast<llvm::ConstantInt>(I->getOperand(2))->getZExtValue());
+    // The cluster must fit in the dispatch width and be one of 8, 16 or 32 lanes.
+    IGC_ASSERT_MESSAGE(clusterSize <= numLanes(m_currShader->m_dispatchSize), "Cluster size must be smaller or equal to SIMD");
+    IGC_ASSERT_MESSAGE(clusterSize == 8 || clusterSize == 16 || clusterSize == 32, "Cluster size must be 8/16/32");
+    // Operand 1 is an immediate selecting the wave operation; only SUM and FSUM are accepted here.
+    IGC::WaveOps Op = static_cast<IGC::WaveOps>(I->getImm64Operand(1));
+    IGC_ASSERT_MESSAGE(Op == IGC::WaveOps::SUM || Op == IGC::WaveOps::FSUM, "Unsupported op type");
+    // NOTE(review): ForceDMask() presumably makes the code below run without helper lanes; it is paired with ResetVMask() at the end - confirm.
+    if (disableHelperLanes)
+    {
+        ForceDMask();
+    }
+    // Operand 0 is the value being scanned.
+    Value* Src = I->getOperand(0);
+    // Two paths: a cluster as wide as the dispatch size needs no per-cluster handling.
+    if (clusterSize == numLanes(m_currShader->m_dispatchSize))
+    {
+        // If cluster size is equal to SIMD size, just run normal scan. NOTE(review): trailing args presumably are isInclusiveScan=false, Mask=nullptr, isQuad=false - confirm against emitScan.
+        emitScan(Src, Op, false, nullptr, false);
+    }
+    else
+    {
+        // Run scan with clusters.
+        // GetReductionOp() is given Op and the source type and is expected to fill in identity, opCode and type.
+        VISA_Type type;
+        e_opcode opCode;
+        uint64_t identity = 0;
+        GetReductionOp(Op, Src->getType(), identity, opCode, type);
+        // 8-byte element types that would need int64 emulation are rejected on this path.
+        IGC_ASSERT_MESSAGE((CEncoder::GetCISADataTypeSize(type) == 8 && ScanReduceIsInt64EmulationNeeded(opCode, type)) == false,
+            "Unsupported: 64b data type");
+        // dst[] is passed as the result array; emitPreOrPostFixOp stores one result variable per shader instance into it.
+        CVariable* src = GetSymbol(Src);
+        CVariable* dst[2] = { nullptr, nullptr };
+        // Arguments after type: negateSrc = false, pSrc, pSrcsArr, Flag = nullptr, isPrefix = true, isQuad = false, clusterSize.
+        emitPreOrPostFixOp(
+            opCode, identity, type,
+            false, src, dst, nullptr,
+            true, false, clusterSize);
+        // Copy the result into the destination; with two shader instances, dst[1] is copied with the second-half flag set.
+        m_encoder->Copy(m_destination, dst[0]);
+        if (m_currShader->m_numberInstance == 2)
+        {
+            m_encoder->SetSecondHalf(true);
+            m_encoder->Copy(m_destination, dst[1]);
+            m_encoder->SetSecondHalf(false);
+        }
+        m_encoder->Push();
+    }
+    // Undo the ForceDMask() issued at the top of the function.
+    if (disableHelperLanes)
+    {
+        ResetVMask();
+    }
+}
+
 void EmitPass::emitWaveAll(llvm::GenIntrinsicInst* inst)
 {
     bool disableHelperLanes = int_cast<int>(cast<ConstantInt>(inst->getArgOperand(2))->getSExtValue()) == 2;
 
@@ -377,7 +377,8 @@ class EmitPass : public llvm::FunctionPass
         CVariable* result[2],
         CVariable* Flag = nullptr,
         bool isPrefix = false,
-        bool isQuad = false);
+        bool isQuad = false,
+        int clusterSize = 0);
     void emitPreOrPostFixOpScalar(
         e_opcode op,
         uint64_t identityValue,
@@ -386,7 +387,8 @@ class EmitPass : public llvm::FunctionPass
         CVariable* src,
         CVariable* result[2],
         CVariable* Flag,
-        bool isPrefix);
+        bool isPrefix,
+        int clusterSize = 0);
 
     bool IsUniformAtomic(llvm::Instruction* pInst);
     void emitAtomicRaw(llvm::GenIntrinsicInst *pInst, Value *varOffset,
@@ -473,6 +475,7 @@ class EmitPass : public llvm::FunctionPass
     void emitWaveShuffleIndex(llvm::GenIntrinsicInst* inst);
     void emitWavePrefix(llvm::WavePrefixIntrinsic* I);
     void emitQuadPrefix(llvm::QuadPrefixIntrinsic* I);
+    void emitWaveClusteredPrefix(llvm::GenIntrinsicInst* I);
     void emitWaveAll(llvm::GenIntrinsicInst* inst);
     void emitWaveClustered(llvm::GenIntrinsicInst* inst);
     void emitWaveInterleave(llvm::GenIntrinsicInst* inst);
 
@@ -113,6 +113,7 @@ void IGC::HalfPromotion::handleGenIntrinsic(llvm::GenIntrinsicInst& I)
     GenISAIntrinsic::ID id = I.getIntrinsicID();
     if (id == GenISAIntrinsic::GenISA_WaveAll ||
         id == GenISAIntrinsic::GenISA_WavePrefix ||
+        id == GenISAIntrinsic::GenISA_WaveClusteredPrefix ||
         id == GenISAIntrinsic::GenISA_WaveClustered ||
         id == GenISAIntrinsic::GenISA_WaveInterleave ||
         id == GenISAIntrinsic::GenISA_WaveClusteredInterleave)
 
@@ -1378,6 +1378,7 @@ namespace IGC
             case GenISAIntrinsic::GenISA_WaveInterleave:
             case GenISAIntrinsic::GenISA_WaveClusteredInterleave:
             case GenISAIntrinsic::GenISA_WavePrefix:
+            case GenISAIntrinsic::GenISA_WaveClusteredPrefix:
                 match = MatchWaveInstruction(*GII);
                 break;
             case GenISAIntrinsic::GenISA_simdBlockRead:
@@ -5303,6 +5304,7 @@ namespace IGC
             break;
         case GenISAIntrinsic::GenISA_WaveInterleave:
         case GenISAIntrinsic::GenISA_WaveClustered:
+        case GenISAIntrinsic::GenISA_WaveClusteredPrefix:
             helperLaneIndex = 3;
             break;
         case GenISAIntrinsic::GenISA_WavePrefix:
 
@@ -1138,7 +1138,8 @@ void PromoteInt8Type::promoteIntrinsic()
             GII->isGenIntrinsic(GenISAIntrinsic::GenISA_WaveInterleave) ||
             GII->isGenIntrinsic(GenISAIntrinsic::GenISA_WaveClusteredInterleave) ||
             GII->isGenIntrinsic(GenISAIntrinsic::GenISA_WavePrefix) ||
-            GII->isGenIntrinsic(GenISAIntrinsic::GenISA_QuadPrefix))
+            GII->isGenIntrinsic(GenISAIntrinsic::GenISA_QuadPrefix) ||
+            GII->isGenIntrinsic(GenISAIntrinsic::GenISA_WaveClusteredPrefix))
         {
             // Those are scan or reduce functions. If the operand type
             // is of I8, need to promote it to avoid ALU on I8 type.
@@ -1165,6 +1166,7 @@ void PromoteInt8Type::promoteIntrinsic()
                 gid == GenISAIntrinsic::GenISA_WaveClusteredInterleave ||
                 gid == GenISAIntrinsic::GenISA_WavePrefix ||
                 gid == GenISAIntrinsic::GenISA_QuadPrefix ||
+                gid == GenISAIntrinsic::GenISA_WaveClusteredPrefix ||
                 gid == GenISAIntrinsic::GenISA_WaveShuffleIndex ||
                 gid == GenISAIntrinsic::GenISA_WaveBroadcast ||
                 gid == GenISAIntrinsic::GenISA_WaveClusteredBroadcast ||
@@ -1207,6 +1209,7 @@ void PromoteInt8Type::promoteIntrinsic()
                 case GenISAIntrinsic::GenISA_WaveClustered:
                 case GenISAIntrinsic::GenISA_WaveInterleave:
                 case GenISAIntrinsic::GenISA_WaveClusteredBroadcast:
+                case GenISAIntrinsic::GenISA_WaveClusteredPrefix:
                 {
                     // prototype:
                     //     Ty <clustered> (Ty, char, int, int)