intel
diff --git a/IGC/Compiler/CISACodeGen/CMakeLists.txt b/IGC/Compiler/CISACodeGen/CMakeLists.txt
Lines changed: 2 additions & 0 deletions
diff --git a/IGC/Compiler/CISACodeGen/ComputeShaderBase.cpp b/IGC/Compiler/CISACodeGen/ComputeShaderBase.cpp
Lines changed: 22 additions & 242 deletions
diff --git a/IGC/Compiler/CISACodeGen/ComputeShaderBase.hpp b/IGC/Compiler/CISACodeGen/ComputeShaderBase.hpp
Lines changed: 19 additions & 22 deletions
@@ -37,6 +37,7 @@ set(IGC_BUILD__SRC__CISACodeGen_Common
     "${CMAKE_CURRENT_SOURCE_DIR}/CodeHoisting.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/CollectGeometryShaderProperties.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/ComputeShaderBase.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/ComputeShaderCommon.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/ComputeShaderCodeGen.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/ConstantCoalescing.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/CShader.cpp"
@@ -130,6 +131,7 @@ set(IGC_BUILD__HDR__CISACodeGen_Common
     "${CMAKE_CURRENT_SOURCE_DIR}/CodeHoisting.hpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/CollectGeometryShaderProperties.hpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/ComputeShaderBase.hpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/ComputeShaderCommon.hpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/ComputeShaderCodeGen.hpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/ConstantCoalescing.hpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/CVariable.hpp"
 
@@ -42,256 +42,36 @@ namespace IGC
 
     CComputeShaderBase::~CComputeShaderBase() {}
 
-    void CComputeShaderBase::CreateThreadPayloadData(
-        void*& pThreadPayload,
-        uint& curbeTotalDataLength,
-        uint& curbeReadLength,
-        ThreadIDLayout layout) const
     // Picks the thread-ID layout and walk order for this compute shader and
     // stores the result in m_ThreadIDLayout / m_walkOrder.
     //   1. If the module metadata requests ThreadIDLayout::QuadTile, that
     //      layout is selected and the function returns immediately, leaving
     //      m_walkOrder unchanged.
     //   2. Otherwise TileY / WO_YXZ is chosen when typed accesses are at least
     //      as numerous as untyped ones, threadGroupSize_Y is a multiple of 4,
     //      the metadata does not disable local-ID order optimizations, and the
     //      UseTiledCSThreadOrder flag is enabled.
     //   3. Finally, a metadata request for ThreadIDLayout::X overrides step 2
     //      with X / WO_XYZ.
     // When no branch fires, both members keep whatever value they already had.
     // NOTE(review): useLinearWalk, threadGroupSize_X and threadGroupSize_Z are
     // never read in the body shown in this hunk - confirm that is intended.
+    void CComputeShaderBase::selectWalkOrder(
+        bool useLinearWalk,
+        uint numberOfTypedAccess,
+        uint numberOfUntypedAccess,
+        uint threadGroupSize_X,
+        uint threadGroupSize_Y,
+        uint threadGroupSize_Z)
     {
-        typedef uint16_t ThreadPayloadEntry;
+        const CodeGenContext* pCtx = GetContext();
 
-        // Find the max thread group dimension
-        const OctEltUnit SIZE_OF_DQWORD = OctEltUnit(2);
-        const OctEltUnit SIZE_OF_OWORD = OctEltUnit(1);
-        uint numberOfId = GetNumberOfId();
-        uint dimX = numLanes(m_dispatchSize);
-        // dimX must align to alignment_X bytes (one GRF)
-        uint alignment_X = EltUnit(SIZE_OF_OWORD).Count() * sizeof(DWORD);
-        uint dimX_aligned = iSTD::Align(dimX * sizeof(ThreadPayloadEntry), alignment_X) / sizeof(ThreadPayloadEntry);
-        uint dimY = (iSTD::Align(m_threadGroupSize, dimX) / dimX) * numberOfId;
-        curbeReadLength = dimX_aligned * numberOfId * sizeof(ThreadPayloadEntry) / alignment_X;
-
-        uint alignedVal = EltUnit(SIZE_OF_DQWORD).Count() * sizeof(ThreadPayloadEntry); // Oct Element is 8 Entries
-        // m_NOSBufferSize is the additional space for cross-thread constant data (constants set by driver).
-        curbeTotalDataLength = iSTD::Align(dimX_aligned * dimY * sizeof(ThreadPayloadEntry) + m_NOSBufferSize, alignedVal);
-
-        IGC_ASSERT_MESSAGE((pThreadPayload == nullptr), "Thread payload should be a null variable");
-
-        unsigned threadPayloadEntries = curbeTotalDataLength / sizeof(ThreadPayloadEntry);
-
-        ThreadPayloadEntry* pThreadPayloadMem = (ThreadPayloadEntry*)IGC::aligned_malloc(threadPayloadEntries * sizeof(ThreadPayloadEntry), 16);
-        IGC_ASSERT(nullptr != pThreadPayloadMem);
-        std::fill(pThreadPayloadMem, pThreadPayloadMem + threadPayloadEntries, 0);
-
-        pThreadPayload = pThreadPayloadMem;
-
-        // Increase the pointer to per-thread constant data by the number of allocated
-        // cross-thread constants.
-        pThreadPayloadMem += (m_NOSBufferSize / sizeof(ThreadPayloadEntry));
-
-        uint currThreadX = 0;
-        uint currThreadY = 0;
-        uint currThreadZ = 0;
-
-        // Current heuristic is trivial, if there are more typed access than untyped access we walk in tile
-        // otherwise we walk linearly
-
-        for (uint y = 0; y < dimY; y += numberOfId)
-        {
-            for (uint x = 0; x < dimX; ++x)
-            {
-                uint lane = 0;
-                if (m_pThread_ID_in_Group_X)
-                {
-                    pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadX;
-                    lane++;
-                }
-                if (m_pThread_ID_in_Group_Y)
-                {
-                    pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadY;
-                    lane++;
-                }
-                if (m_pThread_ID_in_Group_Z)
-                {
-                    pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadZ;
-                    lane++;
-                }
-
-                if(layout == ThreadIDLayout::TileY)
-                {
-                    const unsigned int tileSizeY = 4;
-                    ++currThreadY;
-
-                    if (currThreadY % tileSizeY == 0)
-                    {
-                        currThreadY -= tileSizeY;
-                        ++currThreadX;
-                    }
-
-                    if (currThreadX >= m_threadGroupSize_X)
-                    {
-                        currThreadX = 0;
-                        currThreadY += tileSizeY;
-                    }
-
-                    if (currThreadY >= m_threadGroupSize_Y)
-                    {
-                        currThreadY = 0;
-                        ++currThreadZ;
-                    }
-
-                    if (currThreadZ >= m_threadGroupSize_Z)
-                    {
-                        currThreadZ = 0;
-                    }
-                }
-                else if (layout == ThreadIDLayout::X)
-                {
-                    ++currThreadX;
-
-                    if (currThreadX >= m_threadGroupSize_X)
-                    {
-                        currThreadX = 0;
-                        ++currThreadY;
-                    }
-
-                    if (currThreadY >= m_threadGroupSize_Y)
-                    {
-                        currThreadY = 0;
-                        ++currThreadZ;
-                    }
-
-                    if (currThreadZ >= m_threadGroupSize_Z)
-                    {
-                        currThreadZ = 0;
-                    }
-                }
-                else if (layout == ThreadIDLayout::QuadTile)
-                {
-                    const unsigned int tileSizeX = 2;
-                    const unsigned int tileSizeY = 2;
-                    ++currThreadX;
-
-                    if (currThreadX % tileSizeX == 0)
-                    {
-                        ++currThreadY;
-                    }
-
-                    if ((currThreadX % tileSizeX == 0) &&
-                        (currThreadY % tileSizeY == 0))
-                    {
-                        currThreadY -= tileSizeY;
-                    }
-                    else if (currThreadX % tileSizeX == 0)
-                    {
-                        currThreadX -= tileSizeX;
-                    }
-
-                    if (currThreadX >= m_threadGroupSize_X)
-                    {
-                        currThreadX = 0;
-                        currThreadY += tileSizeY;
-                    }
-
-                    if (currThreadY >= m_threadGroupSize_Y)
-                    {
-                        currThreadX = 0;
-                        currThreadY = 0;
-                        ++currThreadZ;
-                    }
-
-                    if (currThreadZ >= m_threadGroupSize_Z)
-                    {
-                        currThreadZ = 0;
-                    }
-                }
-                else
-                {
-                    IGC_ASSERT_MESSAGE(0, "unhandled layout!");
-                }
-            }
-        }
-    }
-
-    CVariable* CComputeShaderBase::CreateThreadIDinGroup(SGVUsage channelNum)
-    {
-        IGC_ASSERT_MESSAGE((channelNum <= THREAD_ID_IN_GROUP_Z), "Thread id's are in 3 dimensions only");
-        IGC_ASSERT_MESSAGE((channelNum >= THREAD_ID_IN_GROUP_X), "Thread id's are in 3 dimensions only");
-
-        switch(channelNum)
-        {
-        case THREAD_ID_IN_GROUP_X:
-            if(m_pThread_ID_in_Group_X == nullptr)
-            {
-                m_pThread_ID_in_Group_X = GetNewVariable(
-                    numLanes(m_SIMDSize), ISA_TYPE_W, getGRFAlignment(), false, m_numberInstance, "threadIdInGroupX");
-            }
-            return m_pThread_ID_in_Group_X;
-        case THREAD_ID_IN_GROUP_Y:
-            if(m_pThread_ID_in_Group_Y == nullptr)
-            {
-                m_pThread_ID_in_Group_Y = GetNewVariable(
-                    numLanes(m_SIMDSize), ISA_TYPE_W, getGRFAlignment(), false, m_numberInstance, "threadIdInGroupY");
-            }
-            return m_pThread_ID_in_Group_Y;
-        case THREAD_ID_IN_GROUP_Z:
-            if(m_pThread_ID_in_Group_Z == nullptr)
-            {
-                m_pThread_ID_in_Group_Z = GetNewVariable(
-                    numLanes(m_SIMDSize), ISA_TYPE_W, getGRFAlignment(), false, m_numberInstance, "threadIdInGroupZ");
-            }
-            return m_pThread_ID_in_Group_Z;
-        default:
-            IGC_ASSERT_MESSAGE(0, "Invalid channel number");
-            break;
-        }
-
-        return nullptr;
-    }
-
-    void CComputeShaderBase::AllocatePerThreadConstantData(uint32_t &offset)
-    {
-        // Per-thread constant data.
-        if (m_pThread_ID_in_Group_X)
-        {
-            for (uint i = 0; i < m_pThread_ID_in_Group_X->GetNumberInstance(); i++)
-            {
-                AllocateInput(m_pThread_ID_in_Group_X, offset, i);
-                offset += m_pThread_ID_in_Group_X->GetSize();
-                offset = iSTD::Round(offset, alignmentSize[m_pThread_ID_in_Group_X->GetAlign()]);
-            }
-        }
-
-        if (m_pThread_ID_in_Group_Y)
-        {
-            for (uint i = 0; i < m_pThread_ID_in_Group_Y->GetNumberInstance(); i++)
-            {
-                AllocateInput(m_pThread_ID_in_Group_Y, offset, i);
-                offset += m_pThread_ID_in_Group_Y->GetSize();
-                offset = iSTD::Round(offset, alignmentSize[m_pThread_ID_in_Group_Y->GetAlign()]);
-            }
-        }
-
-        if (m_pThread_ID_in_Group_Z)
-        {
-            for (uint i = 0; i < m_pThread_ID_in_Group_Z->GetNumberInstance(); i++)
-            {
-                AllocateInput(m_pThread_ID_in_Group_Z, offset, i);
-                offset += m_pThread_ID_in_Group_Z->GetSize();
-                offset = iSTD::Round(offset, alignmentSize[m_pThread_ID_in_Group_Z->GetAlign()]);
-            }
-        }
-    }
-
-    uint CComputeShaderBase::GetNumberOfId() const
-    {
-        uint numberIdPushed = 0;
-
-        if (m_pThread_ID_in_Group_X)
         // A QuadTile request from the module metadata takes precedence over
         // every heuristic below; m_walkOrder is deliberately not assigned here.
+        if (pCtx->getModuleMetaData()->csInfo.neededThreadIdLayout == ThreadIDLayout::QuadTile)
         {
-            ++numberIdPushed;
+            m_ThreadIDLayout = ThreadIDLayout::QuadTile;
+            return;
         }
 
-        if (m_pThread_ID_in_Group_Y)
-        {
-            ++numberIdPushed;
         // Tiled-Y heuristic: requires typed accesses >= untyped accesses and a
         // group height divisible by 4, and can be vetoed by the metadata or by
         // the UseTiledCSThreadOrder flag.
+        if ((numberOfTypedAccess >= numberOfUntypedAccess) &&
+            threadGroupSize_Y % 4 == 0 &&
+            !pCtx->getModuleMetaData()->csInfo.disableLocalIdOrderOptimizations &&
+            IGC_IS_FLAG_ENABLED(UseTiledCSThreadOrder)) {
+            m_ThreadIDLayout = ThreadIDLayout::TileY;
+            m_walkOrder = WO_YXZ;
         }
 
-        if (m_pThread_ID_in_Group_Z)
         // An explicit metadata request for the X layout is evaluated last, so it
         // overrides a TileY selection made just above.
+        bool needsLinearWalk =
+            pCtx->getModuleMetaData()->csInfo.neededThreadIdLayout == ThreadIDLayout::X;
+        if (needsLinearWalk)
         {
-            ++numberIdPushed;
+            m_ThreadIDLayout = ThreadIDLayout::X;
+            m_walkOrder = WO_XYZ;
         }
-
-        return numberIdPushed;
     }
 }
@@ -33,28 +33,25 @@ namespace IGC
     public:
         CComputeShaderBase(llvm::Function* pFunc, CShaderProgram* pProgram);
         virtual ~CComputeShaderBase();
-
-        void        CreateThreadPayloadData(
-                        void*& pThreadPayload,
-                        uint& curbeTotalDataLength,
-                        uint& curbeReadLength,
-                        ThreadIDLayout layout) const;
-        void        AllocatePerThreadConstantData(uint32_t &offset);
-        uint        GetNumberOfId() const;
-
-        /// Get the Thread ID's in Group
-        CVariable* CreateThreadIDinGroup(SGVUsage channelNum);
-        uint       GetThreadGroupSize() const { return m_threadGroupSize; }
     protected:
-        /// Size of a thread group (X x Y x Z) provided by the front-end.
-        uint                   m_threadGroupSize   = 0;
-        uint                   m_threadGroupSize_X = 0;
-        uint                   m_threadGroupSize_Y = 0;
-        uint                   m_threadGroupSize_Z = 0;
-
-        /// The set of X/Y/Z that form the local thread ID for each channel.
-        CVariable* m_pThread_ID_in_Group_X = nullptr;
-        CVariable* m_pThread_ID_in_Group_Y = nullptr;
-        CVariable* m_pThread_ID_in_Group_Z = nullptr;
         /// Selects the thread-ID layout and walk order; the outcome is stored
         /// in m_ThreadIDLayout and m_walkOrder rather than returned.
         /// Takes a linear-walk hint, the typed and untyped access counts, and
         /// the thread-group dimensions.
+        void selectWalkOrder(
+            bool useLinearWalk,
+            uint numberOfTypedAccess,
+            uint numberOfUntypedAccess,
+            uint threadGroupSize_X,
+            uint threadGroupSize_Y,
+            uint threadGroupSize_Z);
+
         /// Thread-ID layout currently selected; starts out as X.
+        ThreadIDLayout m_ThreadIDLayout = ThreadIDLayout::X;
+
         /// Identifiers for the six permutations of the X/Y/Z axes.
         /// NOTE(review): presumably each name lists the axes in the order they
         /// are walked - confirm against the code that consumes m_walkOrder.
+        enum WALK_ORDER {
+            WO_XYZ,
+            WO_XZY,
+            WO_YXZ,
+            WO_ZXY,
+            WO_YZX,
+            WO_ZYX
+        };
         /// Walk order currently selected; starts out as WO_XYZ.
+        WALK_ORDER m_walkOrder = WALK_ORDER::WO_XYZ;
     };
 }