[Autobackout][FuncReg]Revert of change: e27c419

DianaChen · igcbot · commit 826616cf5d9c · 2021-11-12T06:38:32.000+01:00
vISA: Cross-thread size should be 32-byte aligned instead of GRF size aligned (2nd try)

Cross-thread-size alignment is the convention between IGC and NEO and should be 32-byte aligned.
Update the vISA load-payload prolog generation accordingly. Also update the patch token value "dataParameterStreamSize"
to report the same size as expected in vISA.
diff --git a/IGC/AdaptorOCL/OCL/sp/sp_g8.cpp b/IGC/AdaptorOCL/OCL/sp/sp_g8.cpp
@@ -1878,9 +1878,8 @@ RETVAL CGen8OpenCLStateProcessor::CreatePatchList(
             annotations.m_PrivateMemSize->Offset + iOpenCL::DATA_PARAMETER_DATA_SIZE );
     }
 
-    // Payload must be a multiple of 32 bytes
-    // This assumption has to be the same as in vISA::Optimizer::loadThreadPayload
-    dataParameterStreamSize += GetAlignmentOffset(dataParameterStreamSize, 32);
+    // Payload must be a multiple of a GRF register
+    dataParameterStreamSize += GetAlignmentOffset(dataParameterStreamSize, CPlatform(m_Platform).getGRFSize());
 
     if( retValue.Success )
     {
diff --git a/visa/Optimizer.cpp b/visa/Optimizer.cpp
@@ -7801,7 +7801,35 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
         {
             return;
         }
-
+        // indirect data address is at r0.0[5:31]
+        // local thread id is at r0.2[0:7]
+        // use r127 as the header for each oword load
+        uint32_t startGRF =
+            kernel.getOptions()->getuInt32Option(vISA_loadThreadPayloadStartReg);
+        uint32_t inputEnd = 32;
+        uint32_t inputCount = kernel.fg.builder->getInputCount();
+        for (unsigned int id = 0; id < inputCount; id++)
+        {
+            input_info_t* input_info = kernel.fg.builder->getInputArg(id);
+            // skip pseudo input for register bindings.
+            if (input_info->isPseudoInput())
+            {
+                continue;
+            }
+            if (kernel.fg.builder->getFCPatchInfo()->getIsEntryKernel())
+            {
+              vISA::G4_Declare* dcl = input_info->dcl;
+              if (INPUT_GENERAL == input_info->getInputClass() && !(dcl->isLiveIn()))
+              {
+                  break;
+              }
+            }
+            if (inputEnd < (unsigned)(input_info->size + input_info->offset))
+            {
+                inputEnd = input_info->size + input_info->offset;
+            }
+        }
+        int numGRF = ((inputEnd + getGRFSize() - 1) / getGRFSize()) - startGRF;
         std::vector<G4_INST*> instBuffer;
 
         G4_Declare* r0 = builder.createHardwiredDeclare(8, Type_UD, 0, 0);
@@ -7934,56 +7962,6 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
         int addrSubreg = 2;
 
         G4_BB* perThreadBB = nullptr;
-
-        // Calculate the payload size:
-        // indirect data address is at r0.0[5:31]
-        // local thread id is at r0.2[0:7]
-        // use r127 as the header for each oword load
-        uint32_t startGRF =
-            kernel.getOptions()->getuInt32Option(vISA_loadThreadPayloadStartReg);
-        uint32_t inputEnd = 32;
-        uint32_t inputCount = kernel.fg.builder->getInputCount();
-        for (unsigned int id = 0; id < inputCount; id++)
-        {
-            input_info_t* input_info = kernel.fg.builder->getInputArg(id);
-            // skip pseudo input for register bindings.
-            if (input_info->isPseudoInput())
-            {
-                continue;
-            }
-            if (kernel.fg.builder->getFCPatchInfo()->getIsEntryKernel())
-            {
-                vISA::G4_Declare* dcl = input_info->dcl;
-                if (INPUT_GENERAL == input_info->getInputClass() && !(dcl->isLiveIn()))
-                {
-                    break;
-                }
-            }
-            if (inputEnd < (unsigned)(input_info->size + input_info->offset))
-            {
-                inputEnd = input_info->size + input_info->offset;
-            }
-        }
-        // cross-thread-payload size must be 32-bytes aligned hence the entire payload must be 32-bytes aligned too
-        // GRF size must be 32-bytes aligned so align inputEnd to 32-bytes satisfies it
-        inputEnd = (inputEnd % 32) ? inputEnd + 32 - inputEnd % 32 : inputEnd;
-        uint32_t payloadSizeByte = inputEnd < startGRF * getGRFSize() ? 0 : inputEnd - startGRF * getGRFSize();
-        int PTIS = kernel.getInt32KernelAttr(Attributes::ATTR_PerThreadInputSize);
-        int CTIS = kernel.getInt32KernelAttr(Attributes::ATTR_CrossThreadInputSize);
-        // per-thread-data size must be GRF-size aligned,
-        uint32_t numPerThreadGRF = PTIS / numEltPerGRF<Type_UB>();
-        // cross-thread-data size must be 32-bytes aligned
-        uint32_t numCrossThreadDW = (CTIS < 0) ?
-            (payloadSizeByte - numPerThreadGRF * getGRFSize()) / TypeSize(Type_UD) :
-            CTIS % 32 ? CTIS + 32 - CTIS % 32 : CTIS;
-
-        if (useInlineData)
-        {
-            // Skip the 1st GRF
-            numCrossThreadDW = numCrossThreadDW >= numEltPerGRF<Type_UD>() ?
-                numCrossThreadDW - numEltPerGRF<Type_UD>() : 0;
-        }
-
         // Load per-thread data, if any. Per-thread data always start from r1
         // this is a fixed size 8 inst (nop padded as necessary), which may be skipped
         // by runtime if the local_id are auto-generated by HW.
@@ -7992,6 +7970,15 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
         // forward start label is 64B aligned.
         if (builder.needsToLoadLocalID())
         {
+            int PTIS = kernel.getInt32KernelAttr(Attributes::ATTR_PerThreadInputSize);
+            int CTIS = kernel.getInt32KernelAttr(Attributes::ATTR_CrossThreadInputSize);
+            uint32_t numPerThreadGRF = PTIS / numEltPerGRF<Type_UB>();
+            uint32_t numCrossThreadGRF = (CTIS < 0) ? numGRF - numPerThreadGRF : CTIS / numEltPerGRF<Type_UB>();
+
+            if (useInlineData)
+            {
+                numCrossThreadGRF--;
+            }
             instBuffer.push_back(getLabel("per_thread_prolog"));
 
             // compute per-thread starting address (r127.2)
@@ -8020,7 +8007,7 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
             // create a relocation for cross_thread_size (per_thread_payload_offset). In case of the
             // cross_thread_size is changed after compilation (e.g. gtpin inserted argument), the relocation
             // need to be resolved to the new cross_thread_size.
-            G4_Operand* addSrc1 = builder.createRelocImm(numCrossThreadDW * TypeSize(Type_UD) , Type_UW);
+            G4_Operand* addSrc1 = builder.createRelocImm(numCrossThreadGRF * numEltPerGRF<Type_UB>(), Type_UW);
             auto addDst = builder.createDst(rtail->getRegVar(), 0, 2, 1, Type_UD);
             // instruction has relocation must not be compacted
             auto addInst = builder.createBinOp(G4_add, g4::SIMD1, addDst, addSrc0,
@@ -8098,15 +8085,21 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
             // and it will be either at R1 (if local id is not auto-generated) or
             // R1 + sizeof(local id) (if local id is auto-generated).
             {
+                int PTIS = kernel.getInt32KernelAttr(Attributes::ATTR_PerThreadInputSize);
+                int CTIS = kernel.getInt32KernelAttr(Attributes::ATTR_CrossThreadInputSize);
+
+                uint32_t numPerThreadGRF = PTIS / numEltPerGRF<Type_UB>();
+                uint32_t numCrossThreadGRF = (CTIS < 0) ? numGRF - numPerThreadGRF : CTIS / numEltPerGRF<Type_UB>();
                 uint32_t crossThreadStart = startGRF + numPerThreadGRF;
-                // first GRF of cross-thread data is already loaded
+
                 if (useInlineData)
+                {
+                    // first GRF of cross-thread data is already loaded
                     crossThreadStart++;
-
+                    numCrossThreadGRF--;
+                }
                 {
-                    // GRF size is 32-bytes in this case so numCrossThreadDW must be GRF size aligned
-                    assert(!(numCrossThreadDW % numEltPerGRF<Type_UD>()));
-                    loadFromMemory(rtail, crossThreadStart, numCrossThreadDW / numEltPerGRF<Type_UD>());
+                    loadFromMemory(rtail, crossThreadStart, numCrossThreadGRF);
                 }
             }
 

Original file line number	Diff line number	Diff line change
`@@ -1878,9 +1878,8 @@ RETVAL CGen8OpenCLStateProcessor::CreatePatchList(`
`1878`	`1878`	`annotations.m_PrivateMemSize->Offset + iOpenCL::DATA_PARAMETER_DATA_SIZE );`
`1879`	`1879`	`}`
`1880`	`1880`
`1881`		`- // Payload must be a multiple of 32 bytes`
`1882`		`- // This assumption has to be the same as in vISA::Optimizer::loadThreadPayload`
`1883`		`- dataParameterStreamSize += GetAlignmentOffset(dataParameterStreamSize, 32);`
	`1881`	`+ // Payload must be a multiple of a GRF register`
	`1882`	`+ dataParameterStreamSize += GetAlignmentOffset(dataParameterStreamSize, CPlatform(m_Platform).getGRFSize());`
`1884`	`1883`
`1885`	`1884`	`if( retValue.Success )`
`1886`	`1885`	`{`