@@ -7801,7 +7801,35 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
78017801 {
78027802 return ;
78037803 }
7804-
7804+ // indirect data address is at r0.0[5:31]
7805+ // local thread id is at r0.2[0:7]
7806+ // use r127 as the header for each oword load
7807+ uint32_t startGRF =
7808+ kernel.getOptions ()->getuInt32Option (vISA_loadThreadPayloadStartReg);
7809+ uint32_t inputEnd = 32 ;
7810+ uint32_t inputCount = kernel.fg .builder ->getInputCount ();
7811+ for (unsigned int id = 0 ; id < inputCount; id++)
7812+ {
7813+ input_info_t * input_info = kernel.fg .builder ->getInputArg (id);
7814+ // skip pseudo input for register bindings.
7815+ if (input_info->isPseudoInput ())
7816+ {
7817+ continue ;
7818+ }
7819+ if (kernel.fg .builder ->getFCPatchInfo ()->getIsEntryKernel ())
7820+ {
7821+ vISA::G4_Declare* dcl = input_info->dcl ;
7822+ if (INPUT_GENERAL == input_info->getInputClass () && !(dcl->isLiveIn ()))
7823+ {
7824+ break ;
7825+ }
7826+ }
7827+ if (inputEnd < (unsigned )(input_info->size + input_info->offset ))
7828+ {
7829+ inputEnd = input_info->size + input_info->offset ;
7830+ }
7831+ }
7832+ int numGRF = ((inputEnd + getGRFSize () - 1 ) / getGRFSize ()) - startGRF;
78057833 std::vector<G4_INST*> instBuffer;
78067834
78077835 G4_Declare* r0 = builder.createHardwiredDeclare (8 , Type_UD, 0 , 0 );
@@ -7934,56 +7962,6 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
79347962 int addrSubreg = 2 ;
79357963
79367964 G4_BB* perThreadBB = nullptr ;
7937-
7938- // Calculate the payload size:
7939- // indirect data address is at r0.0[5:31]
7940- // local thread id is at r0.2[0:7]
7941- // use r127 as the header for each oword load
7942- uint32_t startGRF =
7943- kernel.getOptions ()->getuInt32Option (vISA_loadThreadPayloadStartReg);
7944- uint32_t inputEnd = 32 ;
7945- uint32_t inputCount = kernel.fg .builder ->getInputCount ();
7946- for (unsigned int id = 0 ; id < inputCount; id++)
7947- {
7948- input_info_t * input_info = kernel.fg .builder ->getInputArg (id);
7949- // skip pseudo input for register bindings.
7950- if (input_info->isPseudoInput ())
7951- {
7952- continue ;
7953- }
7954- if (kernel.fg .builder ->getFCPatchInfo ()->getIsEntryKernel ())
7955- {
7956- vISA::G4_Declare* dcl = input_info->dcl ;
7957- if (INPUT_GENERAL == input_info->getInputClass () && !(dcl->isLiveIn ()))
7958- {
7959- break ;
7960- }
7961- }
7962- if (inputEnd < (unsigned )(input_info->size + input_info->offset ))
7963- {
7964- inputEnd = input_info->size + input_info->offset ;
7965- }
7966- }
7967- // cross-thread-payload size must be 32-bytes aligned hence the entire payload must be 32-bytes aligned too
7968- // GRF size must be 32-bytes aligned so align inputEnd to 32-bytes satisfies it
7969- inputEnd = (inputEnd % 32 ) ? inputEnd + 32 - inputEnd % 32 : inputEnd;
7970- uint32_t payloadSizeByte = inputEnd < startGRF * getGRFSize () ? 0 : inputEnd - startGRF * getGRFSize ();
7971- int PTIS = kernel.getInt32KernelAttr (Attributes::ATTR_PerThreadInputSize);
7972- int CTIS = kernel.getInt32KernelAttr (Attributes::ATTR_CrossThreadInputSize);
7973- // per-thread-data size must be GRF-size aligned,
7974- uint32_t numPerThreadGRF = PTIS / numEltPerGRF<Type_UB>();
7975- // cross-thread-data size must be 32-bytes aligned
7976- uint32_t numCrossThreadDW = (CTIS < 0 ) ?
7977- (payloadSizeByte - numPerThreadGRF * getGRFSize ()) / TypeSize (Type_UD) :
7978- CTIS % 32 ? CTIS + 32 - CTIS % 32 : CTIS;
7979-
7980- if (useInlineData)
7981- {
7982- // Skip the 1st GRF
7983- numCrossThreadDW = numCrossThreadDW >= numEltPerGRF<Type_UD>() ?
7984- numCrossThreadDW - numEltPerGRF<Type_UD>() : 0 ;
7985- }
7986-
79877965 // Load per-thread data, if any. Per-thread data always start from r1
79887966 // this is a fixed size 8 inst (nop padded as necessary), which may be skipped
79897967 // by runtime if the local_id are auto-generated by HW.
@@ -7992,6 +7970,15 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
79927970 // forward start label is 64B aligned.
79937971 if (builder.needsToLoadLocalID ())
79947972 {
7973+ int PTIS = kernel.getInt32KernelAttr (Attributes::ATTR_PerThreadInputSize);
7974+ int CTIS = kernel.getInt32KernelAttr (Attributes::ATTR_CrossThreadInputSize);
7975+ uint32_t numPerThreadGRF = PTIS / numEltPerGRF<Type_UB>();
7976+ uint32_t numCrossThreadGRF = (CTIS < 0 ) ? numGRF - numPerThreadGRF : CTIS / numEltPerGRF<Type_UB>();
7977+
7978+ if (useInlineData)
7979+ {
7980+ numCrossThreadGRF--;
7981+ }
79957982 instBuffer.push_back (getLabel (" per_thread_prolog" ));
79967983
79977984 // compute per-thread starting address (r127.2)
@@ -8020,7 +8007,7 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
80208007 // create a relocation for cross_thread_size (per_thread_payload_offset). In case of the
80218008 // cross_thread_size is changed after compilation (e.g. gtpin inserted argument), the relocation
80228009 // need to be resolved to the new cross_thread_size.
8023- G4_Operand* addSrc1 = builder.createRelocImm (numCrossThreadDW * TypeSize (Type_UD) , Type_UW);
8010+ G4_Operand* addSrc1 = builder.createRelocImm (numCrossThreadGRF * numEltPerGRF<Type_UB>() , Type_UW);
80248011 auto addDst = builder.createDst (rtail->getRegVar (), 0 , 2 , 1 , Type_UD);
80258012 // instruction has relocation must not be compacted
80268013 auto addInst = builder.createBinOp (G4_add, g4::SIMD1, addDst, addSrc0,
@@ -8098,15 +8085,21 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
80988085 // and it will be either at R1 (if local id is not auto-generated) or
80998086 // R1 + sizeof(local id) (if local id is auto-generated).
81008087 {
8088+ int PTIS = kernel.getInt32KernelAttr (Attributes::ATTR_PerThreadInputSize);
8089+ int CTIS = kernel.getInt32KernelAttr (Attributes::ATTR_CrossThreadInputSize);
8090+
8091+ uint32_t numPerThreadGRF = PTIS / numEltPerGRF<Type_UB>();
8092+ uint32_t numCrossThreadGRF = (CTIS < 0 ) ? numGRF - numPerThreadGRF : CTIS / numEltPerGRF<Type_UB>();
81018093 uint32_t crossThreadStart = startGRF + numPerThreadGRF;
8102- // first GRF of cross-thread data is already loaded
8094+
81038095 if (useInlineData)
8096+ {
8097+ // first GRF of cross-thread data is already loaded
81048098 crossThreadStart++;
8105-
8099+ numCrossThreadGRF--;
8100+ }
81068101 {
8107- // GRF size is 32-bytes in this case so numCrossThreadDW must be GRF size aligned
8108- assert (!(numCrossThreadDW % numEltPerGRF<Type_UD>()));
8109- loadFromMemory (rtail, crossThreadStart, numCrossThreadDW / numEltPerGRF<Type_UD>());
8102+ loadFromMemory (rtail, crossThreadStart, numCrossThreadGRF);
81108103 }
81118104 }
81128105
0 commit comments