Skip to content

Commit 826616c

Browse files
DianaChen authored and igcbot committed
[Autobackout][FuncReg]Revert of change: e27c419
vISA: Cross-thread size should be 32-byte aligned instead of GRF size aligned (2nd try). Cross-thread-size alignment is the convention between IGC and NEO and should be 32-byte aligned. Update vISA load payload prolog generation accordingly. Also update patch token value "dataParameterStreamSize" to report the same size as expected in vISA.
1 parent 565703d commit 826616c

File tree

2 files changed

+52
-60
lines changed

2 files changed

+52
-60
lines changed

IGC/AdaptorOCL/OCL/sp/sp_g8.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1878,9 +1878,8 @@ RETVAL CGen8OpenCLStateProcessor::CreatePatchList(
18781878
annotations.m_PrivateMemSize->Offset + iOpenCL::DATA_PARAMETER_DATA_SIZE );
18791879
}
18801880

1881-
// Payload must be a multiple of 32 bytes
1882-
// This assumption has to be the same as in vISA::Optimizer::loadThreadPayload
1883-
dataParameterStreamSize += GetAlignmentOffset(dataParameterStreamSize, 32);
1881+
// Payload must be a multiple of a GRF register
1882+
dataParameterStreamSize += GetAlignmentOffset(dataParameterStreamSize, CPlatform(m_Platform).getGRFSize());
18841883

18851884
if( retValue.Success )
18861885
{

visa/Optimizer.cpp

Lines changed: 50 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -7801,7 +7801,35 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
78017801
{
78027802
return;
78037803
}
7804-
7804+
// indirect data address is at r0.0[5:31]
7805+
// local thread id is at r0.2[0:7]
7806+
// use r127 as the header for each oword load
7807+
uint32_t startGRF =
7808+
kernel.getOptions()->getuInt32Option(vISA_loadThreadPayloadStartReg);
7809+
uint32_t inputEnd = 32;
7810+
uint32_t inputCount = kernel.fg.builder->getInputCount();
7811+
for (unsigned int id = 0; id < inputCount; id++)
7812+
{
7813+
input_info_t* input_info = kernel.fg.builder->getInputArg(id);
7814+
// skip pseudo input for register bindings.
7815+
if (input_info->isPseudoInput())
7816+
{
7817+
continue;
7818+
}
7819+
if (kernel.fg.builder->getFCPatchInfo()->getIsEntryKernel())
7820+
{
7821+
vISA::G4_Declare* dcl = input_info->dcl;
7822+
if (INPUT_GENERAL == input_info->getInputClass() && !(dcl->isLiveIn()))
7823+
{
7824+
break;
7825+
}
7826+
}
7827+
if (inputEnd < (unsigned)(input_info->size + input_info->offset))
7828+
{
7829+
inputEnd = input_info->size + input_info->offset;
7830+
}
7831+
}
7832+
int numGRF = ((inputEnd + getGRFSize() - 1) / getGRFSize()) - startGRF;
78057833
std::vector<G4_INST*> instBuffer;
78067834

78077835
G4_Declare* r0 = builder.createHardwiredDeclare(8, Type_UD, 0, 0);
@@ -7934,56 +7962,6 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
79347962
int addrSubreg = 2;
79357963

79367964
G4_BB* perThreadBB = nullptr;
7937-
7938-
// Calculate the payload size:
7939-
// indirect data address is at r0.0[5:31]
7940-
// local thread id is at r0.2[0:7]
7941-
// use r127 as the header for each oword load
7942-
uint32_t startGRF =
7943-
kernel.getOptions()->getuInt32Option(vISA_loadThreadPayloadStartReg);
7944-
uint32_t inputEnd = 32;
7945-
uint32_t inputCount = kernel.fg.builder->getInputCount();
7946-
for (unsigned int id = 0; id < inputCount; id++)
7947-
{
7948-
input_info_t* input_info = kernel.fg.builder->getInputArg(id);
7949-
// skip pseudo input for register bindings.
7950-
if (input_info->isPseudoInput())
7951-
{
7952-
continue;
7953-
}
7954-
if (kernel.fg.builder->getFCPatchInfo()->getIsEntryKernel())
7955-
{
7956-
vISA::G4_Declare* dcl = input_info->dcl;
7957-
if (INPUT_GENERAL == input_info->getInputClass() && !(dcl->isLiveIn()))
7958-
{
7959-
break;
7960-
}
7961-
}
7962-
if (inputEnd < (unsigned)(input_info->size + input_info->offset))
7963-
{
7964-
inputEnd = input_info->size + input_info->offset;
7965-
}
7966-
}
7967-
// cross-thread-payload size must be 32-bytes aligned hence the entire payload must be 32-bytes aligned too
7968-
// GRF size must be 32-bytes aligned so align inputEnd to 32-bytes satisfies it
7969-
inputEnd = (inputEnd % 32) ? inputEnd + 32 - inputEnd % 32 : inputEnd;
7970-
uint32_t payloadSizeByte = inputEnd < startGRF * getGRFSize() ? 0 : inputEnd - startGRF * getGRFSize();
7971-
int PTIS = kernel.getInt32KernelAttr(Attributes::ATTR_PerThreadInputSize);
7972-
int CTIS = kernel.getInt32KernelAttr(Attributes::ATTR_CrossThreadInputSize);
7973-
// per-thread-data size must be GRF-size aligned,
7974-
uint32_t numPerThreadGRF = PTIS / numEltPerGRF<Type_UB>();
7975-
// cross-thread-data size must be 32-bytes aligned
7976-
uint32_t numCrossThreadDW = (CTIS < 0) ?
7977-
(payloadSizeByte - numPerThreadGRF * getGRFSize()) / TypeSize(Type_UD) :
7978-
CTIS % 32 ? CTIS + 32 - CTIS % 32 : CTIS;
7979-
7980-
if (useInlineData)
7981-
{
7982-
// Skip the 1st GRF
7983-
numCrossThreadDW = numCrossThreadDW >= numEltPerGRF<Type_UD>() ?
7984-
numCrossThreadDW - numEltPerGRF<Type_UD>() : 0;
7985-
}
7986-
79877965
// Load per-thread data, if any. Per-thread data always start from r1
79887966
// this is a fixed size 8 inst (nop padded as necessary), which may be skipped
79897967
// by runtime if the local_id are auto-generated by HW.
@@ -7992,6 +7970,15 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
79927970
// forward start label is 64B aligned.
79937971
if (builder.needsToLoadLocalID())
79947972
{
7973+
int PTIS = kernel.getInt32KernelAttr(Attributes::ATTR_PerThreadInputSize);
7974+
int CTIS = kernel.getInt32KernelAttr(Attributes::ATTR_CrossThreadInputSize);
7975+
uint32_t numPerThreadGRF = PTIS / numEltPerGRF<Type_UB>();
7976+
uint32_t numCrossThreadGRF = (CTIS < 0) ? numGRF - numPerThreadGRF : CTIS / numEltPerGRF<Type_UB>();
7977+
7978+
if (useInlineData)
7979+
{
7980+
numCrossThreadGRF--;
7981+
}
79957982
instBuffer.push_back(getLabel("per_thread_prolog"));
79967983

79977984
// compute per-thread starting address (r127.2)
@@ -8020,7 +8007,7 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
80208007
// create a relocation for cross_thread_size (per_thread_payload_offset). In case of the
80218008
// cross_thread_size is changed after compilation (e.g. gtpin inserted argument), the relocation
80228009
// need to be resolved to the new cross_thread_size.
8023-
G4_Operand* addSrc1 = builder.createRelocImm(numCrossThreadDW * TypeSize(Type_UD) , Type_UW);
8010+
G4_Operand* addSrc1 = builder.createRelocImm(numCrossThreadGRF * numEltPerGRF<Type_UB>(), Type_UW);
80248011
auto addDst = builder.createDst(rtail->getRegVar(), 0, 2, 1, Type_UD);
80258012
// instruction has relocation must not be compacted
80268013
auto addInst = builder.createBinOp(G4_add, g4::SIMD1, addDst, addSrc0,
@@ -8098,15 +8085,21 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
80988085
// and it will be either at R1 (if local id is not auto-generated) or
80998086
// R1 + sizeof(local id) (if local id is auto-generated).
81008087
{
8088+
int PTIS = kernel.getInt32KernelAttr(Attributes::ATTR_PerThreadInputSize);
8089+
int CTIS = kernel.getInt32KernelAttr(Attributes::ATTR_CrossThreadInputSize);
8090+
8091+
uint32_t numPerThreadGRF = PTIS / numEltPerGRF<Type_UB>();
8092+
uint32_t numCrossThreadGRF = (CTIS < 0) ? numGRF - numPerThreadGRF : CTIS / numEltPerGRF<Type_UB>();
81018093
uint32_t crossThreadStart = startGRF + numPerThreadGRF;
8102-
// first GRF of cross-thread data is already loaded
8094+
81038095
if (useInlineData)
8096+
{
8097+
// first GRF of cross-thread data is already loaded
81048098
crossThreadStart++;
8105-
8099+
numCrossThreadGRF--;
8100+
}
81068101
{
8107-
// GRF size is 32-bytes in this case so numCrossThreadDW must be GRF size aligned
8108-
assert(!(numCrossThreadDW % numEltPerGRF<Type_UD>()));
8109-
loadFromMemory(rtail, crossThreadStart, numCrossThreadDW / numEltPerGRF<Type_UD>());
8102+
loadFromMemory(rtail, crossThreadStart, numCrossThreadGRF);
81108103
}
81118104
}
81128105

0 commit comments

Comments
 (0)