Skip to content

Commit 11ba272

Browse files
committed
Fix dyn_b kernel index detection in deserialization path
Fix two bugs in the non-dyn-quantize path: 1. GetUpdateDispatchDataFunc: dyn_b was misidentified as SLM because the `!kd.internalBuffers.empty()` check is false in the non-dyn-quantize path. It now checks for INTERNAL_BUFFER args on the default_fc kernel to distinguish the dyn-quantize path from the non-dyn-quantize path. 2. Build time: restrict adding dyn_b to cases where SLM is also present, so that deserialization can always distinguish the kernel layout (3-kernel: default|slm|dyn_b).
1 parent 2361b17 commit 11ba272

File tree

1 file changed

+44
-15
lines changed

1 file changed

+44
-15
lines changed

src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp

Lines changed: 44 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -961,8 +961,12 @@ void FullyConnected_bf_tiled::SetDispatchDataFunc(KernelData& kd, const Dispatch
961961

962962
// Reconstruct DispatchIndices for deserialization path (load()).
963963
// At build time, GetMultiKernelsData returns indices explicitly.
964-
// Layout: quantize(0) | default(1) | [optional:slm] | [optional:dyn_b]
965-
// dyn_b is identified by having no INTERNAL_BUFFER arguments (it reads the original F16 input).
964+
// Layout:
965+
// dyn-quantize path: quantize(0) | default(1) | [optional:slm] | [optional:dyn_b]
966+
// non-dyn-quantize: default(0) | [optional:slm] | [optional:dyn_b]
967+
// dyn_b is the last kernel when it has no INTERNAL_BUFFER args but the default_fc kernel does
968+
// (only possible in dyn-quantize path where default_fc uses quantized internal buffers).
969+
// In non-dyn-quantize path, no kernel has INTERNAL_BUFFER, so the first extra kernel is always SLM; dyn_b, if present, is the third and last kernel (default|slm|dyn_b).
966970
void FullyConnected_bf_tiled::GetUpdateDispatchDataFunc(KernelData& kd) const {
967971
if (kd.kernels.size() == 1) {
968972
Parent::GetUpdateDispatchDataFunc(kd);
@@ -975,20 +979,39 @@ void FullyConnected_bf_tiled::GetUpdateDispatchDataFunc(KernelData& kd) const {
975979

976980
int32_t total = static_cast<int32_t>(kd.kernels.size());
977981
if (total > idx.default_fc + 1) {
978-
int32_t last = total - 1;
979-
bool last_has_internal_buffer = false;
980-
for (const auto& arg : kd.kernels[last].params.arguments) {
982+
// Check if default_fc kernel uses INTERNAL_BUFFER (dyn-quantize path)
983+
bool default_has_internal_buffer = false;
984+
for (const auto& arg : kd.kernels[idx.default_fc].params.arguments) {
981985
if (arg.t == ArgumentDescriptor::Types::INTERNAL_BUFFER) {
982-
last_has_internal_buffer = true;
986+
default_has_internal_buffer = true;
983987
break;
984988
}
985989
}
986-
if (!last_has_internal_buffer && !kd.internalBuffers.empty()) {
987-
idx.dyn_b = last;
988-
if (last > idx.default_fc + 1)
990+
991+
if (default_has_internal_buffer) {
992+
// dyn-quantize path: last kernel without INTERNAL_BUFFER is dyn_b
993+
int32_t last = total - 1;
994+
bool last_has_internal_buffer = false;
995+
for (const auto& arg : kd.kernels[last].params.arguments) {
996+
if (arg.t == ArgumentDescriptor::Types::INTERNAL_BUFFER) {
997+
last_has_internal_buffer = true;
998+
break;
999+
}
1000+
}
1001+
if (!last_has_internal_buffer) {
1002+
idx.dyn_b = last;
1003+
if (last > idx.default_fc + 1)
1004+
idx.slm = idx.default_fc + 1;
1005+
} else {
9891006
idx.slm = idx.default_fc + 1;
1007+
}
9901008
} else {
1009+
// non-dyn-quantize path: the first extra kernel is always SLM (dyn_b never has INTERNAL_BUFFER
1010+
// and neither do other kernels, so we cannot distinguish — but dyn_b is always last
1011+
// and is only added when SLM is also present in this path)
9911012
idx.slm = idx.default_fc + 1;
1013+
if (total > idx.default_fc + 2)
1014+
idx.dyn_b = total - 1;
9921015
}
9931016
}
9941017

@@ -1076,12 +1099,18 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa
10761099
}
10771100
}
10781101

1079-
// Try to add dyn_b kernel for batch-optimal runtime dispatch
1080-
FullyConnected_bf_tiled_dyn_b dyn_b_impl;
1081-
auto dyn_b_kd = dyn_b_impl.GetKernelsData(params);
1082-
if (!dyn_b_kd.empty() && !dyn_b_kd[0].kernels.empty()) {
1083-
kernels_data[0].kernels.push_back(dyn_b_kd[0].kernels.back());
1084-
idx.dyn_b = static_cast<int32_t>(kernels_data[0].kernels.size()) - 1;
1102+
// Try to add dyn_b kernel for batch-optimal runtime dispatch.
1103+
// Only add when SLM is present — deserialization relies on default_fc having
1104+
// INTERNAL_BUFFER args (dyn-quantize path) to distinguish dyn_b from SLM.
1105+
// In non-dyn-quantize path, no kernel has INTERNAL_BUFFER, so dyn_b is only
1106+
// identifiable when it follows an SLM kernel (3-kernel layout: default|slm|dyn_b).
1107+
if (idx.slm >= 0) {
1108+
FullyConnected_bf_tiled_dyn_b dyn_b_impl;
1109+
auto dyn_b_kd = dyn_b_impl.GetKernelsData(params);
1110+
if (!dyn_b_kd.empty() && !dyn_b_kd[0].kernels.empty()) {
1111+
kernels_data[0].kernels.push_back(dyn_b_kd[0].kernels.back());
1112+
idx.dyn_b = static_cast<int32_t>(kernels_data[0].kernels.size()) - 1;
1113+
}
10851114
}
10861115

10871116
// Update default update_dispatch_data_func function

0 commit comments

Comments
 (0)