@@ -961,8 +961,12 @@ void FullyConnected_bf_tiled::SetDispatchDataFunc(KernelData& kd, const Dispatch
961961
962962// Reconstruct DispatchIndices for deserialization path (load()).
963963// At build time, GetMultiKernelsData returns indices explicitly.
964- // Layout: quantize(0) | default(1) | [optional:slm] | [optional:dyn_b]
965- // dyn_b is identified by having no INTERNAL_BUFFER arguments (it reads the original F16 input).
964+ // Layout:
965+ // dyn-quantize path: quantize(0) | default(1) | [optional:slm] | [optional:dyn_b]
966+ // non-dyn-quantize: default(0) | [optional:slm] | [optional:dyn_b]
967+ // dyn_b is the last kernel when it has no INTERNAL_BUFFER args but the default_fc kernel does
968+ // (only possible in dyn-quantize path where default_fc uses quantized internal buffers).
969+ // In non-dyn-quantize path, no kernel has INTERNAL_BUFFER, so the first extra kernel is SLM and a second extra kernel, if present, is dyn_b.
966970void FullyConnected_bf_tiled::GetUpdateDispatchDataFunc (KernelData& kd) const {
967971 if (kd.kernels .size () == 1 ) {
968972 Parent::GetUpdateDispatchDataFunc (kd);
@@ -975,20 +979,39 @@ void FullyConnected_bf_tiled::GetUpdateDispatchDataFunc(KernelData& kd) const {
975979
976980 int32_t total = static_cast <int32_t >(kd.kernels .size ());
977981 if (total > idx.default_fc + 1 ) {
978- int32_t last = total - 1 ;
979- bool last_has_internal_buffer = false ;
980- for (const auto & arg : kd.kernels [last ].params .arguments ) {
982+ // Check if default_fc kernel uses INTERNAL_BUFFER (dyn-quantize path)
983+ bool default_has_internal_buffer = false ;
984+ for (const auto & arg : kd.kernels [idx. default_fc ].params .arguments ) {
981985 if (arg.t == ArgumentDescriptor::Types::INTERNAL_BUFFER) {
982- last_has_internal_buffer = true ;
986+ default_has_internal_buffer = true ;
983987 break ;
984988 }
985989 }
986- if (!last_has_internal_buffer && !kd.internalBuffers .empty ()) {
987- idx.dyn_b = last;
988- if (last > idx.default_fc + 1 )
990+
991+ if (default_has_internal_buffer) {
992+ // dyn-quantize path: last kernel without INTERNAL_BUFFER is dyn_b
993+ int32_t last = total - 1 ;
994+ bool last_has_internal_buffer = false ;
995+ for (const auto & arg : kd.kernels [last].params .arguments ) {
996+ if (arg.t == ArgumentDescriptor::Types::INTERNAL_BUFFER) {
997+ last_has_internal_buffer = true ;
998+ break ;
999+ }
1000+ }
1001+ if (!last_has_internal_buffer) {
1002+ idx.dyn_b = last;
1003+ if (last > idx.default_fc + 1 )
1004+ idx.slm = idx.default_fc + 1 ;
1005+ } else {
9891006 idx.slm = idx.default_fc + 1 ;
1007+ }
9901008 } else {
1009+ // non-dyn-quantize path: no kernel has INTERNAL_BUFFER, so dyn_b cannot be
1010+ // identified by its arguments. The first extra kernel is SLM; dyn_b is always last
1011+ // and is only added when SLM is also present, so a second extra kernel is dyn_b.
9911012 idx.slm = idx.default_fc + 1 ;
1013+ if (total > idx.default_fc + 2 )
1014+ idx.dyn_b = total - 1 ;
9921015 }
9931016 }
9941017
@@ -1076,12 +1099,18 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa
10761099 }
10771100 }
10781101
1079- // Try to add dyn_b kernel for batch-optimal runtime dispatch
1080- FullyConnected_bf_tiled_dyn_b dyn_b_impl;
1081- auto dyn_b_kd = dyn_b_impl.GetKernelsData (params);
1082- if (!dyn_b_kd.empty () && !dyn_b_kd[0 ].kernels .empty ()) {
1083- kernels_data[0 ].kernels .push_back (dyn_b_kd[0 ].kernels .back ());
1084- idx.dyn_b = static_cast <int32_t >(kernels_data[0 ].kernels .size ()) - 1 ;
1102+ // Try to add dyn_b kernel for batch-optimal runtime dispatch.
1103+ // Only add when SLM is present — deserialization relies on default_fc having
1104+ // INTERNAL_BUFFER args (dyn-quantize path) to distinguish dyn_b from SLM.
1105+ // In non-dyn-quantize path, no kernel has INTERNAL_BUFFER, so dyn_b is only
1106+ // identifiable when it follows an SLM kernel (3-kernel layout: default|slm|dyn_b).
1107+ if (idx.slm >= 0 ) {
1108+ FullyConnected_bf_tiled_dyn_b dyn_b_impl;
1109+ auto dyn_b_kd = dyn_b_impl.GetKernelsData (params);
1110+ if (!dyn_b_kd.empty () && !dyn_b_kd[0 ].kernels .empty ()) {
1111+ kernels_data[0 ].kernels .push_back (dyn_b_kd[0 ].kernels .back ());
1112+ idx.dyn_b = static_cast <int32_t >(kernels_data[0 ].kernels .size ()) - 1 ;
1113+ }
10851114 }
10861115
10871116 // Update default update_dispatch_data_func function
0 commit comments