35 changes: 30 additions & 5 deletions src/plugins/intel_gpu/src/runtime/memory_pool.cpp
@@ -150,6 +150,18 @@ void memory_pool::release_memory(memory* mem, const size_t& unique_id, primitive
#endif
}

static int get_feature_block_size(const cldnn::format& fmt) {
const auto& order = cldnn::format::internal_order(fmt);
int f_bs = 1;
for (const auto& [dim, bs] : cldnn::format::block_sizes(fmt)) {
if (dim < order.size() && order[dim] == 'f') {
f_bs = static_cast<int>(bs);
break;
}
}
return f_bs;
}

memory::ptr memory_pool::get_from_non_padded_pool(const layout& layout,
const primitive_id& prim_id,
size_t unique_id,
@@ -159,15 +171,22 @@ memory::ptr memory_pool::get_from_non_padded_pool(const layout& layout,
bool reset,
bool is_dynamic) {
const auto layout_bytes_count = layout.bytes_count();
const int f_block_size = get_feature_block_size(layout.format);
auto it = _non_padded_pool.lower_bound(layout_bytes_count);
while (it != _non_padded_pool.end()) {
if ((!is_dynamic || (layout_bytes_count > it->second._memory->get_layout().bytes_count() * _mem_pool_util_threshold)) &&
const auto& mem_layout = it->second._memory->get_layout();
if ((!is_dynamic || (layout_bytes_count > mem_layout.bytes_count() * _mem_pool_util_threshold)) &&
(it->second._network_id == network_id &&
it->second._type == type &&
it->second._memory->get_layout().format != format::fs_b_yx_fsv32 &&
mem_layout.format != format::fs_b_yx_fsv32 &&
layout.format != format::fs_b_yx_fsv32 &&
((layout.format != format::b_fs_yx_fsv32 && layout.format != format::b_fs_zyx_fsv32) ||
(layout.feature() % 32 == 0)) &&
#ifdef ENABLE_ONEDNN_FOR_GPU
(!format::is_blocked(layout.format) || layout.feature() % f_block_size == 0 ||
(mem_layout.format == layout.format &&
mem_layout.feature() % f_block_size == layout.feature() % f_block_size)) &&
#endif // ENABLE_ONEDNN_FOR_GPU
!has_conflict(it->second._users, restrictions))) {
it->second._users.insert(memory_user(MEM_USER(unique_id, network_id, prim_id, layout_bytes_count)));
auto ret_mem = _engine->reinterpret_buffer(*it->second._memory, layout);
@@ -202,17 +221,23 @@ memory::ptr memory_pool::get_from_padded_pool(const layout& layout,
uint32_t network_id,
const memory_restricter<uint32_t>& restrictions,
allocation_type type) {
const int f_block_size = get_feature_block_size(layout.format);
auto first_level_cache = _padded_pool.find(layout);
if (first_level_cache != _padded_pool.end()) {
for (auto& rec_list : first_level_cache->second) {
const auto& mem_layout = rec_list._memory->get_layout();
if (rec_list._network_id == network_id &&
rec_list._type == type &&
((layout.format != format::b_fs_yx_fsv32 && layout.format != format::b_fs_zyx_fsv32) ||
(layout.feature() % 32 == 0)) &&
#ifdef ENABLE_ONEDNN_FOR_GPU
(!format::is_blocked(layout.format) || layout.feature() % f_block_size == 0 ||
mem_layout.feature() % f_block_size == layout.feature() % f_block_size) &&
#endif // ENABLE_ONEDNN_FOR_GPU
// TODO: check if this condition always correct
layout.feature() <= rec_list._memory->get_layout().feature() &&
layout.batch() <= rec_list._memory->get_layout().batch() &&
rec_list._memory->get_layout().format != format::fs_b_yx_fsv32 &&
layout.feature() <= mem_layout.feature() &&
layout.batch() <= mem_layout.batch() &&
mem_layout.format != format::fs_b_yx_fsv32 &&
layout.format != format::fs_b_yx_fsv32 &&
!has_conflict(rec_list._users, restrictions)) {
auto ret_mem = _engine->reinterpret_buffer(*(rec_list._memory), layout);
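Reviewer note, not part of the diff: get_feature_block_size() walks the format's block descriptors and returns the block size along the feature ('f') axis, or 1 for unblocked formats. Both pool lookups now use it so that a blocked layout whose feature count is not a multiple of the block size only reuses pooled memory that leaves the same tail in its last feature block (and, for the non-padded pool, has the same format). A minimal standalone sketch of that condition, with plain integers standing in for cldnn::layout and illustrative names only:

#include <iostream>

// Sketch of the reuse guard: may a blocked layout with req_feature features
// be mapped onto pooled memory that was laid out for pool_feature features?
bool may_reuse_blocked(int req_feature, int pool_feature, int f_block_size, bool same_format) {
    if (f_block_size <= 1)                 // format is not blocked along 'f'
        return true;
    if (req_feature % f_block_size == 0)   // request fills whole blocks, no tail
        return true;
    // Otherwise both layouts must leave the same remainder in the last block.
    return same_format && (pool_feature % f_block_size == req_feature % f_block_size);
}

int main() {
    const int fsv = 16;  // feature block size of b_fs_yx_fsv16
    std::cout << may_reuse_blocked(88, 97, fsv, true) << '\n';   // 0: residues 8 vs 1 differ
    std::cout << may_reuse_blocked(88, 104, fsv, true) << '\n';  // 1: both leave a tail of 8
    std::cout << may_reuse_blocked(96, 97, fsv, true) << '\n';   // 1: 96 is block-aligned
}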
57 changes: 57 additions & 0 deletions src/plugins/intel_gpu/tests/unit/test_cases/memory_test.cpp
@@ -642,6 +642,57 @@ class memory_pool: public ::testing::Test {
std::static_pointer_cast<fully_connected_inst>(network.get_primitive("relu2"))->output_memory_ptr()->buffer_ptr());
}
}

#ifdef ENABLE_ONEDNN_FOR_GPU
void test_static_reuse_unaligned_feature() {
auto& engine = get_test_engine();
if (!engine.get_device_info().supports_immad)
return;

const int32_t x = 64;
const int32_t y = 64;
const int32_t f_can = 97;
const int32_t f_req = 88;

auto l_in = layout{ ov::PartialShape{1, f_can, y, x}, data_types::f16, format::bfyx };
auto l_can_blk = layout{ ov::PartialShape{1, f_can, y, x}, data_types::f16, format::b_fs_yx_fsv16 };
auto l_can_pln = layout{ ov::PartialShape{1, f_can, y, x}, data_types::f16, format::bfyx };
auto l_req_blk = layout{ ov::PartialShape{1, f_req, y, x}, data_types::f16, format::b_fs_yx_fsv16 };
auto m_in = engine.allocate_memory(l_in);

topology topology(
input_layout("input", l_in),
reorder("can_a", input_info("input"), l_can_blk),
reorder("can_b", input_info("input"), l_can_blk),
eltwise("reuse_can", { input_info("can_a"), input_info("can_b") }, eltwise_mode::sum),
reorder("can_planar", input_info("reuse_can"), l_can_pln),
activation("can_consume", input_info("can_planar"), activation_func::relu),
crop("req_crop", input_info("can_consume"),{ 1, f_req, x, y }, { 0, 0, 0, 0 }),
reorder("req_a", input_info("req_crop"), l_req_blk),
reorder("req_b", input_info("req_crop"), l_req_blk),
eltwise("reuse_req", { input_info("req_a"), input_info("req_b") }, eltwise_mode::sum),
activation("req_sink", input_info("reuse_req"), activation_func::relu)
);

ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::optimize_data(true));

network network(engine, topology, config);
network.set_input_data("input", m_in);
network.execute();

auto m_can = network.get_primitive("reuse_can")->output_memory_ptr();
auto m_req = network.get_primitive("reuse_req")->output_memory_ptr();

ASSERT_NE(m_can, nullptr);
ASSERT_NE(m_req, nullptr);
ASSERT_NE(m_can->buffer_ptr(), nullptr);
ASSERT_NE(m_req->buffer_ptr(), nullptr);

EXPECT_NE(m_can->buffer_ptr(), m_req->buffer_ptr());
}
#endif // ENABLE_ONEDNN_FOR_GPU

};

TEST_F(memory_pool, basic_non_padded_relu_pipe) {
@@ -692,6 +743,12 @@ TEST_F(memory_pool, dynamic_mem_reuse_for_null_sel_impl) {
this->test_dynamic_mem_reuse_for_null_sel_impl();
}

#ifdef ENABLE_ONEDNN_FOR_GPU
TEST_F(memory_pool, test_static_reuse_unaligned_feature) {
this->test_static_reuse_unaligned_feature();
}
#endif

#ifdef RUN_ALL_MODEL_CACHING_TESTS
TEST_F(memory_pool, basic_non_padded_relu_pipe_cached) {
this->test_basic_non_padded_relu_pipe(true);
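Reviewer note on the constants in test_static_reuse_unaligned_feature (my reading, not text from the PR): both reused tensors are b_fs_yx_fsv16, so their feature dimension is padded up to a multiple of 16. The reuse_can output (97 features, padded to 112) is large enough byte-wise to be handed back out for the reuse_req layout (88 features, padded to 96), but the two layouts occupy their last feature block differently (residues 1 vs 8), which is exactly the case the new guard rejects; the final EXPECT_NE asserts the pool keeps them in separate buffers. The arithmetic, as a standalone check:

#include <iostream>

int main() {
    const int fsv = 16;                       // feature block of b_fs_yx_fsv16
    const int f_can = 97, f_req = 88;
    std::cout << f_can % fsv << '\n';                      // 1  -> unaligned
    std::cout << f_req % fsv << '\n';                      // 8  -> unaligned, different residue
    std::cout << ((f_can + fsv - 1) / fsv) * fsv << '\n';  // 112 padded features for f_can
    std::cout << ((f_req + fsv - 1) / fsv) * fsv << '\n';  // 96  padded features for f_req
}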