
Commit 3b6410a

[CPU] FullyConnected acceleration with u2 weights decompression (#31467)
### Details:
- *FullyConnected acceleration with u2 weights decompression.*
- *OneDNN PR: openvinotoolkit/oneDNN#289*

### Tickets:
- *[CVS-169357](https://jira.devtools.intel.com/browse/CVS-169357)*
1 parent 69b0a7c commit 3b6410a

File tree

14 files changed: +175 −25 lines changed


src/plugins/intel_cpu/src/dnnl_extension_utils.cpp

Lines changed: 4 additions & 0 deletions
@@ -90,6 +90,8 @@ std::optional<dnnl::memory::data_type> DnnlExtensionUtils::ElementTypeToDataType
         return memory::data_type::s4;
     case ov::element::u4:
         return memory::data_type::u4;
+    case ov::element::u2:
+        return memory::data_type::u2;
     case ov::element::f8e8m0:
         return memory::data_type::e8m0;
     case ov::element::f8e4m3:
@@ -137,6 +139,8 @@ ov::element::Type DnnlExtensionUtils::DataTypeToElementType(const dnnl::memory::
         return ov::element::i4;
     case memory::data_type::u4:
         return ov::element::u4;
+    case memory::data_type::u2:
+        return ov::element::u2;
     case memory::data_type::e8m0:
         return ov::element::f8e8m0;
     case memory::data_type::f8_e4m3:

src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp

Lines changed: 35 additions & 0 deletions
@@ -845,6 +845,36 @@ struct ConvertFromBinPrecision<std::tuple<src_t, dst_t>> {
     }
 };
 
+#define INTEL_CPU_CVT_FROM_2BIT_LIST                                                                  \
+    INTEL_CPU_CVT(u2, f32), INTEL_CPU_CVT(u2, f16), INTEL_CPU_CVT(u2, bf16), INTEL_CPU_CVT(u2, i32), \
+        INTEL_CPU_CVT(u2, u8), INTEL_CPU_CVT(u2, i8)
+
+struct ConvertFrom2BitContext {
+    const void* srcPtr;
+    void* dstPtr;
+    size_t size;
+    bool converted;
+};
+
+template <typename T>
+struct ConvertFrom2BitPrecision;
+
+[[maybe_unused]] static uint8_t get_u2(uint8_t val, uint8_t shift) {
+    return static_cast<uint8_t>((val & (0x3 << shift)) >> shift);
+}
+
+template <typename src_t, typename dst_t>
+struct ConvertFrom2BitPrecision<std::tuple<src_t, dst_t>> {
+    void operator()(ConvertFrom2BitContext& ctx) {
+        const auto* src = static_cast<const uint8_t*>(ctx.srcPtr);
+        auto dst = static_cast<dst_t*>(ctx.dstPtr);
+        parallel_for(ctx.size, [&](size_t i) {
+            dst[i] = static_cast<dst_t>(get_u2(src[i / 4], (i % 4) * 2));
+        });
+        ctx.converted = true;
+    }
+};
+
 #define INTEL_CPU_CVT_FROM_4BIT_LIST                                                                  \
     INTEL_CPU_CVT(u4, f32), INTEL_CPU_CVT(u4, i32), INTEL_CPU_CVT(u4, bf16), INTEL_CPU_CVT(u4, f16), \
         INTEL_CPU_CVT(u4, i8), INTEL_CPU_CVT(u4, u8), INTEL_CPU_CVT(i4, f32), INTEL_CPU_CVT(i4, i32), \
@@ -1069,6 +1099,10 @@ void cpu_convert(const void* srcPtr,
                    srcPrc.bitwidth(),
                    "> precision to: ",
                    dstPrc);
+    } else if (srcPrc == ov::element::u2) {
+        ConvertFrom2BitContext ctx{srcPtr, dstPtr, size, false};
+        OV_SWITCH(intel_cpu, ConvertFrom2BitPrecision, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FROM_2BIT_LIST);
+        OPENVINO_ASSERT(ctx.converted, "cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc);
     } else if (srcPrc.bitwidth() == 4U) {
         ConvertFrom4BitContext ctx{srcPrc, srcPtr, dstPtr, size, false};
         OV_SWITCH(intel_cpu, ConvertFrom4BitPrecision, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FROM_4BIT_LIST);
@@ -1115,6 +1149,7 @@ bool is_supported_convert([[maybe_unused]] ov::element::Type srcPrc, [[maybe_unu
     isSupportedContext ctx;
     OV_SWITCH(intel_cpu, isSupported, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_LIST);
     OV_SWITCH(intel_cpu, isSupported, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FROM_BIN_LIST);
+    OV_SWITCH(intel_cpu, isSupported, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FROM_2BIT_LIST);
     OV_SWITCH(intel_cpu, isSupported, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FROM_4BIT_LIST);
     OV_SWITCH(intel_cpu, isSupported, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FROM_BYTE_FP_LIST);
     OV_SWITCH(intel_cpu, isSupported, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_TO_4BIT_LIST);
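The new ConvertFrom2BitPrecision specialization above treats each source byte as four 2-bit lanes, LSB first, and widens every lane to the destination type. Below is a minimal standalone sketch of that unpacking scheme, assuming LSB-first packing as in get_u2; the plain loop and the unpack_u2_to_f32 helper name are stand-ins for the parallel_for and OV_SWITCH dispatch used in cpu_convert.cpp.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Same arithmetic as get_u2 above: mask out one 2-bit field and shift it down.
static uint8_t get_u2(uint8_t val, uint8_t shift) {
    return static_cast<uint8_t>((val & (0x3 << shift)) >> shift);
}

// Unpack `size` u2 elements (four per byte, LSB-first) into float.
static void unpack_u2_to_f32(const uint8_t* src, float* dst, std::size_t size) {
    for (std::size_t i = 0; i < size; ++i) {
        dst[i] = static_cast<float>(get_u2(src[i / 4], (i % 4) * 2));
    }
}

int main() {
    const uint8_t packed[] = {0xE4};  // 0b11100100 packs {0, 1, 2, 3} at shifts 0, 2, 4, 6
    std::vector<float> out(4);
    unpack_u2_to_f32(packed, out.data(), out.size());
    for (float v : out) {
        std::printf("%g ", v);  // prints: 0 1 2 3
    }
    std::printf("\n");
    return 0;
}
```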

src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp

Lines changed: 10 additions & 3 deletions
@@ -138,7 +138,7 @@ bool DnnlFCPrimitive::useWeightsDecompressionImpl(const ov::element::Type inputT
                                                   const ov::element::Type weightsType,
                                                   const ov::intel_cpu::Config::ModelType modelType) {
     if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2)) {
-        if (any_of(inputType, f32, bf16) && any_of(weightsType, u8, i8, nf4, u4, i4, f4e2m1)) {
+        if (any_of(inputType, f32, bf16) && any_of(weightsType, u8, i8, nf4, u4, i4, f4e2m1, u2)) {
             return true;
         }
 
@@ -176,11 +176,15 @@ static bool useDynamicQuantizationImpl(size_t dqGroupSize,
     // For dynamic quantization, VNNI accumulation requires weight to be unsigned.
     // To support dynamic quantization with weights symmetrically quantized as i8/i4
     // w/o zero-point, we will transform weight to u8/u4 weight with zp 128/8.
-    if (none_of(weightsDesc->getPrecision(), ov::element::u8, ov::element::u4) &&
+    if (none_of(weightsDesc->getPrecision(), ov::element::u8, ov::element::u4, ov::element::u2) &&
         !((any_of(weightsDesc->getPrecision(), ov::element::i8, ov::element::i4) && !zpPtr))) {
         return false;
     }
-    if (zpPtr && none_of(zpPtr->getDesc().getPrecision(), ov::element::u8, ov::element::u4, ov::element::dynamic)) {
+    if (zpPtr && none_of(zpPtr->getDesc().getPrecision(),
+                         ov::element::u8,
+                         ov::element::u4,
+                         ov::element::u2,
+                         ov::element::dynamic)) {
         return false;
     }
 
@@ -255,6 +259,9 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const FCAttrs& attrs,
 
     if (auto it = memory.find(ARG_WEI | ARG_ATTR_ZERO_POINTS); it != memory.end()) {
         auto dstPrc = useDynamicQuantization ? ov::element::u8 : ov::element::f32;
+        if (weiDesc->getPrecision() == ov::element::u2) {
+            dstPrc = ov::element::u2;
+        }
         dnnlpoc.appendDecompressionZeroPointsLegacy(it->second, !attrs.weightsNonTransposed, dstPrc);
     }
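The comment in useDynamicQuantizationImpl spells out the constraint that motivates this precision check: VNNI accumulation needs unsigned weights, so symmetrically quantized signed weights are re-expressed with an explicit zero-point (i8 with zp 128, i4 with zp 8). A small arithmetic sketch of that shift follows; it is illustrative only, not the plugin's actual weight transformation.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Symmetric i8 weight (no zero-point) re-expressed as u8 with zero-point 128.
    const float scale = 0.05f;
    const int8_t w_i8 = -37;
    const uint8_t w_u8 = static_cast<uint8_t>(w_i8 + 128);  // shifted storage: 91
    const int zp = 128;

    // Dequantization is unchanged, so unsigned (VNNI-friendly) accumulation can be used.
    const float deq_signed = static_cast<float>(w_i8) * scale;
    const float deq_unsigned = static_cast<float>(w_u8 - zp) * scale;
    std::printf("%f == %f\n", deq_signed, deq_unsigned);  // both -1.85
    return 0;
}
```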

src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp

Lines changed: 2 additions & 2 deletions
@@ -95,10 +95,10 @@ static const TypeMapping dnnlFCTypeMapping {
     {{_u8 | _i8, _i8, _f16, _u8 | _i8 | _i32 | _bf16 | _f32}, {bypass(), bypass(), just<f32>(), bypass()}},
     {{_u8 | _i8, _i8, _any, _any}, {bypass(), bypass(), just<f32>(), just<f32>()}},
     // compresses int weights (@todo more strict requrements for output precision?)
-    {{_bf16, _u8 | _i8 | _nf4 | _u4 | _i4 | _f4e2m1, _any, _any}, {bypass(), bypass(), use<0>(), use<0>()},
+    {{_bf16, _u8 | _i8 | _nf4 | _u4 | _i4 | _f4e2m1 | _u2, _any, _any}, {bypass(), bypass(), use<0>(), use<0>()},
      Require<dnnl::impl::cpu::x64::avx512_core_bf16>()}, // Ticket 122347
     {{_bf16, _u8 | _i8 | _nf4 | _u4 | _i4 | _f4e2m1, _any, _any}, {just<f32>(), bypass(), just<f32>(), just<f32>()}},
-    {{_f32, _u8 | _i8 | _nf4 | _u4 | _i4 | _f4e2m1, _any, _any}, {bypass(), bypass(), use<0>(), use<0>()}},
+    {{_f32, _u8 | _i8 | _nf4 | _u4 | _i4 | _f4e2m1 | _u2, _any, _any}, {bypass(), bypass(), use<0>(), use<0>()}},
     // @todo should we fallback to FPXX instead of _f32?
     {{_any, _any, _any, _any}, {just<f32>(), just<f32>(), just<f32>(), just<f32>()}},
     // @todo explicitly cover configuration limitations for oneDNN on ARM

src/plugins/intel_cpu/src/nodes/executors/type_mask.hpp

Lines changed: 3 additions & 0 deletions
@@ -35,6 +35,7 @@ struct TypeMask {
         _string = 1 << 20,
         _f4e2m1 = 1 << 21,
         _f8e8m0 = 1 << 22,
+        _u2 = 1 << 23,
     };
 
     explicit TypeMask(const ov::element::Type precision) : value(generateMask(precision)), precision(precision) {}
@@ -82,6 +83,7 @@ struct TypeMask {
             CASE(string)
             CASE(f4e2m1)
             CASE(f8e8m0)
+            CASE(u2)
         default:
             return _dynamic;
         }
@@ -116,6 +118,7 @@ DEFINE_TYPE_ALIAS(_f8e5m2);
 DEFINE_TYPE_ALIAS(_string);
 DEFINE_TYPE_ALIAS(_f4e2m1);
 DEFINE_TYPE_ALIAS(_f8e8m0);
+DEFINE_TYPE_ALIAS(_u2);
 constexpr auto _any_float = _f64 | _f32 | _f16 | _bf16;
 constexpr auto _hw_float = _f32 | _f16 | _bf16;
 constexpr auto _half_float = _f16 | _bf16;
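Each TypeMask entry such as _u2 = 1 << 23 gives a precision its own bit, so the OR-ed masks used in tables like dnnlFCTypeMapping stay single integers and membership checks are a single AND. A simplified sketch of that idea, with made-up bit positions rather than the real TypeMask layout:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Simplified stand-in for TypeMask: each precision owns one bit, so a set of
    // precisions is the bitwise OR of its members. Bit positions are illustrative.
    constexpr uint64_t _u8     = 1ULL << 0;
    constexpr uint64_t _i8     = 1ULL << 1;
    constexpr uint64_t _u4     = 1ULL << 2;
    constexpr uint64_t _i4     = 1ULL << 3;
    constexpr uint64_t _nf4    = 1ULL << 4;
    constexpr uint64_t _f4e2m1 = 1ULL << 5;
    constexpr uint64_t _u2     = 1ULL << 23;  // the alias this commit adds

    // A mapping row like "_u8 | _i8 | _nf4 | _u4 | _i4 | _f4e2m1 | _u2" is one integer.
    constexpr uint64_t compressed_weight_types = _u8 | _i8 | _nf4 | _u4 | _i4 | _f4e2m1 | _u2;

    std::printf("u2 accepted: %s\n", (compressed_weight_types & _u2) ? "yes" : "no");
    return 0;
}
```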

src/plugins/intel_cpu/src/nodes/fullyconnected.cpp

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@ ov::element::TypeVector FullyConnected::getSupportedCompressedWeightsTypes([[may
     }
 #if defined(OPENVINO_ARCH_X86_64)
     ov::element::TypeVector supportedDataTypes =
-        {Type_t::u8, Type_t::i8, Type_t::u4, Type_t::i4, Type_t::nf4, Type_t::f4e2m1};
+        {Type_t::u8, Type_t::i8, Type_t::u4, Type_t::i4, Type_t::nf4, Type_t::f4e2m1, Type_t::u2};
     if (apply_fp8) {
         supportedDataTypes.insert(supportedDataTypes.end(), {Type_t::f8e4m3, Type_t::f8e5m2});
     }

src/plugins/intel_cpu/src/plugin.cpp

Lines changed: 8 additions & 8 deletions
@@ -318,14 +318,14 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<
     for (const auto& ii : model->inputs()) {
         auto input_precision = ii.get_element_type();
         static const std::set<ov::element::Type_t> supported_precisions = {
-            ov::element::Type_t::u4,   ov::element::Type_t::i4,      ov::element::Type_t::u8,
-            ov::element::Type_t::i8,   ov::element::Type_t::f8e4m3,  ov::element::Type_t::f8e5m2,
-            ov::element::Type_t::u16,  ov::element::Type_t::i16,     ov::element::Type_t::u32,
-            ov::element::Type_t::i32,  ov::element::Type_t::u64,     ov::element::Type_t::i64,
-            ov::element::Type_t::bf16, ov::element::Type_t::f16,     ov::element::Type_t::f32,
-            ov::element::Type_t::f64,  ov::element::Type_t::boolean, ov::element::Type_t::string,
-            ov::element::Type_t::nf4,  ov::element::Type_t::f4e2m1,  ov::element::Type_t::f8e8m0,
-            ov::element::Type_t::dynamic};
+            ov::element::Type_t::u4,   ov::element::Type_t::i4,      ov::element::Type_t::u8,
+            ov::element::Type_t::i8,   ov::element::Type_t::f8e4m3,  ov::element::Type_t::f8e5m2,
+            ov::element::Type_t::u16,  ov::element::Type_t::i16,     ov::element::Type_t::u32,
+            ov::element::Type_t::i32,  ov::element::Type_t::u64,     ov::element::Type_t::i64,
+            ov::element::Type_t::bf16, ov::element::Type_t::f16,     ov::element::Type_t::f32,
+            ov::element::Type_t::f64,  ov::element::Type_t::boolean, ov::element::Type_t::string,
+            ov::element::Type_t::nf4,  ov::element::Type_t::f4e2m1,  ov::element::Type_t::f8e8m0,
+            ov::element::Type_t::u2,   ov::element::Type_t::dynamic};
 
         if (supported_precisions.find(input_precision) == supported_precisions.end()) {
             OPENVINO_THROW_NOT_IMPLEMENTED("CPU plugin: Input image format ",

src/plugins/intel_cpu/src/utils/plain_tensor.hpp

Lines changed: 12 additions & 1 deletion
@@ -405,6 +405,9 @@ struct PlainTensor {
         if (any_of(m_dt, ov::element::i4, ov::element::u4)) {
             return 2;
         }
+        if (m_dt == ov::element::u2) {
+            return 4;
+        }
         return 1;
     }
 
@@ -423,7 +426,15 @@
 
     template <typename DT, ov::element::Type_t SRC_PREC = ov::element::u8, typename... Is>
     [[nodiscard]] DT* ptr(Is... indices) const {
-        constexpr size_t stride_div = SRC_PREC == ov::element::u4 ? 2 : 1;
+        constexpr size_t stride_div = [] {
+            if (SRC_PREC == ov::element::u2) {
+                return 4;
+            }
+            if (SRC_PREC == ov::element::u4) {
+                return 2;
+            }
+            return 1;
+        }();
         const size_t off = offset<0>(indices...) / stride_div;
         return reinterpret_cast<DT*>(m_ptr.get()) + off;
     }
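The PlainTensor changes generalize the element-index-to-byte mapping for packed sub-byte types: u4 stores two elements per byte and u2 stores four, so the element offset is divided by 2 or 4 (the stride_div above). A short illustrative sketch with a hypothetical byte_offset helper, not part of PlainTensor:

```cpp
#include <cstddef>
#include <cstdio>

// Maps an element index of a packed sub-byte tensor to the byte that holds it,
// mirroring the stride_div logic added to PlainTensor::ptr.
// `bits` is 2 for u2, 4 for u4, 8 for u8.
static std::size_t byte_offset(std::size_t element_index, std::size_t bits) {
    const std::size_t elems_per_byte = 8 / bits;  // 4 for u2, 2 for u4, 1 for u8
    return element_index / elems_per_byte;
}

int main() {
    // Element 10 of a u2 tensor lives in byte 2, of a u4 tensor in byte 5, of u8 in byte 10.
    std::printf("u2: byte %zu, u4: byte %zu, u8: byte %zu\n",
                byte_offset(10, 2), byte_offset(10, 4), byte_offset(10, 8));
    return 0;
}
```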

src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.cpp

Lines changed: 1 addition & 1 deletion
@@ -96,7 +96,7 @@ void ConvertCPULayerTest::SetUp() {
     if (primitive.empty())
         primitive = getPrimitiveType();
 #if defined(OPENVINO_ARCH_ARM64)
-    if (inPrc == ov::element::u4 || inPrc == ov::element::i4 ||
+    if (inPrc == ov::element::u2 || inPrc == ov::element::u4 || inPrc == ov::element::i4 ||
         inPrc == ov::element::f4e2m1 || inPrc == ov::element::f8e8m0 ||
         inPrc == ov::element::f8e4m3 || inPrc == ov::element::f8e5m2 ||
         outPrc == ov::element::f8e4m3 || outPrc == ov::element::f8e5m2 ||

src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/conversion.cpp

Lines changed: 18 additions & 0 deletions
@@ -91,6 +91,24 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_from_f8e8m0, ConvertCPULayerT
                              ::testing::Values(CPUSpecificParams({nchw}, {nchw}, {}, {"ref"}))),
                          ConvertCPULayerTest::getTestCaseName);
 
+const std::vector<ov::element::Type> common_precisions = {
+    ov::element::f32,
+    ov::element::i32,
+    ov::element::f16,
+    ov::element::bf16,
+    ov::element::u8,
+    ov::element::i8,
+};
+
+INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_from_u2, ConvertCPULayerTest,
+                         ::testing::Combine(
+                                 ::testing::ValuesIn(inShapes_4D_dynamic()),
+                                 ::testing::Values(ov::element::u2),
+                                 ::testing::ValuesIn(common_precisions),
+                                 ::testing::Values(ov::test::SpecialValue::none),
+                                 ::testing::Values(CPUSpecificParams({}, {}, {}, {"ref"}))),
+                         ConvertCPULayerTest::getTestCaseName);
+
 const std::vector<ov::element::Type> f8_precisions = {
     ov::element::f8e4m3,
     ov::element::f8e5m2,
