Skip to content

Commit c2c53b7

Browse files
committed
Bypassed the optimization pass that introduced casts to fp32; also added a faster implementation of fp16 MatMul.
1 parent d6d981e commit c2c53b7

File tree

2 files changed

+55
-15
lines changed

2 files changed

+55
-15
lines changed

onnxruntime/core/optimizer/insert_cast_transformer.cc

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -221,12 +221,12 @@ static bool IsIsolatedFp16NodeOnCpu(const onnxruntime::Node& node, onnxruntime::
221221
}
222222

223223
// Intentionally a no-op: the transform that unassigned isolated fp16 CPU nodes
// (forcing them to fp32 via inserted Casts) is bypassed, because fp16 nodes can
// now run natively on CPU (see the MLFloat16 MatMul fast path) and the extra
// Cast round-trips only added conversion overhead.
//
// @param graph                the graph that would have been transformed (unused)
// @param cpu_kernel_registry  registry used by the old isolation check (unused)
// @return Status::OK() always.
static Status ForceSingleNodeCPUFloat16ToFloat32(onnxruntime::Graph& graph, const KernelRegistry& cpu_kernel_registry) {
  // Suppress unused-parameter warnings now that the pass body is removed;
  // the signature is kept so callers are unaffected.
  static_cast<void>(graph);
  static_cast<void>(cpu_kernel_registry);
  return Status::OK();
}

onnxruntime/core/util/math_cpu.cc

Lines changed: 49 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -209,18 +209,58 @@ void MatMul<float>(ptrdiff_t M, ptrdiff_t N, ptrdiff_t K, const float* A, const
209209
MlasGemm(CblasNoTrans, CblasNoTrans, M, N, K, 1.f, A, K, B, N, 0.f, C, N, threadpool);
210210
}
211211

212+
212213
template <>
213-
void MatMul<MLFloat16>(ptrdiff_t M, ptrdiff_t N, ptrdiff_t K, const MLFloat16* A, const MLFloat16* B, MLFloat16* C, concurrency::ThreadPool* thread_pool) {
214-
// Set alpha to 1 and beta to 0
215-
Eigen::half alpha = Eigen::half(1.0f);
216-
Eigen::half beta = Eigen::half(0.0f);
217-
218-
// Use GEMM with the given parameters
219-
math::Gemm<Eigen::half>(CblasNoTrans, CblasNoTrans, M, N, K, alpha,
220-
reinterpret_cast<const Eigen::half*>(A), reinterpret_cast<const Eigen::half*>(B), beta,
221-
reinterpret_cast<Eigen::half*>(C), thread_pool);
214+
void MatMul<MLFloat16>(ptrdiff_t M, ptrdiff_t N, ptrdiff_t K, const MLFloat16* a_data, const MLFloat16* b_data, MLFloat16* y_data, concurrency::ThreadPool* thread_pool) {
215+
216+
MLFloat16 alpha = MLFloat16(1.0f);
217+
MLFloat16 beta = MLFloat16(0.0f);
218+
// if input is empty tensor, return directly as nothing need to be calculated.
219+
if (M == 0 || N == 0)
220+
return;
221+
222+
#if defined(__GNUC__) && defined(HAS_CLASS_MEMACCESS)
223+
#pragma GCC diagnostic push
224+
#pragma GCC diagnostic ignored "-Wclass-memaccess"
225+
#endif
226+
227+
memset(&beta, 0, sizeof(MLFloat16));
228+
#if defined(__GNUC__) && defined(HAS_CLASS_MEMACCESS)
229+
#pragma GCC diagnostic pop
230+
#endif
231+
#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
232+
233+
234+
MLAS_HALF_GEMM_DATA_PARAMS data;
235+
data.A = a_data;
236+
data.lda = K;
237+
data.B = b_data;
238+
data.ldb = N;
239+
data.C = y_data;
240+
data.ldc = N;
241+
// if (c_shape != nullptr) {
242+
// data.Bias = c_data;
243+
// }
244+
MlasHalfGemmBatch(M, N, K, 1, &data, thread_pool);
245+
return;
246+
247+
#endif
248+
// Fallback to Eigen
249+
// // Broadcast the bias as needed if bias is given
250+
// GemmBroadcastBias(M, N, beta, c_data, c_shape, y_data);
251+
#if defined(__GNUC__)
252+
#pragma GCC diagnostic push
253+
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
254+
#endif
255+
math::Gemm<Eigen::half>(CblasNoTrans, CblasNoTrans, M, N, K, *reinterpret_cast<Eigen::half*>(&alpha),
256+
reinterpret_cast<const Eigen::half*>(a_data), reinterpret_cast<const Eigen::half*>(b_data), *reinterpret_cast<Eigen::half*>(&beta), reinterpret_cast<Eigen::half*>(y_data), thread_pool);
257+
#if defined(__GNUC__)
258+
#pragma GCC diagnostic pop
259+
#endif
260+
222261
}
223262

263+
224264
#ifdef MLAS_SUPPORTS_GEMM_DOUBLE
225265
template <>
226266
void MatMul<double>(ptrdiff_t M, ptrdiff_t N, ptrdiff_t K, const double* A, const double* B, double* C, ThreadPool* threadpool) {

0 commit comments

Comments
 (0)