Enable multithreading on FP16 to FP32 cast operator (microsoft#23619)

Erick Muñoz · web-flow · commit 7fc7d5ec750d · 2025-03-19T01:27:25.000-07:00
### Description
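Enables multithreading in the FP16-to-FP32 cast operator. Tensors with more than 128,000 elements are converted in parallel on the operator thread pool; smaller tensors keep using the single-threaded path.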



### Motivation and Context
Improves CPU performance on FP16 models that require casting to FP32.
diff --git a/onnxruntime/core/providers/cpu/tensor/cast_op.cc b/onnxruntime/core/providers/cpu/tensor/cast_op.cc
@@ -254,11 +254,43 @@ struct TensorCasterNoSat<std::string, DstType> {
 // tensor MLFloat16 -> float
 template <>
 struct TensorCaster<MLFloat16, float> {
-  void Cast(const OpKernelContext&, const TensorShape& shape, const Tensor& in, Tensor& out) const {
+  // Converts every MLFloat16 element of `in` to float and writes it to `out`.
+  // `shape` supplies the element count. Tensors of at most kSingleThreadMaxElements
+  // elements are converted with a single call on the calling thread; larger tensors
+  // are split into spans through the operator thread pool obtained from `ctx`.
+  void Cast(const OpKernelContext& ctx, const TensorShape& shape, const Tensor& in, Tensor& out) const {
     auto out_data = out.MutableData<float>();
     auto in_data = in.Data<MLFloat16>();
     const size_t shape_size = narrow<size_t>(shape.Size());
-    MlasConvertHalfToFloatBuffer(in_data, out_data, shape_size);
+
+    // Tensors at or below this element count are converted on the calling thread.
+    constexpr size_t kSingleThreadMaxElements = 128000;
+    if (shape_size <= kSingleThreadMaxElements) {
+      MlasConvertHalfToFloatBuffer(in_data, out_data, shape_size);
+      return;
+    }
+
+    // Estimate the number of compute cycles for the instruction set in use. AVX2 must
+    // be tested before SSE3: AVX2-capable CPUs also report SSE3, so testing SSE3 first
+    // would make the AVX2 branch unreachable.
+    const auto& cpu_info = CPUIDInfo::GetCPUIDInfo();
+    double num_compute_cycles;
+    if (cpu_info.HasAVX2()) {
+      num_compute_cycles = static_cast<double>(shape_size >> 2);
+    } else if (cpu_info.HasSSE3()) {
+      num_compute_cycles = static_cast<double>(shape_size >> 1);
+    } else {
+      num_compute_cycles = static_cast<double>(shape_size * 10);
+    }
+
+    // Cost hint for the scheduler: presumably bytes loaded (2 per FP16 input element),
+    // bytes stored (4 per FP32 output element), then compute cycles -- confirm against TensorOpCost.
+    concurrency::ThreadPool::TryParallelFor(ctx.GetOperatorThreadPool(), shape_size,
+                                            {shape_size * 2.f, shape_size * 4.f, num_compute_cycles},
+                                            [in_data, out_data](std::ptrdiff_t first_span, std::ptrdiff_t last_span) {
+                                              // Convert the half-open element range [first_span, last_span).
+                                              MlasConvertHalfToFloatBuffer(in_data + first_span, out_data + first_span, static_cast<size_t>(last_span - first_span));
+                                            });
   }
 };