Skip to content

Commit 1a51693

Browse files
committed
Use multiple cores to improve data conversion.
1 parent 257f049 commit 1a51693

File tree

11 files changed

+240
-19
lines changed

11 files changed

+240
-19
lines changed

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ Developers can use QAI AppBuilder in both C++ and Python projects <br>
5353
• Support both Windows & Linux <br>
5454
• Support Genie(Large Language Model) <br>
5555
• Support LLM on both CPU & NPU [*NEW!*] <br>
56+
• Support Multimodal LLM [*NEW!*] <br>
57+
• Support Float & Native Input & Output Data [*NEW!*] <br>
5658
• Support Multi Graph <br>
5759
• Support LoRA <br>
5860
• Support multiple models <br>
@@ -62,7 +64,8 @@ Developers can use QAI AppBuilder in both C++ and Python projects <br>
6264
• Plenty of sample code <br>
6365

6466
** Support ARM64 Windows, Linux and Ubuntu (e.g.: X Elite Windows, QCS8550 Linux and QCM6490 Ubuntu). <br>
65-
** Support OpenAI Compatible API Service([GenieAPIService](samples/genie/c++/README.md)) on WoS, Android and Linux.
67+
** Support OpenAI Compatible API Service([GenieAPIService](samples/genie/c++/README.md)) on WoS, Android and Linux. <br>
68+
** Using "native" input & output can significantly improve data conversion performance. Refer to the [Whisper](samples/python/whisper_base_en/whisper_base_en.py) sample code. <br>
6669

6770
## Diagram
6871
<br>

samples/python/whisper_base_en/whisper_base_en.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -480,14 +480,14 @@ def log_mel_spectrogram(
480480
log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
481481
log_spec = (log_spec + 4.0) / 4.0
482482

483-
# ——关键修改在这里:返回前转换为 float16——
483+
# return float16
484484
return (
485485
log_spec
486486
.unsqueeze(0)
487487
.detach()
488-
.to(dtype=torch.float16) # 转为半精度
488+
.to(dtype=torch.float16) # convert to fp16
489489
.cpu()
490-
.numpy() # numpy 数组,dtype=np.float16
490+
.numpy() # numpy array, dtype=np.float16
491491
)
492492

493493
def chunk_and_resample_audio(
@@ -517,9 +517,6 @@ def chunk_and_resample_audio(
517517
),
518518
audio[last_sample_in_full_length_audio_chunks:],
519519
]
520-
521-
522-
523520

524521
def load_demo_audio() -> tuple[np.ndarray, int]:
525522
# TEST_AUDIO_PATH.fetch()

src/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ else()
1515
set(APP "appbuilder")
1616
endif()
1717

18+
target_compile_features(${APP} PRIVATE cxx_std_20)
19+
1820
set(APP_SOURCES "QnnSampleApp.cpp"
1921
"main.cpp"
2022
"Log/Logger.cpp"
@@ -44,6 +46,11 @@ endif()
4446

4547
ADD_LIBRARY(${APP} SHARED ${APP_SOURCES} ${APP_SOURCES_ARCH})
4648

49+
if (MSVC)
50+
target_compile_options(${APP} PRIVATE /O2 /GL /fp:fast)
51+
target_link_options(${APP} PRIVATE /LTCG)
52+
endif()
53+
4754
SET(LIBRARY_OUTPUT_PATH "${PROJECT_SOURCE_DIR}/../lib")
4855

4956
target_compile_definitions(${APP} PUBLIC "-DNOMINMAX")

src/LibAppBuilder.cpp

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@
1515
#include <stdio.h>
1616
#include <stdlib.h>
1717
#include <fcntl.h>
18+
#include <algorithm>
19+
#include <execution>
20+
#include <vector>
21+
1822

1923
#include "BuildId.hpp"
2024
#include "DynamicLoadUtil.hpp"
@@ -50,6 +54,18 @@ namespace qnn {
5054
namespace tools {
5155
namespace libappbuilder {
5256

57+
void warmup_parallel_stl()
58+
{
59+
static std::once_flag once;
60+
std::call_once(once, []{
61+
constexpr size_t N = 1 << 18;
62+
static std::vector<int> dummy(N, 0);
63+
std::for_each(std::execution::par, dummy.begin(), dummy.end(),
64+
[](int& x){ x += 1; });
65+
});
66+
QNN_WAR("warmup_parallel_stl");
67+
}
68+
5369
std::unique_ptr<sample_app::QnnSampleApp> initQnnSampleApp(std::string cachedBinaryPath, std::string backEndPath, std::string systemLibraryPath,
5470
bool loadFromCachedBinary, std::vector<LoraAdapter>& lora_adapters,
5571
const std::string& input_data_type, const std::string& output_data_type) {
@@ -66,7 +82,8 @@ std::unique_ptr<sample_app::QnnSampleApp> initQnnSampleApp(std::string cachedBin
6682
modelPath = cachedBinaryPath;
6783
}
6884

69-
printf("input_data_type: %s, output_data_type: %s\n", input_data_type.c_str(), output_data_type.c_str());
85+
QNN_WAR("input_data_type: %s, output_data_type: %s\n", input_data_type.c_str(), output_data_type.c_str());
86+
7087
iotensor::InputDataType parsedInputDataType = iotensor::parseInputDataType(input_data_type);
7188
iotensor::OutputDataType parsedOutputDataType = iotensor::parseOutputDataType(output_data_type);
7289

@@ -100,6 +117,9 @@ std::unique_ptr<sample_app::QnnSampleApp> initQnnSampleApp(std::string cachedBin
100117
}
101118
}
102119

120+
if ((input_data_type == "float") || (output_data_type == "float")) // We need 'std::transform' only for 'float' mode; it needs data conversion.
121+
warmup_parallel_stl();
122+
103123
sg_qnnInterface = qnnFunctionPointers.qnnInterface;
104124
std::unique_ptr<sample_app::QnnSampleApp> app(new sample_app::QnnSampleApp(qnnFunctionPointers, "null", opPackagePaths, sg_backendHandle, "null",
105125
debug, parsedOutputDataType, parsedInputDataType, sg_parsedProfilingLevel,

src/PAL/include/PAL/DynamicLoading.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ int dlClose(void *handle);
9393
/// recent error that occurred from a call to one of the functions in the
9494
/// dl-family APIs.
9595
//---------------------------------------------------------------------------
96-
char *dlError(void);
96+
const char *dlError(void);
9797

9898
} // namespace dynamicloading
9999
} // namespace pal

src/PAL/include/PAL/StringOp.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ class pal::StringOp {
4141
/// Number of bytes copied
4242
//---------------------------------------------------------------------------
4343
static size_t memscpy(void *dst, size_t dstSize, const void *src, size_t copySize);
44+
/*
45+
static size_t memscpy(void* __restrict dst, size_t dstSize, const void* __restrict src, size_t copySize, unsigned blocks = 8);
46+
*/
4447

4548
//---------------------------------------------------------------------------
4649
/// @brief

src/PAL/src/common/StringOp.cpp

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,60 @@
1414
//---------------------------------------------------------------------------
1515
// pal::StringOp::memscpy
1616
//---------------------------------------------------------------------------
17+
18+
/*
19+
#include <algorithm>
20+
#include <execution>
21+
#include <numeric>
22+
#include <vector>
23+
#include <cstddef>
24+
#include <cstring>
25+
26+
size_t pal::StringOp::memscpy(void* __restrict dst, size_t dstSize,
27+
const void* __restrict src, size_t copySize,
28+
unsigned blocks)
29+
{
30+
if (!dst || !src || dstSize == 0 || copySize == 0) return 0;
31+
32+
const size_t n = (dstSize < copySize) ? dstSize : copySize;
33+
34+
if (blocks == 0) blocks = 1;
35+
36+
// Clamp the block count so each block copies at least 1 byte (avoid "empty" blocks)
37+
if (blocks > n) blocks = static_cast<unsigned>(n);
38+
39+
// Cap: bound the block count by the available hardware concurrency (leave headroom for other threads)
40+
unsigned hw = std::thread::hardware_concurrency();
41+
if (hw >= 8) hw = hw -2;
42+
if (hw == 0) hw = 4;
43+
// Empirically, 2-8 blocks give a good speedup; more only adds scheduling overhead
44+
blocks = std::min(blocks, std::min(hw, 8u));
45+
printf("blocks = %d\n", blocks);
46+
47+
auto* d = static_cast<unsigned char*>(dst);
48+
auto* s = static_cast<const unsigned char*>(src);
49+
50+
// Split the copy into "blocks" chunks: chunk = ceil(n / blocks)
51+
size_t chunk = (n + blocks - 1) / blocks;
52+
53+
// Round chunk up to a 64-byte boundary so adjacent blocks do not share a cache line
54+
chunk = (chunk + 63) & ~size_t(63);
55+
56+
std::vector<unsigned> ids(blocks);
57+
std::iota(ids.begin(), ids.end(), 0u);
58+
59+
std::for_each(std::execution::par, ids.begin(), ids.end(),
60+
[=](unsigned i) noexcept {
61+
const size_t begin = static_cast<size_t>(i) * chunk;
62+
if (begin >= n) return;
63+
const size_t end = std::min(begin + chunk, n);
64+
memcpy(d + begin, s + begin, end - begin);
65+
});
66+
67+
return n;
68+
}
69+
*/
70+
1771
size_t pal::StringOp::memscpy(void *dst, size_t dstSize, const void *src, size_t copySize) {
1872
if (!dst || !src || !dstSize || !copySize) return 0;
1973

src/PAL/src/windows/DynamicLoading.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
#define TOSTRING(x) STRINGIFY(x)
2525

2626
static std::set<HMODULE> mod_handles;
27-
static thread_local char *sg_lastErrMsg = "";
27+
static thread_local const char* sg_lastErrMsg = "";
2828

2929
void *pal::dynamicloading::dlOpen(const char *filename, int flags) {
3030
HMODULE mod;
@@ -211,8 +211,8 @@ int pal::dynamicloading::dlClose(void *handle) {
211211
return 0;
212212
}
213213

214-
char *pal::dynamicloading::dlError(void) {
215-
char *retStr = sg_lastErrMsg;
214+
const char *pal::dynamicloading::dlError(void) {
215+
const char *retStr = sg_lastErrMsg;
216216

217217
sg_lastErrMsg = "";
218218

src/Utils/DataUtil.cpp

Lines changed: 118 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,27 @@
1616
#include <iostream>
1717
#include <numeric>
1818
#include <queue>
19+
20+
#include <execution>
21+
#include <algorithm>
22+
#include <bit>
23+
#include <cmath>
24+
#include <cstddef>
25+
#include <cstdint>
26+
27+
#if defined(__aarch64__) || defined(_M_ARM64)
28+
#include <arm_neon.h>
29+
#endif
1930
#ifdef _WIN32
2031
#include <intrin.h>
2132
#endif
2233
#include "DataUtil.hpp"
2334
#include "Logger.hpp"
24-
#ifndef __hexagon__
2535
#include "PAL/Directory.hpp"
2636
#include "PAL/FileOp.hpp"
2737
#include "PAL/Path.hpp"
28-
#endif
38+
39+
#define PARALLEL 1 // wd. Improve performance through std::transform and NEON.
2940

3041
using namespace qnn;
3142
using namespace qnn::tools;
@@ -412,6 +423,107 @@ static inline uint16_t datautil::fp16_ieee_from_fp32_value(float f) {
412423
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
413424
}
414425

426+
static inline uint16_t datautil::fp16_ieee_from_fp32_value_v2(float f) noexcept {
427+
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
428+
constexpr float scale_to_inf = 0x1.0p+112f;
429+
constexpr float scale_to_zero = 0x1.0p-110f;
430+
#else
431+
constexpr float scale_to_inf = std::bit_cast<float>(UINT32_C(0x77800000));
432+
constexpr float scale_to_zero = std::bit_cast<float>(UINT32_C(0x08800000));
433+
#endif
434+
float base = (std::fabs(f) * scale_to_inf) * scale_to_zero;
435+
436+
const uint32_t w = std::bit_cast<uint32_t>(f);
437+
const uint32_t shl1_w = (w << 1);
438+
const uint32_t sign = (w & UINT32_C(0x80000000));
439+
uint32_t bias = (shl1_w & UINT32_C(0xFF000000));
440+
bias = std::max(bias, UINT32_C(0x71000000));
441+
442+
base = std::bit_cast<float>((bias >> 1) + UINT32_C(0x07800000)) + base;
443+
444+
const uint32_t bits = std::bit_cast<uint32_t>(base);
445+
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
446+
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
447+
const uint32_t nonsign = exp_bits + mantissa_bits;
448+
449+
return static_cast<uint16_t>((sign >> 16) |
450+
((shl1_w > UINT32_C(0xFF000000)) ? UINT16_C(0x7E00) : nonsign));
451+
}
452+
453+
454+
bool datautil::float32_to_float16_neon(uint16_t* __restrict dst,
455+
const float* __restrict src,
456+
size_t n) noexcept
457+
{
458+
#if defined(__aarch64__) || defined(_M_ARM64)
459+
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
460+
size_t i = 0;
461+
constexpr size_t step = 8;
462+
for (; i + step <= n; i += step) {
463+
const float32x4_t f0 = vld1q_f32(src + i + 0);
464+
const float32x4_t f1 = vld1q_f32(src + i + 4);
465+
const float16x4_t h0 = vcvt_f16_f32(f0);
466+
const float16x4_t h1 = vcvt_f16_f32(f1);
467+
const float16x8_t h = vcombine_f16(h0, h1);
468+
const uint16x8_t u16 = vreinterpretq_u16_f16(h);
469+
vst1q_u16(dst + i, u16);
470+
}
471+
for (; i < n; ++i) dst[i] = fp16_ieee_from_fp32_value_v2(src[i]);
472+
return true;
473+
#else
474+
(void)dst; (void)src; (void)n; return false;
475+
#endif
476+
#else
477+
(void)dst; (void)src; (void)n; return false;
478+
#endif
479+
}
480+
481+
482+
void datautil::float32_to_float16_dispatch(uint16_t* __restrict dst,
483+
const float* __restrict src,
484+
size_t n) noexcept
485+
{
486+
if (!float32_to_float16_neon(dst, src, n)) {
487+
constexpr size_t step = 16;
488+
size_t i = 0;
489+
for (; i + step <= n; i += step) {
490+
dst[i+0] = fp16_ieee_from_fp32_value_v2(src[i+0]);
491+
dst[i+1] = fp16_ieee_from_fp32_value_v2(src[i+1]);
492+
dst[i+2] = fp16_ieee_from_fp32_value_v2(src[i+2]);
493+
dst[i+3] = fp16_ieee_from_fp32_value_v2(src[i+3]);
494+
dst[i+4] = fp16_ieee_from_fp32_value_v2(src[i+4]);
495+
dst[i+5] = fp16_ieee_from_fp32_value_v2(src[i+5]);
496+
dst[i+6] = fp16_ieee_from_fp32_value_v2(src[i+6]);
497+
dst[i+7] = fp16_ieee_from_fp32_value_v2(src[i+7]);
498+
499+
dst[i+8] = fp16_ieee_from_fp32_value_v2(src[i+8]);
500+
dst[i+9] = fp16_ieee_from_fp32_value_v2(src[i+9]);
501+
dst[i+10] = fp16_ieee_from_fp32_value_v2(src[i+10]);
502+
dst[i+11] = fp16_ieee_from_fp32_value_v2(src[i+11]);
503+
dst[i+12] = fp16_ieee_from_fp32_value_v2(src[i+12]);
504+
dst[i+13] = fp16_ieee_from_fp32_value_v2(src[i+13]);
505+
dst[i+14] = fp16_ieee_from_fp32_value_v2(src[i+14]);
506+
dst[i+15] = fp16_ieee_from_fp32_value_v2(src[i+15]);
507+
}
508+
for (; i < n; ++i) dst[i] = fp16_ieee_from_fp32_value_v2(src[i]);
509+
}
510+
}
511+
512+
513+
void datautil::float32_to_float16_parallel(uint16_t* __restrict dst,
514+
const float* __restrict src,
515+
size_t n) noexcept
516+
{
517+
constexpr size_t kParallelThreshold = 8192;
518+
if (n < kParallelThreshold) {
519+
float32_to_float16_dispatch(dst, src, n);
520+
return;
521+
}
522+
std::transform(std::execution::par_unseq, src, src + n, dst,
523+
[](float x) noexcept -> uint16_t { return fp16_ieee_from_fp32_value_v2(x); });
524+
525+
}
526+
415527
// Enabling fp16 execution
416528
bool datautil::float32ToFloatN(uint8_t* out,
417529
float* in,
@@ -423,13 +535,14 @@ bool datautil::float32ToFloatN(uint8_t* out,
423535
}
424536

425537
if(bitWidth == 16){
426-
#ifndef __hexagon__
538+
#ifdef PARALLEL // wd. Improve performance through std::transform and NEON.
539+
auto* dst = reinterpret_cast<uint16_t*>(out);
540+
float32_to_float16_parallel(dst, in, numElements);
541+
#else
427542
uint16_t *temp = (uint16_t *)out;
428543
for(size_t i = 0; i < numElements; i++){
429544
temp[i] = fp16_ieee_from_fp32_value(in[i]);
430545
}
431-
#else
432-
return false;
433546
#endif //__hexagon__
434547
}
435548
else if(bitWidth == 32) {

src/Utils/DataUtil.hpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,25 @@ StatusCode writeBinaryToFile(std::string fileDir,
9090

9191
// Enabling fp16 execution
9292
static inline uint16_t fp16_ieee_from_fp32_value(float f);
93+
94+
// Single-element FP32→FP16 (bit pattern), scalar micro-optimized version; semantics unchanged
95+
static inline uint16_t fp16_ieee_from_fp32_value_v2(float f) noexcept;
96+
97+
// Batch NEON fast path (returns whether NEON was actually used)
98+
bool float32_to_float16_neon(uint16_t* __restrict dst,
99+
const float* __restrict src,
100+
size_t n) noexcept;
101+
102+
// Scheduler: Prioritize NEON, otherwise fall back to scalar batch
103+
void float32_to_float16_dispatch(uint16_t* __restrict dst,
104+
const float* __restrict src,
105+
size_t n) noexcept;
106+
107+
// Parallel version: small array seq, large array par_unseq
108+
void float32_to_float16_parallel(uint16_t* __restrict dst,
109+
const float* __restrict src,
110+
size_t n) noexcept;
111+
93112
static inline float fp16_ieee_to_fp32_value(uint16_t h);
94113
static inline uint32_t fp32_to_bits(float f);
95114
static inline float fp32_from_bits(uint32_t w);

0 commit comments

Comments
 (0)