Skip to content

Commit 1a51693

Browse files
committed
Use multiple cores to improve data conversion.
1 parent 257f049 commit 1a51693

File tree

11 files changed

+240
-19
lines changed

11 files changed

+240
-19
lines changed

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ Developers can use QAI AppBuilder in both C++ and Python projects <br>
5353
• Support both Windows & Linux <br>
5454
• Support Genie(Large Language Model) <br>
5555
• Support LLM on both CPU & NPU [*NEW!*] <br>
56+
• Support Multimodal LLM [*NEW!*] <br>
57+
• Support Float & Native Input & Output Data [*NEW!*] <br>
5658
• Support Multi Graph <br>
5759
• Support LoRA <br>
5860
• Support multiple models <br>
@@ -62,7 +64,8 @@ Developers can use QAI AppBuilder in both C++ and Python projects <br>
6264
• Plenty of sample code <br>
6365

6466
** Support ARM64 Windows, Linux and Ubuntu (e.g.: X Elite Windows, QCS8550 Linux and QCM6490 Ubuntu). <br>
65-
** Support OpenAI Compatible API Service([GenieAPIService](samples/genie/c++/README.md)) on WoS, Android and Linux.
67+
** Support OpenAI Compatible API Service([GenieAPIService](samples/genie/c++/README.md)) on WoS, Android and Linux. <br>
68+
** Using "native" input & output can significantly improve data conversion performance. Refer to the [Whisper](samples/python/whisper_base_en/whisper_base_en.py) sample code. <br>
6669

6770
## Diagram
6871
<br>

samples/python/whisper_base_en/whisper_base_en.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -480,14 +480,14 @@ def log_mel_spectrogram(
480480
log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
481481
log_spec = (log_spec + 4.0) / 4.0
482482

483-
# ——关键修改在这里:返回前转换为 float16——
483+
# return float16
484484
return (
485485
log_spec
486486
.unsqueeze(0)
487487
.detach()
488-
.to(dtype=torch.float16) # 转为半精度
488+
.to(dtype=torch.float16) # convert to fp16
489489
.cpu()
490-
.numpy() # numpy 数组,dtype=np.float16
490+
.numpy() # numpy array, dtype=np.float16
491491
)
492492

493493
def chunk_and_resample_audio(
@@ -517,9 +517,6 @@ def chunk_and_resample_audio(
517517
),
518518
audio[last_sample_in_full_length_audio_chunks:],
519519
]
520-
521-
522-
523520

524521
def load_demo_audio() -> tuple[np.ndarray, int]:
525522
# TEST_AUDIO_PATH.fetch()

src/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ else()
1515
set(APP "appbuilder")
1616
endif()
1717

18+
target_compile_features(${APP} PRIVATE cxx_std_20)
19+
1820
set(APP_SOURCES "QnnSampleApp.cpp"
1921
"main.cpp"
2022
"Log/Logger.cpp"
@@ -44,6 +46,11 @@ endif()
4446

4547
ADD_LIBRARY(${APP} SHARED ${APP_SOURCES} ${APP_SOURCES_ARCH})
4648

49+
if (MSVC)
50+
target_compile_options(${APP} PRIVATE /O2 /GL /fp:fast)
51+
target_link_options(${APP} PRIVATE /LTCG)
52+
endif()
53+
4754
SET(LIBRARY_OUTPUT_PATH "${PROJECT_SOURCE_DIR}/../lib")
4855

4956
target_compile_definitions(${APP} PUBLIC "-DNOMINMAX")

src/LibAppBuilder.cpp

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@
1515
#include <stdio.h>
1616
#include <stdlib.h>
1717
#include <fcntl.h>
18+
#include <algorithm>
19+
#include <execution>
20+
#include <vector>
21+
1822

1923
#include "BuildId.hpp"
2024
#include "DynamicLoadUtil.hpp"
@@ -50,6 +54,18 @@ namespace qnn {
5054
namespace tools {
5155
namespace libappbuilder {
5256

57+
void warmup_parallel_stl()
58+
{
59+
static std::once_flag once;
60+
std::call_once(once, []{
61+
constexpr size_t N = 1 << 18;
62+
static std::vector<int> dummy(N, 0);
63+
std::for_each(std::execution::par, dummy.begin(), dummy.end(),
64+
[](int& x){ x += 1; });
65+
});
66+
QNN_WAR("warmup_parallel_stl");
67+
}
68+
5369
std::unique_ptr<sample_app::QnnSampleApp> initQnnSampleApp(std::string cachedBinaryPath, std::string backEndPath, std::string systemLibraryPath,
5470
bool loadFromCachedBinary, std::vector<LoraAdapter>& lora_adapters,
5571
const std::string& input_data_type, const std::string& output_data_type) {
@@ -66,7 +82,8 @@ std::unique_ptr<sample_app::QnnSampleApp> initQnnSampleApp(std::string cachedBin
6682
modelPath = cachedBinaryPath;
6783
}
6884

69-
printf("input_data_type: %s, output_data_type: %s\n", input_data_type.c_str(), output_data_type.c_str());
85+
QNN_WAR("input_data_type: %s, output_data_type: %s\n", input_data_type.c_str(), output_data_type.c_str());
86+
7087
iotensor::InputDataType parsedInputDataType = iotensor::parseInputDataType(input_data_type);
7188
iotensor::OutputDataType parsedOutputDataType = iotensor::parseOutputDataType(output_data_type);
7289

@@ -100,6 +117,9 @@ std::unique_ptr<sample_app::QnnSampleApp> initQnnSampleApp(std::string cachedBin
100117
}
101118
}
102119

120+
if ((input_data_type == "float") || (output_data_type == "float")) // We need 'std::transform' only for 'float' mode; it needs data conversion.
121+
warmup_parallel_stl();
122+
103123
sg_qnnInterface = qnnFunctionPointers.qnnInterface;
104124
std::unique_ptr<sample_app::QnnSampleApp> app(new sample_app::QnnSampleApp(qnnFunctionPointers, "null", opPackagePaths, sg_backendHandle, "null",
105125
debug, parsedOutputDataType, parsedInputDataType, sg_parsedProfilingLevel,

src/PAL/include/PAL/DynamicLoading.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ int dlClose(void *handle);
9393
/// recent error that occurred from a call to one of the functions in the
9494
/// dl-family APIs.
9595
//---------------------------------------------------------------------------
96-
char *dlError(void);
96+
const char *dlError(void);
9797

9898
} // namespace dynamicloading
9999
} // namespace pal

src/PAL/include/PAL/StringOp.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ class pal::StringOp {
4141
/// Number of bytes copied
4242
//---------------------------------------------------------------------------
4343
static size_t memscpy(void *dst, size_t dstSize, const void *src, size_t copySize);
44+
/*
45+
static size_t memscpy(void* __restrict dst, size_t dstSize, const void* __restrict src, size_t copySize, unsigned blocks = 8);
46+
*/
4447

4548
//---------------------------------------------------------------------------
4649
/// @brief

src/PAL/src/common/StringOp.cpp

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,60 @@
1414
//---------------------------------------------------------------------------
1515
// pal::StringOp::memscpy
1616
//---------------------------------------------------------------------------
17+
18+
/*
19+
#include <algorithm>
20+
#include <execution>
21+
#include <numeric>
22+
#include <vector>
23+
#include <cstddef>
24+
#include <cstring>
25+
26+
size_t pal::StringOp::memscpy(void* __restrict dst, size_t dstSize,
27+
const void* __restrict src, size_t copySize,
28+
unsigned blocks)
29+
{
30+
if (!dst || !src || dstSize == 0 || copySize == 0) return 0;
31+
32+
const size_t n = (dstSize < copySize) ? dstSize : copySize;
33+
34+
if (blocks == 0) blocks = 1;
35+
36+
// Clamp the block count so each block copies at least 1 byte (avoid "empty" blocks)
37+
if (blocks > n) blocks = static_cast<unsigned>(n);
38+
39+
// Cap: bound the block count by the available hardware concurrency (leave headroom for other threads)
40+
unsigned hw = std::thread::hardware_concurrency();
41+
if (hw >= 8) hw = hw -2;
42+
if (hw == 0) hw = 4;
43+
// Empirically, 2-8 blocks give a good speedup; more only adds scheduling overhead
44+
blocks = std::min(blocks, std::min(hw, 8u));
45+
printf("blocks = %d\n", blocks);
46+
47+
auto* d = static_cast<unsigned char*>(dst);
48+
auto* s = static_cast<const unsigned char*>(src);
49+
50+
// Split the copy into "blocks" chunks: chunk = ceil(n / blocks)
51+
size_t chunk = (n + blocks - 1) / blocks;
52+
53+
// Round chunk up to a 64-byte boundary so adjacent blocks do not share a cache line
54+
chunk = (chunk + 63) & ~size_t(63);
55+
56+
std::vector<unsigned> ids(blocks);
57+
std::iota(ids.begin(), ids.end(), 0u);
58+
59+
std::for_each(std::execution::par, ids.begin(), ids.end(),
60+
[=](unsigned i) noexcept {
61+
const size_t begin = static_cast<size_t>(i) * chunk;
62+
if (begin >= n) return;
63+
const size_t end = std::min(begin + chunk, n);
64+
memcpy(d + begin, s + begin, end - begin);
65+
});
66+
67+
return n;
68+
}
69+
*/
70+
1771
size_t pal::StringOp::memscpy(void *dst, size_t dstSize, const void *src, size_t copySize) {
1872
if (!dst || !src || !dstSize || !copySize) return 0;
1973

src/PAL/src/windows/DynamicLoading.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
#define TOSTRING(x) STRINGIFY(x)
2525

2626
static std::set<HMODULE> mod_handles;
27-
static thread_local char *sg_lastErrMsg = "";
27+
static thread_local const char* sg_lastErrMsg = "";
2828

2929
void *pal::dynamicloading::dlOpen(const char *filename, int flags) {
3030
HMODULE mod;
@@ -211,8 +211,8 @@ int pal::dynamicloading::dlClose(void *handle) {
211211
return 0;
212212
}
213213

214-
char *pal::dynamicloading::dlError(void) {
215-
char *retStr = sg_lastErrMsg;
214+
const char *pal::dynamicloading::dlError(void) {
215+
const char *retStr = sg_lastErrMsg;
216216

217217
sg_lastErrMsg = "";
218218

src/Utils/DataUtil.cpp

Lines changed: 118 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,27 @@
1616
#include <iostream>
1717
#include <numeric>
1818
#include <queue>
19+
20+
#include <execution>
21+
#include <algorithm>
22+
#include <bit>
23+
#include <cmath>
24+
#include <cstddef>
25+
#include <cstdint>
26+
27+
#if defined(__aarch64__) || defined(_M_ARM64)
28+
#include <arm_neon.h>
29+
#endif
1930
#ifdef _WIN32
2031
#include <intrin.h>
2132
#endif
2233
#include "DataUtil.hpp"
2334
#include "Logger.hpp"
24-
#ifndef __hexagon__
2535
#include "PAL/Directory.hpp"
2636
#include "PAL/FileOp.hpp"
2737
#include "PAL/Path.hpp"
28-
#endif
38+
39+
#define PARALLEL 1 // wd. Improve performance through std::transform and NEON.
2940

3041
using namespace qnn;
3142
using namespace qnn::tools;
@@ -412,6 +423,107 @@ static inline uint16_t datautil::fp16_ieee_from_fp32_value(float f) {
412423
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
413424
}
414425

426+
static inline uint16_t datautil::fp16_ieee_from_fp32_value_v2(float f) noexcept {
427+
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
428+
constexpr float scale_to_inf = 0x1.0p+112f;
429+
constexpr float scale_to_zero = 0x1.0p-110f;
430+
#else
431+
constexpr float scale_to_inf = std::bit_cast<float>(UINT32_C(0x77800000));
432+
constexpr float scale_to_zero = std::bit_cast<float>(UINT32_C(0x08800000));
433+
#endif
434+
float base = (std::fabs(f) * scale_to_inf) * scale_to_zero;
435+
436+
const uint32_t w = std::bit_cast<uint32_t>(f);
437+
const uint32_t shl1_w = (w << 1);
438+
const uint32_t sign = (w & UINT32_C(0x80000000));
439+
uint32_t bias = (shl1_w & UINT32_C(0xFF000000));
440+
bias = std::max(bias, UINT32_C(0x71000000));
441+
442+
base = std::bit_cast<float>((bias >> 1) + UINT32_C(0x07800000)) + base;
443+
444+
const uint32_t bits = std::bit_cast<uint32_t>(base);
445+
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
446+
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
447+
const uint32_t nonsign = exp_bits + mantissa_bits;
448+
449+
return static_cast<uint16_t>((sign >> 16) |
450+
((shl1_w > UINT32_C(0xFF000000)) ? UINT16_C(0x7E00) : nonsign));
451+
}
452+
453+
454+
bool datautil::float32_to_float16_neon(uint16_t* __restrict dst,
455+
const float* __restrict src,
456+
size_t n) noexcept
457+
{
458+
#if defined(__aarch64__) || defined(_M_ARM64)
459+
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
460+
size_t i = 0;
461+
constexpr size_t step = 8;
462+
for (; i + step <= n; i += step) {
463+
const float32x4_t f0 = vld1q_f32(src + i + 0);
464+
const float32x4_t f1 = vld1q_f32(src + i + 4);
465+
const float16x4_t h0 = vcvt_f16_f32(f0);
466+
const float16x4_t h1 = vcvt_f16_f32(f1);
467+
const float16x8_t h = vcombine_f16(h0, h1);
468+
const uint16x8_t u16 = vreinterpretq_u16_f16(h);
469+
vst1q_u16(dst + i, u16);
470+
}
471+
for (; i < n; ++i) dst[i] = fp16_ieee_from_fp32_value_v2(src[i]);
472+
return true;
473+
#else
474+
(void)dst; (void)src; (void)n; return false;
475+
#endif
476+
#else
477+
(void)dst; (void)src; (void)n; return false;
478+
#endif
479+
}
480+
481+
482+
void datautil::float32_to_float16_dispatch(uint16_t* __restrict dst,
483+
const float* __restrict src,
484+
size_t n) noexcept
485+
{
486+
if (!float32_to_float16_neon(dst, src, n)) {
487+
constexpr size_t step = 16;
488+
size_t i = 0;
489+
for (; i + step <= n; i += step) {
490+
dst[i+0] = fp16_ieee_from_fp32_value_v2(src[i+0]);
491+
dst[i+1] = fp16_ieee_from_fp32_value_v2(src[i+1]);
492+
dst[i+2] = fp16_ieee_from_fp32_value_v2(src[i+2]);
493+
dst[i+3] = fp16_ieee_from_fp32_value_v2(src[i+3]);
494+
dst[i+4] = fp16_ieee_from_fp32_value_v2(src[i+4]);
495+
dst[i+5] = fp16_ieee_from_fp32_value_v2(src[i+5]);
496+
dst[i+6] = fp16_ieee_from_fp32_value_v2(src[i+6]);
497+
dst[i+7] = fp16_ieee_from_fp32_value_v2(src[i+7]);
498+
499+
dst[i+8] = fp16_ieee_from_fp32_value_v2(src[i+8]);
500+
dst[i+9] = fp16_ieee_from_fp32_value_v2(src[i+9]);
501+
dst[i+10] = fp16_ieee_from_fp32_value_v2(src[i+10]);
502+
dst[i+11] = fp16_ieee_from_fp32_value_v2(src[i+11]);
503+
dst[i+12] = fp16_ieee_from_fp32_value_v2(src[i+12]);
504+
dst[i+13] = fp16_ieee_from_fp32_value_v2(src[i+13]);
505+
dst[i+14] = fp16_ieee_from_fp32_value_v2(src[i+14]);
506+
dst[i+15] = fp16_ieee_from_fp32_value_v2(src[i+15]);
507+
}
508+
for (; i < n; ++i) dst[i] = fp16_ieee_from_fp32_value_v2(src[i]);
509+
}
510+
}
511+
512+
513+
void datautil::float32_to_float16_parallel(uint16_t* __restrict dst,
514+
const float* __restrict src,
515+
size_t n) noexcept
516+
{
517+
constexpr size_t kParallelThreshold = 8192;
518+
if (n < kParallelThreshold) {
519+
float32_to_float16_dispatch(dst, src, n);
520+
return;
521+
}
522+
std::transform(std::execution::par_unseq, src, src + n, dst,
523+
[](float x) noexcept -> uint16_t { return fp16_ieee_from_fp32_value_v2(x); });
524+
525+
}
526+
415527
// Enabling fp16 execution
416528
bool datautil::float32ToFloatN(uint8_t* out,
417529
float* in,
@@ -423,13 +535,14 @@ bool datautil::float32ToFloatN(uint8_t* out,
423535
}
424536

425537
if(bitWidth == 16){
426-
#ifndef __hexagon__
538+
#ifdef PARALLEL // wd. Improve performance through std::transform and NEON.
539+
auto* dst = reinterpret_cast<uint16_t*>(out);
540+
float32_to_float16_parallel(dst, in, numElements);
541+
#else
427542
uint16_t *temp = (uint16_t *)out;
428543
for(size_t i = 0; i < numElements; i++){
429544
temp[i] = fp16_ieee_from_fp32_value(in[i]);
430545
}
431-
#else
432-
return false;
433546
#endif //__hexagon__
434547
}
435548
else if(bitWidth == 32) {

src/Utils/DataUtil.hpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,25 @@ StatusCode writeBinaryToFile(std::string fileDir,
9090

9191
// Enabling fp16 execution
9292
static inline uint16_t fp16_ieee_from_fp32_value(float f);
93+
94+
// Single-element FP32→FP16 (bit pattern), scalar micro-optimized version; semantics unchanged
95+
static inline uint16_t fp16_ieee_from_fp32_value_v2(float f) noexcept;
96+
97+
// Batch NEON fast path (returns whether NEON was actually used)
98+
bool float32_to_float16_neon(uint16_t* __restrict dst,
99+
const float* __restrict src,
100+
size_t n) noexcept;
101+
102+
// Scheduler: Prioritize NEON, otherwise fall back to scalar batch
103+
void float32_to_float16_dispatch(uint16_t* __restrict dst,
104+
const float* __restrict src,
105+
size_t n) noexcept;
106+
107+
// Parallel version: small array seq, large array par_unseq
108+
void float32_to_float16_parallel(uint16_t* __restrict dst,
109+
const float* __restrict src,
110+
size_t n) noexcept;
111+
93112
static inline float fp16_ieee_to_fp32_value(uint16_t h);
94113
static inline uint32_t fp32_to_bits(float f);
95114
static inline float fp32_from_bits(uint32_t w);

0 commit comments

Comments
 (0)