2424#include < cstddef>
2525#include < cstdint>
2626
27- #if defined(__aarch64__) || defined(_M_ARM64)
27+ #if defined(__aarch64__) || defined(_M_ARM64) || defined(__ANDROID__)
2828 #include < arm_neon.h>
2929#endif
3030#ifdef _WIN32
@@ -396,6 +396,7 @@ static inline uint32_t datautil::fp32_to_bits(float f) {
396396#endif
397397}
398398
399+ #if !defined(__ANDROID__)
399400// Enabling fp16 execution
400401static inline uint16_t datautil::fp16_ieee_from_fp32_value (float f) {
401402 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
@@ -422,7 +423,22 @@ static inline uint16_t datautil::fp16_ieee_from_fp32_value(float f) {
422423 const uint32_t nonsign = exp_bits + mantissa_bits;
423424 return (sign >> 16 ) | (shl1_w > UINT32_C (0xFF000000 ) ? UINT16_C (0x7E00 ) : nonsign);
424425 }
426+ #endif
425427
428+ #if defined(__ANDROID__)
// Converts a single fp32 value to its IEEE half-precision bit pattern using
// the NEON narrowing conversion intrinsic (VCVT F16.F32).
//
// @param f  Single-precision input value.
// @return   The 16-bit pattern of the converted half-precision value.
//
// NOTE(review): this helper is compiled whenever __ANDROID__ is defined, yet it
// depends on <arm_neon.h> intrinsics. Presumably every Android ABI built here
// is an ARM target with fp16 conversion support; confirm, since x86/x86_64
// Android targets do not provide NEON.
static inline uint16_t datautil::fp16_ieee_from_fp32_hw(float f)
{
    // Broadcast the scalar so the vector conversion intrinsic can be applied.
    const float32x4_t v32 = vdupq_n_f32(f);
    // VCVT F16.F32
    const float16x4_t v16 = vcvt_f16_f32(v32);
    // Reinterpret the half-precision lanes as raw 16-bit integers and read
    // lane 0 through intrinsics. This avoids the compiler-specific vector
    // subscript, the __fp16 temporary, and the memcpy-based bit copy.
    return vget_lane_u16(vreinterpret_u16_f16(v16), 0);
}
439+ #endif
440+
441+ #if !defined(__ANDROID__)
426442static inline uint16_t datautil::fp16_ieee_from_fp32_value_v2 (float f) noexcept {
427443#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
428444 constexpr float scale_to_inf = 0x1 .0p+112f ;
@@ -523,6 +539,7 @@ void datautil::float32_to_float16_parallel(uint16_t* __restrict dst,
523539 [](float x) noexcept -> uint16_t { return fp16_ieee_from_fp32_value_v2 (x); });
524540
525541}
542+ #endif
526543
527544// Enabling fp16 execution
528545bool datautil::float32ToFloatN (uint8_t * out,
@@ -535,13 +552,17 @@ bool datautil::float32ToFloatN(uint8_t* out,
535552 }
536553
537554 if (bitWidth == 16 ){
538- #ifdef PARALLEL // wd. Improve performance through std::transform and NEON.
555+ #if defined( PARALLEL) && !defined(__ANDROID__) // wd. Improve performance through std::transform and NEON.
539556 auto * dst = reinterpret_cast <uint16_t *>(out);
540557 float32_to_float16_parallel (dst, in, numElements);
541558 #else
542559 uint16_t *temp = (uint16_t *)out;
543560 for (size_t i = 0 ; i < numElements; i++){
544- temp[i] = fp16_ieee_from_fp32_value (in[i]);
561+ #if defined(__ANDROID__)
562+ temp[i] = fp16_ieee_from_fp32_hw (in[i]);
563+ #else
564+ temp[i] = fp16_ieee_from_fp32_value (in[i]);
565+ #endif
545566 }
546567 #endif // defined(PARALLEL) && !defined(__ANDROID__)
547568 }
0 commit comments