float16 support for AMD64 target #23536

georgen117 · 2025-01-29T22:47:49Z

georgen117
Jan 29, 2025

I am working on adding MLAS_TARGET_AMD64 support to the fp16_common.h header.

The header currently implements many inline functions as wrappers around NEON intrinsics.

I have implemented the functions using intrinsic instructions from immintrin.h.

What is the preferred way to add the code?

Does the community prefer the following?

Option 1: implement all the AMD64 versions of the functions, followed by all the ARM NEON versions of the functions

#if defined(MLAS_TARGET_AMD64)
#include <immintrin.h>

// AMD64 vector type aliases used by the fp16 helpers below.
// The 8-lane types map onto 128-bit immintrin.h vector types.
// NOTE(review): the 4-lane types are carried in a plain 64-bit integer, and
// __int64 is an MSVC-style spelling — confirm it is available on every AMD64
// toolchain this header must build with.
typedef __m128h MLAS_FLOAT16X8;
typedef __int64 MLAS_FLOAT16X4;
typedef __m128i MLAS_UINT16X8;
typedef __int64 MLAS_UINT16X4;

// AMD64: view an integer vector as a float16x8 vector through the
// _mm_castsi128_ph cast intrinsic.
MLAS_FORCEINLINE
MLAS_FLOAT16X8
MlasReinterpretAsFloat16x8(MLAS_INT32X4 Vector)
{
    return _mm_castsi128_ph(Vector);
}

// AMD64: fill every 16-bit lane with Value, then view the result as float16x8.
MLAS_FORCEINLINE
MLAS_FLOAT16X8
MlasBroadcastFloat16x8(_mlas_fp16_ Value)
{
    // _mm_set1_epi16 replicates the 16-bit value across the integer vector;
    // _mm_castsi128_ph then retypes that vector as half-precision.
    return _mm_castsi128_ph(
        _mm_set1_epi16(Value)
    );
}
.
.
.
#endif  // MLAS_TARGET_AMD64
#if defined(MLAS_TARGET_ARM64) || defined(MLAS_TARGET_ARM64EC)

// ARM64 / ARM64EC vector type aliases used by the fp16 helpers below.
// Each MLAS name maps directly onto the NEON vector type with the same
// element kind and lane count (x8 and x4 variants).
typedef float16x8_t MLAS_FLOAT16X8;
typedef float16x4_t MLAS_FLOAT16X4;
typedef uint16x8_t MLAS_UINT16X8;
typedef uint16x4_t MLAS_UINT16X4;


// ARM NEON: view an int32x4 vector as a float16x8 vector through
// vreinterpretq_f16_s32.
MLAS_FORCEINLINE
MLAS_FLOAT16X8
MlasReinterpretAsFloat16x8(MLAS_INT32X4 Vector)
{
    return vreinterpretq_f16_s32(Vector);
}

// ARM NEON: fill every 16-bit lane with Value, then view the result as
// float16x8.
MLAS_FORCEINLINE
MLAS_FLOAT16X8
MlasBroadcastFloat16x8(_mlas_fp16_ Value)
{
    // vdupq_n_p16 duplicates the 16-bit value into a poly16x8 vector;
    // vreinterpretq_f16_p16 then retypes that vector as half-precision.
    return vreinterpretq_f16_p16(
        vdupq_n_p16(Value)
    );
}
.
.
.
#endif  // MLAS_TARGET_ARM64 || MLAS_TARGET_ARM64EC

Or is this still preferred?
Option 2: interleave the AMD64 and ARM NEON code

// Select the fp16 vector type aliases for the current target.
#if defined(MLAS_TARGET_AMD64)
#include <immintrin.h>
typedef __m128h MLAS_FLOAT16X8;
typedef __int64 MLAS_FLOAT16X4;
typedef __m128i MLAS_UINT16X8;
typedef __int64 MLAS_UINT16X4;

// Must be `#elif`: `#elseif` is not a preprocessor directive. It is an error
// when the AMD64 branch is active and is silently ignored when that branch is
// skipped, which would leave the ARM typedefs undefined.
#elif defined(MLAS_TARGET_ARM64) || defined(MLAS_TARGET_ARM64EC)
typedef float16x8_t MLAS_FLOAT16X8;
typedef float16x4_t MLAS_FLOAT16X4;
typedef uint16x8_t MLAS_UINT16X8;
typedef uint16x4_t MLAS_UINT16X4;
#endif
 
// View an int32x4 vector as a float16x8 vector using the target's cast
// intrinsic (_mm_castsi128_ph on AMD64, vreinterpretq_f16_s32 on ARM NEON).
MLAS_FORCEINLINE
MLAS_FLOAT16X8
MlasReinterpretAsFloat16x8(MLAS_INT32X4 Vector) {
#if defined(MLAS_TARGET_AMD64)
    return _mm_castsi128_ph(Vector);
// `#elif`, not `#elseif`: the latter is not a valid directive, so the ARM
// branch would never be compiled and the function would have no return.
#elif defined(MLAS_TARGET_ARM64) || defined(MLAS_TARGET_ARM64EC)
    return vreinterpretq_f16_s32(Vector);
#endif
}

// Fill every 16-bit lane with Value and return the result typed as float16x8.
// Both targets first build a 16-bit-lane vector and then retype it with a
// cast/reinterpret intrinsic.
MLAS_FORCEINLINE
MLAS_FLOAT16X8
MlasBroadcastFloat16x8(_mlas_fp16_ Value) {
#if defined(MLAS_TARGET_AMD64)
    return _mm_castsi128_ph(_mm_set1_epi16(Value));
// `#elif`, not `#elseif`: the latter is not a valid directive, so the ARM
// branch would never be compiled and the function would have no return.
#elif defined(MLAS_TARGET_ARM64) || defined(MLAS_TARGET_ARM64EC)
    return vreinterpretq_f16_p16(vdupq_n_p16(Value));
#endif
}
.
.
.

I hope it is clear what I am asking.

For the float32 version of similar code found in mlasi.h, Option 2 was used.

However, the code I have been implementing currently uses Option 1. I wanted to know if there was a preference so I could switch it if needed.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

float16 support for AMD64 target #23536

Uh oh!

{{title}}

Uh oh!

Replies: 0 comments

Select a reply

Uh oh!

float16 support for AMD64 target #23536

Uh oh!

georgen117 Jan 29, 2025

Replies: 0 comments

georgen117
Jan 29, 2025