@@ -771,15 +771,6 @@ inline static float32x4_t ggml_v_tanh(float32x4_t x) {
     //return vdivq_f32(vsubq_f32(exp_two_x, one), vaddq_f32(exp_two_x, one));
 }
 
-inline static float32x4_t ggml_v_softcap(float32x4_t x, float32x4_t s_before, float32x4_t s_after) {
-    return vmulq_f32(s_after, ggml_v_tanh(vmulq_f32(x, s_before)));
-    //const float32x4_t one = vdupq_n_f32(1.0f);
-    //const float32x4_t two_x = vmulq_f32(x, s_before);
-    //const float32x4_t exp_two_x = ggml_v_expf(two_x);
-    //const float32x4_t th = vdivq_f32(vsubq_f32(exp_two_x, one), vaddq_f32(exp_two_x, one));
-    //return vmulq_f32(th, s_after);
-}
-
 // Slower than lookup on my M2-Max
 inline static float32x4_t ggml_v_gelu(float32x4_t x, float32x4_t c1, float32x4_t c2) {
     const float32x4_t one = vdupq_n_f32(1.0f);
@@ -845,13 +836,6 @@ inline static __m512 ggml_v_tanh(__m512 x) {
     return _mm512_mask_blend_ps(mask, res, one);
 }
 
-inline static __m512 ggml_v_softcap(__m512 x, __m512 s_before, __m512 s_after) {
-    const __m512 one = _mm512_set1_ps(1.0f);
-    const __m512 exp_two_x = ggml_v_expf(_mm512_mul_ps(x, s_before));
-    const __m512 th = _mm512_div_ps(_mm512_sub_ps(exp_two_x, one), _mm512_add_ps(exp_two_x, one));
-    return _mm512_mul_ps(th, s_after);
-}
-
 inline static __m512 ggml_v_gelu(__m512 x, __m512 c1, __m512 c2) {
     const __m512 one = _mm512_set1_ps(1.0f);
     __m512 arg = _mm512_fmadd_ps(x, _mm512_mul_ps(c1, x), one);
@@ -927,14 +911,6 @@ inline static __m256 ggml_v_tanh(__m256 x) {
     return _mm256_or_ps(_mm256_and_ps(mask, one), _mm256_andnot_ps(mask, res));
 }
 
-inline static __m256 ggml_v_softcap(__m256 x, float s_before, float s_after) {
-    return _mm256_mul_ps(_mm256_set1_ps(s_after), ggml_v_tanh(_mm256_mul_ps(x, _mm256_set1_ps(s_before))));
-    //const __m256 one = _mm256_set1_ps(1.0f);
-    //const __m256 exp_two_x = ggml_v_expf(_mm256_mul_ps(x, _mm256_set1_ps(2.f*s_before)));
-    //const __m256 th = _mm256_div_ps(_mm256_sub_ps(exp_two_x, one), _mm256_add_ps(exp_two_x, one));
-    //return _mm256_mul_ps(th, _mm256_set1_ps(s_after));
-}
-
 inline static __m256 ggml_v_gelu(__m256 x, __m256 c1, __m256 c2) {
     const __m256 one = _mm256_set1_ps(1.0f);
     const __m256 mask = _mm256_cmp_ps(x, _mm256_set1_ps(10.f), _CMP_GT_OQ);
@@ -1005,13 +981,6 @@ inline static __m128 ggml_v_tanh(__m128 x) {
     return _mm_div_ps(_mm_sub_ps(exp_two_x, one), _mm_add_ps(exp_two_x, one));
 }
 
-inline static __m128 ggml_v_softcap(__m128 x, float s_before, float s_after) {
-    const __m128 one = _mm_set1_ps(1.0f);
-    const __m128 exp_two_x = ggml_v_expf(_mm_mul_ps(x, _mm_set1_ps(2.f*s_before)));
-    const __m128 th = _mm_div_ps(_mm_sub_ps(exp_two_x, one), _mm_add_ps(exp_two_x, one));
-    return _mm_mul_ps(th, _mm_set1_ps(s_after));
-}
-
 #endif // __ARM_NEON / __AVX2__ / __SSE2__
 
 inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
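
Each removed ggml_v_softcap variant above computes the same elementwise softcap, s_after * tanh(s_before * x); only the instruction set differs. The AVX512 and SSE paths evaluate tanh(z) as (e^(2z) - 1)/(e^(2z) + 1), which is why their callers fold a factor of 2 into s_before. A minimal scalar sketch of the operation (the function name is illustrative, not taken from the original file):

    #include <math.h>

    // Reference for the vectorized helpers removed above:
    // softcap(x) = s_after * tanh(s_before * x)
    static inline float softcap_ref(float x, float s_before, float s_after) {
        return s_after * tanhf(s_before * x);
    }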
@@ -1140,72 +1109,6 @@ static void ggml_vec_tanh_f32(const int n, float * y, const float * x) {
     }
 }
 
-static void ggml_vec_cpy_softcap_f32(const int n, const float * x, float * y, float s_before, float s_after) {
-    int i = 0;
-#if defined(__AVX512F__) && defined(__AVX512DQ__)
-    __m512 vs_before = _mm512_set1_ps(2.f*s_before);
-    __m512 vs_after = _mm512_set1_ps(s_after);
-    for (; i + 15 < n; i += 16) {
-        _mm512_storeu_ps(y + i, ggml_v_softcap(_mm512_loadu_ps(x + i), vs_before, vs_after));
-    }
-#elif defined(__AVX2__) && defined(__FMA__)
-    for (; i + 7 < n; i += 8) {
-        _mm256_storeu_ps(y + i, ggml_v_softcap(_mm256_loadu_ps(x + i), s_before, s_after));
-    }
-#elif defined(__SSE2__)
-    for (; i + 3 < n; i += 4) {
-        _mm_storeu_ps(y + i, ggml_v_softcap(_mm_loadu_ps(x + i), s_before, s_after));
-    }
-#elif defined(__ARM_NEON) && defined(__aarch64__)
-    float32x4_t vs_before = vdupq_n_f32(s_before);
-    float32x4_t vs_after = vdupq_n_f32(s_after);
-    for (; i + 3 < n; i += 4) {
-        vst1q_f32(y + i, ggml_v_softcap(vld1q_f32(x + i), vs_before, vs_after));
-    }
-#endif
-    for (; i < n; ++i) {
-        y[i] = s_after*tanhf(x[i]*s_before);
-    }
-}
-
-static void ggml_vec_softcap_f32(const int n, float * x, float s_before, float s_after) {
-    int i = 0;
-#if defined(__AVX512F__) && defined(__AVX512DQ__)
-    __m512 vs_before = _mm512_set1_ps(2.f*s_before);
-    __m512 vs_after = _mm512_set1_ps(s_after);
-    //for (; i + 63 < n; i += 64) {
-    //    __m512 x1 = _mm512_loadu_ps(x + i);
-    //    __m512 x2 = _mm512_loadu_ps(x + i + 16);
-    //    __m512 x3 = _mm512_loadu_ps(x + i + 32);
-    //    __m512 x4 = _mm512_loadu_ps(x + i + 48);
-    //    _mm512_storeu_ps(x + i + 0, ggml_v_softcap(x1, vs_before, vs_after));
-    //    _mm512_storeu_ps(x + i + 16, ggml_v_softcap(x2, vs_before, vs_after));
-    //    _mm512_storeu_ps(x + i + 32, ggml_v_softcap(x3, vs_before, vs_after));
-    //    _mm512_storeu_ps(x + i + 48, ggml_v_softcap(x4, vs_before, vs_after));
-    //}
-    for (; i + 15 < n; i += 16) {
-        _mm512_storeu_ps(x + i, ggml_v_softcap(_mm512_loadu_ps(x + i), vs_before, vs_after));
-    }
-#elif defined(__AVX2__) && defined(__FMA__)
-    for (; i + 7 < n; i += 8) {
-        _mm256_storeu_ps(x + i, ggml_v_softcap(_mm256_loadu_ps(x + i), s_before, s_after));
-    }
-#elif defined(__SSE2__)
-    for (; i + 3 < n; i += 4) {
-        _mm_storeu_ps(x + i, ggml_v_softcap(_mm_loadu_ps(x + i), s_before, s_after));
-    }
-#elif defined(__ARM_NEON) && defined(__aarch64__)
-    float32x4_t vs_before = vdupq_n_f32(s_before);
-    float32x4_t vs_after = vdupq_n_f32(s_after);
-    for (; i + 3 < n; i += 4) {
-        vst1q_f32(x + i, ggml_v_softcap(vld1q_f32(x + i), vs_before, vs_after));
-    }
-#endif
-    for (; i < n; ++i) {
-        x[i] = s_after*tanhf(x[i]*s_before);
-    }
-}
-
 //
 // On my AVX512 (Ryzen-7950X) and AVX2 (Ryzen-5975WX) computing gelu directly
 // via SIMD instructions is faster than the fp16-based lookup table.
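
The final hunk removes the two array-level wrappers as well; both apply y[i] = s_after*tanhf(x[i]*s_before), with ggml_vec_cpy_softcap_f32 writing to a separate destination buffer and ggml_vec_softcap_f32 working in place. A hypothetical caller sketch, not taken from this commit: choosing s_before = 1/cap and s_after = cap squashes every element into the open interval (-cap, cap).

    // Illustrative use of the removed in-place wrapper; 'cap' and the
    // function name are assumptions for this sketch, not from the commit.
    static void softcap_row(int n, float * x, float cap) {
        // x[i] = cap * tanhf(x[i] / cap), so every x[i] ends up in (-cap, cap)
        ggml_vec_softcap_f32(n, x, 1.0f/cap, cap);
    }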