
Commit a8f5ba7

Revert "Reapply "IK Softcap op""
1 parent 653e576 commit a8f5ba7

12 files changed: +10 additions, −294 deletions

ggml/include/ggml.h

Lines changed: 0 additions & 14 deletions
@@ -634,7 +634,6 @@ extern "C" {
         GGML_OP_TIMESTEP_EMBEDDING,
         GGML_OP_ARGSORT,
         GGML_OP_LEAKY_RELU,
-        GGML_OP_SOFTCAP,
 
         GGML_OP_FLASH_ATTN_EXT,
         GGML_OP_FLASH_ATTN_BACK,
@@ -1464,19 +1463,6 @@ extern "C" {
             float                 s,
             float                 b);
 
-    GGML_API struct ggml_tensor * ggml_softcap(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            float                 s_before,
-            float                 s_after);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_softcap_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            float                 s_before,
-            float                 s_after);
-
     // b -> view(a,offset,nb1,nb2,3), return modified a
     GGML_API struct ggml_tensor * ggml_set(
             struct ggml_context * ctx,
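
For reference, the removed API applied an elementwise soft cap, y = s_after * tanh(s_before * x), to a tensor. A minimal sketch of how a caller could have built the op before this revert; the wrapper name, the cap value of 50, and the choice of tensor are illustrative assumptions, not taken from this commit:

    // Hypothetical pre-revert usage of the removed ggml_softcap API.
    // Caps values smoothly into (-cap, +cap): y = cap * tanh(x / cap).
    static struct ggml_tensor * cap_logits(struct ggml_context * ctx,
                                           struct ggml_tensor  * logits) {
        const float cap = 50.0f;                      // illustrative cap value
        return ggml_softcap(ctx, logits, 1.0f/cap, cap);
    }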

ggml/src/ggml-alloc.c

Lines changed: 0 additions & 1 deletion
@@ -45,7 +45,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_RMS_NORM_BACK:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_SOFT_MAX_BACK:
-        case GGML_OP_SOFTCAP:
             return true;
 
         default:
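
ggml_op_can_inplace lists the ops whose destination is allowed to reuse the source buffer during graph allocation; softcap qualified because it is purely elementwise, so output element i depends only on input element i. A small illustration of why aliasing is safe for such ops (the helper below is illustrative, not part of ggml):

    #include <math.h>

    // Safe even when dst == src: each output element reads only its own input
    // element before overwriting it, so an elementwise op can run in place.
    static void elementwise_tanh(int n, const float * src, float * dst) {
        for (int i = 0; i < n; ++i) {
            dst[i] = tanhf(src[i]);
        }
    }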

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 0 additions & 5 deletions
@@ -2542,10 +2542,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_scale(params, tensor);
             } break;
-        case GGML_OP_SOFTCAP:
-            {
-                ggml_compute_forward_softcap(params, tensor);
-            } break;
         case GGML_OP_SET:
             {
                 ggml_compute_forward_set(params, tensor);
@@ -2998,7 +2994,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = 1; //TODO
             } break;
-        case GGML_OP_SOFTCAP:
         case GGML_OP_SOFT_MAX:
             {
                 n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));

ggml/src/ggml-cpu/ops.cpp

Lines changed: 0 additions & 73 deletions
@@ -5486,79 +5486,6 @@ void ggml_compute_forward_scale(
     }
 }
 
-// ggml_compute_forward_softcap
-
-static void ggml_compute_forward_softcap_f32(
-        const ggml_compute_params * params,
-        ggml_tensor * dst) {
-
-    const ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    // scale factor
-    float val[2];
-    memcpy(val, dst->op_params, sizeof(val));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    const size_t nb01 = src0->nb[1];
-
-    const size_t nb1 = dst->nb[1];
-
-    //if (ith == 0) printf("%s: nc = %d, nr = %d, nth = %d, params = %g, %g, %d\n", __func__, nc, nr, nth, val[0], val[1], dst->data == src0->data ? 1 : 0);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        /* if (dst->data != src0->data) {
-            // src0 is same shape as dst => same indices
-            memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float)); */
-        float * dst_row = (float *) ((char *) dst->data + i1*nb1);
-        if (dst->data == src0->data) {
-            ggml_vec_softcap_f32(nc, dst_row, val[0], val[1]);
-        } else {
-            const float * src_row = (const float *)((const char *)src0->data + i1*nb01);
-            ggml_vec_cpy_softcap_f32(nc, src_row, dst_row, val[0], val[1]);
-            // TODO: better implementation
-            float * row = (float *) ((char *) dst->data + i1*nb1);
-            ggml_vec_softcap_f32(nc, row, val[0], val[1]);
-            //ggml_vec_scale_f32(nc, row, val[0]);
-            //ggml_vec_tanh_f32(nc, row, row);
-            //ggml_vec_scale_f32(nc, row, val[1]);
-        }
-    }
-}
-
-void ggml_compute_forward_softcap(
-        const ggml_compute_params * params,
-        ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_softcap_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            }
-    }
-}
-
 // ggml_compute_forward_set
 
 static void ggml_compute_forward_set_f32(
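
In scalar terms, the removed forward pass computed dst[i] = s_after * tanh(s_before * src0[i]), reading the two scale factors from op_params and splitting rows evenly across threads (dr = (nr + nth - 1)/nth rows per thread). A minimal single-threaded reference, assuming contiguous float data; the name and flat-array interface are illustrative, not part of the original code:

    #include <math.h>

    // Reference for what GGML_OP_SOFTCAP computed on the CPU backend:
    // y[i] = s_after * tanhf(s_before * x[i]); x and y may alias (in-place case).
    static void softcap_ref_f32(int n, const float * x, float * y,
                                float s_before, float s_after) {
        for (int i = 0; i < n; ++i) {
            y[i] = s_after * tanhf(s_before * x[i]);
        }
    }

Note that in the removed non-inplace branch, ggml_vec_cpy_softcap_f32 already writes capped values into the destination row and ggml_vec_softcap_f32 is then applied to the same row again (see the "TODO: better implementation" comment), which appears to cap that path twice; the in-place branch applies the transform once.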

ggml/src/ggml-cpu/ops.h

Lines changed: 0 additions & 1 deletion
@@ -66,7 +66,6 @@ void ggml_compute_forward_diag_mask_inf(const struct ggml_compute_params * param
 void ggml_compute_forward_diag_mask_zero(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_soft_max(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_soft_max_ext_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_softcap(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_rope(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_rope_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struct ggml_tensor * dst);

ggml/src/ggml-cpu/vec.h

Lines changed: 0 additions & 97 deletions
@@ -771,15 +771,6 @@ inline static float32x4_t ggml_v_tanh(float32x4_t x) {
     //return vdivq_f32(vsubq_f32(exp_two_x, one), vaddq_f32(exp_two_x, one));
 }
 
-inline static float32x4_t ggml_v_softcap(float32x4_t x, float32x4_t s_before, float32x4_t s_after) {
-    return vmulq_f32(s_after, ggml_v_tanh(vmulq_f32(x, s_before)));
-    //const float32x4_t one = vdupq_n_f32(1.0f);
-    //const float32x4_t two_x = vmulq_f32(x, s_before);
-    //const float32x4_t exp_two_x = ggml_v_expf(two_x);
-    //const float32x4_t th = vdivq_f32(vsubq_f32(exp_two_x, one), vaddq_f32(exp_two_x, one));
-    //return vmulq_f32(th, s_after);
-}
-
 // Slower than lookup on my M2-Max
 inline static float32x4_t ggml_v_gelu(float32x4_t x, float32x4_t c1, float32x4_t c2) {
     const float32x4_t one = vdupq_n_f32(1.0f);
@@ -845,13 +836,6 @@ inline static __m512 ggml_v_tanh(__m512 x) {
     return _mm512_mask_blend_ps(mask, res, one);
 }
 
-inline static __m512 ggml_v_softcap(__m512 x, __m512 s_before, __m512 s_after) {
-    const __m512 one = _mm512_set1_ps(1.0f);
-    const __m512 exp_two_x = ggml_v_expf(_mm512_mul_ps(x, s_before));
-    const __m512 th = _mm512_div_ps(_mm512_sub_ps(exp_two_x, one), _mm512_add_ps(exp_two_x, one));
-    return _mm512_mul_ps(th, s_after);
-}
-
 inline static __m512 ggml_v_gelu(__m512 x, __m512 c1, __m512 c2) {
     const __m512 one = _mm512_set1_ps(1.0f);
     __m512 arg = _mm512_fmadd_ps(x, _mm512_mul_ps(c1, x), one);
@@ -927,14 +911,6 @@ inline static __m256 ggml_v_tanh(__m256 x) {
     return _mm256_or_ps(_mm256_and_ps(mask, one), _mm256_andnot_ps(mask, res));
 }
 
-inline static __m256 ggml_v_softcap(__m256 x, float s_before, float s_after) {
-    return _mm256_mul_ps(_mm256_set1_ps(s_after), ggml_v_tanh(_mm256_mul_ps(x, _mm256_set1_ps(s_before))));
-    //const __m256 one = _mm256_set1_ps(1.0f);
-    //const __m256 exp_two_x = ggml_v_expf(_mm256_mul_ps(x, _mm256_set1_ps(2.f*s_before)));
-    //const __m256 th = _mm256_div_ps(_mm256_sub_ps(exp_two_x, one), _mm256_add_ps(exp_two_x, one));
-    //return _mm256_mul_ps(th, _mm256_set1_ps(s_after));
-}
-
 inline static __m256 ggml_v_gelu(__m256 x, __m256 c1, __m256 c2) {
     const __m256 one = _mm256_set1_ps(1.0f);
     const __m256 mask = _mm256_cmp_ps(x, _mm256_set1_ps(10.f), _CMP_GT_OQ);
@@ -1005,13 +981,6 @@ inline static __m128 ggml_v_tanh(__m128 x) {
     return _mm_div_ps(_mm_sub_ps(exp_two_x, one), _mm_add_ps(exp_two_x, one));
 }
 
-inline static __m128 ggml_v_softcap(__m128 x, float s_before, float s_after) {
-    const __m128 one = _mm_set1_ps(1.0f);
-    const __m128 exp_two_x = ggml_v_expf(_mm_mul_ps(x, _mm_set1_ps(2.f*s_before)));
-    const __m128 th = _mm_div_ps(_mm_sub_ps(exp_two_x, one), _mm_add_ps(exp_two_x, one));
-    return _mm_mul_ps(th, _mm_set1_ps(s_after));
-}
-
 #endif // __ARM_NEON / __AVX2__ / __SSE2__
 
 inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
@@ -1140,72 +1109,6 @@ static void ggml_vec_tanh_f32(const int n, float * y, const float * x) {
     }
 }
 
-static void ggml_vec_cpy_softcap_f32(const int n, const float * x, float * y, float s_before, float s_after) {
-    int i = 0;
-#if defined(__AVX512F__) && defined(__AVX512DQ__)
-    __m512 vs_before = _mm512_set1_ps(2.f*s_before);
-    __m512 vs_after = _mm512_set1_ps(s_after);
-    for (; i + 15 < n; i += 16) {
-        _mm512_storeu_ps(y + i, ggml_v_softcap(_mm512_loadu_ps(x + i), vs_before, vs_after));
-    }
-#elif defined(__AVX2__) && defined(__FMA__)
-    for (; i + 7 < n; i += 8) {
-        _mm256_storeu_ps(y + i, ggml_v_softcap(_mm256_loadu_ps(x + i), s_before, s_after));
-    }
-#elif defined(__SSE2__)
-    for (; i + 3 < n; i += 4) {
-        _mm_storeu_ps(y + i, ggml_v_softcap(_mm_loadu_ps(x + i), s_before, s_after));
-    }
-#elif defined(__ARM_NEON) && defined(__aarch64__)
-    float32x4_t vs_before = vdupq_n_f32(s_before);
-    float32x4_t vs_after = vdupq_n_f32(s_after);
-    for (; i + 3 < n; i += 4) {
-        vst1q_f32(y + i, ggml_v_softcap(vld1q_f32(x + i), vs_before, vs_after));
-    }
-#endif
-    for (; i < n; ++i) {
-        y[i] = s_after*tanhf(x[i]*s_before);
-    }
-}
-
-static void ggml_vec_softcap_f32(const int n, float * x, float s_before, float s_after) {
-    int i = 0;
-#if defined(__AVX512F__) && defined(__AVX512DQ__)
-    __m512 vs_before = _mm512_set1_ps(2.f*s_before);
-    __m512 vs_after = _mm512_set1_ps(s_after);
-    //for (; i + 63 < n; i += 64) {
-    //    __m512 x1 = _mm512_loadu_ps(x + i);
-    //    __m512 x2 = _mm512_loadu_ps(x + i + 16);
-    //    __m512 x3 = _mm512_loadu_ps(x + i + 32);
-    //    __m512 x4 = _mm512_loadu_ps(x + i + 48);
-    //    _mm512_storeu_ps(x + i +  0, ggml_v_softcap(x1, vs_before, vs_after));
-    //    _mm512_storeu_ps(x + i + 16, ggml_v_softcap(x2, vs_before, vs_after));
-    //    _mm512_storeu_ps(x + i + 32, ggml_v_softcap(x3, vs_before, vs_after));
-    //    _mm512_storeu_ps(x + i + 48, ggml_v_softcap(x4, vs_before, vs_after));
-    //}
-    for (; i + 15 < n; i += 16) {
-        _mm512_storeu_ps(x + i, ggml_v_softcap(_mm512_loadu_ps(x + i), vs_before, vs_after));
-    }
-#elif defined(__AVX2__) && defined(__FMA__)
-    for (; i + 7 < n; i += 8) {
-        _mm256_storeu_ps(x + i, ggml_v_softcap(_mm256_loadu_ps(x + i), s_before, s_after));
-    }
-#elif defined(__SSE2__)
-    for (; i + 3 < n; i += 4) {
-        _mm_storeu_ps(x + i, ggml_v_softcap(_mm_loadu_ps(x + i), s_before, s_after));
-    }
-#elif defined(__ARM_NEON) && defined(__aarch64__)
-    float32x4_t vs_before = vdupq_n_f32(s_before);
-    float32x4_t vs_after = vdupq_n_f32(s_after);
-    for (; i + 3 < n; i += 4) {
-        vst1q_f32(x + i, ggml_v_softcap(vld1q_f32(x + i), vs_before, vs_after));
-    }
-#endif
-    for (; i < n; ++i) {
-        x[i] = s_after*tanhf(x[i]*s_before);
-    }
-}
-
 //
 // On my AVX512 (Ryzen-7950X) and AVX2 (Ryzen-5975WX) computing gelu directly
 // via SIMD instructions is faster than the fp16-based lookup table.
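
The AVX-512 and SSE variants removed above compute tanh through the identity tanh(z) = (exp(2z) - 1) / (exp(2z) + 1): the AVX-512 callers fold the factor of 2 into the broadcast constant (2.f*s_before), the __m128 helper doubles s_before internally, and the AVX2 and NEON variants call ggml_v_tanh directly. A quick scalar check of that identity (illustrative only, not part of the removed code):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        // tanh(z) == (exp(2z) - 1) / (exp(2z) + 1), the form used by the
        // removed AVX-512/SSE ggml_v_softcap variants.
        const float z   = 0.7f;
        const float e2z = expf(2.0f*z);
        printf("tanhf = %.7f  identity = %.7f\n", tanhf(z), (e2z - 1.0f)/(e2z + 1.0f));
        return 0;
    }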

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 0 additions & 6 deletions
@@ -50,8 +50,6 @@ bool g_mul_mat_q = true;
 #include "ggml-cuda/set-rows.cuh"
 #include "ggml.h"
 
-#include "ggml-cuda/softcap.cuh"
-
 // #include "ggml-cuda/iqk_mmvq.cuh"
 
 #include <algorithm>
@@ -2625,9 +2623,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_SCALE:
             ggml_cuda_op_scale(ctx, dst);
             break;
-        case GGML_OP_SOFTCAP:
-            ggml_cuda_op_softcap(ctx, dst);
-            break;
         case GGML_OP_SQR:
             ggml_cuda_op_sqr(ctx, dst);
             break;
@@ -3673,7 +3668,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_MUL:
         case GGML_OP_DIV:
         case GGML_OP_SCALE:
-        case GGML_OP_SOFTCAP:
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
         case GGML_OP_SIN:

ggml/src/ggml-cuda/softcap.cu

Lines changed: 0 additions & 32 deletions
This file was deleted.

ggml/src/ggml-cuda/softcap.cuh

Lines changed: 0 additions & 5 deletions
This file was deleted.
