
Commit 5688720

Merge pull request #16 from esolithe/concedo_experimental
Concedo experimental
2 parents (60fb5f5 + 748dfcc), commit 5688720

31 files changed: +613 additions, -278 deletions

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -96,7 +96,7 @@ if (LLAMA_CUBLAS)
 
     add_compile_definitions(GGML_USE_LLAMAFILE)
     add_compile_definitions(GGML_USE_CUDA)
-    add_compile_definitions(SD_USE_CUBLAS)
+    add_compile_definitions(SD_USE_CUDA)
 
     if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
         add_compile_definitions(GGML_CUDA_F16)
@@ -177,7 +177,7 @@ if (LLAMA_HIPBLAS)
     list(APPEND GGML_SOURCES_ROCM ${SRCS})
     file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/mmq*.cu")
     list(APPEND GGML_SOURCES_ROCM ${SRCS})
-    add_compile_definitions(GGML_USE_HIP GGML_USE_CUDA SD_USE_CUBLAS)
+    add_compile_definitions(GGML_USE_HIP GGML_USE_CUDA SD_USE_CUDA)
     add_library(ggml-rocm ${GGML_SOURCES_CUDA})
 
     file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")

Makefile

Lines changed: 3 additions & 3 deletions
@@ -83,7 +83,7 @@ CLBLAST_FLAGS = -DGGML_USE_CLBLAST
 FAILSAFE_FLAGS = -DUSE_FAILSAFE
 VULKAN_FLAGS = -DGGML_USE_VULKAN -DSD_USE_VULKAN
 ifdef LLAMA_CUBLAS
-CUBLAS_FLAGS = -DGGML_USE_CUDA -DSD_USE_CUBLAS
+CUBLAS_FLAGS = -DGGML_USE_CUDA -DSD_USE_CUDA
 else
 CUBLAS_FLAGS =
 endif
@@ -177,7 +177,7 @@ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/templat
 OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
 
 ifdef LLAMA_CUBLAS
-CUBLAS_FLAGS = -DGGML_USE_CUDA -DSD_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+CUBLAS_FLAGS = -DGGML_USE_CUDA -DSD_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 CUBLASLD_FLAGS = -lcuda -lcublas -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/local/cuda/targets/sbsa-linux/lib -L/usr/lib/wsl/lib
 CUBLAS_OBJS = ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
 CUBLAS_OBJS += $(patsubst %.cu,%.o,$(filter-out ggml/src/ggml-cuda/ggml-cuda.cu, $(wildcard ggml/src/ggml-cuda/*.cu)))
@@ -256,7 +256,7 @@ ifdef DETECT_ROCWMMA
 HIPFLAGS += -DGGML_HIP_ROCWMMA_FATTN -I$(dir $(DETECT_ROCWMMA))
 endif
 
-HIPFLAGS += -DGGML_USE_HIP -DGGML_HIP_NO_VMM -DGGML_USE_CUDA -DSD_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
+HIPFLAGS += -DGGML_USE_HIP -DGGML_HIP_NO_VMM -DGGML_USE_CUDA -DSD_USE_CUDA $(shell $(ROCM_PATH)/bin/hipconfig -C)
 HIPLDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
 HIPLDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
 HIPLDFLAGS += -lhipblas -lamdhip64 -lrocblas
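
These build changes only rename the preprocessor define passed to the compiler: everywhere the CUDA and HIP paths used to define SD_USE_CUBLAS they now define SD_USE_CUDA, presumably to match the macro name the bundled stable-diffusion code now checks. A rough sketch of the kind of compile-time guard affected (the function below is hypothetical, not code from this commit):

    /* hypothetical sketch of a backend guard gated on the renamed macro;
     * builds that still pass -DSD_USE_CUBLAS would no longer take the CUDA branch */
    #include <stdio.h>

    static void sd_report_backend(void) {
    #if defined(SD_USE_CUDA)
        printf("stable-diffusion: CUDA backend enabled\n");
    #elif defined(SD_USE_VULKAN)
        printf("stable-diffusion: Vulkan backend enabled\n");
    #else
        printf("stable-diffusion: CPU backend\n");
    #endif
    }

The rename is applied once per backend (CUBLAS and HIP) in both CMakeLists.txt and the Makefile so every build path agrees on the new name.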

common/arg.cpp

Lines changed: 1 addition & 1 deletion
@@ -1679,7 +1679,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(

convert_hf_to_gguf.py

Lines changed: 2 additions & 2 deletions
@@ -2645,7 +2645,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)
 
 
-@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
+@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
 class Qwen2VLModel(TextModel):
     model_arch = gguf.MODEL_ARCH.QWEN2VL
 
@@ -2669,7 +2669,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
+@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
 class Qwen2VLVisionModel(VisionModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

expose.h

Lines changed: 1 addition & 0 deletions
@@ -70,6 +70,7 @@ struct load_model_inputs
     const int quant_k = 0;
     const int quant_v = 0;
     const bool check_slowness = false;
+    const bool swa_support = false;
     const bool quiet = false;
     const int debugmode = 0;
 };

ggml/include/ggml.h

Lines changed: 12 additions & 1 deletion
@@ -534,14 +534,15 @@ extern "C" {
         GGML_UNARY_OP_STEP,
         GGML_UNARY_OP_TANH,
         GGML_UNARY_OP_ELU,
-        GGML_UNARY_OP_RELU,
         GGML_UNARY_OP_SIGMOID,
         GGML_UNARY_OP_GELU,
+        GGML_UNARY_OP_GELU_ERF,
         GGML_UNARY_OP_GELU_QUICK,
         GGML_UNARY_OP_SILU,
         GGML_UNARY_OP_HARDSWISH,
         GGML_UNARY_OP_HARDSIGMOID,
         GGML_UNARY_OP_EXP,
+        GGML_UNARY_OP_RELU,
 
         GGML_UNARY_OP_COUNT,
     };
@@ -1037,6 +1038,16 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // GELU using erf (error function) when possible
+    // some backends may fallback to approximation based on Abramowitz and Stegun formula
+    GGML_API struct ggml_tensor * ggml_gelu_erf(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_gelu_erf_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_gelu_quick(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
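
For orientation, here is a minimal, self-contained sketch of how the new operator could be called through the public API. It assumes ggml_graph_compute_with_ctx is available from the CPU backend header (ggml-cpu.h) in this tree; the tensor size and thread count are arbitrary.

    /* minimal sketch: run the exact (erf-based) GELU on a small F32 tensor */
    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-cpu.h"   /* assumed location of ggml_graph_compute_with_ctx */

    int main(void) {
        struct ggml_init_params ip = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_tensor * y = ggml_gelu_erf(ctx, x);   /* op added by this commit */

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, y);

        for (int i = 0; i < 8; ++i) {
            ((float *) x->data)[i] = 1.0f;                 /* fill the input */
        }
        ggml_graph_compute_with_ctx(ctx, gf, 4);

        /* GELU_erf(1.0) = 0.5 * (1 + erf(1/sqrt(2))) ~= 0.8413 */
        printf("%f\n", ((float *) y->data)[0]);

        ggml_free(ctx);
        return 0;
    }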

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 1 addition & 0 deletions
@@ -2216,6 +2216,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 } break;
 
             case GGML_UNARY_OP_GELU:
+            case GGML_UNARY_OP_GELU_ERF:
             case GGML_UNARY_OP_GELU_QUICK:
             case GGML_UNARY_OP_SILU:
                 {

ggml/src/ggml-cpu/ops.cpp

Lines changed: 107 additions & 0 deletions
@@ -2691,6 +2691,109 @@ static void ggml_compute_forward_gelu(
     }
 }
 
+// ggml_compute_forward_gelu_erf
+
+static void ggml_compute_forward_gelu_erf_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_gelu_erf_f32(nc,
+                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            GGML_UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_gelu_erf_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_gelu_erf_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float v = GGML_FP16_TO_FP32(x);
+            GGML_UNUSED(v);
+            assert(!isnan(v));
+            assert(!isinf(v));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_gelu_erf(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_gelu_erf_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_gelu_erf_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 // ggml_compute_forward_gelu_quick
 
 static void ggml_compute_forward_gelu_quick_f32(
@@ -7749,6 +7852,10 @@ void ggml_compute_forward_unary(
             {
                 ggml_compute_forward_gelu(params, dst);
             } break;
+        case GGML_UNARY_OP_GELU_ERF:
+            {
+                ggml_compute_forward_gelu_erf(params, dst);
+            } break;
         case GGML_UNARY_OP_GELU_QUICK:
             {
                 ggml_compute_forward_gelu_quick(params, dst);
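
The two new forward functions reuse the same row-partitioning pattern as the existing unary ops: each of the nth worker threads takes a contiguous block of dr = (nr + nth - 1)/nth rows (a ceiling division) and applies the vectorized helper only to its own range [ir0, ir1). A worked example with arbitrary numbers: for nr = 10 rows and nth = 4 threads, dr = (10 + 3)/4 = 3, so the threads cover rows [0,3), [3,6), [6,9), and [9,10); the MIN(ir0 + dr, nr) clamp shortens the last range so no thread runs past the end of the tensor.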

ggml/src/ggml-cpu/vec.h

Lines changed: 16 additions & 0 deletions
@@ -428,6 +428,7 @@ inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp
 static const float GELU_COEF_A = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
 static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+static const float SQRT_2_INV = 0.70710678118654752440084436210484f;
 
 inline static float ggml_gelu_f32(float x) {
     return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
@@ -440,6 +441,14 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
     }
 }
 
+inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        float xi = GGML_FP16_TO_FP32(x[i]);
+        float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
+        y[i] = GGML_FP32_TO_FP16(res);
+    }
+}
+
 #ifdef GGML_GELU_FP16
 inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
     uint16_t t;
@@ -463,6 +472,13 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
 }
 #endif
 
+inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
+    for (int i = 0; i < n; ++i) {
+        float xi = x[i];
+        y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
+    }
+}
+
 inline static float ggml_gelu_quick_f32(float x) {
     return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
 }
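
In the notation of the constants above (SQRT_2_INV = 1/sqrt(2), SQRT_2_OVER_PI = sqrt(2/pi), GELU_COEF_A = 0.044715), the new helpers evaluate the exact GELU, while the existing ggml_gelu_f32 keeps the tanh approximation:

    \mathrm{GELU}_{\mathrm{erf}}(x) = \tfrac{1}{2}\, x \left(1 + \operatorname{erf}\!\left(\tfrac{x}{\sqrt{2}}\right)\right)

    \mathrm{GELU}_{\mathrm{tanh}}(x) \approx \tfrac{1}{2}\, x \left(1 + \tanh\!\left(\sqrt{\tfrac{2}{\pi}}\,\left(x + 0.044715\, x^{3}\right)\right)\right)

The F16 variant converts each element to F32, applies the same erf formula, and converts the result back, so both precisions share one definition.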

ggml/src/ggml-metal/ggml-metal.m

Lines changed: 24 additions & 0 deletions
@@ -149,6 +149,8 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
     GGML_METAL_KERNEL_TYPE_SIGMOID,
     GGML_METAL_KERNEL_TYPE_GELU,
     GGML_METAL_KERNEL_TYPE_GELU_4,
+    GGML_METAL_KERNEL_TYPE_GELU_ERF,
+    GGML_METAL_KERNEL_TYPE_GELU_ERF_4,
     GGML_METAL_KERNEL_TYPE_GELU_QUICK,
     GGML_METAL_KERNEL_TYPE_GELU_QUICK_4,
     GGML_METAL_KERNEL_TYPE_SILU,
@@ -1103,6 +1105,8 @@ @implementation GGMLMetalClass
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIGMOID, sigmoid, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_4, gelu_4, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_ERF, gelu_erf, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_ERF_4, gelu_erf_4, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, gelu_quick_4, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true);
@@ -1613,6 +1617,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
         case GGML_UNARY_OP_RELU:
         case GGML_UNARY_OP_SIGMOID:
         case GGML_UNARY_OP_GELU:
+        case GGML_UNARY_OP_GELU_ERF:
         case GGML_UNARY_OP_GELU_QUICK:
         case GGML_UNARY_OP_SILU:
         case GGML_UNARY_OP_ELU:
@@ -2251,6 +2256,25 @@ static bool ggml_metal_encode_node(
 
                 [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
             } break;
+        case GGML_UNARY_OP_GELU_ERF:
+            {
+                int64_t n = ggml_nelements(dst);
+
+                id<MTLComputePipelineState> pipeline = nil;
+
+                if (n % 4 == 0) {
+                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_ERF_4].pipeline;
+                    n /= 4;
+                } else {
+                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_ERF].pipeline;
+                }
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
         case GGML_UNARY_OP_GELU_QUICK:
             {
                 int64_t n = ggml_nelements(dst);
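
The new encode case mirrors the existing GELU one: when the element count n is divisible by 4 it selects the GELU_ERF_4 pipeline and divides n by 4, presumably because that kernel handles four elements per invocation, then dispatches n threadgroups of a single thread each. As a worked example with an arbitrary size, a tensor with 4096 elements would be launched as MTLSizeMake(1024, 1, 1) threadgroups via the _4 kernel, while a 4097-element tensor falls back to the scalar GELU_ERF kernel with 4097 threadgroups.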
