Skip to content

Commit c16d672

Browse files
committed
Merge commit '9230dbe2c757e2d5071329095727d0fa9d4b85c4' into concedo_experimental
# Conflicts:
#	ggml/src/ggml-cpu/CMakeLists.txt
#	src/llama-graph.cpp
#	tools/server/README.md
2 parents b59b5db + 9230dbe commit c16d672

29 files changed

+2680
-2471
lines changed

common/arg.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3212,6 +3212,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
32123212
params.speculative.model.path = value;
32133213
}
32143214
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
3215+
add_opt(common_arg(
3216+
{"-ctkd", "--cache-type-k-draft"}, "TYPE",
3217+
string_format(
3218+
"KV cache data type for K for the draft model\n"
3219+
"allowed values: %s\n"
3220+
"(default: %s)",
3221+
get_all_kv_cache_types().c_str(),
3222+
ggml_type_name(params.speculative.cache_type_k)
3223+
),
3224+
[](common_params & params, const std::string & value) {
3225+
params.speculative.cache_type_k = kv_cache_type_from_str(value);
3226+
}
3227+
).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
3228+
add_opt(common_arg(
3229+
{"-ctvd", "--cache-type-v-draft"}, "TYPE",
3230+
string_format(
3231+
"KV cache data type for V for the draft model\n"
3232+
"allowed values: %s\n"
3233+
"(default: %s)",
3234+
get_all_kv_cache_types().c_str(),
3235+
ggml_type_name(params.speculative.cache_type_v)
3236+
),
3237+
[](common_params & params, const std::string & value) {
3238+
params.speculative.cache_type_v = kv_cache_type_from_str(value);
3239+
}
3240+
).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
32153241

32163242
add_opt(common_arg(
32173243
{"-mv", "--model-vocoder"}, "FNAME",

common/common.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,9 @@ struct common_params_speculative {
195195
float p_split = 0.1f; // speculative decoding split probability
196196
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
197197

198+
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
199+
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
200+
198201
struct cpu_params cpuparams;
199202
struct cpu_params cpuparams_batch;
200203

ggml/src/ggml-cpu/arch/arm/repack.cpp

Lines changed: 1014 additions & 1026 deletions
Large diffs are not rendered by default.

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 9 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -78,13 +78,8 @@
7878
#endif
7979
#if defined(__ARM_ARCH)
8080
struct ggml_arm_arch_features_type {
81-
int has_neon;
82-
int has_dotprod;
83-
int has_i8mm;
84-
int has_sve;
8581
int sve_cnt;
86-
int has_sme;
87-
} ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1};
82+
} ggml_arm_arch_features = { 0 };
8883
#endif
8984

9085

@@ -683,87 +678,15 @@ bool ggml_is_numa(void) {
683678

684679
#if defined(__linux__) && defined(__aarch64__)
685680
#include <sys/auxv.h>
686-
#elif defined(__APPLE__)
687-
#include <sys/sysctl.h>
688-
#endif
689-
690-
#if !defined(HWCAP2_I8MM)
691-
#define HWCAP2_I8MM (1 << 13)
692-
#endif
693-
694-
#if !defined(HWCAP2_SME)
695-
#define HWCAP2_SME (1 << 23)
696681
#endif
697682

698683
static void ggml_init_arm_arch_features(void) {
699-
#if defined(__linux__) && defined(__aarch64__)
700-
uint32_t hwcap = getauxval(AT_HWCAP);
701-
uint32_t hwcap2 = getauxval(AT_HWCAP2);
702-
703-
ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
704-
ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
705-
ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
706-
ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
707-
ggml_arm_arch_features.has_sme = !!(hwcap2 & HWCAP2_SME);
708-
709-
#if defined(__ARM_FEATURE_SVE)
684+
#if defined(__linux__) && defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
710685
ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
711686
#endif
712-
#elif defined(__APPLE__)
713-
int oldp = 0;
714-
size_t size = sizeof(oldp);
715-
if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
716-
oldp = 0;
717-
}
718-
ggml_arm_arch_features.has_neon = oldp;
719-
720-
if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) {
721-
oldp = 0;
722-
}
723-
ggml_arm_arch_features.has_dotprod = oldp;
724-
725-
if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
726-
oldp = 0;
727-
}
728-
ggml_arm_arch_features.has_i8mm = oldp;
729-
730-
if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) != 0) {
731-
oldp = 0;
732-
}
733-
ggml_arm_arch_features.has_sme = oldp;
734-
735-
ggml_arm_arch_features.has_sve = 0;
736-
ggml_arm_arch_features.sve_cnt = 0;
737-
#else
738-
// Run-time CPU feature detection not implemented for this platform, fallback to compile time
739-
#if defined(__ARM_NEON)
740-
ggml_arm_arch_features.has_neon = 1;
741-
#else
742-
ggml_arm_arch_features.has_neon = 0;
743-
#endif
744-
745-
#if defined(__ARM_FEATURE_MATMUL_INT8)
746-
ggml_arm_arch_features.has_i8mm = 1;
747-
#else
748-
ggml_arm_arch_features.has_i8mm = 0;
749-
#endif
750-
751-
#if defined(__ARM_FEATURE_SVE)
752-
ggml_arm_arch_features.has_sve = 1;
753-
ggml_arm_arch_features.sve_cnt = 16;
754-
#else
755-
ggml_arm_arch_features.has_sve = 0;
756-
ggml_arm_arch_features.sve_cnt = 0;
757-
#endif
758-
759-
#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_SME2)
760-
ggml_arm_arch_features.has_sme = 1;
761-
#else
762-
ggml_arm_arch_features.has_sme = 0;
763-
#endif
764-
#endif
765687
}
766-
#endif
688+
689+
#endif // __ARM_ARCH
767690

768691
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
769692
GGML_ASSERT(!ggml_get_no_alloc(ctx));
@@ -3472,31 +3395,31 @@ int ggml_cpu_has_vxe(void) {
34723395

34733396
int ggml_cpu_has_neon(void) {
34743397
#if defined(__ARM_ARCH) && defined(__ARM_NEON)
3475-
return ggml_arm_arch_features.has_neon;
3398+
return 1;
34763399
#else
34773400
return 0;
34783401
#endif
34793402
}
34803403

34813404
int ggml_cpu_has_dotprod(void) {
34823405
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
3483-
return ggml_arm_arch_features.has_dotprod;
3406+
return 1;
34843407
#else
34853408
return 0;
34863409
#endif
34873410
}
34883411

34893412
int ggml_cpu_has_sve(void) {
34903413
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
3491-
return ggml_arm_arch_features.has_sve;
3414+
return 1;
34923415
#else
34933416
return 0;
34943417
#endif
34953418
}
34963419

34973420
int ggml_cpu_has_matmul_int8(void) {
34983421
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8)
3499-
return ggml_arm_arch_features.has_i8mm;
3422+
return 1;
35003423
#else
35013424
return 0;
35023425
#endif
@@ -3512,7 +3435,7 @@ int ggml_cpu_get_sve_cnt(void) {
35123435

35133436
int ggml_cpu_has_sme(void) {
35143437
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SME)
3515-
return ggml_arm_arch_features.has_sme;
3438+
return 1;
35163439
#else
35173440
return 0;
35183441
#endif

ggml/src/ggml-cuda/conv2d-dw.cu

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
#include "conv2d-dw.cuh"
2+
3+
// Geometry and hyper-parameters of one depthwise 2D convolution.
// Plain ints only, so the struct is trivially copyable and can be built
// per-thread on the device from the kernel's scalar arguments.
struct conv_params {
    int in_w, in_h;          // input spatial extent
    int out_w, out_h;        // output spatial extent
    int kernel_w, kernel_h;  // filter spatial extent
    int stride_x, stride_y;
    int padding_x, padding_y;
    int dilation_x, dilation_y;
    int channels, batches;   // depthwise: one filter per channel
};
12+
13+
// Half-open ranges [y_min, y_max) x [x_min, x_max) of filter taps whose
// corresponding input coordinates fall inside the image (i.e. not in the
// zero-padding region), so the inner loops can skip out-of-bounds reads.
struct kernel_bounds {
    int y_min, y_max;
    int x_min, x_max;
};
17+
18+
// Clamp the filter-tap ranges for output position (out_x, out_y) so that every
// visited tap maps to a valid input coordinate. Uses ceiling division by the
// dilation to find the first/last tap that lands inside [0, in_*).
__device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int out_x, int out_y, const conv_params & params) {
    // Offset of the receptive field's origin relative to the input.
    const int base_y = params.padding_y - out_y * params.stride_y;
    const int base_x = params.padding_x - out_x * params.stride_x;

    kernel_bounds b;
    b.y_min = max(0, (base_y + params.dilation_y - 1) / params.dilation_y);
    b.y_max = min(params.kernel_h, (params.in_h + base_y + params.dilation_y - 1) / params.dilation_y);
    b.x_min = max(0, (base_x + params.dilation_x - 1) / params.dilation_x);
    b.x_max = min(params.kernel_w, (params.in_w + base_x + params.dilation_x - 1) / params.dilation_x);
    return b;
}
30+
31+
// Map an (output position, filter tap) pair back to the input coordinate it
// reads; may be negative or past the edge for taps in the padding region.
__device__ __forceinline__ int calculate_input_coord(int out_coord, int kern_coord, int stride, int dilation, int padding) {
    const int strided = out_coord * stride;
    const int dilated = kern_coord * dilation;
    return strided + dilated - padding;
}
34+
35+
struct whcn_layout {
36+
__device__ static int input_index(int n, int c, int y, int x, const conv_params & params) {
37+
return n * (params.channels * params.in_w * params.in_h) + c * params.in_w * params.in_h + y * params.in_w + x;
38+
}
39+
40+
__device__ static int kernel_index(int c, int ky, int kx, const conv_params & params) {
41+
return c * params.kernel_h * params.kernel_w + ky * params.kernel_w + kx;
42+
}
43+
44+
__device__ static int output_index(int n, int c, int y, int x, const conv_params & params) {
45+
return n * (params.channels * params.out_w * params.out_h) + c * params.out_w * params.out_h +
46+
y * params.out_w + x;
47+
}
48+
49+
__device__ static void unpack_indices(int global_idx, const conv_params & params, int & n, int & c, int & out_y,
50+
int & out_x) {
51+
out_x = global_idx % params.out_w;
52+
out_y = (global_idx / params.out_w) % params.out_h;
53+
c = (global_idx / (params.out_w * params.out_h)) % params.channels;
54+
n = global_idx / (params.out_w * params.out_h * params.channels);
55+
}
56+
};
57+
58+
struct cwhn_layout {
59+
__device__ static int input_index(int n, int c, int y, int x, const conv_params & params) {
60+
return n * (params.channels * params.in_w * params.in_h) + (y * params.in_w + x) * params.channels + c;
61+
}
62+
63+
__device__ static int kernel_index(int c, int ky, int kx, const conv_params & params) {
64+
return (ky * params.kernel_w + kx) * params.channels + c;
65+
}
66+
67+
__device__ static int output_index(int n, int c, int y, int x, const conv_params & params) {
68+
return n * (params.channels * params.out_w * params.out_h) + y * (params.out_w * params.channels) +
69+
x * params.channels + c;
70+
}
71+
72+
__device__ static void unpack_indices(int global_idx, const conv_params & params, int & n, int & c, int & out_y,
73+
int & out_x) {
74+
c = global_idx % params.channels;
75+
out_x = (global_idx / params.channels) % params.out_w;
76+
out_y = (global_idx / (params.channels * params.out_w)) % params.out_h;
77+
n = global_idx / (params.channels * params.out_w * params.out_h);
78+
}
79+
};
80+
81+
template <typename T, typename Layout>
82+
__global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restrict__ kernel, T * __restrict__ output,
83+
const int in_w, const int in_h, const int out_w, const int out_h,
84+
const int kernel_w, const int kernel_h, const int stride_x, const int stride_y,
85+
const int padding_x, const int padding_y, const int dilation_x, const int dilation_y,
86+
const int channels, const int batches) {
87+
const int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
88+
const int total_elements = batches * channels * out_h * out_w;
89+
90+
if (global_idx >= total_elements) {
91+
return;
92+
}
93+
94+
conv_params params = { in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x,
95+
stride_y, padding_x, padding_y, dilation_x, dilation_y, channels, batches };
96+
97+
int batch_idx, channel_idx, out_y_idx, out_x_idx;
98+
Layout::unpack_indices(global_idx, params, batch_idx, channel_idx, out_y_idx, out_x_idx);
99+
100+
T accumulator = 0;
101+
kernel_bounds bounds = calculate_kernel_bounds(out_x_idx, out_y_idx, params);
102+
103+
for (int kern_y = bounds.y_min; kern_y < bounds.y_max; ++kern_y) {
104+
int in_y_idx = calculate_input_coord(out_y_idx, kern_y, params.stride_y, params.dilation_y, params.padding_y);
105+
106+
for (int kern_x = bounds.x_min; kern_x < bounds.x_max; ++kern_x) {
107+
int in_x_idx = calculate_input_coord(out_x_idx, kern_x, params.stride_x, params.dilation_x, params.padding_x);
108+
109+
const T input_val = input[Layout::input_index(batch_idx, channel_idx, in_y_idx, in_x_idx, params)];
110+
const T kernel_val = kernel[Layout::kernel_index(channel_idx, kern_y, kern_x, params)];
111+
112+
accumulator += input_val * kernel_val;
113+
}
114+
}
115+
116+
output[Layout::output_index(batch_idx, channel_idx, out_y_idx, out_x_idx, params)] = accumulator;
117+
}
118+
119+
// Compute GGML_OP_CONV_2D_DW (depthwise 2D convolution) on the CUDA backend.
// dst->src[0] is the filter, dst->src[1] the input; all three tensors must be
// F32. Dispatches to the kernel specialization matching the input's memory
// layout (WHCN for contiguous tensors, CWHN for channels-contiguous ones).
void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * kernel = dst->src[0];
    const ggml_tensor * input = dst->src[1];

    GGML_ASSERT(kernel->type == GGML_TYPE_F32 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
    const float * w_d = (const float *) kernel->data;
    const float * x_d = (const float *) input->data;
    float * y_d = (float *) dst->data;

    // Convolution hyper-parameters are packed as six int32 values in op_params.
    const int32_t * p = (const int32_t *) dst->op_params;
    const int stride_x = p[0];
    const int stride_y = p[1];
    const int padding_x = p[2];
    const int padding_y = p[3];
    const int dilation_x = p[4];
    const int dilation_y = p[5];

    const int in_w = input->ne[0];
    const int in_h = input->ne[1];
    const int kernel_w = kernel->ne[0];
    const int kernel_h = kernel->ne[1];
    const int out_w = dst->ne[0];
    const int out_h = dst->ne[1];
    const int channels = dst->ne[2];
    const int batches = dst->ne[3];

    cudaStream_t st = ctx.stream();

    // One thread per output element, rounded up to whole blocks.
    const int total = batches * channels * out_h * out_w;
    const int blocks = (total + CUDA_CONV2D_DW_BLOCK_SIZE - 1) / CUDA_CONV2D_DW_BLOCK_SIZE;

    if (ggml_is_contiguous(input)) {
        conv2d_dw_kernel<float, whcn_layout><<<blocks, CUDA_CONV2D_DW_BLOCK_SIZE, 0, st>>>(
            x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y,
            dilation_x, dilation_y, channels, batches);
    } else if (ggml_is_contiguous_channels(input)) {
        conv2d_dw_kernel<float, cwhn_layout><<<blocks, CUDA_CONV2D_DW_BLOCK_SIZE, 0, st>>>(
            x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y,
            dilation_x, dilation_y, channels, batches);
    } else {
        // Arbitrary strides are not supported; supports_op should have
        // filtered these out before reaching this point.
        GGML_ABORT("Unsupported memory layout for conv_2d_dw");
    }
}

ggml/src/ggml-cuda/conv2d-dw.cuh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#pragma once
#include "common.cuh"

// Threads per block for the depthwise conv2d launch.
#define CUDA_CONV2D_DW_BLOCK_SIZE 256

// Compute GGML_OP_CONV_2D_DW on the CUDA backend; dst->src[0] is the filter,
// dst->src[1] the input (implementation in conv2d-dw.cu).
void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ bool g_mul_mat_q = true;
1313
#include "ggml-cuda/clamp.cuh"
1414
#include "ggml-cuda/concat.cuh"
1515
#include "ggml-cuda/conv-transpose-1d.cuh"
16+
#include "ggml-cuda/conv2d-dw.cuh"
1617
#include "ggml-cuda/convert.cuh"
1718
#include "ggml-cuda/count-equal.cuh"
1819
#include "ggml-cuda/cpy.cuh"
@@ -2315,6 +2316,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
23152316
case GGML_OP_IM2COL:
23162317
ggml_cuda_op_im2col(ctx, dst);
23172318
break;
2319+
case GGML_OP_CONV_2D_DW:
2320+
ggml_cuda_op_conv2d_dw(ctx, dst);
2321+
break;
23182322
case GGML_OP_CONV_TRANSPOSE_1D:
23192323
ggml_cuda_op_conv_transpose_1d(ctx,dst);
23202324
break;
@@ -3214,6 +3218,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
32143218
return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
32153219
}
32163220
case GGML_OP_IM2COL:
3221+
case GGML_OP_CONV_2D_DW:
32173222
case GGML_OP_POOL_2D:
32183223
case GGML_OP_SUM:
32193224
case GGML_OP_SUM_ROWS:

0 commit comments

Comments
 (0)