Skip to content

Commit e49462b

Browse files
committed
Revert "Biased mmvq: minor optimization (ikawrakow#880)"
This reverts commit fd3757d.
1 parent 76d9ef9 commit e49462b

File tree

2 files changed

+7
-46
lines changed

2 files changed

+7
-46
lines changed

examples/llama-bench/llama-bench.cpp

Lines changed: 4 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,6 @@ struct cmd_params {
265265
bool no_fug = false;
266266
bool use_thp = false;
267267
bool no_ooae = false;
268-
bool mqkv = false;
269268
output_formats output_format;
270269
output_formats output_format_stderr;
271270
};
@@ -304,7 +303,6 @@ static const cmd_params cmd_params_defaults = {
304303
/* no_fug */ false,
305304
/* use_thp */ false,
306305
/* no_ooae */ false,
307-
/* mqkv */ false,
308306
/* output_format */ MARKDOWN,
309307
/* output_format_stderr */ NONE,
310308
};
@@ -344,7 +342,6 @@ static void print_usage(int /* argc */, char ** argv) {
344342
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
345343
printf(" -w, --warmup <0|1> (default: %s)\n", cmd_params_defaults.warmup ? "1" : "0");
346344
printf(" -rtr, --run-time-repack <0|1> (default: %s)\n", cmd_params_defaults.repack ? "1" : "0");
347-
printf(" -mqkv, --merge-qkv (default: %s)\n", cmd_params_defaults.mqkv ? "1" : "0");
348345
printf(" -thp, --transparent-huge-pages <0|1> (default: %s)\n", cmd_params_defaults.use_thp? "1" : "0");
349346
printf(" -ot, --override-tensor pattern (default: none)\n");
350347
printf(" -fmoe, --fused-moe <0|1> (default: %s)\n", cmd_params_defaults.fmoe? "1" : "0");
@@ -736,12 +733,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
736733
break;
737734
}
738735
params.repack = std::stoi(argv[i]);
739-
} else if (arg == "-mqkv" || arg == "--merge-qkv") {
740-
if (++i >= argc) {
741-
invalid_param = true;
742-
break;
743-
}
744-
params.mqkv = std::stoi(argv[i]);
745736
} else if (arg == "-thp" || arg == "--transparent-huge-pages") {
746737
if (++i >= argc) {
747738
invalid_param = true;
@@ -860,7 +851,6 @@ struct cmd_params_instance {
860851
bool no_fug = false;
861852
bool use_thp = false;
862853
bool no_ooae = false;
863-
bool mqkv = false;
864854
const llama_model_tensor_buft_override* buft_overrides;
865855

866856
llama_model_params to_llama_mparams() const {
@@ -876,7 +866,6 @@ struct cmd_params_instance {
876866
mparams.use_mmap = use_mmap;
877867
mparams.repack_tensors = repack;
878868
mparams.use_thp = use_thp;
879-
mparams.merge_qkv = mqkv;
880869
mparams.tensor_buft_overrides = buft_overrides;
881870

882871
return mparams;
@@ -890,7 +879,6 @@ struct cmd_params_instance {
890879
main_gpu == other.main_gpu &&
891880
use_mmap == other.use_mmap &&
892881
repack == other.repack &&
893-
mqkv == other.mqkv &&
894882
use_thp == other.use_thp &&
895883
tensor_split == other.tensor_split;
896884
}
@@ -973,7 +961,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
973961
/* .no_fug = */ params.no_fug,
974962
/* .use_thp = */ params.use_thp,
975963
/* .no_ooae = */ params.no_ooae,
976-
/* .mqkv = */ params.mqkv,
977964
/* .buft_overrides=*/ params.buft_overrides.data(),
978965
};
979966
instances.push_back(instance);
@@ -1011,7 +998,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
1011998
/* .no_fug = */ params.no_fug,
1012999
/* .use_thp = */ params.use_thp,
10131000
/* .no_ooae = */ params.no_ooae,
1014-
/* .mqkv = */ params.mqkv,
10151001
/* .buft_overrides=*/ params.buft_overrides.data(),
10161002
};
10171003
instances.push_back(instance);
@@ -1049,7 +1035,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
10491035
/* .no_fug = */ params.no_fug,
10501036
/* .use_thp = */ params.use_thp,
10511037
/* .no_ooae = */ params.no_ooae,
1052-
/* .mqkv = */ params.mqkv,
10531038
/* .buft_overrides=*/ params.buft_overrides.data(),
10541039
};
10551040
instances.push_back(instance);
@@ -1086,8 +1071,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
10861071
/* .ger = */ params.ger,
10871072
/* .no_fug = */ params.no_fug,
10881073
/* .use_thp = */ params.use_thp,
1089-
/* .no_ooae = */ params.no_ooae,
1090-
/* .mqkv = */ params.mqkv,
1074+
/* .no_ooae = */ params.no_ooae,
10911075
/* .buft_overrides=*/ params.buft_overrides.data(),
10921076
};
10931077
instances.push_back(instance);
@@ -1136,7 +1120,6 @@ struct test {
11361120
bool no_fug = false;
11371121
bool use_thp = false;
11381122
bool no_ooae = false;
1139-
bool mqkv = false;
11401123
int n_prompt;
11411124
int n_gen;
11421125
std::string test_time;
@@ -1169,7 +1152,6 @@ struct test {
11691152
use_mmap = inst.use_mmap;
11701153
embeddings = inst.embeddings;
11711154
repack = inst.repack;
1172-
mqkv = inst.mqkv;
11731155
fmoe = inst.fmoe;
11741156
ger = inst.ger;
11751157
no_fug = inst.no_fug;
@@ -1265,7 +1247,7 @@ struct test {
12651247
"n_threads", "type_k", "type_v",
12661248
"n_gpu_layers", "split_mode",
12671249
"main_gpu", "no_kv_offload", "flash_attn", "mla_attn", "attn_max_batch", "ser",
1268-
"tensor_split", "use_mmap", "embeddings", "repack", "mqkv", "fused_moe", "grouped_er", "fused_up_gate", "use_thp", "ooae",
1250+
"tensor_split", "use_mmap", "embeddings", "repack", "fused_moe", "grouped_er", "fused_up_gate", "use_thp", "ooae",
12691251
"n_prompt", "n_gen", "test_time",
12701252
"avg_ns", "stddev_ns",
12711253
"avg_ts", "stddev_ts", "test",
@@ -1287,7 +1269,7 @@ struct test {
12871269
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
12881270
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
12891271
field == "flash_attn" || field == "use_mmap" || field == "embeddings" || field == "repack" || field == "use_thp" ||
1290-
field == "fused_moe" || field == "grouped_er" || field == "fused_up_gate" || field == "ooae" || field == "mqkv") {
1272+
field == "fused_moe" || field == "grouped_er" || field == "fused_up_gate" || field == "ooae") {
12911273
return BOOL;
12921274
}
12931275
if (field == "avg_ts" || field == "stddev_ts") {
@@ -1331,7 +1313,7 @@ struct test {
13311313
std::to_string(mla_attn), std::to_string(attn_max_batch), ser_to_string(ser),
13321314
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
13331315
std::to_string(repack), std::to_string(fmoe), std::to_string(ger),
1334-
std::to_string(no_fug), std::to_string(use_thp), std::to_string(no_ooae), std::to_string(mqkv),
1316+
std::to_string(no_fug), std::to_string(use_thp), std::to_string(no_ooae),
13351317
std::to_string(n_prompt), std::to_string(n_gen), test_time,
13361318
std::to_string(avg_ns()), std::to_string(stdev_ns()),
13371319
std::to_string(avg_ts()), std::to_string(stdev_ts()),
@@ -1509,9 +1491,6 @@ struct markdown_printer : public printer {
15091491
if (field == "repack") {
15101492
return 3;
15111493
}
1512-
if (field == "mqkv") {
1513-
return 4;
1514-
}
15151494
if (field == "use_thp") {
15161495
return 3;
15171496
}
@@ -1570,9 +1549,6 @@ struct markdown_printer : public printer {
15701549
if (field == "repack") {
15711550
return "rtr";
15721551
}
1573-
if (field == "mqkv") {
1574-
return "mqkv";
1575-
}
15761552
if (field == "use_thp") {
15771553
return "thp";
15781554
}
@@ -1658,9 +1634,6 @@ struct markdown_printer : public printer {
16581634
if (params.repack != cmd_params_defaults.repack) {
16591635
fields.emplace_back("repack");
16601636
}
1661-
if (params.mqkv != cmd_params_defaults.mqkv) {
1662-
fields.emplace_back("mqkv");
1663-
}
16641637
if (params.use_thp != cmd_params_defaults.use_thp) {
16651638
fields.emplace_back("use_thp");
16661639
}

ggml/src/ggml-cuda/mmvq-templates.cuh

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -112,10 +112,6 @@ static __device__ void mul_mat_vec_q(
112112
}
113113
}
114114

115-
float local_bias[rows_per_cuda_block] = { 0.0f };
116-
if (bias && threadIdx.y == 0 && threadIdx.x < rows_per_cuda_block && row0 + threadIdx.x < nrows_dst) {
117-
local_bias[threadIdx.x] = bias[row0 + threadIdx.x];
118-
}
119115
__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
120116
if (threadIdx.y > 0) {
121117
#pragma unroll
@@ -144,7 +140,7 @@ static __device__ void mul_mat_vec_q(
144140
}
145141

146142
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
147-
dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x] + local_bias[threadIdx.x];
143+
dst[j*nrows_dst + row0 + threadIdx.x] = bias ? tmp[j][threadIdx.x] + bias[j*nrows_dst + row0 + threadIdx.x] : tmp[j][threadIdx.x];
148144
}
149145
}
150146
}
@@ -180,14 +176,6 @@ static __device__ void fused_mul_mat_vec_q(
180176
// partial sum for each thread
181177
float tmp_u[ncols_y][rows_per_cuda_block] = {0.0f};
182178
float tmp_g[ncols_y][rows_per_cuda_block] = {0.0f};
183-
float local_bias_u[rows_per_cuda_block] = { 0.0f };
184-
float local_bias_g[rows_per_cuda_block] = { 0.0f };
185-
if (bias_u && threadIdx.y == 0 && threadIdx.x < rows_per_cuda_block && row0 + threadIdx.x < nrows_dst) {
186-
local_bias_u[threadIdx.x] = bias_u[row0 + threadIdx.x];
187-
}
188-
if (bias_g && threadIdx.y == 0 && threadIdx.x < rows_per_cuda_block && row0 + threadIdx.x < nrows_dst) {
189-
local_bias_g[threadIdx.x] = bias_g[row0 + threadIdx.x];
190-
}
191179

192180
const block_q8_1 * y = (const block_q8_1 *) vy;
193181

@@ -254,8 +242,8 @@ static __device__ void fused_mul_mat_vec_q(
254242
default: {
255243
constexpr float alpha = 1.702f;
256244
constexpr float limit = 7.0f;
257-
g += local_bias_g[threadIdx.x];
258-
u += local_bias_u[threadIdx.x];
245+
g += bias_g[j*nrows_dst + row0 + threadIdx.x];
246+
u += bias_u[j*nrows_dst + row0 + threadIdx.x];
259247
g = fminf(g, limit);
260248
u = fmaxf(fminf(u, limit), -limit);
261249
r = g / (1.0f + expf(-g * alpha)) * (1.0f + u);

0 commit comments

Comments (0)