
Commit 8768c7c

fix tests and examples
1 parent 4428593 commit 8768c7c

File tree: 7 files changed, +37 -53 lines

examples/llama-bench/llama-bench.cpp

Lines changed: 1 addition & 24 deletions
@@ -774,13 +774,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 struct test {
     static const std::string build_commit;
     static const int build_number;
-    static const bool cuda;
-    static const bool vulkan;
-    static const bool kompute;
-    static const bool metal;
-    static const bool sycl;
-    static const bool gpu_blas;
-    static const bool blas;
     static const std::string cpu_info;
     static const std::string gpu_info;
     std::string model_filename;
@@ -793,7 +786,6 @@ struct test {
     std::string cpu_mask;
     bool cpu_strict;
     int poll;
-    bool has_rpc;
     ggml_type type_k;
     ggml_type type_v;
     int n_gpu_layers;
@@ -822,7 +814,6 @@ struct test {
         cpu_mask = inst.cpu_mask;
         cpu_strict = inst.cpu_strict;
         poll = inst.poll;
-        has_rpc = !inst.rpc_servers.empty();
         type_k = inst.type_k;
         type_v = inst.type_v;
         n_gpu_layers = inst.n_gpu_layers;
@@ -881,7 +872,6 @@ struct test {
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
             "build_commit", "build_number",
-            "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
             "cpu_info", "gpu_info",
             "model_filename", "model_type", "model_size", "model_n_params",
             "n_batch", "n_ubatch",
@@ -908,8 +898,7 @@ struct test {
             field == "avg_ns" || field == "stddev_ns") {
             return INT;
         }
-        if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
-            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
+        if (field == "f16_kv" || field == "no_kv_offload" ||
             field == "cpu_strict" ||
             field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
             return BOOL;
@@ -938,8 +927,6 @@ struct test {
         }
         std::vector<std::string> values = {
             build_commit, std::to_string(build_number),
-            std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
-            std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_ubatch),
@@ -967,13 +954,6 @@ struct test {
 
 const std::string test::build_commit = LLAMA_COMMIT;
 const int test::build_number = LLAMA_BUILD_NUMBER;
-const bool test::cuda = !!ggml_cpu_has_cuda();
-const bool test::vulkan = !!ggml_cpu_has_vulkan();
-const bool test::kompute = !!ggml_cpu_has_kompute();
-const bool test::metal = !!ggml_cpu_has_metal();
-const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
-const bool test::blas = !!ggml_cpu_has_blas();
-const bool test::sycl = !!ggml_cpu_has_sycl();
 const std::string test::cpu_info = get_cpu_info();
 const std::string test::gpu_info = get_gpu_info();
 
@@ -1268,9 +1248,6 @@ struct markdown_printer : public printer {
             value = buf;
         } else if (field == "backend") {
             value = test::get_backend();
-            if (t.has_rpc) {
-                value += "+RPC";
-            }
         } else if (field == "test") {
             if (t.n_prompt > 0 && t.n_gen == 0) {
                 snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
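
A note for readers of this hunk: get_fields() and the values vector built above are index-aligned, so the backend-flag entries have to disappear from both at once. A minimal sketch of that invariant, assuming the struct also exposes the get_values() accessor the printers iterate over (hypothetical helper, shown for illustration only):

// Hypothetical check; assumes test::get_fields() and t.get_values() exist as
// the index-aligned pair this file's printers rely on.
#include <cassert>

static void check_field_value_alignment(const test & t) {
    assert(test::get_fields().size() == t.get_values().size());
}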

examples/quantize-stats/quantize-stats.cpp

Lines changed: 10 additions & 9 deletions
@@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
 }
 
 static void test_roundtrip_on_chunk(
-    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, bool use_reference,
+    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
     float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
 ) {
     if (layer->type == GGML_TYPE_F16) {
@@ -156,7 +156,7 @@ static void test_roundtrip_on_chunk(
     if (use_reference) {
         qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
     } else {
-        qfns.from_float(input_scratch, quantized_scratch, chunk_size);
+        qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size);
     }
     qfns.to_float(quantized_scratch, output_scratch, chunk_size);
 
@@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk(
 
 // Run quantization function for a single layer and update error stats
 static void test_roundtrip_on_layer(
-    std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, bool use_reference,
+    std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
     const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
     std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
 ) {
@@ -187,13 +187,13 @@ static void test_roundtrip_on_layer(
     int num_chunks = (nelements + chunk_size - 1)/chunk_size;
 
     if (num_chunks < 2 || max_thread < 2) {
-        test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
+        test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(),
             output_scratch.data(), print_layer_stats ? layer_error : total_error);
     } else {
         auto & stats = print_layer_stats ? layer_error : total_error;
         std::mutex mutex;
         uint64_t counter = 0;
-        auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
+        auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr,
             &quantized_scratch, &output_scratch, chunk_size] () {
             error_stats local_stats {};
             while (true) {
@@ -205,7 +205,7 @@ static void test_roundtrip_on_layer(
             }
             lock.unlock();
             uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
-            test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
+            test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset,
                 quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
         }
     };
@@ -371,8 +371,9 @@ int main(int argc, char ** argv) {
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
             continue;
         }
-        const auto * qfns = ggml_get_type_traits(type);
-        if (qfns->from_float && qfns->to_float) {
+        const auto * qfns     = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
+        if (qfns_cpu->from_float && qfns->to_float) {
             if (params.verbose) {
                 printf("testing %s ...\n", ggml_type_name(type));
             }
@@ -393,7 +394,7 @@ int main(int argc, char ** argv) {
                 test_roundtrip_on_layer(
                     layer_name,
                     params.per_layer_stats,
-                    *qfns,
+                    *qfns, *qfns_cpu,
                     params.reference,
                     kv_tensor.second,
                     input_scratch,
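
To summarize the API split this file now follows: from_float (the optimized quantizer) lives on the CPU traits from ggml_get_type_traits_cpu(), while to_float and from_float_ref stay on ggml_get_type_traits(). Below is a minimal round-trip sketch under that assumption; the "ggml-cpu.h" header name is an assumption, n must be a multiple of the type's block size, and i-quant types would additionally need ggml_quantize_init(type) first.

#include "ggml.h"
#include "ggml-cpu.h" // assumed location of ggml_get_type_traits_cpu()

#include <cmath>
#include <cstdint>
#include <vector>

// Quantize and dequantize n floats, returning the RMSE of the round trip,
// or -1.0f if the type lacks a CPU quantizer or a dequantizer.
static float roundtrip_rmse(ggml_type type, const float * data, int64_t n) {
    const auto * qfns     = ggml_get_type_traits(type);     // to_float, from_float_ref
    const auto * qfns_cpu = ggml_get_type_traits_cpu(type); // from_float, vec_dot

    if (!qfns_cpu->from_float || !qfns->to_float) {
        return -1.0f;
    }

    std::vector<uint8_t> q(ggml_row_size(type, n)); // quantized row buffer
    std::vector<float>   out(n);

    qfns_cpu->from_float(data, q.data(), n); // optimized CPU quantization
    qfns->to_float(q.data(), out.data(), n); // dequantization

    double sum = 0.0;
    for (int64_t i = 0; i < n; i++) {
        const double d = (double) data[i] - (double) out[i];
        sum += d * d;
    }
    return (float) std::sqrt(sum / (double) n);
}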

ggml/src/ggml-blas/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -19,6 +19,8 @@ if (BLAS_FOUND)
     target_include_directories(ggml-blas PRIVATE . ..)
 
     if (${GGML_BLAS_VENDOR} MATCHES "Apple")
+        add_compile_definitions(ACCELERATE_NEW_LAPACK)
+        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
         add_compile_definitions(GGML_BLAS_USE_ACCELERATE)
     elseif ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
         # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
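
For context (from Apple's Accelerate documentation, not stated in the diff itself): ACCELERATE_NEW_LAPACK opts into Accelerate's newer BLAS/LAPACK headers, and ACCELERATE_LAPACK_ILP64 selects their 64-bit integer (ILP64) interface, so the backend gets integer index widths matching what it expects on macOS.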

ggml/src/ggml-quants.c

Lines changed: 2 additions & 0 deletions
@@ -4931,6 +4931,8 @@ void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y,
     quantize_iq2_s(x, y, 1, k, NULL);
 }
 
+// =============================== data validation
+
 static bool validate_float(float f, size_t i) {
     if (isinf(f)) {
         fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i);

pocs/vdot/vdot.cpp

Lines changed: 3 additions & 3 deletions
@@ -263,9 +263,9 @@ int main(int argc, char** argv) {
         // Note, we do not include this in the timing as in practical application
         // we already have the quantized model weights.
         if (useQ4_1) {
-            funcs->from_float(x1.data(), q41.data(), kVecSize);
+            funcs_cpu->from_float(x1.data(), q41.data(), kVecSize);
         } else {
-            funcs->from_float(x1.data(), q40.data(), kVecSize);
+            funcs_cpu->from_float(x1.data(), q40.data(), kVecSize);
         }
 
         // Now measure time the dot product needs using the "scalar" version above
@@ -284,7 +284,7 @@ int main(int argc, char** argv) {
             dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
         }
         else {
-            const auto * vdot = ggml_get_type_traits(funcs_cpu->vec_dot_type);
+            const auto * vdot = ggml_get_type_traits_cpu(funcs_cpu->vec_dot_type);
             vdot->from_float(y1.data(), q8.data(), kVecSize);
             if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
             else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
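
The dot-product path follows the same convention: the activation row is quantized to funcs_cpu->vec_dot_type using that type's own CPU traits, then vec_dot runs over the two quantized rows. A self-contained sketch of this flow (the "ggml-cpu.h" header name and the helper name are assumptions; n must be a multiple of the block size, e.g. 32 for Q4_0/Q8_0):

#include "ggml.h"
#include "ggml-cpu.h" // assumed location of the CPU type traits

#include <cstdint>
#include <vector>

// Quantized dot product of two float rows of length n on the CPU backend.
static float quantized_dot(ggml_type type, const float * x, const float * y, int n) {
    const auto * funcs_cpu = ggml_get_type_traits_cpu(type);
    // activations are quantized to the kernel's companion type (e.g. Q8_0)
    const auto * vdot      = ggml_get_type_traits_cpu(funcs_cpu->vec_dot_type);

    std::vector<uint8_t> qx(ggml_row_size(type, n));
    std::vector<uint8_t> qy(ggml_row_size(funcs_cpu->vec_dot_type, n));

    funcs_cpu->from_float(x, qx.data(), n); // "weights" side
    vdot->from_float(y, qy.data(), n);      // "activations" side

    float result = 0.0f;
    // arguments: (n, result, result stride, x, x stride, y, y stride, nrc)
    funcs_cpu->vec_dot(n, &result, 0, qx.data(), 0, qy.data(), 0, 1);
    return result;
}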

tests/test-quantize-fns.cpp

Lines changed: 10 additions & 9 deletions
@@ -45,22 +45,23 @@ static float array_rmse(const float * a1, const float * a2, size_t n) {
 }
 
 // Total quantization error on test data
-static float total_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
+static float total_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
     std::vector<uint8_t> tmp_q(2*test_size);
     std::vector<float> tmp_out(test_size);
 
-    qfns->from_float(test_data, tmp_q.data(), test_size);
+    qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
     qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
     return array_rmse(test_data, tmp_out.data(), test_size);
 }
 
 // Total quantization error on test data
-static float reference_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
+static float reference_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
     std::vector<uint8_t> tmp_q(2*test_size);
     std::vector<float> tmp_out(test_size);
     std::vector<float> tmp_out_ref(test_size);
 
-    qfns->from_float(test_data, tmp_q.data(), test_size);
+    // FIXME: why is done twice?
+    qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
     qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
 
     qfns->from_float_ref(test_data, tmp_q.data(), test_size);
@@ -84,9 +85,9 @@ static float dot_product_error(
     std::vector<uint8_t> tmp_q1(2*test_size);
     std::vector<uint8_t> tmp_q2(2*test_size);
 
-    const auto * vdot = ggml_get_type_traits(qfns_cpu->vec_dot_type);
+    const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
 
-    qfns->from_float(test_data1, tmp_q1.data(), test_size);
+    qfns_cpu->from_float(test_data1, tmp_q1.data(), test_size);
     vdot->from_float(test_data2, tmp_q2.data(), test_size);
 
     float result = INFINITY;
@@ -145,8 +146,8 @@ int main(int argc, char * argv[]) {
         printf("Testing %s\n", ggml_type_name((ggml_type) i));
         ggml_quantize_init(ei);
 
-        if (qfns->from_float && qfns->to_float) {
-            const float total_error = total_quantization_error(qfns, test_size, test_data.data());
+        if (qfns_cpu->from_float && qfns->to_float) {
+            const float total_error = total_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
             const float max_quantization_error =
                 type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
                 type == GGML_TYPE_TQ2_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
@@ -161,7 +162,7 @@ int main(int argc, char * argv[]) {
             printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
         }
 
-        const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
+        const float reference_error = reference_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
         failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR);
         num_failed += failed;
         if (failed || verbose) {
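
The pattern these tests rely on: the optimized quantizer (now qfns_cpu->from_float) is validated against the scalar reference (qfns->from_float_ref), with qfns->to_float decoding both. A compact sketch of that comparison; the helper name and the "ggml-cpu.h" header are assumptions:

#include "ggml.h"
#include "ggml-cpu.h" // assumed header

#include <cstdint>
#include <vector>

// Quantize the same data with the optimized and the reference quantizer,
// dequantize both, and hand the two decoded rows back for comparison.
static void quantize_both_paths(ggml_type type, const float * data, int64_t n,
                                std::vector<float> & out, std::vector<float> & out_ref) {
    const auto * qfns     = ggml_get_type_traits(type);
    const auto * qfns_cpu = ggml_get_type_traits_cpu(type);

    std::vector<uint8_t> q(ggml_row_size(type, n));
    out.resize(n);
    out_ref.resize(n);

    qfns_cpu->from_float(data, q.data(), n); // optimized CPU path
    qfns->to_float(q.data(), out.data(), n);

    qfns->from_float_ref(data, q.data(), n); // scalar reference path
    qfns->to_float(q.data(), out_ref.data(), n);
}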

tests/test-quantize-perf.cpp

Lines changed: 9 additions & 8 deletions
@@ -123,9 +123,10 @@ static void usage(char * argv[]) {
     printf(" --type TYPE set test type as");
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        const auto * qfns = ggml_get_type_traits(type);
+        const auto * qfns     = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
         if (ggml_type_name(type) != NULL) {
-            if (qfns->from_float && qfns->to_float) {
+            if (qfns_cpu->from_float && qfns->to_float) {
                 printf(" %s", ggml_type_name(type));
             }
         }
@@ -277,7 +278,7 @@ int main(int argc, char * argv[]) {
             continue;
         }
 
-        if (qfns->from_float && qfns->to_float) {
+        if (qfns_cpu->from_float && qfns->to_float) {
             printf("%s\n", ggml_type_name(type));
 
             ggml_quantize_init(type);
@@ -301,7 +302,7 @@ int main(int argc, char * argv[]) {
             for (size_t size : params.test_sizes) {
                 printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                 auto quantize_fn = [&](void) -> float {
-                    qfns->from_float(test_data1, test_q1, size);
+                    qfns_cpu->from_float(test_data1, test_q1, size);
                     return test_q1[0];
                 };
                 size_t quantized_size = ggml_row_size(type, size);
@@ -312,7 +313,7 @@ int main(int argc, char * argv[]) {
 
         if (params.op_dequantize_row_q) {
             printf(" dequantize_row_q\n");
-            qfns->from_float(test_data1, test_q1, largest);
+            qfns_cpu->from_float(test_data1, test_q1, largest);
             for (size_t size : params.test_sizes) {
                 printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                 auto quantize_fn = [&](void) -> float {
@@ -330,7 +331,7 @@ int main(int argc, char * argv[]) {
             for (size_t size : params.test_sizes) {
                 printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                 auto quantize_fn = [&](void) -> float {
-                    const auto * vdot = ggml_get_type_traits(qfns_cpu->vec_dot_type);
+                    const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
                     vdot->from_float(test_data1, test_q1, size);
                     return test_q1[0];
                 };
@@ -342,8 +343,8 @@ int main(int argc, char * argv[]) {
 
         if (params.op_vec_dot_q) {
             printf(" vec_dot_q\n");
-            qfns->from_float(test_data1, test_q1, largest);
-            qfns->from_float(test_data2, test_q2, largest);
+            qfns_cpu->from_float(test_data1, test_q1, largest);
+            qfns_cpu->from_float(test_data2, test_q2, largest);
             for (size_t size : params.test_sizes) {
                 printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                 auto quantize_fn = [&](void) -> float {
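
The check qfns_cpu->from_float && qfns->to_float used throughout this commit is the new test for "round-trippable on the CPU backend". A standalone sketch that lists the types passing it, mirroring the usage() loop above (the "ggml-cpu.h" header name is an assumption):

#include "ggml.h"
#include "ggml-cpu.h" // assumed location of ggml_get_type_traits_cpu()

#include <cstdio>

int main() {
    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
        const ggml_type type  = (ggml_type) i;
        const auto * qfns     = ggml_get_type_traits(type);
        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
        if (ggml_type_name(type) != NULL && qfns_cpu->from_float && qfns->to_float) {
            printf("%s\n", ggml_type_name(type)); // supports quantize + dequantize
        }
    }
    return 0;
}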

0 commit comments
