|
16 | 16 | #include "scalar_impl.h" |
17 | 17 | #include "ecmult_impl.h" |
18 | 18 | #include "bench.h" |
| 19 | +#include "tests_common.h" |
19 | 20 |
|
20 | 21 | #define POINTS 32768 |
21 | 22 |
|
@@ -69,6 +70,142 @@ typedef struct { |
69 | 70 | secp256k1_fe* output_xonly; |
70 | 71 | } bench_data; |
71 | 72 |
|
| 73 | +/* |
| 74 | + * ABCD Calibration Benchmarks |
| 75 | + * |
| 76 | + * Measures the performance of each algorithm at various batch sizes and |
| 77 | + * outputs. Use tools/ecmult_multi_calib.py to calculate optimal C and D |
| 78 | + * values from the output. |
| 79 | + * |
| 80 | + * Each algorithm is only calibrated within its optimal batch size range |
| 81 | + * to avoid skewing results with uninteresting results far away from that |
| 82 | + * range. |
| 83 | + */ |
| 84 | +static void run_ecmult_multi_calib(bench_data* data) { |
| 85 | + static const size_t batch_sizes[] = { |
| 86 | + /* Small numbers should help stabilize Strauss intercept */ |
| 87 | + 2, 3, 5, 7, 10, 15, 20, 30, 50, 70, |
| 88 | + /* Crossover region between Strauss and Pippenger */ |
| 89 | + 85, 88, 90, 100, 120, 150, 175, |
| 90 | + /* Pippenger windows, getting progressively larger */ |
| 91 | + 200, 300, 500, 750, 1000, 1200, 1500, 2000, 3000, 5000, 7500, 10000, 15000, 20000, 30000 |
| 92 | + }; |
| 93 | + static const size_t n_batch_sizes = sizeof(batch_sizes) / sizeof(batch_sizes[0]); |
| 94 | + |
| 95 | + static const char* algo_names[] = { |
| 96 | + "TRIVIAL", "STRAUSS", "PIPPENGER_1", "PIPPENGER_2", "PIPPENGER_3", "PIPPENGER_4", "PIPPENGER_5", "PIPPENGER_6", "PIPPENGER_7", "PIPPENGER_8", "PIPPENGER_9", "PIPPENGER_10", "PIPPENGER_11", "PIPPENGER_12" |
| 97 | + }; |
| 98 | + |
| 99 | + /* Maximum batch size for Strauss calibration. */ |
| 100 | + static const size_t STRAUSS_MAX_CALIB_BATCH = 500; |
| 101 | + |
| 102 | + /* Per-window min/max batch sizes for Pippenger calibration. */ |
| 103 | + static const size_t pippenger_min_calib_batch[12] = { |
| 104 | + /*w=1 2 3 4 5 6 7 8 9 10 11 12 */ |
| 105 | + 5, 5, 10, 30, 70, 150, 300, 750, 1500, 3000, 7500, 15000 |
| 106 | + }; |
| 107 | + static const size_t pippenger_max_calib_batch[12] = { |
| 108 | + /*w=1 2 3 4 5 6 7 8 9 10 11 12 */ |
| 109 | + 100, 200, 500, 1000, 2000, 5000, 10000, 20000, 30000, 30000, 30000, 30000 |
| 110 | + }; |
| 111 | + |
| 112 | + secp256k1_ge *points = NULL; |
| 113 | + secp256k1_scalar *scalars = NULL; |
| 114 | + secp256k1_gej result; |
| 115 | + size_t max_points = batch_sizes[n_batch_sizes - 1]; |
| 116 | + int algo; |
| 117 | + size_t i, j; |
| 118 | + int base_iters = 1000; |
| 119 | + |
| 120 | + points = (secp256k1_ge *)malloc(max_points * sizeof(secp256k1_ge)); |
| 121 | + scalars = (secp256k1_scalar *)malloc(max_points * sizeof(secp256k1_scalar)); |
| 122 | + CHECK(points != NULL); |
| 123 | + CHECK(scalars != NULL); |
| 124 | + |
| 125 | + for (i = 0; i < max_points; i++) { |
| 126 | + points[i] = data->pubkeys[i % POINTS]; |
| 127 | + scalars[i] = data->scalars[i % POINTS]; |
| 128 | + } |
| 129 | + |
| 130 | + printf("# ECMULT_MULTI Calibration Data\n"); |
| 131 | + printf("# Format: ALGO,N,TIME_US (microseconds per batch)\n"); |
| 132 | + printf("# Copy the DATA section below into the Python script\n"); |
| 133 | + printf("#\n"); |
| 134 | + printf("# BEGIN DATA\n"); |
| 135 | + |
| 136 | + /* Measure STRAUSS */ |
| 137 | + algo = SECP256K1_ECMULT_MULTI_ALGO_STRAUSS; |
| 138 | + for (i = 0; i < n_batch_sizes; i++) { |
| 139 | + size_t n = batch_sizes[i]; |
| 140 | + int64_t t_start, t_end; |
| 141 | + double time_us; |
| 142 | + int iters = base_iters; |
| 143 | + int iter; |
| 144 | + |
| 145 | + /* Only run up to the max to not skew result */ |
| 146 | + if (n > STRAUSS_MAX_CALIB_BATCH) continue; |
| 147 | + |
| 148 | + /* Using many iterations in Strauss since batch sizes are small */ |
| 149 | + if (n >= 300) iters = base_iters / 2; |
| 150 | + if (iters < 100) iters = 100; |
| 151 | + |
| 152 | + t_start = gettime_i64(); |
| 153 | + for (iter = 0; iter < iters; iter++) { |
| 154 | + secp256k1_ecmult_multi_internal(&data->ctx->error_callback, algo, |
| 155 | + &result, n, points, scalars, NULL); |
| 156 | + } |
| 157 | + t_end = gettime_i64(); |
| 158 | + |
| 159 | + time_us = (double)(t_end - t_start) / iters; |
| 160 | + printf("%s,%lu,%.3f\n", algo_names[algo], (unsigned long)n, time_us); |
| 161 | + } |
| 162 | + |
| 163 | + /* Measure PIPPENGER variants */ |
| 164 | + for (algo = SECP256K1_ECMULT_MULTI_ALGO_PIPPENGER_1; |
| 165 | + algo <= SECP256K1_ECMULT_MULTI_ALGO_PIPPENGER_12; |
| 166 | + algo++) { |
| 167 | + int window = algo - SECP256K1_ECMULT_MULTI_ALGO_PIPPENGER_1; |
| 168 | + size_t min_batch = pippenger_min_calib_batch[window]; |
| 169 | + size_t max_batch = pippenger_max_calib_batch[window]; |
| 170 | + |
| 171 | + for (j = 0; j < n_batch_sizes; j++) { |
| 172 | + size_t n = batch_sizes[j]; |
| 173 | + int64_t t_start, t_end; |
| 174 | + double time_us; |
| 175 | + int iters = base_iters; |
| 176 | + int iter; |
| 177 | + |
| 178 | + /* Only run for the selected range of each algo */ |
| 179 | + if (n < min_batch || n > max_batch) continue; |
| 180 | + |
| 181 | + /* Limiting iterations to keep run-time managable */ |
| 182 | + if (n >= 1000) iters = base_iters / 10; |
| 183 | + if (n >= 5000) iters = base_iters / 50; |
| 184 | + if (n >= 15000) iters = base_iters / 100; |
| 185 | + if (iters < 3) iters = 3; |
| 186 | + |
| 187 | + t_start = gettime_i64(); |
| 188 | + for (iter = 0; iter < iters; iter++) { |
| 189 | + secp256k1_ecmult_multi_internal(&data->ctx->error_callback, algo, |
| 190 | + &result, n, points, scalars, NULL); |
| 191 | + } |
| 192 | + t_end = gettime_i64(); |
| 193 | + |
| 194 | + time_us = (double)(t_end - t_start) / iters; |
| 195 | + printf("%s,%lu,%.3f\n", algo_names[algo], (unsigned long)n, time_us); |
| 196 | + } |
| 197 | + } |
| 198 | + |
| 199 | + printf("# END DATA\n"); |
| 200 | + printf("#\n"); |
| 201 | + printf("# To calculate ABCD constants, run:\n"); |
| 202 | + printf("# ./bench_ecmult calib 2>&1 | python3 tools/ecmult_multi_calib.py\n"); |
| 203 | + printf("#\n"); |
| 204 | + |
| 205 | + free(points); |
| 206 | + free(scalars); |
| 207 | +} |
| 208 | + |
72 | 209 | /* Hashes x into [0, POINTS) twice and store the result in offset1 and offset2. */ |
73 | 210 | static void hash_into_offset(bench_data* data, size_t x) { |
74 | 211 | data->offset1 = (x * 0x537b7f6f + 0x8f66a481) % POINTS; |
@@ -338,6 +475,7 @@ static void run_ecmult_multi_bench(bench_data* data, size_t count, int includes_ |
338 | 475 | int main(int argc, char **argv) { |
339 | 476 | bench_data data; |
340 | 477 | int i, p; |
| 478 | + int run_calib = 0; |
341 | 479 |
|
342 | 480 | int default_iters = 10000; |
343 | 481 | int iters = get_iters(default_iters); |
@@ -368,6 +506,8 @@ int main(int argc, char **argv) { |
368 | 506 | } else if(have_flag(argc, argv, "auto")) { |
369 | 507 | printf("Using automatic algorithm selection:\n"); |
370 | 508 | data.forced_algo = BENCH_ALGO_AUTO; |
| 509 | + } else if(have_flag(argc, argv, "calib")) { |
| 510 | + run_calib = 1; |
371 | 511 | } else { |
372 | 512 | fprintf(stderr, "%s: unrecognized argument '%s'.\n\n", argv[0], argv[1]); |
373 | 513 | help(argv, default_iters); |
@@ -398,27 +538,30 @@ int main(int argc, char **argv) { |
398 | 538 | } |
399 | 539 | secp256k1_ge_set_all_gej_var(data.pubkeys, data.pubkeys_gej, POINTS); |
400 | 540 |
|
| 541 | + if (run_calib) { |
| 542 | + run_ecmult_multi_calib(&data); |
| 543 | + } else { |
| 544 | + print_output_table_header_row(); |
| 545 | + /* Initialize offset1 and offset2 */ |
| 546 | + hash_into_offset(&data, 0); |
| 547 | + run_ecmult_bench(&data, iters); |
401 | 548 |
|
402 | | - print_output_table_header_row(); |
403 | | - /* Initialize offset1 and offset2 */ |
404 | | - hash_into_offset(&data, 0); |
405 | | - run_ecmult_bench(&data, iters); |
406 | | - |
407 | | - for (i = 1; i <= 8; ++i) { |
408 | | - run_ecmult_multi_bench(&data, i, 1, iters); |
409 | | - } |
| 549 | + for (i = 1; i <= 8; ++i) { |
| 550 | + run_ecmult_multi_bench(&data, i, 1, iters); |
| 551 | + } |
410 | 552 |
|
411 | | - /* This is disabled with low count of iterations because the loop runs 77 times even with iters=1 |
412 | | - * and the higher it goes the longer the computation takes(more points) |
413 | | - * So we don't run this benchmark with low iterations to prevent slow down */ |
414 | | - if (iters > 2) { |
415 | | - for (p = 0; p <= 11; ++p) { |
416 | | - for (i = 9; i <= 16; ++i) { |
417 | | - run_ecmult_multi_bench(&data, i << p, 1, iters); |
| 553 | + /* This is disabled with low count of iterations because the loop runs 77 times even with iters=1 |
| 554 | + * and the higher it goes the longer the computation takes(more points) |
| 555 | + * So we don't run this benchmark with low iterations to prevent slow down */ |
| 556 | + if (iters > 2) { |
| 557 | + for (p = 0; p <= 11; ++p) { |
| 558 | + for (i = 9; i <= 16; ++i) { |
| 559 | + run_ecmult_multi_bench(&data, i << p, 1, iters); |
| 560 | + } |
418 | 561 | } |
| 562 | + } else { |
| 563 | + printf("Skipping some benchmarks due to SECP256K1_BENCH_ITERS <= 2\n"); |
419 | 564 | } |
420 | | - } else { |
421 | | - printf("Skipping some benchmarks due to SECP256K1_BENCH_ITERS <= 2\n"); |
422 | 565 | } |
423 | 566 |
|
424 | 567 | secp256k1_context_destroy(data.ctx); |
|
0 commit comments