Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
270 changes: 219 additions & 51 deletions src/bench_ecmult.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,15 @@
#include "scalar_impl.h"
#include "ecmult_impl.h"
#include "bench.h"
#include "tests_common.h"

#define POINTS 32768

/* Default memory limit (64 MB) */
#define DEFAULT_MEM_LIMIT (64 * 1024 * 1024)
/* Select bench algorithm automatically */
#define BENCH_ALGO_AUTO (-1)

static void help(char **argv, int default_iters) {
printf("Benchmark EC multiplication algorithms\n");
printf("\n");
Expand All @@ -30,23 +36,25 @@ static void help(char **argv, int default_iters) {
printf("function name. The letter 'g' indicates that one of the points is the generator.\n");
printf("The benchmarks are divided by the number of points.\n");
printf("\n");
printf("default (ecmult_multi): picks pippenger_wnaf or strauss_wnaf depending on the\n");
printf(" batch size\n");
printf("pippenger_wnaf: for all batch sizes\n");
printf("strauss_wnaf: for all batch sizes\n");
printf("simple: multiply and sum each point individually\n");
printf("default (auto): automatically select best algorithm\n");
printf("pippenger_wnaf: for all batch sizes\n");
printf("strauss_wnaf: for all batch sizes\n");
printf("simple: multiply and sum each point individually\n");
printf("\n");
}

typedef struct {
/* Setup once in advance */
secp256k1_context* ctx;
secp256k1_scratch_space* scratch;
secp256k1_scalar* scalars;
secp256k1_ge* pubkeys;
secp256k1_gej* pubkeys_gej;
secp256k1_scalar* seckeys;
secp256k1_gej* expected_output;
secp256k1_ecmult_multi_func ecmult_multi;

/* Algorithm selection */
int forced_algo;
size_t mem_limit;

/* Changes per benchmark */
size_t count;
Expand All @@ -62,6 +70,142 @@ typedef struct {
secp256k1_fe* output_xonly;
} bench_data;

/*
* ABCD Calibration Benchmarks
*
* Measures the performance of each algorithm at various batch sizes and
* outputs. Use tools/ecmult_multi_calib.py to calculate optimal C and D
* values from the output.
*
* Each algorithm is only calibrated within its optimal batch size range
* to avoid skewing results with uninteresting results far away from that
* range.
*/
static void run_ecmult_multi_calib(bench_data* data) {
static const size_t batch_sizes[] = {
/* Small numbers should help stabilize Strauss intercept */
2, 3, 5, 7, 10, 15, 20, 30, 50, 70,
/* Crossover region between Strauss and Pippenger */
85, 88, 90, 100, 120, 150, 175,
/* Pippenger windows, getting progressively larger */
200, 300, 500, 750, 1000, 1200, 1500, 2000, 3000, 5000, 7500, 10000, 15000, 20000, 30000
};
static const size_t n_batch_sizes = sizeof(batch_sizes) / sizeof(batch_sizes[0]);

static const char* algo_names[] = {
"TRIVIAL", "STRAUSS", "PIPPENGER_1", "PIPPENGER_2", "PIPPENGER_3", "PIPPENGER_4", "PIPPENGER_5", "PIPPENGER_6", "PIPPENGER_7", "PIPPENGER_8", "PIPPENGER_9", "PIPPENGER_10", "PIPPENGER_11", "PIPPENGER_12"
};

/* Maximum batch size for Strauss calibration. */
static const size_t STRAUSS_MAX_CALIB_BATCH = 500;

/* Per-window min/max batch sizes for Pippenger calibration. */
static const size_t pippenger_min_calib_batch[12] = {
/*w=1 2 3 4 5 6 7 8 9 10 11 12 */
5, 5, 10, 30, 70, 150, 300, 750, 1500, 3000, 7500, 15000
};
static const size_t pippenger_max_calib_batch[12] = {
/*w=1 2 3 4 5 6 7 8 9 10 11 12 */
100, 200, 500, 1000, 2000, 5000, 10000, 20000, 30000, 30000, 30000, 30000
};

secp256k1_ge *points = NULL;
secp256k1_scalar *scalars = NULL;
secp256k1_gej result;
size_t max_points = batch_sizes[n_batch_sizes - 1];
int algo;
size_t i, j;
int base_iters = 1000;

points = (secp256k1_ge *)malloc(max_points * sizeof(secp256k1_ge));
scalars = (secp256k1_scalar *)malloc(max_points * sizeof(secp256k1_scalar));
CHECK(points != NULL);
CHECK(scalars != NULL);

for (i = 0; i < max_points; i++) {
points[i] = data->pubkeys[i % POINTS];
scalars[i] = data->scalars[i % POINTS];
}

printf("# ECMULT_MULTI Calibration Data\n");
printf("# Format: ALGO,N,TIME_US (microseconds per batch)\n");
printf("# Copy the DATA section below into the Python script\n");
printf("#\n");
printf("# BEGIN DATA\n");

/* Measure STRAUSS */
algo = SECP256K1_ECMULT_MULTI_ALGO_STRAUSS;
for (i = 0; i < n_batch_sizes; i++) {
size_t n = batch_sizes[i];
int64_t t_start, t_end;
double time_us;
int iters = base_iters;
int iter;

/* Only run up to the max to not skew result */
if (n > STRAUSS_MAX_CALIB_BATCH) continue;

/* Using many iterations in Strauss since batch sizes are small */
if (n >= 300) iters = base_iters / 2;
if (iters < 100) iters = 100;

t_start = gettime_i64();
for (iter = 0; iter < iters; iter++) {
secp256k1_ecmult_multi_internal(&data->ctx->error_callback, algo,
&result, n, points, scalars, NULL);
}
t_end = gettime_i64();

time_us = (double)(t_end - t_start) / iters;
printf("%s,%lu,%.3f\n", algo_names[algo], (unsigned long)n, time_us);
}

/* Measure PIPPENGER variants */
for (algo = SECP256K1_ECMULT_MULTI_ALGO_PIPPENGER_1;
algo <= SECP256K1_ECMULT_MULTI_ALGO_PIPPENGER_12;
algo++) {
int window = algo - SECP256K1_ECMULT_MULTI_ALGO_PIPPENGER_1;
size_t min_batch = pippenger_min_calib_batch[window];
size_t max_batch = pippenger_max_calib_batch[window];

for (j = 0; j < n_batch_sizes; j++) {
size_t n = batch_sizes[j];
int64_t t_start, t_end;
double time_us;
int iters = base_iters;
int iter;

/* Only run for the selected range of each algo */
if (n < min_batch || n > max_batch) continue;

/* Limiting iterations to keep run-time managable */
if (n >= 1000) iters = base_iters / 10;
if (n >= 5000) iters = base_iters / 50;
if (n >= 15000) iters = base_iters / 100;
if (iters < 3) iters = 3;

t_start = gettime_i64();
for (iter = 0; iter < iters; iter++) {
secp256k1_ecmult_multi_internal(&data->ctx->error_callback, algo,
&result, n, points, scalars, NULL);
}
t_end = gettime_i64();

time_us = (double)(t_end - t_start) / iters;
printf("%s,%lu,%.3f\n", algo_names[algo], (unsigned long)n, time_us);
}
}

printf("# END DATA\n");
printf("#\n");
printf("# To calculate ABCD constants, run:\n");
printf("# ./bench_ecmult calib 2>&1 | python3 tools/ecmult_multi_calib.py\n");
printf("#\n");

free(points);
free(scalars);
}

/* Hashes x into [0, POINTS) twice and store the result in offset1 and offset2. */
static void hash_into_offset(bench_data* data, size_t x) {
data->offset1 = (x * 0x537b7f6f + 0x8f66a481) % POINTS;
Expand Down Expand Up @@ -214,32 +358,54 @@ static void run_ecmult_bench(bench_data* data, int iters) {
run_benchmark(str, bench_ecmult_1p_g, bench_ecmult_setup, bench_ecmult_1p_g_teardown, data, 10, 2*iters);
}

/* Point/scalar supplier for the callback-based ecmult_multi interface.
 *
 * Writes the scalar and point for term `idx` into *sc and *ge. `arg` is the
 * bench_data for the current run. When data->includes_g is set, the requested
 * index is shifted up by one because slot 0 (the generator paired with
 * scalars[offset1]) is handled separately; slot 0 is therefore only produced
 * here when includes_g is unset and idx is 0. All other slots read from the
 * precomputed tables, wrapping modulo POINTS. Always returns 1. */
static int bench_ecmult_multi_callback(secp256k1_scalar* sc, secp256k1_ge* ge, size_t idx, void* arg) {
    const bench_data* bench = (const bench_data*)arg;
    size_t slot = idx;

    if (bench->includes_g) {
        slot += 1;
    }

    if (slot != 0) {
        *sc = bench->scalars[(bench->offset1 + slot) % POINTS];
        *ge = bench->pubkeys[(bench->offset2 + slot - 1) % POINTS];
        return 1;
    }

    *sc = bench->scalars[bench->offset1];
    *ge = secp256k1_ge_const_g;
    return 1;
}

/* Benchmark body: runs iters / data->count multi-multiplications of
 * data->count terms each and stores result number `iter` in
 * data->output[iter].
 *
 * arg:   the bench_data for this run.
 * iters: total number of point multiplications to perform; divided by
 *        data->count to get the number of batches.
 *
 * When data->includes_g is set, one of the `count` terms is the generator,
 * multiplied by scalars[offset1], and the remaining count - 1 terms come from
 * the pubkeys/scalars tables. The algorithm is data->forced_algo when that is
 * non-negative; otherwise it is chosen by secp256k1_ecmult_multi_select()
 * from data->mem_limit and the number of non-generator points.
 *
 * Each batch is computed exactly once, through
 * secp256k1_ecmult_multi_internal(); the former extra call through
 * data->ecmult_multi with a scratch space duplicated the work and overwrote
 * the same output slot, so it is no longer made. */
static void bench_ecmult_multi(void* arg, int iters) {
    bench_data* data = (bench_data*)arg;

    int includes_g = data->includes_g;
    int iter;
    int count = data->count;
    /* Number of terms taken from the pubkeys table (excludes the generator). */
    size_t n_points = count - includes_g;
    secp256k1_ecmult_multi_algo algo;
    secp256k1_ge *points = NULL;
    secp256k1_scalar *scalars = NULL;
    size_t i;
    iters = iters / data->count;

    /* With a single generator-only term there is nothing to copy, and the
     * NULL arrays are passed through together with n_points == 0. */
    if (n_points > 0) {
        points = (secp256k1_ge *)malloc(n_points * sizeof(secp256k1_ge));
        scalars = (secp256k1_scalar *)malloc(n_points * sizeof(secp256k1_scalar));
        CHECK(points != NULL);
        CHECK(scalars != NULL);
    }

    for (iter = 0; iter < iters; ++iter) {
        const secp256k1_scalar *g_scalar_ptr = NULL;

        if (includes_g) {
            g_scalar_ptr = &data->scalars[data->offset1];
        }

        /* Gather this batch's inputs. When the generator is included it
         * consumes scalars[offset1], so the table scalars start one later. */
        for (i = 0; i < n_points; ++i) {
            size_t idx = includes_g ? i + 1 : i;
            scalars[i] = data->scalars[(data->offset1 + idx) % POINTS];
            points[i] = data->pubkeys[(data->offset2 + i) % POINTS];
        }

        if (data->forced_algo >= 0) {
            algo = data->forced_algo;
        } else {
            algo = secp256k1_ecmult_multi_select(data->mem_limit, n_points);
        }

        CHECK(secp256k1_ecmult_multi_internal(&data->ctx->error_callback, algo, &data->output[iter],
                                              n_points, points, scalars, g_scalar_ptr));

        /* Advance the windows so the next batch uses different inputs. */
        data->offset1 = (data->offset1 + count) % POINTS;
        data->offset2 = (data->offset2 + count - 1) % POINTS;
    }

    free(points);
    free(scalars);
}

static void bench_ecmult_multi_setup(void* arg) {
Expand Down Expand Up @@ -309,7 +475,7 @@ static void run_ecmult_multi_bench(bench_data* data, size_t count, int includes_
int main(int argc, char **argv) {
bench_data data;
int i, p;
size_t scratch_size;
int run_calib = 0;

int default_iters = 10000;
int iters = get_iters(default_iters);
Expand All @@ -318,7 +484,8 @@ int main(int argc, char **argv) {
return EXIT_FAILURE;
}

data.ecmult_multi = secp256k1_ecmult_multi_var;
data.forced_algo = BENCH_ALGO_AUTO;
data.mem_limit = DEFAULT_MEM_LIMIT;

if (argc > 1) {
if(have_flag(argc, argv, "-h")
Expand All @@ -328,12 +495,19 @@ int main(int argc, char **argv) {
return EXIT_SUCCESS;
} else if(have_flag(argc, argv, "pippenger_wnaf")) {
printf("Using pippenger_wnaf:\n");
data.ecmult_multi = secp256k1_ecmult_pippenger_batch_single;
/* TODO: Make this a dynamic selection again */
data.forced_algo = SECP256K1_ECMULT_MULTI_ALGO_PIPPENGER_4;
} else if(have_flag(argc, argv, "strauss_wnaf")) {
printf("Using strauss_wnaf:\n");
data.ecmult_multi = secp256k1_ecmult_strauss_batch_single;
data.forced_algo = SECP256K1_ECMULT_MULTI_ALGO_STRAUSS;
} else if(have_flag(argc, argv, "simple")) {
printf("Using simple algorithm:\n");
data.forced_algo = SECP256K1_ECMULT_MULTI_ALGO_TRIVIAL;
} else if(have_flag(argc, argv, "auto")) {
printf("Using automatic algorithm selection:\n");
data.forced_algo = BENCH_ALGO_AUTO;
} else if(have_flag(argc, argv, "calib")) {
run_calib = 1;
} else {
fprintf(stderr, "%s: unrecognized argument '%s'.\n\n", argv[0], argv[1]);
help(argv, default_iters);
Expand All @@ -342,12 +516,6 @@ int main(int argc, char **argv) {
}

data.ctx = secp256k1_context_create(SECP256K1_CONTEXT_NONE);
scratch_size = secp256k1_strauss_scratch_size(POINTS) + STRAUSS_SCRATCH_OBJECTS*ALIGNMENT;
if (!have_flag(argc, argv, "simple")) {
data.scratch = secp256k1_scratch_space_create(data.ctx, scratch_size);
} else {
data.scratch = NULL;
}

/* Allocate stuff */
data.scalars = malloc(sizeof(secp256k1_scalar) * POINTS);
Expand All @@ -370,32 +538,32 @@ int main(int argc, char **argv) {
}
secp256k1_ge_set_all_gej_var(data.pubkeys, data.pubkeys_gej, POINTS);

if (run_calib) {
run_ecmult_multi_calib(&data);
} else {
print_output_table_header_row();
/* Initialize offset1 and offset2 */
hash_into_offset(&data, 0);
run_ecmult_bench(&data, iters);

print_output_table_header_row();
/* Initialize offset1 and offset2 */
hash_into_offset(&data, 0);
run_ecmult_bench(&data, iters);

for (i = 1; i <= 8; ++i) {
run_ecmult_multi_bench(&data, i, 1, iters);
}
for (i = 1; i <= 8; ++i) {
run_ecmult_multi_bench(&data, i, 1, iters);
}

/* This is disabled with low count of iterations because the loop runs 77 times even with iters=1
* and the higher it goes the longer the computation takes(more points)
* So we don't run this benchmark with low iterations to prevent slow down */
if (iters > 2) {
for (p = 0; p <= 11; ++p) {
for (i = 9; i <= 16; ++i) {
run_ecmult_multi_bench(&data, i << p, 1, iters);
/* This is disabled with low count of iterations because the loop runs 77 times even with iters=1
* and the higher it goes the longer the computation takes(more points)
* So we don't run this benchmark with low iterations to prevent slow down */
if (iters > 2) {
for (p = 0; p <= 11; ++p) {
for (i = 9; i <= 16; ++i) {
run_ecmult_multi_bench(&data, i << p, 1, iters);
}
}
} else {
printf("Skipping some benchmarks due to SECP256K1_BENCH_ITERS <= 2\n");
}
} else {
printf("Skipping some benchmarks due to SECP256K1_BENCH_ITERS <= 2\n");
}

if (data.scratch != NULL) {
secp256k1_scratch_space_destroy(data.ctx, data.scratch);
}
secp256k1_context_destroy(data.ctx);
free(data.scalars);
free(data.pubkeys);
Expand Down
Loading