ecmult_multi: Replace scratch space with malloc, use abcd cost model #1789

New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
fjahr wants to merge 2 commits into bitcoin-core:master from fjahr:2025-11-mem-multi-var
src/bench_ecmult.c
            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -16,9 +16,15 @@
  
    #include "scalar_impl.h"

    #include "ecmult_impl.h"

    #include "bench.h"

    #include "tests_common.h"

    /* Size of the shared pool of benchmark scalars and public keys. The pool
     * arrays are allocated with POINTS entries and every index into them is
     * reduced modulo POINTS. */
    #define POINTS 32768

    /* Default memory limit (64 MB). Stored in bench_data.mem_limit and handed
     * to secp256k1_ecmult_multi_select() when the algorithm is auto-selected. */

    #define DEFAULT_MEM_LIMIT (64 * 1024 * 1024)

    /* Select bench algorithm automatically. Any negative bench_data.forced_algo
     * value makes bench_ecmult_multi() pick the algorithm itself. */

    #define BENCH_ALGO_AUTO (-1)

    static void help(char **argv, int default_iters) {

        printf("Benchmark EC multiplication algorithms\n");

        printf("\n");

    @@ -30,23 +36,25 @@ static void help(char **argv, int default_iters) {
  
        printf("function name. The letter 'g' indicates that one of the points is the generator.\n");

        printf("The benchmarks are divided by the number of points.\n");

        printf("\n");

        printf("default (ecmult_multi): picks pippenger_wnaf or strauss_wnaf depending on the\n");

        printf("                        batch size\n");

        printf("pippenger_wnaf:         for all batch sizes\n");

        printf("strauss_wnaf:           for all batch sizes\n");

        printf("simple:                 multiply and sum each point individually\n");

        printf("default (auto):   automatically select best algorithm\n");

        printf("pippenger_wnaf:   for all batch sizes\n");

        printf("strauss_wnaf:     for all batch sizes\n");

        printf("simple:           multiply and sum each point individually\n");

        printf("\n");

    }

    typedef struct {

        /* Setup once in advance */

        secp256k1_context* ctx;

        secp256k1_scratch_space* scratch;

        secp256k1_scalar* scalars;

        secp256k1_ge* pubkeys;

        secp256k1_gej* pubkeys_gej;

        secp256k1_scalar* seckeys;

        secp256k1_gej* expected_output;

        secp256k1_ecmult_multi_func ecmult_multi;

        /* Algorithm selection */

        int forced_algo;

        size_t mem_limit;

        /* Changes per benchmark */

        size_t count;

    @@ -62,6 +70,142 @@ typedef struct {
  
        secp256k1_fe* output_xonly;

    } bench_data;

    /*

     * ABCD Calibration Benchmarks

     *

     * Measures the performance of each algorithm at various batch sizes and

     * outputs. Use tools/ecmult_multi_calib.py to calculate optimal C and D

     * values from the output.

     *

     * Each algorithm is only calibrated within its optimal batch size range

     * to avoid skewing results with uninteresting results far away from that

     * range.

     */

    static void run_ecmult_multi_calib(bench_data* data) {

        static const size_t batch_sizes[] = {

            /* Small numbers should help stabilize Strauss intercept */

            2, 3, 5, 7, 10, 15, 20, 30, 50, 70,

            /* Crossover region between Strauss and Pippenger */

            85, 88, 90, 100, 120, 150, 175,

            /* Pippenger windows, getting progressively larger */

            200, 300, 500, 750, 1000, 1200, 1500, 2000, 3000, 5000, 7500, 10000, 15000, 20000, 30000

        };

        static const size_t n_batch_sizes = sizeof(batch_sizes) / sizeof(batch_sizes[0]);

        static const char* algo_names[] = {

            "TRIVIAL", "STRAUSS", "PIPPENGER_1", "PIPPENGER_2", "PIPPENGER_3", "PIPPENGER_4", "PIPPENGER_5", "PIPPENGER_6", "PIPPENGER_7", "PIPPENGER_8", "PIPPENGER_9", "PIPPENGER_10", "PIPPENGER_11", "PIPPENGER_12"

        };

        /* Maximum batch size for Strauss calibration. */

        static const size_t STRAUSS_MAX_CALIB_BATCH = 500;

        /* Per-window min/max batch sizes for Pippenger calibration. */

        static const size_t pippenger_min_calib_batch[12] = {

        /*w=1   2   3   4   5   6    7    8    9     10    11    12 */

            5,  5,  10, 30, 70, 150, 300, 750, 1500, 3000, 7500, 15000

        };

        static const size_t pippenger_max_calib_batch[12] = {

        /*w=1    2    3    4     5     6     7      8      9      10     11     12 */

            100, 200, 500, 1000, 2000, 5000, 10000, 20000, 30000, 30000, 30000, 30000

        };

        secp256k1_ge *points = NULL;

        secp256k1_scalar *scalars = NULL;

        secp256k1_gej result;

        size_t max_points = batch_sizes[n_batch_sizes - 1];

        int algo;

        size_t i, j;

        int base_iters = 1000;

        points = (secp256k1_ge *)malloc(max_points * sizeof(secp256k1_ge));

        scalars = (secp256k1_scalar *)malloc(max_points * sizeof(secp256k1_scalar));

        CHECK(points != NULL);

        CHECK(scalars != NULL);

        for (i = 0; i < max_points; i++) {

            points[i] = data->pubkeys[i % POINTS];

            scalars[i] = data->scalars[i % POINTS];

        }

        printf("# ECMULT_MULTI Calibration Data\n");

        printf("# Format: ALGO,N,TIME_US (microseconds per batch)\n");

        printf("# Copy the DATA section below into the Python script\n");

        printf("#\n");

        printf("# BEGIN DATA\n");

        /* Measure STRAUSS */

        algo = SECP256K1_ECMULT_MULTI_ALGO_STRAUSS;

        for (i = 0; i < n_batch_sizes; i++) {

            size_t n = batch_sizes[i];

            int64_t t_start, t_end;

            double time_us;

            int iters = base_iters;

            int iter;

            /* Only run up to the max to not skew result */

            if (n > STRAUSS_MAX_CALIB_BATCH) continue;

            /* Using many iterations in Strauss since batch sizes are small */

            if (n >= 300) iters = base_iters / 2;

            if (iters < 100) iters = 100;

            t_start = gettime_i64();

            for (iter = 0; iter < iters; iter++) {

                secp256k1_ecmult_multi_internal(&data->ctx->error_callback, algo,

                                                &result, n, points, scalars, NULL);

            }

            t_end = gettime_i64();

            time_us = (double)(t_end - t_start) / iters;

            printf("%s,%lu,%.3f\n", algo_names[algo], (unsigned long)n, time_us);

        }

        /* Measure PIPPENGER variants */

        for (algo = SECP256K1_ECMULT_MULTI_ALGO_PIPPENGER_1;

             algo <= SECP256K1_ECMULT_MULTI_ALGO_PIPPENGER_12;

             algo++) {

            int window = algo - SECP256K1_ECMULT_MULTI_ALGO_PIPPENGER_1;

            size_t min_batch = pippenger_min_calib_batch[window];

            size_t max_batch = pippenger_max_calib_batch[window];

            for (j = 0; j < n_batch_sizes; j++) {

                size_t n = batch_sizes[j];

                int64_t t_start, t_end;

                double time_us;

                int iters = base_iters;

                int iter;

                /* Only run for the selected range of each algo */

                if (n < min_batch || n > max_batch) continue;

                /* Limiting iterations to keep run-time managable */

                if (n >= 1000) iters = base_iters / 10;

                if (n >= 5000) iters = base_iters / 50;

                if (n >= 15000) iters = base_iters / 100;

                if (iters < 3) iters = 3;

                t_start = gettime_i64();

                for (iter = 0; iter < iters; iter++) {

                    secp256k1_ecmult_multi_internal(&data->ctx->error_callback, algo,

                                                    &result, n, points, scalars, NULL);

                }

                t_end = gettime_i64();

                time_us = (double)(t_end - t_start) / iters;

                printf("%s,%lu,%.3f\n", algo_names[algo], (unsigned long)n, time_us);

            }

        }

        printf("# END DATA\n");

        printf("#\n");

        printf("# To calculate ABCD constants, run:\n");

        printf("#   ./bench_ecmult calib 2>&1 | python3 tools/ecmult_multi_calib.py\n");

        printf("#\n");

        free(points);

        free(scalars);

    }

    /* Hashes x into [0, POINTS) twice and store the result in offset1 and offset2. */

    static void hash_into_offset(bench_data* data, size_t x) {

        data->offset1 = (x * 0x537b7f6f + 0x8f66a481) % POINTS;

    @@ -214,32 +358,54 @@ static void run_ecmult_bench(bench_data* data, int iters) {
  
        run_benchmark(str, bench_ecmult_1p_g, bench_ecmult_setup, bench_ecmult_1p_g_teardown, data, 10, 2*iters);

    }

    /* Point/scalar provider for the callback-based multi-multiplication API.
     *
     * Writes the scalar and point for position `idx` into *sc and *ge and
     * always returns 1. `arg` is the bench_data of the running benchmark.
     * Position 0 yields the generator paired with scalars[offset1]; when
     * data->includes_g is set, the positions are shifted up by one so that the
     * callback itself never hands out that generator term. */
    static int bench_ecmult_multi_callback(secp256k1_scalar* sc, secp256k1_ge* ge, size_t idx, void* arg) {
        bench_data* data = (bench_data*)arg;
        size_t pos = data->includes_g ? idx + 1 : idx;

        if (pos != 0) {
            /* Scalars are read relative to offset1, points relative to offset2;
             * both wrap around the POINTS-sized pool. */
            *sc = data->scalars[(data->offset1 + pos) % POINTS];
            *ge = data->pubkeys[(data->offset2 + pos - 1) % POINTS];
            return 1;
        }

        *sc = data->scalars[data->offset1];
        *ge = secp256k1_ge_const_g;
        return 1;
    }

    static void bench_ecmult_multi(void* arg, int iters) {

        bench_data* data = (bench_data*)arg;

        int includes_g = data->includes_g;

        int iter;

        int count = data->count;

        size_t n_points = count - includes_g;

        secp256k1_ecmult_multi_algo algo;

        secp256k1_ge *points = NULL;

        secp256k1_scalar *scalars = NULL;

        size_t i;

        iters = iters / data->count;

        if (n_points > 0) {

            points = (secp256k1_ge *)malloc(n_points * sizeof(secp256k1_ge));

            scalars = (secp256k1_scalar *)malloc(n_points * sizeof(secp256k1_scalar));

            CHECK(points != NULL);

            CHECK(scalars != NULL);

        }

        for (iter = 0; iter < iters; ++iter) {

            data->ecmult_multi(&data->ctx->error_callback, data->scratch, &data->output[iter], data->includes_g ? &data->scalars[data->offset1] : NULL, bench_ecmult_multi_callback, arg, count - includes_g);

            const secp256k1_scalar *g_scalar_ptr = NULL;

            if (includes_g) {

                g_scalar_ptr = &data->scalars[data->offset1];

            }

            for (i = 0; i < n_points; ++i) {

                size_t idx = includes_g ? i + 1 : i;

                scalars[i] = data->scalars[(data->offset1 + idx) % POINTS];

                points[i] = data->pubkeys[(data->offset2 + i) % POINTS];

            }

            if (data->forced_algo >= 0) {

                algo = data->forced_algo;

            } else {

                algo = secp256k1_ecmult_multi_select(data->mem_limit, n_points);

            }

            CHECK(secp256k1_ecmult_multi_internal(&data->ctx->error_callback, algo, &data->output[iter],

                                                  n_points, points, scalars, g_scalar_ptr));

            data->offset1 = (data->offset1 + count) % POINTS;

            data->offset2 = (data->offset2 + count - 1) % POINTS;

        }

        free(points);

        free(scalars);

    }

    static void bench_ecmult_multi_setup(void* arg) {

    @@ -309,7 +475,7 @@ static void run_ecmult_multi_bench(bench_data* data, size_t count, int includes_
  
    int main(int argc, char **argv) {

        bench_data data;

        int i, p;

        size_t scratch_size;

        int run_calib = 0;

        int default_iters = 10000;

        int iters = get_iters(default_iters);

    @@ -318,7 +484,8 @@ int main(int argc, char **argv) {
  
            return EXIT_FAILURE;

        }

        data.ecmult_multi = secp256k1_ecmult_multi_var;

        data.forced_algo = BENCH_ALGO_AUTO;

        data.mem_limit = DEFAULT_MEM_LIMIT;

        if (argc > 1) {

            if(have_flag(argc, argv, "-h")

    @@ -328,12 +495,19 @@ int main(int argc, char **argv) {
  
                return EXIT_SUCCESS;

            } else if(have_flag(argc, argv, "pippenger_wnaf")) {

                printf("Using pippenger_wnaf:\n");

                data.ecmult_multi = secp256k1_ecmult_pippenger_batch_single;

                /* TODO: Make this a dynamic selection again */

                data.forced_algo = SECP256K1_ECMULT_MULTI_ALGO_PIPPENGER_4;

            } else if(have_flag(argc, argv, "strauss_wnaf")) {

                printf("Using strauss_wnaf:\n");

                data.ecmult_multi = secp256k1_ecmult_strauss_batch_single;

                data.forced_algo = SECP256K1_ECMULT_MULTI_ALGO_STRAUSS;

            } else if(have_flag(argc, argv, "simple")) {

                printf("Using simple algorithm:\n");

                data.forced_algo = SECP256K1_ECMULT_MULTI_ALGO_TRIVIAL;

            } else if(have_flag(argc, argv, "auto")) {

                printf("Using automatic algorithm selection:\n");

                data.forced_algo = BENCH_ALGO_AUTO;

            } else if(have_flag(argc, argv, "calib")) {

                run_calib = 1;

            } else {

                fprintf(stderr, "%s: unrecognized argument '%s'.\n\n", argv[0], argv[1]);

                help(argv, default_iters);

    @@ -342,12 +516,6 @@ int main(int argc, char **argv) {
  
        }

        data.ctx = secp256k1_context_create(SECP256K1_CONTEXT_NONE);

        scratch_size = secp256k1_strauss_scratch_size(POINTS) + STRAUSS_SCRATCH_OBJECTS*ALIGNMENT;

        if (!have_flag(argc, argv, "simple")) {

            data.scratch = secp256k1_scratch_space_create(data.ctx, scratch_size);

        } else {

            data.scratch = NULL;

        }

        /* Allocate stuff */

        data.scalars = malloc(sizeof(secp256k1_scalar) * POINTS);

    @@ -370,32 +538,32 @@ int main(int argc, char **argv) {
  
        }

        secp256k1_ge_set_all_gej_var(data.pubkeys, data.pubkeys_gej, POINTS);

        if (run_calib) {

            run_ecmult_multi_calib(&data);

        } else {

            print_output_table_header_row();

            /* Initialize offset1 and offset2 */

            hash_into_offset(&data, 0);

            run_ecmult_bench(&data, iters);

        print_output_table_header_row();

        /* Initialize offset1 and offset2 */

        hash_into_offset(&data, 0);

        run_ecmult_bench(&data, iters);

        for (i = 1; i <= 8; ++i) {

            run_ecmult_multi_bench(&data, i, 1, iters);

        }

            for (i = 1; i <= 8; ++i) {

                run_ecmult_multi_bench(&data, i, 1, iters);

            }

        /* This is disabled with low count of iterations because the loop runs 77 times even with iters=1

        * and the higher it goes the longer the computation takes(more points)

        * So we don't run this benchmark with low iterations to prevent slow down */

         if (iters > 2) {

            for (p = 0; p <= 11; ++p) {

                for (i = 9; i <= 16; ++i) {

                    run_ecmult_multi_bench(&data, i << p, 1, iters);

            /* This is disabled with low count of iterations because the loop runs 77 times even with iters=1

            * and the higher it goes the longer the computation takes(more points)

            * So we don't run this benchmark with low iterations to prevent slow down */

            if (iters > 2) {

                for (p = 0; p <= 11; ++p) {

                    for (i = 9; i <= 16; ++i) {

                        run_ecmult_multi_bench(&data, i << p, 1, iters);

                    }

                }

            } else {

                printf("Skipping some benchmarks due to SECP256K1_BENCH_ITERS <= 2\n");

            }

        } else {

            printf("Skipping some benchmarks due to SECP256K1_BENCH_ITERS <= 2\n");

        }

        if (data.scratch != NULL) {

            secp256k1_scratch_space_destroy(data.ctx, data.scratch);

        }

        secp256k1_context_destroy(data.ctx);

        free(data.scalars);

        free(data.pubkeys);
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

ecmult_multi: Replace scratch space with malloc, use abcd cost model #1789

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

ecmult_multi: Replace scratch space with malloc, use abcd cost model #1789

Are you sure you want to change the base?

Uh oh!

ecmult_multi: Replace scratch space with malloc, use abcd cost model #1789

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!