diff --git a/benchmark_analysis.ipynb b/benchmark_analysis.ipynb new file mode 100644 index 0000000000..53df29728f --- /dev/null +++ b/benchmark_analysis.ipynb @@ -0,0 +1,116 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b49ae6d6", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "plt.rcParams['figure.figsize'] = (16, 10)\n", + "plt.rcParams['font.size'] = 11" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d236980d", + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: 'BASELINE_bench.csv'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mFileNotFoundError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 17\u001b[39m\n\u001b[32m 14\u001b[39m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[32m 15\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m pd.DataFrame(data)\n\u001b[32m---> \u001b[39m\u001b[32m17\u001b[39m baseline = \u001b[43mparse_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mBASELINE_bench.csv\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 18\u001b[39m custom = parse_csv(\u001b[33m'\u001b[39m\u001b[33mCUSTOM_SIMD_bench.csv\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m 20\u001b[39m merged = baseline.merge(custom, on=\u001b[33m'\u001b[39m\u001b[33mBenchmark\u001b[39m\u001b[33m'\u001b[39m, suffixes=(\u001b[33m'\u001b[39m\u001b[33m_baseline\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33m_custom\u001b[39m\u001b[33m'\u001b[39m))\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 2\u001b[39m, in \u001b[36mparse_csv\u001b[39m\u001b[34m(filepath)\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mparse_csv\u001b[39m(filepath):\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfilepath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mr\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[32m 3\u001b[39m lines = f.readlines()[\u001b[32m1\u001b[39m:]\n\u001b[32m 5\u001b[39m data = []\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/secp256k1/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py:343\u001b[39m, in \u001b[36m_modified_open\u001b[39m\u001b[34m(file, *args, **kwargs)\u001b[39m\n\u001b[32m 336\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[32m0\u001b[39m, \u001b[32m1\u001b[39m, \u001b[32m2\u001b[39m}:\n\u001b[32m 337\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 338\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mIPython won\u001b[39m\u001b[33m'\u001b[39m\u001b[33mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m by default \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 339\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mas it is likely to crash IPython. 
If you know what you are doing, \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 340\u001b[39m \u001b[33m\"\u001b[39m\u001b[33myou can use builtins\u001b[39m\u001b[33m'\u001b[39m\u001b[33m open.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 341\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m343\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[31mFileNotFoundError\u001b[39m: [Errno 2] No such file or directory: 'BASELINE_bench.csv'" + ] + } + ], + "source": [ + "def parse_csv(filepath):\n", + " with open(filepath, 'r') as f:\n", + " lines = f.readlines()[1:]\n", + " \n", + " data = []\n", + " for line in lines:\n", + " line = line.strip()\n", + " if line and ',' in line and not line.endswith(','):\n", + " parts = line.split(',')\n", + " if len(parts) >= 3:\n", + " try:\n", + " data.append({'Benchmark': parts[0].strip(), 'Time': float(parts[2])})\n", + " except:\n", + " continue\n", + " return pd.DataFrame(data)\n", + "\n", + "baseline = parse_csv('BASELINE_bench.csv')\n", + "custom = parse_csv('CUSTOM_SIMD_bench.csv')\n", + "\n", + "merged = baseline.merge(custom, on='Benchmark', suffixes=('_baseline', '_custom'))\n", + "merged['improvement'] = ((merged['Time_baseline'] - merged['Time_custom']) / merged['Time_baseline']) * 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8442b12d", + "metadata": {}, + "outputs": [], + "source": [ + "sorted_data = merged.sort_values('improvement', ascending=False)\n", + "top10 = sorted_data.head(10)\n", + "bottom10 = sorted_data.tail(10)\n", + "filtered = pd.concat([top10, bottom10])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa07550a", + "metadata": {}, + "outputs": [], + "source": [ + "heatmap_data = filtered.set_index('Benchmark')[['improvement']]\n", + "\n", + "plt.figure(figsize=(8, 12))\n", + "sns.heatmap(heatmap_data, annot=True, fmt='.1f', cmap='RdYlGn', center=0, \n", + " cbar_kws={'label': 'Performance Improvement (%)'})\n", + "plt.title('CUSTOM_SIMD vs BASELINE Performance (Top/Bottom 10)', fontsize=14, fontweight='bold')\n", + "plt.ylabel('')\n", + "plt.tight_layout()\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/out b/out new file mode 100644 index 0000000000..e69de29bb2 diff --git a/simd-bench.sh b/simd-bench.sh new file mode 100755 index 0000000000..0820858d7c --- /dev/null +++ b/simd-bench.sh @@ -0,0 +1,57 @@ +#!/bin/bash +set -e + +options=("OFF" "ON") +BENCH_ITERS=${SECP256K1_BENCH_ITERS:-20000} + +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +NC='\033[0m' + +echo 1 | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo > /dev/null +sudo cpupower -c 0 frequency-set -g performance > /dev/null +command -v taskset > /dev/null && TASKSET_CMD="taskset -c 0" + +run_bench() { + local dir=$1 bin=$2 log=$3 + ( + cd "$dir" + $TASKSET_CMD env SECP256K1_BENCH_ITERS=$BENCH_ITERS nice -n 
0 ./bin/$bin >> "../../$log" 2>&1 + echo "" >> "../../$log" + ) +} + +bench_all() { + local config="$1" + local dir="build/$config" + local log="${config}_bench.csv" + + if [[ ! -d "$dir" ]]; then + echo -e "${RED}✖ $config${NC} (no dir)" + return 1 + fi + + { + echo "Benchmark results for $config" + echo "Generated on $(date)" + echo "Iterations: $BENCH_ITERS" + echo "" + } > "$log" + + for bin in bench bench_ecmult bench_internal; do + if run_bench "$dir" "$bin" "$log"; then + echo -e " ${GREEN}✔ $bin${NC}" + else + echo -e " ${RED}✖ $bin${NC}" + return 1 + fi + done + + echo -e "${GREEN}✔ $config${NC} (log: $log)" +} + +bench_all "BASELINE" +bench_all "CUSTOM_SIMD" + +echo -e "\n${YELLOW}All benchmarks successful. Logs in project root${NC}" \ No newline at end of file diff --git a/simd-build.sh b/simd-build.sh new file mode 100755 index 0000000000..eb09b2c37c --- /dev/null +++ b/simd-build.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +mkdir -p build + +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +NC='\033[0m' + +run_build() { + local config="$1" + local flags="-O3 -mavx -mavx2 $2" + local dir="build/$config" + local log="${config}_build.log" + + mkdir -p "$dir" + + if (cd "$dir" && cmake ../.. -G Ninja -DCMAKE_BUILD_TYPE=Release -DSECP256K1_APPEND_CFLAGS="$flags" >"../../$log" 2>&1 && ninja >>"../../$log" 2>&1); then + echo -e "${GREEN}✔ $config${NC}" + else + echo -e "${RED}✖ $config failed${NC}" + return 1 + fi +} + +run_build "BASELINE" "-U__AVX__ -U__AVX2__" +run_build "CUSTOM_SIMD" "-D__AVX__ -D__AVX2__" + +echo -e "\n${YELLOW}All builds done. Logs in project root${NC}" \ No newline at end of file diff --git a/simd-test.sh b/simd-test.sh new file mode 100755 index 0000000000..af92035557 --- /dev/null +++ b/simd-test.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +NC='\033[0m' + +run_test() { + local config="$1" + local dir="build/$config" + local log="${config}_test.log" + + if [[ ! -d "$dir" ]]; then + echo -e "${RED}✖ $config${NC} (no dir)" + return 1 + fi + + if (cd "$dir" && ctest --output-on-failure -j"$(nproc)" &> "../../$log"); then + echo -e "${GREEN}✔ $config${NC} (log: $log)" + else + echo -e "${RED}✖ $config${NC} (log: $log)" + return 1 + fi +} + +run_test "BASELINE" +run_test "CUSTOM_SIMD" + +echo -e "\n${YELLOW}All tests passed. 
Logs in project root${NC}" \ No newline at end of file diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index ea14c27318..9e53e30436 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -38,16 +38,20 @@ static void secp256k1_fe_impl_verify(const secp256k1_fe *a) { #endif static void secp256k1_fe_impl_get_bounds(secp256k1_fe *r, int m) { - r->n[0] = 0x3FFFFFFUL * 2 * m; - r->n[1] = 0x3FFFFFFUL * 2 * m; - r->n[2] = 0x3FFFFFFUL * 2 * m; - r->n[3] = 0x3FFFFFFUL * 2 * m; - r->n[4] = 0x3FFFFFFUL * 2 * m; - r->n[5] = 0x3FFFFFFUL * 2 * m; - r->n[6] = 0x3FFFFFFUL * 2 * m; - r->n[7] = 0x3FFFFFFUL * 2 * m; - r->n[8] = 0x3FFFFFFUL * 2 * m; - r->n[9] = 0x03FFFFFUL * 2 * m; + const uint64_t two_m = 2 * m; + const uint64_t bound1 = 0x3FFFFFFUL * two_m; + const uint64_t bound2 = 0x03FFFFFUL * two_m; + + r->n[0] = bound1; + r->n[1] = bound1; + r->n[2] = bound1; + r->n[3] = bound1; + r->n[4] = bound1; + r->n[5] = bound1; + r->n[6] = bound1; + r->n[7] = bound1; + r->n[8] = bound1; + r->n[9] = bound2; } static void secp256k1_fe_impl_normalize(secp256k1_fe *r) { @@ -257,8 +261,8 @@ static int secp256k1_fe_impl_normalizes_to_zero_var(const secp256k1_fe *r) { } SECP256K1_INLINE static void secp256k1_fe_impl_set_int(secp256k1_fe *r, int a) { + memset(r->n, 0, sizeof(r->n)); r->n[0] = a; - r->n[1] = r->n[2] = r->n[3] = r->n[4] = r->n[5] = r->n[6] = r->n[7] = r->n[8] = r->n[9] = 0; } SECP256K1_INLINE static int secp256k1_fe_impl_is_zero(const secp256k1_fe *a) { @@ -272,12 +276,11 @@ SECP256K1_INLINE static int secp256k1_fe_impl_is_odd(const secp256k1_fe *a) { static int secp256k1_fe_impl_cmp_var(const secp256k1_fe *a, const secp256k1_fe *b) { int i; + int diff; for (i = 9; i >= 0; i--) { - if (a->n[i] > b->n[i]) { - return 1; - } - if (a->n[i] < b->n[i]) { - return -1; + diff = (a->n[i] > b->n[i]) - (a->n[i] < b->n[i]); + if (diff != 0) { + return diff; } } return 0; @@ -338,24 +341,30 @@ static void secp256k1_fe_impl_get_b32(unsigned char *r, const secp256k1_fe *a) { } SECP256K1_INLINE static void secp256k1_fe_impl_negate_unchecked(secp256k1_fe *r, const secp256k1_fe *a, int m) { + const uint32_t two_m1 = 2 * (m + 1); + const uint32_t bound1 = 0x3FFFC2FUL * two_m1; + const uint32_t bound2 = 0x3FFFFBFUL * two_m1; + const uint32_t bound3 = 0x3FFFFFFUL * two_m1; + const uint32_t bound4 = 0x03FFFFFUL * two_m1; + /* For all legal values of m (0..31), the following properties hold: */ - VERIFY_CHECK(0x3FFFC2FUL * 2 * (m + 1) >= 0x3FFFFFFUL * 2 * m); - VERIFY_CHECK(0x3FFFFBFUL * 2 * (m + 1) >= 0x3FFFFFFUL * 2 * m); - VERIFY_CHECK(0x3FFFFFFUL * 2 * (m + 1) >= 0x3FFFFFFUL * 2 * m); - VERIFY_CHECK(0x03FFFFFUL * 2 * (m + 1) >= 0x03FFFFFUL * 2 * m); + VERIFY_CHECK(bound1 >= 0x3FFFFFFUL * 2 * m); + VERIFY_CHECK(bound2 >= 0x3FFFFFFUL * 2 * m); + VERIFY_CHECK(bound3 >= 0x3FFFFFFUL * 2 * m); + VERIFY_CHECK(bound4 >= 0x03FFFFFUL * 2 * m); /* Due to the properties above, the left hand in the subtractions below is never less than * the right hand. 
*/ - r->n[0] = 0x3FFFC2FUL * 2 * (m + 1) - a->n[0]; - r->n[1] = 0x3FFFFBFUL * 2 * (m + 1) - a->n[1]; - r->n[2] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[2]; - r->n[3] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[3]; - r->n[4] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[4]; - r->n[5] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[5]; - r->n[6] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[6]; - r->n[7] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[7]; - r->n[8] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[8]; - r->n[9] = 0x03FFFFFUL * 2 * (m + 1) - a->n[9]; + r->n[0] = bound1 - a->n[0]; + r->n[1] = bound2 - a->n[1]; + r->n[2] = bound3 - a->n[2]; + r->n[3] = bound3 - a->n[3]; + r->n[4] = bound3 - a->n[4]; + r->n[5] = bound3 - a->n[5]; + r->n[6] = bound3 - a->n[6]; + r->n[7] = bound3 - a->n[7]; + r->n[8] = bound3 - a->n[8]; + r->n[9] = bound4 - a->n[9]; } SECP256K1_INLINE static void secp256k1_fe_impl_mul_int_unchecked(secp256k1_fe *r, int a) { @@ -1111,24 +1120,24 @@ static SECP256K1_INLINE void secp256k1_fe_storage_cmov(secp256k1_fe_storage *r, } static void secp256k1_fe_impl_to_storage(secp256k1_fe_storage *r, const secp256k1_fe *a) { - r->n[0] = a->n[0] | a->n[1] << 26; - r->n[1] = a->n[1] >> 6 | a->n[2] << 20; + r->n[0] = a->n[0] | a->n[1] << 26; + r->n[1] = a->n[1] >> 6 | a->n[2] << 20; r->n[2] = a->n[2] >> 12 | a->n[3] << 14; r->n[3] = a->n[3] >> 18 | a->n[4] << 8; r->n[4] = a->n[4] >> 24 | a->n[5] << 2 | a->n[6] << 28; - r->n[5] = a->n[6] >> 4 | a->n[7] << 22; + r->n[5] = a->n[6] >> 4 | a->n[7] << 22; r->n[6] = a->n[7] >> 10 | a->n[8] << 16; r->n[7] = a->n[8] >> 16 | a->n[9] << 10; } static SECP256K1_INLINE void secp256k1_fe_impl_from_storage(secp256k1_fe *r, const secp256k1_fe_storage *a) { r->n[0] = a->n[0] & 0x3FFFFFFUL; - r->n[1] = a->n[0] >> 26 | ((a->n[1] << 6) & 0x3FFFFFFUL); + r->n[1] = a->n[0] >> 26 | ((a->n[1] << 6) & 0x3FFFFFFUL); r->n[2] = a->n[1] >> 20 | ((a->n[2] << 12) & 0x3FFFFFFUL); r->n[3] = a->n[2] >> 14 | ((a->n[3] << 18) & 0x3FFFFFFUL); - r->n[4] = a->n[3] >> 8 | ((a->n[4] << 24) & 0x3FFFFFFUL); + r->n[4] = a->n[3] >> 8 | ((a->n[4] << 24) & 0x3FFFFFFUL); r->n[5] = (a->n[4] >> 2) & 0x3FFFFFFUL; - r->n[6] = a->n[4] >> 28 | ((a->n[5] << 4) & 0x3FFFFFFUL); + r->n[6] = a->n[4] >> 28 | ((a->n[5] << 4) & 0x3FFFFFFUL); r->n[7] = a->n[5] >> 22 | ((a->n[6] << 10) & 0x3FFFFFFUL); r->n[8] = a->n[6] >> 16 | ((a->n[7] << 16) & 0x3FFFFFFUL); r->n[9] = a->n[7] >> 10; diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 46dca6b981..9edec46050 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -14,6 +14,10 @@ #include "field_5x52_int128_impl.h" +#ifdef X86 +# include <immintrin.h> +#endif + #ifdef VERIFY static void secp256k1_fe_impl_verify(const secp256k1_fe *a) { const uint64_t *d = a->n; @@ -33,11 +37,20 @@ static void secp256k1_fe_impl_verify(const secp256k1_fe *a) { #endif static void secp256k1_fe_impl_get_bounds(secp256k1_fe *r, int m) { - r->n[0] = 0xFFFFFFFFFFFFFULL * 2 * m; - r->n[1] = 0xFFFFFFFFFFFFFULL * 2 * m; - r->n[2] = 0xFFFFFFFFFFFFFULL * 2 * m; - r->n[3] = 0xFFFFFFFFFFFFFULL * 2 * m; - r->n[4] = 0x0FFFFFFFFFFFFULL * 2 * m; + const uint64_t two_m = 2 * m; + const uint64_t bound1 = 0xFFFFFFFFFFFFFULL * two_m; + const uint64_t bound2 = 0x0FFFFFFFFFFFFULL * two_m; + +#ifdef __AVX__ + __m256i vec = _mm256_set1_epi64x(bound1); + _mm256_storeu_si256((__m256i *)r->n, vec); +#else + r->n[0] = bound1; + r->n[1] = bound1; + r->n[2] = bound1; + r->n[3] = bound1; +#endif + r->n[4] = bound2; } static void secp256k1_fe_impl_normalize(secp256k1_fe *r) { @@ -199,8 +212,8 @@ static int secp256k1_fe_impl_normalizes_to_zero_var(const
secp256k1_fe *r) { } SECP256K1_INLINE static void secp256k1_fe_impl_set_int(secp256k1_fe *r, int a) { + memset(r->n, 0, sizeof(r->n)); r->n[0] = a; - r->n[1] = r->n[2] = r->n[3] = r->n[4] = 0; } SECP256K1_INLINE static int secp256k1_fe_impl_is_zero(const secp256k1_fe *a) { @@ -214,52 +227,35 @@ SECP256K1_INLINE static int secp256k1_fe_impl_is_odd(const secp256k1_fe *a) { static int secp256k1_fe_impl_cmp_var(const secp256k1_fe *a, const secp256k1_fe *b) { int i; + int8_t diff; for (i = 4; i >= 0; i--) { - if (a->n[i] > b->n[i]) { - return 1; - } - if (a->n[i] < b->n[i]) { - return -1; + diff = (a->n[i] > b->n[i]) - (a->n[i] < b->n[i]); + if (diff != 0) { + return diff; } } return 0; } static void secp256k1_fe_impl_set_b32_mod(secp256k1_fe *r, const unsigned char *a) { - r->n[0] = (uint64_t)a[31] - | ((uint64_t)a[30] << 8) - | ((uint64_t)a[29] << 16) - | ((uint64_t)a[28] << 24) - | ((uint64_t)a[27] << 32) - | ((uint64_t)a[26] << 40) - | ((uint64_t)(a[25] & 0xF) << 48); - r->n[1] = (uint64_t)((a[25] >> 4) & 0xF) - | ((uint64_t)a[24] << 4) - | ((uint64_t)a[23] << 12) - | ((uint64_t)a[22] << 20) - | ((uint64_t)a[21] << 28) - | ((uint64_t)a[20] << 36) - | ((uint64_t)a[19] << 44); - r->n[2] = (uint64_t)a[18] - | ((uint64_t)a[17] << 8) - | ((uint64_t)a[16] << 16) - | ((uint64_t)a[15] << 24) - | ((uint64_t)a[14] << 32) - | ((uint64_t)a[13] << 40) - | ((uint64_t)(a[12] & 0xF) << 48); - r->n[3] = (uint64_t)((a[12] >> 4) & 0xF) - | ((uint64_t)a[11] << 4) - | ((uint64_t)a[10] << 12) - | ((uint64_t)a[9] << 20) - | ((uint64_t)a[8] << 28) - | ((uint64_t)a[7] << 36) - | ((uint64_t)a[6] << 44); - r->n[4] = (uint64_t)a[5] - | ((uint64_t)a[4] << 8) - | ((uint64_t)a[3] << 16) - | ((uint64_t)a[2] << 24) - | ((uint64_t)a[1] << 32) - | ((uint64_t)a[0] << 40); + uint64_t limbs[4]; + memcpy(limbs, a, 32); + +#ifdef SECP256K1_LITTLE_ENDIAN + limbs[0] = BYTESWAP_64(limbs[0]); + limbs[1] = BYTESWAP_64(limbs[1]); + limbs[2] = BYTESWAP_64(limbs[2]); + limbs[3] = BYTESWAP_64(limbs[3]); +#endif + + /* TODO: parallelize avx2 */ + + r->n[0] = (limbs[3] & 0xFFFFFFFFFFFFFULL); + r->n[1] = (limbs[3] >> 52) | ((limbs[2] & 0xFFFFFFFFFFULL) << 12); + r->n[2] = (limbs[2] >> 40) | ((limbs[1] & 0xFFFFFFFULL) << 24); + r->n[3] = (limbs[1] >> 28) | ((limbs[0] & 0xFFFFULL) << 36); + + r->n[4] = (limbs[0] >> 16) & 0xFFFFFFFFFFFFULL; } static int secp256k1_fe_impl_set_b32_limit(secp256k1_fe *r, const unsigned char *a) { @@ -269,53 +265,72 @@ static int secp256k1_fe_impl_set_b32_limit(secp256k1_fe *r, const unsigned char /** Convert a field element to a 32-byte big endian value. 
Requires the input to be normalized */ static void secp256k1_fe_impl_get_b32(unsigned char *r, const secp256k1_fe *a) { - r[0] = (a->n[4] >> 40) & 0xFF; - r[1] = (a->n[4] >> 32) & 0xFF; - r[2] = (a->n[4] >> 24) & 0xFF; - r[3] = (a->n[4] >> 16) & 0xFF; - r[4] = (a->n[4] >> 8) & 0xFF; - r[5] = a->n[4] & 0xFF; - r[6] = (a->n[3] >> 44) & 0xFF; - r[7] = (a->n[3] >> 36) & 0xFF; - r[8] = (a->n[3] >> 28) & 0xFF; - r[9] = (a->n[3] >> 20) & 0xFF; - r[10] = (a->n[3] >> 12) & 0xFF; - r[11] = (a->n[3] >> 4) & 0xFF; - r[12] = ((a->n[2] >> 48) & 0xF) | ((a->n[3] & 0xF) << 4); - r[13] = (a->n[2] >> 40) & 0xFF; - r[14] = (a->n[2] >> 32) & 0xFF; - r[15] = (a->n[2] >> 24) & 0xFF; - r[16] = (a->n[2] >> 16) & 0xFF; - r[17] = (a->n[2] >> 8) & 0xFF; - r[18] = a->n[2] & 0xFF; - r[19] = (a->n[1] >> 44) & 0xFF; - r[20] = (a->n[1] >> 36) & 0xFF; - r[21] = (a->n[1] >> 28) & 0xFF; - r[22] = (a->n[1] >> 20) & 0xFF; - r[23] = (a->n[1] >> 12) & 0xFF; - r[24] = (a->n[1] >> 4) & 0xFF; - r[25] = ((a->n[0] >> 48) & 0xF) | ((a->n[1] & 0xF) << 4); - r[26] = (a->n[0] >> 40) & 0xFF; - r[27] = (a->n[0] >> 32) & 0xFF; - r[28] = (a->n[0] >> 24) & 0xFF; - r[29] = (a->n[0] >> 16) & 0xFF; - r[30] = (a->n[0] >> 8) & 0xFF; - r[31] = a->n[0] & 0xFF; + const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4]; + + r[0] = (a4 >> 40) & 0xFF; + r[1] = (a4 >> 32) & 0xFF; + r[2] = (a4 >> 24) & 0xFF; + r[3] = (a4 >> 16) & 0xFF; + r[4] = (a4 >> 8) & 0xFF; + r[5] = a4 & 0xFF; + r[6] = (a3 >> 44) & 0xFF; + r[7] = (a3 >> 36) & 0xFF; + r[8] = (a3 >> 28) & 0xFF; + r[9] = (a3 >> 20) & 0xFF; + r[10] = (a3 >> 12) & 0xFF; + r[11] = (a3 >> 4) & 0xFF; + r[12] = ((a2 >> 48) & 0xF) | ((a3 & 0xF) << 4); + r[13] = (a2 >> 40) & 0xFF; + r[14] = (a2 >> 32) & 0xFF; + r[15] = (a2 >> 24) & 0xFF; + r[16] = (a2 >> 16) & 0xFF; + r[17] = (a2 >> 8) & 0xFF; + r[18] = a2 & 0xFF; + r[19] = (a1 >> 44) & 0xFF; + r[20] = (a1 >> 36) & 0xFF; + r[21] = (a1 >> 28) & 0xFF; + r[22] = (a1 >> 20) & 0xFF; + r[23] = (a1 >> 12) & 0xFF; + r[24] = (a1 >> 4) & 0xFF; + r[25] = ((a0 >> 48) & 0xF) | ((a1 & 0xF) << 4); + r[26] = (a0 >> 40) & 0xFF; + r[27] = (a0 >> 32) & 0xFF; + r[28] = (a0 >> 24) & 0xFF; + r[29] = (a0 >> 16) & 0xFF; + r[30] = (a0 >> 8) & 0xFF; + r[31] = a0 & 0xFF; } SECP256K1_INLINE static void secp256k1_fe_impl_negate_unchecked(secp256k1_fe *r, const secp256k1_fe *a, int m) { +#if defined(__AVX__) && defined(__AVX2__) + /* load here to mitigate load latency */ + __m256i vec_a = _mm256_loadu_si256((__m256i *)a->n); +#endif + const uint32_t two_m1 = 2 * (m + 1); + const uint64_t bound1 = 0xFFFFEFFFFFC2FULL * two_m1; + const uint64_t bound2 = 0xFFFFFFFFFFFFFULL * two_m1; + const uint64_t bound3 = 0x0FFFFFFFFFFFFULL * two_m1; + /* For all legal values of m (0..31), the following properties hold: */ - VERIFY_CHECK(0xFFFFEFFFFFC2FULL * 2 * (m + 1) >= 0xFFFFFFFFFFFFFULL * 2 * m); - VERIFY_CHECK(0xFFFFFFFFFFFFFULL * 2 * (m + 1) >= 0xFFFFFFFFFFFFFULL * 2 * m); - VERIFY_CHECK(0x0FFFFFFFFFFFFULL * 2 * (m + 1) >= 0x0FFFFFFFFFFFFULL * 2 * m); + VERIFY_CHECK(bound1 >= 0xFFFFFFFFFFFFFULL * 2 * m); + VERIFY_CHECK(bound2 >= 0xFFFFFFFFFFFFFULL * 2 * m); + VERIFY_CHECK(bound3 >= 0x0FFFFFFFFFFFFULL * 2 * m); /* Due to the properties above, the left hand in the subtractions below is never less than * the right hand. 
*/ - r->n[0] = 0xFFFFEFFFFFC2FULL * 2 * (m + 1) - a->n[0]; - r->n[1] = 0xFFFFFFFFFFFFFULL * 2 * (m + 1) - a->n[1]; - r->n[2] = 0xFFFFFFFFFFFFFULL * 2 * (m + 1) - a->n[2]; - r->n[3] = 0xFFFFFFFFFFFFFULL * 2 * (m + 1) - a->n[3]; - r->n[4] = 0x0FFFFFFFFFFFFULL * 2 * (m + 1) - a->n[4]; +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i vec_bounds = _mm256_setr_epi64x(bound1, bound2, bound2, bound2); + __m256i out = _mm256_sub_epi64(vec_bounds, vec_a); + _mm256_storeu_si256((__m256i *)r->n, out); + } +#else + r->n[0] = bound1 - a->n[0]; + r->n[1] = bound2 - a->n[1]; + r->n[2] = bound2 - a->n[2]; + r->n[3] = bound2 - a->n[3]; +#endif + r->n[4] = bound3 - a->n[4]; } SECP256K1_INLINE static void secp256k1_fe_impl_mul_int_unchecked(secp256k1_fe *r, int a) { @@ -347,15 +362,32 @@ SECP256K1_INLINE static void secp256k1_fe_impl_sqr(secp256k1_fe *r, const secp25 } SECP256K1_INLINE static void secp256k1_fe_impl_cmov(secp256k1_fe *r, const secp256k1_fe *a, int flag) { +#if defined(__AVX__) && defined(__AVX2__) + /* load here to mitigate load latency */ + __m256i vec_r = _mm256_loadu_si256((__m256i *)(r->n)); + __m256i vec_a = _mm256_loadu_si256((__m256i *)(a->n)); +#endif + uint64_t mask0, mask1; volatile int vflag = flag; SECP256K1_CHECKMEM_CHECK_VERIFY(r->n, sizeof(r->n)); mask0 = vflag + ~((uint64_t)0); mask1 = ~mask0; + +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i vec_mask0 = _mm256_set1_epi64x(mask0); + __m256i vec_mask1 = _mm256_set1_epi64x(mask1); + vec_r = _mm256_and_si256(vec_r, vec_mask0); + vec_a = _mm256_and_si256(vec_a, vec_mask1); + _mm256_storeu_si256((__m256i *)r->n, _mm256_or_si256(vec_r, vec_a)); + } +#else r->n[0] = (r->n[0] & mask0) | (a->n[0] & mask1); r->n[1] = (r->n[1] & mask0) | (a->n[1] & mask1); r->n[2] = (r->n[2] & mask0) | (a->n[2] & mask1); r->n[3] = (r->n[3] & mask0) | (a->n[3] & mask1); +#endif r->n[4] = (r->n[4] & mask0) | (a->n[4] & mask1); } @@ -426,18 +458,43 @@ static SECP256K1_INLINE void secp256k1_fe_storage_cmov(secp256k1_fe_storage *r, } static void secp256k1_fe_impl_to_storage(secp256k1_fe_storage *r, const secp256k1_fe *a) { - r->n[0] = a->n[0] | a->n[1] << 52; +#if defined(__AVX__) && defined(__AVX2__) + __m256i limbs_0123 = _mm256_loadu_si256((__m256i *)a->n); + __m256i limbs_1234 = _mm256_loadu_si256((__m256i *)(a->n + 1)); + const __m256i shift_lhs = _mm256_setr_epi64x(0, 12, 24, 36); /* TODO: precompute */ + const __m256i shift_rhs = _mm256_setr_epi64x(52, 40, 28, 16); /* TODO: precompute */ + __m256i rhs = _mm256_sllv_epi64(limbs_1234, shift_rhs); + __m256i lhs = _mm256_srlv_epi64(limbs_0123, shift_lhs); + _mm256_storeu_si256((__m256i *)r->n, _mm256_or_si256(lhs, rhs)); +#else + r->n[0] = a->n[0] | a->n[1] << 52; r->n[1] = a->n[1] >> 12 | a->n[2] << 40; r->n[2] = a->n[2] >> 24 | a->n[3] << 28; r->n[3] = a->n[3] >> 36 | a->n[4] << 16; +#endif } static SECP256K1_INLINE void secp256k1_fe_impl_from_storage(secp256k1_fe *r, const secp256k1_fe_storage *a) { - r->n[0] = a->n[0] & 0xFFFFFFFFFFFFFULL; - r->n[1] = a->n[0] >> 52 | ((a->n[1] << 12) & 0xFFFFFFFFFFFFFULL); - r->n[2] = a->n[1] >> 40 | ((a->n[2] << 24) & 0xFFFFFFFFFFFFFULL); - r->n[3] = a->n[2] >> 28 | ((a->n[3] << 36) & 0xFFFFFFFFFFFFFULL); - r->n[4] = a->n[3] >> 16; + const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3]; + +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3); + __m256i limbs_0012 = _mm256_setr_epi64x(a0, a0, a1, a2); + const __m256i shift_lhs = _mm256_setr_epi64x(64, 52, 40, 28); /* TODO: precompute */ 
+ const __m256i shift_rhs = _mm256_setr_epi64x(0, 12, 24, 36); /* TODO: precompute */ + const __m256i mask52 = _mm256_set1_epi64x(0xFFFFFFFFFFFFFULL); /* TODO: precompute */ + __m256i rhs = _mm256_and_si256(_mm256_sllv_epi64(limbs_0123, shift_rhs), mask52); + __m256i lhs = _mm256_srlv_epi64(limbs_0012, shift_lhs); + _mm256_storeu_si256((__m256i*)r->n, _mm256_or_si256(lhs, rhs)); + } +#else + r->n[0] = a0 & 0xFFFFFFFFFFFFFULL; + r->n[1] = a0 >> 52 | ((a1 << 12) & 0xFFFFFFFFFFFFFULL); + r->n[2] = a1 >> 40 | ((a2 << 24) & 0xFFFFFFFFFFFFFULL); + r->n[3] = a2 >> 28 | ((a3 << 36) & 0xFFFFFFFFFFFFFULL); +#endif + r->n[4] = a3 >> 16; } static void secp256k1_fe_from_signed62(secp256k1_fe *r, const secp256k1_modinv64_signed62 *a) { @@ -453,10 +510,24 @@ static void secp256k1_fe_from_signed62(secp256k1_fe *r, const secp256k1_modinv64 VERIFY_CHECK(a3 >> 62 == 0); VERIFY_CHECK(a4 >> 8 == 0); +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3); + __m256i limbs_0012 = _mm256_setr_epi64x(a0, a0, a1, a2); + const __m256i shift_lhs = _mm256_setr_epi64x(64, 52, 42, 32); /*TODO: precompute */ + const __m256i shift_rhs = _mm256_setr_epi64x(0, 10, 20, 30); /*TODO: precompute */ + const __m256i mask52 = _mm256_set1_epi64x(M52); /*TODO: precompute */ + __m256i rhs = _mm256_sllv_epi64(limbs_0123, shift_rhs); + __m256i lhs = _mm256_srlv_epi64(limbs_0012, shift_lhs); + __m256i out = _mm256_or_si256(lhs, rhs); + _mm256_storeu_si256((__m256i*)r->n, _mm256_and_si256(out, mask52)); + } +#else r->n[0] = a0 & M52; r->n[1] = (a0 >> 52 | a1 << 10) & M52; r->n[2] = (a1 >> 42 | a2 << 20) & M52; r->n[3] = (a2 >> 32 | a3 << 30) & M52; +#endif r->n[4] = (a3 >> 22 | a4 << 40); } @@ -464,10 +535,24 @@ static void secp256k1_fe_to_signed62(secp256k1_modinv64_signed62 *r, const secp2 const uint64_t M62 = UINT64_MAX >> 2; const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4]; +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3); + __m256i limbs_1234 = _mm256_setr_epi64x(a1, a2, a3, a4); + const __m256i shift_lhs = _mm256_setr_epi64x(0, 10, 20, 30); /*TODO: precompute */ + const __m256i shift_rhs = _mm256_setr_epi64x(52, 42, 32, 22); /*TODO: precompute */ + const __m256i mask62 = _mm256_set1_epi64x(M62); /*TODO: precompute */ + __m256i lhs = _mm256_srlv_epi64(limbs_0123, shift_lhs); + __m256i rhs = _mm256_sllv_epi64(limbs_1234, shift_rhs); + __m256i out = _mm256_or_si256(lhs, rhs); + _mm256_storeu_si256((__m256i *)r->v, _mm256_and_si256(out, mask62)); + } +#else r->v[0] = (a0 | a1 << 52) & M62; r->v[1] = (a1 >> 10 | a2 << 42) & M62; r->v[2] = (a2 >> 20 | a3 << 32) & M62; r->v[3] = (a3 >> 30 | a4 << 22) & M62; +#endif r->v[4] = a4 >> 40; } diff --git a/src/field_5x52_int128_impl.h b/src/field_5x52_int128_impl.h index f23f8ee1c4..309b2c1619 100644 --- a/src/field_5x52_int128_impl.h +++ b/src/field_5x52_int128_impl.h @@ -18,7 +18,7 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) { secp256k1_uint128 c, d; uint64_t t3, t4, tx, u0; - uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; + const uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL; VERIFY_BITS(a[0], 56); diff --git a/src/hash_impl.h b/src/hash_impl.h index 1065acd643..f3b5131d93 100644 --- a/src/hash_impl.h +++ b/src/hash_impl.h @@ -16,9 +16,9 @@ #define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ 
(z)))) #define Maj(x,y,z) (((x) & (y)) | ((z) & ((x) | (y)))) -#define Sigma0(x) (((x) >> 2 | (x) << 30) ^ ((x) >> 13 | (x) << 19) ^ ((x) >> 22 | (x) << 10)) -#define Sigma1(x) (((x) >> 6 | (x) << 26) ^ ((x) >> 11 | (x) << 21) ^ ((x) >> 25 | (x) << 7)) -#define sigma0(x) (((x) >> 7 | (x) << 25) ^ ((x) >> 18 | (x) << 14) ^ ((x) >> 3)) +#define Sigma0(x) (((x) >> 2 | (x) << 30) ^ ((x) >> 13 | (x) << 19) ^ ((x) >> 22 | (x) << 10)) +#define Sigma1(x) (((x) >> 6 | (x) << 26) ^ ((x) >> 11 | (x) << 21) ^ ((x) >> 25 | (x) << 7)) +#define sigma0(x) (((x) >> 7 | (x) << 25) ^ ((x) >> 18 | (x) << 14) ^ ((x) >> 3)) #define sigma1(x) (((x) >> 17 | (x) << 15) ^ ((x) >> 19 | (x) << 13) ^ ((x) >> 10)) #define Round(a,b,c,d,e,f,g,h,k,w) do { \ @@ -62,56 +62,56 @@ static void secp256k1_sha256_transform(uint32_t* s, const unsigned char* buf) { Round(c, d, e, f, g, h, a, b, 0x9bdc06a7, w14 = secp256k1_read_be32(&buf[56])); Round(b, c, d, e, f, g, h, a, 0xc19bf174, w15 = secp256k1_read_be32(&buf[60])); - Round(a, b, c, d, e, f, g, h, 0xe49b69c1, w0 += sigma1(w14) + w9 + sigma0(w1)); - Round(h, a, b, c, d, e, f, g, 0xefbe4786, w1 += sigma1(w15) + w10 + sigma0(w2)); - Round(g, h, a, b, c, d, e, f, 0x0fc19dc6, w2 += sigma1(w0) + w11 + sigma0(w3)); - Round(f, g, h, a, b, c, d, e, 0x240ca1cc, w3 += sigma1(w1) + w12 + sigma0(w4)); - Round(e, f, g, h, a, b, c, d, 0x2de92c6f, w4 += sigma1(w2) + w13 + sigma0(w5)); - Round(d, e, f, g, h, a, b, c, 0x4a7484aa, w5 += sigma1(w3) + w14 + sigma0(w6)); - Round(c, d, e, f, g, h, a, b, 0x5cb0a9dc, w6 += sigma1(w4) + w15 + sigma0(w7)); - Round(b, c, d, e, f, g, h, a, 0x76f988da, w7 += sigma1(w5) + w0 + sigma0(w8)); - Round(a, b, c, d, e, f, g, h, 0x983e5152, w8 += sigma1(w6) + w1 + sigma0(w9)); - Round(h, a, b, c, d, e, f, g, 0xa831c66d, w9 += sigma1(w7) + w2 + sigma0(w10)); - Round(g, h, a, b, c, d, e, f, 0xb00327c8, w10 += sigma1(w8) + w3 + sigma0(w11)); - Round(f, g, h, a, b, c, d, e, 0xbf597fc7, w11 += sigma1(w9) + w4 + sigma0(w12)); - Round(e, f, g, h, a, b, c, d, 0xc6e00bf3, w12 += sigma1(w10) + w5 + sigma0(w13)); - Round(d, e, f, g, h, a, b, c, 0xd5a79147, w13 += sigma1(w11) + w6 + sigma0(w14)); - Round(c, d, e, f, g, h, a, b, 0x06ca6351, w14 += sigma1(w12) + w7 + sigma0(w15)); - Round(b, c, d, e, f, g, h, a, 0x14292967, w15 += sigma1(w13) + w8 + sigma0(w0)); - - Round(a, b, c, d, e, f, g, h, 0x27b70a85, w0 += sigma1(w14) + w9 + sigma0(w1)); - Round(h, a, b, c, d, e, f, g, 0x2e1b2138, w1 += sigma1(w15) + w10 + sigma0(w2)); - Round(g, h, a, b, c, d, e, f, 0x4d2c6dfc, w2 += sigma1(w0) + w11 + sigma0(w3)); - Round(f, g, h, a, b, c, d, e, 0x53380d13, w3 += sigma1(w1) + w12 + sigma0(w4)); - Round(e, f, g, h, a, b, c, d, 0x650a7354, w4 += sigma1(w2) + w13 + sigma0(w5)); - Round(d, e, f, g, h, a, b, c, 0x766a0abb, w5 += sigma1(w3) + w14 + sigma0(w6)); - Round(c, d, e, f, g, h, a, b, 0x81c2c92e, w6 += sigma1(w4) + w15 + sigma0(w7)); - Round(b, c, d, e, f, g, h, a, 0x92722c85, w7 += sigma1(w5) + w0 + sigma0(w8)); - Round(a, b, c, d, e, f, g, h, 0xa2bfe8a1, w8 += sigma1(w6) + w1 + sigma0(w9)); - Round(h, a, b, c, d, e, f, g, 0xa81a664b, w9 += sigma1(w7) + w2 + sigma0(w10)); - Round(g, h, a, b, c, d, e, f, 0xc24b8b70, w10 += sigma1(w8) + w3 + sigma0(w11)); - Round(f, g, h, a, b, c, d, e, 0xc76c51a3, w11 += sigma1(w9) + w4 + sigma0(w12)); - Round(e, f, g, h, a, b, c, d, 0xd192e819, w12 += sigma1(w10) + w5 + sigma0(w13)); - Round(d, e, f, g, h, a, b, c, 0xd6990624, w13 += sigma1(w11) + w6 + sigma0(w14)); - Round(c, d, e, f, g, h, a, b, 0xf40e3585, w14 += sigma1(w12) + w7 + sigma0(w15)); 
- Round(b, c, d, e, f, g, h, a, 0x106aa070, w15 += sigma1(w13) + w8 + sigma0(w0)); - - Round(a, b, c, d, e, f, g, h, 0x19a4c116, w0 += sigma1(w14) + w9 + sigma0(w1)); - Round(h, a, b, c, d, e, f, g, 0x1e376c08, w1 += sigma1(w15) + w10 + sigma0(w2)); - Round(g, h, a, b, c, d, e, f, 0x2748774c, w2 += sigma1(w0) + w11 + sigma0(w3)); - Round(f, g, h, a, b, c, d, e, 0x34b0bcb5, w3 += sigma1(w1) + w12 + sigma0(w4)); - Round(e, f, g, h, a, b, c, d, 0x391c0cb3, w4 += sigma1(w2) + w13 + sigma0(w5)); - Round(d, e, f, g, h, a, b, c, 0x4ed8aa4a, w5 += sigma1(w3) + w14 + sigma0(w6)); - Round(c, d, e, f, g, h, a, b, 0x5b9cca4f, w6 += sigma1(w4) + w15 + sigma0(w7)); - Round(b, c, d, e, f, g, h, a, 0x682e6ff3, w7 += sigma1(w5) + w0 + sigma0(w8)); - Round(a, b, c, d, e, f, g, h, 0x748f82ee, w8 += sigma1(w6) + w1 + sigma0(w9)); - Round(h, a, b, c, d, e, f, g, 0x78a5636f, w9 += sigma1(w7) + w2 + sigma0(w10)); - Round(g, h, a, b, c, d, e, f, 0x84c87814, w10 += sigma1(w8) + w3 + sigma0(w11)); - Round(f, g, h, a, b, c, d, e, 0x8cc70208, w11 += sigma1(w9) + w4 + sigma0(w12)); - Round(e, f, g, h, a, b, c, d, 0x90befffa, w12 += sigma1(w10) + w5 + sigma0(w13)); - Round(d, e, f, g, h, a, b, c, 0xa4506ceb, w13 += sigma1(w11) + w6 + sigma0(w14)); - Round(c, d, e, f, g, h, a, b, 0xbef9a3f7, w14 + sigma1(w12) + w7 + sigma0(w15)); - Round(b, c, d, e, f, g, h, a, 0xc67178f2, w15 + sigma1(w13) + w8 + sigma0(w0)); + Round(a, b, c, d, e, f, g, h, 0xe49b69c1, w0 += sigma1(w14) + w9 + sigma0(w1)); + Round(h, a, b, c, d, e, f, g, 0xefbe4786, w1 += sigma1(w15) + w10 + sigma0(w2)); + Round(g, h, a, b, c, d, e, f, 0x0fc19dc6, w2 += sigma1(w0) + w11 + sigma0(w3)); + Round(f, g, h, a, b, c, d, e, 0x240ca1cc, w3 += sigma1(w1) + w12 + sigma0(w4)); + Round(e, f, g, h, a, b, c, d, 0x2de92c6f, w4 += sigma1(w2) + w13 + sigma0(w5)); + Round(d, e, f, g, h, a, b, c, 0x4a7484aa, w5 += sigma1(w3) + w14 + sigma0(w6)); + Round(c, d, e, f, g, h, a, b, 0x5cb0a9dc, w6 += sigma1(w4) + w15 + sigma0(w7)); + Round(b, c, d, e, f, g, h, a, 0x76f988da, w7 += sigma1(w5) + w0 + sigma0(w8)); + Round(a, b, c, d, e, f, g, h, 0x983e5152, w8 += sigma1(w6) + w1 + sigma0(w9)); + Round(h, a, b, c, d, e, f, g, 0xa831c66d, w9 += sigma1(w7) + w2 + sigma0(w10)); + Round(g, h, a, b, c, d, e, f, 0xb00327c8, w10 += sigma1(w8) + w3 + sigma0(w11)); + Round(f, g, h, a, b, c, d, e, 0xbf597fc7, w11 += sigma1(w9) + w4 + sigma0(w12)); + Round(e, f, g, h, a, b, c, d, 0xc6e00bf3, w12 += sigma1(w10) + w5 + sigma0(w13)); + Round(d, e, f, g, h, a, b, c, 0xd5a79147, w13 += sigma1(w11) + w6 + sigma0(w14)); + Round(c, d, e, f, g, h, a, b, 0x06ca6351, w14 += sigma1(w12) + w7 + sigma0(w15)); + Round(b, c, d, e, f, g, h, a, 0x14292967, w15 += sigma1(w13) + w8 + sigma0(w0)); + + Round(a, b, c, d, e, f, g, h, 0x27b70a85, w0 += sigma1(w14) + w9 + sigma0(w1)); + Round(h, a, b, c, d, e, f, g, 0x2e1b2138, w1 += sigma1(w15) + w10 + sigma0(w2)); + Round(g, h, a, b, c, d, e, f, 0x4d2c6dfc, w2 += sigma1(w0) + w11 + sigma0(w3)); + Round(f, g, h, a, b, c, d, e, 0x53380d13, w3 += sigma1(w1) + w12 + sigma0(w4)); + Round(e, f, g, h, a, b, c, d, 0x650a7354, w4 += sigma1(w2) + w13 + sigma0(w5)); + Round(d, e, f, g, h, a, b, c, 0x766a0abb, w5 += sigma1(w3) + w14 + sigma0(w6)); + Round(c, d, e, f, g, h, a, b, 0x81c2c92e, w6 += sigma1(w4) + w15 + sigma0(w7)); + Round(b, c, d, e, f, g, h, a, 0x92722c85, w7 += sigma1(w5) + w0 + sigma0(w8)); + Round(a, b, c, d, e, f, g, h, 0xa2bfe8a1, w8 += sigma1(w6) + w1 + sigma0(w9)); + Round(h, a, b, c, d, e, f, g, 0xa81a664b, w9 += sigma1(w7) + w2 + sigma0(w10)); + Round(g, 
h, a, b, c, d, e, f, 0xc24b8b70, w10 += sigma1(w8) + w3 + sigma0(w11)); + Round(f, g, h, a, b, c, d, e, 0xc76c51a3, w11 += sigma1(w9) + w4 + sigma0(w12)); + Round(e, f, g, h, a, b, c, d, 0xd192e819, w12 += sigma1(w10) + w5 + sigma0(w13)); + Round(d, e, f, g, h, a, b, c, 0xd6990624, w13 += sigma1(w11) + w6 + sigma0(w14)); + Round(c, d, e, f, g, h, a, b, 0xf40e3585, w14 += sigma1(w12) + w7 + sigma0(w15)); + Round(b, c, d, e, f, g, h, a, 0x106aa070, w15 += sigma1(w13) + w8 + sigma0(w0)); + + Round(a, b, c, d, e, f, g, h, 0x19a4c116, w0 += sigma1(w14) + w9 + sigma0(w1)); + Round(h, a, b, c, d, e, f, g, 0x1e376c08, w1 += sigma1(w15) + w10 + sigma0(w2)); + Round(g, h, a, b, c, d, e, f, 0x2748774c, w2 += sigma1(w0) + w11 + sigma0(w3)); + Round(f, g, h, a, b, c, d, e, 0x34b0bcb5, w3 += sigma1(w1) + w12 + sigma0(w4)); + Round(e, f, g, h, a, b, c, d, 0x391c0cb3, w4 += sigma1(w2) + w13 + sigma0(w5)); + Round(d, e, f, g, h, a, b, c, 0x4ed8aa4a, w5 += sigma1(w3) + w14 + sigma0(w6)); + Round(c, d, e, f, g, h, a, b, 0x5b9cca4f, w6 += sigma1(w4) + w15 + sigma0(w7)); + Round(b, c, d, e, f, g, h, a, 0x682e6ff3, w7 += sigma1(w5) + w0 + sigma0(w8)); + Round(a, b, c, d, e, f, g, h, 0x748f82ee, w8 += sigma1(w6) + w1 + sigma0(w9)); + Round(h, a, b, c, d, e, f, g, 0x78a5636f, w9 += sigma1(w7) + w2 + sigma0(w10)); + Round(g, h, a, b, c, d, e, f, 0x84c87814, w10 += sigma1(w8) + w3 + sigma0(w11)); + Round(f, g, h, a, b, c, d, e, 0x8cc70208, w11 += sigma1(w9) + w4 + sigma0(w12)); + Round(e, f, g, h, a, b, c, d, 0x90befffa, w12 += sigma1(w10) + w5 + sigma0(w13)); + Round(d, e, f, g, h, a, b, c, 0xa4506ceb, w13 += sigma1(w11) + w6 + sigma0(w14)); + Round(c, d, e, f, g, h, a, b, 0xbef9a3f7, w14 + sigma1(w12) + w7 + sigma0(w15)); + Round(b, c, d, e, f, g, h, a, 0xc67178f2, w15 + sigma1(w13) + w8 + sigma0(w0)); s[0] += a; s[1] += b; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 807b9b70ab..487c53594e 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -12,6 +12,10 @@ #include "modinv64_impl.h" #include "util.h" +#ifdef X86 +# include <immintrin.h> +#endif + /* Limbs of the secp256k1 order. */ #define SECP256K1_N_0 ((uint64_t)0xBFD25E8CD0364141ULL) #define SECP256K1_N_1 ((uint64_t)0xBAAEDCE6AF48A03BULL) @@ -62,10 +66,10 @@ SECP256K1_INLINE static uint32_t secp256k1_scalar_get_bits_var(const secp256k1_s SECP256K1_INLINE static int secp256k1_scalar_check_overflow(const secp256k1_scalar *a) { int yes = 0; int no = 0; - no |= (a->d[3] < SECP256K1_N_3); /* No need for a > check. */ - no |= (a->d[2] < SECP256K1_N_2); + no |= (a->d[3] < SECP256K1_N_3); /* No need for a > check.
*/ + no |= (a->d[2] < SECP256K1_N_2); yes |= (a->d[2] > SECP256K1_N_2) & ~no; - no |= (a->d[1] < SECP256K1_N_1); + no |= (a->d[1] < SECP256K1_N_1); yes |= (a->d[1] > SECP256K1_N_1) & ~no; yes |= (a->d[0] >= SECP256K1_N_0) & ~no; return yes; @@ -143,10 +147,25 @@ static void secp256k1_scalar_cadd_bit(secp256k1_scalar *r, unsigned int bit, int static void secp256k1_scalar_set_b32(secp256k1_scalar *r, const unsigned char *b32, int *overflow) { int over; + +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i vec_b32 = _mm256_loadu_si256((__m256i*)b32); + vec_b32 = _mm256_permute4x64_epi64(vec_b32, _MM_SHUFFLE(0,1,2,3)); + const __m256i bswap_mask = _mm256_setr_epi8( /* TODO: precompute */ + 7,6,5,4,3,2,1,0, + 15,14,13,12,11,10,9,8, + 23,22,21,20,19,18,17,16, + 31,30,29,28,27,26,25,24); + __m256i output = _mm256_shuffle_epi8(vec_b32, bswap_mask); + _mm256_storeu_si256((__m256i*)r->d, output); + } +#else r->d[0] = secp256k1_read_be64(&b32[24]); r->d[1] = secp256k1_read_be64(&b32[16]); r->d[2] = secp256k1_read_be64(&b32[8]); r->d[3] = secp256k1_read_be64(&b32[0]); +#endif over = secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r)); if (overflow) { *overflow = over; @@ -157,16 +176,28 @@ static void secp256k1_scalar_set_b32(secp256k1_scalar *r, const unsigned char *b static void secp256k1_scalar_get_b32(unsigned char *bin, const secp256k1_scalar* a) { SECP256K1_SCALAR_VERIFY(a); - +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i vec_a = _mm256_loadu_si256((__m256i*)a->d); + vec_a = _mm256_permute4x64_epi64(vec_a, _MM_SHUFFLE(0,1,2,3)); + const __m256i bswap_mask = _mm256_setr_epi8( /* TODO: precompute */ + 7,6,5,4,3,2,1,0, + 15,14,13,12,11,10,9,8, + 23,22,21,20,19,18,17,16, + 31,30,29,28,27,26,25,24); + __m256i output = _mm256_shuffle_epi8(vec_a, bswap_mask); + _mm256_storeu_si256((__m256i*)bin, output); + } +#else secp256k1_write_be64(&bin[0], a->d[3]); secp256k1_write_be64(&bin[8], a->d[2]); secp256k1_write_be64(&bin[16], a->d[1]); secp256k1_write_be64(&bin[24], a->d[0]); +#endif } SECP256K1_INLINE static int secp256k1_scalar_is_zero(const secp256k1_scalar *a) { SECP256K1_SCALAR_VERIFY(a); - return (a->d[0] | a->d[1] | a->d[2] | a->d[3]) == 0; } @@ -243,10 +274,10 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar *a) { int no = 0; SECP256K1_SCALAR_VERIFY(a); - no |= (a->d[3] < SECP256K1_N_H_3); + no |= (a->d[3] < SECP256K1_N_H_3); yes |= (a->d[3] > SECP256K1_N_H_3) & ~no; - no |= (a->d[2] < SECP256K1_N_H_2) & ~yes; /* No need for a > check. */ - no |= (a->d[1] < SECP256K1_N_H_1) & ~yes; + no |= (a->d[2] < SECP256K1_N_H_2) & ~yes; /* No need for a > check. 
*/ + no |= (a->d[1] < SECP256K1_N_H_1) & ~yes; yes |= (a->d[1] > SECP256K1_N_H_1) & ~no; yes |= (a->d[0] > SECP256K1_N_H_0) & ~no; return yes; @@ -882,8 +913,16 @@ static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r SECP256K1_INLINE static int secp256k1_scalar_eq(const secp256k1_scalar *a, const secp256k1_scalar *b) { SECP256K1_SCALAR_VERIFY(a); SECP256K1_SCALAR_VERIFY(b); - +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i vec_a = _mm256_loadu_si256((__m256i *)a->d); + __m256i vec_b = _mm256_loadu_si256((__m256i *)b->d); + __m256i vec_xor = _mm256_xor_si256(vec_a, vec_b); + return _mm256_testz_si256(vec_xor, vec_xor); + } +#else return ((a->d[0] ^ b->d[0]) | (a->d[1] ^ b->d[1]) | (a->d[2] ^ b->d[2]) | (a->d[3] ^ b->d[3])) == 0; +#endif } SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b, unsigned int shift) { @@ -899,6 +938,9 @@ SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar *r, shiftlimbs = shift >> 6; shiftlow = shift & 0x3F; shifthigh = 64 - shiftlow; + + /* TODO: parallelize */ + r->d[0] = shift < 512 ? (l[0 + shiftlimbs] >> shiftlow | (shift < 448 && shiftlow ? (l[1 + shiftlimbs] << shifthigh) : 0)) : 0; r->d[1] = shift < 448 ? (l[1 + shiftlimbs] >> shiftlow | (shift < 384 && shiftlow ? (l[2 + shiftlimbs] << shifthigh) : 0)) : 0; r->d[2] = shift < 384 ? (l[2 + shiftlimbs] >> shiftlow | (shift < 320 && shiftlow ? (l[3 + shiftlimbs] << shifthigh) : 0)) : 0; @@ -909,6 +951,12 @@ SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar *r, } static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const secp256k1_scalar *a, int flag) { +#if defined(__AVX__) && defined(__AVX2__) + /* load here to mitigate load latency */ + __m256i vec_r = _mm256_loadu_si256((__m256i *)(r->d)); + __m256i vec_a = _mm256_loadu_si256((__m256i *)(a->d)); +#endif + uint64_t mask0, mask1; volatile int vflag = flag; SECP256K1_SCALAR_VERIFY(a); @@ -916,30 +964,55 @@ static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const se mask0 = vflag + ~((uint64_t)0); mask1 = ~mask0; + +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i vec_mask0 = _mm256_set1_epi64x(mask0); + __m256i vec_mask1 = _mm256_set1_epi64x(mask1); + vec_r = _mm256_and_si256(vec_r, vec_mask0); + vec_a = _mm256_and_si256(vec_a, vec_mask1); + _mm256_storeu_si256((__m256i *)(r->d), _mm256_or_si256(vec_r, vec_a)); + } +#else r->d[0] = (r->d[0] & mask0) | (a->d[0] & mask1); r->d[1] = (r->d[1] & mask0) | (a->d[1] & mask1); r->d[2] = (r->d[2] & mask0) | (a->d[2] & mask1); r->d[3] = (r->d[3] & mask0) | (a->d[3] & mask1); +#endif SECP256K1_SCALAR_VERIFY(r); } static void secp256k1_scalar_from_signed62(secp256k1_scalar *r, const secp256k1_modinv64_signed62 *a) { - const uint64_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4]; - /* The output from secp256k1_modinv64{_var} should be normalized to range [0,modulus), and * have limbs in [0,2^62). The modulus is < 2^256, so the top limb must be below 2^(256-62*4). 
*/ - VERIFY_CHECK(a0 >> 62 == 0); - VERIFY_CHECK(a1 >> 62 == 0); - VERIFY_CHECK(a2 >> 62 == 0); - VERIFY_CHECK(a3 >> 62 == 0); - VERIFY_CHECK(a4 >> 8 == 0); + VERIFY_CHECK(a->v[0] >> 62 == 0); + VERIFY_CHECK(a->v[1] >> 62 == 0); + VERIFY_CHECK(a->v[2] >> 62 == 0); + VERIFY_CHECK(a->v[3] >> 62 == 0); + VERIFY_CHECK(a->v[4] >> 8 == 0); + +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i limbs_0123 = _mm256_loadu_si256((__m256i *)a->v); + __m256i limbs_1234 = _mm256_loadu_si256((__m256i *)(a->v + 1)); + const __m256i shift_lhs = _mm256_setr_epi64x(0, 2, 4, 6); /* TODO: precompute */ + const __m256i shift_rhs = _mm256_setr_epi64x(62, 60, 58, 56); /* TODO: precompute */ + __m256i lhs = _mm256_srlv_epi64(limbs_0123, shift_lhs); + __m256i rhs = _mm256_sllv_epi64(limbs_1234, shift_rhs); + _mm256_storeu_si256((__m256i *)(r->d), _mm256_or_si256(lhs, rhs)); + } +#else + { + const uint64_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4]; - r->d[0] = a0 | a1 << 62; - r->d[1] = a1 >> 2 | a2 << 60; - r->d[2] = a2 >> 4 | a3 << 58; - r->d[3] = a3 >> 6 | a4 << 56; + r->d[0] = a0 | a1 << 62; + r->d[1] = a1 >> 2 | a2 << 60; + r->d[2] = a2 >> 4 | a3 << 58; + r->d[3] = a3 >> 6 | a4 << 56; + } +#endif SECP256K1_SCALAR_VERIFY(r); } @@ -949,10 +1022,24 @@ static void secp256k1_scalar_to_signed62(secp256k1_modinv64_signed62 *r, const s const uint64_t a0 = a->d[0], a1 = a->d[1], a2 = a->d[2], a3 = a->d[3]; SECP256K1_SCALAR_VERIFY(a); +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i limbs_0012 = _mm256_setr_epi64x(a0, a0, a1, a2); + __m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3); + const __m256i shift_lhs = _mm256_setr_epi64x(0, 62, 60, 58); /* TODO: precompute */ + const __m256i shift_rhs = _mm256_setr_epi64x(64, 2, 4, 6); /* TODO: precompute */ + const __m256i mask62 = _mm256_set1_epi64x(M62); /* TODO: precompute */ + __m256i lhs = _mm256_srlv_epi64(limbs_0012, shift_lhs); + __m256i rhs = _mm256_sllv_epi64(limbs_0123, shift_rhs); + __m256i out = _mm256_or_si256(lhs, rhs); + _mm256_storeu_si256((__m256i *)r->v, _mm256_and_si256(out, mask62)); + } +#else r->v[0] = a0 & M62; r->v[1] = (a0 >> 62 | a1 << 2) & M62; r->v[2] = (a1 >> 60 | a2 << 4) & M62; r->v[3] = (a2 >> 58 | a3 << 6) & M62; +#endif r->v[4] = a3 >> 56; } diff --git a/src/util.h b/src/util.h index 5f29f4076c..702a67b317 100644 --- a/src/util.h +++ b/src/util.h @@ -20,6 +20,37 @@ #include #endif +/* Endianness detection macros */ +#if defined(_MSC_VER) +# define SECP256K1_LITTLE_ENDIAN +#elif defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +# define SECP256K1_LITTLE_ENDIAN +#elif defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# define SECP256K1_BIG_ENDIAN +#else +# error "Cannot detect endianness" /* GCC <4.6 or embedded compilers */ +#endif + +/* Byteswap intrinsics */ +#if defined(_MSC_VER) +# define BYTESWAP_16(x) _byteswap_ushort(x) +# define BYTESWAP_32(x) _byteswap_ulong(x) +# define BYTESWAP_64(x) _byteswap_uint64(x) +#elif defined(__has_builtin) && __has_builtin(__builtin_bswap16) && __has_builtin(__builtin_bswap32) && __has_builtin(__builtin_bswap64) +# define BYTESWAP_16(x) __builtin_bswap16(x) +# define BYTESWAP_32(x) __builtin_bswap32(x) +# define BYTESWAP_64(x) __builtin_bswap64(x) +#else +# define BYTESWAP_16(x) (((x) >> 8) | ((x) << 8)) +# define BYTESWAP_32(x) (((x) >> 24) | (((x) >> 8) & 0xFF00) | (((x) << 8) & 0xFF0000) | ((x) << 24)) +# define BYTESWAP_64(x) (((x) >> 56) | (((x) >> 40) & 0xFF00) | (((x) >> 24) & 0xFF0000) | (((x) >> 8) & 0xFF000000ULL) | (((x)
<< 8) & 0xFF00000000ULL) | (((x) << 24) & 0xFF0000000000ULL) | (((x) << 40) & 0xFF000000000000ULL) | ((x) << 56) ) +#endif + +/* X86 detection macro */ +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) +# define X86 +#endif + #define STR_(x) #x #define STR(x) STR_(x) #define DEBUG_CONFIG_MSG(x) "DEBUG_CONFIG: " x @@ -398,42 +429,38 @@ static SECP256K1_INLINE int secp256k1_ctz64_var(uint64_t x) { /* Read a uint32_t in big endian */ SECP256K1_INLINE static uint32_t secp256k1_read_be32(const unsigned char* p) { - return (uint32_t)p[0] << 24 | - (uint32_t)p[1] << 16 | - (uint32_t)p[2] << 8 | - (uint32_t)p[3]; + uint32_t x; + memcpy(&x, p, sizeof(x)); +#ifdef SECP256K1_LITTLE_ENDIAN + x = BYTESWAP_32(x); +#endif + return x; } /* Write a uint32_t in big endian */ SECP256K1_INLINE static void secp256k1_write_be32(unsigned char* p, uint32_t x) { - p[3] = x; - p[2] = x >> 8; - p[1] = x >> 16; - p[0] = x >> 24; +#ifdef SECP256K1_LITTLE_ENDIAN + x = BYTESWAP_32(x); +#endif + memcpy(p, &x, sizeof(x)); } /* Read a uint64_t in big endian */ SECP256K1_INLINE static uint64_t secp256k1_read_be64(const unsigned char* p) { - return (uint64_t)p[0] << 56 | - (uint64_t)p[1] << 48 | - (uint64_t)p[2] << 40 | - (uint64_t)p[3] << 32 | - (uint64_t)p[4] << 24 | - (uint64_t)p[5] << 16 | - (uint64_t)p[6] << 8 | - (uint64_t)p[7]; + uint64_t x; + memcpy(&x, p, sizeof(x)); +#ifdef SECP256K1_LITTLE_ENDIAN + x = BYTESWAP_64(x); +#endif + return x; } /* Write a uint64_t in big endian */ SECP256K1_INLINE static void secp256k1_write_be64(unsigned char* p, uint64_t x) { - p[7] = x; - p[6] = x >> 8; - p[5] = x >> 16; - p[4] = x >> 24; - p[3] = x >> 32; - p[2] = x >> 40; - p[1] = x >> 48; - p[0] = x >> 56; +#ifdef SECP256K1_LITTLE_ENDIAN + x = BYTESWAP_64(x); +#endif + memcpy(p, &x, sizeof(x)); } /* Rotate a uint32_t to the right. */
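
A note on the recurring pattern in the AVX2 paths above: nearly every vectorized conversion (to_storage, from_storage, from_signed62, to_signed62) repacks limbs with one variable right shift (_mm256_srlv_epi64) of limbs i..i+3, one variable left shift (_mm256_sllv_epi64) of the neighbouring limbs, and an OR, relying on the AVX2 guarantee that a per-lane shift count of 64 or more yields zero. The following standalone sketch (not part of the patch; the file and function names are mine) demonstrates the technique on the 5x52 -> 4x64 storage packing and cross-checks it against the scalar code, mirroring what secp256k1_fe_impl_to_storage does above. Compile with e.g. gcc -O2 -mavx2 repack_demo.c.

/* repack_demo.c - hypothetical standalone demo of the sllv/srlv limb repack. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <immintrin.h>

/* Scalar reference: pack five 52-bit limbs into four 64-bit words. */
static void to_storage_scalar(uint64_t r[4], const uint64_t n[5]) {
    r[0] = n[0]       | n[1] << 52;
    r[1] = n[1] >> 12 | n[2] << 40;
    r[2] = n[2] >> 24 | n[3] << 28;
    r[3] = n[3] >> 36 | n[4] << 16;
}

/* AVX2: lane i computes (n[i] >> shift_lhs[i]) | (n[i+1] << shift_rhs[i]),
 * the same shape the patch uses for all limb-width conversions. */
static void to_storage_avx2(uint64_t r[4], const uint64_t n[5]) {
    __m256i limbs_0123 = _mm256_loadu_si256((const __m256i *)n);
    __m256i limbs_1234 = _mm256_loadu_si256((const __m256i *)(n + 1));
    const __m256i shift_lhs = _mm256_setr_epi64x(0, 12, 24, 36);
    const __m256i shift_rhs = _mm256_setr_epi64x(52, 40, 28, 16);
    __m256i lhs = _mm256_srlv_epi64(limbs_0123, shift_lhs);
    __m256i rhs = _mm256_sllv_epi64(limbs_1234, shift_rhs);
    _mm256_storeu_si256((__m256i *)r, _mm256_or_si256(lhs, rhs));
}

int main(void) {
    /* Arbitrary limbs below 2^52 (top limb below 2^48), as after normalization. */
    const uint64_t n[5] = { 0x123456789ABCDULL, 0xFEDCBA9876543ULL,
                            0x0F0F0F0F0F0F0ULL, 0xAAAAAAAAAAAAAULL,
                            0x23456789ABCULL };
    uint64_t ref[4], vec[4];
    to_storage_scalar(ref, n);
    to_storage_avx2(vec, n);
    printf("scalar and AVX2 packing %s\n",
           memcmp(ref, vec, sizeof ref) == 0 ? "agree" : "DISAGREE");
    return 0;
}

The same scalar-versus-vector cross-check is what the simd-test.sh/simd-bench.sh pair performs at whole-library granularity: the BASELINE build keeps the scalar fallbacks (__AVX__/__AVX2__ undefined via -U) while CUSTOM_SIMD enables the intrinsic paths, so ctest and the benchmarks exercise both sides of every #if branch added in this diff.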