diff --git a/benchmark_analysis.ipynb b/benchmark_analysis.ipynb new file mode 100644 index 0000000000..53df29728f --- /dev/null +++ b/benchmark_analysis.ipynb @@ -0,0 +1,116 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b49ae6d6", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "plt.rcParams['figure.figsize'] = (16, 10)\n", + "plt.rcParams['font.size'] = 11" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d236980d", + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: 'BASELINE_bench.csv'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mFileNotFoundError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 17\u001b[39m\n\u001b[32m 14\u001b[39m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[32m 15\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m pd.DataFrame(data)\n\u001b[32m---> \u001b[39m\u001b[32m17\u001b[39m baseline = \u001b[43mparse_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mBASELINE_bench.csv\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 18\u001b[39m custom = parse_csv(\u001b[33m'\u001b[39m\u001b[33mCUSTOM_SIMD_bench.csv\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m 20\u001b[39m merged = baseline.merge(custom, on=\u001b[33m'\u001b[39m\u001b[33mBenchmark\u001b[39m\u001b[33m'\u001b[39m, suffixes=(\u001b[33m'\u001b[39m\u001b[33m_baseline\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33m_custom\u001b[39m\u001b[33m'\u001b[39m))\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 2\u001b[39m, in \u001b[36mparse_csv\u001b[39m\u001b[34m(filepath)\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mparse_csv\u001b[39m(filepath):\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfilepath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mr\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[32m 3\u001b[39m lines = f.readlines()[\u001b[32m1\u001b[39m:]\n\u001b[32m 5\u001b[39m data = []\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/secp256k1/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py:343\u001b[39m, in \u001b[36m_modified_open\u001b[39m\u001b[34m(file, *args, **kwargs)\u001b[39m\n\u001b[32m 336\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[32m0\u001b[39m, \u001b[32m1\u001b[39m, \u001b[32m2\u001b[39m}:\n\u001b[32m 337\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 338\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mIPython won\u001b[39m\u001b[33m'\u001b[39m\u001b[33mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m by default \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 339\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mas it is likely to crash IPython. 
If you know what you are doing, \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 340\u001b[39m \u001b[33m\"\u001b[39m\u001b[33myou can use builtins\u001b[39m\u001b[33m'\u001b[39m\u001b[33m open.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 341\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m343\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[31mFileNotFoundError\u001b[39m: [Errno 2] No such file or directory: 'BASELINE_bench.csv'" + ] + } + ], + "source": [ + "def parse_csv(filepath):\n", + " with open(filepath, 'r') as f:\n", + " lines = f.readlines()[1:]\n", + " \n", + " data = []\n", + " for line in lines:\n", + " line = line.strip()\n", + " if line and ',' in line and not line.endswith(','):\n", + " parts = line.split(',')\n", + " if len(parts) >= 3:\n", + " try:\n", + " data.append({'Benchmark': parts[0].strip(), 'Time': float(parts[2])})\n", + " except:\n", + " continue\n", + " return pd.DataFrame(data)\n", + "\n", + "baseline = parse_csv('BASELINE_bench.csv')\n", + "custom = parse_csv('CUSTOM_SIMD_bench.csv')\n", + "\n", + "merged = baseline.merge(custom, on='Benchmark', suffixes=('_baseline', '_custom'))\n", + "merged['improvement'] = ((merged['Time_baseline'] - merged['Time_custom']) / merged['Time_baseline']) * 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8442b12d", + "metadata": {}, + "outputs": [], + "source": [ + "sorted_data = merged.sort_values('improvement', ascending=False)\n", + "top10 = sorted_data.head(10)\n", + "bottom10 = sorted_data.tail(10)\n", + "filtered = pd.concat([top10, bottom10])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa07550a", + "metadata": {}, + "outputs": [], + "source": [ + "heatmap_data = filtered.set_index('Benchmark')[['improvement']]\n", + "\n", + "plt.figure(figsize=(8, 12))\n", + "sns.heatmap(heatmap_data, annot=True, fmt='.1f', cmap='RdYlGn', center=0, \n", + " cbar_kws={'label': 'Performance Improvement (%)'})\n", + "plt.title('CUSTOM_SIMD vs BASELINE Performance (Top/Bottom 10)', fontsize=14, fontweight='bold')\n", + "plt.ylabel('')\n", + "plt.tight_layout()\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/out b/out new file mode 100644 index 0000000000..e69de29bb2 diff --git a/simd-bench.sh b/simd-bench.sh new file mode 100755 index 0000000000..0820858d7c --- /dev/null +++ b/simd-bench.sh @@ -0,0 +1,57 @@ +#!/bin/bash +set -e + +options=("OFF" "ON") +BENCH_ITERS=${SECP256K1_BENCH_ITERS:-20000} + +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +NC='\033[0m' + +echo 1 | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo > /dev/null +sudo cpupower -c 0 frequency-set -g performance > /dev/null +command -v taskset > /dev/null && TASKSET_CMD="taskset -c 0" + +run_bench() { + local dir=$1 bin=$2 log=$3 + ( + cd "$dir" + $TASKSET_CMD env SECP256K1_BENCH_ITERS=$BENCH_ITERS nice -n 
0 ./bin/$bin >> "../../$log" 2>&1 + echo "" >> "../../$log" + ) +} + +bench_all() { + local config="$1" + local dir="build/$config" + local log="${config}_bench.csv" + + if [[ ! -d "$dir" ]]; then + echo -e "${RED}✖ $config${NC} (no dir)" + return 1 + fi + + { + echo "Benchmark results for $config" + echo "Generated on $(date)" + echo "Iterations: $BENCH_ITERS" + echo "" + } > "$log" + + for bin in bench bench_ecmult bench_internal; do + if run_bench "$dir" "$bin" "$log"; then + echo -e " ${GREEN}✔ $bin${NC}" + else + echo -e " ${RED}✖ $bin${NC}" + return 1 + fi + done + + echo -e "${GREEN}✔ $config${NC} (log: $log)" +} + +bench_all "BASELINE" +bench_all "CUSTOM_SIMD" + +echo -e "\n${YELLOW}All benchmarks successful. Logs in project root${NC}" \ No newline at end of file diff --git a/simd-build.sh b/simd-build.sh new file mode 100755 index 0000000000..eb09b2c37c --- /dev/null +++ b/simd-build.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +mkdir -p build + +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +NC='\033[0m' + +run_build() { + local config="$1" + local flags="-O3 -mavx -mavx2 $2" + local dir="build/$config" + local log="${config}_build.log" + + mkdir -p "$dir" + + if (cd "$dir" && cmake ../.. -G Ninja -DCMAKE_BUILD_TYPE=Release -DSECP256K1_APPEND_CFLAGS="$flags" >"../../$log" 2>&1 && ninja >>"../../$log" 2>&1); then + echo -e "${GREEN}✔ $config${NC}" + else + echo -e "${RED}✖ $config failed${NC}" + return 1 + fi +} + +run_build "BASELINE" "-U__AVX__ -U__AVX2__" +run_build "CUSTOM_SIMD" "-D__AVX__ -D__AVX2__" + +echo -e "\n${YELLOW}All builds done. Logs in project root${NC}" \ No newline at end of file diff --git a/simd-test.sh b/simd-test.sh new file mode 100755 index 0000000000..af92035557 --- /dev/null +++ b/simd-test.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +NC='\033[0m' + +run_test() { + local config="$1" + local dir="build/$config" + local log="${config}_test.log" + + if [[ ! -d "$dir" ]]; then + echo -e "${RED}✖ $config${NC} (no dir)" + return 1 + fi + + if (cd "$dir" && ctest --output-on-failure -j"$(nproc)" &> "../../$log"); then + echo -e "${GREEN}✔ $config${NC} (log: $log)" + else + echo -e "${RED}✖ $config${NC} (log: $log)" + return 1 + fi +} + +run_test "BASELINE" +run_test "CUSTOM_SIMD" + +echo -e "\n${YELLOW}All tests passed. 
Logs in project root${NC}" \ No newline at end of file diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index ea14c27318..9e53e30436 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -38,16 +38,20 @@ static void secp256k1_fe_impl_verify(const secp256k1_fe *a) { #endif static void secp256k1_fe_impl_get_bounds(secp256k1_fe *r, int m) { - r->n[0] = 0x3FFFFFFUL * 2 * m; - r->n[1] = 0x3FFFFFFUL * 2 * m; - r->n[2] = 0x3FFFFFFUL * 2 * m; - r->n[3] = 0x3FFFFFFUL * 2 * m; - r->n[4] = 0x3FFFFFFUL * 2 * m; - r->n[5] = 0x3FFFFFFUL * 2 * m; - r->n[6] = 0x3FFFFFFUL * 2 * m; - r->n[7] = 0x3FFFFFFUL * 2 * m; - r->n[8] = 0x3FFFFFFUL * 2 * m; - r->n[9] = 0x03FFFFFUL * 2 * m; + const uint64_t two_m = 2 * m; + const uint64_t bound1 = 0x3FFFFFFUL * two_m; + const uint64_t bound2 = 0x03FFFFFUL * two_m; + + r->n[0] = bound1; + r->n[1] = bound1; + r->n[2] = bound1; + r->n[3] = bound1; + r->n[4] = bound1; + r->n[5] = bound1; + r->n[6] = bound1; + r->n[7] = bound1; + r->n[8] = bound1; + r->n[9] = bound2; } static void secp256k1_fe_impl_normalize(secp256k1_fe *r) { @@ -257,8 +261,8 @@ static int secp256k1_fe_impl_normalizes_to_zero_var(const secp256k1_fe *r) { } SECP256K1_INLINE static void secp256k1_fe_impl_set_int(secp256k1_fe *r, int a) { + memset(r->n, 0, sizeof(r->n)); r->n[0] = a; - r->n[1] = r->n[2] = r->n[3] = r->n[4] = r->n[5] = r->n[6] = r->n[7] = r->n[8] = r->n[9] = 0; } SECP256K1_INLINE static int secp256k1_fe_impl_is_zero(const secp256k1_fe *a) { @@ -272,12 +276,11 @@ SECP256K1_INLINE static int secp256k1_fe_impl_is_odd(const secp256k1_fe *a) { static int secp256k1_fe_impl_cmp_var(const secp256k1_fe *a, const secp256k1_fe *b) { int i; + int diff; for (i = 9; i >= 0; i--) { - if (a->n[i] > b->n[i]) { - return 1; - } - if (a->n[i] < b->n[i]) { - return -1; + diff = (a->n[i] > b->n[i]) - (a->n[i] < b->n[i]); + if (diff != 0) { + return diff; } } return 0; @@ -338,24 +341,30 @@ static void secp256k1_fe_impl_get_b32(unsigned char *r, const secp256k1_fe *a) { } SECP256K1_INLINE static void secp256k1_fe_impl_negate_unchecked(secp256k1_fe *r, const secp256k1_fe *a, int m) { + const uint32_t two_m1 = 2 * (m + 1); + const uint32_t bound1 = 0x3FFFC2FUL * two_m1; + const uint32_t bound2 = 0x3FFFFBFUL * two_m1; + const uint32_t bound3 = 0x3FFFFFFUL * two_m1; + const uint32_t bound4 = 0x03FFFFFUL * two_m1; + /* For all legal values of m (0..31), the following properties hold: */ - VERIFY_CHECK(0x3FFFC2FUL * 2 * (m + 1) >= 0x3FFFFFFUL * 2 * m); - VERIFY_CHECK(0x3FFFFBFUL * 2 * (m + 1) >= 0x3FFFFFFUL * 2 * m); - VERIFY_CHECK(0x3FFFFFFUL * 2 * (m + 1) >= 0x3FFFFFFUL * 2 * m); - VERIFY_CHECK(0x03FFFFFUL * 2 * (m + 1) >= 0x03FFFFFUL * 2 * m); + VERIFY_CHECK(bound1 >= 0x3FFFFFFUL * 2 * m); + VERIFY_CHECK(bound2 >= 0x3FFFFFFUL * 2 * m); + VERIFY_CHECK(bound3 >= 0x3FFFFFFUL * 2 * m); + VERIFY_CHECK(bound4 >= 0x03FFFFFUL * 2 * m); /* Due to the properties above, the left hand in the subtractions below is never less than * the right hand. 
*/ - r->n[0] = 0x3FFFC2FUL * 2 * (m + 1) - a->n[0]; - r->n[1] = 0x3FFFFBFUL * 2 * (m + 1) - a->n[1]; - r->n[2] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[2]; - r->n[3] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[3]; - r->n[4] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[4]; - r->n[5] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[5]; - r->n[6] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[6]; - r->n[7] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[7]; - r->n[8] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[8]; - r->n[9] = 0x03FFFFFUL * 2 * (m + 1) - a->n[9]; + r->n[0] = bound1 - a->n[0]; + r->n[1] = bound2 - a->n[1]; + r->n[2] = bound3 - a->n[2]; + r->n[3] = bound3 - a->n[3]; + r->n[4] = bound3 - a->n[4]; + r->n[5] = bound3 - a->n[5]; + r->n[6] = bound3 - a->n[6]; + r->n[7] = bound3 - a->n[7]; + r->n[8] = bound3 - a->n[8]; + r->n[9] = bound4 - a->n[9]; } SECP256K1_INLINE static void secp256k1_fe_impl_mul_int_unchecked(secp256k1_fe *r, int a) { @@ -1111,24 +1120,24 @@ static SECP256K1_INLINE void secp256k1_fe_storage_cmov(secp256k1_fe_storage *r, } static void secp256k1_fe_impl_to_storage(secp256k1_fe_storage *r, const secp256k1_fe *a) { - r->n[0] = a->n[0] | a->n[1] << 26; - r->n[1] = a->n[1] >> 6 | a->n[2] << 20; + r->n[0] = a->n[0] | a->n[1] << 26; + r->n[1] = a->n[1] >> 6 | a->n[2] << 20; r->n[2] = a->n[2] >> 12 | a->n[3] << 14; r->n[3] = a->n[3] >> 18 | a->n[4] << 8; r->n[4] = a->n[4] >> 24 | a->n[5] << 2 | a->n[6] << 28; - r->n[5] = a->n[6] >> 4 | a->n[7] << 22; + r->n[5] = a->n[6] >> 4 | a->n[7] << 22; r->n[6] = a->n[7] >> 10 | a->n[8] << 16; r->n[7] = a->n[8] >> 16 | a->n[9] << 10; } static SECP256K1_INLINE void secp256k1_fe_impl_from_storage(secp256k1_fe *r, const secp256k1_fe_storage *a) { r->n[0] = a->n[0] & 0x3FFFFFFUL; - r->n[1] = a->n[0] >> 26 | ((a->n[1] << 6) & 0x3FFFFFFUL); + r->n[1] = a->n[0] >> 26 | ((a->n[1] << 6) & 0x3FFFFFFUL); r->n[2] = a->n[1] >> 20 | ((a->n[2] << 12) & 0x3FFFFFFUL); r->n[3] = a->n[2] >> 14 | ((a->n[3] << 18) & 0x3FFFFFFUL); - r->n[4] = a->n[3] >> 8 | ((a->n[4] << 24) & 0x3FFFFFFUL); + r->n[4] = a->n[3] >> 8 | ((a->n[4] << 24) & 0x3FFFFFFUL); r->n[5] = (a->n[4] >> 2) & 0x3FFFFFFUL; - r->n[6] = a->n[4] >> 28 | ((a->n[5] << 4) & 0x3FFFFFFUL); + r->n[6] = a->n[4] >> 28 | ((a->n[5] << 4) & 0x3FFFFFFUL); r->n[7] = a->n[5] >> 22 | ((a->n[6] << 10) & 0x3FFFFFFUL); r->n[8] = a->n[6] >> 16 | ((a->n[7] << 16) & 0x3FFFFFFUL); r->n[9] = a->n[7] >> 10; diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 46dca6b981..9edec46050 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -14,6 +14,10 @@ #include "field_5x52_int128_impl.h" +#ifdef X86 +# include <immintrin.h> +#endif + #ifdef VERIFY static void secp256k1_fe_impl_verify(const secp256k1_fe *a) { const uint64_t *d = a->n; @@ -33,11 +37,20 @@ static void secp256k1_fe_impl_verify(const secp256k1_fe *a) { #endif static void secp256k1_fe_impl_get_bounds(secp256k1_fe *r, int m) { - r->n[0] = 0xFFFFFFFFFFFFFULL * 2 * m; - r->n[1] = 0xFFFFFFFFFFFFFULL * 2 * m; - r->n[2] = 0xFFFFFFFFFFFFFULL * 2 * m; - r->n[3] = 0xFFFFFFFFFFFFFULL * 2 * m; - r->n[4] = 0x0FFFFFFFFFFFFULL * 2 * m; + const uint64_t two_m = 2 * m; + const uint64_t bound1 = 0xFFFFFFFFFFFFFULL * two_m; + const uint64_t bound2 = 0x0FFFFFFFFFFFFULL * two_m; + +#ifdef __AVX__ + __m256i vec = _mm256_set1_epi64x(bound1); + _mm256_storeu_si256((__m256i *)r->n, vec); +#else + r->n[0] = bound1; + r->n[1] = bound1; + r->n[2] = bound1; + r->n[3] = bound1; +#endif + r->n[4] = bound2; } static void secp256k1_fe_impl_normalize(secp256k1_fe *r) { @@ -199,8 +212,8 @@ static int secp256k1_fe_impl_normalizes_to_zero_var(const
secp256k1_fe *r) { } SECP256K1_INLINE static void secp256k1_fe_impl_set_int(secp256k1_fe *r, int a) { + memset(r->n, 0, sizeof(r->n)); r->n[0] = a; - r->n[1] = r->n[2] = r->n[3] = r->n[4] = 0; } SECP256K1_INLINE static int secp256k1_fe_impl_is_zero(const secp256k1_fe *a) { @@ -214,52 +227,35 @@ SECP256K1_INLINE static int secp256k1_fe_impl_is_odd(const secp256k1_fe *a) { static int secp256k1_fe_impl_cmp_var(const secp256k1_fe *a, const secp256k1_fe *b) { int i; + int8_t diff; for (i = 4; i >= 0; i--) { - if (a->n[i] > b->n[i]) { - return 1; - } - if (a->n[i] < b->n[i]) { - return -1; + diff = (a->n[i] > b->n[i]) - (a->n[i] < b->n[i]); + if (diff != 0) { + return diff; } } return 0; } static void secp256k1_fe_impl_set_b32_mod(secp256k1_fe *r, const unsigned char *a) { - r->n[0] = (uint64_t)a[31] - | ((uint64_t)a[30] << 8) - | ((uint64_t)a[29] << 16) - | ((uint64_t)a[28] << 24) - | ((uint64_t)a[27] << 32) - | ((uint64_t)a[26] << 40) - | ((uint64_t)(a[25] & 0xF) << 48); - r->n[1] = (uint64_t)((a[25] >> 4) & 0xF) - | ((uint64_t)a[24] << 4) - | ((uint64_t)a[23] << 12) - | ((uint64_t)a[22] << 20) - | ((uint64_t)a[21] << 28) - | ((uint64_t)a[20] << 36) - | ((uint64_t)a[19] << 44); - r->n[2] = (uint64_t)a[18] - | ((uint64_t)a[17] << 8) - | ((uint64_t)a[16] << 16) - | ((uint64_t)a[15] << 24) - | ((uint64_t)a[14] << 32) - | ((uint64_t)a[13] << 40) - | ((uint64_t)(a[12] & 0xF) << 48); - r->n[3] = (uint64_t)((a[12] >> 4) & 0xF) - | ((uint64_t)a[11] << 4) - | ((uint64_t)a[10] << 12) - | ((uint64_t)a[9] << 20) - | ((uint64_t)a[8] << 28) - | ((uint64_t)a[7] << 36) - | ((uint64_t)a[6] << 44); - r->n[4] = (uint64_t)a[5] - | ((uint64_t)a[4] << 8) - | ((uint64_t)a[3] << 16) - | ((uint64_t)a[2] << 24) - | ((uint64_t)a[1] << 32) - | ((uint64_t)a[0] << 40); + uint64_t limbs[4]; + memcpy(limbs, a, 32); + +#ifdef SECP256K1_LITTLE_ENDIAN + limbs[0] = BYTESWAP_64(limbs[0]); + limbs[1] = BYTESWAP_64(limbs[1]); + limbs[2] = BYTESWAP_64(limbs[2]); + limbs[3] = BYTESWAP_64(limbs[3]); +#endif + + /* TODO: parallelize avx2 */ + + r->n[0] = (limbs[3] & 0xFFFFFFFFFFFFFULL); + r->n[1] = (limbs[3] >> 52) | ((limbs[2] & 0xFFFFFFFFFFULL) << 12); + r->n[2] = (limbs[2] >> 40) | ((limbs[1] & 0xFFFFFFFULL) << 24); + r->n[3] = (limbs[1] >> 28) | ((limbs[0] & 0xFFFFULL) << 36); + + r->n[4] = (limbs[0] >> 16) & 0xFFFFFFFFFFFFULL; } static int secp256k1_fe_impl_set_b32_limit(secp256k1_fe *r, const unsigned char *a) { @@ -269,53 +265,72 @@ static int secp256k1_fe_impl_set_b32_limit(secp256k1_fe *r, const unsigned char /** Convert a field element to a 32-byte big endian value. 
Requires the input to be normalized */ static void secp256k1_fe_impl_get_b32(unsigned char *r, const secp256k1_fe *a) { - r[0] = (a->n[4] >> 40) & 0xFF; - r[1] = (a->n[4] >> 32) & 0xFF; - r[2] = (a->n[4] >> 24) & 0xFF; - r[3] = (a->n[4] >> 16) & 0xFF; - r[4] = (a->n[4] >> 8) & 0xFF; - r[5] = a->n[4] & 0xFF; - r[6] = (a->n[3] >> 44) & 0xFF; - r[7] = (a->n[3] >> 36) & 0xFF; - r[8] = (a->n[3] >> 28) & 0xFF; - r[9] = (a->n[3] >> 20) & 0xFF; - r[10] = (a->n[3] >> 12) & 0xFF; - r[11] = (a->n[3] >> 4) & 0xFF; - r[12] = ((a->n[2] >> 48) & 0xF) | ((a->n[3] & 0xF) << 4); - r[13] = (a->n[2] >> 40) & 0xFF; - r[14] = (a->n[2] >> 32) & 0xFF; - r[15] = (a->n[2] >> 24) & 0xFF; - r[16] = (a->n[2] >> 16) & 0xFF; - r[17] = (a->n[2] >> 8) & 0xFF; - r[18] = a->n[2] & 0xFF; - r[19] = (a->n[1] >> 44) & 0xFF; - r[20] = (a->n[1] >> 36) & 0xFF; - r[21] = (a->n[1] >> 28) & 0xFF; - r[22] = (a->n[1] >> 20) & 0xFF; - r[23] = (a->n[1] >> 12) & 0xFF; - r[24] = (a->n[1] >> 4) & 0xFF; - r[25] = ((a->n[0] >> 48) & 0xF) | ((a->n[1] & 0xF) << 4); - r[26] = (a->n[0] >> 40) & 0xFF; - r[27] = (a->n[0] >> 32) & 0xFF; - r[28] = (a->n[0] >> 24) & 0xFF; - r[29] = (a->n[0] >> 16) & 0xFF; - r[30] = (a->n[0] >> 8) & 0xFF; - r[31] = a->n[0] & 0xFF; + const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4]; + + r[0] = (a4 >> 40) & 0xFF; + r[1] = (a4 >> 32) & 0xFF; + r[2] = (a4 >> 24) & 0xFF; + r[3] = (a4 >> 16) & 0xFF; + r[4] = (a4 >> 8) & 0xFF; + r[5] = a4 & 0xFF; + r[6] = (a3 >> 44) & 0xFF; + r[7] = (a3 >> 36) & 0xFF; + r[8] = (a3 >> 28) & 0xFF; + r[9] = (a3 >> 20) & 0xFF; + r[10] = (a3 >> 12) & 0xFF; + r[11] = (a3 >> 4) & 0xFF; + r[12] = ((a2 >> 48) & 0xF) | ((a3 & 0xF) << 4); + r[13] = (a2 >> 40) & 0xFF; + r[14] = (a2 >> 32) & 0xFF; + r[15] = (a2 >> 24) & 0xFF; + r[16] = (a2 >> 16) & 0xFF; + r[17] = (a2 >> 8) & 0xFF; + r[18] = a2 & 0xFF; + r[19] = (a1 >> 44) & 0xFF; + r[20] = (a1 >> 36) & 0xFF; + r[21] = (a1 >> 28) & 0xFF; + r[22] = (a1 >> 20) & 0xFF; + r[23] = (a1 >> 12) & 0xFF; + r[24] = (a1 >> 4) & 0xFF; + r[25] = ((a0 >> 48) & 0xF) | ((a1 & 0xF) << 4); + r[26] = (a0 >> 40) & 0xFF; + r[27] = (a0 >> 32) & 0xFF; + r[28] = (a0 >> 24) & 0xFF; + r[29] = (a0 >> 16) & 0xFF; + r[30] = (a0 >> 8) & 0xFF; + r[31] = a0 & 0xFF; } SECP256K1_INLINE static void secp256k1_fe_impl_negate_unchecked(secp256k1_fe *r, const secp256k1_fe *a, int m) { +#if defined(__AVX__) && defined(__AVX2__) + /* load here to mitigate load latency */ + __m256i vec_a = _mm256_loadu_si256((__m256i *)a->n); +#endif + const uint32_t two_m1 = 2 * (m + 1); + const uint64_t bound1 = 0xFFFFEFFFFFC2FULL * two_m1; + const uint64_t bound2 = 0xFFFFFFFFFFFFFULL * two_m1; + const uint64_t bound3 = 0x0FFFFFFFFFFFFULL * two_m1; + /* For all legal values of m (0..31), the following properties hold: */ - VERIFY_CHECK(0xFFFFEFFFFFC2FULL * 2 * (m + 1) >= 0xFFFFFFFFFFFFFULL * 2 * m); - VERIFY_CHECK(0xFFFFFFFFFFFFFULL * 2 * (m + 1) >= 0xFFFFFFFFFFFFFULL * 2 * m); - VERIFY_CHECK(0x0FFFFFFFFFFFFULL * 2 * (m + 1) >= 0x0FFFFFFFFFFFFULL * 2 * m); + VERIFY_CHECK(bound1 >= 0xFFFFFFFFFFFFFULL * 2 * m); + VERIFY_CHECK(bound2 >= 0xFFFFFFFFFFFFFULL * 2 * m); + VERIFY_CHECK(bound3 >= 0x0FFFFFFFFFFFFULL * 2 * m); /* Due to the properties above, the left hand in the subtractions below is never less than * the right hand. 
*/ - r->n[0] = 0xFFFFEFFFFFC2FULL * 2 * (m + 1) - a->n[0]; - r->n[1] = 0xFFFFFFFFFFFFFULL * 2 * (m + 1) - a->n[1]; - r->n[2] = 0xFFFFFFFFFFFFFULL * 2 * (m + 1) - a->n[2]; - r->n[3] = 0xFFFFFFFFFFFFFULL * 2 * (m + 1) - a->n[3]; - r->n[4] = 0x0FFFFFFFFFFFFULL * 2 * (m + 1) - a->n[4]; +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i vec_bounds = _mm256_setr_epi64x(bound1, bound2, bound2, bound2); + __m256i out = _mm256_sub_epi64(vec_bounds, vec_a); + _mm256_storeu_si256((__m256i *)r->n, out); + } +#else + r->n[0] = bound1 - a->n[0]; + r->n[1] = bound2 - a->n[1]; + r->n[2] = bound2 - a->n[2]; + r->n[3] = bound2 - a->n[3]; +#endif + r->n[4] = bound3 - a->n[4]; } SECP256K1_INLINE static void secp256k1_fe_impl_mul_int_unchecked(secp256k1_fe *r, int a) { @@ -347,15 +362,32 @@ SECP256K1_INLINE static void secp256k1_fe_impl_sqr(secp256k1_fe *r, const secp25 } SECP256K1_INLINE static void secp256k1_fe_impl_cmov(secp256k1_fe *r, const secp256k1_fe *a, int flag) { +#if defined(__AVX__) && defined(__AVX2__) + /* load here to mitigate load latency */ + __m256i vec_r = _mm256_loadu_si256((__m256i *)(r->n)); + __m256i vec_a = _mm256_loadu_si256((__m256i *)(a->n)); +#endif + uint64_t mask0, mask1; volatile int vflag = flag; SECP256K1_CHECKMEM_CHECK_VERIFY(r->n, sizeof(r->n)); mask0 = vflag + ~((uint64_t)0); mask1 = ~mask0; + +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i vec_mask0 = _mm256_set1_epi64x(mask0); + __m256i vec_mask1 = _mm256_set1_epi64x(mask1); + vec_r = _mm256_and_si256(vec_r, vec_mask0); + vec_a = _mm256_and_si256(vec_a, vec_mask1); + _mm256_storeu_si256((__m256i *)r->n, _mm256_or_si256(vec_r, vec_a)); + } +#else r->n[0] = (r->n[0] & mask0) | (a->n[0] & mask1); r->n[1] = (r->n[1] & mask0) | (a->n[1] & mask1); r->n[2] = (r->n[2] & mask0) | (a->n[2] & mask1); r->n[3] = (r->n[3] & mask0) | (a->n[3] & mask1); +#endif r->n[4] = (r->n[4] & mask0) | (a->n[4] & mask1); } @@ -426,18 +458,43 @@ static SECP256K1_INLINE void secp256k1_fe_storage_cmov(secp256k1_fe_storage *r, } static void secp256k1_fe_impl_to_storage(secp256k1_fe_storage *r, const secp256k1_fe *a) { - r->n[0] = a->n[0] | a->n[1] << 52; +#if defined(__AVX__) && defined(__AVX2__) + __m256i limbs_0123 = _mm256_loadu_si256((__m256i *)a->n); + __m256i limbs_1234 = _mm256_loadu_si256((__m256i *)(a->n + 1)); + const __m256i shift_lhs = _mm256_setr_epi64x(0, 12, 24, 36); /* TODO: precompute */ + const __m256i shift_rhs = _mm256_setr_epi64x(52, 40, 28, 16); /* TODO: precompute */ + __m256i rhs = _mm256_sllv_epi64(limbs_1234, shift_rhs); + __m256i lhs = _mm256_srlv_epi64(limbs_0123, shift_lhs); + _mm256_storeu_si256((__m256i *)r->n, _mm256_or_si256(lhs, rhs)); +#else + r->n[0] = a->n[0] | a->n[1] << 52; r->n[1] = a->n[1] >> 12 | a->n[2] << 40; r->n[2] = a->n[2] >> 24 | a->n[3] << 28; r->n[3] = a->n[3] >> 36 | a->n[4] << 16; +#endif } static SECP256K1_INLINE void secp256k1_fe_impl_from_storage(secp256k1_fe *r, const secp256k1_fe_storage *a) { - r->n[0] = a->n[0] & 0xFFFFFFFFFFFFFULL; - r->n[1] = a->n[0] >> 52 | ((a->n[1] << 12) & 0xFFFFFFFFFFFFFULL); - r->n[2] = a->n[1] >> 40 | ((a->n[2] << 24) & 0xFFFFFFFFFFFFFULL); - r->n[3] = a->n[2] >> 28 | ((a->n[3] << 36) & 0xFFFFFFFFFFFFFULL); - r->n[4] = a->n[3] >> 16; + const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3]; + +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3); + __m256i limbs_0012 = _mm256_setr_epi64x(a0, a0, a1, a2); + const __m256i shift_lhs = _mm256_setr_epi64x(64, 52, 40, 28); /* TODO: precompute */ 
+ const __m256i shift_rhs = _mm256_setr_epi64x(0, 12, 24, 36); /* TODO: precompute */ + const __m256i mask52 = _mm256_set1_epi64x(0xFFFFFFFFFFFFFULL); /* TODO: precompute */ + __m256i rhs = _mm256_and_si256(_mm256_sllv_epi64(limbs_0123, shift_rhs), mask52); + __m256i lhs = _mm256_srlv_epi64(limbs_0012, shift_lhs); + _mm256_storeu_si256((__m256i*)r->n, _mm256_or_si256(lhs, rhs)); + } +#else + r->n[0] = a0 & 0xFFFFFFFFFFFFFULL; + r->n[1] = a0 >> 52 | ((a1 << 12) & 0xFFFFFFFFFFFFFULL); + r->n[2] = a1 >> 40 | ((a2 << 24) & 0xFFFFFFFFFFFFFULL); + r->n[3] = a2 >> 28 | ((a3 << 36) & 0xFFFFFFFFFFFFFULL); +#endif + r->n[4] = a3 >> 16; } static void secp256k1_fe_from_signed62(secp256k1_fe *r, const secp256k1_modinv64_signed62 *a) { @@ -453,10 +510,24 @@ static void secp256k1_fe_from_signed62(secp256k1_fe *r, const secp256k1_modinv64 VERIFY_CHECK(a3 >> 62 == 0); VERIFY_CHECK(a4 >> 8 == 0); +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3); + __m256i limbs_0012 = _mm256_setr_epi64x(a0, a0, a1, a2); + const __m256i shift_lhs = _mm256_setr_epi64x(64, 52, 42, 32); /*TODO: precompute */ + const __m256i shift_rhs = _mm256_setr_epi64x(0, 10, 20, 30); /*TODO: precompute */ + const __m256i mask52 = _mm256_set1_epi64x(M52); /*TODO: precompute */ + __m256i rhs = _mm256_sllv_epi64(limbs_0123, shift_rhs); + __m256i lhs = _mm256_srlv_epi64(limbs_0012, shift_lhs); + __m256i out = _mm256_or_si256(lhs, rhs); + _mm256_storeu_si256((__m256i*)r->n, _mm256_and_si256(out, mask52)); + } +#else r->n[0] = a0 & M52; r->n[1] = (a0 >> 52 | a1 << 10) & M52; r->n[2] = (a1 >> 42 | a2 << 20) & M52; r->n[3] = (a2 >> 32 | a3 << 30) & M52; +#endif r->n[4] = (a3 >> 22 | a4 << 40); } @@ -464,10 +535,24 @@ static void secp256k1_fe_to_signed62(secp256k1_modinv64_signed62 *r, const secp2 const uint64_t M62 = UINT64_MAX >> 2; const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4]; +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3); + __m256i limbs_1234 = _mm256_setr_epi64x(a1, a2, a3, a4); + const __m256i shift_lhs = _mm256_setr_epi64x(0, 10, 20, 30); /*TODO: precompute */ + const __m256i shift_rhs = _mm256_setr_epi64x(52, 42, 32, 22); /*TODO: precompute */ + const __m256i mask62 = _mm256_set1_epi64x(M62); /*TODO: precompute */ + __m256i lhs = _mm256_srlv_epi64(limbs_0123, shift_lhs); + __m256i rhs = _mm256_sllv_epi64(limbs_1234, shift_rhs); + __m256i out = _mm256_or_si256(lhs, rhs); + _mm256_storeu_si256((__m256i *)r->v, _mm256_and_si256(out, mask62)); + } +#else r->v[0] = (a0 | a1 << 52) & M62; r->v[1] = (a1 >> 10 | a2 << 42) & M62; r->v[2] = (a2 >> 20 | a3 << 32) & M62; r->v[3] = (a3 >> 30 | a4 << 22) & M62; +#endif r->v[4] = a4 >> 40; } diff --git a/src/field_5x52_int128_impl.h b/src/field_5x52_int128_impl.h index f23f8ee1c4..309b2c1619 100644 --- a/src/field_5x52_int128_impl.h +++ b/src/field_5x52_int128_impl.h @@ -18,7 +18,7 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) { secp256k1_uint128 c, d; uint64_t t3, t4, tx, u0; - uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; + const uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL; VERIFY_BITS(a[0], 56); diff --git a/src/hash_impl.h b/src/hash_impl.h index 1065acd643..f3b5131d93 100644 --- a/src/hash_impl.h +++ b/src/hash_impl.h @@ -16,9 +16,9 @@ #define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ 
(z)))) #define Maj(x,y,z) (((x) & (y)) | ((z) & ((x) | (y)))) -#define Sigma0(x) (((x) >> 2 | (x) << 30) ^ ((x) >> 13 | (x) << 19) ^ ((x) >> 22 | (x) << 10)) -#define Sigma1(x) (((x) >> 6 | (x) << 26) ^ ((x) >> 11 | (x) << 21) ^ ((x) >> 25 | (x) << 7)) -#define sigma0(x) (((x) >> 7 | (x) << 25) ^ ((x) >> 18 | (x) << 14) ^ ((x) >> 3)) +#define Sigma0(x) (((x) >> 2 | (x) << 30) ^ ((x) >> 13 | (x) << 19) ^ ((x) >> 22 | (x) << 10)) +#define Sigma1(x) (((x) >> 6 | (x) << 26) ^ ((x) >> 11 | (x) << 21) ^ ((x) >> 25 | (x) << 7)) +#define sigma0(x) (((x) >> 7 | (x) << 25) ^ ((x) >> 18 | (x) << 14) ^ ((x) >> 3)) #define sigma1(x) (((x) >> 17 | (x) << 15) ^ ((x) >> 19 | (x) << 13) ^ ((x) >> 10)) #define Round(a,b,c,d,e,f,g,h,k,w) do { \ @@ -62,56 +62,56 @@ static void secp256k1_sha256_transform(uint32_t* s, const unsigned char* buf) { Round(c, d, e, f, g, h, a, b, 0x9bdc06a7, w14 = secp256k1_read_be32(&buf[56])); Round(b, c, d, e, f, g, h, a, 0xc19bf174, w15 = secp256k1_read_be32(&buf[60])); - Round(a, b, c, d, e, f, g, h, 0xe49b69c1, w0 += sigma1(w14) + w9 + sigma0(w1)); - Round(h, a, b, c, d, e, f, g, 0xefbe4786, w1 += sigma1(w15) + w10 + sigma0(w2)); - Round(g, h, a, b, c, d, e, f, 0x0fc19dc6, w2 += sigma1(w0) + w11 + sigma0(w3)); - Round(f, g, h, a, b, c, d, e, 0x240ca1cc, w3 += sigma1(w1) + w12 + sigma0(w4)); - Round(e, f, g, h, a, b, c, d, 0x2de92c6f, w4 += sigma1(w2) + w13 + sigma0(w5)); - Round(d, e, f, g, h, a, b, c, 0x4a7484aa, w5 += sigma1(w3) + w14 + sigma0(w6)); - Round(c, d, e, f, g, h, a, b, 0x5cb0a9dc, w6 += sigma1(w4) + w15 + sigma0(w7)); - Round(b, c, d, e, f, g, h, a, 0x76f988da, w7 += sigma1(w5) + w0 + sigma0(w8)); - Round(a, b, c, d, e, f, g, h, 0x983e5152, w8 += sigma1(w6) + w1 + sigma0(w9)); - Round(h, a, b, c, d, e, f, g, 0xa831c66d, w9 += sigma1(w7) + w2 + sigma0(w10)); - Round(g, h, a, b, c, d, e, f, 0xb00327c8, w10 += sigma1(w8) + w3 + sigma0(w11)); - Round(f, g, h, a, b, c, d, e, 0xbf597fc7, w11 += sigma1(w9) + w4 + sigma0(w12)); - Round(e, f, g, h, a, b, c, d, 0xc6e00bf3, w12 += sigma1(w10) + w5 + sigma0(w13)); - Round(d, e, f, g, h, a, b, c, 0xd5a79147, w13 += sigma1(w11) + w6 + sigma0(w14)); - Round(c, d, e, f, g, h, a, b, 0x06ca6351, w14 += sigma1(w12) + w7 + sigma0(w15)); - Round(b, c, d, e, f, g, h, a, 0x14292967, w15 += sigma1(w13) + w8 + sigma0(w0)); - - Round(a, b, c, d, e, f, g, h, 0x27b70a85, w0 += sigma1(w14) + w9 + sigma0(w1)); - Round(h, a, b, c, d, e, f, g, 0x2e1b2138, w1 += sigma1(w15) + w10 + sigma0(w2)); - Round(g, h, a, b, c, d, e, f, 0x4d2c6dfc, w2 += sigma1(w0) + w11 + sigma0(w3)); - Round(f, g, h, a, b, c, d, e, 0x53380d13, w3 += sigma1(w1) + w12 + sigma0(w4)); - Round(e, f, g, h, a, b, c, d, 0x650a7354, w4 += sigma1(w2) + w13 + sigma0(w5)); - Round(d, e, f, g, h, a, b, c, 0x766a0abb, w5 += sigma1(w3) + w14 + sigma0(w6)); - Round(c, d, e, f, g, h, a, b, 0x81c2c92e, w6 += sigma1(w4) + w15 + sigma0(w7)); - Round(b, c, d, e, f, g, h, a, 0x92722c85, w7 += sigma1(w5) + w0 + sigma0(w8)); - Round(a, b, c, d, e, f, g, h, 0xa2bfe8a1, w8 += sigma1(w6) + w1 + sigma0(w9)); - Round(h, a, b, c, d, e, f, g, 0xa81a664b, w9 += sigma1(w7) + w2 + sigma0(w10)); - Round(g, h, a, b, c, d, e, f, 0xc24b8b70, w10 += sigma1(w8) + w3 + sigma0(w11)); - Round(f, g, h, a, b, c, d, e, 0xc76c51a3, w11 += sigma1(w9) + w4 + sigma0(w12)); - Round(e, f, g, h, a, b, c, d, 0xd192e819, w12 += sigma1(w10) + w5 + sigma0(w13)); - Round(d, e, f, g, h, a, b, c, 0xd6990624, w13 += sigma1(w11) + w6 + sigma0(w14)); - Round(c, d, e, f, g, h, a, b, 0xf40e3585, w14 += sigma1(w12) + w7 + sigma0(w15)); 
- Round(b, c, d, e, f, g, h, a, 0x106aa070, w15 += sigma1(w13) + w8 + sigma0(w0)); - - Round(a, b, c, d, e, f, g, h, 0x19a4c116, w0 += sigma1(w14) + w9 + sigma0(w1)); - Round(h, a, b, c, d, e, f, g, 0x1e376c08, w1 += sigma1(w15) + w10 + sigma0(w2)); - Round(g, h, a, b, c, d, e, f, 0x2748774c, w2 += sigma1(w0) + w11 + sigma0(w3)); - Round(f, g, h, a, b, c, d, e, 0x34b0bcb5, w3 += sigma1(w1) + w12 + sigma0(w4)); - Round(e, f, g, h, a, b, c, d, 0x391c0cb3, w4 += sigma1(w2) + w13 + sigma0(w5)); - Round(d, e, f, g, h, a, b, c, 0x4ed8aa4a, w5 += sigma1(w3) + w14 + sigma0(w6)); - Round(c, d, e, f, g, h, a, b, 0x5b9cca4f, w6 += sigma1(w4) + w15 + sigma0(w7)); - Round(b, c, d, e, f, g, h, a, 0x682e6ff3, w7 += sigma1(w5) + w0 + sigma0(w8)); - Round(a, b, c, d, e, f, g, h, 0x748f82ee, w8 += sigma1(w6) + w1 + sigma0(w9)); - Round(h, a, b, c, d, e, f, g, 0x78a5636f, w9 += sigma1(w7) + w2 + sigma0(w10)); - Round(g, h, a, b, c, d, e, f, 0x84c87814, w10 += sigma1(w8) + w3 + sigma0(w11)); - Round(f, g, h, a, b, c, d, e, 0x8cc70208, w11 += sigma1(w9) + w4 + sigma0(w12)); - Round(e, f, g, h, a, b, c, d, 0x90befffa, w12 += sigma1(w10) + w5 + sigma0(w13)); - Round(d, e, f, g, h, a, b, c, 0xa4506ceb, w13 += sigma1(w11) + w6 + sigma0(w14)); - Round(c, d, e, f, g, h, a, b, 0xbef9a3f7, w14 + sigma1(w12) + w7 + sigma0(w15)); - Round(b, c, d, e, f, g, h, a, 0xc67178f2, w15 + sigma1(w13) + w8 + sigma0(w0)); + Round(a, b, c, d, e, f, g, h, 0xe49b69c1, w0 += sigma1(w14) + w9 + sigma0(w1)); + Round(h, a, b, c, d, e, f, g, 0xefbe4786, w1 += sigma1(w15) + w10 + sigma0(w2)); + Round(g, h, a, b, c, d, e, f, 0x0fc19dc6, w2 += sigma1(w0) + w11 + sigma0(w3)); + Round(f, g, h, a, b, c, d, e, 0x240ca1cc, w3 += sigma1(w1) + w12 + sigma0(w4)); + Round(e, f, g, h, a, b, c, d, 0x2de92c6f, w4 += sigma1(w2) + w13 + sigma0(w5)); + Round(d, e, f, g, h, a, b, c, 0x4a7484aa, w5 += sigma1(w3) + w14 + sigma0(w6)); + Round(c, d, e, f, g, h, a, b, 0x5cb0a9dc, w6 += sigma1(w4) + w15 + sigma0(w7)); + Round(b, c, d, e, f, g, h, a, 0x76f988da, w7 += sigma1(w5) + w0 + sigma0(w8)); + Round(a, b, c, d, e, f, g, h, 0x983e5152, w8 += sigma1(w6) + w1 + sigma0(w9)); + Round(h, a, b, c, d, e, f, g, 0xa831c66d, w9 += sigma1(w7) + w2 + sigma0(w10)); + Round(g, h, a, b, c, d, e, f, 0xb00327c8, w10 += sigma1(w8) + w3 + sigma0(w11)); + Round(f, g, h, a, b, c, d, e, 0xbf597fc7, w11 += sigma1(w9) + w4 + sigma0(w12)); + Round(e, f, g, h, a, b, c, d, 0xc6e00bf3, w12 += sigma1(w10) + w5 + sigma0(w13)); + Round(d, e, f, g, h, a, b, c, 0xd5a79147, w13 += sigma1(w11) + w6 + sigma0(w14)); + Round(c, d, e, f, g, h, a, b, 0x06ca6351, w14 += sigma1(w12) + w7 + sigma0(w15)); + Round(b, c, d, e, f, g, h, a, 0x14292967, w15 += sigma1(w13) + w8 + sigma0(w0)); + + Round(a, b, c, d, e, f, g, h, 0x27b70a85, w0 += sigma1(w14) + w9 + sigma0(w1)); + Round(h, a, b, c, d, e, f, g, 0x2e1b2138, w1 += sigma1(w15) + w10 + sigma0(w2)); + Round(g, h, a, b, c, d, e, f, 0x4d2c6dfc, w2 += sigma1(w0) + w11 + sigma0(w3)); + Round(f, g, h, a, b, c, d, e, 0x53380d13, w3 += sigma1(w1) + w12 + sigma0(w4)); + Round(e, f, g, h, a, b, c, d, 0x650a7354, w4 += sigma1(w2) + w13 + sigma0(w5)); + Round(d, e, f, g, h, a, b, c, 0x766a0abb, w5 += sigma1(w3) + w14 + sigma0(w6)); + Round(c, d, e, f, g, h, a, b, 0x81c2c92e, w6 += sigma1(w4) + w15 + sigma0(w7)); + Round(b, c, d, e, f, g, h, a, 0x92722c85, w7 += sigma1(w5) + w0 + sigma0(w8)); + Round(a, b, c, d, e, f, g, h, 0xa2bfe8a1, w8 += sigma1(w6) + w1 + sigma0(w9)); + Round(h, a, b, c, d, e, f, g, 0xa81a664b, w9 += sigma1(w7) + w2 + sigma0(w10)); + Round(g, 
h, a, b, c, d, e, f, 0xc24b8b70, w10 += sigma1(w8) + w3 + sigma0(w11)); + Round(f, g, h, a, b, c, d, e, 0xc76c51a3, w11 += sigma1(w9) + w4 + sigma0(w12)); + Round(e, f, g, h, a, b, c, d, 0xd192e819, w12 += sigma1(w10) + w5 + sigma0(w13)); + Round(d, e, f, g, h, a, b, c, 0xd6990624, w13 += sigma1(w11) + w6 + sigma0(w14)); + Round(c, d, e, f, g, h, a, b, 0xf40e3585, w14 += sigma1(w12) + w7 + sigma0(w15)); + Round(b, c, d, e, f, g, h, a, 0x106aa070, w15 += sigma1(w13) + w8 + sigma0(w0)); + + Round(a, b, c, d, e, f, g, h, 0x19a4c116, w0 += sigma1(w14) + w9 + sigma0(w1)); + Round(h, a, b, c, d, e, f, g, 0x1e376c08, w1 += sigma1(w15) + w10 + sigma0(w2)); + Round(g, h, a, b, c, d, e, f, 0x2748774c, w2 += sigma1(w0) + w11 + sigma0(w3)); + Round(f, g, h, a, b, c, d, e, 0x34b0bcb5, w3 += sigma1(w1) + w12 + sigma0(w4)); + Round(e, f, g, h, a, b, c, d, 0x391c0cb3, w4 += sigma1(w2) + w13 + sigma0(w5)); + Round(d, e, f, g, h, a, b, c, 0x4ed8aa4a, w5 += sigma1(w3) + w14 + sigma0(w6)); + Round(c, d, e, f, g, h, a, b, 0x5b9cca4f, w6 += sigma1(w4) + w15 + sigma0(w7)); + Round(b, c, d, e, f, g, h, a, 0x682e6ff3, w7 += sigma1(w5) + w0 + sigma0(w8)); + Round(a, b, c, d, e, f, g, h, 0x748f82ee, w8 += sigma1(w6) + w1 + sigma0(w9)); + Round(h, a, b, c, d, e, f, g, 0x78a5636f, w9 += sigma1(w7) + w2 + sigma0(w10)); + Round(g, h, a, b, c, d, e, f, 0x84c87814, w10 += sigma1(w8) + w3 + sigma0(w11)); + Round(f, g, h, a, b, c, d, e, 0x8cc70208, w11 += sigma1(w9) + w4 + sigma0(w12)); + Round(e, f, g, h, a, b, c, d, 0x90befffa, w12 += sigma1(w10) + w5 + sigma0(w13)); + Round(d, e, f, g, h, a, b, c, 0xa4506ceb, w13 += sigma1(w11) + w6 + sigma0(w14)); + Round(c, d, e, f, g, h, a, b, 0xbef9a3f7, w14 + sigma1(w12) + w7 + sigma0(w15)); + Round(b, c, d, e, f, g, h, a, 0xc67178f2, w15 + sigma1(w13) + w8 + sigma0(w0)); s[0] += a; s[1] += b; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 807b9b70ab..487c53594e 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -12,6 +12,10 @@ #include "modinv64_impl.h" #include "util.h" +#ifdef X86 +# include <immintrin.h> +#endif + /* Limbs of the secp256k1 order. */ #define SECP256K1_N_0 ((uint64_t)0xBFD25E8CD0364141ULL) #define SECP256K1_N_1 ((uint64_t)0xBAAEDCE6AF48A03BULL) @@ -62,10 +66,10 @@ SECP256K1_INLINE static uint32_t secp256k1_scalar_get_bits_var(const secp256k1_s SECP256K1_INLINE static int secp256k1_scalar_check_overflow(const secp256k1_scalar *a) { int yes = 0; int no = 0; - no |= (a->d[3] < SECP256K1_N_3); /* No need for a > check. */ - no |= (a->d[2] < SECP256K1_N_2); + no |= (a->d[3] < SECP256K1_N_3); /* No need for a > check.
*/ + no |= (a->d[2] < SECP256K1_N_2); yes |= (a->d[2] > SECP256K1_N_2) & ~no; - no |= (a->d[1] < SECP256K1_N_1); + no |= (a->d[1] < SECP256K1_N_1); yes |= (a->d[1] > SECP256K1_N_1) & ~no; yes |= (a->d[0] >= SECP256K1_N_0) & ~no; return yes; @@ -143,10 +147,25 @@ static void secp256k1_scalar_cadd_bit(secp256k1_scalar *r, unsigned int bit, int static void secp256k1_scalar_set_b32(secp256k1_scalar *r, const unsigned char *b32, int *overflow) { int over; + +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i vec_b32 = _mm256_loadu_si256((__m256i*)b32); + vec_b32 = _mm256_permute4x64_epi64(vec_b32, _MM_SHUFFLE(0,1,2,3)); + const __m256i bswap_mask = _mm256_setr_epi8( /* TODO: precompute */ + 7,6,5,4,3,2,1,0, + 15,14,13,12,11,10,9,8, + 23,22,21,20,19,18,17,16, + 31,30,29,28,27,26,25,24); + __m256i output = _mm256_shuffle_epi8(vec_b32, bswap_mask); + _mm256_storeu_si256((__m256i*)r->d, output); + } +#else r->d[0] = secp256k1_read_be64(&b32[24]); r->d[1] = secp256k1_read_be64(&b32[16]); r->d[2] = secp256k1_read_be64(&b32[8]); r->d[3] = secp256k1_read_be64(&b32[0]); +#endif over = secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r)); if (overflow) { *overflow = over; @@ -157,16 +176,28 @@ static void secp256k1_scalar_set_b32(secp256k1_scalar *r, const unsigned char *b static void secp256k1_scalar_get_b32(unsigned char *bin, const secp256k1_scalar* a) { SECP256K1_SCALAR_VERIFY(a); - +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i vec_a = _mm256_loadu_si256((__m256i*)a->d); + vec_a = _mm256_permute4x64_epi64(vec_a, _MM_SHUFFLE(0,1,2,3)); + const __m256i bswap_mask = _mm256_setr_epi8( /* TODO: precompute */ + 7,6,5,4,3,2,1,0, + 15,14,13,12,11,10,9,8, + 23,22,21,20,19,18,17,16, + 31,30,29,28,27,26,25,24); + __m256i output = _mm256_shuffle_epi8(vec_a, bswap_mask); + _mm256_storeu_si256((__m256i*)bin, output); + } +#else secp256k1_write_be64(&bin[0], a->d[3]); secp256k1_write_be64(&bin[8], a->d[2]); secp256k1_write_be64(&bin[16], a->d[1]); secp256k1_write_be64(&bin[24], a->d[0]); +#endif } SECP256K1_INLINE static int secp256k1_scalar_is_zero(const secp256k1_scalar *a) { SECP256K1_SCALAR_VERIFY(a); - return (a->d[0] | a->d[1] | a->d[2] | a->d[3]) == 0; } @@ -243,10 +274,10 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar *a) { int no = 0; SECP256K1_SCALAR_VERIFY(a); - no |= (a->d[3] < SECP256K1_N_H_3); + no |= (a->d[3] < SECP256K1_N_H_3); yes |= (a->d[3] > SECP256K1_N_H_3) & ~no; - no |= (a->d[2] < SECP256K1_N_H_2) & ~yes; /* No need for a > check. */ - no |= (a->d[1] < SECP256K1_N_H_1) & ~yes; + no |= (a->d[2] < SECP256K1_N_H_2) & ~yes; /* No need for a > check. 
*/ + no |= (a->d[1] < SECP256K1_N_H_1) & ~yes; yes |= (a->d[1] > SECP256K1_N_H_1) & ~no; yes |= (a->d[0] > SECP256K1_N_H_0) & ~no; return yes; @@ -882,8 +913,16 @@ static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r SECP256K1_INLINE static int secp256k1_scalar_eq(const secp256k1_scalar *a, const secp256k1_scalar *b) { SECP256K1_SCALAR_VERIFY(a); SECP256K1_SCALAR_VERIFY(b); - +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i vec_a = _mm256_loadu_si256((__m256i *)a->d); + __m256i vec_b = _mm256_loadu_si256((__m256i *)b->d); + __m256i vec_xor = _mm256_xor_si256(vec_a, vec_b); + return _mm256_testz_si256(vec_xor, vec_xor); + } +#else return ((a->d[0] ^ b->d[0]) | (a->d[1] ^ b->d[1]) | (a->d[2] ^ b->d[2]) | (a->d[3] ^ b->d[3])) == 0; +#endif } SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b, unsigned int shift) { @@ -899,6 +938,9 @@ SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar *r, shiftlimbs = shift >> 6; shiftlow = shift & 0x3F; shifthigh = 64 - shiftlow; + + /* TODO: parallelize */ + r->d[0] = shift < 512 ? (l[0 + shiftlimbs] >> shiftlow | (shift < 448 && shiftlow ? (l[1 + shiftlimbs] << shifthigh) : 0)) : 0; r->d[1] = shift < 448 ? (l[1 + shiftlimbs] >> shiftlow | (shift < 384 && shiftlow ? (l[2 + shiftlimbs] << shifthigh) : 0)) : 0; r->d[2] = shift < 384 ? (l[2 + shiftlimbs] >> shiftlow | (shift < 320 && shiftlow ? (l[3 + shiftlimbs] << shifthigh) : 0)) : 0; @@ -909,6 +951,12 @@ SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar *r, } static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const secp256k1_scalar *a, int flag) { +#if defined(__AVX__) && defined(__AVX2__) + /* load here to mitigate load latency */ + __m256i vec_r = _mm256_loadu_si256((__m256i *)(r->d)); + __m256i vec_a = _mm256_loadu_si256((__m256i *)(a->d)); +#endif + uint64_t mask0, mask1; volatile int vflag = flag; SECP256K1_SCALAR_VERIFY(a); @@ -916,30 +964,55 @@ static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const se mask0 = vflag + ~((uint64_t)0); mask1 = ~mask0; + +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i vec_mask0 = _mm256_set1_epi64x(mask0); + __m256i vec_mask1 = _mm256_set1_epi64x(mask1); + vec_r = _mm256_and_si256(vec_r, vec_mask0); + vec_a = _mm256_and_si256(vec_a, vec_mask1); + _mm256_storeu_si256((__m256i *)(r->d), _mm256_or_si256(vec_r, vec_a)); + } +#else r->d[0] = (r->d[0] & mask0) | (a->d[0] & mask1); r->d[1] = (r->d[1] & mask0) | (a->d[1] & mask1); r->d[2] = (r->d[2] & mask0) | (a->d[2] & mask1); r->d[3] = (r->d[3] & mask0) | (a->d[3] & mask1); +#endif SECP256K1_SCALAR_VERIFY(r); } static void secp256k1_scalar_from_signed62(secp256k1_scalar *r, const secp256k1_modinv64_signed62 *a) { - const uint64_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4]; - /* The output from secp256k1_modinv64{_var} should be normalized to range [0,modulus), and * have limbs in [0,2^62). The modulus is < 2^256, so the top limb must be below 2^(256-62*4). 
*/ - VERIFY_CHECK(a0 >> 62 == 0); - VERIFY_CHECK(a1 >> 62 == 0); - VERIFY_CHECK(a2 >> 62 == 0); - VERIFY_CHECK(a3 >> 62 == 0); - VERIFY_CHECK(a4 >> 8 == 0); + VERIFY_CHECK(a->v[0] >> 62 == 0); + VERIFY_CHECK(a->v[1] >> 62 == 0); + VERIFY_CHECK(a->v[2] >> 62 == 0); + VERIFY_CHECK(a->v[3] >> 62 == 0); + VERIFY_CHECK(a->v[4] >> 8 == 0); + +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i limbs_0123 = _mm256_loadu_si256((__m256i *)a->v); + __m256i limbs_1234 = _mm256_loadu_si256((__m256i *)(a->v + 1)); + const __m256i shift_lhs = _mm256_setr_epi64x(0, 2, 4, 6); /* TODO: precompute */ + const __m256i shift_rhs = _mm256_setr_epi64x(62, 60, 58, 56); /* TODO: precompute */ + __m256i lhs = _mm256_srlv_epi64(limbs_0123, shift_lhs); + __m256i rhs = _mm256_sllv_epi64(limbs_1234, shift_rhs); + _mm256_storeu_si256((__m256i *)(r->d), _mm256_or_si256(lhs, rhs)); + } +#else + { + const uint64_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4]; - r->d[0] = a0 | a1 << 62; - r->d[1] = a1 >> 2 | a2 << 60; - r->d[2] = a2 >> 4 | a3 << 58; - r->d[3] = a3 >> 6 | a4 << 56; + r->d[0] = a0 | a1 << 62; + r->d[1] = a1 >> 2 | a2 << 60; + r->d[2] = a2 >> 4 | a3 << 58; + r->d[3] = a3 >> 6 | a4 << 56; + } +#endif SECP256K1_SCALAR_VERIFY(r); } @@ -949,10 +1022,24 @@ static void secp256k1_scalar_to_signed62(secp256k1_modinv64_signed62 *r, const s const uint64_t a0 = a->d[0], a1 = a->d[1], a2 = a->d[2], a3 = a->d[3]; SECP256K1_SCALAR_VERIFY(a); +#if defined(__AVX__) && defined(__AVX2__) + { + __m256i limbs_0012 = _mm256_setr_epi64x(a0, a0, a1, a2); + __m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3); + const __m256i shift_lhs = _mm256_setr_epi64x(0, 62, 60, 58); /* TODO: precompute */ + const __m256i shift_rhs = _mm256_setr_epi64x(64, 2, 4, 6); /* TODO: precompute */ + const __m256i mask62 = _mm256_set1_epi64x(M62); /* TODO: precompute */ + __m256i lhs = _mm256_srlv_epi64(limbs_0012, shift_lhs); + __m256i rhs = _mm256_sllv_epi64(limbs_0123, shift_rhs); + __m256i out = _mm256_or_si256(lhs, rhs); + _mm256_storeu_si256((__m256i *)r->v, _mm256_and_si256(out, mask62)); + } +#else r->v[0] = a0 & M62; r->v[1] = (a0 >> 62 | a1 << 2) & M62; r->v[2] = (a1 >> 60 | a2 << 4) & M62; r->v[3] = (a2 >> 58 | a3 << 6) & M62; +#endif r->v[4] = a3 >> 56; } diff --git a/src/util.h b/src/util.h index 5f29f4076c..702a67b317 100644 --- a/src/util.h +++ b/src/util.h @@ -20,6 +20,37 @@ #include #endif +/* Endianness detection macros */ +#if defined(_MSC_VER) +# define SECP256K1_LITTLE_ENDIAN +#elif defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +# define SECP256K1_LITTLE_ENDIAN +#elif defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# define SECP256K1_BIG_ENDIAN +#else +# error "Cannot detect endianness" /* GCC <4.6 or embedded compilers */ +#endif + +/* Byteswap intrinsics */ +#if defined(_MSC_VER) +# define BYTESWAP_16(x) _byteswap_ushort(x) +# define BYTESWAP_32(x) _byteswap_ulong(x) +# define BYTESWAP_64(x) _byteswap_uint64(x) +#elif defined(__has_builtin) && __has_builtin(__builtin_bswap16) && __has_builtin(__builtin_bswap32) && __has_builtin(__builtin_bswap64) +# define BYTESWAP_16(x) __builtin_bswap16(x) +# define BYTESWAP_32(x) __builtin_bswap32(x) +# define BYTESWAP_64(x) __builtin_bswap64(x) +#else +# define BYTESWAP_16(x) (((x) >> 8) | ((x) << 8)) +# define BYTESWAP_32(x) (((x) >> 24) | (((x) >> 8) & 0xFF00) | (((x) << 8) & 0xFF0000) | ((x) << 24)) +# define BYTESWAP_64(x) (((x) >> 56) | (((x) >> 40) & 0xFF00) | (((x) >> 24) & 0xFF0000) | (((x) >> 8) & 0xFF000000ULL) | (((x)
<< 8) & 0xFF00000000ULL) | (((x) << 24) & 0xFF0000000000ULL) | (((x) << 40) & 0xFF000000000000ULL) | ((x) << 56) ) +#endif + +/* X86 detection macro */ +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) +# define X86 +#endif + #define STR_(x) #x #define STR(x) STR_(x) #define DEBUG_CONFIG_MSG(x) "DEBUG_CONFIG: " x @@ -398,42 +429,38 @@ static SECP256K1_INLINE int secp256k1_ctz64_var(uint64_t x) { /* Read a uint32_t in big endian */ SECP256K1_INLINE static uint32_t secp256k1_read_be32(const unsigned char* p) { - return (uint32_t)p[0] << 24 | - (uint32_t)p[1] << 16 | - (uint32_t)p[2] << 8 | - (uint32_t)p[3]; + uint32_t x; + memcpy(&x, p, sizeof(x)); +#ifdef SECP256K1_LITTLE_ENDIAN + x = BYTESWAP_32(x); +#endif + return x; } /* Write a uint32_t in big endian */ SECP256K1_INLINE static void secp256k1_write_be32(unsigned char* p, uint32_t x) { - p[3] = x; - p[2] = x >> 8; - p[1] = x >> 16; - p[0] = x >> 24; +#ifdef SECP256K1_LITTLE_ENDIAN + x = BYTESWAP_32(x); +#endif + memcpy(p, &x, sizeof(x)); } /* Read a uint64_t in big endian */ SECP256K1_INLINE static uint64_t secp256k1_read_be64(const unsigned char* p) { - return (uint64_t)p[0] << 56 | - (uint64_t)p[1] << 48 | - (uint64_t)p[2] << 40 | - (uint64_t)p[3] << 32 | - (uint64_t)p[4] << 24 | - (uint64_t)p[5] << 16 | - (uint64_t)p[6] << 8 | - (uint64_t)p[7]; + uint64_t x; + memcpy(&x, p, sizeof(x)); +#ifdef SECP256K1_LITTLE_ENDIAN + x = BYTESWAP_64(x); +#endif + return x; } /* Write a uint64_t in big endian */ SECP256K1_INLINE static void secp256k1_write_be64(unsigned char* p, uint64_t x) { - p[7] = x; - p[6] = x >> 8; - p[5] = x >> 16; - p[4] = x >> 24; - p[3] = x >> 32; - p[2] = x >> 40; - p[1] = x >> 48; - p[0] = x >> 56; +#ifdef SECP256K1_LITTLE_ENDIAN + x = BYTESWAP_64(x); +#endif + memcpy(p, &x, sizeof(x)); } /* Rotate a uint32_t to the right. */
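
A note on the recurring pattern in the AVX2 paths above: nearly every vectorized conversion (to_storage, from_storage, from_signed62, to_signed62) repacks limbs with one variable right shift (_mm256_srlv_epi64) of limbs i..i+3, one variable left shift (_mm256_sllv_epi64) of the neighbouring limbs, and an OR, relying on the AVX2 guarantee that a per-lane shift count of 64 or more yields zero. The following standalone sketch (not part of the patch; the file and function names are mine) demonstrates the technique on the 5x52 -> 4x64 storage packing and cross-checks it against the scalar code, mirroring what secp256k1_fe_impl_to_storage does above. Compile with e.g. gcc -O2 -mavx2 repack_demo.c.

/* repack_demo.c - hypothetical standalone demo of the sllv/srlv limb repack. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <immintrin.h>

/* Scalar reference: pack five 52-bit limbs into four 64-bit words. */
static void to_storage_scalar(uint64_t r[4], const uint64_t n[5]) {
    r[0] = n[0]       | n[1] << 52;
    r[1] = n[1] >> 12 | n[2] << 40;
    r[2] = n[2] >> 24 | n[3] << 28;
    r[3] = n[3] >> 36 | n[4] << 16;
}

/* AVX2: lane i computes (n[i] >> shift_lhs[i]) | (n[i+1] << shift_rhs[i]),
 * the same shape the patch uses for all limb-width conversions. */
static void to_storage_avx2(uint64_t r[4], const uint64_t n[5]) {
    __m256i limbs_0123 = _mm256_loadu_si256((const __m256i *)n);
    __m256i limbs_1234 = _mm256_loadu_si256((const __m256i *)(n + 1));
    const __m256i shift_lhs = _mm256_setr_epi64x(0, 12, 24, 36);
    const __m256i shift_rhs = _mm256_setr_epi64x(52, 40, 28, 16);
    __m256i lhs = _mm256_srlv_epi64(limbs_0123, shift_lhs);
    __m256i rhs = _mm256_sllv_epi64(limbs_1234, shift_rhs);
    _mm256_storeu_si256((__m256i *)r, _mm256_or_si256(lhs, rhs));
}

int main(void) {
    /* Arbitrary limbs below 2^52 (top limb below 2^48), as after normalization. */
    const uint64_t n[5] = { 0x123456789ABCDULL, 0xFEDCBA9876543ULL,
                            0x0F0F0F0F0F0F0ULL, 0xAAAAAAAAAAAAAULL,
                            0x23456789ABCULL };
    uint64_t ref[4], vec[4];
    to_storage_scalar(ref, n);
    to_storage_avx2(vec, n);
    printf("scalar and AVX2 packing %s\n",
           memcmp(ref, vec, sizeof ref) == 0 ? "agree" : "DISAGREE");
    return 0;
}

The same scalar-versus-vector cross-check is what the simd-test.sh/simd-bench.sh pair performs at whole-library granularity: the BASELINE build keeps the scalar fallbacks (__AVX__/__AVX2__ undefined via -U) while CUSTOM_SIMD enables the intrinsic paths, so ctest and the benchmarks exercise both sides of every #if branch added in this diff.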