From 0f0a87240a296ef50ffeb88588567fedd73c8d5f Mon Sep 17 00:00:00 2001
From: dor-forer <dor.forer@redis.com>
Date: Sun, 16 Mar 2025 12:10:31 +0200
Subject: [PATCH] Add Arm machines support to benchmarks [MOD-8531] (#600)

* Add arm support

* Changed the arm cpu info

* Add ip test

* Add to tests

* Added tests and bm

* fix tests

* Add github benchmarks

* Check 1

* only arm

* change ami

* Try ireland

* Try different image

* try image

* back to old image

* larger image

* Add option to change env

* back to default region

* Created new image

* Try to add the x86 to check

* Try different machine

* added include

* Try without opti on arm

* Change to c6g

* added matrix region

* change to west

* try the i8

* Try oregon

* Change subnet id

* Now subnet

* Change subnet

* add subnet

* Try group id

* Change to vpc id

* change subnet

* Change ami

* Try without subnet

* add security group again

* Change the subnets

* Change to ids

* Change sg

* psubnet

* Try different

* different

* to a file

* print

* p

* leave empty

* empty

* Try different account

* Run 2 arm machines

* Move both to us-west-2

* Try workflow

* Change name

* Changes

* Change the secrets

* Add supported arch

* Add defaults

* Support all

* Change the jq

* Change machine to t4g

* Change the name

* Change the machine

* fix the stop

* only benchmark

* add the secrets

* region secret

* benchmark region

* Change timeout

* Added support for arch name in benchmarks

* change the json

* changed to v9.0

* Change the check

* add v9

* Check alt version of armv9

* added check

* add arc_arch

* changed to CONCAT_WITH_UNDERSCORE_ARCH

* change the check

* Add full check

* fix the instruct

* Added the cmake

* fix the support

* put it back to cmake

* back

* change the condition

* No armpl for now

* clang format

* remove the opt

* Changed to one machine

* Added BENCHMARK_ARCH

* fix endif

* Remove secrets call

* pr changes

* support check for armv7

* Change or OR

(cherry picked from commit 977def571d21a7f1438e7b366c0dc8420f796597)
---
 .github/workflows/benchmark-runner.yml        | 112 +++++++++++++
 .github/workflows/benchmark.yml               | 158 +++++++-----------
 cmake/aarch64InstructionFlags.cmake           |  22 +++
 cmake/x86_64InstructionFlags.cmake            | 110 ++++++------
 tests/benchmark/CMakeLists.txt                |  14 +-
 tests/benchmark/bm_macros.h                   |  32 ++++
 tests/benchmark/bm_vecsim_general.h           |   1 +
 .../run_files/bm_basics_multi_bf16.cpp        |   8 +-
 .../run_files/bm_basics_multi_fp16.cpp        |   8 +-
 .../run_files/bm_basics_multi_fp32.cpp        |   8 +-
 .../run_files/bm_basics_multi_fp64.cpp        |   8 +-
 .../run_files/bm_basics_single_bf16.cpp       |   8 +-
 .../run_files/bm_basics_single_fp16.cpp       |   8 +-
 .../run_files/bm_basics_single_fp32.cpp       |   8 +-
 .../run_files/bm_basics_single_fp64.cpp       |   8 +-
 .../run_files/bm_basics_single_int8.cpp       |   8 +-
 .../bm_batch_iterator_multi_bf16.cpp          |   2 +-
 .../bm_batch_iterator_multi_fp16.cpp          |   2 +-
 .../bm_batch_iterator_multi_fp32.cpp          |   2 +-
 .../bm_batch_iterator_multi_fp64.cpp          |   2 +-
 .../bm_batch_iterator_single_bf16.cpp         |   2 +-
 .../bm_batch_iterator_single_fp16.cpp         |   2 +-
 .../bm_batch_iterator_single_fp32.cpp         |   2 +-
 .../bm_batch_iterator_single_fp64.cpp         |   2 +-
 .../bm_batch_iterator_single_int8.cpp         |   2 +-
 .../bm_updated_index_single_fp32.cpp          |   3 +-
 tests/benchmark/spaces_benchmarks/bm_spaces.h |  11 +-
 27 files changed, 346 insertions(+), 207 deletions(-)
 create mode 100644 .github/workflows/benchmark-runner.yml
 create mode 100644 cmake/aarch64InstructionFlags.cmake
 create mode 100644 tests/benchmark/bm_macros.h

diff --git a/.github/workflows/benchmark-runner.yml b/.github/workflows/benchmark-runner.yml
new file mode 100644
index 000000000..ff4cb5d75
--- /dev/null
+++ b/.github/workflows/benchmark-runner.yml
@@ -0,0 +1,112 @@
+on: # callable workflow: start an EC2 runner, run the benchmarks on it, then stop it
+  workflow_call:
+    inputs:
+      setup: # benchmark set; passed to bm_files.sh, to make as BM_FILTER, and to benchmarks.sh
+        required: true
+        type: string
+      architecture: # exported as ARCH in the environment of the "Run Benchmark" step
+        required: true
+        type: string
+      instance-type: # forwarded as ec2-instance-type to the "Start EC2 runner" step
+        required: true
+        type: string
+      ami-id: # forwarded as ec2-image-id to the "Start EC2 runner" step
+        required: true
+        type: string
+      github-runner-label: # forwarded as label to the ec2-github-runner action
+        required: true
+        type: string
+
+jobs:
+  start-runner: # brings up the EC2 machine that the benchmark job runs on
+    name: Start self-hosted EC2 runner
+    runs-on: ubuntu-latest
+    outputs: # read by the benchmark job (runs-on) and by stop-runner (instance id)
+      runner_label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2_instance_id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION_BENCHMARK }} # must match the region used by stop-runner
+      - name: Start EC2 runner
+        id: start-ec2-runner # referenced by the job outputs above
+        uses: machulav/ec2-github-runner@v2
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          ec2-image-id: ${{ inputs.ami-id }} # image and instance type are chosen by the caller
+          ec2-instance-type: ${{ inputs.instance-type }}
+          subnet-id: ${{ secrets.AWS_EC2_SUBNET_ID_BENCHMARK }}
+          security-group-id: ${{ secrets.AWS_EC2_SG_ID_BENCHMARK }}
+          label: ${{ inputs.github-runner-label }}
+
+  benchmark: # installs dependencies, runs the selected benchmarks, then exports the results
+    name: Run benchmarks on runner
+    needs: start-runner
+    runs-on: ${{ needs.start-runner.outputs.runner_label }} # label reported by start-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.number && format('refs/pull/{0}/merge', github.event.number) || github.head_ref }}
+      - name: Print runner info # output is passed as a %s argument so '%' or '\' in it prints verbatim
+        run: |
+          printf "Runner lscpu:\n%s\n" "$(lscpu)"
+          printf "Runner lsmem:\n%s\n" "$(lsmem)"
+          printf "Runner nproc:\n%s\n" "$(nproc)"
+          printf "Runner uname:\n%s\n" "$(uname -a)"
+          printf "Runner arch:\n%s\n" "$(arch)"
+      - name: Install benchmark dependencies
+        run: |
+          sudo .install/install_script.sh
+          sudo apt install python3-pip -y
+          pip3 install --upgrade pip PyYAML setuptools redisbench-admin
+          pip3 install -r requirements.txt
+      - name: Download pre-generated indices
+        timeout-minutes: 20
+        run: ./tests/benchmark/bm_files.sh ${{ inputs.setup }}
+      - name: Run Benchmark
+        env:
+          ARCH: ${{ inputs.architecture }} # architecture of this matrix entry, visible to make
+        timeout-minutes: 300
+        run: |
+          make benchmark BM_FILTER=${{ inputs.setup }}
+      - name: Collect results # one redisbench-admin export per name printed by benchmarks.sh
+        run: |
+          ./tests/benchmark/benchmarks.sh ${{ inputs.setup }} | xargs -P 0 -I {} redisbench-admin export \
+          --redistimeseries_host ${{ secrets.PERFORMANCE_RTS_HOST }} \
+          --redistimeseries_port ${{ secrets.PERFORMANCE_RTS_PORT }} \
+          --redistimeseries_user default \
+          --redistimeseries_pass '${{ secrets.PERFORMANCE_RTS_AUTH }}' \
+          --github_repo ${{ github.event.repository.name }} \
+          --github_org ${{ github.repository_owner }} \
+          --github_branch ${{ github.head_ref || github.ref_name }} \
+          --github_actor ${{ github.triggering_actor }} \
+          --results-format google.benchmark \
+          --benchmark-result-file {}_results.json
+
+  stop-runner: # always runs, so the EC2 instance is not left running after a failure
+    name: Stop self-hosted EC2 runner
+    needs:
+      - start-runner # required to get output from the start-runner job
+      - benchmark # required to wait when the main job is done
+    runs-on: ubuntu-latest
+    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION_BENCHMARK }}
+      - name: Stop EC2 runner
+        uses: machulav/ec2-github-runner@v2
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-runner.outputs.runner_label }} # the label start-runner reported, as used by runs-on
+          ec2-instance-id: ${{ needs.start-runner.outputs.ec2_instance_id }}
+
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index cc53ed43f..1655e26bb 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -4,6 +4,11 @@ on:
       setup:
         type: string
         required: true
+      architecture:
+        type: string
+        required: false
+        default: 'all'
+        description: 'Run only on specific architecture'
   workflow_dispatch:
     inputs:
       setup:
@@ -35,107 +40,66 @@ on:
           - bm-spaces
         description: 'Benchmarks set to run'
         default: benchmarks-all
+      architecture:
+          type: choice
+          options:
+            - all
+            - arm64
+            - x86_64
+          description: 'Run only on specific architecture'
+          default: 'all'
 
 jobs:
-  start-runner:
-    name: Start self-hosted EC2 runner
+  prepare_runner_configurations:
     runs-on: ubuntu-latest
     outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
-      - name: Start EC2 runner
-        id: start-ec2-runner
-        uses: machulav/ec2-github-runner@v2
-        with:
-          mode: start
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          # Ubuntu 22.04 128GB Storage AMI
-          ec2-image-id: ami-0ba430d4b7b64de57
-          ec2-instance-type: r7i.xlarge
-          subnet-id: ${{ secrets.AWS_EC2_SUBNET_ID }}
-          security-group-id: ${{ secrets.AWS_EC2_SG_ID }}
-
-  benchmark:
-    name: Run the benchmarks on the runner
-    needs: start-runner # required to start the main job when the runner is ready
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
     steps:
-      - name: checkout
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event.number && format('refs/pull/{0}/merge', github.event.number) || github.head_ref }}
-      - name: Print runner info
+      - name: Set matrix
+        id: set-matrix
         run: |
-          printf "Runner lscpu:\n$(lscpu)\n"
-          printf "Runner lsmem:\n$(lsmem)\n"
-          printf "Runner nproc:\n$(nproc)\n"
-          printf "Runner uname:\n$(uname -a)\n"
-
-      - name: Install benchmark dependencies
-        run: |
-             sudo .install/install_script.sh
-             sudo apt install python3-pip -y
-             pip3 install --upgrade pip PyYAML setuptools redisbench-admin
-             pip3 install -r requirements.txt
-
-      # - name: stress test
-      #   run: |
-      #     sudo apt install stress-ng -qqy
-      #     uptime
-      #     stress-ng -c 1 --timeout 60s --metrics-brief
-      #     uptime
-      #     stress-ng --stream 1 -t 60 --metrics-brief
-      #     uptime
-      #     stress-ng --ipsec-mb=1 -t 60 --metrics-brief
-      #     uptime
+          # Define the full matrix as a JSON string
+          FULL_MATRIX='
+          {
+            "include": [
+              {
+                "architecture": "arm64",
+                "instance-type": "i8g.xlarge",
+                "ami-id": "ami-0d6c92b636b74f843"
+              },
+              {
+                "architecture": "x86_64",
+                "instance-type": "r7i.xlarge",
+                "ami-id": "ami-09fabd03bb09b3704"
+              }
+            ]
+          }
+          '
+          
+          # Filter the matrix based on architecture
+          if [ "${{ inputs.architecture }}" = "all" ]; then
+            # Use the full matrix
+            FILTERED_MATRIX="$FULL_MATRIX"
+          else
+            # Keep only entries whose architecture contains the input (jq `contains` is a substring match)
+            FILTERED_MATRIX=$(echo "$FULL_MATRIX" | jq -c '{include: [.include[] | select(.architecture | contains("${{ inputs.architecture }}"))]}')
+          fi
+          
+          # Use multiline output delimiter syntax for GitHub Actions
+          echo "matrix<<EOF" >> $GITHUB_OUTPUT
+          echo "$FILTERED_MATRIX" >> $GITHUB_OUTPUT
+          echo "EOF" >> $GITHUB_OUTPUT
 
-      # TODO: remove "--no-check-certificate" when possible
-      - name: Download pre-generated indices
-        timeout-minutes: 20
-        run: ./tests/benchmark/bm_files.sh ${{ inputs.setup }}
-      - name: Benchmark
-        timeout-minutes: 120
-        run: make benchmark BM_FILTER=${{ inputs.setup }}
-
-      - name: Collect results
-        run: |
-          ./tests/benchmark/benchmarks.sh ${{ inputs.setup }} | xargs -P 0 -I {} redisbench-admin export     \
-          --redistimeseries_host      ${{ secrets.PERFORMANCE_RTS_HOST }}           \
-          --redistimeseries_port      ${{ secrets.PERFORMANCE_RTS_PORT }}           \
-          --redistimeseries_user      default                                       \
-          --redistimeseries_pass      '${{ secrets.PERFORMANCE_RTS_AUTH }}'         \
-          --github_repo               ${{ github.event.repository.name }}           \
-          --github_org                ${{ github.repository_owner }}                \
-          --github_branch             ${{ github.head_ref || github.ref_name }}     \
-          --github_actor              ${{ github.triggering_actor }}                \
-          --results-format            google.benchmark                              \
-          --benchmark-result-file     {}_results.json
-
-  stop-runner:
-    name: Stop self-hosted EC2 runner
-    needs:
-      - start-runner # required to get output from the start-runner job
-      - benchmark # required to wait when the main job is done
-    runs-on: ubuntu-latest
-    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
-      - name: Stop EC2 runner
-        uses: machulav/ec2-github-runner@v2
-        with:
-          mode: stop
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
+  run_benchmarks:
+    name: Run ${{ matrix.architecture }} benchmarks
+    needs: prepare_runner_configurations
+    uses: ./.github/workflows/benchmark-runner.yml
+    secrets: inherit
+    strategy:
+      matrix: ${{ fromJson(needs.prepare_runner_configurations.outputs.matrix) }}
+    with:
+      setup: ${{ inputs.setup }}
+      architecture: ${{ matrix.architecture }}
+      instance-type: ${{ matrix.instance-type }}
+      ami-id: ${{ matrix.ami-id }}
+      github-runner-label: ${{ matrix.architecture }}-${{ matrix.instance-type }}-${{ github.run_id }}
diff --git a/cmake/aarch64InstructionFlags.cmake b/cmake/aarch64InstructionFlags.cmake
new file mode 100644
index 000000000..7c1363262
--- /dev/null
+++ b/cmake/aarch64InstructionFlags.cmake
@@ -0,0 +1,22 @@
+include(CheckCXXCompilerFlag)
+
+# Probes which Arm -march flags the C++ compiler accepts and publishes matching OPT_* definitions.
+message(STATUS "Building for ARM aarch64")
+
+# Check what compiler flags are supported
+CHECK_CXX_COMPILER_FLAG("-march=armv7-a+neon" CXX_ARMV7_NEON)
+CHECK_CXX_COMPILER_FLAG("-march=armv8-a" CXX_ARMV8A)
+CHECK_CXX_COMPILER_FLAG("-march=armv8-a+sve" CXX_SVE)
+CHECK_CXX_COMPILER_FLAG("-march=armv9-a+sve2" CXX_ARMV9)
+
+# These are compiler-flag checks only; this file does not query the CPU of the build host.
+if(CXX_ARMV9)
+  message(STATUS "Using ARMv9.0-a with SVE2 (supported by compiler)")
+  add_compile_definitions(OPT_ARMV9)
+endif()
+if (CXX_ARMV8A OR CXX_ARMV7_NEON)
+  add_compile_definitions(OPT_NEON)
+endif()
+if (CXX_SVE)
+  add_compile_definitions(OPT_SVE)
+endif()
diff --git a/cmake/x86_64InstructionFlags.cmake b/cmake/x86_64InstructionFlags.cmake
index 1ff8f48f2..4976d60e6 100644
--- a/cmake/x86_64InstructionFlags.cmake
+++ b/cmake/x86_64InstructionFlags.cmake
@@ -3,73 +3,71 @@
 
 include(CheckCXXCompilerFlag)
 
-if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
-	# build SSE/AVX* code only on x64 processors.
-	# Check that the compiler supports instructions flag.
-	# This will add the relevant flag both the the space selector and the optimization.
-	CHECK_CXX_COMPILER_FLAG(-mavx512vl CXX_AVX512VL)
-	CHECK_CXX_COMPILER_FLAG(-mavx512bf16 CXX_AVX512BF16)
-	CHECK_CXX_COMPILER_FLAG(-mavx512bw CXX_AVX512BW)
-	CHECK_CXX_COMPILER_FLAG(-mavx512vbmi2 CXX_AVX512VBMI2)
-	CHECK_CXX_COMPILER_FLAG(-mavx512fp16 CXX_AVX512FP16)
-	CHECK_CXX_COMPILER_FLAG(-mavx512f CXX_AVX512F)
-	CHECK_CXX_COMPILER_FLAG(-mavx512vnni CXX_AVX512VNNI)
-	CHECK_CXX_COMPILER_FLAG(-mavx2 CXX_AVX2)
-	CHECK_CXX_COMPILER_FLAG(-mavx CXX_AVX)
-	CHECK_CXX_COMPILER_FLAG(-mf16c CXX_F16C)
-	CHECK_CXX_COMPILER_FLAG(-mfma CXX_FMA)
-	CHECK_CXX_COMPILER_FLAG(-msse3 CXX_SSE3)
-	CHECK_CXX_COMPILER_FLAG(-msse CXX_SSE)
+# build SSE/AVX* code only on x64 processors.
+# Check that the compiler supports instructions flag.
+# This will add the relevant flag both to the space selector and the optimization.
+CHECK_CXX_COMPILER_FLAG(-mavx512vl CXX_AVX512VL)
+CHECK_CXX_COMPILER_FLAG(-mavx512bf16 CXX_AVX512BF16)
+CHECK_CXX_COMPILER_FLAG(-mavx512bw CXX_AVX512BW)
+CHECK_CXX_COMPILER_FLAG(-mavx512vbmi2 CXX_AVX512VBMI2)
+CHECK_CXX_COMPILER_FLAG(-mavx512fp16 CXX_AVX512FP16)
+CHECK_CXX_COMPILER_FLAG(-mavx512f CXX_AVX512F)
+CHECK_CXX_COMPILER_FLAG(-mavx512vnni CXX_AVX512VNNI)
+CHECK_CXX_COMPILER_FLAG(-mavx2 CXX_AVX2)
+CHECK_CXX_COMPILER_FLAG(-mavx CXX_AVX)
+CHECK_CXX_COMPILER_FLAG(-mf16c CXX_F16C)
+CHECK_CXX_COMPILER_FLAG(-mfma CXX_FMA)
+CHECK_CXX_COMPILER_FLAG(-msse3 CXX_SSE3)
+CHECK_CXX_COMPILER_FLAG(-msse CXX_SSE)
 
-	# Turn off AVX512BF16 on Ubuntu 18.04 as it is not supported by its binutils assembler version.
-	if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
-		execute_process(COMMAND lsb_release -rs
-					OUTPUT_VARIABLE UBUNTU_VERSION
-					OUTPUT_STRIP_TRAILING_WHITESPACE)
+# Turn off AVX512BF16 on Ubuntu 18.04 as it is not supported by its binutils assembler version.
+if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
+	execute_process(COMMAND lsb_release -rs
+				OUTPUT_VARIABLE UBUNTU_VERSION
+				OUTPUT_STRIP_TRAILING_WHITESPACE)
 
-		if("${UBUNTU_VERSION}" STREQUAL "18.04")
-			message(STATUS "Compiling on Ubuntu 18.04, turning off CXX_AVX512BF16 flag.")
-			set(CXX_AVX512BF16 FALSE)
-		endif()
+	if("${UBUNTU_VERSION}" STREQUAL "18.04")
+		message(STATUS "Compiling on Ubuntu 18.04, turning off CXX_AVX512BF16 flag.")
+		set(CXX_AVX512BF16 FALSE)
 	endif()
+endif()
 
-	if(CXX_AVX512VL AND CXX_AVX512BF16)
-		add_compile_definitions(OPT_AVX512_BF16_VL)
-	endif()
+if(CXX_AVX512VL AND CXX_AVX512BF16)
+	add_compile_definitions(OPT_AVX512_BF16_VL)
+endif()
 
-	if(CXX_AVX512VL AND CXX_AVX512FP16)
-		add_compile_definitions(OPT_AVX512_FP16_VL)
-	endif()
+if(CXX_AVX512VL AND CXX_AVX512FP16)
+	add_compile_definitions(OPT_AVX512_FP16_VL)
+endif()
 
-	if(CXX_AVX512F)
-		add_compile_definitions(OPT_AVX512F)
-	endif()
+if(CXX_AVX512F)
+	add_compile_definitions(OPT_AVX512F)
+endif()
 
-	if(CXX_AVX512BW AND CXX_AVX512VBMI2)
-		add_compile_definitions(OPT_AVX512_BW_VBMI2)
-	endif()
+if(CXX_AVX512BW AND CXX_AVX512VBMI2)
+	add_compile_definitions(OPT_AVX512_BW_VBMI2)
+endif()
 
-	if(CXX_AVX512F AND CXX_AVX512BW AND CXX_AVX512VL AND CXX_AVX512VNNI)
-		add_compile_definitions(OPT_AVX512_F_BW_VL_VNNI)
-	endif()
+if(CXX_AVX512F AND CXX_AVX512BW AND CXX_AVX512VL AND CXX_AVX512VNNI)
+	add_compile_definitions(OPT_AVX512_F_BW_VL_VNNI)
+endif()
 
-	if(CXX_F16C AND CXX_FMA AND CXX_AVX)
-		add_compile_definitions(OPT_F16C)
-	endif()
+if(CXX_F16C AND CXX_FMA AND CXX_AVX)
+	add_compile_definitions(OPT_F16C)
+endif()
 
-	if(CXX_AVX2)
-		add_compile_definitions(OPT_AVX2)
-	endif()
+if(CXX_AVX2)
+	add_compile_definitions(OPT_AVX2)
+endif()
 
-	if(CXX_AVX)
-		add_compile_definitions(OPT_AVX)
-	endif()
+if(CXX_AVX)
+	add_compile_definitions(OPT_AVX)
+endif()
 
-	if(CXX_SSE3)
-		add_compile_definitions(OPT_SSE3)
-	endif()
+if(CXX_SSE3)
+	add_compile_definitions(OPT_SSE3)
+endif()
 
-	if(CXX_SSE)
-		add_compile_definitions(OPT_SSE)
-	endif()
+if(CXX_SSE)
+	add_compile_definitions(OPT_SSE)
 endif()
diff --git a/tests/benchmark/CMakeLists.txt b/tests/benchmark/CMakeLists.txt
index a82bec73d..7bf528de9 100644
--- a/tests/benchmark/CMakeLists.txt
+++ b/tests/benchmark/CMakeLists.txt
@@ -1,4 +1,3 @@
-
 message("# VectorSimilarity_Benchmark root: " ${root})
 message("# VectorSimilarity_Benchmark binroot: " ${binroot})
 
@@ -9,6 +8,7 @@ set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_FLAGS_DEBUG "-g")
 set(CMAKE_CXX_FLAGS_RELEASE "-O3")
 include_directories(../)
+include_directories(./)
 
 enable_testing()
 
@@ -25,12 +25,20 @@ foreach(benchmark IN ITEMS ${BENCHMARKS})
 	target_link_libraries(bm_${benchmark} VectorSimilarity benchmark::benchmark)
 endforeach()
 
+# Select the instruction-flag checks and the BENCHMARK_ARCH value from the host processor name.
+if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)|(ARM64)|(armv.*)")
+	message(STATUS "Building benchmarks for an Arm host")
+	include(${root}/cmake/aarch64InstructionFlags.cmake)
+	add_compile_definitions(BENCHMARK_ARCH=arm64)
+else()
+	include(${root}/cmake/x86_64InstructionFlags.cmake)
+	add_compile_definitions(BENCHMARK_ARCH=x86_64)
+endif()
+
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 # Spaces benchmarks								                                          #
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 
-include(${root}/cmake/x86_64InstructionFlags.cmake)
-
 set(DATA_TYPE fp32 fp64 bf16 fp16 int8 uint8)
 foreach(data_type IN LISTS DATA_TYPE)
 	add_executable(bm_spaces_${data_type} spaces_benchmarks/bm_spaces_${data_type}.cpp)
diff --git a/tests/benchmark/bm_macros.h b/tests/benchmark/bm_macros.h
new file mode 100644
index 000000000..e3b0a4ec3
--- /dev/null
+++ b/tests/benchmark/bm_macros.h
@@ -0,0 +1,32 @@
+#pragma once
+// Preprocessor helpers that build benchmark identifiers by joining tokens with underscores.
+#include "cpu_features_macros.h"
+
+#define EXPAND(x)  x
+#define EXPAND2(x) EXPAND(x)
+// Raw concatenation of 2 to 5 tokens with '_' (arguments are pasted exactly as written)
+#define BM_FUNC_NAME_HELPER1_2(a, b)          a##_##b
+#define BM_FUNC_NAME_HELPER1_3(a, b, c)       a##_##b##_##c
+#define BM_FUNC_NAME_HELPER1_4(a, b, c, d)    a##_##b##_##c##_##d
+#define BM_FUNC_NAME_HELPER1_5(a, b, c, d, e) a##_##b##_##c##_##d##_##e
+
+// Extra call level so that macro arguments are expanded before they reach the ## operators
+#define BM_FUNC_NAME_HELPER_2(a, b)          BM_FUNC_NAME_HELPER1_2(a, b)
+#define BM_FUNC_NAME_HELPER_3(a, b, c)       BM_FUNC_NAME_HELPER1_3(a, b, c)
+#define BM_FUNC_NAME_HELPER_4(a, b, c, d)    BM_FUNC_NAME_HELPER1_4(a, b, c, d)
+#define BM_FUNC_NAME_HELPER_5(a, b, c, d, e) BM_FUNC_NAME_HELPER1_5(a, b, c, d, e)
+
+// COUNT_ARGS(...) yields the number of arguments it was given (up to 6)
+#define COUNT_ARGS(...)                             COUNT_ARGS_(__VA_ARGS__, 6, 5, 4, 3, 2, 1)
+#define COUNT_ARGS_(_1, _2, _3, _4, _5, _6, N, ...) N
+
+// CONCAT(a, b) expands both arguments first and then pastes them as a_b
+#define CONCAT_HELPER(a, b) a##_##b
+#define CONCAT(a, b)        CONCAT_HELPER(a, b)
+
+// Dispatches to BM_FUNC_NAME_HELPER_<N> by argument count; helpers exist only for N = 2..5
+#define CONCAT_WITH_UNDERSCORE(...)                                                                \
+    EXPAND2(CONCAT(BM_FUNC_NAME_HELPER, EXPAND2(COUNT_ARGS(__VA_ARGS__)))(__VA_ARGS__))
+
+// Same as CONCAT_WITH_UNDERSCORE, with BENCHMARK_ARCH appended as the last component
+#define CONCAT_WITH_UNDERSCORE_ARCH(...) CONCAT_WITH_UNDERSCORE(__VA_ARGS__, BENCHMARK_ARCH)
diff --git a/tests/benchmark/bm_vecsim_general.h b/tests/benchmark/bm_vecsim_general.h
index bc95e54f5..d847f7fc7 100644
--- a/tests/benchmark/bm_vecsim_general.h
+++ b/tests/benchmark/bm_vecsim_general.h
@@ -20,6 +20,7 @@
 #include "VecSim/algorithms/hnsw/hnsw.h"
 #include "VecSim/index_factories/hnsw_factory.h"
 #include "bm_definitions.h"
+#include "bm_macros.h"
 #include "utils/mock_thread_pool.h"
 
 // This class includes every static data member that is:
diff --git a/tests/benchmark/run_files/bm_basics_multi_bf16.cpp b/tests/benchmark/run_files/bm_basics_multi_bf16.cpp
index 32ac09766..bfc570221 100644
--- a/tests/benchmark/run_files/bm_basics_multi_bf16.cpp
+++ b/tests/benchmark/run_files/bm_basics_multi_bf16.cpp
@@ -21,10 +21,10 @@ const char *BM_VecSimGeneral::hnsw_index_file =
 const char *BM_VecSimGeneral::test_queries_file =
     "tests/benchmark/data/fashion_images_multi_value-cosine-dim512-bf16-test_vectors.raw";
 
-#define BM_FUNC_NAME(bm_func, algo) bm_func##_##algo##_Multi
-#define BM_ADD_LABEL                AddLabel_Multi
-#define BM_ADD_LABEL_ASYNC          AddLabel_Async_Multi
-#define BM_DELETE_LABEL_ASYNC       DeleteLabel_Async_Multi
+#define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(bm_func, algo, Multi)
+#define BM_ADD_LABEL                CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Multi)
+#define BM_ADD_LABEL_ASYNC          CONCAT_WITH_UNDERSCORE_ARCH(AddLabel_Async, Multi)
+#define BM_DELETE_LABEL_ASYNC       CONCAT_WITH_UNDERSCORE_ARCH(DeleteLabel_Async, Multi)
 
 DEFINE_DELETE_LABEL(BM_FUNC_NAME(DeleteLabel, BF), bf16_index_t, BruteForceIndex_Multi,
                     vecsim_types::bfloat16, float, VecSimAlgo_BF)
diff --git a/tests/benchmark/run_files/bm_basics_multi_fp16.cpp b/tests/benchmark/run_files/bm_basics_multi_fp16.cpp
index 04d0d04df..2113f3071 100644
--- a/tests/benchmark/run_files/bm_basics_multi_fp16.cpp
+++ b/tests/benchmark/run_files/bm_basics_multi_fp16.cpp
@@ -21,10 +21,10 @@ const char *BM_VecSimGeneral::hnsw_index_file =
 const char *BM_VecSimGeneral::test_queries_file =
     "tests/benchmark/data/fashion_images_multi_value-cosine-dim512-fp16-test_vectors.raw";
 
-#define BM_FUNC_NAME(bm_func, algo) bm_func##_##algo##_Multi
-#define BM_ADD_LABEL                AddLabel_Multi
-#define BM_ADD_LABEL_ASYNC          AddLabel_Async_Multi
-#define BM_DELETE_LABEL_ASYNC       DeleteLabel_Async_Multi
+#define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(bm_func, algo, Multi)
+#define BM_ADD_LABEL                CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Multi)
+#define BM_ADD_LABEL_ASYNC          CONCAT_WITH_UNDERSCORE_ARCH(AddLabel_Async, Multi)
+#define BM_DELETE_LABEL_ASYNC       CONCAT_WITH_UNDERSCORE_ARCH(DeleteLabel_Async, Multi)
 
 DEFINE_DELETE_LABEL(BM_FUNC_NAME(DeleteLabel, BF), fp16_index_t, BruteForceIndex_Multi,
                     vecsim_types::float16, float, VecSimAlgo_BF)
diff --git a/tests/benchmark/run_files/bm_basics_multi_fp32.cpp b/tests/benchmark/run_files/bm_basics_multi_fp32.cpp
index fd614f49d..2c79baf18 100644
--- a/tests/benchmark/run_files/bm_basics_multi_fp32.cpp
+++ b/tests/benchmark/run_files/bm_basics_multi_fp32.cpp
@@ -20,10 +20,10 @@ const char *BM_VecSimGeneral::hnsw_index_file =
 const char *BM_VecSimGeneral::test_queries_file =
     "tests/benchmark/data/fashion_images_multi_value-cosine-dim512-test_vectors.raw";
 
-#define BM_FUNC_NAME(bm_func, algo) bm_func##_##algo##_Multi
-#define BM_ADD_LABEL                AddLabel_Multi
-#define BM_ADD_LABEL_ASYNC          AddLabel_Async_Multi
-#define BM_DELETE_LABEL_ASYNC       DeleteLabel_Async_Multi
+#define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(bm_func, algo, Multi)
+#define BM_ADD_LABEL                CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Multi)
+#define BM_ADD_LABEL_ASYNC          CONCAT_WITH_UNDERSCORE_ARCH(AddLabel_Async, Multi)
+#define BM_DELETE_LABEL_ASYNC       CONCAT_WITH_UNDERSCORE_ARCH(DeleteLabel_Async, Multi)
 
 DEFINE_DELETE_LABEL(BM_FUNC_NAME(DeleteLabel, BF), fp32_index_t, BruteForceIndex_Multi, float,
                     float, VecSimAlgo_BF)
diff --git a/tests/benchmark/run_files/bm_basics_multi_fp64.cpp b/tests/benchmark/run_files/bm_basics_multi_fp64.cpp
index daad4bc90..9af1e9301 100644
--- a/tests/benchmark/run_files/bm_basics_multi_fp64.cpp
+++ b/tests/benchmark/run_files/bm_basics_multi_fp64.cpp
@@ -20,10 +20,10 @@ const char *BM_VecSimGeneral::hnsw_index_file =
 const char *BM_VecSimGeneral::test_queries_file =
     "tests/benchmark/data/fashion_images_multi_value-cosine-dim512-fp64-test_vectors.raw";
 
-#define BM_FUNC_NAME(bm_func, algo) bm_func##_##algo##_Multi
-#define BM_ADD_LABEL                AddLabel_Multi
-#define BM_ADD_LABEL_ASYNC          AddLabel_Async_Multi
-#define BM_DELETE_LABEL_ASYNC       DeleteLabel_Async_Multi
+#define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(bm_func, algo, Multi)
+#define BM_ADD_LABEL                CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Multi)
+#define BM_ADD_LABEL_ASYNC          CONCAT_WITH_UNDERSCORE_ARCH(AddLabel_Async, Multi)
+#define BM_DELETE_LABEL_ASYNC       CONCAT_WITH_UNDERSCORE_ARCH(DeleteLabel_Async, Multi)
 
 DEFINE_DELETE_LABEL(BM_FUNC_NAME(DeleteLabel, BF), fp64_index_t, BruteForceIndex_Multi, double,
                     double, VecSimAlgo_BF)
diff --git a/tests/benchmark/run_files/bm_basics_single_bf16.cpp b/tests/benchmark/run_files/bm_basics_single_bf16.cpp
index 505edbe75..a94e7a27e 100644
--- a/tests/benchmark/run_files/bm_basics_single_bf16.cpp
+++ b/tests/benchmark/run_files/bm_basics_single_bf16.cpp
@@ -21,10 +21,10 @@ const char *BM_VecSimGeneral::hnsw_index_file =
 const char *BM_VecSimGeneral::test_queries_file =
     "tests/benchmark/data/dbpedia-cosine-dim768-bf16-test_vectors.raw";
 
-#define BM_FUNC_NAME(bm_func, algo) bm_func##_##algo##_Single
-#define BM_ADD_LABEL                AddLabel_Single
-#define BM_ADD_LABEL_ASYNC          AddLabel_Async_Single
-#define BM_DELETE_LABEL_ASYNC       DeleteLabel_Async_Single
+#define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(bm_func, algo, Single)
+#define BM_ADD_LABEL                CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Single)
+#define BM_ADD_LABEL_ASYNC          CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Async, Single)
+#define BM_DELETE_LABEL_ASYNC       CONCAT_WITH_UNDERSCORE_ARCH(DeleteLabel, Async, Single)
 
 DEFINE_DELETE_LABEL(BM_FUNC_NAME(DeleteLabel, BF), bf16_index_t, BruteForceIndex_Single,
                     vecsim_types::bfloat16, float, VecSimAlgo_BF)
diff --git a/tests/benchmark/run_files/bm_basics_single_fp16.cpp b/tests/benchmark/run_files/bm_basics_single_fp16.cpp
index 0a1948b84..a158d5a89 100644
--- a/tests/benchmark/run_files/bm_basics_single_fp16.cpp
+++ b/tests/benchmark/run_files/bm_basics_single_fp16.cpp
@@ -21,10 +21,10 @@ const char *BM_VecSimGeneral::hnsw_index_file =
 const char *BM_VecSimGeneral::test_queries_file =
     "tests/benchmark/data/dbpedia-cosine-dim768-fp16-test_vectors.raw";
 
-#define BM_FUNC_NAME(bm_func, algo) bm_func##_##algo##_Single
-#define BM_ADD_LABEL                AddLabel_Single
-#define BM_ADD_LABEL_ASYNC          AddLabel_Async_Single
-#define BM_DELETE_LABEL_ASYNC       DeleteLabel_Async_Single
+#define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(bm_func, algo, Single)
+#define BM_ADD_LABEL                CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Single)
+#define BM_ADD_LABEL_ASYNC          CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Async, Single)
+#define BM_DELETE_LABEL_ASYNC       CONCAT_WITH_UNDERSCORE_ARCH(DeleteLabel, Async, Single)
 
 DEFINE_DELETE_LABEL(BM_FUNC_NAME(DeleteLabel, BF), fp16_index_t, BruteForceIndex_Single,
                     vecsim_types::float16, float, VecSimAlgo_BF)
diff --git a/tests/benchmark/run_files/bm_basics_single_fp32.cpp b/tests/benchmark/run_files/bm_basics_single_fp32.cpp
index 889927779..620845d54 100644
--- a/tests/benchmark/run_files/bm_basics_single_fp32.cpp
+++ b/tests/benchmark/run_files/bm_basics_single_fp32.cpp
@@ -20,10 +20,10 @@ const char *BM_VecSimGeneral::hnsw_index_file =
 const char *BM_VecSimGeneral::test_queries_file =
     "tests/benchmark/data/dbpedia-cosine-dim768-test_vectors.raw";
 
-#define BM_FUNC_NAME(bm_func, algo) bm_func##_##algo##_Single
-#define BM_ADD_LABEL                AddLabel_Single
-#define BM_ADD_LABEL_ASYNC          AddLabel_Async_Single
-#define BM_DELETE_LABEL_ASYNC       DeleteLabel_Async_Single
+#define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(bm_func, algo, Single)
+#define BM_ADD_LABEL                CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Single)
+#define BM_ADD_LABEL_ASYNC          CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Async, Single)
+#define BM_DELETE_LABEL_ASYNC       CONCAT_WITH_UNDERSCORE_ARCH(DeleteLabel, Async, Single)
 
 DEFINE_DELETE_LABEL(BM_FUNC_NAME(DeleteLabel, BF), fp32_index_t, BruteForceIndex_Single, float,
                     float, VecSimAlgo_BF)
diff --git a/tests/benchmark/run_files/bm_basics_single_fp64.cpp b/tests/benchmark/run_files/bm_basics_single_fp64.cpp
index c3a6b5664..47ea1362d 100644
--- a/tests/benchmark/run_files/bm_basics_single_fp64.cpp
+++ b/tests/benchmark/run_files/bm_basics_single_fp64.cpp
@@ -20,10 +20,10 @@ const char *BM_VecSimGeneral::hnsw_index_file =
 const char *BM_VecSimGeneral::test_queries_file =
     "tests/benchmark/data/dbpedia-cosine-dim768-fp64-test_vectors.raw";
 
-#define BM_FUNC_NAME(bm_func, algo) bm_func##_##algo##_Single
-#define BM_ADD_LABEL                AddLabel_Single
-#define BM_ADD_LABEL_ASYNC          AddLabel_Async_Single
-#define BM_DELETE_LABEL_ASYNC       DeleteLabel_Async_Single
+#define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(bm_func, algo, Single)
+#define BM_ADD_LABEL                CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Single)
+#define BM_ADD_LABEL_ASYNC          CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Async, Single)
+#define BM_DELETE_LABEL_ASYNC       CONCAT_WITH_UNDERSCORE_ARCH(DeleteLabel, Async, Single)
 
 DEFINE_DELETE_LABEL(BM_FUNC_NAME(DeleteLabel, BF), fp64_index_t, BruteForceIndex_Single, double,
                     double, VecSimAlgo_BF)
diff --git a/tests/benchmark/run_files/bm_basics_single_int8.cpp b/tests/benchmark/run_files/bm_basics_single_int8.cpp
index 09fbb47c8..919b5efa3 100644
--- a/tests/benchmark/run_files/bm_basics_single_int8.cpp
+++ b/tests/benchmark/run_files/bm_basics_single_int8.cpp
@@ -20,10 +20,10 @@ const char *BM_VecSimGeneral::hnsw_index_file =
 const char *BM_VecSimGeneral::test_queries_file =
     "tests/benchmark/data/wipedia_single-cosine-dim1024-int8-test_vectors.raw";
 
-#define BM_FUNC_NAME(bm_func, algo) bm_func##_##algo##_Single
-#define BM_ADD_LABEL                AddLabel_Single
-#define BM_ADD_LABEL_ASYNC          AddLabel_Async_Single
-#define BM_DELETE_LABEL_ASYNC       DeleteLabel_Async_Single
+#define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(bm_func, algo, Single)
+#define BM_ADD_LABEL                CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Single)
+#define BM_ADD_LABEL_ASYNC          CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Async, Single)
+#define BM_DELETE_LABEL_ASYNC       CONCAT_WITH_UNDERSCORE_ARCH(DeleteLabel, Async, Single)
 
 DEFINE_DELETE_LABEL(BM_FUNC_NAME(DeleteLabel, BF), int8_index_t, BruteForceIndex_Single, int8_t,
                     float, VecSimAlgo_BF)
diff --git a/tests/benchmark/run_files/bm_batch_iterator_multi_bf16.cpp b/tests/benchmark/run_files/bm_batch_iterator_multi_bf16.cpp
index fff5784bb..ccfd5686e 100644
--- a/tests/benchmark/run_files/bm_batch_iterator_multi_bf16.cpp
+++ b/tests/benchmark/run_files/bm_batch_iterator_multi_bf16.cpp
@@ -16,7 +16,7 @@ const char *BM_VecSimGeneral::hnsw_index_file =
 const char *BM_VecSimGeneral::test_queries_file =
     "tests/benchmark/data/fashion_images_multi_value-cosine-dim512-bf16-test_vectors.raw";
 
-#define BM_FUNC_NAME(bm_func, algo) algo##_##bm_func##_Multi
+#define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(algo, bm_func, Multi)
 
 #include "benchmark/bm_initialization/bm_batch_initialize_bf16.h"
 
diff --git a/tests/benchmark/run_files/bm_batch_iterator_multi_fp16.cpp b/tests/benchmark/run_files/bm_batch_iterator_multi_fp16.cpp
index 1493cb897..7a73a6d10 100644
--- a/tests/benchmark/run_files/bm_batch_iterator_multi_fp16.cpp
+++ b/tests/benchmark/run_files/bm_batch_iterator_multi_fp16.cpp
@@ -16,7 +16,7 @@ const char *BM_VecSimGeneral::hnsw_index_file =
 const char *BM_VecSimGeneral::test_queries_file =
     "tests/benchmark/data/fashion_images_multi_value-cosine-dim512-fp16-test_vectors.raw";
 
-#define BM_FUNC_NAME(bm_func, algo) algo##_##bm_func##_Multi
+#define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(algo, bm_func, Multi)
 
 #include "benchmark/bm_initialization/bm_batch_initialize_fp16.h"
 
diff --git a/tests/benchmark/run_files/bm_batch_iterator_multi_fp32.cpp b/tests/benchmark/run_files/bm_batch_iterator_multi_fp32.cpp
index b8d47f72e..3c3a22509 100644
--- a/tests/benchmark/run_files/bm_batch_iterator_multi_fp32.cpp
+++ b/tests/benchmark/run_files/bm_batch_iterator_multi_fp32.cpp
@@ -16,7 +16,7 @@ const char *BM_VecSimGeneral::hnsw_index_file =
 const char *BM_VecSimGeneral::test_queries_file =
     "tests/benchmark/data/fashion_images_multi_value-cosine-dim512-test_vectors.raw";
 
-#define BM_FUNC_NAME(bm_func, algo) algo##_##bm_func##_Multi
+#define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(algo, bm_func, Multi)
 
 #include "benchmark/bm_initialization/bm_batch_initialize_fp32.h"
 
diff --git a/tests/benchmark/run_files/bm_batch_iterator_multi_fp64.cpp b/tests/benchmark/run_files/bm_batch_iterator_multi_fp64.cpp
index f3d970b48..a75635359 100644
--- a/tests/benchmark/run_files/bm_batch_iterator_multi_fp64.cpp
+++ b/tests/benchmark/run_files/bm_batch_iterator_multi_fp64.cpp
@@ -16,7 +16,7 @@ const char *BM_VecSimGeneral::hnsw_index_file =
 const char *BM_VecSimGeneral::test_queries_file =
     "tests/benchmark/data/fashion_images_multi_value-cosine-dim512-fp64-test_vectors.raw";
 
-#define BM_FUNC_NAME(bm_func, algo) algo##_##bm_func##_Multi
+#define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(algo, bm_func, Multi)
 
 #include "benchmark/bm_initialization/bm_batch_initialize_fp64.h"
 
diff --git a/tests/benchmark/run_files/bm_batch_iterator_single_bf16.cpp b/tests/benchmark/run_files/bm_batch_iterator_single_bf16.cpp
index 869b658d9..351f89167 100644
--- a/tests/benchmark/run_files/bm_batch_iterator_single_bf16.cpp
+++ b/tests/benchmark/run_files/bm_batch_iterator_single_bf16.cpp
@@ -16,7 +16,7 @@ const char *BM_VecSimGeneral::hnsw_index_file =
 const char *BM_VecSimGeneral::test_queries_file =
     "tests/benchmark/data/dbpedia-cosine-dim768-bf16-test_vectors.raw";
 
-#define BM_FUNC_NAME(bm_func, algo) algo##_##bm_func##_Single
+#define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(algo, bm_func, Single)
 
 #include "benchmark/bm_initialization/bm_batch_initialize_bf16.h"
 
diff --git a/tests/benchmark/run_files/bm_batch_iterator_single_fp16.cpp b/tests/benchmark/run_files/bm_batch_iterator_single_fp16.cpp
index 1e411a142..35ca175ff 100644
--- a/tests/benchmark/run_files/bm_batch_iterator_single_fp16.cpp
+++ b/tests/benchmark/run_files/bm_batch_iterator_single_fp16.cpp
@@ -16,7 +16,7 @@ const char *BM_VecSimGeneral::hnsw_index_file =
 const char *BM_VecSimGeneral::test_queries_file =
     "tests/benchmark/data/dbpedia-cosine-dim768-fp16-test_vectors.raw";
 
-#define BM_FUNC_NAME(bm_func, algo) algo##_##bm_func##_Single
+#define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(algo, bm_func, Single)
 
 #include "benchmark/bm_initialization/bm_batch_initialize_fp16.h"
 
diff --git a/tests/benchmark/run_files/bm_batch_iterator_single_fp32.cpp b/tests/benchmark/run_files/bm_batch_iterator_single_fp32.cpp
index 5cfaa737d..b45613e23 100644
--- a/tests/benchmark/run_files/bm_batch_iterator_single_fp32.cpp
+++ b/tests/benchmark/run_files/bm_batch_iterator_single_fp32.cpp
@@ -16,7 +16,7 @@ const char *BM_VecSimGeneral::hnsw_index_file =
 const char *BM_VecSimGeneral::test_queries_file =
     "tests/benchmark/data/dbpedia-cosine-dim768-test_vectors.raw";
 
-#define BM_FUNC_NAME(bm_func, algo) algo##_##bm_func##_Single
+#define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(algo, bm_func, Single)
 
 #include "benchmark/bm_initialization/bm_batch_initialize_fp32.h"
 
diff --git a/tests/benchmark/run_files/bm_batch_iterator_single_fp64.cpp b/tests/benchmark/run_files/bm_batch_iterator_single_fp64.cpp
index 7978e0eb0..8ef5cd22f 100644
--- a/tests/benchmark/run_files/bm_batch_iterator_single_fp64.cpp
+++ b/tests/benchmark/run_files/bm_batch_iterator_single_fp64.cpp
@@ -16,7 +16,7 @@ const char *BM_VecSimGeneral::hnsw_index_file =
 const char *BM_VecSimGeneral::test_queries_file =
     "tests/benchmark/data/dbpedia-cosine-dim768-fp64-test_vectors.raw";
 
-#define BM_FUNC_NAME(bm_func, algo) algo##_##bm_func##_Single
+#define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(algo, bm_func, Single)
 
 #include "benchmark/bm_initialization/bm_batch_initialize_fp64.h"
 
diff --git a/tests/benchmark/run_files/bm_batch_iterator_single_int8.cpp b/tests/benchmark/run_files/bm_batch_iterator_single_int8.cpp
index 1ad03d5b4..a9c7f8a97 100644
--- a/tests/benchmark/run_files/bm_batch_iterator_single_int8.cpp
+++ b/tests/benchmark/run_files/bm_batch_iterator_single_int8.cpp
@@ -15,7 +15,7 @@ const char *BM_VecSimGeneral::hnsw_index_file =
 const char *BM_VecSimGeneral::test_queries_file =
     "tests/benchmark/data/wipedia_single-cosine-dim1024-int8-test_vectors.raw";
 
-#define BM_FUNC_NAME(bm_func, algo) algo##_##bm_func##_Single
+#define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(algo, bm_func, Single)
 
 #include "benchmark/bm_initialization/bm_batch_initialize_int8.h"
 
diff --git a/tests/benchmark/run_files/bm_updated_index_single_fp32.cpp b/tests/benchmark/run_files/bm_updated_index_single_fp32.cpp
index e703a6320..e5289912b 100644
--- a/tests/benchmark/run_files/bm_updated_index_single_fp32.cpp
+++ b/tests/benchmark/run_files/bm_updated_index_single_fp32.cpp
@@ -21,7 +21,9 @@ template <>
 const char *BM_VecSimUpdatedIndex<fp32_index_t>::updated_hnsw_index_file =
     "tests/benchmark/data/dbpedia-cosine-dim768-M65-efc512-n500k-updated.hnsw_v3";
 
-#define BM_BEFORE_FUNC_NAME(bm_func, algo)  bm_func##_##algo##_before_Single
-#define BM_UPDATED_FUNC_NAME(bm_func, algo) bm_func##_##algo##_updated_Single
+#define BM_BEFORE_FUNC_NAME(bm_func, algo)                                                         \
+    CONCAT_WITH_UNDERSCORE_ARCH(bm_func, algo, before, Single)
+#define BM_UPDATED_FUNC_NAME(bm_func, algo)                                                        \
+    CONCAT_WITH_UNDERSCORE_ARCH(bm_func, algo, updated, Single)
 
 #include "benchmark/bm_initialization/bm_updated_initialize_fp32.h"
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h
index b7431c43c..3906f3d16 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces.h
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h
@@ -24,13 +24,13 @@
 #include "VecSim/spaces/functions/F16C.h"
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/SSE.h"
-
+#include "bm_macros.h"
 #include "bm_spaces_class.h"
 
 // Defining the generic benchmark flow: if there is support for the optimization, benchmark the
 // function.
 #define BENCHMARK_DISTANCE_F(bm_class, type_prefix, arch, metric, bm_name, arch_supported)         \
-    BENCHMARK_DEFINE_F(bm_class, type_prefix##_##arch##_##metric##_##bm_name)                      \
+    BENCHMARK_DEFINE_F(bm_class, CONCAT_WITH_UNDERSCORE_ARCH(type_prefix, arch, metric, bm_name))  \
     (benchmark::State & st) {                                                                      \
         if (!arch_supported) {                                                                     \
             st.SkipWithError("This benchmark requires " #arch ", which is not available");         \
@@ -44,7 +44,8 @@
 
 #define INITIALIZE_BM(bm_class, type_prefix, arch, metric, bm_name, arch_supported)                \
     BENCHMARK_DISTANCE_F(bm_class, type_prefix, arch, metric, bm_name, arch_supported)             \
-    BENCHMARK_REGISTER_F(bm_class, type_prefix##_##arch##_##metric##_##bm_name)                    \
+    BENCHMARK_REGISTER_F(bm_class,                                                                 \
+                         CONCAT_WITH_UNDERSCORE_ARCH(type_prefix, arch, metric, bm_name))          \
         ->ArgName("Dimension")                                                                     \
         ->Unit(benchmark::kNanosecond)
 
@@ -95,7 +96,7 @@ static constexpr size_t start = min_no_res_th_dim;
 
 /* Naive algorithms */
 #define BENCHMARK_DEFINE_NAIVE(bm_class, type_prefix, metric)                                      \
-    BENCHMARK_DEFINE_F(bm_class, type_prefix##_NAIVE_##metric)                                     \
+    BENCHMARK_DEFINE_F(bm_class, CONCAT_WITH_UNDERSCORE_ARCH(type_prefix, NAIVE, metric))          \
     (benchmark::State & st) {                                                                      \
         for (auto _ : st) {                                                                        \
             type_prefix##_##metric(v1, v2, dim);                                                   \
@@ -104,7 +105,7 @@ static constexpr size_t start = min_no_res_th_dim;
 
 #define INITIALIZE_NAIVE_BM(bm_class, type_prefix, metric, dim_opt)                                \
     BENCHMARK_DEFINE_NAIVE(bm_class, type_prefix, metric)                                          \
-    BENCHMARK_REGISTER_F(bm_class, type_prefix##_NAIVE_##metric)                                   \
+    BENCHMARK_REGISTER_F(bm_class, CONCAT_WITH_UNDERSCORE_ARCH(type_prefix, NAIVE, metric))        \
         ->ArgName("Dimension")                                                                     \
         ->Unit(benchmark::kNanosecond)                                                             \
         ->Arg(100)                                                                                 \