diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index c7b83e9438..6eb1868e40 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -4,6 +4,9 @@
 
 <h3>Improvements 🛠</h3>
 
+- Linux and macOS `lightning.qubit` wheels are now built with OpenMP support for all kernel types (LM, AVX2, and AVX512), enabling better performance tuning for CPU simulations.
+  [(#1133)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1133)
+
 <h3>Breaking changes 💔</h3>
 
 <h3>Deprecations 👋</h3>
diff --git a/.github/workflows/tests_lqcpu_python.yml b/.github/workflows/tests_lqcpu_python.yml
index dca3e4eb76..fa131fe5e9 100644
--- a/.github/workflows/tests_lqcpu_python.yml
+++ b/.github/workflows/tests_lqcpu_python.yml
@@ -106,7 +106,11 @@ jobs:
       - name: Create device wheel ${{ inputs.lightning-version }}
         run: |
           PL_BACKEND=${{ matrix.pl_backend }} python scripts/configure_pyproject_toml.py
-          CMAKE_ARGS="-DENABLE_BLAS=${{ matrix.blas }} -DENABLE_SCIPY_OPENBLAS=${{ matrix.blas }} -DLQ_ENABLE_KERNEL_OMP=ON -DENABLE_PYTHON=ON -DLIGHTNING_CATALYST_SRC_PATH=${{ github.workspace }}/catalyst" python -m build
+          CMAKE_ARGS="-DENABLE_BLAS=${{ matrix.blas }} \
+                -DENABLE_SCIPY_OPENBLAS=${{ matrix.blas }} \
+                -DLQ_ENABLE_KERNEL_OMP=ON \
+                -DENABLE_PYTHON=ON \
+                -DLIGHTNING_CATALYST_SRC_PATH=${{ github.workspace }}/catalyst" python -m build
           cd dist
           WHEEL_NAME=$(ls *.whl)
           cp $WHEEL_NAME ${{ github.workspace }}/wheel_${{ matrix.pl_backend }}-${{ matrix.blas }}.whl
diff --git a/.github/workflows/wheel_linux_aarch64.yml b/.github/workflows/wheel_linux_aarch64.yml
index ecbc67d21f..ecb81d0df9 100644
--- a/.github/workflows/wheel_linux_aarch64.yml
+++ b/.github/workflows/wheel_linux_aarch64.yml
@@ -152,7 +152,10 @@ jobs:
             PATH="/opt/rh/gcc-toolset-13/root/usr/bin:$PATH"
 
+          # cibuildwheel parses CIBW_ENVIRONMENT as a space-separated list of
+          # VAR=value assignments, not as a shell script, so the OpenMP flag is
+          # appended through a GitHub expression instead of an if/then/fi block.
           CIBW_ENVIRONMENT: |
-            PATH="/opt/rh/gcc-toolset-13/root/usr/bin:$PATH" CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release"
+            PATH="/opt/rh/gcc-toolset-13/root/usr/bin:$PATH" CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release ${{ matrix.pl_backend == 'lightning_qubit' && '-DLQ_ENABLE_KERNEL_OMP=ON' || '' }}"
 
           CIBW_MANYLINUX_AARCH64_IMAGE: manylinux_2_28
 
diff --git a/.github/workflows/wheel_linux_x86_64.yml b/.github/workflows/wheel_linux_x86_64.yml
index 9549ec4907..b08a7e224a 100644
--- a/.github/workflows/wheel_linux_x86_64.yml
+++ b/.github/workflows/wheel_linux_x86_64.yml
@@ -158,7 +158,10 @@ jobs:
             PATH="/opt/rh/gcc-toolset-13/root/usr/bin:$PATH"
 
+          # cibuildwheel parses CIBW_ENVIRONMENT as a space-separated list of
+          # VAR=value assignments, not as a shell script, so the OpenMP flag is
+          # appended through a GitHub expression instead of an if/then/fi block.
           CIBW_ENVIRONMENT: |
-            PATH="/opt/rh/gcc-toolset-13/root/usr/bin:$PATH" CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release"
+            PATH="/opt/rh/gcc-toolset-13/root/usr/bin:$PATH" CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release ${{ matrix.pl_backend == 'lightning_qubit' && '-DLQ_ENABLE_KERNEL_OMP=ON' || '' }}"
 
           CIBW_BEFORE_TEST: |
             python -m pip install -r requirements-tests.txt
diff --git a/.github/workflows/wheel_macos_arm64.yml b/.github/workflows/wheel_macos_arm64.yml
index e55df0eb9e..a96b28feaf 100644
--- a/.github/workflows/wheel_macos_arm64.yml
+++ b/.github/workflows/wheel_macos_arm64.yml
@@ -77,7 +77,10 @@ jobs:
             python -m pip install ninja cmake setuptools
 
+          # cibuildwheel parses CIBW_ENVIRONMENT as a space-separated list of
+          # VAR=value assignments, not as a shell script, so the OpenMP flag is
+          # appended through a GitHub expression instead of an if/then/fi block.
           CIBW_ENVIRONMENT: |
-            CMAKE_ARGS="-DCMAKE_CXX_COMPILER_TARGET=arm64-apple-macos11 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=ARM64 -DENABLE_OPENMP=OFF -DCMAKE_BUILD_TYPE=Release"
+            CMAKE_ARGS="-DCMAKE_CXX_COMPILER_TARGET=arm64-apple-macos11 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=ARM64 -DCMAKE_BUILD_TYPE=Release ${{ matrix.pl_backend == 'lightning_qubit' && '-DLQ_ENABLE_KERNEL_OMP=ON' || '' }}"
 
           CIBW_BEFORE_TEST: |
             python -m pip install -r requirements-tests.txt
diff --git a/doc/lightning_qubit/development/avx_kernels/index.rst b/doc/lightning_qubit/development/avx_kernels/index.rst
index 6e3b7c5cfe..4f5e305d95 100644
--- a/doc/lightning_qubit/development/avx_kernels/index.rst
+++ b/doc/lightning_qubit/development/avx_kernels/index.rst
@@ -12,10 +12,6 @@ AVX2/AVX512 kernels
    :description: Explain how AVX2/512 Kernels works with Lightning's CMake build system
    :link: ./build_system.html
 
-.. title-card::
-   :name: Kernel Performance Tuning
-   :description: Explain how to tune Lightning-Qubit's kernel performance using CMake flags for OpenMP threading and AVX streaming.
-   :link: ./kernel_tuning.html
 
 .. raw:: html
 
@@ -27,4 +23,3 @@ AVX2/AVX512 kernels
 
    implementation
    build_system
-   kernel_tuning
diff --git a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
deleted file mode 100644
index bc65e33f59..0000000000
--- a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-Kernel performance tuning
-#########################
-
-Lightning-Qubit's kernel implementations are by default tuned for high throughput single-threaded performance with gradient workloads. To enable this, we add OpenMP threading within the adjoint differentiation method implementation and use SIMD-level intrinsics to ensure fast performance for each given circuit in such a workload.
-
-However, sometimes we may want to modify the above defaults to favour a given workload, such as by enabling multi-threaded execution of the gate kernels instead. For this, we have several compile-time flags to change the operating behaviour of Lightning-Qubit kernels.
-
-OpenMP threaded kernels
------------------------
-
-To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the ``-DLQ_ENABLE_KERNEL_OMP=ON`` CMake flag. Not, that for gradient workloads with many observables, this may reduce performance in comparison with the default mode, so this behaviour is opt-in only.
-
-For workloads that show benefit from the use of threaded gate kernels, sometimes updating the CPU cache to accommodate recently modified data can become a bottleneck, and saturates the performance gained at high thread counts. This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations using the ``-DLQ_ENABLE_KERNEL_AVX_STREAMING=on`` CMake flag. This forces the data to avoid updating the CPU cache and can improve performance for larger workloads.
\ No newline at end of file
diff --git a/doc/lightning_qubit/development/index.rst b/doc/lightning_qubit/development/index.rst
index 90489e166f..5fe9b04cbc 100644
--- a/doc/lightning_qubit/development/index.rst
+++ b/doc/lightning_qubit/development/index.rst
@@ -1,6 +1,11 @@
 Lightning Qubit
 ###############
 
+.. title-card::
+   :name: Kernel Performance Tuning
+   :description: Explain how to tune Lightning-Qubit's kernel performance using CMake flags for OpenMP threading and AVX streaming.
+   :link: ./kernel_tuning.html
+
 .. title-card::
    :name: Adding gate implementations
    :description: How to add additional gate implementations
@@ -20,5 +25,6 @@ Lightning Qubit
 .. toctree::
    :hidden:
 
+   kernel_tuning
    add_gate_kernel
    avx_kernels/index
diff --git a/doc/lightning_qubit/development/kernel_tuning.rst b/doc/lightning_qubit/development/kernel_tuning.rst
new file mode 100644
index 0000000000..cfe861bea8
--- /dev/null
+++ b/doc/lightning_qubit/development/kernel_tuning.rst
@@ -0,0 +1,26 @@
+Kernel performance tuning
+#########################
+
+OpenMP threaded kernels
+-----------------------
+
+OpenMP acceleration of gate kernels across all kernel types (LM, AVX2, and AVX512) is enabled
+by default on Linux and macOS wheels in Lightning-Qubit.
+
+On other operating systems, OpenMP support can be enabled by building Lightning-Qubit from
+source with the CMake flag ``-DLQ_ENABLE_KERNEL_OMP=ON``; this is a compile-time option,
+not a runtime setting.
+At runtime, you can control the number of threads used by setting the ``OMP_NUM_THREADS``
+environment variable before starting your Python session.
+
+For workloads that involve gradient computations with many observable measurements,
+OpenMP acceleration may reduce performance due to oversubscription of threads to CPU cores.
+To mitigate this, use the CMake flag ``-DLQ_ENABLE_KERNEL_OMP=OFF`` when building
+Lightning-Qubit.
+
+For workloads that benefit from the use of threaded gate kernels,
+updating the CPU cache to accommodate recently modified data can sometimes become a bottleneck
+and saturate the performance gained at high thread counts.
+This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations using
+the ``-DLQ_ENABLE_KERNEL_AVX_STREAMING=ON`` CMake flag. This forces the data to avoid updating
+the CPU cache and can improve performance for larger workloads.
diff --git a/doc/lightning_qubit/device.rst b/doc/lightning_qubit/device.rst
index a7f5665a7a..13bf78e180 100644
--- a/doc/lightning_qubit/device.rst
+++ b/doc/lightning_qubit/device.rst
@@ -141,6 +141,14 @@ If you are computing a large number of expectation values, or if you are using a
     dev = qml.device("lightning.qubit", wires=2, batch_obs=True)
 
 
+**OpenMP acceleration of the gate kernels:**
+
+OpenMP acceleration of gate kernels across all kernel types (LM, AVX2, and AVX512) is enabled
+by default on Linux and macOS wheels in Lightning-Qubit.
+
+To learn more about this feature, check out the :doc:`/lightning_qubit/development/kernel_tuning` guide.
+
+
 **Markov Chain Monte Carlo sampling support:**
 
 The ``lightning.qubit`` device allows users to use the Markov Chain Monte Carlo (MCMC) sampling method to generate approximate samples. To enable the MCMC sampling method for sample generation, initialize a ``lightning.qubit`` device with the ``mcmc=True`` keyword argument, as:
@@ -158,4 +166,3 @@ The ``lightning.qubit`` device also supports a ``"NonZeroRandom"`` kernel. This
 
     import pennylane as qml
     dev = qml.device("lightning.qubit", wires=2, shots=1000, mcmc=True, kernel_name="NonZeroRandom", num_burnin=200)
-
diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 2260464faf..2c96462166 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
 Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.45.0-dev3"
+__version__ = "0.45.0-dev4"