diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index c7b83e9438..6eb1868e40 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -4,6 +4,9 @@

Improvements 🛠

+- Linux and macOS `lightning.qubit` wheels are now built with OpenMP support for all kernel types (LM, AVX2, and AVX512), enabling better performance tuning for CPU simulations. + [(#1133)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1133) +

Breaking changes 💔

Deprecations 👋

diff --git a/.github/workflows/tests_lqcpu_python.yml b/.github/workflows/tests_lqcpu_python.yml index dca3e4eb76..fa131fe5e9 100644 --- a/.github/workflows/tests_lqcpu_python.yml +++ b/.github/workflows/tests_lqcpu_python.yml @@ -106,7 +106,11 @@ jobs: - name: Create device wheel ${{ inputs.lightning-version }} run: | PL_BACKEND=${{ matrix.pl_backend }} python scripts/configure_pyproject_toml.py - CMAKE_ARGS="-DENABLE_BLAS=${{ matrix.blas }} -DENABLE_SCIPY_OPENBLAS=${{ matrix.blas }} -DLQ_ENABLE_KERNEL_OMP=ON -DENABLE_PYTHON=ON -DLIGHTNING_CATALYST_SRC_PATH=${{ github.workspace }}/catalyst" python -m build + CMAKE_ARGS="-DENABLE_BLAS=${{ matrix.blas }} \ + -DENABLE_SCIPY_OPENBLAS=${{ matrix.blas }} \ + -DLQ_ENABLE_KERNEL_OMP=ON \ + -DENABLE_PYTHON=ON \ + -DLIGHTNING_CATALYST_SRC_PATH=${{ github.workspace }}/catalyst" python -m build cd dist WHEEL_NAME=$(ls *.whl) cp $WHEEL_NAME ${{ github.workspace }}/wheel_${{ matrix.pl_backend }}-${{ matrix.blas }}.whl diff --git a/.github/workflows/wheel_linux_aarch64.yml b/.github/workflows/wheel_linux_aarch64.yml index ecbc67d21f..ecb81d0df9 100644 --- a/.github/workflows/wheel_linux_aarch64.yml +++ b/.github/workflows/wheel_linux_aarch64.yml @@ -152,7 +152,12 @@ jobs: PATH="/opt/rh/gcc-toolset-13/root/usr/bin:$PATH" CIBW_ENVIRONMENT: | - PATH="/opt/rh/gcc-toolset-13/root/usr/bin:$PATH" CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release" + PATH="/opt/rh/gcc-toolset-13/root/usr/bin:$PATH" + CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release" + if ${{ matrix.pl_backend == 'lightning_qubit'}} + then + CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release -DLQ_ENABLE_KERNEL_OMP=ON" + fi CIBW_MANYLINUX_AARCH64_IMAGE: manylinux_2_28 diff --git a/.github/workflows/wheel_linux_x86_64.yml b/.github/workflows/wheel_linux_x86_64.yml index 9549ec4907..b08a7e224a 100644 --- a/.github/workflows/wheel_linux_x86_64.yml +++ b/.github/workflows/wheel_linux_x86_64.yml @@ -158,7 +158,12 @@ jobs: PATH="/opt/rh/gcc-toolset-13/root/usr/bin:$PATH" CIBW_ENVIRONMENT: | - 
PATH="/opt/rh/gcc-toolset-13/root/usr/bin:$PATH" CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release" + PATH="/opt/rh/gcc-toolset-13/root/usr/bin:$PATH" + CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release" + if ${{ matrix.pl_backend == 'lightning_qubit'}} + then + CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release -DLQ_ENABLE_KERNEL_OMP=ON" + fi CIBW_BEFORE_TEST: | python -m pip install -r requirements-tests.txt diff --git a/.github/workflows/wheel_macos_arm64.yml b/.github/workflows/wheel_macos_arm64.yml index e55df0eb9e..a96b28feaf 100644 --- a/.github/workflows/wheel_macos_arm64.yml +++ b/.github/workflows/wheel_macos_arm64.yml @@ -77,7 +77,11 @@ jobs: python -m pip install ninja cmake setuptools CIBW_ENVIRONMENT: | - CMAKE_ARGS="-DCMAKE_CXX_COMPILER_TARGET=arm64-apple-macos11 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=ARM64 -DENABLE_OPENMP=OFF -DCMAKE_BUILD_TYPE=Release" + CMAKE_ARGS="-DCMAKE_CXX_COMPILER_TARGET=arm64-apple-macos11 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=ARM64 -DCMAKE_BUILD_TYPE=Release" + if ${{ matrix.pl_backend == 'lightning_qubit'}} + then + CMAKE_ARGS="-DCMAKE_CXX_COMPILER_TARGET=arm64-apple-macos11 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=ARM64 -DCMAKE_BUILD_TYPE=Release -DLQ_ENABLE_KERNEL_OMP=ON" + fi CIBW_BEFORE_TEST: | python -m pip install -r requirements-tests.txt diff --git a/doc/lightning_qubit/development/avx_kernels/index.rst b/doc/lightning_qubit/development/avx_kernels/index.rst index 6e3b7c5cfe..4f5e305d95 100644 --- a/doc/lightning_qubit/development/avx_kernels/index.rst +++ b/doc/lightning_qubit/development/avx_kernels/index.rst @@ -12,10 +12,6 @@ AVX2/AVX512 kernels :description: Explain how AVX2/512 Kernels works with Lightning's CMake build system :link: ./build_system.html -.. title-card:: - :name: Kernel Performance Tuning - :description: Explain how to tune Lightning-Qubit's kernel performance using CMake flags for OpenMP threading and AVX streaming. - :link: ./kernel_tuning.html .. 
raw:: html @@ -27,4 +23,3 @@ AVX2/AVX512 kernels implementation build_system - kernel_tuning diff --git a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst deleted file mode 100644 index bc65e33f59..0000000000 --- a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst +++ /dev/null @@ -1,13 +0,0 @@ -Kernel performance tuning -######################### - -Lightning-Qubit's kernel implementations are by default tuned for high throughput single-threaded performance with gradient workloads. To enable this, we add OpenMP threading within the adjoint differentiation method implementation and use SIMD-level intrinsics to ensure fast performance for each given circuit in such a workload. - -However, sometimes we may want to modify the above defaults to favour a given workload, such as by enabling multi-threaded execution of the gate kernels instead. For this, we have several compile-time flags to change the operating behaviour of Lightning-Qubit kernels. - -OpenMP threaded kernels ------------------------ - -To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the ``-DLQ_ENABLE_KERNEL_OMP=ON`` CMake flag. Not, that for gradient workloads with many observables, this may reduce performance in comparison with the default mode, so this behaviour is opt-in only. - -For workloads that show benefit from the use of threaded gate kernels, sometimes updating the CPU cache to accommodate recently modified data can become a bottleneck, and saturates the performance gained at high thread counts. This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations using the ``-DLQ_ENABLE_KERNEL_AVX_STREAMING=on`` CMake flag. This forces the data to avoid updating the CPU cache and can improve performance for larger workloads. 
\ No newline at end of file diff --git a/doc/lightning_qubit/development/index.rst b/doc/lightning_qubit/development/index.rst index 90489e166f..5fe9b04cbc 100644 --- a/doc/lightning_qubit/development/index.rst +++ b/doc/lightning_qubit/development/index.rst @@ -1,6 +1,11 @@ Lightning Qubit ############### +.. title-card:: + :name: Kernel Performance Tuning + :description: Explain how to tune Lightning-Qubit's kernel performance using CMake flags for OpenMP threading and AVX streaming. + :link: ./kernel_tuning.html + .. title-card:: :name: Adding gate implementations :description: How to add additional gate implementations @@ -20,5 +25,6 @@ Lightning Qubit .. toctree:: :hidden: + kernel_tuning add_gate_kernel avx_kernels/index diff --git a/doc/lightning_qubit/development/kernel_tuning.rst b/doc/lightning_qubit/development/kernel_tuning.rst new file mode 100644 index 0000000000..cfe861bea8 --- /dev/null +++ b/doc/lightning_qubit/development/kernel_tuning.rst @@ -0,0 +1,26 @@ +Kernel performance tuning +######################### + +OpenMP threaded kernels +----------------------- + +OpenMP acceleration of gate kernels across all kernel types (LM, AVX2, and AVX512) is enabled +by default on Linux and macOS wheels in Lightning-Qubit. + +On other operating systems, OpenMP support can be enabled by building Lightning-Qubit from +source with the CMake flag ``-DLQ_ENABLE_KERNEL_OMP=ON``, the same flag that is used to build +the Linux and macOS wheels. +You can also control the number of threads used by setting the ``OMP_NUM_THREADS`` +environment variable. + +For workloads that involve gradient computations with many observable measurements, +OpenMP acceleration may reduce performance due to oversubscription of threads to CPU cores. +To mitigate this, use the CMake flag ``-DLQ_ENABLE_KERNEL_OMP=OFF`` when building +Lightning-Qubit. 
+ +For workloads that show benefit from the use of threaded gate kernels, +updating the CPU cache to accommodate recently modified data can sometimes become a bottleneck +that saturates the performance gained at high thread counts. +This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations using +the ``-DLQ_ENABLE_KERNEL_AVX_STREAMING=ON`` CMake flag. This forces the data to avoid updating +the CPU cache and can improve performance for larger workloads. diff --git a/doc/lightning_qubit/device.rst b/doc/lightning_qubit/device.rst index a7f5665a7a..13bf78e180 100644 --- a/doc/lightning_qubit/device.rst +++ b/doc/lightning_qubit/device.rst @@ -141,6 +141,14 @@ If you are computing a large number of expectation values, or if you are using a dev = qml.device("lightning.qubit", wires=2, batch_obs=True) +**OpenMP acceleration of the gate kernels:** + +OpenMP acceleration of gate kernels across all kernel types (LM, AVX2, and AVX512) is enabled +by default on Linux and macOS wheels in Lightning-Qubit. + +To learn more about this feature, check out the :doc:`/lightning_qubit/development/kernel_tuning` guide. + + **Markov Chain Monte Carlo sampling support:** The ``lightning.qubit`` device allows users to use the Markov Chain Monte Carlo (MCMC) sampling method to generate approximate samples. To enable the MCMC sampling method for sample generation, initialize a ``lightning.qubit`` device with the ``mcmc=True`` keyword argument, as: @@ -158,4 +166,3 @@ The ``lightning.qubit`` device also supports a ``"NonZeroRandom"`` kernel. 
This import pennylane as qml dev = qml.device("lightning.qubit", wires=2, shots=1000, mcmc=True, kernel_name="NonZeroRandom", num_burnin=200) - diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index 2260464faf..2c96462166 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.45.0-dev3" +__version__ = "0.45.0-dev4"