Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
36bf47e
Add oneDNN submodule to 3rd_party
graemenail May 5, 2022
ebd1559
Add oneDNN to CMake
graemenail May 5, 2022
0b9cb99
Don't build DNNL examples
graemenail May 5, 2022
e344dbc
Allow static builds of DNNL
graemenail May 5, 2022
b89f5d0
Remove MKL include from config parser
graemenail May 17, 2022
dc5c48f
Add oneDNN sgemm
graemenail Jun 8, 2022
28b0eb3
Improve oneDNN CMake
graemenail Jun 8, 2022
d454985
Add oneDNN in prod
graemenail Jun 8, 2022
01aba42
Use int in loop for ProdBatched
graemenail May 17, 2022
9d6437f
oneDNN only use OMP runtime when specified
graemenail May 18, 2022
6675ac3
Move MSVC unicode flags out of global flags
graemenail May 30, 2022
cc7cb75
Disable DNNL JIT Profiling
graemenail May 19, 2022
35d302a
Cache Boost
graemenail May 30, 2022
4e69c5a
Clean up after debug build
graemenail Jun 3, 2022
d44fa4d
Update CHANGELOG
graemenail Jun 8, 2022
a975a1a
Mention oneDNN in documentation
graemenail Jun 8, 2022
215eec1
Fix comments mentioning MKL
graemenail Jun 3, 2022
ed9fa14
oneDNN GH actions
graemenail Jun 8, 2022
04053c0
Warn if no BLAS for FBGEMM
graemenail Jun 9, 2022
79da945
Windows needs MKL for FBGEMM blas
graemenail Jun 9, 2022
a424614
Try Ubuntu with openblas for FBGEMM
graemenail Jun 9, 2022
714bdc8
Fix typo
graemenail Jun 9, 2022
5f96baa
Set BLAS_FOUND for Apple Accelerate
graemenail Jun 9, 2022
fdf6c59
Revert "Try Ubuntu with openblas for FBGEMM"
graemenail Jun 9, 2022
4a29f6d
Prefer DNNL codepaths at compile time when requested
graemenail Jun 9, 2022
55478ce
Update oneDNN compilation for clang CI
graemenail Sep 12, 2022
7426ebb
Require MKL for LSH (with rotation)
graemenail Sep 13, 2022
3e1a550
Fix abort message to require MKL rather than generic BLAS
graemenail Sep 13, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .github/workflows/ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,35 @@ jobs:
working-directory: build
run: make -j2

- name: Configure CMake (oneDNN)
if: matrix.cpu == true && matrix.gpu == false
id: cmake-onednn
run: |
[ -z "${{ matrix.gcc }}" ] || export CC=/usr/bin/gcc-${{ matrix.gcc }} CXX=/usr/bin/g++-${{ matrix.gcc }} CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}
[ -z "${{ matrix.clang }}" ] || export CC=/usr/bin/clang-${{ matrix.clang }} CXX=/usr/bin/clang++-${{ matrix.clang }}
mkdir -p build-onednn
cd build-onednn
cmake .. \
-DBoost_ARCHITECTURE=-x64 \
-DCMAKE_BUILD_TYPE=Debug \
-DCOMPILE_CPU=${{ matrix.cpu }} \
-DCOMPILE_CUDA=${{ matrix.gpu }} \
-DCOMPILE_EXAMPLES=${{ matrix.examples }} \
-DCOMPILE_SERVER=on \
-DCOMPILE_TESTS=${{ matrix.unit_tests }} \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-${{ matrix.cuda }} \
-DDETERMINISTIC=on \
-DUSE_FBGEMM=${{ matrix.cpu }} \
-DUSE_SENTENCEPIECE=on \
-DUSE_STATIC_LIBS=on \
-DUSE_MKL=OFF -DUSE_DNNL=ON

- name: Compile (oneDNN)
if: steps.cmake-onednn.conclusion == 'success'
working-directory: build-onednn
run: make -j2 && make clean


# TODO: add a flag to CMake to compile unit tests only on CPU
- name: Run unit tests
working-directory: build
Expand Down
47 changes: 47 additions & 0 deletions .github/workflows/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,17 @@ jobs:
echo "$env:CUDA_PATH/bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
shell: powershell
if: matrix.gpu == true
# Cache boost install
- name: Cache Boost
id: cache-boost
uses: actions/cache@v3
with:
path: ${{ env.BOOST_ROOT }}
key: ${{ runner.os }}-${{ env.BOOST_URL }}

# Boost is no longer pre-installed on GitHub-hosted Windows runners
- name: Download Boost
if: ${{ steps.cache-boost.outputs.cache-hit != 'true' }}
run: |
Write-Host "Downloading Boost to ${{ env.BOOST_ROOT }}"
C:\msys64\usr\bin\wget.exe -nv "${{ env.BOOST_URL }}" -O "${{ github.workspace }}/boost.exe"
Expand All @@ -70,6 +78,7 @@ jobs:

# Windows CUDA builds use USE_NCCL=off due to compilation errors.
- name: Build Debug
id: build-debug
uses: lukka/run-cmake@v3
with:
buildDirectory: ${{ github.workspace }}/build/Debug
Expand All @@ -95,6 +104,44 @@ jobs:
# able to find sometimes.
if: matrix.gpu == true

- name: Cleanup Debug
if: steps.build-debug.conclusion == 'success'
working-directory: ${{ github.workspace }}/build/Debug
run: cmake --build . --target clean

- name: Build Debug (oneDNN)
id: build-debug-onednn
uses: lukka/run-cmake@v3
with:
buildDirectory: ${{ github.workspace }}/build/Debug-oneDNN
cmakeAppendedArgs: '-G Ninja
-DCMAKE_BUILD_TYPE="Debug"
-DOPENSSL_USE_STATIC_LIBS="TRUE"
-DOPENSSL_MSVC_STATIC_RT="TRUE"
-DCOMPILE_CPU="TRUE"
-DCOMPILE_CUDA="${{ matrix.gpu }}"
-DCOMPILE_SERVER="FALSE"
-DCOMPILE_TESTS="TRUE"
-DDETERMINISTIC="TRUE"
-DUSE_FBGEMM="TRUE"
-DUSE_DNNL="TRUE"
-DUSE_MPI="FALSE"
-DUSE_NCCL="FALSE"
-DUSE_SENTENCEPIECE="TRUE"
-DUSE_STATIC_LIBS="TRUE"'
cmakeListsOrSettingsJson: CMakeListsTxtAdvanced
cmakeListsTxtPath: ${{ github.workspace }}/CMakeLists.txt
useVcpkgToolchainFile: true
# Building in Debug is sufficient for the all-in CPU+GPU compilation;
# its main purpose is to detect warnings that the Release build is not
# able to find sometimes.
if: matrix.gpu == false

- name: Cleanup Debug (oneDNN)
if: steps.build-debug-onednn.conclusion == 'success'
working-directory: ${{ github.workspace }}/build/Debug-oneDNN
run: cmake --build . --target clean

# Windows CUDA builds use USE_NCCL=off due to compilation errors
- name: Build Release
uses: lukka/run-cmake@v3
Expand Down
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,6 @@
[submodule "src/3rd_party/simple-websocket-server"]
path = src/3rd_party/simple-websocket-server
url = https://github.com/marian-nmt/Simple-WebSocket-Server
[submodule "src/3rd_party/oneDNN"]
path = src/3rd_party/oneDNN
url = https://github.com/oneapi-src/oneDNN.git
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Fused inplace-dropout in FFN layer in Transformer
- `--force-decode` option for marian-decoder
- `--output-sampling` now works with ensembles (requires proper normalization via e.g `--weights 0.5 0.5`)
- oneDNN is available for GEMM on CPU.

### Fixed
- Use allocator in hashing
Expand Down
18 changes: 16 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ option(USE_CUDNN "Use CUDNN library" OFF)
option(USE_DOXYGEN "Build documentation with Doxygen" ON)
option(USE_FBGEMM "Use FBGEMM" OFF)
option(USE_MKL "Compile with MKL support" ON)
option(USE_DNNL "Compile with oneDNN support" OFF)
option(USE_MPI "Use MPI library" OFF)
option(USE_NCCL "Use NCCL library" ON)
option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON)
Expand Down Expand Up @@ -84,6 +85,7 @@ endif()
# Set compilation flags
if(MSVC)
# These are used in src/CMakeLists.txt on a per-target basis
list(APPEND EXTRA_DEFINITIONS /DUNICODE /D_UNICODE)
list(APPEND ALL_WARNINGS /WX; /W4;)

# Disabled bogus warnings for CPU intrinsics and Protobuf:
Expand All @@ -105,7 +107,7 @@ if(MSVC)
set(INTRINSICS "/arch:AVX2")
# set(INTRINSICS "/arch:AVX512")
# /bigobj is necessary for expression_operators.cpp. See https://stackoverflow.com/questions/15110580/penalty-of-the-msvs-compiler-flag-bigobj
set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}")
set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /MT /O2 ${INTRINSICS} /Zi /MP /GL /DNDEBUG")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} /MTd /Od /Ob0 ${INTRINSICS} /RTC1 /Zi /D_DEBUG")

Expand Down Expand Up @@ -515,6 +517,12 @@ if(COMPILE_CPU)
set(EXT_LIBS ${EXT_LIBS} intgemm) # Enable intgemm when compiling CPU
add_definitions(-DCOMPILE_CPU=1)
endif()

if(USE_DNNL)
set(EXT_LIBS ${EXT_LIBS} dnnl)
add_definitions(-DDNNL_FOUND=1)
endif(USE_DNNL)

if(USE_APPLE_ACCELERATE)
if(NOT APPLE)
message(FATAL_ERROR "FATAL ERROR: Apple Accelerate only works on macOS.")
Expand All @@ -524,6 +532,7 @@ if(COMPILE_CPU)
# you may need to install Xcode command line tools if you don't have them already (https://developer.apple.com/xcode/features/)
include_directories("/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Frameworks/vecLib.framework/Headers")
set(EXT_LIBS ${EXT_LIBS} "-framework Accelerate")
set(BLAS_FOUND TRUE)
add_definitions(-DBLAS_FOUND=1)
else(USE_APPLE_ACCELERATE)
if(USE_MKL)
Expand All @@ -545,10 +554,15 @@ if(COMPILE_CPU)
add_definitions(-DBLAS_FOUND=1)
endif(CBLAS_FOUND)
endif(BLAS_FOUND)
endif(MKL_FOUND)
endif()
endif(USE_APPLE_ACCELERATE)
endif(COMPILE_CPU)


if(NOT BLAS_FOUND AND USE_FBGEMM)
message(FATAL_ERROR "FBGEMM was requested but a BLAS vendor was not found.")
endif()

###############################################################################
# Find OpenSSL
set(BOOST_COMPONENTS "")
Expand Down
1 change: 1 addition & 0 deletions doc/operators.md
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,7 @@ libraries containing device-specific optimisations. These libraries include:
- FBGEMM
- INTGEMM
- MKL
- oneDNN
- GPU
- CUDA (cuBLAS)

Expand Down
21 changes: 20 additions & 1 deletion src/3rd_party/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,25 @@ if(COMPILE_CPU)
endif()
endif(COMPILE_CPU)

if(USE_DNNL)
# OneDNN
set(DNNL_BUILD_TESTS OFF CACHE BOOL "Build dnnl tests")
set(DNNL_BUILD_EXAMPLES OFF CACHE BOOL "Build dnnl examples")

set(DNNL_ENABLE_JIT_PROFILING OFF CACHE INTERNAL "" FORCE)
if(USE_STATIC_LIBS)
set(DNNL_LIBRARY_TYPE "STATIC" CACHE STRING "specifies whether oneDNN library should be SHARED or STATIC" FORCE)
endif(USE_STATIC_LIBS)

if(NOT USE_OPENMP)
set(DNNL_CPU_RUNTIME SEQ CACHE INTERNAL "" FORCE)
endif()

add_subdirectory(./oneDNN)
include_directories(./oneDNN/include)

endif(USE_DNNL)

if(USE_FBGEMM)
# @TODO: find out if this is somehow harmful. This is suppressing CMake warnings for CMAKE_SUPPRESS_DEVELOPER_WARNINGS
# meant to silence CMakeFiles of 3rd_party tools.
Expand Down Expand Up @@ -169,7 +188,7 @@ if(CUDA_FOUND)
endif(COMPILE_AMPERE)

# install nccl in ${CMAKE_BINARY_DIR}/local similar to /usr/local linux installation
# Using $(MAKE) instead of $CMAKE_MAKE_PROGRAM in order to make parallelization in NCCL compilation work with make -j16.
# Using $(MAKE) instead of $CMAKE_MAKE_PROGRAM in order to make parallelization in NCCL compilation work with make -j16.
# Apparently this does not get properly propagated otherwise and builds with only a single thread/process.
ExternalProject_Add(nccl_install
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/nccl
Expand Down
1 change: 1 addition & 0 deletions src/3rd_party/oneDNN
Submodule oneDNN added at 11fa74
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ set(MARIAN_SOURCES

add_library(marian STATIC ${MARIAN_SOURCES})

target_compile_options(marian PRIVATE ${ALL_WARNINGS})
target_compile_options(marian PRIVATE ${ALL_WARNINGS} ${EXTRA_DEFINITIONS})

# Generate git_revision.h to reflect current git revision information
# [https://stackoverflow.com/questions/1435953/how-can-i-pass-git-sha1-to-compiler-as-definition-using-cmake]
Expand Down
9 changes: 1 addition & 8 deletions src/common/config_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,6 @@
#include <stdexcept>
#include <string>

#if MKL_FOUND
#include <mkl.h>
#else
#if BLAS_FOUND
#include <cblas.h>
#endif
#endif

namespace marian {

Expand Down Expand Up @@ -269,7 +262,7 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) {
"Pool encoder states instead of using cross attention (selects first encoder state, best used with special token)");
cli.add<int>("--transformer-dim-ffn",
"Size of position-wise feed-forward network (transformer)",
2048);
2048);
cli.add<int>("--transformer-decoder-dim-ffn",
"Size of position-wise feed-forward network in decoder (transformer). Uses --transformer-dim-ffn if 0.",
0);
Expand Down
2 changes: 1 addition & 1 deletion src/graph/auto_tuner.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class AutoTuner : public AutoTunerRecorder {
const size_t collectStatMax = 50;
UPtr<timer::CPUTimer> timer_;

// This structure holds a hash key an algorithm function (e.g. int16, packed gemm, mkl gemm)
// This structure holds a hash key and an algorithm function (e.g. int16, packed gemm, fp32 gemm)
// for a specific operation size
// hash: a unique hash key for each operation size
// (e.g. m, n, k, transpose A, transpose B, bias size for GEMM)
Expand Down
2 changes: 1 addition & 1 deletion src/graph/expression_operators.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -576,7 +576,7 @@ Expr bdot_legacy(Expr a, Expr b, bool transA, bool transB, float scale) {
}

Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
// general version, MKL, CBlas or CUDA
// general version (see affine for packed-type variants)

int rows = a->shape().elements() / a->shape()[-1];
Expr ones = a->graph()->ones({ rows, 1 });
Expand Down
6 changes: 3 additions & 3 deletions src/layers/lsh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

#include "3rd_party/faiss/utils/hamming.h"

#if BLAS_FOUND
#if defined(BLAS_FOUND) && defined(MKL_FOUND)
#include "3rd_party/faiss/VectorTransform.h"
#endif

Expand All @@ -20,7 +20,7 @@ int bytesPerVector(int nBits) {
}

void fillRandomRotationMatrix(Tensor output, Ptr<Allocator> allocator) {
#if BLAS_FOUND
#if defined(BLAS_FOUND) && defined(MKL_FOUND)
int nRows = output->shape()[-2];
int nBits = output->shape()[-1];

Expand All @@ -42,7 +42,7 @@ void fillRandomRotationMatrix(Tensor output, Ptr<Allocator> allocator) {
allocator->free(memory);
#else
output; allocator;
ABORT("LSH with rotation matrix requires Marian to be compiled with a BLAS library");
ABORT("LSH with rotation matrix requires Marian to be compiled with MKL");
#endif
}

Expand Down
2 changes: 1 addition & 1 deletion src/tensors/backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ namespace marian {
// GEMM type enum
typedef enum {
Auto = 0, // auto tuning between available GEMMs
Float32 = 1, // MKL based GEMM, fp32
Float32 = 1, // fp32 based GEMM
FbFp16Packed = 10, // FBGEMM based fp16 GEMM with packing
FbInt8Packed = 11 // FBGEMM based int8 GEMM with packing
} GemmType;
Expand Down
Loading