Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
36bf47e
Add oneDNN submodule to 3rd_party
graemenail May 5, 2022
ebd1559
Add oneDNN to CMake
graemenail May 5, 2022
0b9cb99
Don't build DNNL examples
graemenail May 5, 2022
e344dbc
Allow static builds of DNNL
graemenail May 5, 2022
b89f5d0
Remove MKL include from config parser
graemenail May 17, 2022
dc5c48f
Add oneDNN sgemm
graemenail Jun 8, 2022
28b0eb3
Improve oneDNN CMake
graemenail Jun 8, 2022
d454985
Add oneDNN in prod
graemenail Jun 8, 2022
01aba42
Use int in loop for ProdBatched
graemenail May 17, 2022
9d6437f
oneDNN only use OMP runtime when specified
graemenail May 18, 2022
6675ac3
Move MSVC unicode flags out of global flags
graemenail May 30, 2022
cc7cb75
Disable DNNL JIT Profiling
graemenail May 19, 2022
35d302a
Cache Boost
graemenail May 30, 2022
4e69c5a
Clean up after debug build
graemenail Jun 3, 2022
d44fa4d
Update CHANGELOG
graemenail Jun 8, 2022
a975a1a
Mention oneDNN in documentation
graemenail Jun 8, 2022
215eec1
Fix comments mentioning MKL
graemenail Jun 3, 2022
ed9fa14
oneDNN GH actions
graemenail Jun 8, 2022
04053c0
Warn if no BLAS for FBGEMM
graemenail Jun 9, 2022
79da945
Windows needs MKL for FBGEMM blas
graemenail Jun 9, 2022
a424614
Try Ubuntu with openblas for FBGEMM
graemenail Jun 9, 2022
714bdc8
Fix typo
graemenail Jun 9, 2022
5f96baa
Set BLAS_FOUND for Apple Accelerate
graemenail Jun 9, 2022
fdf6c59
Revert "Try Ubuntu with openblas for FBGEMM"
graemenail Jun 9, 2022
4a29f6d
Prefer DNNL codepaths at compile time when requested
graemenail Jun 9, 2022
55478ce
Update oneDNN compilation for clang CI
graemenail Sep 12, 2022
7426ebb
Require MKL for LSH (with rotation)
graemenail Sep 13, 2022
3e1a550
Fix abort message to require MKL rather than generic BLAS
graemenail Sep 13, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .github/workflows/ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,35 @@ jobs:
working-directory: build
run: make -j2

- name: Configure CMake (oneDNN)
if: matrix.cpu == true && matrix.gpu == false
id: cmake-onednn
run: |
[ -z "${{ matrix.gcc }}" ] || export CC=/usr/bin/gcc-${{ matrix.gcc }} CXX=/usr/bin/g++-${{ matrix.gcc }} CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}
[ -z "${{ matrix.clang }}" ] || export CC=/usr/bin/clang-${{ matrix.clang }} CXX=/usr/bin/clang++-${{ matrix.clang }}
mkdir -p build-onednn
cd build-onednn
cmake .. \
-DBoost_ARCHITECTURE=-x64 \
-DCMAKE_BUILD_TYPE=Debug \
-DCOMPILE_CPU=${{ matrix.cpu }} \
-DCOMPILE_CUDA=${{ matrix.gpu }} \
-DCOMPILE_EXAMPLES=${{ matrix.examples }} \
-DCOMPILE_SERVER=on \
-DCOMPILE_TESTS=${{ matrix.unit_tests }} \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-${{ matrix.cuda }} \
-DDETERMINISTIC=on \
-DUSE_FBGEMM=${{ matrix.cpu }} \
-DUSE_SENTENCEPIECE=on \
-DUSE_STATIC_LIBS=on \
-DUSE_MKL=OFF -DUSE_DNNL=ON

- name: Compile (oneDNN)
if: steps.cmake-onednn.conclusion == 'success'
working-directory: build-onednn
run: make -j2 && make clean


# TODO: add a flag to CMake to compile unit tests only on CPU
- name: Run unit tests
working-directory: build
Expand Down
47 changes: 47 additions & 0 deletions .github/workflows/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,17 @@ jobs:
echo "$env:CUDA_PATH/bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
shell: powershell
if: matrix.gpu == true
# Cache boost install
- name: Cache Boost
id: cache-boost
uses: actions/cache@v3
with:
path: ${{ env.BOOST_ROOT }}
key: ${{ runner.os }}-${{ env.BOOST_URL }}

# Boost is no longer pre-installed on GitHub-hosted Windows runners
- name: Download Boost
if: ${{ steps.cache-boost.outputs.cache-hit != 'true' }}
run: |
Write-Host "Downloading Boost to ${{ env.BOOST_ROOT }}"
C:\msys64\usr\bin\wget.exe -nv "${{ env.BOOST_URL }}" -O "${{ github.workspace }}/boost.exe"
Expand All @@ -70,6 +78,7 @@ jobs:

# Windows CUDA builds use USE_NCCL=off due to compilation errors.
- name: Build Debug
id: build-debug
uses: lukka/run-cmake@v3
with:
buildDirectory: ${{ github.workspace }}/build/Debug
Expand All @@ -95,6 +104,44 @@ jobs:
# able to find sometimes.
if: matrix.gpu == true

- name: Cleanup Debug
if: steps.build-debug.conclusion == 'success'
working-directory: ${{ github.workspace }}/build/Debug
run: cmake --build . --target clean

- name: Build Debug (oneDNN)
id: build-debug-onednn
uses: lukka/run-cmake@v3
with:
buildDirectory: ${{ github.workspace }}/build/Debug-oneDNN
cmakeAppendedArgs: '-G Ninja
-DCMAKE_BUILD_TYPE="Debug"
-DOPENSSL_USE_STATIC_LIBS="TRUE"
-DOPENSSL_MSVC_STATIC_RT="TRUE"
-DCOMPILE_CPU="TRUE"
-DCOMPILE_CUDA="${{ matrix.gpu }}"
-DCOMPILE_SERVER="FALSE"
-DCOMPILE_TESTS="TRUE"
-DDETERMINISTIC="TRUE"
-DUSE_FBGEMM="TRUE"
-DUSE_DNNL="TRUE"
-DUSE_MPI="FALSE"
-DUSE_NCCL="FALSE"
-DUSE_SENTENCEPIECE="TRUE"
-DUSE_STATIC_LIBS="TRUE"'
cmakeListsOrSettingsJson: CMakeListsTxtAdvanced
cmakeListsTxtPath: ${{ github.workspace }}/CMakeLists.txt
useVcpkgToolchainFile: true
# Building in Debug is sufficient for the all-in CPU+GPU compilation;
# its main purpose is to detect warnings that the Release build is not
# able to find sometimes.
if: matrix.gpu == false

- name: Cleanup Debug (oneDNN)
if: steps.build-debug-onednn.conclusion == 'success'
working-directory: ${{ github.workspace }}/build/Debug-oneDNN
run: cmake --build . --target clean

# Windows CUDA builds use USE_NCCL=off due to compilation errors
- name: Build Release
uses: lukka/run-cmake@v3
Expand Down
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,6 @@
[submodule "src/3rd_party/simple-websocket-server"]
path = src/3rd_party/simple-websocket-server
url = https://github.com/marian-nmt/Simple-WebSocket-Server
[submodule "src/3rd_party/oneDNN"]
path = src/3rd_party/oneDNN
url = https://github.com/oneapi-src/oneDNN.git
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Fused inplace-dropout in FFN layer in Transformer
- `--force-decode` option for marian-decoder
- `--output-sampling` now works with ensembles (requires proper normalization via e.g `--weights 0.5 0.5`)
- oneDNN is available for GEMM on CPU.

### Fixed
- Use allocator in hashing
Expand Down
18 changes: 16 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ option(USE_CUDNN "Use CUDNN library" OFF)
option(USE_DOXYGEN "Build documentation with Doxygen" ON)
option(USE_FBGEMM "Use FBGEMM" OFF)
option(USE_MKL "Compile with MKL support" ON)
option(USE_DNNL "Compile with oneDNN support" OFF)
option(USE_MPI "Use MPI library" OFF)
option(USE_NCCL "Use NCCL library" ON)
option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON)
Expand Down Expand Up @@ -84,6 +85,7 @@ endif()
# Set compilation flags
if(MSVC)
# These are used in src/CMakeLists.txt on a per-target basis
list(APPEND EXTRA_DEFINITIONS /DUNICODE /D_UNICODE)
list(APPEND ALL_WARNINGS /WX; /W4;)

# Disabled bogus warnings for CPU intrinsics and Protobuf:
Expand All @@ -105,7 +107,7 @@ if(MSVC)
set(INTRINSICS "/arch:AVX2")
# set(INTRINSICS "/arch:AVX512")
# /bigobj is necessary for expression_operators.cpp. See https://stackoverflow.com/questions/15110580/penalty-of-the-msvs-compiler-flag-bigobj
set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}")
set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /MT /O2 ${INTRINSICS} /Zi /MP /GL /DNDEBUG")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} /MTd /Od /Ob0 ${INTRINSICS} /RTC1 /Zi /D_DEBUG")

Expand Down Expand Up @@ -515,6 +517,12 @@ if(COMPILE_CPU)
set(EXT_LIBS ${EXT_LIBS} intgemm) # Enable intgemm when compiling CPU
add_definitions(-DCOMPILE_CPU=1)
endif()

if(USE_DNNL)
set(EXT_LIBS ${EXT_LIBS} dnnl)
add_definitions(-DDNNL_FOUND=1)
endif(USE_DNNL)

if(USE_APPLE_ACCELERATE)
if(NOT APPLE)
message(FATAL_ERROR "FATAL ERROR: Apple Accelerate only works on macOS.")
Expand All @@ -524,6 +532,7 @@ if(COMPILE_CPU)
# you may need to install Xcode command line tools if you don't have them already (https://developer.apple.com/xcode/features/)
include_directories("/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Frameworks/vecLib.framework/Headers")
set(EXT_LIBS ${EXT_LIBS} "-framework Accelerate")
set(BLAS_FOUND TRUE)
add_definitions(-DBLAS_FOUND=1)
else(USE_APPLE_ACCELERATE)
if(USE_MKL)
Expand All @@ -545,10 +554,15 @@ if(COMPILE_CPU)
add_definitions(-DBLAS_FOUND=1)
endif(CBLAS_FOUND)
endif(BLAS_FOUND)
endif(MKL_FOUND)
endif()
endif(USE_APPLE_ACCELERATE)
endif(COMPILE_CPU)


if(NOT BLAS_FOUND AND USE_FBGEMM)
message(FATAL_ERROR "FBGEMM was requested but a BLAS vendor was not found.")
endif()

###############################################################################
# Find OpenSSL
set(BOOST_COMPONENTS "")
Expand Down
1 change: 1 addition & 0 deletions doc/operators.md
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,7 @@ libraries containing device-specific optimisations. These libraries include:
- FBGEMM
- INTGEMM
- MKL
- oneDNN
- GPU
- CUDA (cuBLAS)

Expand Down
21 changes: 20 additions & 1 deletion src/3rd_party/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,25 @@ if(COMPILE_CPU)
endif()
endif(COMPILE_CPU)

if(USE_DNNL)
# OneDNN
set(DNNL_BUILD_TESTS OFF CACHE BOOL "Build dnnl tests")
set(DNNL_BUILD_EXAMPLES OFF CACHE BOOL "Build dnnl examples")

set(DNNL_ENABLE_JIT_PROFILING OFF CACHE INTERNAL "" FORCE)
if(USE_STATIC_LIBS)
set(DNNL_LIBRARY_TYPE "STATIC" CACHE STRING "specifies whether oneDNN library should be SHARED or STATIC" FORCE)
endif(USE_STATIC_LIBS)

if(NOT USE_OPENMP)
set(DNNL_CPU_RUNTIME SEQ CACHE INTERNAL "" FORCE)
endif()

add_subdirectory(./oneDNN)
include_directories(./oneDNN/include)

endif(USE_DNNL)

if(USE_FBGEMM)
# @TODO: find out if this is somehow harmful. This is suppressing CMake warnings for CMAKE_SUPPRESS_DEVELOPER_WARNINGS
# meant to silence CMakeFiles of 3rd_party tools.
Expand Down Expand Up @@ -169,7 +188,7 @@ if(CUDA_FOUND)
endif(COMPILE_AMPERE)

# install nccl in ${CMAKE_BINARY_DIR}/local similar to /usr/local linux installation
# Using $(MAKE) instead of $CMAKE_MAKE_PROGRAM in order to make parallelization in NCCL compilation work with make -j16.
# Using $(MAKE) instead of $CMAKE_MAKE_PROGRAM in order to make parallelization in NCCL compilation work with make -j16.
# Apparently this does not get properly propagated otherwise and builds with only a single thread/process.
ExternalProject_Add(nccl_install
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/nccl
Expand Down
1 change: 1 addition & 0 deletions src/3rd_party/oneDNN
Submodule oneDNN added at 11fa74
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ set(MARIAN_SOURCES

add_library(marian STATIC ${MARIAN_SOURCES})

target_compile_options(marian PRIVATE ${ALL_WARNINGS})
target_compile_options(marian PRIVATE ${ALL_WARNINGS} ${EXTRA_DEFINITIONS})

# Generate git_revision.h to reflect current git revision information
# [https://stackoverflow.com/questions/1435953/how-can-i-pass-git-sha1-to-compiler-as-definition-using-cmake]
Expand Down
9 changes: 1 addition & 8 deletions src/common/config_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,6 @@
#include <stdexcept>
#include <string>

#if MKL_FOUND
#include <mkl.h>
#else
#if BLAS_FOUND
#include <cblas.h>
#endif
#endif

namespace marian {

Expand Down Expand Up @@ -269,7 +262,7 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) {
"Pool encoder states instead of using cross attention (selects first encoder state, best used with special token)");
cli.add<int>("--transformer-dim-ffn",
"Size of position-wise feed-forward network (transformer)",
2048);
2048);
cli.add<int>("--transformer-decoder-dim-ffn",
"Size of position-wise feed-forward network in decoder (transformer). Uses --transformer-dim-ffn if 0.",
0);
Expand Down
2 changes: 1 addition & 1 deletion src/graph/auto_tuner.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class AutoTuner : public AutoTunerRecorder {
const size_t collectStatMax = 50;
UPtr<timer::CPUTimer> timer_;

// This structure holds a hash key an algorithm function (e.g. int16, packed gemm, mkl gemm)
// This structure holds a hash key and an algorithm function (e.g. int16, packed gemm, fp32 gemm)
// for a specific operation size
// hash: a unique hash key for each operation size
// (e.g. m, n, k, transpose A, transpose B, bias size for GEMM)
Expand Down
2 changes: 1 addition & 1 deletion src/graph/expression_operators.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -576,7 +576,7 @@ Expr bdot_legacy(Expr a, Expr b, bool transA, bool transB, float scale) {
}

Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
// general version, MKL, CBlas or CUDA
// general version (see affine for packed-type variants)

int rows = a->shape().elements() / a->shape()[-1];
Expr ones = a->graph()->ones({ rows, 1 });
Expand Down
6 changes: 3 additions & 3 deletions src/layers/lsh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

#include "3rd_party/faiss/utils/hamming.h"

#if BLAS_FOUND
#if defined(BLAS_FOUND) && defined(MKL_FOUND)
#include "3rd_party/faiss/VectorTransform.h"
#endif

Expand All @@ -20,7 +20,7 @@ int bytesPerVector(int nBits) {
}

void fillRandomRotationMatrix(Tensor output, Ptr<Allocator> allocator) {
#if BLAS_FOUND
#if defined(BLAS_FOUND) && defined(MKL_FOUND)
int nRows = output->shape()[-2];
int nBits = output->shape()[-1];

Expand All @@ -42,7 +42,7 @@ void fillRandomRotationMatrix(Tensor output, Ptr<Allocator> allocator) {
allocator->free(memory);
#else
output; allocator;
ABORT("LSH with rotation matrix requires Marian to be compiled with a BLAS library");
ABORT("LSH with rotation matrix requires Marian to be compiled with MKL");
#endif
}

Expand Down
2 changes: 1 addition & 1 deletion src/tensors/backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ namespace marian {
// GEMM type enum
typedef enum {
Auto = 0, // auto tuning between available GEMMs
Float32 = 1, // MKL based GEMM, fp32
Float32 = 1, // fp32 based GEMM
FbFp16Packed = 10, // FBGEMM based fp16 GEMM with packing
FbInt8Packed = 11 // FBGEMM based int8 GEMM with packing
} GemmType;
Expand Down
Loading