Skip to content

Commit 51f89d0

Browse files
committed
Merge branch 'master' into kmp5/feature/CP
2 parents 297c302 + ec56aa0 commit 51f89d0

File tree

8 files changed

+267
-149
lines changed

8 files changed

+267
-149
lines changed

CMakeLists.txt

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
# Jul 19, 2013
2525
#
2626

27-
cmake_minimum_required (VERSION 3.15.0) # need list(PREPEND for toolchains
27+
cmake_minimum_required (VERSION 3.21.0) # for HIP/ROCm
2828

2929
# Set TiledArray version =======================================================
3030

@@ -264,17 +264,13 @@ vgkit_cmake_git_metadata()
264264
##########################
265265
# Check compiler features
266266
##########################
267-
# need C++17, insist on strict standard
268-
set(CMAKE_CXX_STANDARD 17 CACHE STRING "C++ ISO Standard version")
269-
if (NOT(CMAKE_CXX_STANDARD EQUAL 17 OR CMAKE_CXX_STANDARD EQUAL 20))
270-
message(FATAL_ERROR "C++ 2017 ISO Standard or higher is required to compile TiledArray")
271-
endif()
272-
# C++20 is only configurable via compile features with cmake 3.12 and older
273-
if (CMAKE_CXX_STANDARD EQUAL 20 AND CMAKE_VERSION VERSION_LESS 3.12.0)
274-
cmake_minimum_required (VERSION 3.12.0)
267+
# need C++20, insist on strict standard
268+
set(CMAKE_CXX_STANDARD 20 CACHE STRING "C++ ISO Standard version")
269+
if (CMAKE_CXX_STANDARD LESS 20)
270+
message(FATAL_ERROR "C++ 2020 ISO Standard or higher is required to compile TiledArray")
275271
endif()
276272
set(CMAKE_CXX_STANDARD_REQUIRED ON)
277-
set(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Whether to use extensions of C++ ISO Standard version")
273+
set(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Whether to use extensions of C++ ISO Standard version")
278274
# Check type support
279275
include(CheckTypeSize)
280276
check_type_size("long double" TILEDARRAY_HAS_LONG_DOUBLE LANGUAGE CXX)

INSTALL.md

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,14 @@ Both methods are supported. However, for most users we _strongly_ recommend to b
2424

2525
## Prerequisites
2626

27-
- C++ compiler with support for the [C++17 standard](http://www.iso.org/standard/68564.html), or a more recent standard. This includes the following compilers:
28-
- [GNU C++](https://gcc.gnu.org/), version 7.0 or higher
29-
- [Clang](https://clang.llvm.org/), version 5 or higher
30-
- [Apple Clang](https://en.wikipedia.org/wiki/Xcode), version 9.3 or higher
31-
- [Intel C++ compiler](https://software.intel.com/en-us/c-compilers), version 19 or higher
27+
- C++ compiler with support for the [C++20 standard](https://www.iso.org/standard/79358.html), or a more recent standard. This includes the following compilers:
28+
- [GNU C++](https://gcc.gnu.org/), version 11 or higher
29+
- [Clang](https://clang.llvm.org/), version 14 or higher
30+
- [Apple Clang](https://en.wikipedia.org/wiki/Xcode), version 14 or higher
3231

3332
See the current [Travis CI matrix](.travis.yml) for the most up-to-date list of compilers that are known to work.
3433

35-
- [CMake](https://cmake.org/), version 3.15 or higher; if {CUDA,HIP} support is needed, CMake {3.18,3.21} or higher is required.
34+
- [CMake](https://cmake.org/), version 3.21 or higher.
3635
- [Git](https://git-scm.com/) 1.8 or later (required to obtain TiledArray and MADNESS source code from GitHub)
3736
- [Eigen](http://eigen.tuxfamily.org/), version 3.3.5 or higher; if CUDA is enabled then 3.3.7 is required (will be downloaded automatically, if missing)
3837
- [Boost libraries](https://www.boost.org/), version 1.81 or higher (will be downloaded automatically, if missing). The following principal Boost components are used:
@@ -66,14 +65,14 @@ Compiling BTAS requires the following prerequisites:
6665
Optional prerequisites:
6766
- for execution on GPGPUs:
6867
- device programming runtime:
69-
- [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on NVIDIA's CUDA-enabled accelerators. CUDA 11 or later is required.
68+
- [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on NVIDIA's CUDA-enabled accelerators. CUDA 12 or later is required.
7069
- [HIP/ROCm compiler and runtime](https://rocm.docs.amd.com/) -- for execution on AMD's ROCm-enabled accelerators. Note that TiledArray does not use ROCm directly but its C++ Heterogeneous-Compute Interface for Portability, `HIP`; although HIP can also be used to program CUDA-enabled devices, in TiledArray it is used only to program ROCm devices, hence ROCm and HIP will be used interchangeably.
7170
- [LibreTT](https://github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, ROCm, and SYCL platforms that is based on the [original cuTT library](https://github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via https://github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](https://github.com/victor-anisimov) (tag 6eed30d4dd2a5aa58840fe895dcffd80be7fbece).
7271
- [Umpire](https://github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag 8c85866107f78a58403e20a2ae8e1f24c9852287).
7372
- [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later).
7473
- [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. If detected, the following C++ components will also be sought and downloaded, if missing:
75-
- [scalapackpp](https://github.com/wavefunction91/scalapackpp.git) -- a modern C++ (C++17) wrapper for ScaLAPACK (tag 6397f52cf11c0dfd82a79698ee198a2fce515d81); pulls and builds the following additional prerequisite
76-
- [blacspp](https://github.com/wavefunction91/blacspp.git) -- a modern C++ (C++17) wrapper for BLACS
74+
- [scalapackpp](https://github.com/wavefunction91/scalapackpp.git) -- a modern C++ wrapper for ScaLAPACK (tag 6397f52cf11c0dfd82a79698ee198a2fce515d81); pulls and builds the following additional prerequisite
75+
- [blacspp](https://github.com/wavefunction91/blacspp.git) -- a modern C++ wrapper for BLACS
7776
- Python3 interpreter -- to test (optionally-built) Python bindings
7877
- [TTG](https://github.com/TESSEorg/ttg.git) -- C++ implementation of the Template Task Graph programming model for fine-grained flow-graph composition of distributed memory programs (tag 3fe4a06dbf4b05091269488aab38223da1f8cb8e).
7978

@@ -186,7 +185,7 @@ Additional CMake variables are given below.
186185
* `CMAKE_BUILD_TYPE` -- Optimization/debug build type options include
187186
`Debug` (optimization off, debugging symbols and assertions on), `Release` (optimization on, debugging symbols and assertions off), `RelWithDebInfo` (optimization on, debugging symbols and assertions on) and `MinSizeRel` (same as `Release` but optimized for executable size). The default is empty build type. It is recommended that you set the build type explicitly.
188187
* `BUILD_SHARED_LIBS` -- Enable shared libraries. This option is only available if the platform supports shared libraries; if that's true and `TA_ASSUMES_ASLR_DISABLED` is `ON` (see below) the default is `ON`, otherwise the default is `OFF`.
189-
* `CMAKE_CXX_STANDARD` -- Specify the C++ ISO Standard to use. Valid values are `17` (default), and `20`.
188+
* `CMAKE_CXX_STANDARD` -- Specify the C++ ISO Standard to use. Valid values are `20` (default), and `23`.
190189

191190
Most of these are best specified in a _toolchain file_. TiledArray is recommended to use the toolchains distributed via [the Valeev Group CMake kit](https://github.com/ValeevGroup/kit-cmake/tree/master/toolchains). TiledArray by default downloads (via [the FetchContent CMake module](https://cmake.org/cmake/help/latest/module/FetchContent.html)) the VG CMake toolkit which makes the toolchains available without having to download the toolchain files manually. E.g., to use toolchain `x` from the VG CMake kit repository provide `-DCMAKE_TOOLCHAIN_FILE=cmake/vg/toolchains/x.cmake` to CMake when configuring TiledArray.
192191

doc/dox/dev/Basic-Programming.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ An object that specifies the structure of DistArray. E.g. it could be represente
6666

6767
## Implementation
6868

69-
TiledArray is a library written in standard C++ using features available in the 2017 ISO standard (commonly known as C++17). To use TiledArray it is necessary to `#include` header `tiledarray.h`. imports most TiledArray features into namespace `TiledArray`. For convenience, namespace alias `TA` is also provided. Although the alias can be disabled by defining the `TILEDARRAY_DISABLE_NAMESPACE_TA` preprocessor variable, all examples will assume that the `TA` alias is not disabled.
69+
TiledArray is a library written in standard C++ using features available in the 2020 ISO standard (commonly known as C++20). To use TiledArray it is necessary to `#include` header `tiledarray.h`, which imports most TiledArray features into namespace `TiledArray`. For convenience, namespace alias `TA` is also provided. Although the alias can be disabled by defining the `TILEDARRAY_DISABLE_NAMESPACE_TA` preprocessor variable, all examples will assume that the `TA` alias is not disabled.
7070

7171
P.S. It sometimes may be possible to reduce source code couplings by importing only _forwarding_ declarations. This is done by `#include`ing header `tiledarray_fwd.h`.
7272

src/TiledArray/conversions/make_array.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ inline Array make_array(World& world, const detail::trange_t<Array>& trange,
242242
op);
243243
}
244244

245-
/// a make_array variant that uses a sequence of tiles
245+
/// a make_array variant that uses a sequence of {tile_index,tile} pairs
246246
/// to construct a DistArray with default pmap
247247
template <typename Array, typename Tiles>
248248
Array make_array(World& world, const detail::trange_t<Array>& tiled_range,

src/TiledArray/einsum/tiledarray.h

Lines changed: 66 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,8 @@ auto einsum(expressions::TsrExpr<ArrayA_> A, expressions::TsrExpr<ArrayB_> B,
600600
std::invoke(update_perm_and_indices, std::get<0>(AB));
601601
std::invoke(update_perm_and_indices, std::get<1>(AB));
602602

603+
// construct result, with "dense" DistArray; the array will be
604+
// reconstructed from local tiles later
603605
ArrayTerm<ArrayC> C = {ArrayC(world, TiledRange(range_map[c])), c};
604606
for (auto idx : e) {
605607
C.tiles *= Range(range_map[idx].tiles_range());
@@ -609,6 +611,16 @@ auto einsum(expressions::TsrExpr<ArrayA_> A, expressions::TsrExpr<ArrayB_> B,
609611
}
610612
C.expr = e;
611613

614+
using Index = Einsum::Index<size_t>;
615+
616+
// this will collect local tiles of C.array, to be used to rebuild C.array
617+
std::vector<std::pair<Index, ResultTensor>> C_local_tiles;
618+
auto build_C_array = [&]() {
619+
C.array = make_array<ArrayC>(world, TiledRange(range_map[c]),
620+
C_local_tiles.begin(), C_local_tiles.end(),
621+
/* replicated = */ false);
622+
};
623+
612624
std::get<0>(AB).expr += inner.a;
613625
std::get<1>(AB).expr += inner.b;
614626

@@ -627,19 +639,56 @@ auto einsum(expressions::TsrExpr<ArrayA_> A, expressions::TsrExpr<ArrayB_> B,
627639
}
628640
}
629641

630-
using Index = Einsum::Index<size_t>;
631-
632642
if (!e) { // hadamard reduction
643+
633644
auto &[A, B] = AB;
634645
TiledRange trange(range_map[i]);
635646
RangeProduct tiles;
636647
for (auto idx : i) {
637648
tiles *= Range(range_map[idx].tiles_range());
638649
}
650+
651+
// the inner product can be either hadamard or a contraction
652+
using TensorT = typename decltype(A.array)::value_type::value_type;
653+
static_assert(
654+
std::is_same_v<TensorT,
655+
typename decltype(A.array)::value_type::value_type>);
656+
constexpr bool is_tot = detail::is_tensor_v<TensorT>;
657+
auto element_hadamard_op =
658+
(is_tot && inner.h)
659+
? std::make_optional(
660+
[&inner, plan = detail::TensorHadamardPlan(inner.A, inner.B,
661+
inner.C)](
662+
auto const &l, auto const &r) -> TensorT {
663+
if (l.empty() || r.empty()) return TensorT{};
664+
return detail::tensor_hadamard(l, r, plan);
665+
})
666+
: std::nullopt;
667+
auto element_contract_op =
668+
(is_tot && !inner.h)
669+
? std::make_optional(
670+
[&inner, plan = detail::TensorContractionPlan(
671+
inner.A, inner.B, inner.C)](
672+
auto const &l, auto const &r) -> TensorT {
673+
if (l.empty() || r.empty()) return TensorT{};
674+
return detail::tensor_contract(l, r, plan);
675+
})
676+
: std::nullopt;
677+
auto element_product_op = [&inner, &element_hadamard_op,
678+
&element_contract_op](
679+
auto const &l, auto const &r) -> TensorT {
680+
TA_ASSERT(inner.h ? element_hadamard_op.has_value()
681+
: element_contract_op.has_value());
682+
return inner.h ? element_hadamard_op.value()(l, r)
683+
: element_contract_op.value()(l, r);
684+
};
685+
639686
auto pa = A.permutation;
640687
auto pb = B.permutation;
641688
for (Index h : H.tiles) {
642-
if (!C.array.is_local(h)) continue;
689+
auto const pc = C.permutation;
690+
auto const c = apply(pc, h);
691+
if (!C.array.is_local(c)) continue;
643692
size_t batch = 1;
644693
for (size_t i = 0; i < h.size(); ++i) {
645694
batch *= H.batch[i].at(h[i]);
@@ -670,16 +719,8 @@ auto einsum(expressions::TsrExpr<ArrayA_> A, expressions::TsrExpr<ArrayB_> B,
670719
auto &el = tile({k});
671720
using TensorT = std::remove_reference_t<decltype(el)>;
672721

673-
auto mult_op = [&inner](auto const &l, auto const &r) -> TensorT {
674-
if (l.empty() || r.empty()) return TensorT{};
675-
return inner.h ? TA::detail::tensor_hadamard(l, inner.A, r,
676-
inner.B, inner.C)
677-
: TA::detail::tensor_contract(l, inner.A, r,
678-
inner.B, inner.C);
679-
};
680-
681722
for (auto i = 0; i < vol; ++i)
682-
el.add_to(mult_op(aik.data()[i], bik.data()[i]));
723+
el.add_to(element_product_op(aik.data()[i], bik.data()[i]));
683724

684725
} else if constexpr (!AreArraySame<ArrayA, ArrayB>) {
685726
auto aik = ai.batch(k);
@@ -702,14 +743,21 @@ auto einsum(expressions::TsrExpr<ArrayA_> A, expressions::TsrExpr<ArrayB_> B,
702743
}
703744
}
704745
}
705-
auto pc = C.permutation;
706-
auto shape = apply_inverse(pc, C.array.trange().tile(h));
746+
// data is stored as h1 h2 ... but all modes folded as 1 batch dim
747+
// first reshape to h = (h1 h2 ...)
748+
// n.b. can't just use shape = C.array.trange().tile(h)
749+
auto shape = apply_inverse(pc, C.array.trange().tile(c));
707750
tile = tile.reshape(shape);
751+
// then permute to target C layout c = (c1 c2 ...)
708752
if (pc) tile = tile.permute(pc);
709-
C.array.set(h, tile);
753+
// and move to C_local_tiles
754+
C_local_tiles.emplace_back(std::move(c), std::move(tile));
710755
}
756+
757+
build_C_array();
758+
711759
return C.array;
712-
}
760+
} // end: hadamard reduction
713761

714762
// generalized contraction
715763

@@ -740,7 +788,6 @@ auto einsum(expressions::TsrExpr<ArrayA_> A, expressions::TsrExpr<ArrayB_> B,
740788
std::invoke(update_tr, std::get<1>(AB));
741789

742790
std::vector<std::shared_ptr<World>> worlds;
743-
std::vector<std::tuple<Index, ResultTensor>> local_tiles;
744791

745792
// iterates over tiles of hadamard indices
746793
for (Index h : H.tiles) {
@@ -798,26 +845,13 @@ auto einsum(expressions::TsrExpr<ArrayA_> A, expressions::TsrExpr<ArrayB_> B,
798845
shape = apply_inverse(P, shape);
799846
tile = tile.reshape(shape);
800847
if (P) tile = tile.permute(P);
801-
local_tiles.push_back({c, tile});
848+
C_local_tiles.emplace_back(std::move(c), std::move(tile));
802849
}
803850
// mark for lazy deletion
804851
C.ei = ArrayC();
805852
}
806853

807-
if constexpr (!ResultShape::is_dense()) {
808-
TiledRange tiled_range = TiledRange(range_map[c]);
809-
std::vector<std::pair<Index, float>> tile_norms;
810-
for (auto &[index, tile] : local_tiles) {
811-
tile_norms.push_back({index, tile.norm()});
812-
}
813-
ResultShape shape(world, tile_norms, tiled_range);
814-
C.array = ArrayC(world, TiledRange(range_map[c]), shape);
815-
}
816-
817-
for (auto &[index, tile] : local_tiles) {
818-
if (C.array.is_zero(index)) continue;
819-
C.array.set(index, tile);
820-
}
854+
build_C_array();
821855

822856
for (auto &w : worlds) {
823857
w->gop.fence();

0 commit comments

Comments
 (0)