From 1a44054cb2aa87e8df40bafa1f410de412bbcc7b Mon Sep 17 00:00:00 2001 From: Raimondas Galvelis Date: Tue, 7 Jun 2022 17:13:19 +0200 Subject: [PATCH 01/15] Move the code from https://github.com/openmm/NNPOps/pull/58 --- src/pytorch/messages/__init__.py | 10 ++ src/pytorch/messages/messages.cpp | 5 + src/pytorch/messages/messages_cpu.cpp | 43 +++++++++ src/pytorch/messages/messages_cuda.cu | 130 ++++++++++++++++++++++++++ 4 files changed, 188 insertions(+) create mode 100644 src/pytorch/messages/__init__.py create mode 100644 src/pytorch/messages/messages.cpp create mode 100644 src/pytorch/messages/messages_cpu.cpp create mode 100644 src/pytorch/messages/messages_cuda.cu diff --git a/src/pytorch/messages/__init__.py b/src/pytorch/messages/__init__.py new file mode 100644 index 0000000..308a3b2 --- /dev/null +++ b/src/pytorch/messages/__init__.py @@ -0,0 +1,10 @@ +import os +import torch as pt +from torch.utils import cpp_extension + +src_dir = os.path.dirname(__file__) +sources = ['messages.cpp', 'messages_cpu.cpp'] + (['messages_cuda.cu'] if pt.cuda.is_available() else []) +sources = [os.path.join(src_dir, name) for name in sources] + +cpp_extension.load(name='messages', sources=sources, is_python_module=False) +pass_messages = pt.ops.messages.pass_messages \ No newline at end of file diff --git a/src/pytorch/messages/messages.cpp b/src/pytorch/messages/messages.cpp new file mode 100644 index 0000000..a13e8b4 --- /dev/null +++ b/src/pytorch/messages/messages.cpp @@ -0,0 +1,5 @@ +#include + +TORCH_LIBRARY(messages, m) { + m.def("pass_messages(Tensor neighbors, Tensor messages, Tensor states) -> (Tensor messages)"); +} \ No newline at end of file diff --git a/src/pytorch/messages/messages_cpu.cpp b/src/pytorch/messages/messages_cpu.cpp new file mode 100644 index 0000000..ca7672f --- /dev/null +++ b/src/pytorch/messages/messages_cpu.cpp @@ -0,0 +1,43 @@ +#include + +using torch::kInt32; +using torch::logical_and; +using torch::Tensor; + +static Tensor forward(const Tensor& neighbors, const Tensor& messages, const Tensor& states) { + + TORCH_CHECK(neighbors.dim() == 2, "Expected \"neighbors\" to have two dimensions"); + TORCH_CHECK(neighbors.size(0) == 2, "Expected the 2nd dimension size of \"neighbors\" to be 2"); + TORCH_CHECK(neighbors.scalar_type() == kInt32, "Expected \"neighbors\" to have data type of int32"); + TORCH_CHECK(neighbors.is_contiguous(), "Expected \"neighbors\" to be contiguous"); + + TORCH_CHECK(messages.dim() == 2, "Expected \"messages\" to have two dimensions"); + TORCH_CHECK(messages.size(1) % 32 == 0, "Expected the 2nd dimension size of \"messages\" to be a multiple of 32"); + TORCH_CHECK(messages.size(1) <= 1024, "Expected the 2nd dimension size of \"messages\" to be less than 1024"); + TORCH_CHECK(messages.is_contiguous(), "Expected \"messages\" to be contiguous"); + + TORCH_CHECK(states.dim() == 2, "Expected \"states\" to have two dimensions"); + TORCH_CHECK(states.size(1) == messages.size(1), "Expected the 2nd dimension size of \"messages\" and \"states\" to be the same"); + TORCH_CHECK(states.scalar_type() == messages.scalar_type(), "Expected the data type of \"messages\" and \"states\" to be the same"); + TORCH_CHECK(states.is_contiguous(), "Expected \"messages\" to be contiguous"); + + const Tensor rows = neighbors[0]; + const Tensor columns = neighbors[1]; + + const int num_features = messages.size(1); + + const Tensor mask = logical_and(rows > -1, columns > -1); + const Tensor masked_rows = rows.masked_select(mask).to(torch::kLong); + const Tensor masked_columns = columns.masked_select(mask).to(torch::kLong); + const Tensor masked_messages = messages.masked_select(mask.unsqueeze(1)).reshape({-1, num_features}); + + Tensor new_states = states.clone(); + new_states.index_add_(0, masked_rows, masked_messages); + new_states.index_add_(0, masked_columns, masked_messages); + + return new_states; +} + +TORCH_LIBRARY_IMPL(messages, CPU, m) { + m.impl("pass_messages", &forward); +} \ No newline at end of file diff --git a/src/pytorch/messages/messages_cuda.cu b/src/pytorch/messages/messages_cuda.cu new file mode 100644 index 0000000..661a038 --- /dev/null +++ b/src/pytorch/messages/messages_cuda.cu @@ -0,0 +1,130 @@ +#include +#include +#include + +using c10::cuda::CUDAStreamGuard; +using c10::cuda::getCurrentCUDAStream; +using torch::autograd::AutogradContext; +using torch::autograd::Function; +using torch::autograd::tensor_list; +using torch::kInt32; +using torch::PackedTensorAccessor32; +using torch::RestrictPtrTraits; +using torch::Tensor; +using torch::TensorOptions; + +template + using Accessor = PackedTensorAccessor32; + +template +inline Accessor get_accessor(const Tensor& tensor) { + return tensor.packed_accessor32(); +}; + +template __global__ void kernel_forward( + const Accessor neighbors, + const Accessor messages, + Accessor new_states +) { + const int32_t i_neig = blockIdx.x; + const int32_t i_dir = blockIdx.y; + const int32_t i_atom = neighbors[i_dir][i_neig]; + if (i_atom < 0) return; + + const int32_t i_feat = threadIdx.x; + atomicAdd(&new_states[i_atom][i_feat], messages[i_neig][i_feat]); +} + +template __global__ void kernel_backward( + const Accessor neighbors, + const Accessor grad_new_state, + Accessor grad_messages +) { + const int32_t i_neig = blockIdx.x; + const int32_t i_dir = blockIdx.y; + const int32_t i_atom = neighbors[i_dir][i_neig]; + if (i_atom < 0) return; + + const int32_t i_feat = threadIdx.x; + atomicAdd(&grad_messages[i_neig][i_feat], grad_new_state[i_atom][i_feat]); +} + +class Autograd : public Function { +public: + static tensor_list forward(AutogradContext* ctx, + const Tensor& neighbors, + const Tensor& messages, + const Tensor& states) { + + TORCH_CHECK(neighbors.dim() == 2, "Expected \"neighbors\" to have two dimensions"); + TORCH_CHECK(neighbors.size(0) == 2, "Expected the 2nd dimension size of \"neighbors\" to be 2"); + TORCH_CHECK(neighbors.scalar_type() == kInt32, "Expected \"neighbors\" to have data type of int32"); + TORCH_CHECK(neighbors.is_contiguous(), "Expected \"neighbors\" to be contiguous"); + + TORCH_CHECK(messages.dim() == 2, "Expected \"messages\" to have two dimensions"); + TORCH_CHECK(messages.size(1) % 32 == 0, "Expected the 2nd dimension size of \"messages\" to be a multiple of 32"); + TORCH_CHECK(messages.size(1) <= 1024, "Expected the 2nd dimension size of \"messages\" to be less than 1024"); + TORCH_CHECK(messages.is_contiguous(), "Expected \"messages\" to be contiguous"); + + TORCH_CHECK(states.dim() == 2, "Expected \"states\" to have two dimensions"); + TORCH_CHECK(states.size(1) == messages.size(1), "Expected the 2nd dimension size of \"messages\" and \"states\" to be the same"); + TORCH_CHECK(states.scalar_type() == messages.scalar_type(), "Expected the data type of \"messages\" and \"states\" to be the same"); + TORCH_CHECK(states.is_contiguous(), "Expected \"messages\" to be contiguous"); + + const int num_neighbors = neighbors.size(1); + const int num_features = messages.size(1); + + const dim3 blocks(num_neighbors, 2); + const dim3 threads(num_features); + const auto stream = getCurrentCUDAStream(neighbors.get_device()); + + Tensor new_states = states.clone(); + + AT_DISPATCH_FLOATING_TYPES(messages.scalar_type(), "pass_messages_forward", [&]() { + const CUDAStreamGuard guard(stream); + kernel_forward<<>>( + get_accessor(neighbors), + get_accessor(messages), + get_accessor(new_states)); + }); + + ctx->save_for_backward({neighbors}); + + return {new_states}; + } + + static tensor_list backward(AutogradContext* ctx, tensor_list grad_inputs) { + + const Tensor neighbors = ctx->get_saved_variables()[0]; + const Tensor grad_new_state = grad_inputs[0]; + + const int num_neighbors = neighbors.size(1); + const int num_features = grad_new_state.size(1); + + const dim3 blocks(num_neighbors, 2); + const dim3 threads(num_features); + const auto stream = getCurrentCUDAStream(neighbors.get_device()); + + Tensor grad_messages = torch::zeros({num_neighbors, num_features}, grad_new_state.options()); + + AT_DISPATCH_FLOATING_TYPES(grad_new_state.scalar_type(), "pass_messages_backward", [&]() { + const CUDAStreamGuard guard(stream); + kernel_backward<<>>( + get_accessor(neighbors), + get_accessor(grad_new_state), + get_accessor(grad_messages)); + }); + + return {Tensor(), // grad_neighbors + grad_messages, + grad_new_state.clone()}; // grad_state + } +}; + +TORCH_LIBRARY_IMPL(messages, AutogradCUDA, m) { + m.impl("pass_messages", [](const Tensor& neighbors, + const Tensor& messages, + const Tensor& states) { + return Autograd::apply(neighbors, messages, states)[0]; + }); +} \ No newline at end of file From 8fd009dd1d908c49c1ec6f935f0c8a07dcb551f5 Mon Sep 17 00:00:00 2001 From: Raimondas Galvelis Date: Tue, 7 Jun 2022 17:39:21 +0200 Subject: [PATCH 02/15] Integrate the message passing kernel --- CMakeLists.txt | 10 ++++++++-- src/pytorch/messages/__init__.py | 13 ++++--------- src/pytorch/messages/messages.cpp | 2 +- src/pytorch/messages/passMessages.py | 9 +++++++++ .../{messages_cpu.cpp => passMessagesCPU.cpp} | 2 +- .../{messages_cuda.cu => passMessagesCUDA.cu} | 6 +++--- 6 files changed, 26 insertions(+), 16 deletions(-) create mode 100644 src/pytorch/messages/passMessages.py rename src/pytorch/messages/{messages_cpu.cpp => passMessagesCPU.cpp} (98%) rename src/pytorch/messages/{messages_cuda.cu => passMessagesCUDA.cu} (97%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 98722ca..26f9c7f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,6 +21,9 @@ set(SRC_FILES src/ani/CpuANISymmetryFunctions.cpp src/pytorch/CFConv.cpp src/pytorch/CFConvNeighbors.cpp src/pytorch/SymmetryFunctions.cpp + src/pytorch/messages/messages.cpp + src/pytorch/messages/passMessagesCPU.cpp + src/pytorch/messages/passMessagesCUDA.cu src/schnet/CpuCFConv.cpp src/schnet/CudaCFConv.cu) @@ -28,7 +31,7 @@ set(SRC_FILES src/ani/CpuANISymmetryFunctions.cpp set(LIBRARY ${NAME}PyTorch) add_library(${LIBRARY} SHARED ${SRC_FILES}) set_property(TARGET ${LIBRARY} PROPERTY CXX_STANDARD 14) -target_include_directories(${LIBRARY} PRIVATE src/ani src/schnet) +target_include_directories(${LIBRARY} PRIVATE ${PYTHON_INCLUDE_DIRS} src/ani src/schnet) target_link_libraries(${LIBRARY} ${TORCH_LIBRARIES} ${PYTHON_LIBRARIES}) if(ENABLE_CUDA) set_property(TARGET ${LIBRARY} PROPERTY CUDA_STANDARD 14) @@ -72,4 +75,7 @@ install(FILES src/pytorch/__init__.py src/pytorch/OptimizedTorchANI.py src/pytorch/SpeciesConverter.py src/pytorch/SymmetryFunctions.py - DESTINATION ${Python_SITEARCH}/${NAME}) \ No newline at end of file + DESTINATION ${Python_SITEARCH}/${NAME}) +install(FILES src/pytorch/messages/__init__.py + src/pytorch/messages/passMessages.py + DESTINATION ${Python_SITEARCH}/${NAME}/messages) \ No newline at end of file diff --git a/src/pytorch/messages/__init__.py b/src/pytorch/messages/__init__.py index 308a3b2..978fc67 100644 --- a/src/pytorch/messages/__init__.py +++ b/src/pytorch/messages/__init__.py @@ -1,10 +1,5 @@ -import os -import torch as pt -from torch.utils import cpp_extension +''' +Message passing operations +''' -src_dir = os.path.dirname(__file__) -sources = ['messages.cpp', 'messages_cpu.cpp'] + (['messages_cuda.cu'] if pt.cuda.is_available() else []) -sources = [os.path.join(src_dir, name) for name in sources] - -cpp_extension.load(name='messages', sources=sources, is_python_module=False) -pass_messages = pt.ops.messages.pass_messages \ No newline at end of file +from NNPOps.messages.passMessages import passMessages \ No newline at end of file diff --git a/src/pytorch/messages/messages.cpp b/src/pytorch/messages/messages.cpp index a13e8b4..c30adc6 100644 --- a/src/pytorch/messages/messages.cpp +++ b/src/pytorch/messages/messages.cpp @@ -1,5 +1,5 @@ #include TORCH_LIBRARY(messages, m) { - m.def("pass_messages(Tensor neighbors, Tensor messages, Tensor states) -> (Tensor messages)"); + m.def("passMessages(Tensor neighbors, Tensor messages, Tensor states) -> (Tensor states)"); } \ No newline at end of file diff --git a/src/pytorch/messages/passMessages.py b/src/pytorch/messages/passMessages.py new file mode 100644 index 0000000..d04a0e6 --- /dev/null +++ b/src/pytorch/messages/passMessages.py @@ -0,0 +1,9 @@ +from torch import ops, Tensor + + +def getNeighborPairs(neighbors: Tensor, messages: Tensor, states: Tensor) -> Tensor: + ''' + TODO + ''' + + return ops.messages.passMessages(neighbors, messages, states) \ No newline at end of file diff --git a/src/pytorch/messages/messages_cpu.cpp b/src/pytorch/messages/passMessagesCPU.cpp similarity index 98% rename from src/pytorch/messages/messages_cpu.cpp rename to src/pytorch/messages/passMessagesCPU.cpp index ca7672f..ae3d4cb 100644 --- a/src/pytorch/messages/messages_cpu.cpp +++ b/src/pytorch/messages/passMessagesCPU.cpp @@ -39,5 +39,5 @@ static Tensor forward(const Tensor& neighbors, const Tensor& messages, const Ten } TORCH_LIBRARY_IMPL(messages, CPU, m) { - m.impl("pass_messages", &forward); + m.impl("passMessages", &forward); } \ No newline at end of file diff --git a/src/pytorch/messages/messages_cuda.cu b/src/pytorch/messages/passMessagesCUDA.cu similarity index 97% rename from src/pytorch/messages/messages_cuda.cu rename to src/pytorch/messages/passMessagesCUDA.cu index 661a038..68ff73d 100644 --- a/src/pytorch/messages/messages_cuda.cu +++ b/src/pytorch/messages/passMessagesCUDA.cu @@ -80,7 +80,7 @@ public: Tensor new_states = states.clone(); - AT_DISPATCH_FLOATING_TYPES(messages.scalar_type(), "pass_messages_forward", [&]() { + AT_DISPATCH_FLOATING_TYPES(messages.scalar_type(), "passMessages::forward", [&]() { const CUDAStreamGuard guard(stream); kernel_forward<<>>( get_accessor(neighbors), @@ -107,7 +107,7 @@ public: Tensor grad_messages = torch::zeros({num_neighbors, num_features}, grad_new_state.options()); - AT_DISPATCH_FLOATING_TYPES(grad_new_state.scalar_type(), "pass_messages_backward", [&]() { + AT_DISPATCH_FLOATING_TYPES(grad_new_state.scalar_type(), "passMessages::backward", [&]() { const CUDAStreamGuard guard(stream); kernel_backward<<>>( get_accessor(neighbors), @@ -122,7 +122,7 @@ public: }; TORCH_LIBRARY_IMPL(messages, AutogradCUDA, m) { - m.impl("pass_messages", [](const Tensor& neighbors, + m.impl("passMessages", [](const Tensor& neighbors, const Tensor& messages, const Tensor& states) { return Autograd::apply(neighbors, messages, states)[0]; From b9de467efadd01b3173250c90a68a17a7fc29bd4 Mon Sep 17 00:00:00 2001 From: Raimondas Galvelis Date: Tue, 7 Jun 2022 18:21:50 +0200 Subject: [PATCH 03/15] Support pre-Kepler GPUs --- src/pytorch/messages/passMessagesCUDA.cu | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/pytorch/messages/passMessagesCUDA.cu b/src/pytorch/messages/passMessagesCUDA.cu index 68ff73d..c422dfc 100644 --- a/src/pytorch/messages/passMessagesCUDA.cu +++ b/src/pytorch/messages/passMessagesCUDA.cu @@ -21,6 +21,21 @@ inline Accessor get_accessor(const Tensor& tensor) { return tensor.packed_accessor32(); }; +// Support pre-Kepler GPUs +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +__device__ double atomicAdd(double* address, double val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} +#endif + template __global__ void kernel_forward( const Accessor neighbors, const Accessor messages, From 407023621301e356b404647e20e941ef269d1cce Mon Sep 17 00:00:00 2001 From: Raimondas Galvelis Date: Tue, 14 Jun 2022 20:05:55 +0200 Subject: [PATCH 04/15] Fix a merging artifact --- CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e963e7f..305a791 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,15 +21,12 @@ set(SRC_FILES src/ani/CpuANISymmetryFunctions.cpp src/pytorch/CFConv.cpp src/pytorch/CFConvNeighbors.cpp src/pytorch/SymmetryFunctions.cpp -<<<<<<< HEAD src/pytorch/messages/messages.cpp src/pytorch/messages/passMessagesCPU.cpp src/pytorch/messages/passMessagesCUDA.cu -======= src/pytorch/neighbors/getNeighborPairsCPU.cpp src/pytorch/neighbors/getNeighborPairsCUDA.cu src/pytorch/neighbors/neighbors.cpp ->>>>>>> master src/schnet/CpuCFConv.cpp src/schnet/CudaCFConv.cu) From 7f47b197e72cb6a248e2866a3791d82393c55558 Mon Sep 17 00:00:00 2001 From: Raimondas Galvelis Date: Tue, 14 Jun 2022 20:07:04 +0200 Subject: [PATCH 05/15] Fix another merge artifact --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 305a791..080b4bf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,7 +83,7 @@ install(FILES src/pytorch/__init__.py DESTINATION ${Python_SITEARCH}/${NAME}) install(FILES src/pytorch/messages/__init__.py src/pytorch/messages/passMessages.py - DESTINATION ${Python_SITEARCH}/${NAME}) + DESTINATION ${Python_SITEARCH}/${NAME}/messages) install(FILES src/pytorch/neighbors/__init__.py src/pytorch/neighbors/getNeighborPairs.py DESTINATION ${Python_SITEARCH}/${NAME}/neighbors) \ No newline at end of file From 6f220798033db42d80b2590a472714ad038f5962 Mon Sep 17 00:00:00 2001 From: Raimondas Galvelis Date: Wed, 15 Jun 2022 16:34:33 +0200 Subject: [PATCH 06/15] Factor out atomicAdd --- CMakeLists.txt | 3 ++- src/pytorch/common/atomicAdd.cuh | 19 +++++++++++++++++++ src/pytorch/messages/passMessagesCUDA.cu | 17 ++--------------- src/pytorch/neighbors/getNeighborPairsCUDA.cu | 17 ++--------------- 4 files changed, 25 insertions(+), 31 deletions(-) create mode 100644 src/pytorch/common/atomicAdd.cuh diff --git a/CMakeLists.txt b/CMakeLists.txt index 080b4bf..37a31e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,7 +34,8 @@ set(SRC_FILES src/ani/CpuANISymmetryFunctions.cpp set(LIBRARY ${NAME}PyTorch) add_library(${LIBRARY} SHARED ${SRC_FILES}) set_property(TARGET ${LIBRARY} PROPERTY CXX_STANDARD 14) -target_include_directories(${LIBRARY} PRIVATE ${PYTHON_INCLUDE_DIRS} src/ani src/schnet) +target_include_directories(${LIBRARY} PRIVATE ${PYTHON_INCLUDE_DIRS} + src/ani src/pytorch src/schnet) target_link_libraries(${LIBRARY} ${TORCH_LIBRARIES} ${PYTHON_LIBRARIES}) if(ENABLE_CUDA) set_property(TARGET ${LIBRARY} PROPERTY CUDA_STANDARD 14) diff --git a/src/pytorch/common/atomicAdd.cuh b/src/pytorch/common/atomicAdd.cuh new file mode 100644 index 0000000..2d6ac48 --- /dev/null +++ b/src/pytorch/common/atomicAdd.cuh @@ -0,0 +1,19 @@ +/* +Implement atomicAdd with double precision numbers for pre-Pascal GPUs. +Taken from https://stackoverflow.com/questions/37566987/cuda-atomicadd-for-doubles-definition-error +NOTE: remove when the support of CUDA 11 is dropped. +*/ + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +__device__ double atomicAdd(double* address, double val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} +#endif \ No newline at end of file diff --git a/src/pytorch/messages/passMessagesCUDA.cu b/src/pytorch/messages/passMessagesCUDA.cu index c422dfc..da2aa71 100644 --- a/src/pytorch/messages/passMessagesCUDA.cu +++ b/src/pytorch/messages/passMessagesCUDA.cu @@ -2,6 +2,8 @@ #include #include +#include "common/atomicAdd.cuh" + using c10::cuda::CUDAStreamGuard; using c10::cuda::getCurrentCUDAStream; using torch::autograd::AutogradContext; @@ -21,21 +23,6 @@ inline Accessor get_accessor(const Tensor& tensor) { return tensor.packed_accessor32(); }; -// Support pre-Kepler GPUs -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 -__device__ double atomicAdd(double* address, double val) -{ - unsigned long long int* address_as_ull = (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + __longlong_as_double(assumed))); - } while (assumed != old); - return __longlong_as_double(old); -} -#endif - template __global__ void kernel_forward( const Accessor neighbors, const Accessor messages, diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu index 1b48630..8688491 100644 --- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu +++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu @@ -4,6 +4,8 @@ #include #include +#include "common/atomicAdd.cuh" + using c10::cuda::CUDAStreamGuard; using c10::cuda::getCurrentCUDAStream; using std::make_tuple; @@ -33,21 +35,6 @@ template __device__ __forceinline__ scalar_t sqrt_(scalar_t template<> __device__ __forceinline__ float sqrt_(float x) { return ::sqrtf(x); }; template<> __device__ __forceinline__ double sqrt_(double x) { return ::sqrt(x); }; -// Support pre-Pascal GPUs. Remove when the support of CUDA 11 is dropped. -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 -__device__ double atomicAdd(double* address, double val) -{ - unsigned long long int* address_as_ull = (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + __longlong_as_double(assumed))); - } while (assumed != old); - return __longlong_as_double(old); -} -#endif - template __global__ void forward_kernel( const int32_t num_all_pairs, const Accessor positions, From b8e0089837eeb55983c0c59f7e60929472d16be4 Mon Sep 17 00:00:00 2001 From: Raimondas Galvelis Date: Wed, 15 Jun 2022 16:37:57 +0200 Subject: [PATCH 07/15] Add an include guard --- src/pytorch/common/atomicAdd.cuh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/pytorch/common/atomicAdd.cuh b/src/pytorch/common/atomicAdd.cuh index 2d6ac48..80b79ce 100644 --- a/src/pytorch/common/atomicAdd.cuh +++ b/src/pytorch/common/atomicAdd.cuh @@ -1,3 +1,6 @@ +#ifndef NNPOPS_ATOMICADD_H +#define NNPOPS_ATOMICADD_H + /* Implement atomicAdd with double precision numbers for pre-Pascal GPUs. Taken from https://stackoverflow.com/questions/37566987/cuda-atomicadd-for-doubles-definition-error @@ -16,4 +19,6 @@ __device__ double atomicAdd(double* address, double val) } while (assumed != old); return __longlong_as_double(old); } +#endif + #endif \ No newline at end of file From 71a8469dc5343f541890874f63cfc2d6ab8417e8 Mon Sep 17 00:00:00 2001 From: Raimondas Galvelis Date: Wed, 15 Jun 2022 16:51:27 +0200 Subject: [PATCH 08/15] Factor out the accessor --- src/pytorch/common/accessor.cuh | 14 ++++++++++++++ src/pytorch/messages/passMessagesCUDA.cu | 11 +---------- src/pytorch/neighbors/getNeighborPairsCUDA.cu | 11 +---------- 3 files changed, 16 insertions(+), 20 deletions(-) create mode 100644 src/pytorch/common/accessor.cuh diff --git a/src/pytorch/common/accessor.cuh b/src/pytorch/common/accessor.cuh new file mode 100644 index 0000000..a984eb1 --- /dev/null +++ b/src/pytorch/common/accessor.cuh @@ -0,0 +1,14 @@ +#ifndef NNPOPS_ACCESSOR_H +#define NNPOPS_ACCESSOR_H + +#include + +template + using Accessor = torch::PackedTensorAccessor32; + +template +inline Accessor get_accessor(const torch::Tensor& tensor) { + return tensor.packed_accessor32(); +}; + +#endif \ No newline at end of file diff --git a/src/pytorch/messages/passMessagesCUDA.cu b/src/pytorch/messages/passMessagesCUDA.cu index da2aa71..7553767 100644 --- a/src/pytorch/messages/passMessagesCUDA.cu +++ b/src/pytorch/messages/passMessagesCUDA.cu @@ -2,6 +2,7 @@ #include #include +#include "common/accessor.cuh" #include "common/atomicAdd.cuh" using c10::cuda::CUDAStreamGuard; @@ -10,19 +11,9 @@ using torch::autograd::AutogradContext; using torch::autograd::Function; using torch::autograd::tensor_list; using torch::kInt32; -using torch::PackedTensorAccessor32; -using torch::RestrictPtrTraits; using torch::Tensor; using torch::TensorOptions; -template - using Accessor = PackedTensorAccessor32; - -template -inline Accessor get_accessor(const Tensor& tensor) { - return tensor.packed_accessor32(); -}; - template __global__ void kernel_forward( const Accessor neighbors, const Accessor messages, diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu index 8688491..2ee94b7 100644 --- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu +++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu @@ -4,6 +4,7 @@ #include #include +#include "common/accessor.cuh" #include "common/atomicAdd.cuh" using c10::cuda::CUDAStreamGuard; @@ -16,21 +17,11 @@ using torch::autograd::tensor_list; using torch::empty; using torch::full; using torch::kInt32; -using torch::PackedTensorAccessor32; -using torch::RestrictPtrTraits; using torch::Scalar; using torch::Tensor; using torch::TensorOptions; using torch::zeros; -template - using Accessor = PackedTensorAccessor32; - -template -inline Accessor get_accessor(const Tensor& tensor) { - return tensor.packed_accessor32(); -}; - template __device__ __forceinline__ scalar_t sqrt_(scalar_t x) {}; template<> __device__ __forceinline__ float sqrt_(float x) { return ::sqrtf(x); }; template<> __device__ __forceinline__ double sqrt_(double x) { return ::sqrt(x); }; From 982e5b5e620231df433ffd304bb30855de6e8fa4 Mon Sep 17 00:00:00 2001 From: Raimondas Galvelis Date: Tue, 2 Aug 2022 15:53:23 +0200 Subject: [PATCH 09/15] Fix a function name --- src/pytorch/messages/passMessages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch/messages/passMessages.py b/src/pytorch/messages/passMessages.py index d04a0e6..b16aa3a 100644 --- a/src/pytorch/messages/passMessages.py +++ b/src/pytorch/messages/passMessages.py @@ -1,7 +1,7 @@ from torch import ops, Tensor -def getNeighborPairs(neighbors: Tensor, messages: Tensor, states: Tensor) -> Tensor: +def passMessages(neighbors: Tensor, messages: Tensor, states: Tensor) -> Tensor: ''' TODO ''' From c0c6142fe735cb6bd02e9afd8ae81cff9ea39b41 Mon Sep 17 00:00:00 2001 From: Raimondas Galvelis Date: Thu, 18 Aug 2022 18:23:53 +0200 Subject: [PATCH 10/15] Add the message passing tests --- src/pytorch/messages/TestMessages.py | 90 ++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 src/pytorch/messages/TestMessages.py diff --git a/src/pytorch/messages/TestMessages.py b/src/pytorch/messages/TestMessages.py new file mode 100644 index 0000000..925b3d2 --- /dev/null +++ b/src/pytorch/messages/TestMessages.py @@ -0,0 +1,90 @@ +import pytest +import torch as pt +from NNPOps.messages import passMessages + + +@pytest.mark.parametrize('device', ['cpu', 'cuda']) +@pytest.mark.parametrize('dtype', [pt.float32, pt.float64]) +@pytest.mark.parametrize('num_pairs', [1, 2, 3, 4, 5, 10, 100]) +@pytest.mark.parametrize('num_atoms', [1, 2, 3, 4, 5, 10, 100]) +@pytest.mark.parametrize('num_states', [32, 64, 1024]) +def testPassMessageValues(device, dtype, num_pairs, num_atoms, num_states): + + device = pt.device(device) + if not pt.cuda.is_available() and device.is_cuda(): + pytest.skip('No GPU') + + # Generate random neighbors + neighbors = pt.randint(0, num_atoms, (2, num_pairs), dtype=pt.int32, device=device) + neighbors[:, pt.rand(num_pairs) > 0.5] = -1 + + # Generate random messages and states + messages = pt.randn((num_pairs, num_states), dtype=dtype, device=device) + states = pt.randn((num_atoms, num_states), dtype=dtype, device=device) + + # Compute reference + mask = pt.logical_and(neighbors[0] > -1, neighbors[1] > -1) + masked_neighbors = neighbors[:, mask].to(pt.long) + masked_messages = messages[mask, :] + ref_new_states = states.index_add(0, masked_neighbors[0], masked_messages)\ + .index_add(0, masked_neighbors[1], masked_messages) + + # Compute results + new_states = passMessages(neighbors, messages, states) + + # Check data type and device + assert new_states.device == neighbors.device + assert new_states.dtype == dtype + + # Check values + if dtype == pt.float32: + assert pt.allclose(ref_new_states, new_states, atol=1e-6, rtol=1e-4) + else: + assert pt.allclose(ref_new_states, new_states, atol=1e-12, rtol=1e-8) + +@pytest.mark.parametrize('dtype', [pt.float32, pt.float64]) +@pytest.mark.parametrize('num_pairs', [1, 2, 3, 4, 5, 10, 100]) +@pytest.mark.parametrize('num_atoms', [1, 2, 3, 4, 5, 10, 100]) +@pytest.mark.parametrize('num_states', [32, 64, 1024]) +def testPassMessagesGrads(dtype, num_pairs, num_atoms, num_states): + + if not pt.cuda.is_available(): + pytest.skip('No GPU') + + # Generate random neighbors + neighbors = pt.randint(0, num_atoms, (2, num_pairs), dtype=pt.int32) + neighbors[:, pt.rand(num_pairs) > 0.5] = -1 + + # Generate random messages and states + messages = pt.randn((num_pairs, num_states), dtype=dtype) + states = pt.randn((num_atoms, num_states), dtype=dtype) + + # Compute CPU gradients + neighbors_cpu = neighbors.detach().cpu() + messages_cpu = messages.detach().cpu() + states_cpu = states.detach().cpu() + messages_cpu.requires_grad_() + states_cpu.requires_grad_() + passMessages(neighbors_cpu, messages_cpu, states_cpu).norm().backward() + + # Compute CUDA gradients + neighbors_cuda = neighbors.detach().cuda() + messages_cuda = messages.detach().cuda() + states_cuda = states.detach().cuda() + messages_cuda.requires_grad_() + states_cuda.requires_grad_() + passMessages(neighbors_cuda, messages_cuda, states_cuda).norm().backward() + + # Check type and device + assert messages_cuda.grad.dtype == dtype + assert states_cuda.grad.dtype == dtype + assert messages_cuda.grad.device == neighbors_cuda.device + assert states_cuda.grad.device == neighbors_cuda.device + + # Check gradients + if dtype == pt.float32: + assert pt.allclose(messages_cpu.grad, messages_cuda.grad.cpu(), atol=1e-6, rtol=1e-4) + assert pt.allclose(states_cpu.grad, states_cuda.grad.cpu(), atol=1e-6, rtol=1e-4) + else: + assert pt.allclose(messages_cpu.grad, messages_cuda.grad.cpu(), atol=1e-12, rtol=1e-8) + assert pt.allclose(states_cpu.grad, states_cuda.grad.cpu(), atol=1e-12, rtol=1e-8) \ No newline at end of file From 5b6507eeeea532df71b55bafb2c32362c2ad9f12 Mon Sep 17 00:00:00 2001 From: Raimondas Galvelis Date: Fri, 19 Aug 2022 15:14:45 +0200 Subject: [PATCH 11/15] Tune the test tolerances --- src/pytorch/messages/TestMessages.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/pytorch/messages/TestMessages.py b/src/pytorch/messages/TestMessages.py index 925b3d2..7198d0f 100644 --- a/src/pytorch/messages/TestMessages.py +++ b/src/pytorch/messages/TestMessages.py @@ -37,7 +37,9 @@ def testPassMessageValues(device, dtype, num_pairs, num_atoms, num_states): assert new_states.dtype == dtype # Check values - if dtype == pt.float32: + if dtype == pt.float32 and num_pairs > 10 and num_atoms < 10: + assert pt.allclose(ref_new_states, new_states, atol=1e-5, rtol=1e-3) + elif dtype == pt.float32: assert pt.allclose(ref_new_states, new_states, atol=1e-6, rtol=1e-4) else: assert pt.allclose(ref_new_states, new_states, atol=1e-12, rtol=1e-8) From 88262198de1284de52cad0cec83a25e292243344 Mon Sep 17 00:00:00 2001 From: Raimondas Galvelis Date: Fri, 19 Aug 2022 15:15:13 +0200 Subject: [PATCH 12/15] Enable the tests --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 37a31e7..909134a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,6 +68,7 @@ add_test(TestEnergyShifter pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestEne add_test(TestOptimizedTorchANI pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestOptimizedTorchANI.py) add_test(TestSpeciesConverter pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestSpeciesConverter.py) add_test(TestSymmetryFunctions pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestSymmetryFunctions.py) +add_test(TestMessages pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/messages/TestMessages.py) add_test(TestNeighbors pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/neighbors/TestNeighbors.py) add_test(TestGetNeighborPairs pytest -v --doctest-modules ${CMAKE_SOURCE_DIR}/src/pytorch/neighbors/getNeighborPairs.py) From 9bf61c6d54a32189b0b4e31ea4b1419d61cf5030 Mon Sep 17 00:00:00 2001 From: Raimondas Galvelis Date: Fri, 19 Aug 2022 16:31:14 +0200 Subject: [PATCH 13/15] Fixed the tests --- src/pytorch/messages/TestMessages.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pytorch/messages/TestMessages.py b/src/pytorch/messages/TestMessages.py index 7198d0f..872731e 100644 --- a/src/pytorch/messages/TestMessages.py +++ b/src/pytorch/messages/TestMessages.py @@ -10,8 +10,7 @@ @pytest.mark.parametrize('num_states', [32, 64, 1024]) def testPassMessageValues(device, dtype, num_pairs, num_atoms, num_states): - device = pt.device(device) - if not pt.cuda.is_available() and device.is_cuda(): + if not pt.cuda.is_available() and device == 'cuda': pytest.skip('No GPU') # Generate random neighbors From 77cb4188aedcce96653ab2387d248423c561f405 Mon Sep 17 00:00:00 2001 From: Raimondas Galvelis Date: Fri, 19 Aug 2022 16:56:34 +0200 Subject: [PATCH 14/15] Add documentation --- CMakeLists.txt | 1 + src/pytorch/messages/passMessages.py | 53 +++++++++++++++++++++++++++- 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 909134a..c98b426 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,6 +70,7 @@ add_test(TestSpeciesConverter pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestSpe add_test(TestSymmetryFunctions pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestSymmetryFunctions.py) add_test(TestMessages pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/messages/TestMessages.py) add_test(TestNeighbors pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/neighbors/TestNeighbors.py) +add_test(TestPassMessages pytest -v --doctest-modules ${CMAKE_SOURCE_DIR}/src/pytorch/messages/passMessages.py) add_test(TestGetNeighborPairs pytest -v --doctest-modules ${CMAKE_SOURCE_DIR}/src/pytorch/neighbors/getNeighborPairs.py) # Installation diff --git a/src/pytorch/messages/passMessages.py b/src/pytorch/messages/passMessages.py index b16aa3a..77bdb69 100644 --- a/src/pytorch/messages/passMessages.py +++ b/src/pytorch/messages/passMessages.py @@ -3,7 +3,58 @@ def passMessages(neighbors: Tensor, messages: Tensor, states: Tensor) -> Tensor: ''' - TODO + Pass messages between the neighbor atoms. + + Given a set of `num_atoms` atoms (each atom has a state with `num_features` + features) and a set of `num_neighbors` neighbor atom pairs (each pair has a + message with `num_features` features), the messages of the pairs are added + to the corresponding atom states. + + Parameters + ---------- + neighbors: `torch.Tensor` + Atom pair indices. The shape of the tensor is `(2, num_pairs)`. + The indices can be `[0, num_atoms-1)` or `-1` (ignored pairs). + See for the documentation of `NNPOps.neighbors.getNeighborPairs` for + details. + messages: `torch.Tensor` + Atom pair messages. The shape of the tensor is `(num_pairs, num_features)`. + For efficient, `num_features` has to be a multiple of 32 and <= 1024. + states: `torch.Tensor` + Atom states. The shape of the tensor is `(num_atoms, num_features)`. + + Returns + ------- + new_states: `torch.Tensor` + Update atom states. The shape of the tensor is `(num_atoms, num_features)`. + + Note + ---- + The operation is compatible with CUDA Grahps, i.e. the shapes of the output + tensors are independed of the values of input tensors. + + Examples + -------- + >>> import torch as pt + >>> from NNPOps.messages import passMessages + + >>> num_atoms = 4 + >>> num_neigbors = 3 + >>> num_features = 32 + + >>> neighbors = pt.tensor([[0, -1, 1], [0, -1, 3]], dtype=pt.int32) + + >>> messages = pt.ones((num_neigbors, 32)); messages[1] = 5 + >>> messages[:, 0] + tensor([1., 5., 1.]) + + >>> states = pt.zeros((num_atoms, num_features)); states[1] = 3 + >>> states[:, 0] + tensor([0., 3., 0., 0.]) + + >>> new_states = passMessages(neighbors, messages, states) + >>> new_states[:, 0] + tensor([2., 4., 0., 1.]) ''' return ops.messages.passMessages(neighbors, messages, states) \ No newline at end of file From 47c2495c803331b6040ebbfdc5d48c762b5c2cc0 Mon Sep 17 00:00:00 2001 From: Raimondas Galvelis Date: Wed, 7 Sep 2022 18:24:57 +0200 Subject: [PATCH 15/15] Fix a typo --- src/pytorch/messages/passMessages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch/messages/passMessages.py b/src/pytorch/messages/passMessages.py index 77bdb69..c13ca8a 100644 --- a/src/pytorch/messages/passMessages.py +++ b/src/pytorch/messages/passMessages.py @@ -14,7 +14,7 @@ def passMessages(neighbors: Tensor, messages: Tensor, states: Tensor) -> Tensor: ---------- neighbors: `torch.Tensor` Atom pair indices. The shape of the tensor is `(2, num_pairs)`. - The indices can be `[0, num_atoms-1)` or `-1` (ignored pairs). + The indices can be `[0, num_atoms)` or `-1` (ignored pairs). See for the documentation of `NNPOps.neighbors.getNeighborPairs` for details. messages: `torch.Tensor`