From 1a44054cb2aa87e8df40bafa1f410de412bbcc7b Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis <r.galvelis@acellera.com>
Date: Tue, 7 Jun 2022 17:13:19 +0200
Subject: [PATCH 01/15] Move the code from
 https://github.com/openmm/NNPOps/pull/58

---
 src/pytorch/messages/__init__.py      |  10 ++
 src/pytorch/messages/messages.cpp     |   5 +
 src/pytorch/messages/messages_cpu.cpp |  43 +++++++++
 src/pytorch/messages/messages_cuda.cu | 130 ++++++++++++++++++++++++++
 4 files changed, 188 insertions(+)
 create mode 100644 src/pytorch/messages/__init__.py
 create mode 100644 src/pytorch/messages/messages.cpp
 create mode 100644 src/pytorch/messages/messages_cpu.cpp
 create mode 100644 src/pytorch/messages/messages_cuda.cu

diff --git a/src/pytorch/messages/__init__.py b/src/pytorch/messages/__init__.py
new file mode 100644
index 0000000..308a3b2
--- /dev/null
+++ b/src/pytorch/messages/__init__.py
@@ -0,0 +1,10 @@
+import os
+import torch as pt
+from torch.utils import cpp_extension
+
+src_dir = os.path.dirname(__file__)
+sources = ['messages.cpp', 'messages_cpu.cpp'] + (['messages_cuda.cu'] if pt.cuda.is_available() else [])
+sources = [os.path.join(src_dir, name) for name in sources]
+
+cpp_extension.load(name='messages', sources=sources, is_python_module=False)
+pass_messages = pt.ops.messages.pass_messages
\ No newline at end of file
diff --git a/src/pytorch/messages/messages.cpp b/src/pytorch/messages/messages.cpp
new file mode 100644
index 0000000..a13e8b4
--- /dev/null
+++ b/src/pytorch/messages/messages.cpp
@@ -0,0 +1,5 @@
+#include <torch/extension.h>
+
+TORCH_LIBRARY(messages, m) {
+    m.def("pass_messages(Tensor neighbors, Tensor messages, Tensor states) -> (Tensor messages)");
+}
\ No newline at end of file
diff --git a/src/pytorch/messages/messages_cpu.cpp b/src/pytorch/messages/messages_cpu.cpp
new file mode 100644
index 0000000..ca7672f
--- /dev/null
+++ b/src/pytorch/messages/messages_cpu.cpp
@@ -0,0 +1,43 @@
+#include <torch/extension.h>
+
+using torch::kInt32;
+using torch::logical_and;
+using torch::Tensor;
+
+static Tensor forward(const Tensor& neighbors, const Tensor& messages, const Tensor& states) {
+
+    TORCH_CHECK(neighbors.dim() == 2, "Expected \"neighbors\" to have two dimensions");
+    TORCH_CHECK(neighbors.size(0) == 2, "Expected the 2nd dimension size of \"neighbors\" to be 2");
+    TORCH_CHECK(neighbors.scalar_type() == kInt32, "Expected \"neighbors\" to have data type of int32");
+    TORCH_CHECK(neighbors.is_contiguous(), "Expected \"neighbors\" to be contiguous");
+
+    TORCH_CHECK(messages.dim() == 2, "Expected \"messages\" to have two dimensions");
+    TORCH_CHECK(messages.size(1) % 32 == 0, "Expected the 2nd dimension size of \"messages\" to be a multiple of 32");
+    TORCH_CHECK(messages.size(1) <= 1024, "Expected the 2nd dimension size of \"messages\" to be less than 1024");
+    TORCH_CHECK(messages.is_contiguous(), "Expected \"messages\" to be contiguous");
+
+    TORCH_CHECK(states.dim() == 2, "Expected \"states\" to have two dimensions");
+    TORCH_CHECK(states.size(1) == messages.size(1), "Expected the 2nd dimension size of \"messages\" and \"states\" to be the same");
+    TORCH_CHECK(states.scalar_type() == messages.scalar_type(), "Expected the data type of \"messages\" and \"states\" to be the same");
+    TORCH_CHECK(states.is_contiguous(), "Expected \"messages\" to be contiguous");
+
+    const Tensor rows = neighbors[0];
+    const Tensor columns = neighbors[1];
+
+    const int num_features = messages.size(1);
+
+    const Tensor mask = logical_and(rows > -1, columns > -1);
+    const Tensor masked_rows = rows.masked_select(mask).to(torch::kLong);
+    const Tensor masked_columns = columns.masked_select(mask).to(torch::kLong);
+    const Tensor masked_messages = messages.masked_select(mask.unsqueeze(1)).reshape({-1, num_features});
+
+    Tensor new_states = states.clone();
+    new_states.index_add_(0, masked_rows, masked_messages);
+    new_states.index_add_(0, masked_columns, masked_messages);
+
+    return new_states;
+}
+
+TORCH_LIBRARY_IMPL(messages, CPU, m) {
+    m.impl("pass_messages", &forward);
+}
\ No newline at end of file
diff --git a/src/pytorch/messages/messages_cuda.cu b/src/pytorch/messages/messages_cuda.cu
new file mode 100644
index 0000000..661a038
--- /dev/null
+++ b/src/pytorch/messages/messages_cuda.cu
@@ -0,0 +1,130 @@
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/cuda/CUDAStream.h>
+#include <torch/extension.h>
+
+using c10::cuda::CUDAStreamGuard;
+using c10::cuda::getCurrentCUDAStream;
+using torch::autograd::AutogradContext;
+using torch::autograd::Function;
+using torch::autograd::tensor_list;
+using torch::kInt32;
+using torch::PackedTensorAccessor32;
+using torch::RestrictPtrTraits;
+using torch::Tensor;
+using torch::TensorOptions;
+
+template <typename scalar_t, int num_dims>
+    using Accessor = PackedTensorAccessor32<scalar_t, num_dims, RestrictPtrTraits>;
+
+template <typename scalar_t, int num_dims> 
+inline Accessor<scalar_t, num_dims> get_accessor(const Tensor& tensor) {
+    return tensor.packed_accessor32<scalar_t, num_dims, RestrictPtrTraits>();
+};
+
+template <typename scalar_t> __global__ void kernel_forward(
+    const Accessor<int32_t, 2> neighbors,
+    const Accessor<scalar_t, 2> messages,
+    Accessor<scalar_t, 2> new_states
+) {
+    const int32_t i_neig = blockIdx.x;
+    const int32_t i_dir = blockIdx.y;
+    const int32_t i_atom = neighbors[i_dir][i_neig];
+    if (i_atom < 0) return;
+
+    const int32_t i_feat = threadIdx.x;
+    atomicAdd(&new_states[i_atom][i_feat], messages[i_neig][i_feat]);
+}
+
+template <typename scalar_t> __global__ void kernel_backward(
+    const Accessor<int32_t, 2> neighbors,
+    const Accessor<scalar_t, 2> grad_new_state,
+    Accessor<scalar_t, 2> grad_messages
+) {
+    const int32_t i_neig = blockIdx.x;
+    const int32_t i_dir = blockIdx.y;
+    const int32_t i_atom = neighbors[i_dir][i_neig];
+    if (i_atom < 0) return;
+
+    const int32_t i_feat = threadIdx.x;
+    atomicAdd(&grad_messages[i_neig][i_feat], grad_new_state[i_atom][i_feat]);
+}
+
+class Autograd : public Function<Autograd> {
+public:
+    static tensor_list forward(AutogradContext* ctx,
+                               const Tensor& neighbors,
+                               const Tensor& messages,
+                               const Tensor& states) {
+
+        TORCH_CHECK(neighbors.dim() == 2, "Expected \"neighbors\" to have two dimensions");
+        TORCH_CHECK(neighbors.size(0) == 2, "Expected the 2nd dimension size of \"neighbors\" to be 2");
+        TORCH_CHECK(neighbors.scalar_type() == kInt32, "Expected \"neighbors\" to have data type of int32");
+        TORCH_CHECK(neighbors.is_contiguous(), "Expected \"neighbors\" to be contiguous");
+
+        TORCH_CHECK(messages.dim() == 2, "Expected \"messages\" to have two dimensions");
+        TORCH_CHECK(messages.size(1) % 32 == 0, "Expected the 2nd dimension size of \"messages\" to be a multiple of 32");
+        TORCH_CHECK(messages.size(1) <= 1024, "Expected the 2nd dimension size of \"messages\" to be less than 1024");
+        TORCH_CHECK(messages.is_contiguous(), "Expected \"messages\" to be contiguous");
+
+        TORCH_CHECK(states.dim() == 2, "Expected \"states\" to have two dimensions");
+        TORCH_CHECK(states.size(1) == messages.size(1), "Expected the 2nd dimension size of \"messages\" and \"states\" to be the same");
+        TORCH_CHECK(states.scalar_type() == messages.scalar_type(), "Expected the data type of \"messages\" and \"states\" to be the same");
+        TORCH_CHECK(states.is_contiguous(), "Expected \"messages\" to be contiguous");
+
+        const int num_neighbors = neighbors.size(1);
+        const int num_features = messages.size(1);
+
+        const dim3 blocks(num_neighbors, 2);
+        const dim3 threads(num_features);
+        const auto stream = getCurrentCUDAStream(neighbors.get_device());
+
+        Tensor new_states = states.clone();
+
+        AT_DISPATCH_FLOATING_TYPES(messages.scalar_type(), "pass_messages_forward", [&]() {
+            const CUDAStreamGuard guard(stream);
+            kernel_forward<<<blocks, threads, 0, stream>>>(
+                get_accessor<int32_t, 2>(neighbors),
+                get_accessor<scalar_t, 2>(messages),
+                get_accessor<scalar_t, 2>(new_states));
+        });
+
+        ctx->save_for_backward({neighbors});
+
+        return {new_states};
+    }
+
+    static tensor_list backward(AutogradContext* ctx, tensor_list grad_inputs) {
+
+        const Tensor neighbors = ctx->get_saved_variables()[0];
+        const Tensor grad_new_state = grad_inputs[0];
+
+        const int num_neighbors = neighbors.size(1);
+        const int num_features = grad_new_state.size(1);
+
+        const dim3 blocks(num_neighbors, 2);
+        const dim3 threads(num_features);
+        const auto stream = getCurrentCUDAStream(neighbors.get_device());
+
+        Tensor grad_messages = torch::zeros({num_neighbors, num_features}, grad_new_state.options());
+
+        AT_DISPATCH_FLOATING_TYPES(grad_new_state.scalar_type(), "pass_messages_backward", [&]() {
+            const CUDAStreamGuard guard(stream);
+            kernel_backward<<<blocks, threads, 0, stream>>>(
+                get_accessor<int32_t, 2>(neighbors),
+                get_accessor<scalar_t, 2>(grad_new_state),
+                get_accessor<scalar_t, 2>(grad_messages));
+        });
+
+        return {Tensor(), // grad_neighbors
+                grad_messages,
+                grad_new_state.clone()}; // grad_state
+    }
+};
+
+TORCH_LIBRARY_IMPL(messages, AutogradCUDA, m) {
+    m.impl("pass_messages", [](const Tensor& neighbors,
+                               const Tensor& messages,
+                               const Tensor& states) {
+        return Autograd::apply(neighbors, messages, states)[0];
+    });
+}
\ No newline at end of file

From 8fd009dd1d908c49c1ec6f935f0c8a07dcb551f5 Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis <r.galvelis@acellera.com>
Date: Tue, 7 Jun 2022 17:39:21 +0200
Subject: [PATCH 02/15] Integrate the message passing kernel

---
 CMakeLists.txt                                      | 10 ++++++++--
 src/pytorch/messages/__init__.py                    | 13 ++++---------
 src/pytorch/messages/messages.cpp                   |  2 +-
 src/pytorch/messages/passMessages.py                |  9 +++++++++
 .../{messages_cpu.cpp => passMessagesCPU.cpp}       |  2 +-
 .../{messages_cuda.cu => passMessagesCUDA.cu}       |  6 +++---
 6 files changed, 26 insertions(+), 16 deletions(-)
 create mode 100644 src/pytorch/messages/passMessages.py
 rename src/pytorch/messages/{messages_cpu.cpp => passMessagesCPU.cpp} (98%)
 rename src/pytorch/messages/{messages_cuda.cu => passMessagesCUDA.cu} (97%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 98722ca..26f9c7f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,6 +21,9 @@ set(SRC_FILES src/ani/CpuANISymmetryFunctions.cpp
               src/pytorch/CFConv.cpp
               src/pytorch/CFConvNeighbors.cpp
               src/pytorch/SymmetryFunctions.cpp
+              src/pytorch/messages/messages.cpp
+              src/pytorch/messages/passMessagesCPU.cpp
+              src/pytorch/messages/passMessagesCUDA.cu
               src/schnet/CpuCFConv.cpp
               src/schnet/CudaCFConv.cu)
 
@@ -28,7 +31,7 @@ set(SRC_FILES src/ani/CpuANISymmetryFunctions.cpp
 set(LIBRARY ${NAME}PyTorch)
 add_library(${LIBRARY} SHARED ${SRC_FILES})
 set_property(TARGET ${LIBRARY} PROPERTY CXX_STANDARD 14)
-target_include_directories(${LIBRARY} PRIVATE src/ani src/schnet)
+target_include_directories(${LIBRARY} PRIVATE ${PYTHON_INCLUDE_DIRS} src/ani src/schnet)
 target_link_libraries(${LIBRARY} ${TORCH_LIBRARIES} ${PYTHON_LIBRARIES})
 if(ENABLE_CUDA)
     set_property(TARGET ${LIBRARY} PROPERTY CUDA_STANDARD 14)
@@ -72,4 +75,7 @@ install(FILES src/pytorch/__init__.py
               src/pytorch/OptimizedTorchANI.py
               src/pytorch/SpeciesConverter.py
               src/pytorch/SymmetryFunctions.py
-        DESTINATION ${Python_SITEARCH}/${NAME})
\ No newline at end of file
+        DESTINATION ${Python_SITEARCH}/${NAME})
+install(FILES src/pytorch/messages/__init__.py
+              src/pytorch/messages/passMessages.py
+        DESTINATION ${Python_SITEARCH}/${NAME}/messages)
\ No newline at end of file
diff --git a/src/pytorch/messages/__init__.py b/src/pytorch/messages/__init__.py
index 308a3b2..978fc67 100644
--- a/src/pytorch/messages/__init__.py
+++ b/src/pytorch/messages/__init__.py
@@ -1,10 +1,5 @@
-import os
-import torch as pt
-from torch.utils import cpp_extension
+'''
+Message passing operations
+'''
 
-src_dir = os.path.dirname(__file__)
-sources = ['messages.cpp', 'messages_cpu.cpp'] + (['messages_cuda.cu'] if pt.cuda.is_available() else [])
-sources = [os.path.join(src_dir, name) for name in sources]
-
-cpp_extension.load(name='messages', sources=sources, is_python_module=False)
-pass_messages = pt.ops.messages.pass_messages
\ No newline at end of file
+from NNPOps.messages.passMessages import passMessages
\ No newline at end of file
diff --git a/src/pytorch/messages/messages.cpp b/src/pytorch/messages/messages.cpp
index a13e8b4..c30adc6 100644
--- a/src/pytorch/messages/messages.cpp
+++ b/src/pytorch/messages/messages.cpp
@@ -1,5 +1,5 @@
 #include <torch/extension.h>
 
 TORCH_LIBRARY(messages, m) {
-    m.def("pass_messages(Tensor neighbors, Tensor messages, Tensor states) -> (Tensor messages)");
+    m.def("passMessages(Tensor neighbors, Tensor messages, Tensor states) -> (Tensor states)");
 }
\ No newline at end of file
diff --git a/src/pytorch/messages/passMessages.py b/src/pytorch/messages/passMessages.py
new file mode 100644
index 0000000..d04a0e6
--- /dev/null
+++ b/src/pytorch/messages/passMessages.py
@@ -0,0 +1,9 @@
+from torch import ops, Tensor
+
+
+def getNeighborPairs(neighbors: Tensor, messages: Tensor, states: Tensor) -> Tensor:
+    '''
+    TODO
+    '''
+
+    return ops.messages.passMessages(neighbors, messages, states)
\ No newline at end of file
diff --git a/src/pytorch/messages/messages_cpu.cpp b/src/pytorch/messages/passMessagesCPU.cpp
similarity index 98%
rename from src/pytorch/messages/messages_cpu.cpp
rename to src/pytorch/messages/passMessagesCPU.cpp
index ca7672f..ae3d4cb 100644
--- a/src/pytorch/messages/messages_cpu.cpp
+++ b/src/pytorch/messages/passMessagesCPU.cpp
@@ -39,5 +39,5 @@ static Tensor forward(const Tensor& neighbors, const Tensor& messages, const Ten
 }
 
 TORCH_LIBRARY_IMPL(messages, CPU, m) {
-    m.impl("pass_messages", &forward);
+    m.impl("passMessages", &forward);
 }
\ No newline at end of file
diff --git a/src/pytorch/messages/messages_cuda.cu b/src/pytorch/messages/passMessagesCUDA.cu
similarity index 97%
rename from src/pytorch/messages/messages_cuda.cu
rename to src/pytorch/messages/passMessagesCUDA.cu
index 661a038..68ff73d 100644
--- a/src/pytorch/messages/messages_cuda.cu
+++ b/src/pytorch/messages/passMessagesCUDA.cu
@@ -80,7 +80,7 @@ public:
 
         Tensor new_states = states.clone();
 
-        AT_DISPATCH_FLOATING_TYPES(messages.scalar_type(), "pass_messages_forward", [&]() {
+        AT_DISPATCH_FLOATING_TYPES(messages.scalar_type(), "passMessages::forward", [&]() {
             const CUDAStreamGuard guard(stream);
             kernel_forward<<<blocks, threads, 0, stream>>>(
                 get_accessor<int32_t, 2>(neighbors),
@@ -107,7 +107,7 @@ public:
 
         Tensor grad_messages = torch::zeros({num_neighbors, num_features}, grad_new_state.options());
 
-        AT_DISPATCH_FLOATING_TYPES(grad_new_state.scalar_type(), "pass_messages_backward", [&]() {
+        AT_DISPATCH_FLOATING_TYPES(grad_new_state.scalar_type(), "passMessages::backward", [&]() {
             const CUDAStreamGuard guard(stream);
             kernel_backward<<<blocks, threads, 0, stream>>>(
                 get_accessor<int32_t, 2>(neighbors),
@@ -122,7 +122,7 @@ public:
 };
 
 TORCH_LIBRARY_IMPL(messages, AutogradCUDA, m) {
-    m.impl("pass_messages", [](const Tensor& neighbors,
+    m.impl("passMessages", [](const Tensor& neighbors,
                                const Tensor& messages,
                                const Tensor& states) {
         return Autograd::apply(neighbors, messages, states)[0];

From b9de467efadd01b3173250c90a68a17a7fc29bd4 Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis <r.galvelis@acellera.com>
Date: Tue, 7 Jun 2022 18:21:50 +0200
Subject: [PATCH 03/15] Support pre-Kepler GPUs

---
 src/pytorch/messages/passMessagesCUDA.cu | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/pytorch/messages/passMessagesCUDA.cu b/src/pytorch/messages/passMessagesCUDA.cu
index 68ff73d..c422dfc 100644
--- a/src/pytorch/messages/passMessagesCUDA.cu
+++ b/src/pytorch/messages/passMessagesCUDA.cu
@@ -21,6 +21,21 @@ inline Accessor<scalar_t, num_dims> get_accessor(const Tensor& tensor) {
     return tensor.packed_accessor32<scalar_t, num_dims, RestrictPtrTraits>();
 };
 
+// Support pre-Kepler GPUs
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
+__device__ double atomicAdd(double* address, double val)
+{
+    unsigned long long int* address_as_ull = (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_ull, assumed,
+                __double_as_longlong(val + __longlong_as_double(assumed)));
+    } while (assumed != old);
+    return __longlong_as_double(old);
+}
+#endif
+
 template <typename scalar_t> __global__ void kernel_forward(
     const Accessor<int32_t, 2> neighbors,
     const Accessor<scalar_t, 2> messages,

From 407023621301e356b404647e20e941ef269d1cce Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis <r.galvelis@acellera.com>
Date: Tue, 14 Jun 2022 20:05:55 +0200
Subject: [PATCH 04/15] Fix a merging artifact

---
 CMakeLists.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e963e7f..305a791 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,15 +21,12 @@ set(SRC_FILES src/ani/CpuANISymmetryFunctions.cpp
               src/pytorch/CFConv.cpp
               src/pytorch/CFConvNeighbors.cpp
               src/pytorch/SymmetryFunctions.cpp
-<<<<<<< HEAD
               src/pytorch/messages/messages.cpp
               src/pytorch/messages/passMessagesCPU.cpp
               src/pytorch/messages/passMessagesCUDA.cu
-=======
               src/pytorch/neighbors/getNeighborPairsCPU.cpp
               src/pytorch/neighbors/getNeighborPairsCUDA.cu
               src/pytorch/neighbors/neighbors.cpp
->>>>>>> master
               src/schnet/CpuCFConv.cpp
               src/schnet/CudaCFConv.cu)
 

From 7f47b197e72cb6a248e2866a3791d82393c55558 Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis <r.galvelis@acellera.com>
Date: Tue, 14 Jun 2022 20:07:04 +0200
Subject: [PATCH 05/15] Fix another merge artifact

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 305a791..080b4bf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -83,7 +83,7 @@ install(FILES src/pytorch/__init__.py
         DESTINATION ${Python_SITEARCH}/${NAME})
 install(FILES src/pytorch/messages/__init__.py
               src/pytorch/messages/passMessages.py
-        DESTINATION ${Python_SITEARCH}/${NAME})
+        DESTINATION ${Python_SITEARCH}/${NAME}/messages)
 install(FILES src/pytorch/neighbors/__init__.py
               src/pytorch/neighbors/getNeighborPairs.py
         DESTINATION ${Python_SITEARCH}/${NAME}/neighbors)
\ No newline at end of file

From 6f220798033db42d80b2590a472714ad038f5962 Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis <r.galvelis@acellera.com>
Date: Wed, 15 Jun 2022 16:34:33 +0200
Subject: [PATCH 06/15] Factor out atomicAdd

---
 CMakeLists.txt                                |  3 ++-
 src/pytorch/common/atomicAdd.cuh              | 19 +++++++++++++++++++
 src/pytorch/messages/passMessagesCUDA.cu      | 17 ++---------------
 src/pytorch/neighbors/getNeighborPairsCUDA.cu | 17 ++---------------
 4 files changed, 25 insertions(+), 31 deletions(-)
 create mode 100644 src/pytorch/common/atomicAdd.cuh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 080b4bf..37a31e7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,7 +34,8 @@ set(SRC_FILES src/ani/CpuANISymmetryFunctions.cpp
 set(LIBRARY ${NAME}PyTorch)
 add_library(${LIBRARY} SHARED ${SRC_FILES})
 set_property(TARGET ${LIBRARY} PROPERTY CXX_STANDARD 14)
-target_include_directories(${LIBRARY} PRIVATE ${PYTHON_INCLUDE_DIRS} src/ani src/schnet)
+target_include_directories(${LIBRARY} PRIVATE ${PYTHON_INCLUDE_DIRS}
+                                              src/ani src/pytorch src/schnet)
 target_link_libraries(${LIBRARY} ${TORCH_LIBRARIES} ${PYTHON_LIBRARIES})
 if(ENABLE_CUDA)
     set_property(TARGET ${LIBRARY} PROPERTY CUDA_STANDARD 14)
diff --git a/src/pytorch/common/atomicAdd.cuh b/src/pytorch/common/atomicAdd.cuh
new file mode 100644
index 0000000..2d6ac48
--- /dev/null
+++ b/src/pytorch/common/atomicAdd.cuh
@@ -0,0 +1,19 @@
+/*
+Implement atomicAdd with double precision numbers for pre-Pascal GPUs.
+Taken from https://stackoverflow.com/questions/37566987/cuda-atomicadd-for-doubles-definition-error
+NOTE: remove when the support of CUDA 11 is dropped.
+*/
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
+__device__ double atomicAdd(double* address, double val)
+{
+    unsigned long long int* address_as_ull = (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_ull, assumed,
+                __double_as_longlong(val + __longlong_as_double(assumed)));
+    } while (assumed != old);
+    return __longlong_as_double(old);
+}
+#endif
\ No newline at end of file
diff --git a/src/pytorch/messages/passMessagesCUDA.cu b/src/pytorch/messages/passMessagesCUDA.cu
index c422dfc..da2aa71 100644
--- a/src/pytorch/messages/passMessagesCUDA.cu
+++ b/src/pytorch/messages/passMessagesCUDA.cu
@@ -2,6 +2,8 @@
 #include <c10/cuda/CUDAStream.h>
 #include <torch/extension.h>
 
+#include "common/atomicAdd.cuh"
+
 using c10::cuda::CUDAStreamGuard;
 using c10::cuda::getCurrentCUDAStream;
 using torch::autograd::AutogradContext;
@@ -21,21 +23,6 @@ inline Accessor<scalar_t, num_dims> get_accessor(const Tensor& tensor) {
     return tensor.packed_accessor32<scalar_t, num_dims, RestrictPtrTraits>();
 };
 
-// Support pre-Kepler GPUs
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
-__device__ double atomicAdd(double* address, double val)
-{
-    unsigned long long int* address_as_ull = (unsigned long long int*)address;
-    unsigned long long int old = *address_as_ull, assumed;
-    do {
-        assumed = old;
-        old = atomicCAS(address_as_ull, assumed,
-                __double_as_longlong(val + __longlong_as_double(assumed)));
-    } while (assumed != old);
-    return __longlong_as_double(old);
-}
-#endif
-
 template <typename scalar_t> __global__ void kernel_forward(
     const Accessor<int32_t, 2> neighbors,
     const Accessor<scalar_t, 2> messages,
diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
index 1b48630..8688491 100644
--- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu
+++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
@@ -4,6 +4,8 @@
 #include <algorithm>
 #include <tuple>
 
+#include "common/atomicAdd.cuh"
+
 using c10::cuda::CUDAStreamGuard;
 using c10::cuda::getCurrentCUDAStream;
 using std::make_tuple;
@@ -33,21 +35,6 @@ template <typename scalar_t> __device__ __forceinline__ scalar_t sqrt_(scalar_t
 template<> __device__ __forceinline__ float sqrt_(float x) { return ::sqrtf(x); };
 template<> __device__ __forceinline__ double sqrt_(double x) { return ::sqrt(x); };
 
-// Support pre-Pascal GPUs. Remove when the support of CUDA 11 is dropped.
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
-__device__ double atomicAdd(double* address, double val)
-{
-    unsigned long long int* address_as_ull = (unsigned long long int*)address;
-    unsigned long long int old = *address_as_ull, assumed;
-    do {
-        assumed = old;
-        old = atomicCAS(address_as_ull, assumed,
-                __double_as_longlong(val + __longlong_as_double(assumed)));
-    } while (assumed != old);
-    return __longlong_as_double(old);
-}
-#endif
-
 template <typename scalar_t> __global__ void forward_kernel(
     const int32_t num_all_pairs,
     const Accessor<scalar_t, 2> positions,

From b8e0089837eeb55983c0c59f7e60929472d16be4 Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis <r.galvelis@acellera.com>
Date: Wed, 15 Jun 2022 16:37:57 +0200
Subject: [PATCH 07/15] Add an include guard

---
 src/pytorch/common/atomicAdd.cuh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/pytorch/common/atomicAdd.cuh b/src/pytorch/common/atomicAdd.cuh
index 2d6ac48..80b79ce 100644
--- a/src/pytorch/common/atomicAdd.cuh
+++ b/src/pytorch/common/atomicAdd.cuh
@@ -1,3 +1,6 @@
+#ifndef NNPOPS_ATOMICADD_H
+#define NNPOPS_ATOMICADD_H
+
 /*
 Implement atomicAdd with double precision numbers for pre-Pascal GPUs.
 Taken from https://stackoverflow.com/questions/37566987/cuda-atomicadd-for-doubles-definition-error
@@ -16,4 +19,6 @@ __device__ double atomicAdd(double* address, double val)
     } while (assumed != old);
     return __longlong_as_double(old);
 }
+#endif
+
 #endif
\ No newline at end of file

From 71a8469dc5343f541890874f63cfc2d6ab8417e8 Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis <r.galvelis@acellera.com>
Date: Wed, 15 Jun 2022 16:51:27 +0200
Subject: [PATCH 08/15] Factor out the accessor

---
 src/pytorch/common/accessor.cuh               | 14 ++++++++++++++
 src/pytorch/messages/passMessagesCUDA.cu      | 11 +----------
 src/pytorch/neighbors/getNeighborPairsCUDA.cu | 11 +----------
 3 files changed, 16 insertions(+), 20 deletions(-)
 create mode 100644 src/pytorch/common/accessor.cuh

diff --git a/src/pytorch/common/accessor.cuh b/src/pytorch/common/accessor.cuh
new file mode 100644
index 0000000..a984eb1
--- /dev/null
+++ b/src/pytorch/common/accessor.cuh
@@ -0,0 +1,14 @@
+#ifndef NNPOPS_ACCESSOR_H
+#define NNPOPS_ACCESSOR_H
+
+#include <torch/extension.h>
+
+template <typename scalar_t, int num_dims>
+    using Accessor = torch::PackedTensorAccessor32<scalar_t, num_dims, torch::RestrictPtrTraits>;
+
+template <typename scalar_t, int num_dims> 
+inline Accessor<scalar_t, num_dims> get_accessor(const torch::Tensor& tensor) {
+    return tensor.packed_accessor32<scalar_t, num_dims, torch::RestrictPtrTraits>();
+};
+
+#endif
\ No newline at end of file
diff --git a/src/pytorch/messages/passMessagesCUDA.cu b/src/pytorch/messages/passMessagesCUDA.cu
index da2aa71..7553767 100644
--- a/src/pytorch/messages/passMessagesCUDA.cu
+++ b/src/pytorch/messages/passMessagesCUDA.cu
@@ -2,6 +2,7 @@
 #include <c10/cuda/CUDAStream.h>
 #include <torch/extension.h>
 
+#include "common/accessor.cuh"
 #include "common/atomicAdd.cuh"
 
 using c10::cuda::CUDAStreamGuard;
@@ -10,19 +11,9 @@ using torch::autograd::AutogradContext;
 using torch::autograd::Function;
 using torch::autograd::tensor_list;
 using torch::kInt32;
-using torch::PackedTensorAccessor32;
-using torch::RestrictPtrTraits;
 using torch::Tensor;
 using torch::TensorOptions;
 
-template <typename scalar_t, int num_dims>
-    using Accessor = PackedTensorAccessor32<scalar_t, num_dims, RestrictPtrTraits>;
-
-template <typename scalar_t, int num_dims> 
-inline Accessor<scalar_t, num_dims> get_accessor(const Tensor& tensor) {
-    return tensor.packed_accessor32<scalar_t, num_dims, RestrictPtrTraits>();
-};
-
 template <typename scalar_t> __global__ void kernel_forward(
     const Accessor<int32_t, 2> neighbors,
     const Accessor<scalar_t, 2> messages,
diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
index 8688491..2ee94b7 100644
--- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu
+++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
@@ -4,6 +4,7 @@
 #include <algorithm>
 #include <tuple>
 
+#include "common/accessor.cuh"
 #include "common/atomicAdd.cuh"
 
 using c10::cuda::CUDAStreamGuard;
@@ -16,21 +17,11 @@ using torch::autograd::tensor_list;
 using torch::empty;
 using torch::full;
 using torch::kInt32;
-using torch::PackedTensorAccessor32;
-using torch::RestrictPtrTraits;
 using torch::Scalar;
 using torch::Tensor;
 using torch::TensorOptions;
 using torch::zeros;
 
-template <typename scalar_t, int num_dims>
-    using Accessor = PackedTensorAccessor32<scalar_t, num_dims, RestrictPtrTraits>;
-
-template <typename scalar_t, int num_dims>
-inline Accessor<scalar_t, num_dims> get_accessor(const Tensor& tensor) {
-    return tensor.packed_accessor32<scalar_t, num_dims, RestrictPtrTraits>();
-};
-
 template <typename scalar_t> __device__ __forceinline__ scalar_t sqrt_(scalar_t x) {};
 template<> __device__ __forceinline__ float sqrt_(float x) { return ::sqrtf(x); };
 template<> __device__ __forceinline__ double sqrt_(double x) { return ::sqrt(x); };

From 982e5b5e620231df433ffd304bb30855de6e8fa4 Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis <r.galvelis@acellera.com>
Date: Tue, 2 Aug 2022 15:53:23 +0200
Subject: [PATCH 09/15] Fix a function name

---
 src/pytorch/messages/passMessages.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pytorch/messages/passMessages.py b/src/pytorch/messages/passMessages.py
index d04a0e6..b16aa3a 100644
--- a/src/pytorch/messages/passMessages.py
+++ b/src/pytorch/messages/passMessages.py
@@ -1,7 +1,7 @@
 from torch import ops, Tensor
 
 
-def getNeighborPairs(neighbors: Tensor, messages: Tensor, states: Tensor) -> Tensor:
+def passMessages(neighbors: Tensor, messages: Tensor, states: Tensor) -> Tensor:
     '''
     TODO
     '''

From c0c6142fe735cb6bd02e9afd8ae81cff9ea39b41 Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis <r.galvelis@acellera.com>
Date: Thu, 18 Aug 2022 18:23:53 +0200
Subject: [PATCH 10/15] Add the message passing tests

---
 src/pytorch/messages/TestMessages.py | 90 ++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 src/pytorch/messages/TestMessages.py

diff --git a/src/pytorch/messages/TestMessages.py b/src/pytorch/messages/TestMessages.py
new file mode 100644
index 0000000..925b3d2
--- /dev/null
+++ b/src/pytorch/messages/TestMessages.py
@@ -0,0 +1,90 @@
+import pytest
+import torch as pt
+from NNPOps.messages import passMessages
+
+
+@pytest.mark.parametrize('device', ['cpu', 'cuda'])
+@pytest.mark.parametrize('dtype', [pt.float32, pt.float64])
+@pytest.mark.parametrize('num_pairs', [1, 2, 3, 4, 5, 10, 100])
+@pytest.mark.parametrize('num_atoms', [1, 2, 3, 4, 5, 10, 100])
+@pytest.mark.parametrize('num_states', [32, 64, 1024])
+def testPassMessageValues(device, dtype, num_pairs, num_atoms, num_states):
+
+    device = pt.device(device)
+    if not pt.cuda.is_available() and device.is_cuda():
+        pytest.skip('No GPU')
+
+    # Generate random neighbors
+    neighbors = pt.randint(0, num_atoms, (2, num_pairs), dtype=pt.int32, device=device)
+    neighbors[:, pt.rand(num_pairs) > 0.5] = -1
+
+    # Generate random messages and states
+    messages = pt.randn((num_pairs, num_states), dtype=dtype, device=device)
+    states = pt.randn((num_atoms, num_states), dtype=dtype, device=device)
+
+    # Compute reference
+    mask = pt.logical_and(neighbors[0] > -1, neighbors[1] > -1)
+    masked_neighbors = neighbors[:, mask].to(pt.long)
+    masked_messages = messages[mask, :]
+    ref_new_states = states.index_add(0, masked_neighbors[0], masked_messages)\
+                           .index_add(0, masked_neighbors[1], masked_messages)
+
+    # Compute results
+    new_states = passMessages(neighbors, messages, states)
+
+    # Check data type and device
+    assert new_states.device == neighbors.device
+    assert new_states.dtype == dtype
+
+    # Check values
+    if dtype == pt.float32:
+        assert pt.allclose(ref_new_states, new_states, atol=1e-6, rtol=1e-4)
+    else:
+        assert pt.allclose(ref_new_states, new_states, atol=1e-12, rtol=1e-8)
+
+@pytest.mark.parametrize('dtype', [pt.float32, pt.float64])
+@pytest.mark.parametrize('num_pairs', [1, 2, 3, 4, 5, 10, 100])
+@pytest.mark.parametrize('num_atoms', [1, 2, 3, 4, 5, 10, 100])
+@pytest.mark.parametrize('num_states', [32, 64, 1024])
+def testPassMessagesGrads(dtype, num_pairs, num_atoms, num_states):
+
+    if not pt.cuda.is_available():
+        pytest.skip('No GPU')
+
+    # Generate random neighbors
+    neighbors = pt.randint(0, num_atoms, (2, num_pairs), dtype=pt.int32)
+    neighbors[:, pt.rand(num_pairs) > 0.5] = -1
+
+    # Generate random messages and states
+    messages = pt.randn((num_pairs, num_states), dtype=dtype)
+    states = pt.randn((num_atoms, num_states), dtype=dtype)
+
+    # Compute CPU gradients
+    neighbors_cpu = neighbors.detach().cpu()
+    messages_cpu = messages.detach().cpu()
+    states_cpu = states.detach().cpu()
+    messages_cpu.requires_grad_()
+    states_cpu.requires_grad_()
+    passMessages(neighbors_cpu, messages_cpu, states_cpu).norm().backward()
+
+    # Compute CUDA gradients
+    neighbors_cuda = neighbors.detach().cuda()
+    messages_cuda = messages.detach().cuda()
+    states_cuda = states.detach().cuda()
+    messages_cuda.requires_grad_()
+    states_cuda.requires_grad_()
+    passMessages(neighbors_cuda, messages_cuda, states_cuda).norm().backward()
+
+    # Check type and device
+    assert messages_cuda.grad.dtype == dtype
+    assert states_cuda.grad.dtype == dtype
+    assert messages_cuda.grad.device == neighbors_cuda.device
+    assert states_cuda.grad.device == neighbors_cuda.device
+
+    # Check gradients
+    if dtype == pt.float32:
+        assert pt.allclose(messages_cpu.grad, messages_cuda.grad.cpu(), atol=1e-6, rtol=1e-4)
+        assert pt.allclose(states_cpu.grad, states_cuda.grad.cpu(), atol=1e-6, rtol=1e-4)
+    else:
+        assert pt.allclose(messages_cpu.grad, messages_cuda.grad.cpu(), atol=1e-12, rtol=1e-8)
+        assert pt.allclose(states_cpu.grad, states_cuda.grad.cpu(), atol=1e-12, rtol=1e-8)
\ No newline at end of file

From 5b6507eeeea532df71b55bafb2c32362c2ad9f12 Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis <r.galvelis@acellera.com>
Date: Fri, 19 Aug 2022 15:14:45 +0200
Subject: [PATCH 11/15] Tune the test tolerances

---
 src/pytorch/messages/TestMessages.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/pytorch/messages/TestMessages.py b/src/pytorch/messages/TestMessages.py
index 925b3d2..7198d0f 100644
--- a/src/pytorch/messages/TestMessages.py
+++ b/src/pytorch/messages/TestMessages.py
@@ -37,7 +37,9 @@ def testPassMessageValues(device, dtype, num_pairs, num_atoms, num_states):
     assert new_states.dtype == dtype
 
     # Check values
-    if dtype == pt.float32:
+    if dtype == pt.float32 and num_pairs > 10 and num_atoms < 10:
+        assert pt.allclose(ref_new_states, new_states, atol=1e-5, rtol=1e-3)
+    elif dtype == pt.float32:
         assert pt.allclose(ref_new_states, new_states, atol=1e-6, rtol=1e-4)
     else:
         assert pt.allclose(ref_new_states, new_states, atol=1e-12, rtol=1e-8)

From 88262198de1284de52cad0cec83a25e292243344 Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis <r.galvelis@acellera.com>
Date: Fri, 19 Aug 2022 15:15:13 +0200
Subject: [PATCH 12/15] Enable the tests

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 37a31e7..909134a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,6 +68,7 @@ add_test(TestEnergyShifter     pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestEne
 add_test(TestOptimizedTorchANI pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestOptimizedTorchANI.py)
 add_test(TestSpeciesConverter  pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestSpeciesConverter.py)
 add_test(TestSymmetryFunctions pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestSymmetryFunctions.py)
+add_test(TestMessages          pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/messages/TestMessages.py)
 add_test(TestNeighbors         pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/neighbors/TestNeighbors.py)
 add_test(TestGetNeighborPairs  pytest -v --doctest-modules ${CMAKE_SOURCE_DIR}/src/pytorch/neighbors/getNeighborPairs.py)
 

From 9bf61c6d54a32189b0b4e31ea4b1419d61cf5030 Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis <r.galvelis@acellera.com>
Date: Fri, 19 Aug 2022 16:31:14 +0200
Subject: [PATCH 13/15] Fixed the tests

---
 src/pytorch/messages/TestMessages.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/pytorch/messages/TestMessages.py b/src/pytorch/messages/TestMessages.py
index 7198d0f..872731e 100644
--- a/src/pytorch/messages/TestMessages.py
+++ b/src/pytorch/messages/TestMessages.py
@@ -10,8 +10,7 @@
 @pytest.mark.parametrize('num_states', [32, 64, 1024])
 def testPassMessageValues(device, dtype, num_pairs, num_atoms, num_states):
 
-    device = pt.device(device)
-    if not pt.cuda.is_available() and device.is_cuda():
+    if not pt.cuda.is_available() and device == 'cuda':
         pytest.skip('No GPU')
 
     # Generate random neighbors

From 77cb4188aedcce96653ab2387d248423c561f405 Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis <r.galvelis@acellera.com>
Date: Fri, 19 Aug 2022 16:56:34 +0200
Subject: [PATCH 14/15] Add documentation

---
 CMakeLists.txt                       |  1 +
 src/pytorch/messages/passMessages.py | 53 +++++++++++++++++++++++++++-
 2 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 909134a..c98b426 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -70,6 +70,7 @@ add_test(TestSpeciesConverter  pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestSpe
 add_test(TestSymmetryFunctions pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestSymmetryFunctions.py)
 add_test(TestMessages          pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/messages/TestMessages.py)
 add_test(TestNeighbors         pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/neighbors/TestNeighbors.py)
+add_test(TestPassMessages      pytest -v --doctest-modules ${CMAKE_SOURCE_DIR}/src/pytorch/messages/passMessages.py)
 add_test(TestGetNeighborPairs  pytest -v --doctest-modules ${CMAKE_SOURCE_DIR}/src/pytorch/neighbors/getNeighborPairs.py)
 
 # Installation
diff --git a/src/pytorch/messages/passMessages.py b/src/pytorch/messages/passMessages.py
index b16aa3a..77bdb69 100644
--- a/src/pytorch/messages/passMessages.py
+++ b/src/pytorch/messages/passMessages.py
@@ -3,7 +3,58 @@
 
 def passMessages(neighbors: Tensor, messages: Tensor, states: Tensor) -> Tensor:
     '''
-    TODO
+    Pass messages between the neighbor atoms.
+
+    Given a set of `num_atoms` atoms (each atom has a state with `num_features`
+    features) and a set of `num_neighbors` neighbor atom pairs (each pair has a
+    message with `num_features` features), the messages of the pairs are added
+    to the corresponding atom states.
+
+    Parameters
+    ----------
+    neighbors: `torch.Tensor`
+        Atom pair indices. The shape of the tensor is `(2, num_pairs)`.
+        The indices can be `[0, num_atoms-1)` or `-1` (ignored pairs).
+        See for the documentation of `NNPOps.neighbors.getNeighborPairs` for
+        details.
+    messages: `torch.Tensor`
+        Atom pair messages. The shape of the tensor is `(num_pairs, num_features)`.
+        For efficient, `num_features` has to be a multiple of 32 and <= 1024.
+    states: `torch.Tensor`
+        Atom states. The shape of the tensor is `(num_atoms, num_features)`.
+
+    Returns
+    -------
+    new_states: `torch.Tensor`
+        Update atom states. The shape of the tensor is `(num_atoms, num_features)`.
+
+    Note
+    ----
+    The operation is compatible with CUDA Grahps, i.e. the shapes of the output
+    tensors are independed of the values of input tensors.
+
+    Examples
+    --------
+    >>> import torch as pt
+    >>> from NNPOps.messages import passMessages
+
+    >>> num_atoms = 4
+    >>> num_neigbors = 3
+    >>> num_features = 32
+
+    >>> neighbors = pt.tensor([[0, -1, 1], [0, -1, 3]], dtype=pt.int32)
+
+    >>> messages = pt.ones((num_neigbors, 32)); messages[1] = 5
+    >>> messages[:, 0]
+    tensor([1., 5., 1.])
+
+    >>> states = pt.zeros((num_atoms, num_features)); states[1] = 3
+    >>> states[:, 0]
+    tensor([0., 3., 0., 0.])
+
+    >>> new_states = passMessages(neighbors, messages, states)
+    >>> new_states[:, 0]
+    tensor([2., 4., 0., 1.])
     '''
 
     return ops.messages.passMessages(neighbors, messages, states)
\ No newline at end of file

From 47c2495c803331b6040ebbfdc5d48c762b5c2cc0 Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis <r.galvelis@acellera.com>
Date: Wed, 7 Sep 2022 18:24:57 +0200
Subject: [PATCH 15/15] Fix a typo

---
 src/pytorch/messages/passMessages.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pytorch/messages/passMessages.py b/src/pytorch/messages/passMessages.py
index 77bdb69..c13ca8a 100644
--- a/src/pytorch/messages/passMessages.py
+++ b/src/pytorch/messages/passMessages.py
@@ -14,7 +14,7 @@ def passMessages(neighbors: Tensor, messages: Tensor, states: Tensor) -> Tensor:
     ----------
     neighbors: `torch.Tensor`
         Atom pair indices. The shape of the tensor is `(2, num_pairs)`.
-        The indices can be `[0, num_atoms-1)` or `-1` (ignored pairs).
+        The indices can be `[0, num_atoms)` or `-1` (ignored pairs).
         See for the documentation of `NNPOps.neighbors.getNeighborPairs` for
         details.
     messages: `torch.Tensor`