41 | 41 | #include <cub/cub.cuh>
42 | 42 | #include <limits>
43 | 43 |
44 | | -#include "cudamacro.h"
| 44 | +//#include "cudamacro.h"
45 | 45 | #include "attention_cuda_utils.cuh"
46 | 46 |
47 | 47 | #include <iostream>
52 | 52 |
53 | 53 | #define MAX_LOCAL_ARR_LEN (16)
54 | 54 |
55 | | -namespace attention_kernels { |
56 | | - |
57 | | -#if 0 |
58 | | -class ScopeTimer |
59 | | -{ |
60 | | - public: |
61 | | - explicit ScopeTimer(const std::string &label = "") : |
62 | | - label_(label), start_(std::chrono::high_resolution_clock::now()) |
63 | | - { |
64 | | - } |
65 | | - |
66 | | - ~ScopeTimer() |
67 | | - { |
68 | | - auto end = std::chrono::high_resolution_clock::now(); |
69 | | - auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start_); |
70 | | - std::cout << label_ << "Elapsed time: " << elapsed.count() << " ms" << std::endl; |
71 | | - } |
72 | | - |
73 | | - private: |
74 | | - std::string label_; |
75 | | - std::chrono::high_resolution_clock::time_point start_; |
76 | | -}; |
77 | | - |
78 | | -// easier-to-understand version of a manual __shfl_xor_sync reduction; performance appears similar
79 | | -static __device__ float __warp_sum_cub(float val) |
80 | | -{ |
81 | | - // use cub to reduce within a warp |
82 | | - __shared__ typename cub::WarpReduce<float>::TempStorage temp_storage; |
83 | | - |
84 | | - // 1. Compute sum (initially only in lane 0) |
85 | | - float sum = cub::WarpReduce<float>(temp_storage).Sum(val); |
86 | | - // 2. Broadcast sum to all threads |
87 | | - sum = __shfl_sync(0xFFFFFFFF, sum, 0); |
88 | | - return sum; |
89 | | -} |
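// (A minimal sketch of the manual __shfl_xor_sync butterfly reduction that the
// cub-based helper above replaces; it assumes a full 32-lane warp with all lanes
// active, and the helper name __warp_sum_shfl is illustrative only.)
static __device__ float __warp_sum_shfl(float val)
{
    // XOR butterfly: after log2(WARP_SIZE) steps every lane holds the full sum,
    // so no separate broadcast from lane 0 is needed.
    #pragma unroll
    for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
        val += __shfl_xor_sync(0xFFFFFFFF, val, offset);
    }
    return val;
}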
90 | | - |
91 | | -// This kernel computes the backward pass for the S2 attention mechanism, using |
92 | | -// shared memory as a cache and one warp per output point, warp-parallel over |
93 | | -// channels, which should be laid out in the fastest dimension for coalesced
94 | | -// memory access. |
95 | | -template <int BDIM_X> |
96 | | -__global__ __launch_bounds__(BDIM_X) void s2_attention_bwd_dkvq_kernel( |
97 | | - int num_channels, int nlon_in, int nlat_out, int nlon_out, |
98 | | - const torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> kx, |
99 | | - const torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> vx, |
100 | | - const torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> qy, |
101 | | - const torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> dy, |
102 | | - torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> dydk, |
103 | | - torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> dydv, |
104 | | - torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> dydq, |
105 | | - const torch::PackedTensorAccessor64<int64_t, 1, torch::RestrictPtrTraits> psi_col_idx, |
106 | | - const torch::PackedTensorAccessor64<int64_t, 1, torch::RestrictPtrTraits> psi_row_offset, |
107 | | - const torch::PackedTensorAccessor32<float, 1, torch::RestrictPtrTraits> quad_weights) |
108 | | -{ |
109 | 55 |
110 | | - extern __shared__ float sh[]; |
111 | | - float *sh_alpha_k = sh + threadIdx.y * num_channels * 5; |
112 | | - float *sh_alpha_vw = sh_alpha_k + num_channels; |
113 | | - float *sh_alpha_kvw = sh_alpha_vw + num_channels; |
114 | | - float *sh_dy = sh_alpha_kvw + num_channels; |
115 | | - float *sh_qy = sh_dy + num_channels; |
116 | | - // (optionally, could use more shared memory for other intermediates) |
117 | | - |
118 | | - const uint64_t batchId = blockIdx.y; |
119 | | - const uint64_t wid = uint64_t(blockIdx.x) * blockDim.y + threadIdx.y; |
120 | | - if (wid >= uint64_t(nlat_out) * nlon_in) return; |
121 | | - const int tidx = threadIdx.x; |
122 | | - const int ho = wid / nlon_out; |
123 | | - const int wo = wid - (ho * nlon_out); |
124 | | - |
125 | | - // Zero shared memory |
126 | | - for (int chan = tidx; chan < num_channels; chan += WARP_SIZE) { |
127 | | - sh_alpha_k[chan] = 0.0f; |
128 | | - sh_alpha_vw[chan] = 0.0f; |
129 | | - sh_alpha_kvw[chan] = 0.0f; |
130 | | - sh_dy[chan] = dy[batchId][chan][ho][wo]; |
131 | | - sh_qy[chan] = qy[batchId][chan][ho][wo]; |
132 | | - } |
133 | | - float alpha_sum = 0.0f; |
134 | | - float qdotk_max = -FLT_MAX; |
135 | | - float integral = 0.0f; |
136 | | - __syncthreads(); |
137 | | - |
138 | | - const int64_t rbeg = psi_row_offset[ho]; |
139 | | - const int64_t rend = psi_row_offset[ho + 1]; |
140 | | - const int rlen = rend - rbeg; |
141 | | - |
142 | | - // 1st pass: accumulate alpha_sum, integral, and shared stats, along with a progressively computed qdotk_max. |
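// (Rescaling detail: each time a larger maximum qdotk_max_tmp = max(qdotk_max, qdotk)
// is observed, the sums accumulated so far are multiplied by
// max_correction = expf(qdotk_max - qdotk_max_tmp), so that every term stays
// referenced to the running maximum. This keeps all expf() arguments <= 0; it is the
// usual online-softmax trick that avoids a separate pass to find the global maximum.)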
143 | | - for (int off = 0; off < rlen; off++) { |
144 | | - const int64_t col = psi_col_idx[rbeg + off]; |
145 | | - const int hi = col / nlon_in; |
146 | | - const int wi = col - (hi * nlon_in); |
147 | | - const int wip = (wi + wo) - ((wi + wo) / nlon_in) * nlon_in; |
148 | | - float qdotk = 0.0f, gdotv = 0.0f; |
149 | | - for (int chan = tidx; chan < num_channels; chan += WARP_SIZE) { |
150 | | - qdotk += sh_qy[chan] * kx[batchId][chan][hi][wip]; |
151 | | - gdotv += sh_dy[chan] * vx[batchId][chan][hi][wip]; |
152 | | - } |
153 | | - qdotk = __warp_sum_cub(qdotk); |
154 | | - gdotv = __warp_sum_cub(gdotv); |
155 | | - float qdotk_max_tmp = max(qdotk_max, qdotk); |
156 | | - float alpha_inz = expf(qdotk - qdotk_max_tmp) * quad_weights[hi]; |
157 | | - float max_correction = expf(qdotk_max - qdotk_max_tmp); |
158 | | - alpha_sum = alpha_sum * max_correction + alpha_inz; |
159 | | - integral = integral * max_correction + alpha_inz * gdotv; |
160 | | - for (int chan = tidx; chan < num_channels; chan += WARP_SIZE) { |
161 | | - float kxval = kx[batchId][chan][hi][wip]; |
162 | | - sh_alpha_k[chan] = sh_alpha_k[chan] * max_correction + alpha_inz * kxval; |
163 | | - sh_alpha_vw[chan] = sh_alpha_vw[chan] * max_correction + alpha_inz * gdotv; |
164 | | - sh_alpha_kvw[chan] = sh_alpha_kvw[chan] * max_correction + alpha_inz * kxval * gdotv; |
165 | | - } |
166 | | - qdotk_max = qdotk_max_tmp; |
167 | | - } |
168 | | - |
169 | | - integral /= alpha_sum; |
170 | | - |
171 | | - // Write dydq |
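// (Derivation note: with y_c = (sum_i alpha_i * v_{i,c}) / alpha_sum and
// d(alpha_i)/dq_c = alpha_i * k_{i,c}, applying the quotient rule and contracting
// the Jacobian with the incoming gradient g = dy gives
//   dydq_c = (sum_i alpha_i k_{i,c} (g.v_i)) / alpha_sum
//          - (sum_i alpha_i (g.v_i)) * (sum_i alpha_i k_{i,c}) / alpha_sum^2,
// which is the (sh_alpha_kvw*alpha_sum - sh_alpha_vw*sh_alpha_k) / alpha_sum^2
// expression below.)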
172 | | - for (int chan = tidx; chan < num_channels; chan += WARP_SIZE) { |
173 | | - dydq[batchId][chan][ho][wo] |
174 | | - = (sh_alpha_kvw[chan] * alpha_sum - sh_alpha_vw[chan] * sh_alpha_k[chan]) / (alpha_sum * alpha_sum); |
175 | | - } |
176 | | - |
177 | | - // Third pass: accumulate gradients for k and v |
178 | | - for (int off = 0; off < rlen; off++) { |
179 | | - const int64_t col = psi_col_idx[rbeg + off]; |
180 | | - const int hi = col / nlon_in; |
181 | | - const int wi = col - (hi * nlon_in); |
182 | | - const int wip = (wi + wo) - ((wi + wo) / nlon_in) * nlon_in; |
183 | | - float qdotk = 0.0f, gdotv = 0.0f; |
184 | | - for (int chan = tidx; chan < num_channels; chan += WARP_SIZE) { |
185 | | - qdotk += qy[batchId][chan][ho][wo] * kx[batchId][chan][hi][wip]; |
186 | | - gdotv += sh_dy[chan] * vx[batchId][chan][hi][wip]; |
187 | | - } |
188 | | - qdotk = __warp_sum_cub(qdotk); |
189 | | - gdotv = __warp_sum_cub(gdotv); |
190 | | - float alpha_inz = expf(qdotk - qdotk_max) * quad_weights[hi]; |
191 | | - for (int chan = tidx; chan < num_channels; chan += WARP_SIZE) { |
192 | | - float qyval = qy[batchId][chan][ho][wo]; |
193 | | - float dyval = sh_dy[chan]; |
194 | | - atomicAdd(&dydk[batchId][chan][hi][wip], qyval * (alpha_inz / alpha_sum) * (gdotv - integral)); |
195 | | - atomicAdd(&dydv[batchId][chan][hi][wip], (alpha_inz / alpha_sum) * dyval); |
196 | | - } |
197 | | - } |
198 | | -} |
199 | | -#endif |
200 | | - |
201 | | -// BEGIN backward kernels and functions |
| 56 | +namespace attention_kernels { |
202 | 57 |
203 | 58 | // called with (blockDim.x=32 and blockDim.y>1, BDIM=blockDim.x*blockDim.y) |
204 | 59 | template<int BDIM_X, |