Commit a8a3da5

Add C++ modules
1 parent 8c4a542 commit a8a3da5

File tree

16 files changed: +606 -132 lines


exllamav3/exllamav3_ext/bindings.cpp

Lines changed: 7 additions & 0 deletions
@@ -40,6 +40,9 @@
 #include "parallel/all_reduce.cuh"

 #include "libtorch/blocksparse_mlp.h"
+#include "libtorch/gated_delta_net.h"
+#include "libtorch/linear.h"
+#include "libtorch/gated_rmsnorm.h"

 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
@@ -116,4 +119,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     m.def("histogram", &histogram, "histogram");

     m.def("blocksparse_mlp_routing", &blocksparse_mlp_routing, "blocksparse_mlp_routing");
+
+    #include "libtorch/linear_bc.h"
+    #include "libtorch/gated_delta_net_bc.h"
+    #include "libtorch/gated_rmsnorm_bc.h"
 }
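
The three *_bc.h headers are included inside the module body rather than at file scope, so each header's py::class_ registration is expanded at a point where m is in scope. As an illustration (not part of the commit), this is roughly what the compiler sees after preprocessing, trimmed to the BC_GatedRMSNorm binding shown further down; the other classes follow the same pattern:

// Illustrative only: the *_bc.h includes inside PYBIND11_MODULE expand to the
// registrations directly within the module body.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    // ... existing m.def(...) registrations ...

    // expanded from libtorch/gated_rmsnorm_bc.h:
    py::class_<BC_GatedRMSNorm, std::shared_ptr<BC_GatedRMSNorm>>(m, "BC_GatedRMSNorm")
        .def
        (
            py::init<at::Tensor, float, float>(),
            py::arg("weight"), py::arg("rms_norm_eps"), py::arg("constant_bias")
        )
        .def("run", &BC_GatedRMSNorm::run);

    // BC_LinearFP16, BC_LinearEXL3 and BC_GatedDeltaNet are registered the same way
    // from linear_bc.h and gated_delta_net_bc.h.
}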
exllamav3/exllamav3_ext/libtorch/gated_delta_net.cpp (new file; name inferred from its includes and the bindings above)

Lines changed: 59 additions & 0 deletions
#include <Python.h>
#include "gated_delta_net.h"
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#include "../util.h"
#include "../hgemm.cuh"
#include "../quant/exl3_gemm.cuh"
#include "../gdn.cuh"

using namespace torch::indexing;

at::Tensor BC_GatedDeltaNet::run_bsz1_a
(
    const at::Tensor& x
)
{
    py::gil_scoped_release _;

    qkvz_proj->run(x, qkvz);
    ba_proj->run(x, ba);

    gated_delta_net_fused_op
    (
        qkvz, ba,
        dt_bias, a_log,
        mixed_qkv, z, beta, g,
        num_k_heads,
        num_v_heads,
        k_head_dim,
        v_head_dim
    );

    return mixed_qkv;
}

void BC_GatedDeltaNet::run_bsz1_b
(
    at::Tensor& mixed_qkv,
    at::Tensor& y,
    at::Tensor& recurrent_state
)
{
    cuda_recurrent_gated_delta_rule
    (
        mixed_qkv.transpose(1, 2),
        g,
        beta,
        recurrent_state,
        core_attn_out,
        num_k_heads,
        num_v_heads,
        k_head_dim,
        v_head_dim
    );

    norm->run(core_attn_out, core_attn_out_f, z);
    o_proj->run(core_attn_out_f, y);
}
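
The decode path is split into two calls so the caller can update mixed_qkv in between; the struct carries conv1d_weight, conv1d_bias and the conv_temp_* buffers, but this file never applies them, so the short-convolution step is assumed to happen elsewhere. A hedged sketch of how a caller might drive one bsz-1 token:

// Hedged caller sketch; decode_token_sketch and the conv1d comment are assumptions,
// not part of this commit.
void decode_token_sketch
(
    BC_GatedDeltaNet& gdn,
    const at::Tensor& x,            // (1, 1, hidden_size) hidden states for one token
    at::Tensor& y,                  // preallocated output buffer
    at::Tensor& recurrent_state     // persistent delta-rule state for the sequence
)
{
    // Stage A: QKVZ/BA projections plus the fused op that splits and activates
    // them into mixed_qkv, z, beta and g.
    at::Tensor mixed_qkv = gdn.run_bsz1_a(x);

    // (assumed) the causal conv1d over mixed_qkv is applied here by the caller,
    // using gdn.conv1d_weight / gdn.conv1d_bias and the conv_temp buffers.

    // Stage B: recurrent gated delta rule, gated RMSNorm with z as the gate,
    // and the output projection into y.
    gdn.run_bsz1_b(mixed_qkv, y, recurrent_state);
}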
exllamav3/exllamav3_ext/libtorch/gated_delta_net.h (new file; name inferred)

Lines changed: 95 additions & 0 deletions
#pragma once

#include <ATen/Tensor.h>
#include <vector>
#include <pybind11/pybind11.h>
namespace py = pybind11;
#include "linear.h"
#include "gated_rmsnorm.h"

struct BC_GatedDeltaNet
{
    at::Tensor mixed_qkv;
    at::Tensor z;
    at::Tensor beta;
    at::Tensor g;
    at::Tensor qkvz;
    at::Tensor ba;
    at::Tensor conv_temp_a;
    at::Tensor conv_temp_b;
    at::Tensor core_attn_out;
    at::Tensor core_attn_out_f;
    std::shared_ptr<BC_LinearEXL3> qkvz_proj;
    std::shared_ptr<BC_LinearFP16> ba_proj;
    at::Tensor dt_bias;
    at::Tensor a_log;
    int num_k_heads;
    int num_v_heads;
    int k_head_dim;
    int v_head_dim;
    at::Tensor conv1d_weight;
    c10::optional<at::Tensor> conv1d_bias;
    std::shared_ptr<BC_GatedRMSNorm> norm;
    std::shared_ptr<BC_LinearEXL3> o_proj;

    BC_GatedDeltaNet
    (
        at::Tensor _mixed_qkv,
        at::Tensor _z,
        at::Tensor _beta,
        at::Tensor _g,
        at::Tensor _qkvz,
        at::Tensor _ba,
        at::Tensor _conv_temp_a,
        at::Tensor _conv_temp_b,
        at::Tensor _core_attn_out,
        at::Tensor _core_attn_out_f,
        std::shared_ptr<BC_LinearEXL3> _qkvz_proj,
        std::shared_ptr<BC_LinearFP16> _ba_proj,
        at::Tensor _dt_bias,
        at::Tensor _a_log,
        int _num_k_heads,
        int _num_v_heads,
        int _k_head_dim,
        int _v_head_dim,
        at::Tensor _conv1d_weight,
        c10::optional<at::Tensor> _conv1d_bias,
        std::shared_ptr<BC_GatedRMSNorm> _norm,
        std::shared_ptr<BC_LinearEXL3> _o_proj
    ) :
        mixed_qkv       (std::move(_mixed_qkv)),
        z               (std::move(_z)),
        beta            (std::move(_beta)),
        g               (std::move(_g)),
        qkvz            (std::move(_qkvz)),
        ba              (std::move(_ba)),
        conv_temp_a     (std::move(_conv_temp_a)),
        conv_temp_b     (std::move(_conv_temp_b)),
        core_attn_out   (std::move(_core_attn_out)),
        core_attn_out_f (std::move(_core_attn_out_f)),
        qkvz_proj       (_qkvz_proj),
        ba_proj         (_ba_proj),
        dt_bias         (std::move(_dt_bias)),
        a_log           (std::move(_a_log)),
        num_k_heads     (_num_k_heads),
        num_v_heads     (_num_v_heads),
        k_head_dim      (_k_head_dim),
        v_head_dim      (_v_head_dim),
        conv1d_weight   (std::move(_conv1d_weight)),
        conv1d_bias     (std::move(_conv1d_bias)),
        norm            (_norm),
        o_proj          (_o_proj)
    {}

    at::Tensor run_bsz1_a
    (
        const at::Tensor& x
    );

    void run_bsz1_b
    (
        at::Tensor& mixed_qkv,
        at::Tensor& y,
        at::Tensor& recurrent_state
    );
};
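
The constructor takes every intermediate tensor by value, so the host side can allocate the full set of scratch buffers once and let the per-token calls run allocation-free. A minimal sketch of that pattern, with placeholder shapes that are assumptions rather than values from this commit:

// Placeholder shapes only; the real sizes come from the model config on the
// Python side. The point is that the buffers are created once, then handed to
// BC_GatedDeltaNet so run_bsz1_a / run_bsz1_b never allocate.
void preallocate_sketch()
{
    auto opts = at::TensorOptions().device(at::kCUDA).dtype(at::kHalf);

    at::Tensor qkvz      = at::empty({1, 1, 12288}, opts);   // assumed fused QKVZ width
    at::Tensor ba        = at::empty({1, 1, 96},    opts);   // assumed fused b/a width
    at::Tensor mixed_qkv = at::empty({1, 1, 8192},  opts);   // assumed Q+K+V width

    // ... the remaining buffers (z, beta, g, conv_temp_*, core_attn_out, ...) are
    // built the same way and passed to the constructor above together with the
    // projections, the norm module and the head-geometry integers.
}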
exllamav3/exllamav3_ext/libtorch/gated_delta_net_bc.h (new file; name inferred)

Lines changed: 51 additions & 0 deletions
py::class_<BC_GatedDeltaNet, std::shared_ptr<BC_GatedDeltaNet>>(m, "BC_GatedDeltaNet").def
(
    py::init<
        at::Tensor,
        at::Tensor,
        at::Tensor,
        at::Tensor,
        at::Tensor,
        at::Tensor,
        at::Tensor,
        at::Tensor,
        at::Tensor,
        at::Tensor,
        std::shared_ptr<BC_LinearEXL3>,
        std::shared_ptr<BC_LinearFP16>,
        at::Tensor,
        at::Tensor,
        int,
        int,
        int,
        int,
        at::Tensor,
        c10::optional<at::Tensor>,
        std::shared_ptr<BC_GatedRMSNorm>,
        std::shared_ptr<BC_LinearEXL3>
    >(),
    py::arg("mixed_qkv"),
    py::arg("z"),
    py::arg("beta"),
    py::arg("g"),
    py::arg("qkvz"),
    py::arg("ba"),
    py::arg("conv_temp_a"),
    py::arg("conv_temp_b"),
    py::arg("core_attn_out"),
    py::arg("core_attn_out_f"),
    py::arg("qkvz_proj"),
    py::arg("ba_proj"),
    py::arg("dt_bias"),
    py::arg("a_log"),
    py::arg("num_k_heads"),
    py::arg("num_v_heads"),
    py::arg("k_head_dim"),
    py::arg("v_head_dim"),
    py::arg("conv1d_weight"),
    py::arg("conv1d_bias"),
    py::arg("norm"),
    py::arg("o_proj")
)
.def("run_bsz1_a", &BC_GatedDeltaNet::run_bsz1_a)
.def("run_bsz1_b", &BC_GatedDeltaNet::run_bsz1_b);
exllamav3/exllamav3_ext/libtorch/gated_rmsnorm.cpp (new file; name inferred)

Lines changed: 12 additions & 0 deletions
#include <Python.h>
#include "gated_rmsnorm.h"
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#include "../util.h"
#include "../norm.cuh"

void BC_GatedRMSNorm::run(const at::Tensor& x, at::Tensor& y, const at::Tensor& gate)
{
    gated_rms_norm(x, weight, y, gate, rms_norm_eps, constant_bias);
}
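
For reference, a non-fused formulation of what the gated_rms_norm kernel is assumed to compute: RMS-normalize x over the last dimension, gate with SiLU(gate), and scale by weight plus constant_bias. The gate activation and the role of constant_bias are assumptions based on common gated-RMSNorm variants, not read from the kernel source:

// Hedged reference implementation; not the fused kernel, and the exact ordering /
// constant_bias handling are assumptions.
at::Tensor gated_rms_norm_reference
(
    const at::Tensor& x,
    const at::Tensor& weight,
    const at::Tensor& gate,
    double eps,
    double constant_bias
)
{
    at::Tensor xf = x.to(at::kFloat);
    at::Tensor var = xf.pow(2).mean({-1}, /*keepdim=*/ true);
    at::Tensor normed = xf * at::rsqrt(var + eps);
    normed = normed * at::silu(gate.to(at::kFloat));                 // assumed SiLU gating
    return (normed * (weight.to(at::kFloat) + constant_bias)).to(x.scalar_type());
}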
exllamav3/exllamav3_ext/libtorch/gated_rmsnorm.h (new file; name inferred)

Lines changed: 26 additions & 0 deletions
#pragma once

#include <ATen/Tensor.h>
#include <vector>
#include <pybind11/pybind11.h>
namespace py = pybind11;

struct BC_GatedRMSNorm
{
    at::Tensor weight;
    float rms_norm_eps;
    float constant_bias;

    BC_GatedRMSNorm
    (
        at::Tensor _weight,
        float _rms_norm_eps,
        float _constant_bias
    ) :
        weight(std::move(_weight)),
        rms_norm_eps(_rms_norm_eps),
        constant_bias(_constant_bias)
    {}

    void run(const at::Tensor& x, at::Tensor& y, const at::Tensor& gate);
};
exllamav3/exllamav3_ext/libtorch/gated_rmsnorm_bc.h (new file; name inferred)

Lines changed: 12 additions & 0 deletions
py::class_<BC_GatedRMSNorm, std::shared_ptr<BC_GatedRMSNorm>>(m, "BC_GatedRMSNorm").def
(
    py::init<
        at::Tensor,
        float,
        float
    >(),
    py::arg("weight"),
    py::arg("rms_norm_eps"),
    py::arg("constant_bias")
)
.def("run", &BC_GatedRMSNorm::run);
exllamav3/exllamav3_ext/libtorch/linear.cpp (new file; name inferred)

Lines changed: 36 additions & 0 deletions
#include <Python.h>
#include "linear.h"
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#include "../util.h"
#include "../hgemm.cuh"
#include "../quant/exl3_gemm.cuh"

void BC_LinearFP16::run(const at::Tensor& x, at::Tensor& y)
{
    // y is a caller-provided output buffer: y = x @ weight (+ bias)
    if (x.dtype() == y.dtype())
        at::matmul_out(y, x, weight);
    else
        hgemm(x, weight, y);

    if (bias)
        y.add_(bias.value());
}


void BC_LinearEXL3::run(const at::Tensor& x, at::Tensor& y)
{
    // Single row (bsz 1): reuse the preallocated xh scratch; otherwise allocate a temporary
    if (x.numel() == x.size(-1))
    {
        exl3_gemm(x, trellis, y, suh, xh, svh, -1, mcg_mult, mul1_mult);
    }
    else
    {
        at::Tensor xh_ = at::empty_like(x);
        exl3_gemm(x, trellis, y, suh, xh_, svh, -1, mcg_mult, mul1_mult);
    }

    if (bias) y.add_(bias.value());
}
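
Two details worth spelling out: the x.numel() == x.size(-1) test is true exactly when every leading dimension of x is 1, i.e. a single row, which is what the preallocated xh scratch tensor is sized for; and both run methods write into a caller-provided output. A hedged reference for the FP16 path (the x @ weight layout is taken from the matmul call above, the rest is an allocating restatement, not code from the commit):

// Hedged reference for BC_LinearFP16::run, allocating variant.
at::Tensor linear_fp16_reference
(
    const at::Tensor& x,
    const at::Tensor& weight,
    const c10::optional<at::Tensor>& bias
)
{
    at::Tensor y = at::matmul(x, weight);      // y = x @ weight
    if (bias) y = y + bias.value();
    return y;
}

// The single-row test used to decide whether the preallocated xh scratch can be reused:
inline bool is_single_row(const at::Tensor& x)
{
    return x.numel() == x.size(-1);            // true iff all leading dims are 1
}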
exllamav3/exllamav3_ext/libtorch/linear.h (new file; name inferred)

Lines changed: 58 additions & 0 deletions
#pragma once

#include <ATen/Tensor.h>
#include <vector>
#include <pybind11/pybind11.h>
namespace py = pybind11;

struct BC_LinearFP16
{
    at::Tensor weight;
    c10::optional<at::Tensor> bias;

    BC_LinearFP16
    (
        at::Tensor _weight,
        c10::optional<at::Tensor> _bias
    ) :
        weight(std::move(_weight)),
        bias(std::move(_bias))
    {}

    void run(const at::Tensor& x, at::Tensor& y);
};

struct BC_LinearEXL3
{
    at::Tensor trellis;
    at::Tensor suh;
    at::Tensor svh;
    int K;
    c10::optional<at::Tensor> bias;
    int mcg_mult;
    int mul1_mult;
    at::Tensor xh;

    BC_LinearEXL3
    (
        at::Tensor _trellis,
        at::Tensor _suh,
        at::Tensor _svh,
        int _K,
        c10::optional<at::Tensor> _bias,
        int _mcg_mult,
        int _mul1_mult,
        at::Tensor _xh
    ) :
        trellis(std::move(_trellis)),
        suh(std::move(_suh)),
        svh(std::move(_svh)),
        K(_K),
        bias(std::move(_bias)),
        mcg_mult(_mcg_mult),
        mul1_mult(_mul1_mult),
        xh(std::move(_xh))
    {}

    void run(const at::Tensor& x, at::Tensor& y);
};
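
A hedged usage sketch of the caller-provided-output convention both structs share; the shape of y assumes a 3-D (bsz, seq, hidden) input and a weight laid out for x @ weight, consistent with the matmul in linear.cpp:

// Hedged usage sketch, not from the commit: y is preallocated by the caller with the
// output feature dimension taken from the weight's last axis.
void linear_usage_sketch(BC_LinearFP16& proj, const at::Tensor& x)
{
    at::Tensor y = at::empty({x.size(0), x.size(1), proj.weight.size(-1)}, x.options());
    proj.run(x, y);   // writes x @ weight (+ bias) into y
}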
