
Commit 6cea948

More C++ modules
1 parent 13dd99c commit 6cea948

16 files changed: +596 additions, -68 deletions


exllamav3/exllamav3_ext/bindings.cpp

Lines changed: 4 additions & 1 deletion
@@ -40,10 +40,11 @@
 #include "parallel/gather.cuh"
 #include "parallel/all_reduce.cuh"
 
-#include "libtorch/blocksparse_mlp.h"
 #include "libtorch/gated_delta_net.h"
 #include "libtorch/linear.h"
 #include "libtorch/gated_rmsnorm.h"
+#include "libtorch/mlp.h"
+#include "libtorch/blocksparse_mlp.h"
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
@@ -124,4 +125,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 #include "libtorch/linear_bc.h"
 #include "libtorch/gated_delta_net_bc.h"
 #include "libtorch/gated_rmsnorm_bc.h"
+#include "libtorch/mlp_bc.h"
+#include "libtorch/blocksparse_mlp_bc.h"
 }
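A note on the mechanism, since the second hunk looks unusual: the *_bc.h files are binding fragments rather than ordinary headers; they are #include'd inside the PYBIND11_MODULE body so the py::class_ statements they contain can refer to the module handle m directly. A self-contained sketch of what such a fragment expands to (BC_Example is a made-up stand-in, not a class from this commit):

#include <memory>
#include <pybind11/pybind11.h>
namespace py = pybind11;

struct BC_Example                        // hypothetical class, for illustration only
{
    int k;
    explicit BC_Example(int _k) : k(_k) {}
    int run(int x) const { return x * k; }
};

PYBIND11_MODULE(example_ext, m)
{
    // In the real extension this statement lives in a *_bc.h fragment that is
    // #include'd at this point, so it is pasted in with `m` already in scope.
    py::class_<BC_Example, std::shared_ptr<BC_Example>>(m, "BC_Example")
        .def(py::init<int>(), py::arg("k"))
        .def("run", &BC_Example::run, py::arg("x"));
}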

exllamav3/exllamav3_ext/libtorch/blocksparse_mlp.cpp

Lines changed: 85 additions & 0 deletions
@@ -5,6 +5,8 @@
 #include <torch/extension.h>
 #include "../util.h"
 #include "../hgemm.cuh"
+#include "../quant/exl3_gemm.cuh"
+#include "../activation.cuh"
 
 std::tuple<at::Tensor, at::Tensor> blocksparse_mlp_routing(
     int bsz,
@@ -56,4 +58,87 @@
 
     return {selected_experts, routing_weights};
 }
+}
+
+void BC_BlockSparseMLP::run_bsz1
+(
+    const at::Tensor& y,
+    at::Tensor& selected_experts,
+    at::Tensor& routing_weights
+)
+{
+    py::gil_scoped_release _;
+    const at::Tensor& yi = y.unsqueeze(0);
+
+    exl3_mgemm
+    (
+        yi,
+        gate_ptrs_trellis,
+        interm_g,
+        gate_ptrs_suh,
+        yh,
+        gate_ptrs_svh,
+        selected_experts,
+        {},
+        gate_K,
+        -1,
+        gate_mcg_mult,
+        gate_mul1_mult,
+        min_expert,
+        max_expert
+    );
+
+    exl3_mgemm(
+        yi,
+        up_ptrs_trellis,
+        interm_u,
+        up_ptrs_suh,
+        yh,
+        up_ptrs_svh,
+        selected_experts,
+        {},
+        up_K,
+        -1,
+        up_mcg_mult,
+        up_mul1_mult,
+        min_expert,
+        max_expert
+    );
+
+    if (act_silu)
+        silu_mul(interm_g, interm_u, interm_a);
+    else if (act_gelu)
+        gelu_mul(interm_g, interm_u, interm_a);
+
+    exl3_mgemm(
+        interm_a,
+        down_ptrs_trellis,
+        out_d,
+        down_ptrs_suh,
+        interm_a,
+        down_ptrs_svh,
+        selected_experts,
+        routing_weights,
+        down_K,
+        -1,
+        down_mcg_mult,
+        down_mul1_mult,
+        min_expert,
+        max_expert
+    );
+
+    if (shared_experts)
+    {
+        shared_experts->run_bsz1(yi, out_d_sh.value());
+        if (shared_gate)
+        {
+            shared_gate->run_cublas(yi, z.value());
+            add_sigmoid_gate(out_d_sh.value(), z.value(), out_d);
+        }
+        else
+        {
+            out_d.add_(out_d_sh.value());
+        }
+    }
+
 }
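To make the new method easier to scan, here is its dataflow in comment form; this only restates the code above, it does not add behavior:

// BC_BlockSparseMLP::run_bsz1, batch-size-1 MoE forward (summary of the code above):
//   yi       = y.unsqueeze(0)
//   interm_g = exl3_mgemm(yi, gate expert weights,  experts = selected_experts)
//   interm_u = exl3_mgemm(yi, up expert weights,    experts = selected_experts)
//   interm_a = silu_mul(interm_g, interm_u)   or   gelu_mul(...), per the act_* flags
//   out_d    = exl3_mgemm(interm_a, down expert weights,
//                         experts = selected_experts, weights = routing_weights)
//   if shared_experts:
//       shared_experts->run_bsz1(yi, out_d_sh)
//       with shared_gate:    shared_gate->run_cublas(yi, z); add_sigmoid_gate(out_d_sh, z, out_d)
//       without shared_gate: out_d += out_d_sh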

exllamav3/exllamav3_ext/libtorch/blocksparse_mlp.h

Lines changed: 113 additions & 2 deletions
@@ -3,12 +3,123 @@
 #include <ATen/Tensor.h>
 #include <vector>
 #include <pybind11/pybind11.h>
-
 namespace py = pybind11;
 
+#include "mlp.h"
+#include "linear.h"
+
 std::tuple<at::Tensor, at::Tensor> blocksparse_mlp_routing(
     int bsz,
     const py::object& cfg,
     const at::Tensor& y,
     const py::dict& params
-);
+);
+
+struct BC_BlockSparseMLP
+{
+    at::Tensor yh;
+    at::Tensor interm_g;
+    at::Tensor interm_u;
+    at::Tensor interm_a;
+    at::Tensor out_d;
+    c10::optional<at::Tensor> out_d_sh;
+    c10::optional<at::Tensor> z;
+    int min_expert;
+    int max_expert;
+    at::Tensor gate_ptrs_trellis;
+    at::Tensor gate_ptrs_suh;
+    at::Tensor gate_ptrs_svh;
+    int gate_K;
+    uint32_t gate_mcg_mult;
+    uint32_t gate_mul1_mult;
+    at::Tensor up_ptrs_trellis;
+    at::Tensor up_ptrs_suh;
+    at::Tensor up_ptrs_svh;
+    int up_K;
+    uint32_t up_mcg_mult;
+    uint32_t up_mul1_mult;
+    at::Tensor down_ptrs_trellis;
+    at::Tensor down_ptrs_suh;
+    at::Tensor down_ptrs_svh;
+    int down_K;
+    uint32_t down_mcg_mult;
+    uint32_t down_mul1_mult;
+    bool act_silu;
+    bool act_gelu;
+    std::shared_ptr<BC_GatedMLP> shared_experts;
+    std::shared_ptr<BC_LinearFP16> shared_gate;
+
+    BC_BlockSparseMLP
+    (
+        at::Tensor _yh,
+        at::Tensor _interm_g,
+        at::Tensor _interm_u,
+        at::Tensor _interm_a,
+        at::Tensor _out_d,
+        c10::optional<at::Tensor> _out_d_sh,
+        c10::optional<at::Tensor> _z,
+        int _min_expert,
+        int _max_expert,
+        at::Tensor _gate_ptrs_trellis,
+        at::Tensor _gate_ptrs_suh,
+        at::Tensor _gate_ptrs_svh,
+        int _gate_K,
+        uint32_t _gate_mcg_mult,
+        uint32_t _gate_mul1_mult,
+        at::Tensor _up_ptrs_trellis,
+        at::Tensor _up_ptrs_suh,
+        at::Tensor _up_ptrs_svh,
+        int _up_K,
+        uint32_t _up_mcg_mult,
+        uint32_t _up_mul1_mult,
+        at::Tensor _down_ptrs_trellis,
+        at::Tensor _down_ptrs_suh,
+        at::Tensor _down_ptrs_svh,
+        int _down_K,
+        uint32_t _down_mcg_mult,
+        uint32_t _down_mul1_mult,
+        bool _act_silu,
+        bool _act_gelu,
+        std::shared_ptr<BC_GatedMLP> _shared_experts,
+        std::shared_ptr<BC_LinearFP16> _shared_gate
+    ) :
+        yh                  (std::move(_yh)),
+        interm_g            (std::move(_interm_g)),
+        interm_u            (std::move(_interm_u)),
+        interm_a            (std::move(_interm_a)),
+        out_d               (std::move(_out_d)),
+        out_d_sh            (std::move(_out_d_sh)),
+        z                   (std::move(_z)),
+        min_expert          (_min_expert),
+        max_expert          (_max_expert),
+        gate_ptrs_trellis   (std::move(_gate_ptrs_trellis)),
+        gate_ptrs_suh       (std::move(_gate_ptrs_suh)),
+        gate_ptrs_svh       (std::move(_gate_ptrs_svh)),
+        gate_K              (_gate_K),
+        gate_mcg_mult       (_gate_mcg_mult),
+        gate_mul1_mult      (_gate_mul1_mult),
+        up_ptrs_trellis     (std::move(_up_ptrs_trellis)),
+        up_ptrs_suh         (std::move(_up_ptrs_suh)),
+        up_ptrs_svh         (std::move(_up_ptrs_svh)),
+        up_K                (_up_K),
+        up_mcg_mult         (_up_mcg_mult),
+        up_mul1_mult        (_up_mul1_mult),
+        down_ptrs_trellis   (std::move(_down_ptrs_trellis)),
+        down_ptrs_suh       (std::move(_down_ptrs_suh)),
+        down_ptrs_svh       (std::move(_down_ptrs_svh)),
+        down_K              (_down_K),
+        down_mcg_mult       (_down_mcg_mult),
+        down_mul1_mult      (_down_mul1_mult),
+        act_silu            (_act_silu),
+        act_gelu            (_act_gelu),
+        shared_experts      (_shared_experts),
+        shared_gate         (_shared_gate)
+    {}
+
+    void run_bsz1
+    (
+        const at::Tensor& y,
+        at::Tensor& selected_experts,
+        at::Tensor& routing_weights
+    );
+};
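Reading the new struct at a glance: it holds pre-allocated work buffers plus one parameter group per projection, and everything in those groups is passed straight through to exl3_mgemm in run_bsz1. A condensed view (the grouping is inferred from the member names, not spelled out in the commit):

// BC_BlockSparseMLP, condensed (full declaration above):
//   work buffers   : yh, interm_g, interm_u, interm_a, out_d,
//                    plus optional out_d_sh / z for the shared-expert path
//   expert range   : min_expert .. max_expert
//   per projection : gate_* / up_* / down_* groups, each with
//                    *_ptrs_trellis, *_ptrs_suh, *_ptrs_svh   (per-expert tensors)
//                    *_K, *_mcg_mult, *_mul1_mult             (forwarded to exl3_mgemm)
//   activation     : act_silu or act_gelu
//   shared path    : shared_experts (BC_GatedMLP), shared_gate (BC_LinearFP16)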
exllamav3/exllamav3_ext/libtorch/blocksparse_mlp_bc.h

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+py::class_<BC_BlockSparseMLP, std::shared_ptr<BC_BlockSparseMLP>>(m, "BC_BlockSparseMLP").def
+(
+    py::init<
+        at::Tensor,
+        at::Tensor,
+        at::Tensor,
+        at::Tensor,
+        at::Tensor,
+        c10::optional<at::Tensor>,
+        c10::optional<at::Tensor>,
+        int,
+        int,
+        at::Tensor,
+        at::Tensor,
+        at::Tensor,
+        int,
+        int,
+        int,
+        at::Tensor,
+        at::Tensor,
+        at::Tensor,
+        int,
+        int,
+        int,
+        at::Tensor,
+        at::Tensor,
+        at::Tensor,
+        int,
+        int,
+        int,
+        bool,
+        bool,
+        std::shared_ptr<BC_GatedMLP>,
+        std::shared_ptr<BC_LinearFP16>
+    >(),
+    py::arg("yh"),
+    py::arg("interm_g"),
+    py::arg("interm_u"),
+    py::arg("interm_a"),
+    py::arg("out_d"),
+    py::arg("out_d_sh"),
+    py::arg("z"),
+    py::arg("min_expert"),
+    py::arg("max_expert"),
+    py::arg("gate_ptrs_trellis"),
+    py::arg("gate_ptrs_suh"),
+    py::arg("gate_ptrs_svh"),
+    py::arg("gate_K"),
+    py::arg("gate_mcg_mult"),
+    py::arg("gate_mul1_mult"),
+    py::arg("up_ptrs_trellis"),
+    py::arg("up_ptrs_suh"),
+    py::arg("up_ptrs_svh"),
+    py::arg("up_K"),
+    py::arg("up_mcg_mult"),
+    py::arg("up_mul1_mult"),
+    py::arg("down_ptrs_trellis"),
+    py::arg("down_ptrs_suh"),
+    py::arg("down_ptrs_svh"),
+    py::arg("down_K"),
+    py::arg("down_mcg_mult"),
+    py::arg("down_mul1_mult"),
+    py::arg("act_silu"),
+    py::arg("act_gelu"),
+    py::arg("shared_experts"),
+    py::arg("shared_gate")
+)
+.def("run_bsz1", &BC_BlockSparseMLP::run_bsz1);
exllamav3/exllamav3_ext/libtorch/gated_rmsnorm_bc.h

Lines changed: 11 additions & 11 deletions
@@ -1,12 +1,12 @@
 py::class_<BC_GatedRMSNorm, std::shared_ptr<BC_GatedRMSNorm>>(m, "BC_GatedRMSNorm").def
-(
-    py::init<
-        at::Tensor,
-        float,
-        float
-    >(),
-    py::arg("weight"),
-    py::arg("rms_norm_eps"),
-    py::arg("constant_bias")
-)
-.def("run", &BC_GatedRMSNorm::run);
+(
+    py::init<
+        at::Tensor,
+        float,
+        float
+    >(),
+    py::arg("weight"),
+    py::arg("rms_norm_eps"),
+    py::arg("constant_bias")
+)
+.def("run", &BC_GatedRMSNorm::run);

exllamav3/exllamav3_ext/libtorch/linear.cpp

Lines changed: 8 additions & 0 deletions
@@ -19,6 +19,14 @@ void BC_LinearFP16::run(const at::Tensor& x, at::Tensor& y)
 }
 
 
+void BC_LinearFP16::run_cublas(const at::Tensor& x, at::Tensor& y)
+{
+    hgemm(x, weight, y);
+    if (bias)
+        y.add_(bias.value());
+}
+
+
 void BC_LinearEXL3::run(const at::Tensor& x, at::Tensor& y)
 {
     if (x.numel() == x.size(-1))
exllamav3/exllamav3_ext/libtorch/linear.h

Lines changed: 5 additions & 4 deletions
@@ -20,6 +20,7 @@ struct BC_LinearFP16
     {}
 
     void run(const at::Tensor& x, at::Tensor& y);
+    void run_cublas(const at::Tensor& x, at::Tensor& y);
 };
 
 struct BC_LinearEXL3
@@ -29,8 +30,8 @@ struct BC_LinearEXL3
     at::Tensor svh;
     int K;
     c10::optional<at::Tensor> bias;
-    int mcg_mult;
-    int mul1_mult;
+    uint32_t mcg_mult;
+    uint32_t mul1_mult;
     at::Tensor xh;
 
     BC_LinearEXL3
@@ -40,8 +41,8 @@ struct BC_LinearEXL3
         at::Tensor _svh,
         int _K,
         c10::optional<at::Tensor> _bias,
-        int _mcg_mult,
-        int _mul1_mult,
+        uint32_t _mcg_mult,
+        uint32_t _mul1_mult,
         at::Tensor _xh
     ) :
        trellis(std::move(_trellis)),