
Commit 41e2846

Merge branch 'dev'
2 parents: 8098d61 + 19513f3

35 files changed: +14577 / -142 lines

exllamav3/architecture/gemma3.py

Lines changed: 10 additions & 0 deletions
@@ -138,6 +138,11 @@ def __init__(
         self.vision_pp.size = read_dict(read_prep_config, dict, ["size"], no_default)


+    def default_max_position_embeddings(self):
+        # Fixed for Gemma3, usually not present in config.json
+        return 131072
+
+
 class Gemma3TextConfig(Config):
     arch_string = "Gemma3ForCausalLM"

@@ -218,6 +223,11 @@ def __init__(
         self.final_logit_softcapping = self.read_cfg(float, "final_logit_softcapping", 0.0)


+    def default_max_position_embeddings(self):
+        # Fixed for Gemma2, usually not present in config.json
+        return 8192
+
+
 class Gemma3Model(Model):
     config_class = Gemma3Config
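The new method supplies a fallback when max_position_embeddings is absent from config.json. A minimal, self-contained sketch of the pattern; the helper name below is hypothetical and not the repository's code:

def resolve_max_position_embeddings(cfg_json: dict, default: int = 131072) -> int:
    # Prefer the value from config.json; fall back to the architecture's fixed default
    value = cfg_json.get("max_position_embeddings")
    return value if value is not None else default

assert resolve_max_position_embeddings({}) == 131072
assert resolve_max_position_embeddings({"max_position_embeddings": 32768}) == 32768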

exllamav3/conversion/convert_model.py

Lines changed: 14 additions & 0 deletions
@@ -51,6 +51,18 @@

 num_ref_states = 5

+def check_system():
+    if os.environ.get("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") is not None:
+        print(
+            "\n"
+            f" !! {col_red}IMPORTANT: The environment variable TORCH_ALLOW_TF32_CUBLAS_OVERRIDE is set!{col_default}\n"
+            f" !! {col_red}This causes Torch to run in reduced precision mode, which is likely to cause this "
+            f"script to fail or result in broken models.{col_default}\n"
+            "\n"
+        )
+
+
 def save_dict(filename, dict_, args):
     path = os.path.join(args["work_dir"], filename)
     with open(path, "w", encoding = "utf8") as f:

@@ -104,6 +116,8 @@ def prepare_env(args):


 def prepare(args) -> (dict, dict, bool, str):
+    check_system()
+
     if not args.work_dir:
         return None, None, False, "Must specify --work_dir"
     if not args.in_dir and not args.resume:
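check_system() warns when the TF32 override is active, since reduced-precision matmuls can corrupt calibration and quantization. A hedged, standalone sketch of the same check and the remedy, using plain prints instead of the converter's color constants:

import os
import torch

# When TORCH_ALLOW_TF32_CUBLAS_OVERRIDE is set, PyTorch forces TF32 matmuls in cuBLAS,
# which is the reduced-precision mode the converter's warning refers to.
if os.environ.get("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") is not None:
    print("TF32 override is set; unset it before converting:")
    print("    unset TORCH_ALLOW_TF32_CUBLAS_OVERRIDE")
print("allow_tf32 =", torch.backends.cuda.matmul.allow_tf32)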

exllamav3/exllamav3_ext/add.cu

Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
1+
#include <cuda_fp16.h>
2+
#include "add.cuh"
3+
#include <c10/cuda/CUDAGuard.h>
4+
#include <ATen/cuda/CUDAContext.h>
5+
#include "util.h"
6+
#include "util.cuh"
7+
8+
#define NUM_THREADS 1024
9+
10+
#define KERNEL_DEF(xt, yt, zt, kernel, fn) \
11+
__launch_bounds__(NUM_THREADS) \
12+
__global__ void kernel \
13+
( \
14+
const xt* __restrict__ x, \
15+
const yt* __restrict__ y, \
16+
zt* __restrict__ z, \
17+
const uint64_t numel \
18+
) \
19+
{ \
20+
uint64_t idx = ((uint64_t)blockIdx.x * NUM_THREADS + (uint64_t)threadIdx.x); \
21+
if (idx >= numel) return; \
22+
xt a = x[idx]; \
23+
yt b = y[idx]; \
24+
z[idx] = fn; \
25+
}
26+
27+
KERNEL_DEF(half, half, half, add_kernel_hhh, __hadd(a, b))
28+
KERNEL_DEF(half, half, float, add_kernel_hhf, __half2float(__hadd(a, b)))
29+
KERNEL_DEF(half, float, half, add_kernel_hfh, __float2half_rn(__half2float(a) + b))
30+
KERNEL_DEF(half, float, float, add_kernel_hff, __half2float(a) + b)
31+
KERNEL_DEF(float, half, half, add_kernel_fhh, __float2half_rn(a + __half2float(b)))
32+
KERNEL_DEF(float, half, float, add_kernel_fhf, a + __half2float(b))
33+
KERNEL_DEF(float, float, half, add_kernel_ffh, __float2half_rn(a + b))
34+
KERNEL_DEF(float, float, float, add_kernel_fff, a + b)
35+
36+
#undef KERNEL_DEF
37+
38+
/*
39+
x + y -> z
40+
Works inplace if x == z or y == z
41+
*/
42+
43+
void add_gr
44+
(
45+
const at::Tensor& x,
46+
const at::Tensor& y,
47+
at::Tensor& z,
48+
Graph* graph
49+
)
50+
{
51+
const at::cuda::OptionalCUDAGuard device_guard(x.device());
52+
cudaStream_t stream = graph ? graph->capture_stream : at::cuda::getCurrentCUDAStream().stream();
53+
54+
auto xt = x.dtype();
55+
auto yt = y.dtype();
56+
auto zt = z.dtype();
57+
uint64_t numel = x.numel();
58+
int blocks = (int) CEIL_DIVIDE(numel, (uint64_t) NUM_THREADS);
59+
60+
#define INSTANCE(xt_, yt_, zt_, xt__, yt__, zt__, kernel) \
61+
if (xt == xt_ && yt == yt_ && zt == zt_) \
62+
{ \
63+
kernel<<<blocks, NUM_THREADS, 0, stream>>> \
64+
( \
65+
(const xt__*) x.data_ptr(), \
66+
(const yt__*) y.data_ptr(), \
67+
(zt__*) z.data_ptr(), \
68+
numel \
69+
); \
70+
if (graph) graph->record_param((void*) &kernel, GP_add_x, 0); \
71+
if (graph) graph->record_param((void*) &kernel, GP_add_y, 1); \
72+
if (graph) graph->record_param((void*) &kernel, GP_add_z, 2); \
73+
if (graph) graph->record_param((void*) &kernel, GP_end, 0); \
74+
cuda_check(cudaPeekAtLastError()); \
75+
}
76+
77+
INSTANCE(at::kHalf, at::kHalf, at::kHalf, half, half, half , add_kernel_hhh)
78+
INSTANCE(at::kHalf, at::kHalf, at::kFloat, half, half, float, add_kernel_hhf)
79+
INSTANCE(at::kHalf, at::kFloat, at::kHalf, half, float, half , add_kernel_hfh)
80+
INSTANCE(at::kHalf, at::kFloat, at::kFloat, half, float, float, add_kernel_hff)
81+
INSTANCE(at::kFloat, at::kHalf, at::kHalf, float, half, half , add_kernel_fhh)
82+
INSTANCE(at::kFloat, at::kHalf, at::kFloat, float, half, float, add_kernel_fhf)
83+
INSTANCE(at::kFloat, at::kFloat, at::kHalf, float, float, half , add_kernel_ffh)
84+
INSTANCE(at::kFloat, at::kFloat, at::kFloat, float, float, float, add_kernel_fff)
85+
86+
#undef INSTANCE
87+
88+
cuda_check(cudaPeekAtLastError());
89+
}
90+
91+
void add
92+
(
93+
const at::Tensor& x,
94+
const at::Tensor& y,
95+
at::Tensor& z
96+
)
97+
{
98+
add_gr(x, y, z, nullptr);
99+
}
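Once the extension is rebuilt, the new element-wise add can be sanity-checked from Python. A sketch under assumptions: the import path below is inferred from the project layout and is not shown in this diff.

import torch
from exllamav3.ext import exllamav3_ext as ext  # assumed import path, not part of this diff

x = torch.randn(1024, device = "cuda", dtype = torch.half)
y = torch.randn(1024, device = "cuda", dtype = torch.float)
z = torch.empty(1024, device = "cuda", dtype = torch.float)

ext.add(x, y, z)  # half + float -> float dispatches to add_kernel_hff
torch.testing.assert_close(z, x.float() + y, rtol = 1e-3, atol = 1e-3)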

exllamav3/exllamav3_ext/add.cuh

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
1+
#pragma once
2+
3+
#include <ATen/Tensor.h>
4+
#include "graph.cuh"
5+
6+
void add_gr
7+
(
8+
const at::Tensor& x,
9+
const at::Tensor& y,
10+
at::Tensor& z,
11+
Graph* graph
12+
);
13+
14+
void add
15+
(
16+
const at::Tensor& x,
17+
const at::Tensor& y,
18+
at::Tensor& z
19+
);

exllamav3/exllamav3_ext/bindings.cpp

Lines changed: 5 additions & 0 deletions
@@ -15,6 +15,7 @@
 #include "routing.cuh"
 #include "gdn.cuh"
 #include "causal_conv1d.cuh"
+#include "add.cuh"

 #include "quant/quantize.cuh"
 #include "quant/pack.cuh"

@@ -23,6 +24,7 @@
 #include "quant/exl3_gemm.cuh"
 #include "quant/exl3_kernel_map.cuh"
 #include "quant/util.cuh"
+#include "quant/exl3_devctx.cuh"

 #include "generator/strings.h"
 #include "generator/sampling_basic.cuh"

@@ -87,6 +89,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     m.def("exl3_gemm", &exl3_gemm, "exl3_gemm");
     m.def("exl3_gemm_num_kernel_shapes", &exl3_gemm_num_kernel_shapes, "exl3_gemm_num_kernel_shapes");
     m.def("exl3_gemm_shape_compat", &exl3_gemm_shape_compat, "exl3_gemm_shape_compat");
+    m.def("g_get_cc", &g_get_cc, "g_get_cc");
+    m.def("g_get_num_sms", &g_get_num_sms, "g_get_num_sms");
     m.def("exl3_mgemm", &exl3_mgemm, "exl3_mgemm");
     m.def("hgemm", &hgemm, "hgemm");
     m.def("rope", &rope, "rope");

@@ -95,6 +99,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     m.def("relu2_mul", &relu2_mul, "relu2_mul");
     m.def("xielu", &xielu, "xielu");
     m.def("add_sigmoid_gate", &add_sigmoid_gate, "add_sigmoid_gate");
+    m.def("add", &add, "add");

     m.def("gated_delta_net_fused_op", &gated_delta_net_fused_op, "gated_delta_net_fused_op");
     m.def("cuda_recurrent_gated_delta_rule", &cuda_recurrent_gated_delta_rule, "cuda_recurrent_gated_delta_rule");

exllamav3/exllamav3_ext/graph.cu

Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,180 @@
1+
#include <Python.h>
2+
#include "graph.cuh"
3+
#include <c10/cuda/CUDAGuard.h>
4+
#include <ATen/cuda/CUDAContext.h>
5+
//#include <torch/extension.h>
6+
#include "util.h"
7+
#include "util.cuh"
8+
9+
Graph::Graph()
10+
{
11+
ready = false;
12+
graph = NULL;
13+
graph_exec = NULL;
14+
}
15+
16+
Graph::~Graph()
17+
{
18+
if (graph) cudaGraphDestroy(graph);
19+
if (graph_exec) cudaGraphExecDestroy(graph_exec);
20+
}
21+
22+
cudaStream_t Graph::capture_begin()
23+
{
24+
// Make sure nothing is pending
25+
cudaDeviceSynchronize();
26+
27+
// Create capture stream
28+
cuda_check(cudaStreamCreateWithFlags(&capture_stream, cudaStreamNonBlocking));
29+
30+
// Begin capture
31+
cuda_check(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal));
32+
return capture_stream;
33+
}
34+
35+
void Graph::capture_end()
36+
{
37+
// End capture
38+
cuda_check(cudaStreamEndCapture(capture_stream, &graph));
39+
cuda_check(cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));
40+
//inspect_graph();
41+
42+
// Get graph nodes
43+
size_t num_nodes;
44+
cudaGraphGetNodes(graph, nullptr, &num_nodes);
45+
nodes.resize(num_nodes);
46+
cudaGraphGetNodes(graph, nodes.data(), &num_nodes);
47+
48+
// Store copies of all node param structures
49+
node_params.resize(num_nodes);
50+
node_needs_update.resize(num_nodes);
51+
for (int i = 0; i < num_nodes; ++i)
52+
node_needs_update[i] = false;
53+
54+
int n = 0;
55+
int c = 0;
56+
while (true)
57+
{
58+
cudaGraphNodeType t{};
59+
cudaGraphNodeGetType(nodes[n], &t);
60+
61+
// Node type: kernel
62+
if (t == cudaGraphNodeTypeKernel)
63+
{
64+
cudaGraphKernelNodeGetParams(nodes[n], &node_params[n]);
65+
// DBGX(node_params[n].func);
66+
67+
for(; c < graph_sites.size(); c++)
68+
{
69+
void* func = std::get<0>(graph_sites[c]);
70+
// DBGX(func);
71+
72+
if (func != node_params[n].func) break;
73+
74+
int param_id = std::get<1>(graph_sites[c]);
75+
int param_offset = std::get<2>(graph_sites[c]);
76+
77+
graph_node_sites.push_back(std::tuple<int, int, int>(n, param_id, param_offset));
78+
if (param_id == GP_end) { c++; break; }
79+
80+
// DBGI2(param_id, param_offset);
81+
}
82+
}
83+
84+
n++;
85+
if (c == graph_sites.size()) break;
86+
if (n == num_nodes) TORCH_CHECK(false, "Graph recording failed");
87+
};
88+
89+
// Destroy capture stream
90+
cuda_check(cudaStreamDestroy(capture_stream));
91+
92+
// Graph is ready
93+
ready = true;
94+
}
95+
96+
void Graph::record_param(void* kernel, int param_id, int param_offset)
97+
{
98+
graph_sites.push_back(std::tuple<void*, int, int>(kernel, param_id, param_offset));
99+
}
100+
101+
void Graph::launch(std::vector<PPTR> params, cudaStream_t stream)
102+
{
103+
int p = 0;
104+
int n = 0;
105+
while (true)
106+
{
107+
if (std::get<1>(graph_node_sites[n]) == std::get<0>(params[p]))
108+
{
109+
if (std::get<0>(params[p]) != GP_end)
110+
{
111+
void* new_value = std::get<1>(params[p]);
112+
int node_idx = std::get<0>(graph_node_sites[n]);
113+
int param_offset = std::get<2>(graph_node_sites[n]);
114+
115+
// DBGI3(p, node_idx, param_offset);
116+
117+
void** p_old_value = (void**) node_params[node_idx].kernelParams[param_offset];
118+
if (*p_old_value != new_value)
119+
{
120+
*p_old_value = new_value;
121+
node_needs_update[node_idx] = true;
122+
}
123+
}
124+
else
125+
{
126+
// DBGI(p);
127+
}
128+
p++;
129+
}
130+
131+
n++;
132+
if (p == params.size()) break;
133+
if (n == graph_node_sites.size()) TORCH_CHECK(false, "Graph update failed");
134+
}
135+
136+
for (int n = 0; n < nodes.size(); ++n)
137+
{
138+
// DBGI(n);
139+
if (!node_needs_update[n]) continue;
140+
// printf("update\n");
141+
cudaGraphExecKernelNodeSetParams(graph_exec, nodes[n], &node_params[n]);
142+
node_needs_update[n] = false;
143+
}
144+
145+
cudaGraphLaunch(graph_exec, stream);
146+
}
147+
148+
void Graph::inspect_graph()
149+
{
150+
// Get the number of nodes in the graph
151+
size_t numNodes;
152+
cudaGraphGetNodes(graph, nullptr, &numNodes);
153+
154+
// Get the nodes in the graph
155+
std::vector<cudaGraphNode_t> nodes(numNodes);
156+
cudaGraphGetNodes(graph, nodes.data(), &numNodes);
157+
DBGI(nodes.size());
158+
159+
// Inspect each node
160+
for (size_t i = 0; i < numNodes; ++i)
161+
{
162+
cudaGraphNodeType nodeType;
163+
cudaGraphNodeGetType(nodes[i], &nodeType);
164+
165+
if (nodeType == cudaGraphNodeTypeKernel)
166+
{
167+
cudaKernelNodeParams nodeParams;
168+
cudaGraphKernelNodeGetParams(nodes[i], &nodeParams);
169+
std::cout << "Kernel node " << i << ":" << std::endl;
170+
std::cout << " Function pointer: " << nodeParams.func << std::endl;
171+
std::cout << " Grid dimensions: (" << nodeParams.gridDim.x << ", " << nodeParams.gridDim.y << ", " << nodeParams.gridDim.z << ")" << std::endl;
172+
std::cout << " Block dimensions: (" << nodeParams.blockDim.x << ", " << nodeParams.blockDim.y << ", " << nodeParams.blockDim.z << ")" << std::endl;
173+
std::cout << " Shared memory: " << nodeParams.sharedMemBytes << " bytes" << std::endl;
174+
175+
} else {
176+
std::cout << "Node " << i << " is not a kernel node." << std::endl;
177+
}
178+
}
179+
}
180+
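Graph captures a sequence of kernel launches into a CUDA graph and, on later launches, patches the recorded kernel pointer arguments in place (record_param/launch) before replaying, avoiding per-kernel launch overhead. The class is internal C++ and is not exposed in the binding changes shown above; for intuition only, the sketch below shows the same capture-and-replay idea using PyTorch's public torch.cuda.CUDAGraph API, which is not the code added here.

import torch

# Static buffers: a captured graph always reads and writes the same addresses,
# so new inputs are copied into these tensors before each replay.
static_x = torch.randn(1024, device = "cuda")
static_y = torch.randn(1024, device = "cuda")
static_z = torch.empty(1024, device = "cuda")

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    torch.add(static_x, static_y, out = static_z)

static_x.fill_(1.0)
static_y.fill_(1.0)
g.replay()  # re-runs the captured launch with the updated buffer contents
assert torch.allclose(static_z, torch.full((1024,), 2.0, device = "cuda"))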
