1 parent 58f36bf commit bbfd4e7
exllamav3/architecture/gemma3.py
@@ -387,6 +387,9 @@ def __init__(
         self.patches_per_image = patches_per_image
         self.tokens_per_side = tokens_per_side
 
+    def optimizer_targets(self):
+        raise NotImplementedError()
+
     @override
     def load(self, device: torch.device, **kwargs):
         pass
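Note on the gemma3.py hunk: the vision component gains an optimizer_targets() stub that raises NotImplementedError, presumably so code that collects quantization/optimizer targets fails loudly when it reaches this module. A minimal sketch of how a caller might treat that signal; collect_targets and the module list are hypothetical illustrations, not part of this commit, and it assumes optimizer_targets() returns a list.

# Hypothetical helper for illustration only: gather optimizer targets across
# modules, skipping any module that declares itself unsupported.
def collect_targets(modules):
    targets = []
    for m in modules:
        try:
            targets += m.optimizer_targets()
        except NotImplementedError:
            # e.g. the Gemma3 vision component after this change
            continue
    return targets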
exllamav3/architecture/mistral3.py
@@ -256,6 +256,8 @@ def __init__(
 
         self.register_submodule(self.merging_layer)
 
     def forward(
exllamav3/exllamav3_ext/gnd.cu
@@ -54,7 +54,8 @@ void gated_delta_net_fused_op_kernel
     size_t Ng,
     size_t Hk,
     size_t Hv
-){
+)
+{
     const size_t Nv = Nk * Ng;
     const size_t Fseg = 2 * Hk + 2 * Ng * Hv;  // per-khead segment in mixed_qkvz
     const size_t Fba = 2 * Ng;                 // per-khead segment in mixed_ba
exllamav3/exllamav3_ext/hgemm.cu
@@ -28,19 +28,19 @@ void hgemm
 
     TORCH_CHECK_DTYPE(a, kHalf);
     TORCH_CHECK_DTYPE(b, kHalf);
-    TORCH_CHECK_DIM(a, 2);
     TORCH_CHECK_DIM(b, 2);
-    TORCH_CHECK_DIM(c, 2);
-    TORCH_CHECK_SHAPES(a, 0, c, 0, 1);
-    TORCH_CHECK_SHAPES(a, 1, b, 0, 1);
-    TORCH_CHECK_SHAPES(b, 1, c, 1, 1);
+    // TORCH_CHECK_SHAPES(a, 0, c, 0, 1);
+    TORCH_CHECK_SHAPES(a, -1, b, 0, 1);
+    TORCH_CHECK_SHAPES(b, 1, c, -1, 1);
 
     const half* a_ptr = (const half*) a.data_ptr();
     const half* b_ptr = (const half*) b.data_ptr();
 
-    int size_m = a.size(0);
-    int size_k = a.size(1);
-    int size_n = b.size(1);
+    int size_m = 1;
+    int dim = a.dim();
+    for (int d = 0; d < dim - 1; ++d) size_m *= a.size(d);
+    int size_k = a.size(-1);
+    int size_n = b.size(-1);
 
     cublasHandle_t cublas_handle = at::cuda::getCurrentCUDABlasHandle();
     cublasSetStream(cublas_handle, stream);
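Note on the hgemm.cu hunk: the strict 2-D checks on a and c are dropped, all leading dimensions of a are folded into size_m, and only the last dimensions are matched against b. A minimal PyTorch sketch of the shape convention this implies, using plain matmul as a stand-in for the extension kernel (float32 on CPU here purely for illustration; the kernel itself runs fp16 through cuBLAS):

import torch

a = torch.randn(2, 8, 64)      # leading dims (2, 8) fold into m
b = torch.randn(64, 32)

m = a.numel() // a.shape[-1]   # size_m = 2 * 8 = 16
k = a.shape[-1]                # size_k = 64
n = b.shape[-1]                # size_n = 32

c_flat = a.reshape(m, k) @ b   # what the kernel computes internally
c = a @ b                      # what the caller sees, shape (2, 8, 32)
assert torch.allclose(c, c_flat.view(2, 8, n))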
exllamav3/exllamav3_ext/norm.cu
@@ -362,7 +362,7 @@ void gated_rms_norm
     TORCH_CHECK_DTYPE(g, kBFloat16);
     TORCH_CHECK_DIV(x, -1, 4);
     TORCH_CHECK_SHAPES(x, -1, w, 0, 1);
-    TORCH_CHECK_SHAPES_FULL(x, y);
+    // TORCH_CHECK_SHAPES_FULL(x, y);
     TORCH_CHECK_SHAPES_FULL(x, g);
 
     bool output_fp32 = y.dtype() == at::kFloat;
exllamav3/exllamav3_ext/quant/exl3_gemm.cu
@@ -46,7 +46,7 @@ int exl3_gemm
     cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
 
     TORCH_CHECK_DIM(B, 3);
-    TORCH_CHECK_SHAPES(A, 1, B, 0, 16);
+    TORCH_CHECK_SHAPES(A, -1, B, 0, 16);
     TORCH_CHECK_SHAPES(C, -1, B, 1, 16);
     // TORCH_CHECK_SHAPES(A, 0, C, 0, 1);
     TORCH_CHECK_DTYPE(A, kHalf);
@@ -82,8 +82,11 @@ int exl3_gemm
     const half* A_ptr = (const half*) A.data_ptr();
     const uint16_t* B_ptr = (const uint16_t*) B.data_ptr();
     void* C_ptr = (void*) C.data_ptr();
-    int size_m = A.size(0);
-    int size_k = A.size(1);
+    int size_m = 1;
+    int dim = A.dim();
+    for (int d = 0; d < dim - 1; ++d) size_m *= A.size(d);
+    int size_k = A.size(-1);
     int size_n = B.size(1) * 16;
 
     // Select kernel
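Note on the exl3_gemm.cu hunk: as in hgemm.cu, A may now carry arbitrary leading dimensions; size_m becomes their product and size_k the last dimension, which is checked against B.size(0) * 16. A small sketch of that size derivation (shapes are illustrative only; the real B is a packed 3-D uint16 weight tensor and is not constructed here, and the real A is fp16 on CUDA):

import math
import torch

A = torch.randn(4, 7, 256)         # any number of leading dims

size_m = math.prod(A.shape[:-1])   # 4 * 7 = 28: all leading dims fold into m
size_k = A.shape[-1]               # 256: must equal B.size(0) * 16
# size_n is still B.size(1) * 16, unchanged by this commit.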
exllamav3/modules/gated_delta_net.py
@@ -1,22 +1,16 @@
 from __future__ import annotations
-from dataclasses import dataclass
 from typing_extensions import override
 import torch
 import torch.nn.functional as F
 from ..model.config import Config
-from ..util.rope import RopeSettings, RoPE
 from ..util.tensor import get_for_device, to2
-from . import Module, Linear, RMSNorm, LayerNorm
-from ..constants import PAGE_SIZE
-from ..cache import Cache
-from flash_attn import flash_attn_func, flash_attn_with_kvcache
+from . import Module, Linear
 from ..util import profile_opt
-from .multilinear import MultiLinear
 from ..ext import exllamav3_ext as ext
 from ..model.model_tp_alloc import TPAllocation
-import torch.distributed as dist
 from .gated_rmsnorm import GatedRMSNorm
 from ..cache import CacheableState
+from ..util.tensor import g_tensor_cache
 
 """
 causal_conv1d wrappers and fallback functions