
Commit d8fa1a8

Support partial_rotary_factor (Phi-4 mini)

1 parent 2e630ae

File tree: 10 files changed, +63 −36 lines
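This commit plumbs a partial rotary factor from the model config down to the RoPE kernels: only the first int(head_dim * partial_rotary_factor) features of each head are rotated, and the sin/cos tables shrink to that width (passed into the extension as sincos_size). A minimal sketch of the arithmetic, with illustrative values not taken from any particular model config:

# Illustrative values only; a real model supplies head_dim and
# partial_rotary_factor via its config.json (the factor defaults to 1.0).
head_dim = 128
partial_rotary_factor = 0.75

rotary_dim = int(head_dim * partial_rotary_factor)  # width of the rotated slice
print(rotary_dim)             # 96 -> rotated with RoPE
print(head_dim - rotary_dim)  # 32 -> passed through unchanged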

exllamav2/attn.py
Lines changed: 1 addition & 0 deletions

@@ -317,6 +317,7 @@ def load(self, device_context: bool = True):
     cfg.max_seq_len,
     self.has_residual,
     self.archparams.rope_style.value,
+    int(cfg.head_dim * cfg.partial_rotary_factor),
     q_norm,
     k_norm,
     post_norm_weight,

exllamav2/config.py
Lines changed: 3 additions & 0 deletions

@@ -127,6 +127,7 @@ class ExLlamaV2Config:
     checkpoint_offset_qzeros: bool
     mrope_section: list | None
     attention_multiplier: float | None
+    partial_rotary_factor: float | None

     vision_model_type: str | None
     vision_head_dim: int | None

@@ -361,6 +362,8 @@ def prepare(self, no_tensors: bool = False):
     self.sliding_window = read(read_config, int, ["sliding_window", "sliding_window_size"], 0, opt_subkey = "text_config")
     self.sliding_window_pattern = read(read_config, int, ["sliding_window_pattern"], 1)

+    self.partial_rotary_factor = read(read_config, float, "partial_rotary_factor", 1.0)
+
     rs = read(read_config, dict, "rope_scaling", None)
     if rs:
         scaling_type = rs.get("type", None)
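For reference, the new config field behaves like a plain optional entry in config.json: if it is absent the factor defaults to 1.0 and the full head dimension is rotated. A rough standalone sketch of that lookup (the actual `read` helper in exllamav2/config.py handles more cases, such as nested subkeys):

import json

# Hypothetical standalone equivalent of the read(...) call above.
with open("config.json") as f:
    cfg_json = json.load(f)

partial_rotary_factor = float(cfg_json.get("partial_rotary_factor", 1.0))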

exllamav2/exllamav2_ext/cuda/q_attn.cu
Lines changed: 3 additions & 0 deletions

@@ -104,6 +104,7 @@ QAttn::QAttn
     int _max_seq_len,
     bool _has_residual,
     int _rope_style,
+    int _sincos_size,
     half* _q_norm,
     half* _k_norm,
     half* _post_layernorm,

@@ -132,6 +133,7 @@ QAttn::QAttn
     max_seq_len(_max_seq_len),
     has_residual(_has_residual),
     rope_style(_rope_style),
+    sincos_size(_sincos_size),
     q_norm(_q_norm),
     k_norm(_k_norm),
     post_layernorm(_post_layernorm),

@@ -305,6 +307,7 @@ void QAttn::forward_cuda_1_run
         past_len,
         past_lens,
         rope_style == ROPE_STYLE_NEOX,
+        sincos_size,
         graph,
         KernelLabels::ROPE
     );

exllamav2/exllamav2_ext/cuda/q_attn.cuh
Lines changed: 2 additions & 0 deletions

@@ -76,6 +76,7 @@ public:
     bool has_residual;
    bool residual_fp32;
    int rope_style;
+    int sincos_size;

    bool use_graphs;
    std::unordered_map<QAttn_params_const, Graph*, QAttn_params_const_hash> graph_map;

@@ -103,6 +104,7 @@ public:
     int _max_seq_len,
     bool _has_residual,
     int _rope_style,
+    int _sincos_size,
     half* _q_norm,
     half* _k_norm,
     half* _post_layernorm,

exllamav2/exllamav2_ext/cuda/rope.cu
Lines changed: 27 additions & 19 deletions

@@ -17,15 +17,16 @@ __forceinline__ __device__ void rope_cuda_arr_neox
     int num_heads,
     int past_len,
     const int32_t* __restrict__ past_lens,
-    int threads_y
+    int threads_y,
+    int sincos_size
 )
 {
     MatrixView_half_rw x_(x, MAX_ROWS, head_dim);
-    MatrixView_half sin_(sin, MAX_POS_EMBEDDINGS, head_dim);
-    MatrixView_half cos_(cos, MAX_POS_EMBEDDINGS, head_dim);
+    MatrixView_half sin_(sin, MAX_POS_EMBEDDINGS, sincos_size);
+    MatrixView_half cos_(cos, MAX_POS_EMBEDDINGS, sincos_size);

     int column = (blockIdx.x * THREADS_X + threadIdx.x) * 2;
-    int half_dim = head_dim / 2;
+    int half_dim = sincos_size / 2;
     if (column >= half_dim) return;

     int row = blockIdx.y * threads_y + threadIdx.y;

@@ -76,15 +77,16 @@ __forceinline__ __device__ void rope_cuda_arr_gptj
     int num_heads,
     int past_len,
     const int32_t* __restrict__ past_lens,
-    int threads_y
+    int threads_y,
+    int sincos_size
 )
 {
     MatrixView_half_rw x_(x, MAX_ROWS, head_dim);
-    MatrixView_half sin_(sin, MAX_POS_EMBEDDINGS, head_dim);
-    MatrixView_half cos_(cos, MAX_POS_EMBEDDINGS, head_dim);
+    MatrixView_half sin_(sin, MAX_POS_EMBEDDINGS, sincos_size);
+    MatrixView_half cos_(cos, MAX_POS_EMBEDDINGS, sincos_size);

     int column = (blockIdx.x * THREADS_X + threadIdx.x) * 2;
-    if (column >= head_dim) return;
+    if (column >= sincos_size) return;

     int row = blockIdx.y * threads_y + threadIdx.y;
     if (row >= rows_per_batch) return;

@@ -131,13 +133,14 @@ __global__ void rope_cuda_kernel
     int past_len,
     const int32_t* __restrict__ past_lens,
     int threads_y,
-    const bool neox_style
+    const bool neox_style,
+    int sincos_size
 )
 {
     if (neox_style)
-        rope_cuda_arr_neox(x, sin, cos, rows_per_batch, head_dim, num_heads, past_len, past_lens, threads_y);
+        rope_cuda_arr_neox(x, sin, cos, rows_per_batch, head_dim, num_heads, past_len, past_lens, threads_y, sincos_size);
     else
-        rope_cuda_arr_gptj(x, sin, cos, rows_per_batch, head_dim, num_heads, past_len, past_lens, threads_y);
+        rope_cuda_arr_gptj(x, sin, cos, rows_per_batch, head_dim, num_heads, past_len, past_lens, threads_y, sincos_size);
 }

 __global__ void rope_cuda_qk_kernel

@@ -154,18 +157,19 @@ __global__ void rope_cuda_qk_kernel
     int past_len,
     const int32_t* __restrict__ past_lens,
     int threads_y,
-    const bool neox_style
+    const bool neox_style,
+    int sincos_size
 )
 {
     if (neox_style)
     {
-        rope_cuda_arr_neox(x_q, sin, cos, rows_per_batch_q, head_dim, num_heads_q, past_len, past_lens, threads_y);
-        rope_cuda_arr_neox(x_k, sin, cos, rows_per_batch_k, head_dim, num_heads_k, past_len, past_lens, threads_y);
+        rope_cuda_arr_neox(x_q, sin, cos, rows_per_batch_q, head_dim, num_heads_q, past_len, past_lens, threads_y, sincos_size);
+        rope_cuda_arr_neox(x_k, sin, cos, rows_per_batch_k, head_dim, num_heads_k, past_len, past_lens, threads_y, sincos_size);
     }
     else
     {
-        rope_cuda_arr_gptj(x_q, sin, cos, rows_per_batch_q, head_dim, num_heads_q, past_len, past_lens, threads_y);
-        rope_cuda_arr_gptj(x_k, sin, cos, rows_per_batch_k, head_dim, num_heads_k, past_len, past_lens, threads_y);
+        rope_cuda_arr_gptj(x_q, sin, cos, rows_per_batch_q, head_dim, num_heads_q, past_len, past_lens, threads_y, sincos_size);
+        rope_cuda_arr_gptj(x_k, sin, cos, rows_per_batch_k, head_dim, num_heads_k, past_len, past_lens, threads_y, sincos_size);
     }
 }

@@ -181,7 +185,8 @@ void rope_cuda
     const int num_heads,
     const int past_len,
     const int32_t* past_lens,
-    const bool neox_style
+    const bool neox_style,
+    int sincos_size
 )
 {
     // For large batch sizes we risk exceeding grid dimension of 65535, so shift to block dimension instead

@@ -207,7 +212,8 @@ void rope_cuda
         past_len,
         past_lens,
         threads_y,
-        neox_style
+        neox_style,
+        sincos_size
     );
 }

@@ -227,6 +233,7 @@ void rope_cuda_qk
     const int past_len,
     const int32_t* past_lens,
     const bool neox_style,
+    int sincos_size,
     Graph* graph,
     int label
 )

@@ -258,7 +265,8 @@ void rope_cuda_qk
         past_len,
         past_lens,
         threads_y,
-        neox_style
+        neox_style,
+        sincos_size
     );

     if (graph) graph->attach_label(stream, label, 0);
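The kernel changes above size the sin/cos matrix views to sincos_size and bail out for columns past that width, so features beyond the rotary slice are left untouched. As a plain PyTorch reference for the NeoX-style path (a sketch of the same idea, not the CUDA kernel itself, and assuming sin/cos are already expanded to rotary_dim columns in the usual duplicated-halves layout):

import torch

def partial_rope_neox(x: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor, rotary_dim: int) -> torch.Tensor:
    # x: (..., head_dim); sin/cos: (..., rotary_dim), broadcastable against x.
    x_rot, x_pass = x[..., :rotary_dim], x[..., rotary_dim:]
    half = rotary_dim // 2
    x1, x2 = x_rot[..., :half], x_rot[..., half:]
    rotated = torch.cat((-x2, x1), dim=-1)   # NeoX-style "rotate half"
    x_rot = x_rot * cos + rotated * sin
    # Features past rotary_dim are concatenated back unrotated.
    return torch.cat((x_rot, x_pass), dim=-1)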

exllamav2/exllamav2_ext/cuda/rope.cuh
Lines changed: 3 additions & 1 deletion

@@ -19,7 +19,8 @@ void rope_cuda
     const int num_heads,
     const int past_len,
     const int32_t* past_lens,
-    const bool neox_style
+    const bool neox_style,
+    int sincos_size
 );

 void rope_cuda_qk

@@ -38,6 +39,7 @@ void rope_cuda_qk
     const int past_len,
     const int32_t* past_lens,
     const bool neox_style,
+    int sincos_size,
     Graph* graph = NULL,
     int label = 0
 );

exllamav2/exllamav2_ext/ext_qattn.cpp
Lines changed: 6 additions & 2 deletions

@@ -44,6 +44,7 @@ uintptr_t make_q_attn
     int max_seq_len,
     bool has_residual,
     int rope_style,
+    int sincos_size,
     torch::Tensor q_norm,
     torch::Tensor k_norm,
     torch::Tensor post_layernorm,

@@ -88,6 +89,7 @@ uintptr_t make_q_attn
         max_seq_len,
         has_residual,
         rope_style,
+        sincos_size,
         q_norm.is_meta() ? NULL : (half*) q_norm.data_ptr(),
         k_norm.is_meta() ? NULL : (half*) k_norm.data_ptr(),
         post_layernorm.is_meta() ? NULL : (half*) post_layernorm.data_ptr(),

@@ -377,7 +379,8 @@ void tp_attn_forward_paged_
             num_kv_heads,
             0, //past_len,
             (int32_t*) past_lens[i].data_ptr(),
-            rope_style == ROPE_STYLE_NEOX
+            rope_style == ROPE_STYLE_NEOX,
+            head_dim // TODO: partial_rotary_factor
         );
     }
 }

@@ -613,7 +616,8 @@ void tp_attn_forward_
             num_kv_heads,
             0, //past_len,
             (int32_t*) past_len_tp[i].data_ptr(),
-            rope_style == ROPE_STYLE_NEOX
+            rope_style == ROPE_STYLE_NEOX,
+            head_dim // TODO: partial_rotary_factor
         );
     }
 }

exllamav2/exllamav2_ext/ext_qattn.h
Lines changed: 1 addition & 0 deletions

@@ -22,6 +22,7 @@ uintptr_t make_q_attn
     int max_seq_len,
     bool has_residual,
     int rope_style,
+    int sincos_size,
     torch::Tensor q_norm,
     torch::Tensor k_norm,
     torch::Tensor post_layernorm,

exllamav2/exllamav2_ext/ext_rope.cpp
Lines changed: 6 additions & 3 deletions

@@ -32,12 +32,14 @@ void rope_
     TORCH_CHECK_DTYPE(x, kHalf);
     TORCH_CHECK_DTYPE(sin, kHalf);
     TORCH_CHECK_DTYPE(cos, kHalf);
-    TORCH_CHECK(head_dim == cos.size(-1), "cos table does not match head_dim");
-    TORCH_CHECK(head_dim == sin.size(-1), "sin table does not match head_dim");
+    // TORCH_CHECK(head_dim == cos.size(-1), "cos table does not match head_dim");
+    // TORCH_CHECK(head_dim == sin.size(-1), "sin table does not match head_dim");
+    TORCH_CHECK(cos.size(-1) == sin.size(-1), "sin table does not match cos table");
     TORCH_CHECK_DTYPE_OPT(offsets, kInt);

     int batch_size = x.size(0);
     int rows_per_batch = x.numel() / head_dim / batch_size;
+    int sincos_size = cos.size(-1);

     const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();

@@ -54,7 +56,8 @@ void rope_
         num_heads,
         past_len,
         offsets.device().is_meta() ? NULL : (int32_t*) offsets.data_ptr(),
-        neox_style
+        neox_style,
+        sincos_size
     );
 }

exllamav2/rope.py
Lines changed: 11 additions & 11 deletions

@@ -13,7 +13,7 @@ def get_rope_params_su(
     device: torch.Device,
     cfg: ExLlamaV2Config,
 ):
-    head_dim = cfg.head_dim
+    head_dim = int(cfg.head_dim * cfg.partial_rotary_factor)
     base = cfg.rotary_embedding_base
     if cfg.scale_alpha_value and cfg.scale_alpha_value != 1.0:
         base *= cfg.scale_alpha_value ** (cfg.head_dim / (cfg.head_dim - 2))

@@ -36,7 +36,7 @@ def get_rope_params_llama3(
     device: torch.Device,
     cfg: ExLlamaV2Config,
 ):
-    head_dim = cfg.head_dim
+    head_dim = int(cfg.head_dim * cfg.partial_rotary_factor)
     base = cfg.rotary_embedding_base
     if cfg.scale_alpha_value and cfg.scale_alpha_value != 1.0:
         base *= cfg.scale_alpha_value ** (cfg.head_dim / (cfg.head_dim - 2))

@@ -81,7 +81,8 @@ def get_rope_params_yarn(
     device: torch.Device,
     cfg: ExLlamaV2Config,
 ):
-    head_dim = cfg.head_dim
+    head_dim = int(cfg.head_dim * cfg.partial_rotary_factor)
+
     base = cfg.rotary_embedding_base
     if cfg.scale_alpha_value and cfg.scale_alpha_value != 1.0:
         base *= cfg.scale_alpha_value ** (cfg.head_dim / (cfg.head_dim - 2))

@@ -91,9 +92,7 @@ def get_rope_params_yarn(
     # Only activate if longer than original ctx
     if cfg.max_seq_len > cfg.yarn_rope_original_max_position_embeddings:

-        partial_rotary_factor = 1.0 # Placeholder, assume no partial_rotary_factor in config.
-        dim = int(head_dim * partial_rotary_factor)
-
+        head_dim = int(cfg.head_dim * cfg.partial_rotary_factor)
         factor = cfg.yarn_rope_factor

         # Sets the attention factor as suggested in the paper

@@ -126,14 +125,14 @@ def linear_ramp_factor(min, max, dim):

         # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
         # to expand the possible context length. In other words, interpolation = apply scaling factor.
-        pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim)
+        pos_freqs = base ** (torch.arange(0, head_dim, 2).float().to(device) / head_dim)
         inv_freq_extrapolation = 1.0 / pos_freqs
         inv_freq_interpolation = 1.0 / (factor * pos_freqs)

-        low, high = find_correction_range(beta_fast, beta_slow, dim, base, yarn_max_position_embeddings)
+        low, high = find_correction_range(beta_fast, beta_slow, head_dim, base, yarn_max_position_embeddings)

         # Get n-dimensional rotational scaling corrected for extrapolation
-        inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float().to(device)
+        inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, head_dim // 2).float().to(device)
         inv_freq = (
             inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
             + inv_freq_extrapolation * inv_freq_extrapolation_factor

@@ -150,10 +149,11 @@ def get_rope_params_default(
     device: torch.Device,
     cfg: ExLlamaV2Config,
 ):
-    head_dim = cfg.head_dim
+    head_dim = int(cfg.head_dim * cfg.partial_rotary_factor)
+
     base = cfg.rotary_embedding_base
     if cfg.scale_alpha_value and cfg.scale_alpha_value != 1.0:
-        base *= cfg.scale_alpha_value ** (cfg.head_dim / (cfg.head_dim - 2))
+        base *= cfg.scale_alpha_value ** (head_dim / (head_dim - 2))

     inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device = device).float() / head_dim))
     return inv_freq, 1.0
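With the rope.py changes, every rope-params path builds inv_freq over the reduced rotary width, so the resulting sin/cos tables come out rotary_dim columns wide; that width is what attn.py passes to the extension as sincos_size. A small sketch of the default path with illustrative numbers:

import torch

head_dim = 128
partial_rotary_factor = 0.75   # illustrative; read from config, defaulting to 1.0
base = 10000.0

rotary_dim = int(head_dim * partial_rotary_factor)
inv_freq = 1.0 / (base ** (torch.arange(0, rotary_dim, 2).float() / rotary_dim))
print(inv_freq.shape)  # torch.Size([48]) -> sin/cos tables end up 96 columns wide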

0 commit comments
