Expose offload_kqv to control GPU KV cache & KQV ops

brittlewis12 · brittlewis12 · commit b10bd0abded5 · 2024-09-25T09:19:42.000-04:00
diff --git a/llama-cpp-2/src/context/params.rs b/llama-cpp-2/src/context/params.rs
@@ -227,6 +227,36 @@ impl LlamaContextParams {
         self.context_params.flash_attn
     }
 
+    /// Choose whether the KV cache & KQV ops are offloaded to the GPU (the `offload_kqv` parameter).
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// use llama_cpp_2::context::params::LlamaContextParams;
+    /// let params = LlamaContextParams::default().with_offload_kqv(false);
+    /// assert!(!params.offload_kqv());
+    /// ```
+    #[must_use]
+    pub fn with_offload_kqv(self, enabled: bool) -> Self {
+        let mut updated = self;
+        updated.context_params.offload_kqv = enabled;
+        updated
+    }
+
+    /// Report the current value of the `offload_kqv` parameter.
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// use llama_cpp_2::context::params::LlamaContextParams;
+    /// assert!(LlamaContextParams::default().offload_kqv());
+    /// ```
+    #[must_use]
+    pub fn offload_kqv(&self) -> bool {
+        let inner = &self.context_params;
+        inner.offload_kqv
+    }
+
     /// Set the type of rope scaling.
     ///
     /// # Examples