
Commit d761188

applied changes from header, still need to update C#

1 parent 40ea046

7 files changed: +83 additions, -45 deletions

LLama/Native/LLamaNativeBatch.cs

Lines changed: 3 additions & 7 deletions
@@ -25,6 +25,7 @@ public unsafe struct LLamaNativeBatch

     /// <summary>
     /// the positions of the respective token in the sequence
+    /// (if set to NULL, the token position will be tracked automatically by llama_decode)
     /// </summary>
     public LLamaPos* pos;

@@ -35,18 +36,13 @@ public unsafe struct LLamaNativeBatch

     /// <summary>
     /// the sequence to which the respective token belongs
+    /// (if set to NULL, the sequence ID will be assumed to be 0)
     /// </summary>
     public LLamaSeqId** seq_id;

     /// <summary>
     /// if zero, the logits for the respective token will not be output
+    /// (if set to NULL, only the logits for the last token will be returned)
     /// </summary>
     public byte* logits;
-
-    // Note from llama.cpp:
-    // > helpers for smooth API transition - can be deprecated in the future
-    // > for future-proof code, use the above fields instead and ignore everything below
-    private LLamaPos _all_pos_0;
-    private LLamaPos _all_pos_1;
-    private LLamaSeqId _all_seq_id;
 }
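The new NULL semantics simplify batch construction: a caller can now leave pos, seq_id and logits unset instead of filling the removed _all_* compatibility fields. A minimal sketch of what that could look like, assuming the struct also carries llama_batch's n_tokens and token fields (not shown in this diff) and that a llama_decode binding exists elsewhere:

    // Sketch only: decode one token, letting llama_decode fill in the defaults.
    unsafe
    {
        LLamaToken tok = GetNextToken(); // hypothetical source of the next token

        var batch = new LLamaNativeBatch
        {
            n_tokens = 1,   // assumed field, mirroring llama_batch.n_tokens
            token = &tok,   // assumed field, mirroring llama_batch.token
            pos = null,     // NULL: positions tracked automatically by llama_decode
            seq_id = null,  // NULL: sequence ID assumed to be 0
            logits = null,  // NULL: only the logits for the last token are returned
        };

        // llama_decode(ctx, batch); // native call, binding assumed
    }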

LLama/Native/LLamaPoolingType.cs

Lines changed: 5 additions & 0 deletions
@@ -29,4 +29,9 @@ public enum LLamaPoolingType
     CLS = 2,

     Last = 3,
+
+    /// <summary>
+    /// Used by reranking models to attach the classification head to the graph
+    /// </summary>
+    Rank,
 }
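Because Rank is appended without an explicit value, it takes the next value after Last = 3, i.e. 4, which matches LLAMA_POOLING_TYPE_RANK in llama.cpp. A trivial check:

    // Rank follows Last = 3, so it resolves to 4 (LLAMA_POOLING_TYPE_RANK).
    LLamaPoolingType pooling = LLamaPoolingType.Rank;
    Console.WriteLine((int)pooling); // 4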

LLama/Native/LLamaVocabPreType.cs

Lines changed: 1 addition & 0 deletions
@@ -33,4 +33,5 @@ internal enum LLamaVocabPreType
     BLOOM = 23,
     GPT3_FINNISH = 24,
     EXAONE = 25,
+    CHAMELEON = 26,
 }

LLama/Native/NativeApi.cs

Lines changed: 8 additions & 0 deletions
@@ -49,6 +49,14 @@ public static void llama_empty_call()
     [return: MarshalAs(UnmanagedType.U1)]
     public static extern bool llama_supports_gpu_offload();

+    /// <summary>
+    /// Check if RPC offload is supported
+    /// </summary>
+    /// <returns></returns>
+    [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+    [return: MarshalAs(UnmanagedType.U1)]
+    public static extern bool llama_supports_rpc();
+
     /// <summary>
     /// Initialize the llama + ggml backend. Call once at the start of the program.
     ///
LLama/Native/SafeLLamaContextHandle.cs

Lines changed: 4 additions & 2 deletions
@@ -368,8 +368,10 @@ static SafeLLamaContextHandle()
     private static extern LLamaPoolingType llama_pooling_type(SafeLLamaContextHandle ctx);

     /// <summary>
-    /// Get the embeddings for the a specific sequence.
-    /// Equivalent to: llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
+    /// Get the embeddings for a sequence id.
+    /// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE.
+    /// When pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence.
+    /// Otherwise: float[n_embd] (1-dimensional).
     /// </summary>
     /// <returns>A pointer to the first float in an embedding, length = ctx.EmbeddingSize</returns>
     [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
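With the new Rank pooling mode, this entry point doubles as a reranking score reader. A sketch of the three cases from the updated doc comment; the extern is assumed to bind llama.cpp's llama_get_embeddings_seq, and seq is some previously decoded LLamaSeqId:

    // Sketch only: interpret the per-sequence output by pooling type.
    unsafe
    {
        var pooling = llama_pooling_type(ctx);              // extern shown above
        float* output = llama_get_embeddings_seq(ctx, seq); // assumed binding name

        if (output == null)
        {
            // LLAMA_POOLING_TYPE_NONE: no pooled output for this sequence
        }
        else if (pooling == LLamaPoolingType.Rank)
        {
            float score = output[0]; // float[1]: the rank of the sequence
        }
        else
        {
            // float[n_embd]: one pooled embedding vector
            var embedding = new ReadOnlySpan<float>(output, ctx.EmbeddingSize);
        }
    }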

LLama/Native/SafeLLamaSamplerHandle.cs

Lines changed: 48 additions & 19 deletions
@@ -249,19 +249,6 @@ public void AddMirostat2Sampler(uint seed, float tau, float eta)
         static extern IntPtr llama_sampler_init_mirostat_v2(uint seed, float tau, float eta);
     }

-
-    /// <summary>
-    /// Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    /// </summary>
-    /// <returns></returns>
-    public void AddSoftmax()
-    {
-        llama_sampler_chain_add(this, llama_sampler_init_softmax());
-
-        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
-        static extern IntPtr llama_sampler_init_softmax();
-    }
-
     /// <summary>
     /// Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     /// </summary>

@@ -291,7 +278,6 @@ public void AddTopP(float p, nint minKeep)
     /// <summary>
     /// Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
     /// </summary>
-    /// <returns></returns>
     public void AddMinP(float p, nint minKeep)
     {
         llama_sampler_chain_add(this, llama_sampler_init_min_p(p, minKeep));
@@ -305,7 +291,6 @@ public void AddMinP(float p, nint minKeep)
     /// <summary>
     /// Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/
     /// </summary>
-    /// <returns></returns>
    public void AddTailFree(float z, nint minKeep)
    {
        llama_sampler_chain_add(this, llama_sampler_init_tail_free(z, minKeep));
@@ -319,7 +304,6 @@ public void AddTailFree(float z, nint minKeep)
     /// <summary>
     /// Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
     /// </summary>
-    /// <returns></returns>
     public void AddTypical(float p, nint minKeep)
     {
         llama_sampler_chain_add(this, llama_sampler_init_typical(p, minKeep));

@@ -331,14 +315,15 @@ public void AddTypical(float p, nint minKeep)
     }

     /// <summary>
-    /// Apply temperature to the logits
+    /// Apply temperature to the logits.
+    /// If temperature is less than zero the maximum logit is left unchanged and the rest are set to -infinity
     /// </summary>
     /// <param name="t"></param>
-    /// <returns></returns>
     public void AddTemperature(float t)
     {
         llama_sampler_chain_add(this, llama_sampler_init_temp(t));

+        // #details Updates the logits l_i' = l_i/t. When t <= 0.0f, the maximum logit is kept at its original value, the rest are set to -inf
         [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
         static extern IntPtr llama_sampler_init_temp(float t);
     }

@@ -349,7 +334,6 @@ public void AddTemperature(float t)
     /// <param name="t"></param>
     /// <param name="delta"></param>
     /// <param name="exponent"></param>
-    /// <returns></returns>
     public void AddDynamicTemperature(float t, float delta, float exponent)
     {
         llama_sampler_chain_add(this, llama_sampler_init_temp_ext(t, delta, exponent));

@@ -358,6 +342,51 @@ public void AddDynamicTemperature(float t, float delta, float exponent)
         static extern IntPtr llama_sampler_init_temp_ext(float t, float delta, float exponent);
     }

+    /// <summary>
+    /// XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
+    /// </summary>
+    /// <param name="p"></param>
+    /// <param name="t"></param>
+    /// <param name="minKeep"></param>
+    /// <param name="seed"></param>
+    public void AddXTC(float p, float t, int minKeep, uint seed)
+    {
+        llama_sampler_chain_add(this, llama_sampler_init_xtc(p, t, minKeep, seed));
+
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        static extern IntPtr llama_sampler_init_xtc(float p, float t, nint minKeep, uint seed);
+    }
+
+    /// <summary>
+    /// This sampler is meant to be used for fill-in-the-middle infilling, after top_k + top_p sampling<br />
+    /// 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG<br />
+    /// 2. combine probs of tokens that have the same prefix<br />
+    /// <br />
+    /// example:<br />
+    /// <br />
+    /// - before:<br />
+    ///   "hel": 0.5<br />
+    ///   "hell": 0.2<br />
+    ///   "hello": 0.1<br />
+    ///   "dummy": 0.1<br />
+    /// <br />
+    /// - after:<br />
+    ///   "hel": 0.8<br />
+    ///   "dummy": 0.1<br />
+    /// <br />
+    /// 3. discard non-EOG tokens with low prob<br />
+    /// 4. if no tokens are left -> pick EOT
+    /// </summary>
+    /// <param name="model"></param>
+    public void AddFillInMiddleInfill(SafeLlamaModelHandle model)
+    {
+        llama_sampler_chain_add(this, llama_sampler_init_infill(model));
+
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        static extern IntPtr llama_sampler_init_infill(SafeLlamaModelHandle model);
+    }
+
     /// <summary>
     /// Create a sampler which makes tokens impossible unless they match the grammar
     /// </summary>
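With AddSoftmax removed, a chain is now built purely from the remaining Add* methods. A sketch of a chain using the new XTC sampler; the factory for obtaining the chain handle is hypothetical, while the AddMinP, AddXTC and AddTemperature calls and their parameters come from the diffs above:

    // Sketch only: compose a sampler chain with the new XTC stage.
    var chain = CreateSamplerChain(); // hypothetical factory for this handle type

    chain.AddMinP(0.05f, minKeep: 1);     // drop tokens far below the top probability
    chain.AddXTC(p: 0.5f, t: 0.1f, minKeep: 1, seed: 42); // "exclude top choices"
    chain.AddTemperature(0.8f);           // t < 0 would keep only the maximum logit

XTC probabilistically removes the most likely tokens above the threshold t, so it is typically placed after truncation samplers and before the final pick.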

LLama/Native/SafeLlamaModelHandle.cs

Lines changed: 14 additions & 17 deletions
@@ -386,32 +386,29 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k
     private static extern LLamaToken llama_token_pad(SafeLlamaModelHandle model);

     /// <summary>
-    /// codellama infill tokens, Beginning of infill prefix
+    /// codellama infill tokens, End of infill middle
     /// </summary>
     /// <returns></returns>
     [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
-    private static extern int llama_token_prefix(SafeLlamaModelHandle model);
+    private static extern int llama_token_eot(SafeLlamaModelHandle model);

-    /// <summary>
-    /// codellama infill tokens, Beginning of infill middle
-    /// </summary>
-    /// <returns></returns>
     [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
-    private static extern int llama_token_middle(SafeLlamaModelHandle model);
+    private static extern int llama_token_fim_pre(SafeLlamaModelHandle model);

-    /// <summary>
-    /// codellama infill tokens, Beginning of infill suffix
-    /// </summary>
-    /// <returns></returns>
     [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
-    private static extern int llama_token_suffix(SafeLlamaModelHandle model);
+    private static extern int llama_token_fim_suf(SafeLlamaModelHandle model);

-    /// <summary>
-    /// codellama infill tokens, End of infill middle
-    /// </summary>
-    /// <returns></returns>
     [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
-    private static extern int llama_token_eot(SafeLlamaModelHandle model);
+    private static extern int llama_token_fim_mid(SafeLlamaModelHandle model);
+
+    [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+    private static extern int llama_token_fim_pad(SafeLlamaModelHandle model);
+
+    [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+    private static extern int llama_token_fim_rep(SafeLlamaModelHandle model);
+
+    [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+    private static extern int llama_token_fim_sep(SafeLlamaModelHandle model);

     /// <summary>
     /// For encoder-decoder models, this function returns id of the token that must be provided
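The renamed externs track llama.cpp's move from the codellama-style prefix/middle/suffix naming to generic fim_* tokens (plus the new pad/rep/sep ones). A fill-in-the-middle prompt is conventionally assembled in pre/suf/mid order; in this sketch the token accessors and Tokenize helper are hypothetical, and only the native entry points above come from the diff:

    // Sketch only: canonical FIM prompt layout over the renamed tokens.
    var prompt = new List<LLamaToken>();
    prompt.Add(fimPre);                          // from llama_token_fim_pre
    prompt.AddRange(Tokenize(codeBeforeCursor)); // hypothetical tokenizer helper
    prompt.Add(fimSuf);                          // from llama_token_fim_suf
    prompt.AddRange(Tokenize(codeAfterCursor));
    prompt.Add(fimMid);                          // from llama_token_fim_mid: the model generates the middle from here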
