Commit 5523789

core: Add topK, topP, XTC sampling parameters
1 parent bb26957 commit 5523789

11 files changed, +183 -18 lines changed

app/src/main/java/io/shubham0204/smollmandroid/data/AppDB.kt

Lines changed: 17 additions & 2 deletions

@@ -1,19 +1,33 @@
 package io.shubham0204.smollmandroid.data
 
 import android.content.Context
+import androidx.room.AutoMigration
 import androidx.room.Database
 import androidx.room.Room
 import androidx.room.RoomDatabase
 import androidx.room.TypeConverters
+import androidx.room.migration.Migration
+import androidx.sqlite.db.SupportSQLiteDatabase
 import kotlinx.coroutines.Dispatchers
 import kotlinx.coroutines.flow.Flow
 import kotlinx.coroutines.runBlocking
 import org.koin.core.annotation.Single
 import java.util.Date
 
+val MIGRATION_1_2 =
+    object : Migration(1, 2) {
+        override fun migrate(database: SupportSQLiteDatabase) {
+            database.execSQL("ALTER TABLE Chat ADD COLUMN topK INTEGER NOT NULL DEFAULT 40")
+            database.execSQL("ALTER TABLE Chat ADD COLUMN topP REAL NOT NULL DEFAULT 0.9")
+            database.execSQL("ALTER TABLE Chat ADD COLUMN xtcP REAL NOT NULL DEFAULT 0.0")
+            database.execSQL("ALTER TABLE Chat ADD COLUMN xtcT REAL NOT NULL DEFAULT 1.0")
+        }
+    }
+
 @Database(
     entities = [Chat::class, ChatMessage::class, LLMModel::class, Task::class, Folder::class],
-    version = 1,
+    version = 2,
+    exportSchema = true,
 )
 @TypeConverters(Converters::class)
 abstract class AppRoomDatabase : RoomDatabase() {

@@ -38,7 +52,8 @@ class AppDB(
                 context,
                 AppRoomDatabase::class.java,
                 "app-database",
-            ).build()
+            ).addMigrations(MIGRATION_1_2)
+                .build()
 
    /** Get all chats from the database sorted by dateUsed in descending order. */
    fun getChats(): Flow<List<Chat>> = db.chatsDao().getChats()
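
Because the version bump is handled by a hand-written Migration rather than an AutoMigration, it can be verified with Room's MigrationTestHelper. The sketch below is not part of this commit: it assumes the room-testing artifact is on the androidTest classpath, that the exported schema JSONs for versions 1 and 2 (enabled above via exportSchema = true) are available to the instrumented test, and a hypothetical test-class name.

    import androidx.room.testing.MigrationTestHelper
    import androidx.test.platform.app.InstrumentationRegistry
    import org.junit.Rule
    import org.junit.Test

    class Migration1To2Test {
        @get:Rule
        val helper = MigrationTestHelper(
            InstrumentationRegistry.getInstrumentation(),
            AppRoomDatabase::class.java,
        )

        @Test
        fun migrate1To2_addsSamplingColumns() {
            // create a database at schema version 1, then close it
            helper.createDatabase("migration-test", 1).close()
            // run MIGRATION_1_2 and validate the result against the exported v2 schema
            val db = helper.runMigrationsAndValidate("migration-test", 2, true, MIGRATION_1_2)
            // the new columns should now exist on the Chat table
            db.query("SELECT topK, topP, xtcP, xtcT FROM Chat").close()
            db.close()
        }
    }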

app/src/main/java/io/shubham0204/smollmandroid/data/ChatsDB.kt

Lines changed: 6 additions & 2 deletions

@@ -59,11 +59,15 @@ data class Chat(
     /**
      * LLM inference parameters that are used for this chat.
      */
-    var minP: Float = 0.1f,
-    var temperature: Float = 0.8f,
+    var minP: Float = 0.05f,
+    var temperature: Float = 1.0f,
     var nThreads: Int = 4,
     var useMmap: Boolean = true,
     var useMlock: Boolean = false,
+    var topK: Int = 50,
+    var topP: Float = 1.0f,
+    var xtcP: Float = 0.0f,
+    var xtcT: Float = 1.0f,
     /**
      * The maximum number of tokens that can be used as context to the model
     * This is editable by users in the EditChatSettingsScreen.kt.
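
The new fields are plain columns on the Chat entity, so nothing in the schema constrains them to the ranges the settings sliders expose. A small sketch of clamping them before persisting; sanitizeSamplingParams is a hypothetical helper, not part of this commit, and relies only on Chat being a data class:

    // Hypothetical helper: clamp sampling parameters to the ranges exposed by
    // the sliders in EditChatSettingsScreen.kt before persisting the Chat.
    fun sanitizeSamplingParams(chat: Chat): Chat =
        chat.copy(
            topK = chat.topK.coerceIn(0, 128),      // slider range 0..128
            topP = chat.topP.coerceIn(0.0f, 1.0f),  // 1.0 leaves top-p disabled
            xtcP = chat.xtcP.coerceIn(0.0f, 1.0f),  // 0.0 leaves XTC disabled
            xtcT = chat.xtcT.coerceIn(0.0f, 1.0f),
        )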

app/src/main/java/io/shubham0204/smollmandroid/ui/screens/chat/ChatScreenViewModel.kt

Lines changed: 4 additions & 0 deletions

@@ -329,6 +329,10 @@ class ChatScreenViewModel(
                 chat.nThreads,
                 chat.useMmap,
                 chat.useMlock,
+                chat.topP,
+                chat.topK,
+                chat.xtcP,
+                chat.xtcT,
             ),
         onError = { e ->
             _modelLoadState.value = ModelLoadingState.FAILURE

app/src/main/java/io/shubham0204/smollmandroid/ui/screens/chat/EditChatSettingsScreen.kt

Lines changed: 90 additions & 0 deletions

@@ -78,6 +78,10 @@ fun EditChatSettingsScreen(
     var chatTemplate by remember { mutableStateOf(chat.chatTemplate) }
     var useMmap by remember { mutableStateOf(chat.useMmap) }
     var useMlock by remember { mutableStateOf(chat.useMlock) }
+    var topP by remember { mutableStateOf(chat.topP) }
+    var topK by remember { mutableStateOf(chat.topK) }
+    var xtcP by remember { mutableStateOf(chat.xtcP) }
+    var xtcT by remember { mutableStateOf(chat.xtcT) }
     val context = LocalContext.current
     val llmModel = viewModel.modelsRepository.getModelFromId(chat.llmModelId)

@@ -110,6 +114,8 @@ fun EditChatSettingsScreen(
                     nThreads = nThreads,
                     useMmap = useMmap,
                     useMlock = useMlock,
+                    topP = topP,
+                    topK = topK,
                 )
             if (chat != updatedChat) {
                 viewModel.updateChat(updatedChat)

@@ -234,6 +240,90 @@ fun EditChatSettingsScreen(
 
             Spacer(modifier = Modifier.height(24.dp))
 
+            Text(
+                stringResource(R.string.chat_settings_label_topP),
+                style = MaterialTheme.typography.titleMedium,
+            )
+            Text(
+                stringResource(R.string.chat_settings_desc_topP),
+                style = MaterialTheme.typography.labelSmall,
+            )
+            Slider(
+                value = topP,
+                onValueChange = { topP = it },
+                valueRange = 0.0f..1.0f,
+                steps = 100,
+            )
+            Text(
+                text = "%.1f".format(topP),
+                style = MaterialTheme.typography.labelSmall,
+            )
+
+            Spacer(modifier = Modifier.height(24.dp))
+
+            Text(
+                stringResource(R.string.chat_settings_label_topK),
+                style = MaterialTheme.typography.titleMedium,
+            )
+            Text(
+                stringResource(R.string.chat_settings_desc_topK),
+                style = MaterialTheme.typography.labelSmall,
+            )
+            Slider(
+                value = topK.toFloat(),
+                onValueChange = { topK = it.toInt() },
+                valueRange = 0.0f..128.0f,
+                steps = 128,
+            )
+            Text(
+                text = topK.toString(),
+                style = MaterialTheme.typography.labelSmall,
+            )
+
+            Spacer(modifier = Modifier.height(24.dp))
+
+            Text(
+                stringResource(R.string.chat_settings_label_xtcT),
+                style = MaterialTheme.typography.titleMedium,
+            )
+            Text(
+                stringResource(R.string.chat_settings_desc_xtcT),
+                style = MaterialTheme.typography.labelSmall,
+            )
+            Slider(
+                value = xtcT,
+                onValueChange = { xtcT = it },
+                valueRange = 0.0f..1.0f,
+                steps = 100,
+            )
+            Text(
+                text = "%.1f".format(xtcT),
+                style = MaterialTheme.typography.labelSmall,
+            )
+
+            Spacer(modifier = Modifier.height(24.dp))
+
+            Text(
+                stringResource(R.string.chat_settings_label_xtcP),
+                style = MaterialTheme.typography.titleMedium,
+            )
+            Text(
+                stringResource(R.string.chat_settings_desc_xtcP),
+                style = MaterialTheme.typography.labelSmall,
+            )
+            Slider(
+                value = xtcP,
+                onValueChange = { xtcP = it },
+                valueRange = 0.0f..1.0f,
+                steps = 100,
+            )
+            Text(
+                text = "%.1f".format(xtcP),
+                style = MaterialTheme.typography.labelSmall,
+            )
+
+            Spacer(modifier = Modifier.height(24.dp))
+
             Text(
                 stringResource(R.string.chat_settings_label_ctx_size),
                style = MaterialTheme.typography.titleMedium,
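
One detail of the top-k slider worth noting: Compose's Slider works in Float, and its steps parameter counts only the stops between the endpoints (the slider snaps to steps + 2 values), so exact integer snapping over 0f..128f needs steps = 127. A minimal sketch of an integer-snapping variant, assuming the same Material3 Slider used above; IntSlider is a hypothetical helper, not part of this commit:

    import androidx.compose.material3.Slider
    import androidx.compose.runtime.Composable
    import kotlin.math.roundToInt

    // Hypothetical helper: a Slider that snaps to whole numbers. Over 0..max
    // there are (max - 1) interior stops, giving increments of exactly 1.
    @Composable
    fun IntSlider(value: Int, max: Int, onChange: (Int) -> Unit) {
        Slider(
            value = value.toFloat(),
            onValueChange = { onChange(it.roundToInt()) },
            valueRange = 0f..max.toFloat(),
            steps = max - 1,
        )
    }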

app/src/main/res/values-zh-rCN/strings.xml

Lines changed: 8 additions & 0 deletions

@@ -53,6 +53,14 @@
     <string name="chat_settings_label_sys_prompt">系统提示</string>
     <string name="chat_settings_label_chat_name">聊天名称</string>
     <string name="chat_settings_take_from_gguf">从 GGUF 模型获取</string>
+    <string name="chat_settings_desc_topP">Top-p(核)采样选择累积概率超过某个阈值的最小最可能词元集合,从而输出更具动态性和多样性的内容。</string>
+    <string name="chat_settings_desc_topK">Top-k 采样将模型的选择范围限制在 $k$ 个最可能的下一个词元内,从而确保输出内容更集中且更具可预测性。</string>
+    <string name="chat_settings_label_topP">Top P</string>
+    <string name="chat_settings_label_topK">Top K</string>
+    <string name="chat_settings_desc_xtcP">从采样中移除所有令牌,但留下概率最低的一个,移除概率为 xtcP</string>
+    <string name="chat_settings_desc_xtcT">如果多个令牌的预测概率都达到或超过阈值 xtcT…</string>
+    <string name="chat_settings_label_xtcP">XTC 概率</string>
+    <string name="chat_settings_label_xtcT">XTC 阈值</string>
     <string name="context_size_taken_from_model">上下文大小取自模型</string>
     <string name="chat_settings_title_num_tokens">令牌数量</string>
    <string name="chat_settings_err_min_ctx_size">上下文大小应至少为 200 个令牌</string>

app/src/main/res/values/strings.xml

Lines changed: 8 additions & 0 deletions

@@ -32,11 +32,19 @@
     <string name="chat_settings_desc_temp">Temperature is a parameter that controls the randomness and creativity of LLM outputs, with lower temperatures producing more deterministic and focused responses, and higher temperatures leading to more diverse and creative outputs.</string>
     <string name="chat_settings_desc_ctx_length">The context length of a large language model (LLM) refers to the maximum number of tokens (words or subwords) it can process in a single input or output sequence. Larger context sizes need more memory.</string>
     <string name="chat_settings_desc_n_threads">The number of CPU threads to use for inference.</string>
+    <string name="chat_settings_desc_topP">Top-p sampling selects the smallest set of most probable tokens whose cumulative probability exceeds a threshold, allowing for a more dynamic and diverse output.</string>
+    <string name="chat_settings_desc_topK">Top-k sampling limits the model\'s choices to the k most likely next tokens, ensuring a more focused and predictable output.</string>
+    <string name="chat_settings_desc_xtcP">...remove all except the least probable one from sampling, with probability xtcP</string>
+    <string name="chat_settings_desc_xtcT">If there are multiple tokens with predicted probability at least the threshold xtcT...</string>
     <string name="chat_settings_label_ctx_size">Context Size</string>
     <string name="chat_settings_label_temp">Temperature</string>
     <string name="chat_settings_label_minp">min-p</string>
     <string name="chat_settings_label_sys_prompt">System Prompt</string>
     <string name="chat_settings_label_chat_name">Chat Name</string>
+    <string name="chat_settings_label_topP">top P</string>
+    <string name="chat_settings_label_topK">top K</string>
+    <string name="chat_settings_label_xtcP">XTC Probability</string>
+    <string name="chat_settings_label_xtcT">XTC Threshold</string>
     <string name="chat_settings_take_from_gguf">Take from GGUF Model</string>
     <string name="context_size_taken_from_model">Context size taken from model</string>
    <string name="chat_settings_title_num_tokens">No. of tokens</string>

smollm/build.gradle.kts

Lines changed: 2 additions & 2 deletions

@@ -34,11 +34,11 @@ android {
                 // allow compiling 16 KB page-aligned shared libraries
                 // https://developer.android.com/guide/practices/page-sizes#compile-r27
                 arguments += listOf("-DANDROID_SUPPORT_FLEXIBLE_PAGE_SIZES=ON")
-                arguments += "-DCMAKE_BUILD_TYPE=Release"
+                // arguments += "-DCMAKE_BUILD_TYPE=Release"
 
                 // (debugging) uncomment the following line to enable debug builds
                 // and attach hardware-assisted address sanitizer
-                // arguments += "-DCMAKE_BUILD_TYPE=Debug"
+                arguments += "-DCMAKE_BUILD_TYPE=Debug"
                 // arguments += listOf("-DANDROID_SANITIZE=hwaddress")
            }
        }
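
CMAKE_BUILD_TYPE is set globally in defaultConfig here, so switching between Release and Debug means editing the file by hand. A hedged alternative sketch, using the standard per-buildType externalNativeBuild options of the Android Gradle Kotlin DSL; this is not part of the commit:

    // Sketch: pass a CMake build type per Gradle buildType instead of hardcoding
    // one in defaultConfig, so debug native builds cannot leak into release APKs.
    android {
        buildTypes {
            release {
                externalNativeBuild { cmake { arguments += "-DCMAKE_BUILD_TYPE=Release" } }
            }
            debug {
                externalNativeBuild { cmake { arguments += "-DCMAKE_BUILD_TYPE=Debug" } }
            }
        }
    }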

smollm/src/main/cpp/LLMInference.cpp

Lines changed: 27 additions & 5 deletions

@@ -8,19 +8,24 @@
 #define LOGe(...) __android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__)
 
 void
-LLMInference::loadModel(const char *model_path, float minP, float temperature, bool storeChats, long contextSize,
-                        const char *chatTemplate, int nThreads, bool useMmap, bool useMlock) {
+LLMInference::loadModel(const char* model_path, float minP, float temperature, bool storeChats, long contextSize,
+                        const char* chatTemplate, int nThreads, bool useMmap, bool useMlock, float topP, int topK,
+                        float xtcP, float xtcT) {
     LOGi("loading model with"
          "\n\tmodel_path = %s"
          "\n\tminP = %f"
          "\n\ttemperature = %f"
          "\n\tstoreChats = %d"
-         "\n\tcontextSize = %li"
+         "\n\tcontextSize = %d"
          "\n\tchatTemplate = %s"
          "\n\tnThreads = %d"
          "\n\tuseMmap = %d"
-         "\n\tuseMlock = %d",
-         model_path, minP, temperature, storeChats, contextSize, chatTemplate, nThreads, useMmap, useMlock);
+         "\n\tuseMlock = %d"
+         "\n\ttopP = %f"
+         "\n\ttopK = %i"
+         "\n\txtcP = %f"
+         "\n\txtcT = %f",
+         model_path, minP, temperature, (int)storeChats, (int)contextSize, chatTemplate, nThreads, useMmap, useMlock, topP, topK, xtcP, xtcT);
 
     // load dynamic backends
     ggml_backend_load_all();

@@ -53,6 +58,23 @@ LLMInference::loadModel(const char *model_path, float minP, float temperature, b
     _sampler = llama_sampler_chain_init(sampler_params);
     llama_sampler_chain_add(_sampler, llama_sampler_init_temp(temperature));
     llama_sampler_chain_add(_sampler, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+    if (minP >= 0.01f) {
+        // minP = 0.0 (disabled)
+        // minP can be adjusted across 100 steps between [0.0,1.0], the smallest step being 0.01
+        llama_sampler_chain_add(_sampler, llama_sampler_init_min_p(minP, 1));
+    }
+    if (topK > 0) {
+        LOGi("Enabled top-k sampling with k=%d", topK);
+        llama_sampler_chain_add(_sampler, llama_sampler_init_top_k(topK));
+    }
+    if (topP <= 0.99) {
+        LOGi("Enabled top-p sampling with p=%f", topP);
+        llama_sampler_chain_add(_sampler, llama_sampler_init_top_p(topP, 1));
+    }
+    if (xtcT <= 0.99 || xtcP >= 0.01) {
+        LOGi("Enabled XTC sampling with p=%f, t=%f", xtcP, xtcT);
+        llama_sampler_chain_add(_sampler, llama_sampler_init_xtc(xtcP, xtcT, 1, LLAMA_DEFAULT_SEED));
+    }
 
     _formattedMessages = std::vector<char>(llama_n_ctx(_ctx));
    _messages.clear();
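
The guards above define the effective on/off contract for each sampler: min-p below 0.01 is off, a topK of 0 is off, a topP at or near 1.0 is off, and XTC stays off until the threshold drops to 0.99 or below or the probability reaches at least 0.01. Against the Chat entity defaults this leaves top-p and XTC disabled while top-k starts enabled at 50. A Kotlin restatement of the same conditions; enabledSamplers is a hypothetical helper, useful e.g. for surfacing the active chain in the UI:

    // Illustrative restatement of the native guards in LLMInference.cpp;
    // not part of this commit.
    fun enabledSamplers(minP: Float, topK: Int, topP: Float, xtcP: Float, xtcT: Float): List<String> =
        buildList {
            if (minP >= 0.01f) add("min-p($minP)")
            if (topK > 0) add("top-k($topK)")
            if (topP <= 0.99f) add("top-p($topP)")
            if (xtcT <= 0.99f || xtcP >= 0.01f) add("xtc(p=$xtcP, t=$xtcT)")
        }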

smollm/src/main/cpp/LLMInference.h

Lines changed: 2 additions & 1 deletion

@@ -40,7 +40,8 @@ class LLMInference {
 
   public:
     void loadModel(const char* modelPath, float minP, float temperature, bool storeChats, long contextSize,
-                   const char* chatTemplate, int nThreads, bool useMmap, bool useMlock);
+                   const char* chatTemplate, int nThreads, bool useMmap, bool useMlock, float topP, int topK,
+                   float xtcP, float xtcT);
 
     void addChatMessage(const char* message, const char* role);
 
smollm/src/main/cpp/smollm.cpp

Lines changed: 3 additions & 2 deletions

@@ -4,15 +4,16 @@
 extern "C" JNIEXPORT jlong JNICALL
 Java_io_shubham0204_smollm_SmolLM_loadModel(JNIEnv* env, jobject thiz, jstring modelPath, jfloat minP,
                                             jfloat temperature, jboolean storeChats, jlong contextSize,
-                                            jstring chatTemplate, jint nThreads, jboolean useMmap, jboolean useMlock) {
+                                            jstring chatTemplate, jint nThreads, jboolean useMmap, jboolean useMlock,
+                                            jfloat topP, jint topK, jfloat xtcP, jfloat xtcT) {
     jboolean isCopy = true;
     const char* modelPathCstr = env->GetStringUTFChars(modelPath, &isCopy);
     auto* llmInference = new LLMInference();
     const char* chatTemplateCstr = env->GetStringUTFChars(chatTemplate, &isCopy);
 
     try {
         llmInference->loadModel(modelPathCstr, minP, temperature, storeChats, contextSize, chatTemplateCstr, nThreads,
-                                useMmap, useMlock);
+                                useMmap, useMlock, topP, topK, xtcP, xtcT);
     } catch (std::runtime_error& error) {
         env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), error.what());
    }
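
The JNI symbol Java_io_shubham0204_smollm_SmolLM_loadModel pins down the Kotlin-side declaration it must match: class io.shubham0204.smollm.SmolLM, method loadModel, returning a Long handle to the native LLMInference object. A sketch of that declaration follows; the actual SmolLM class is not shown in this diff, and the library name passed to loadLibrary is an assumption:

    package io.shubham0204.smollm

    class SmolLM {
        companion object {
            init {
                System.loadLibrary("smollm") // assumed native library name
            }
        }

        // Must match Java_io_shubham0204_smollm_SmolLM_loadModel in smollm.cpp;
        // the returned Long is the pointer to the native LLMInference instance.
        private external fun loadModel(
            modelPath: String,
            minP: Float,
            temperature: Float,
            storeChats: Boolean,
            contextSize: Long,
            chatTemplate: String,
            nThreads: Int,
            useMmap: Boolean,
            useMlock: Boolean,
            topP: Float,
            topK: Int,
            xtcP: Float,
            xtcT: Float,
        ): Long
    }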
