3,864 changes: 3,383 additions & 481 deletions sdk-schema/lms-with-inferred-unions.json

Large diffs are not rendered by default.

3,172 changes: 2,803 additions & 369 deletions sdk-schema/lms.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion sdk-schema/lmstudio-js
Submodule lmstudio-js updated 121 files
7 changes: 6 additions & 1 deletion sdk-schema/sync-sdk-schema.py
@@ -22,6 +22,7 @@
import ast
import builtins
import json
import re
import shutil
import subprocess
import sys
@@ -410,11 +411,15 @@ def _generate_data_model_from_json_schema() -> None:
raise RuntimeError(f"Failed to create {_MODEL_PATH!r}")
# Generated source code post-processing:
#
# * Fix up miscellaneous issues the code generator currently mishandles
# * Fix up typed dicts to be defined in terms of nested dicts
# * Add an `__all__` definition for wildcard imports (which also
#   serves as a top level summary of the defined schemas)
print("Post-processing generated source code...")
model_source = _MODEL_PATH.read_text()
# Replace unsupported regex character classes with `.`
# https://github.com/python/cpython/issues/95555
# https://github.com/jcrist/msgspec/issues/860
model_source = re.sub(r"\\\\p\{[^}]*\}", ".", _MODEL_PATH.read_text())
model_ast = ast.parse(model_source)
dict_token_replacements: dict[str, str] = {}
exported_names: list[str] = []
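For `sdk-schema/sync-sdk-schema.py` above: the new `re.sub` post-processing step works around `\p{...}` Unicode property classes that Python's `re` module rejects (per the linked CPython and msgspec issues). A minimal standalone sketch of the same substitution, using a hypothetical fragment of generated source rather than the real `_MODEL_PATH` contents:

```python
import re

# Hypothetical line of generated model source. In the file text, the string
# literal spells the pattern with an escaped backslash, i.e. `\\p{L}`.
generated_source = r'user_name: Annotated[str, Meta(pattern="^\\p{L}+$")]'

# Same substitution as the sync script: rewrite each unsupported `\p{...}`
# character class as `.` so the pattern remains compilable by `re`.
cleaned_source = re.sub(r"\\\\p\{[^}]*\}", ".", generated_source)

print(cleaned_source)  # user_name: Annotated[str, Meta(pattern="^.+$")]
```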
5 changes: 4 additions & 1 deletion src/lmstudio/_kv_config.py
@@ -120,7 +120,9 @@ def to_kv_field(
def update_client_config(
self, client_config: MutableDictObject, server_value: DictObject
) -> None:
client_container: MutableDictObject = client_config.setdefault(self.client_key, {})
client_container: MutableDictObject = client_config.setdefault(
self.client_key, {}
)
self.server_to_client(server_value, client_container)


@@ -216,6 +218,7 @@ def _gpu_split_config_to_gpu_settings(
**_COMMON_MODEL_LOAD_KEYS,
"numExperts": ConfigField("numExperts"),
"seed": CheckboxField("seed"),
"offloadKVCacheToGpu": ConfigField("offloadKVCacheToGpu"),
"llama": {
**_COMMON_LLAMA_LOAD_KEYS,
"evalBatchSize": ConfigField("evalBatchSize"),
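For context on the reformatted `update_client_config` in `src/lmstudio/_kv_config.py` above: `dict.setdefault` creates the nested client-side container on first use and reuses it afterwards, so several server fields can land under one client key. A rough sketch of that pattern with plain dicts (the `NestedField` class here is an illustration, not the SDK's `ConfigField`):

```python
from typing import Any

MutableDictObject = dict[str, Any]
DictObject = dict[str, Any]


class NestedField:
    """Toy stand-in for a field that maps a server value into a nested client dict."""

    def __init__(self, client_key: str, value_key: str) -> None:
        self.client_key = client_key
        self.value_key = value_key

    def update_client_config(
        self, client_config: MutableDictObject, server_value: DictObject
    ) -> None:
        # setdefault returns the existing nested dict, or inserts and returns {}
        client_container: MutableDictObject = client_config.setdefault(
            self.client_key, {}
        )
        client_container[self.value_key] = server_value["value"]


config: MutableDictObject = {}
NestedField("llama", "evalBatchSize").update_client_config(config, {"value": 512})
NestedField("llama", "tryMmap").update_client_config(config, {"value": False})
print(config)  # {'llama': {'evalBatchSize': 512, 'tryMmap': False}}
```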
5,222 changes: 4,118 additions & 1,104 deletions src/lmstudio/_sdk_models/__init__.py

Large diffs are not rendered by default.

14 changes: 8 additions & 6 deletions src/lmstudio/json_api.py
@@ -700,6 +700,8 @@ def result(self) -> T:
return self._result

def raise_unknown_message_error(self, unknown_message: Any) -> NoReturn:
# TODO: improve forward compatibility by switching this to use warnings.warn
# instead of failing immediately for all unknown messages
raise LMStudioUnknownMessageError(
f"{self._NOTICE_PREFIX} unexpected message contents: {unknown_message!r}"
)
@@ -1234,20 +1236,20 @@ def iter_message_events(
# Ignore status updates after cancellation (avoids race condition)
return
yield from self._update_prompt_processing_progress(progress)
case {
"type": "toolCallGenerationStart",
}:
case {"type": "toolCallGenerationStart"}:
self._logger.debug("Notified of pending tool call request generation.")
case {"type": "toolCallGenerationNameReceived"}:
pass # UI event, currently ignored by Python SDK
case {"type": "toolCallGenerationArgumentFragmentGenerated"}:
pass # UI event, currently ignored by Python SDK
case {
"type": "toolCallGenerationEnd",
"toolCallRequest": tool_call_request,
}:
yield PredictionToolCallEvent(
ToolCallRequest._from_api_dict(tool_call_request)
)
case {
"type": "toolCallGenerationFailed",
}:
case {"type": "toolCallGenerationFailed"}:
self._logger.warn("Tool call processing generation failed.")
yield PredictionToolCallAbortedEvent(None)
case {"type": "error", "error": {} as error}:
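The new TODO in `raise_unknown_message_error` in `src/lmstudio/json_api.py` points at replacing the hard failure with a warning so older clients tolerate newer message types. One possible shape for that change, sketched outside the real class (the warning category and helper name are assumptions, not SDK API):

```python
import warnings
from typing import Any


class LMStudioUnknownMessageWarning(UserWarning):
    """Hypothetical warning category for unrecognised server messages."""


def warn_unknown_message(notice_prefix: str, unknown_message: Any) -> None:
    # Emit a warning instead of raising LMStudioUnknownMessageError, so a
    # newer server can add message types without breaking existing clients.
    warnings.warn(
        f"{notice_prefix} unexpected message contents: {unknown_message!r}",
        LMStudioUnknownMessageWarning,
        stacklevel=2,
    )
```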
3 changes: 2 additions & 1 deletion src/lmstudio/schemas.py
@@ -94,8 +94,9 @@ def model_json_schema(cls) -> DictSchema:


_CAMEL_CASE_OVERRIDES = {
# This is the one key in the API that capitalizes the `V` in `KV`
# `_kv_` in snake_case becomes KV in camelCase
"useFp16ForKvCache": "useFp16ForKVCache",
"offloadKvCacheToGpu": "offloadKVCacheToGpu",
}

_SKIP_FIELD_RECURSION = set(
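The extra `_CAMEL_CASE_OVERRIDES` entry in `src/lmstudio/schemas.py` above is needed because a mechanical snake_case to camelCase conversion yields `Kv` where the server API expects `KV`. A small sketch of how such an override map can be applied (the `_to_camel_case` helper here is illustrative, not the SDK's actual implementation):

```python
_CAMEL_CASE_OVERRIDES = {
    # `_kv_` in snake_case becomes KV in camelCase
    "useFp16ForKvCache": "useFp16ForKVCache",
    "offloadKvCacheToGpu": "offloadKVCacheToGpu",
}


def _to_camel_case(snake_name: str) -> str:
    # Mechanical conversion: offload_kv_cache_to_gpu -> offloadKvCacheToGpu
    first, *rest = snake_name.split("_")
    candidate = first + "".join(word.title() for word in rest)
    # Apply the overrides for keys where the API capitalises differently.
    return _CAMEL_CASE_OVERRIDES.get(candidate, candidate)


print(_to_camel_case("offload_kv_cache_to_gpu"))  # offloadKVCacheToGpu
print(_to_camel_case("rope_frequency_base"))      # ropeFrequencyBase
```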
2 changes: 1 addition & 1 deletion tests/test_inference.py
@@ -287,5 +287,5 @@ def _handle_invalid_request(
assert isinstance(tool_failure_exc.__cause__, ZeroDivisionError)
# If the content checks prove too flaky in practice, they can be dropped
completed_response = predictions[-1].content.lower()
assert "divid" in completed_response # Accepts both "divide" and "dividing"
assert "divid" in completed_response # Accepts both "divide" and "dividing"
assert "zero" in completed_response
10 changes: 8 additions & 2 deletions tests/test_kv_config.py
@@ -76,6 +76,7 @@
"llamaKCacheQuantizationType": "q8_0",
"llamaVCacheQuantizationType": "f32",
"numExperts": 0,
"offloadKVCacheToGpu": False,
"ropeFrequencyBase": 10.0,
"ropeFrequencyScale": 1.5,
"seed": 313,
@@ -93,6 +94,7 @@
"llama_k_cache_quantization_type": "q8_0",
"llama_v_cache_quantization_type": "f32",
"num_experts": 0,
"offload_kv_cache_to_gpu": False,
"rope_frequency_base": 10.0,
"rope_frequency_scale": 1.5,
"seed": 313,
@@ -221,6 +223,9 @@ class LlmPredictionConfigStrict(LlmPredictionConfig, forbid_unknown_fields=True)
LlmPredictionConfigStrict,
)

# The "raw" debugging field is a special case, with TBD handling
_NOT_YET_MAPPED = {"raw"}


@pytest.mark.parametrize("config_dict,config_type", zip(CONFIG_DICTS, CONFIG_TYPES))
def test_struct_field_coverage(
@@ -232,7 +237,7 @@ def test_struct_field_coverage(
missing_keys = expected_keys - mapped_keys
assert not missing_keys
# Ensure no extra keys are mistakenly defined
unknown_keys = mapped_keys - expected_keys
unknown_keys = mapped_keys - expected_keys - _NOT_YET_MAPPED
assert not unknown_keys
# Ensure the config can be loaded
config_struct = config_type._from_api_dict(config_dict)
@@ -260,7 +265,7 @@ def test_kv_stack_field_coverage(
# Ensure all expected keys are covered (even those with default values)
mapped_keys = keymap.keys()
expected_keys = set(config_type.__struct_encode_fields__)
missing_keys = expected_keys - mapped_keys
missing_keys = expected_keys - mapped_keys - _NOT_YET_MAPPED
assert not missing_keys
# Ensure no extra keys are mistakenly defined
unknown_keys = mapped_keys - expected_keys
@@ -342,6 +347,7 @@ def test_kv_stack_field_coverage(
{"key": "llm.load.llama.tryMmap", "value": False},
{"key": "llm.load.llama.useFp16ForKVCache", "value": True},
{"key": "llm.load.numExperts", "value": 0},
{"key": "llm.load.offloadKVCacheToGpu", "value": False},
{"key": "llm.load.seed", "value": {"checked": True, "value": 313}},
{"key": "load.gpuStrictVramCap", "value": False},
]
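The coverage tests in `tests/test_kv_config.py` above rely on set differences between a config type's declared fields and the key map, with `_NOT_YET_MAPPED` carving out the deferred `raw` field. A condensed illustration of that check with made-up key sets (the real ones come from `__struct_encode_fields__` and the keymap):

```python
# Made-up key sets standing in for the real struct fields and keymap keys.
_NOT_YET_MAPPED = {"raw"}
expected_keys = {"seed", "numExperts", "offloadKVCacheToGpu", "raw"}
mapped_keys = {"seed", "numExperts", "offloadKVCacheToGpu"}

# Every declared field must be mapped, except those deliberately deferred.
missing_keys = expected_keys - mapped_keys - _NOT_YET_MAPPED
assert not missing_keys

# No mapped key may be missing from the declared fields.
unknown_keys = mapped_keys - expected_keys
assert not unknown_keys
```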