diff --git a/sdk-schema/lms-with-inferred-unions.json b/sdk-schema/lms-with-inferred-unions.json index 5497948..5d774b6 100644 --- a/sdk-schema/lms-with-inferred-unions.json +++ b/sdk-schema/lms-with-inferred-unions.json @@ -67,12 +67,16 @@ }, { "$ref": "#/definitions/presetManifest" + }, + { + "$ref": "#/definitions/modelManifest" } ], "discriminator": { "mapping": { "plugin": "#/definitions/pluginManifest", - "preset": "#/definitions/presetManifest" + "preset": "#/definitions/presetManifest", + "model": "#/definitions/modelManifest" }, "propertyName": "type" } @@ -84,10 +88,14 @@ "$ref": "#/definitions/kebabCase" }, "name": { - "$ref": "#/definitions/kebabCase" + "type": "string", + "pattern": "^[a-z0-9]+(?:-[a-z0-9]+)*$", + "minLength": 1, + "maxLength": 100 }, "description": { - "type": "string" + "type": "string", + "maxLength": 1000 }, "revision": { "type": "integer" @@ -425,7 +433,7 @@ "embeddingLoadModelConfig": { "type": "object", "properties": { - "gpuOffload": { + "gpu": { "$ref": "#/definitions/gpuSetting" }, "contextLength": { @@ -592,6 +600,9 @@ }, { "$ref": "#/definitions/errorDisplayDataGenericEngineDoesNotSupportFeature" + }, + { + "$ref": "#/definitions/errorDisplayDataGenericPresetNotFound" } ], "discriminator": { @@ -601,7 +612,8 @@ "generic.pathNotFound": "#/definitions/errorDisplayDataGenericPathNotFound", "generic.identifierNotFound": "#/definitions/errorDisplayDataGenericIdentifierNotFound", "generic.domainMismatch": "#/definitions/errorDisplayDataGenericDomainMismatch", - "generic.engineDoesNotSupportFeature": "#/definitions/errorDisplayDataGenericEngineDoesNotSupportFeature" + "generic.engineDoesNotSupportFeature": "#/definitions/errorDisplayDataGenericEngineDoesNotSupportFeature", + "generic.presetNotFound": "#/definitions/errorDisplayDataGenericPresetNotFound" }, "propertyName": "code" } @@ -656,6 +668,50 @@ "unknown" ] }, + "gpuSplitConfig": { + "type": "object", + "properties": { + "strategy": { + "$ref": "#/definitions/gpuSplitStrategy" + }, + "disabledGpus": { + "type": "array", + "items": { + "type": "integer", + "minimum": 0 + } + }, + "priority": { + "type": "array", + "items": { + "type": "integer", + "minimum": 0 + } + }, + "customRatio": { + "type": "array", + "items": { + "type": "number", + "minimum": 0 + } + } + }, + "required": [ + "strategy", + "disabledGpus", + "priority", + "customRatio" + ], + "additionalProperties": false + }, + "gpuSplitStrategy": { + "type": "string", + "enum": [ + "evenly", + "priorityOrder", + "custom" + ] + }, "kvConfigFieldDependency": { "type": "object", "properties": { @@ -688,10 +744,12 @@ "kvConfigLayerName": { "type": "string", "enum": [ + "currentlyEditing", "currentlyLoaded", "apiOverride", "conversationSpecific", "conversationGlobal", + "preset", "serverSession", "httpServerRequestOverride", "completeModeFormatting", @@ -852,9 +910,12 @@ "llmLoadModelConfig": { "type": "object", "properties": { - "gpuOffload": { + "gpu": { "$ref": "#/definitions/gpuSetting" }, + "gpuStrictVramCap": { + "type": "boolean" + }, "contextLength": { "type": "integer", "minimum": 1 @@ -1697,6 +1758,9 @@ }, "jsonSchema": { "$ref": "#/definitions/functionToolCallRequest/properties/arguments/additionalProperties" + }, + "gbnfGrammar": { + "type": "string" } }, "required": [ @@ -1708,7 +1772,8 @@ "type": "string", "enum": [ "none", - "json" + "json", + "gbnf" ] }, "llmToolArray": { @@ -2449,6 +2514,41 @@ ], "additionalProperties": false }, + "modelManifest": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": 
"model", + "default": "model", + "title": "Type" + }, + "virtual": { + "type": "boolean", + "const": true + }, + "owner": { + "$ref": "#/definitions/kebabCase" + }, + "name": { + "$ref": "#/definitions/artifactManifestBase/properties/name" + }, + "description": { + "$ref": "#/definitions/artifactManifestBase/properties/description" + }, + "revision": { + "$ref": "#/definitions/artifactManifestBase/properties/revision" + } + }, + "required": [ + "type", + "virtual", + "owner", + "name", + "description" + ], + "additionalProperties": false + }, "modelQuery": { "type": "object", "properties": { @@ -2500,7 +2600,7 @@ "$ref": "#/definitions/kebabCase" }, "name": { - "$ref": "#/definitions/kebabCase" + "$ref": "#/definitions/artifactManifestBase/properties/name" }, "description": { "$ref": "#/definitions/artifactManifestBase/properties/description" @@ -2537,7 +2637,7 @@ "$ref": "#/definitions/kebabCase" }, "name": { - "$ref": "#/definitions/kebabCase" + "$ref": "#/definitions/artifactManifestBase/properties/name" }, "description": { "$ref": "#/definitions/artifactManifestBase/properties/description" @@ -4149,6 +4249,9 @@ "predictionConfigStack": { "$ref": "#/definitions/kvConfigStack" }, + "fuzzyPresetIdentifier": { + "type": "string" + }, "ignoreServerSessionConfig": { "type": "boolean" } @@ -5585,6 +5688,49 @@ ], "additionalProperties": false }, + "errorDisplayDataGenericPresetNotFound": { + "type": "object", + "properties": { + "code": { + "type": "string", + "const": "generic.presetNotFound", + "default": "generic.presetNotFound", + "title": "Code" + }, + "specifiedFuzzyPresetIdentifier": { + "type": "string" + }, + "availablePresetsSample": { + "type": "array", + "items": { + "type": "object", + "properties": { + "identifier": { + "type": "string" + }, + "name": { + "type": "string" + } + }, + "required": [ + "identifier", + "name" + ], + "additionalProperties": false + } + }, + "totalAvailablePresets": { + "type": "integer" + } + }, + "required": [ + "code", + "specifiedFuzzyPresetIdentifier", + "availablePresetsSample", + "totalAvailablePresets" + ], + "additionalProperties": false + }, "parsedFileIdentifierLocal": { "type": "object", "properties": { diff --git a/sdk-schema/lms.json b/sdk-schema/lms.json index 04a249e..936f506 100644 --- a/sdk-schema/lms.json +++ b/sdk-schema/lms.json @@ -67,6 +67,9 @@ }, { "$ref": "#/definitions/presetManifest" + }, + { + "$ref": "#/definitions/modelManifest" } ] }, @@ -77,10 +80,14 @@ "$ref": "#/definitions/kebabCase" }, "name": { - "$ref": "#/definitions/kebabCase" + "type": "string", + "pattern": "^[a-z0-9]+(?:-[a-z0-9]+)*$", + "minLength": 1, + "maxLength": 100 }, "description": { - "type": "string" + "type": "string", + "maxLength": 1000 }, "revision": { "type": "integer" @@ -492,7 +499,7 @@ "embeddingLoadModelConfig": { "type": "object", "properties": { - "gpuOffload": { + "gpu": { "$ref": "#/definitions/gpuSetting" }, "contextLength": { @@ -795,6 +802,47 @@ "supportedVersion" ], "additionalProperties": false + }, + { + "type": "object", + "properties": { + "code": { + "type": "string", + "const": "generic.presetNotFound" + }, + "specifiedFuzzyPresetIdentifier": { + "type": "string" + }, + "availablePresetsSample": { + "type": "array", + "items": { + "type": "object", + "properties": { + "identifier": { + "type": "string" + }, + "name": { + "type": "string" + } + }, + "required": [ + "identifier", + "name" + ], + "additionalProperties": false + } + }, + "totalAvailablePresets": { + "type": "integer" + } + }, + "required": [ + "code", + 
"specifiedFuzzyPresetIdentifier", + "availablePresetsSample", + "totalAvailablePresets" + ], + "additionalProperties": false } ] }, @@ -869,6 +917,50 @@ "unknown" ] }, + "gpuSplitConfig": { + "type": "object", + "properties": { + "strategy": { + "$ref": "#/definitions/gpuSplitStrategy" + }, + "disabledGpus": { + "type": "array", + "items": { + "type": "integer", + "minimum": 0 + } + }, + "priority": { + "type": "array", + "items": { + "type": "integer", + "minimum": 0 + } + }, + "customRatio": { + "type": "array", + "items": { + "type": "number", + "minimum": 0 + } + } + }, + "required": [ + "strategy", + "disabledGpus", + "priority", + "customRatio" + ], + "additionalProperties": false + }, + "gpuSplitStrategy": { + "type": "string", + "enum": [ + "evenly", + "priorityOrder", + "custom" + ] + }, "kvConfigFieldDependency": { "type": "object", "properties": { @@ -930,10 +1022,12 @@ "kvConfigLayerName": { "type": "string", "enum": [ + "currentlyEditing", "currentlyLoaded", "apiOverride", "conversationSpecific", "conversationGlobal", + "preset", "serverSession", "httpServerRequestOverride", "completeModeFormatting", @@ -1217,9 +1311,12 @@ "llmLoadModelConfig": { "type": "object", "properties": { - "gpuOffload": { + "gpu": { "$ref": "#/definitions/gpuSetting" }, + "gpuStrictVramCap": { + "type": "boolean" + }, "contextLength": { "type": "integer", "minimum": 1 @@ -2115,6 +2212,9 @@ }, "jsonSchema": { "$ref": "#/definitions/functionToolCallRequest/properties/arguments/additionalProperties" + }, + "gbnfGrammar": { + "type": "string" } }, "required": [ @@ -2126,7 +2226,8 @@ "type": "string", "enum": [ "none", - "json" + "json", + "gbnf" ] }, "llmToolArray": { @@ -2866,6 +2967,39 @@ ], "additionalProperties": false }, + "modelManifest": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "model" + }, + "virtual": { + "type": "boolean", + "const": true + }, + "owner": { + "$ref": "#/definitions/kebabCase" + }, + "name": { + "$ref": "#/definitions/artifactManifestBase/properties/name" + }, + "description": { + "$ref": "#/definitions/artifactManifestBase/properties/description" + }, + "revision": { + "$ref": "#/definitions/artifactManifestBase/properties/revision" + } + }, + "required": [ + "type", + "virtual", + "owner", + "name", + "description" + ], + "additionalProperties": false + }, "modelQuery": { "type": "object", "properties": { @@ -2936,7 +3070,7 @@ "$ref": "#/definitions/kebabCase" }, "name": { - "$ref": "#/definitions/kebabCase" + "$ref": "#/definitions/artifactManifestBase/properties/name" }, "description": { "$ref": "#/definitions/artifactManifestBase/properties/description" @@ -2971,7 +3105,7 @@ "$ref": "#/definitions/kebabCase" }, "name": { - "$ref": "#/definitions/kebabCase" + "$ref": "#/definitions/artifactManifestBase/properties/name" }, "description": { "$ref": "#/definitions/artifactManifestBase/properties/description" @@ -4979,6 +5113,9 @@ "predictionConfigStack": { "$ref": "#/definitions/kvConfigStack" }, + "fuzzyPresetIdentifier": { + "type": "string" + }, "ignoreServerSessionConfig": { "type": "boolean" } diff --git a/sdk-schema/lmstudio-js b/sdk-schema/lmstudio-js index 7155368..2ae5ac6 160000 --- a/sdk-schema/lmstudio-js +++ b/sdk-schema/lmstudio-js @@ -1 +1 @@ -Subproject commit 71553680bd102eaeb86c0cbc1a42eb1908b6ddda +Subproject commit 2ae5ac65245f5b2a11c7b4a3fdb8065f19cc1d33 diff --git a/src/lmstudio/_kv_config.py b/src/lmstudio/_kv_config.py index 3c3ec8a..e1126f5 100644 --- a/src/lmstudio/_kv_config.py +++ b/src/lmstudio/_kv_config.py @@ 
-3,7 +3,7 @@ # Known KV config settings are defined in # https://github.com/lmstudio-ai/lmstudio-js/blob/main/packages/lms-kv-config/src/schema.ts from dataclasses import dataclass -from typing import Any, Iterable, Sequence, Type, TypeVar +from typing import Any, Container, Iterable, Sequence, Type, TypeVar from .sdk_api import LMStudioValueError from .schemas import DictSchema, DictObject, ModelSchema, MutableDictObject @@ -115,7 +115,7 @@ def update_client_config( "ropeFrequencyScale": CheckboxField("ropeFrequencyScale"), "tryMmap": ConfigField("tryMmap"), "acceleration": { - "offloadRatio": NestedKeyField("gpuOffload", "ratio"), + "offloadRatio": NestedKeyField("gpu", "ratio"), }, } @@ -126,8 +126,9 @@ def update_client_config( SUPPORTED_SERVER_KEYS: dict[str, DictObject] = { "load": { "gpuSplitConfig": MultiPartField( - "gpuOffload", ("mainGpu", "splitStrategy", "disabledGpus") + "gpu", ("mainGpu", "splitStrategy", "disabledGpus") ), + "gpuStrictVramCap": ConfigField("gpuStrictVramCap"), }, "embedding.load": { **_COMMON_MODEL_LOAD_KEYS, @@ -185,7 +186,9 @@ def update_client_config( # Define mappings to translate server KV configs to client config instances -def _iter_server_keys(*namespaces: str) -> Iterable[tuple[str, ConfigField]]: +def _iter_server_keys( + *namespaces: str, excluded: Container[str] = () +) -> Iterable[tuple[str, ConfigField]]: # Map dotted config field names to their client config field counterparts for namespace in namespaces: scopes: list[tuple[str, DictObject]] = [ @@ -193,6 +196,9 @@ def _iter_server_keys(*namespaces: str) -> Iterable[tuple[str, ConfigField]]: ] for prefix, scope in scopes: for k, v in scope.items(): + if k in excluded: + # 'load' config namespace currently includes some LLM-only config keys + continue prefixed_key = f"{prefix}.{k}" if prefix else k if isinstance(v, ConfigField): yield prefixed_key, v @@ -202,7 +208,9 @@ def _iter_server_keys(*namespaces: str) -> Iterable[tuple[str, ConfigField]]: FROM_SERVER_LOAD_LLM = dict(_iter_server_keys("load", "llm.load")) -FROM_SERVER_LOAD_EMBEDDING = dict(_iter_server_keys("load", "embedding.load")) +FROM_SERVER_LOAD_EMBEDDING = dict( + _iter_server_keys("load", "embedding.load", excluded="gpuStrictVramCap") +) FROM_SERVER_PREDICTION = dict(_iter_server_keys("llm.prediction")) FROM_SERVER_CONFIG = dict(_iter_server_keys(*SUPPORTED_SERVER_KEYS)) @@ -216,7 +224,7 @@ def _invert_config_keymap(from_server: FromServerKeymap) -> ToServerKeymap: to_server: ToServerKeymap = {} for server_key, config_field in sorted(from_server.items()): client_key = config_field.client_key - # There's at least one client field (gpuOffload) which maps to + # There's at least one client field (gpu) which maps to # multiple KV config fields, so don't expect a 1:1 mapping config_fields = to_server.setdefault(client_key, []) config_fields.append((server_key, config_field)) diff --git a/src/lmstudio/_sdk_models/__init__.py b/src/lmstudio/_sdk_models/__init__.py index 465c7af..f3d4a6e 100644 --- a/src/lmstudio/_sdk_models/__init__.py +++ b/src/lmstudio/_sdk_models/__init__.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: lms-with-inferred-unions.json -# timestamp: 2025-03-13T13:10:49+00:00 +# timestamp: 2025-03-20T15:15:46+00:00 from __future__ import annotations @@ -33,6 +33,8 @@ "AcceleratorDict", "ArtifactManifestBase", "ArtifactManifestBaseDict", + "AvailablePresetsSampleItem", + "AvailablePresetsSampleItemDict", "BackendNotification", "BackendNotificationDict", "BlockLocationAfterId", @@ -141,6 +143,8 @@ 
"ErrorDisplayDataGenericNoModelMatchingQueryDict", "ErrorDisplayDataGenericPathNotFound", "ErrorDisplayDataGenericPathNotFoundDict", + "ErrorDisplayDataGenericPresetNotFound", + "ErrorDisplayDataGenericPresetNotFoundDict", "ErrorDisplayDataGenericSpecificModelUnloaded", "ErrorDisplayDataGenericSpecificModelUnloadedDict", "FilesChannelRetrieveCreationParameter", @@ -181,6 +185,8 @@ "GetModelOptsDict", "GpuSetting", "GpuSettingDict", + "GpuSplitConfig", + "GpuSplitConfigDict", "InternalRetrievalResult", "InternalRetrievalResultDict", "InternalRetrievalResultEntry", @@ -333,6 +339,8 @@ "ModelInfoBaseDict", "ModelInstanceInfoBase", "ModelInstanceInfoBaseDict", + "ModelManifest", + "ModelManifestDict", "ModelQuery", "ModelQueryDict", "ModelSearchOpts", @@ -826,6 +834,18 @@ class SerializedLMSExtendedErrorDict(TypedDict): ] +DisabledGpu = Annotated[int, Meta(ge=0)] + + +PriorityItem = Annotated[int, Meta(ge=0)] + + +CustomRatioItem = Annotated[float, Meta(ge=0.0)] + + +GpuSplitStrategy = Literal["evenly", "priorityOrder", "custom"] + + ############################################################################### # KvConfigField ############################################################################### @@ -848,10 +868,12 @@ class KvConfigFieldDict(TypedDict): KvConfigLayerName = Literal[ + "currentlyEditing", "currentlyLoaded", "apiOverride", "conversationSpecific", "conversationGlobal", + "preset", "serverSession", "httpServerRequestOverride", "completeModeFormatting", @@ -1172,7 +1194,7 @@ class LlmManualPromptTemplateDict(TypedDict): LlmPromptTemplateType = Literal["manual", "jinja"] -LlmStructuredPredictionType = Literal["none", "json"] +LlmStructuredPredictionType = Literal["none", "json", "gbnf"] ############################################################################### @@ -2868,6 +2890,65 @@ class ErrorDisplayDataGenericEngineDoesNotSupportFeatureDict(TypedDict): supportedVersion: NotRequired[str | None] +############################################################################### +# AvailablePresetsSampleItem +############################################################################### + + +class AvailablePresetsSampleItem( + LMStudioStruct["AvailablePresetsSampleItemDict"], kw_only=True +): + identifier: str + name: str + + +class AvailablePresetsSampleItemDict(TypedDict): + """Corresponding typed dictionary definition for AvailablePresetsSampleItem. + + NOTE: Multi-word keys are defined using their camelCase form, + as that is what `to_dict()` emits, and what `_from_api_dict()` accepts. + """ + + identifier: str + name: str + + +############################################################################### +# ErrorDisplayDataGenericPresetNotFound +############################################################################### + + +class ErrorDisplayDataGenericPresetNotFound( + LMStudioStruct["ErrorDisplayDataGenericPresetNotFoundDict"], + kw_only=True, + tag_field="code", + tag="generic.presetNotFound", +): + code: ClassVar[Annotated[Literal["generic.presetNotFound"], Meta(title="Code")]] = ( + "generic.presetNotFound" + ) + specified_fuzzy_preset_identifier: str = field( + name="specifiedFuzzyPresetIdentifier" + ) + available_presets_sample: Sequence[AvailablePresetsSampleItem] = field( + name="availablePresetsSample" + ) + total_available_presets: int = field(name="totalAvailablePresets") + + +class ErrorDisplayDataGenericPresetNotFoundDict(TypedDict): + """Corresponding typed dictionary definition for ErrorDisplayDataGenericPresetNotFound. 
+ + NOTE: Multi-word keys are defined using their camelCase form, + as that is what `to_dict()` emits, and what `_from_api_dict()` accepts. + """ + + code: Literal["generic.presetNotFound"] + specifiedFuzzyPresetIdentifier: str + availablePresetsSample: Sequence[AvailablePresetsSampleItemDict] + totalAvailablePresets: int + + ############################################################################### # ParsedFileIdentifierLocal ############################################################################### @@ -4557,12 +4638,20 @@ class RepositoryChannelEnsureAuthenticatedToClientPacketAuthenticatedDict(TypedD type: Literal["authenticated"] -Description = str +Description = Annotated[str, Meta(max_length=1000)] + + +Name = Annotated[ + str, Meta(max_length=100, min_length=1, pattern="^[a-z0-9]+(?:-[a-z0-9]+)*$") +] Revision = int +DescriptionModel = str + + NoAutoDismiss = bool @@ -4663,8 +4752,10 @@ class RepositoryChannelEnsureAuthenticatedToClientPacketAuthenticatedDict(TypedD class ArtifactManifestBase(LMStudioStruct["ArtifactManifestBaseDict"], kw_only=True): owner: KebabCase - name: KebabCase - description: str + name: Annotated[ + str, Meta(max_length=100, min_length=1, pattern="^[a-z0-9]+(?:-[a-z0-9]+)*$") + ] + description: Annotated[str, Meta(max_length=1000)] revision: int | None = None @@ -4676,8 +4767,10 @@ class ArtifactManifestBaseDict(TypedDict): """ owner: str - name: str - description: str + name: Annotated[ + str, Meta(max_length=100, min_length=1, pattern="^[a-z0-9]+(?:-[a-z0-9]+)*$") + ] + description: Annotated[str, Meta(max_length=1000)] revision: NotRequired[int | None] @@ -4854,6 +4947,31 @@ class EmbeddingModelInstanceInfoDict(TypedDict): ParsedFileIdentifier = ParsedFileIdentifierLocal | ParsedFileIdentifierBase64 +############################################################################### +# GpuSplitConfig +############################################################################### + + +class GpuSplitConfig(LMStudioStruct["GpuSplitConfigDict"], kw_only=True): + strategy: GpuSplitStrategy + disabled_gpus: Sequence[DisabledGpu] = field(name="disabledGpus") + priority: Sequence[PriorityItem] + custom_ratio: Sequence[CustomRatioItem] = field(name="customRatio") + + +class GpuSplitConfigDict(TypedDict): + """Corresponding typed dictionary definition for GpuSplitConfig. + + NOTE: Multi-word keys are defined using their camelCase form, + as that is what `to_dict()` emits, and what `_from_api_dict()` accepts. + """ + + strategy: GpuSplitStrategy + disabledGpus: Sequence[int] + priority: Sequence[int] + customRatio: Sequence[float] + + ContentBlockStyle = ( ContentBlockStyleDefault | ContentBlockStyleCustomLabel | ContentBlockStyleThinking ) @@ -4893,7 +5011,8 @@ class GpuSettingDict(TypedDict): class LlmLoadModelConfig(LMStudioStruct["LlmLoadModelConfigDict"], kw_only=True): - gpu_offload: GpuSetting | None = field(name="gpuOffload", default=None) + gpu: GpuSetting | None = None + gpu_strict_vram_cap: bool | None = field(name="gpuStrictVramCap", default=None) context_length: Annotated[int, Meta(ge=1)] | None = field( name="contextLength", default=None ) @@ -4927,7 +5046,8 @@ class LlmLoadModelConfigDict(TypedDict): as that is what `to_dict()` emits, and what `_from_api_dict()` accepts. 
""" - gpuOffload: NotRequired[GpuSettingDict | None] + gpu: NotRequired[GpuSettingDict | None] + gpuStrictVramCap: NotRequired[bool | None] contextLength: NotRequired[Annotated[int, Meta(ge=1)] | None] ropeFrequencyBase: NotRequired[float | None] ropeFrequencyScale: NotRequired[float | None] @@ -5112,6 +5232,7 @@ class LlmStructuredPredictionSetting( ): type: LlmStructuredPredictionType json_schema: AdditionalProperties | None = field(name="jsonSchema", default=None) + gbnf_grammar: str | None = field(name="gbnfGrammar", default=None) class LlmStructuredPredictionSettingDict(TypedDict): @@ -5123,6 +5244,7 @@ class LlmStructuredPredictionSettingDict(TypedDict): type: LlmStructuredPredictionType jsonSchema: NotRequired[AdditionalProperties | None] + gbnfGrammar: NotRequired[str | None] BlockLocation = BlockLocationBeforeId | BlockLocationAfterId @@ -5287,6 +5409,37 @@ class ModelInstanceInfoBaseDict(TypedDict): architecture: NotRequired[str | None] +############################################################################### +# ModelManifest +############################################################################### + + +class ModelManifest( + LMStudioStruct["ModelManifestDict"], kw_only=True, tag_field="type", tag="model" +): + type: ClassVar[Annotated[Literal["model"], Meta(title="Type")]] = "model" + virtual: bool + owner: KebabCase + name: Name + description: Description + revision: Revision | None = None + + +class ModelManifestDict(TypedDict): + """Corresponding typed dictionary definition for ModelManifest. + + NOTE: Multi-word keys are defined using their camelCase form, + as that is what `to_dict()` emits, and what `_from_api_dict()` accepts. + """ + + type: Literal["model"] + virtual: bool + owner: str + name: str + description: str + revision: NotRequired[int | None] + + ############################################################################### # ModelQuery ############################################################################### @@ -5323,7 +5476,7 @@ class PluginManifest( type: ClassVar[Annotated[Literal["plugin"], Meta(title="Type")]] = "plugin" runner: PluginRunnerType owner: KebabCase - name: KebabCase + name: Name description: Description revision: Revision | None = None @@ -5353,7 +5506,7 @@ class PresetManifest( ): type: ClassVar[Annotated[Literal["preset"], Meta(title="Type")]] = "preset" owner: KebabCase - name: KebabCase + name: Name description: Description revision: Revision | None = None @@ -6048,7 +6201,7 @@ class SystemRpcNotifyParameter( LMStudioStruct["SystemRpcNotifyParameterDict"], kw_only=True ): title: Title - description: Description | None = None + description: DescriptionModel | None = None no_auto_dismiss: NoAutoDismiss | None = field(name="noAutoDismiss", default=None) @@ -6779,7 +6932,7 @@ class PluginsChannelSetGeneratorToClientPacketGenerateDict(TypedDict): token: str -ArtifactManifest = PluginManifest | PresetManifest +ArtifactManifest = PluginManifest | PresetManifest | ModelManifest ChatMessageData = ( @@ -6812,7 +6965,7 @@ class PluginsChannelSetGeneratorToClientPacketGenerateDict(TypedDict): class EmbeddingLoadModelConfig( LMStudioStruct["EmbeddingLoadModelConfigDict"], kw_only=True ): - gpu_offload: GpuSetting | None = field(name="gpuOffload", default=None) + gpu: GpuSetting | None = None context_length: Annotated[int, Meta(ge=1)] | None = field( name="contextLength", default=None ) @@ -6829,7 +6982,7 @@ class EmbeddingLoadModelConfigDict(TypedDict): as that is what `to_dict()` emits, and what `_from_api_dict()` accepts. 
""" - gpuOffload: NotRequired[GpuSettingDict | None] + gpu: NotRequired[GpuSettingDict | None] contextLength: NotRequired[Annotated[int, Meta(ge=1)] | None] ropeFrequencyBase: NotRequired[float | None] ropeFrequencyScale: NotRequired[float | None] @@ -6844,6 +6997,7 @@ class EmbeddingLoadModelConfigDict(TypedDict): | ErrorDisplayDataGenericIdentifierNotFound | ErrorDisplayDataGenericDomainMismatch | ErrorDisplayDataGenericEngineDoesNotSupportFeature + | ErrorDisplayDataGenericPresetNotFound ) @@ -8114,6 +8268,9 @@ class LlmChannelPredictCreationParameter( model_specifier: ModelSpecifier = field(name="modelSpecifier") history: ChatHistoryData prediction_config_stack: KvConfigStack = field(name="predictionConfigStack") + fuzzy_preset_identifier: str | None = field( + name="fuzzyPresetIdentifier", default=None + ) ignore_server_session_config: bool | None = field( name="ignoreServerSessionConfig", default=None ) @@ -8129,6 +8286,7 @@ class LlmChannelPredictCreationParameterDict(TypedDict): modelSpecifier: ModelSpecifierDict history: ChatHistoryDataDict predictionConfigStack: KvConfigStackDict + fuzzyPresetIdentifier: NotRequired[str | None] ignoreServerSessionConfig: NotRequired[bool | None] diff --git a/src/lmstudio/async_api.py b/src/lmstudio/async_api.py index 462e48c..906270b 100644 --- a/src/lmstudio/async_api.py +++ b/src/lmstudio/async_api.py @@ -502,7 +502,10 @@ async def load_new_instance( config: TLoadConfig | TLoadConfigDict | None = None, on_load_progress: ModelLoadingCallback | None = None, ) -> TAsyncModelHandle: - """Load this model with the given identifier and configuration.""" + """Load this model with the given identifier and configuration. + + Note: details of configuration fields may change in SDK feature releases. + """ handle: TAsyncModelHandle = await self._session._load_new_instance( self.model_key, instance_identifier, ttl, config, on_load_progress ) @@ -516,6 +519,11 @@ async def model( config: TLoadConfig | TLoadConfigDict | None = None, on_load_progress: ModelLoadingCallback | None = None, ) -> TAsyncModelHandle: + """Retrieve model with given identifier, or load it with given configuration. + + Note: configuration of retrieved model is NOT checked against the given config. + Note: details of configuration fields may change in SDK feature releases. + """ # Call _get_or_load directly, since we have a model identifier handle: TAsyncModelHandle = await self._session._get_or_load( self.model_key, ttl, config, on_load_progress @@ -786,7 +794,11 @@ async def model( config: TLoadConfig | TLoadConfigDict | None = None, on_load_progress: ModelLoadingCallback | None = None, ) -> TAsyncModelHandle: - """Get a handle to the specified model (loading it if necessary).""" + """Get a handle to the specified model (loading it if necessary). + + Note: configuration of retrieved model is NOT checked against the given config. + Note: details of configuration fields may change in SDK feature releases. + """ if model_key is None: # Should this raise an error if a config is supplied? return await self._get_any() @@ -816,7 +828,10 @@ async def load_new_instance( config: TLoadConfig | TLoadConfigDict | None = None, on_load_progress: ModelLoadingCallback | None = None, ) -> TAsyncModelHandle: - """Load the specified model with the given identifier and configuration.""" + """Load the specified model with the given identifier and configuration. + + Note: details of configuration fields may change in SDK feature releases. 
+ """ return await self._load_new_instance( model_key, instance_identifier, ttl, config, on_load_progress ) @@ -1033,7 +1048,10 @@ async def _complete_stream( on_prediction_fragment: PredictionFragmentCallback | None = None, on_prompt_processing_progress: PromptProcessingCallback | None = None, ) -> AsyncPredictionStream[str] | AsyncPredictionStream[DictObject]: - """Request a one-off prediction without any context and stream the generated tokens.""" + """Request a one-off prediction without any context and stream the generated tokens. + + Note: details of configuration fields may change in SDK feature releases. + """ endpoint = CompletionEndpoint( model_specifier, prompt, @@ -1086,7 +1104,10 @@ async def _respond_stream( on_prediction_fragment: PredictionFragmentCallback | None = None, on_prompt_processing_progress: PromptProcessingCallback | None = None, ) -> AsyncPredictionStream[str] | AsyncPredictionStream[DictObject]: - """Request a response in an ongoing assistant chat session and stream the generated tokens.""" + """Request a response in an ongoing assistant chat session and stream the generated tokens. + + Note: details of configuration fields may change in SDK feature releases. + """ if not isinstance(history, Chat): history = Chat.from_history(history) endpoint = ChatResponseEndpoint( @@ -1256,7 +1277,10 @@ async def complete_stream( on_prediction_fragment: PredictionFragmentCallback | None = None, on_prompt_processing_progress: PromptProcessingCallback | None = None, ) -> AsyncPredictionStream[str] | AsyncPredictionStream[DictObject]: - """Request a one-off prediction without any context and stream the generated tokens.""" + """Request a one-off prediction without any context and stream the generated tokens. + + Note: details of configuration fields may change in SDK feature releases. + """ return await self._session._complete_stream( self.identifier, prompt, @@ -1304,7 +1328,10 @@ async def complete( on_prediction_fragment: PredictionFragmentCallback | None = None, on_prompt_processing_progress: PromptProcessingCallback | None = None, ) -> PredictionResult[str] | PredictionResult[DictObject]: - """Request a one-off prediction without any context.""" + """Request a one-off prediction without any context. + + Note: details of configuration fields may change in SDK feature releases. + """ prediction_stream = await self._session._complete_stream( self.identifier, prompt, @@ -1357,7 +1384,10 @@ async def respond_stream( on_prediction_fragment: PredictionFragmentCallback | None = None, on_prompt_processing_progress: PromptProcessingCallback | None = None, ) -> AsyncPredictionStream[str] | AsyncPredictionStream[DictObject]: - """Request a response in an ongoing assistant chat session and stream the generated tokens.""" + """Request a response in an ongoing assistant chat session and stream the generated tokens. + + Note: details of configuration fields may change in SDK feature releases. + """ return await self._session._respond_stream( self.identifier, history, @@ -1405,7 +1435,10 @@ async def respond( on_prediction_fragment: PredictionFragmentCallback | None = None, on_prompt_processing_progress: PromptProcessingCallback | None = None, ) -> PredictionResult[str] | PredictionResult[DictObject]: - """Request a response in an ongoing assistant chat session.""" + """Request a response in an ongoing assistant chat session. + + Note: details of configuration fields may change in SDK feature releases. 
+ """ prediction_stream = await self._session._respond_stream( self.identifier, history, diff --git a/src/lmstudio/sync_api.py b/src/lmstudio/sync_api.py index 5ce86a4..b7c9558 100644 --- a/src/lmstudio/sync_api.py +++ b/src/lmstudio/sync_api.py @@ -690,7 +690,10 @@ def load_new_instance( config: TLoadConfig | TLoadConfigDict | None = None, on_load_progress: ModelLoadingCallback | None = None, ) -> TModelHandle: - """Load this model with the given identifier and configuration.""" + """Load this model with the given identifier and configuration. + + Note: details of configuration fields may change in SDK feature releases. + """ handle: TModelHandle = self._session._load_new_instance( self.model_key, instance_identifier, ttl, config, on_load_progress ) @@ -704,6 +707,11 @@ def model( config: TLoadConfig | TLoadConfigDict | None = None, on_load_progress: ModelLoadingCallback | None = None, ) -> TModelHandle: + """Retrieve model with default identifier, or load it with given configuration. + + Note: configuration of retrieved model is NOT checked against the given config. + Note: details of configuration fields may change in SDK feature releases. + """ # Call _get_or_load directly, since we have a model identifier handle: TModelHandle = self._session._get_or_load( self.model_key, ttl, config, on_load_progress @@ -951,7 +959,11 @@ def model( config: TLoadConfig | TLoadConfigDict | None = None, on_load_progress: ModelLoadingCallback | None = None, ) -> TModelHandle: - """Get a handle to the specified model (loading it if necessary).""" + """Get a handle to the specified model (loading it if necessary). + + Note: configuration of retrieved model is NOT checked against the given config. + Note: details of configuration fields may change in SDK feature releases. + """ if model_key is None: # Should this raise an error if a config is supplied? return self._get_any() @@ -981,7 +993,10 @@ def load_new_instance( config: TLoadConfig | TLoadConfigDict | None = None, on_load_progress: ModelLoadingCallback | None = None, ) -> TModelHandle: - """Load the specified model with the given identifier and configuration.""" + """Load the specified model with the given identifier and configuration. + + Note: details of configuration fields may change in SDK feature releases. + """ return self._load_new_instance( model_key, instance_identifier, ttl, config, on_load_progress ) @@ -1198,7 +1213,10 @@ def _complete_stream( on_prediction_fragment: PredictionFragmentCallback | None = None, on_prompt_processing_progress: PromptProcessingCallback | None = None, ) -> PredictionStream[str] | PredictionStream[DictObject]: - """Request a one-off prediction without any context and stream the generated tokens.""" + """Request a one-off prediction without any context and stream the generated tokens. + + Note: details of configuration fields may change in SDK feature releases. + """ endpoint = CompletionEndpoint( model_specifier, prompt, @@ -1251,7 +1269,10 @@ def _respond_stream( on_prediction_fragment: PredictionFragmentCallback | None = None, on_prompt_processing_progress: PromptProcessingCallback | None = None, ) -> PredictionStream[str] | PredictionStream[DictObject]: - """Request a response in an ongoing assistant chat session and stream the generated tokens.""" + """Request a response in an ongoing assistant chat session and stream the generated tokens. + + Note: details of configuration fields may change in SDK feature releases. 
+ """ if not isinstance(history, Chat): history = Chat.from_history(history) endpoint = ChatResponseEndpoint( @@ -1417,7 +1438,10 @@ def complete_stream( on_prediction_fragment: PredictionFragmentCallback | None = None, on_prompt_processing_progress: PromptProcessingCallback | None = None, ) -> PredictionStream[str] | PredictionStream[DictObject]: - """Request a one-off prediction without any context and stream the generated tokens.""" + """Request a one-off prediction without any context and stream the generated tokens. + + Note: details of configuration fields may change in SDK feature releases. + """ return self._session._complete_stream( self.identifier, prompt, @@ -1465,7 +1489,10 @@ def complete( on_prediction_fragment: PredictionFragmentCallback | None = None, on_prompt_processing_progress: PromptProcessingCallback | None = None, ) -> PredictionResult[str] | PredictionResult[DictObject]: - """Request a one-off prediction without any context.""" + """Request a one-off prediction without any context. + + Note: details of configuration fields may change in SDK feature releases. + """ prediction_stream = self._session._complete_stream( self.identifier, prompt, @@ -1518,7 +1545,10 @@ def respond_stream( on_prediction_fragment: PredictionFragmentCallback | None = None, on_prompt_processing_progress: PromptProcessingCallback | None = None, ) -> PredictionStream[str] | PredictionStream[DictObject]: - """Request a response in an ongoing assistant chat session and stream the generated tokens.""" + """Request a response in an ongoing assistant chat session and stream the generated tokens. + + Note: details of configuration fields may change in SDK feature releases. + """ return self._session._respond_stream( self.identifier, history, @@ -1566,7 +1596,10 @@ def respond( on_prediction_fragment: PredictionFragmentCallback | None = None, on_prompt_processing_progress: PromptProcessingCallback | None = None, ) -> PredictionResult[str] | PredictionResult[DictObject]: - """Request a response in an ongoing assistant chat session.""" + """Request a response in an ongoing assistant chat session. + + Note: details of configuration fields may change in SDK feature releases. + """ prediction_stream = self._session._respond_stream( self.identifier, history, @@ -1608,7 +1641,10 @@ def act( ] | None = None, ) -> ActResult: - """Request a response (with implicit tool use) in an ongoing agent chat session.""" + """Request a response (with implicit tool use) in an ongoing agent chat session. + + Note: details of configuration fields may change in SDK feature releases. + """ start_time = time.perf_counter() # It is not yet possible to combine tool calling with requests for structured responses response_format = None @@ -1920,7 +1956,11 @@ def llm( ttl: int | None = DEFAULT_TTL, config: LlmLoadModelConfig | LlmLoadModelConfigDict | None = None, ) -> LLM: - """Access an LLM using the default global client.""" + """Access an LLM using the default global client. + + Note: configuration of retrieved model is NOT checked against the given config. + Note: details of configuration fields may change in SDK feature releases. + """ return get_default_client().llm.model(model_key, ttl=ttl, config=config) @@ -1932,7 +1972,11 @@ def embedding_model( ttl: int | None = DEFAULT_TTL, config: EmbeddingLoadModelConfig | EmbeddingLoadModelConfigDict | None = None, ) -> EmbeddingModel: - """Access an embedding model using the default global client.""" + """Access an embedding model using the default global client. 
+ + Note: configuration of retrieved model is NOT checked against the given config. + Note: details of configuration fields may change in SDK feature releases. + """ return get_default_client().embedding.model(model_key, ttl=ttl, config=config) diff --git a/tests/test_kv_config.py b/tests/test_kv_config.py index 9da638d..24abace 100644 --- a/tests/test_kv_config.py +++ b/tests/test_kv_config.py @@ -45,7 +45,7 @@ LOAD_CONFIG_EMBEDDING: EmbeddingLoadModelConfigDict = { "contextLength": 1978, - "gpuOffload": GPU_CONFIG, + "gpu": GPU_CONFIG, "keepModelInMemory": True, "ropeFrequencyBase": 10.0, "ropeFrequencyScale": 1.5, @@ -54,7 +54,7 @@ SC_LOAD_CONFIG_EMBEDDING = { "context_length": 1978, - "gpu_offload": SC_GPU_CONFIG, + "gpu": SC_GPU_CONFIG, "keep_model_in_memory": True, "rope_frequency_base": 10.0, "rope_frequency_scale": 1.5, @@ -65,7 +65,8 @@ "contextLength": 1978, "evalBatchSize": 42, "flashAttention": False, - "gpuOffload": GPU_CONFIG, + "gpu": GPU_CONFIG, + "gpuStrictVramCap": False, "keepModelInMemory": True, "llamaKCacheQuantizationType": "q8_0", "llamaVCacheQuantizationType": "f32", @@ -81,7 +82,8 @@ "context_length": 1978, "eval_batch_size": 42, "flash_attention": False, - "gpu_offload": SC_GPU_CONFIG, + "gpu": SC_GPU_CONFIG, + "gpu_strict_vram_cap": False, "keep_model_in_memory": True, "llama_k_cache_quantization_type": "q8_0", "llama_v_cache_quantization_type": "f32", @@ -334,6 +336,7 @@ def test_kv_stack_field_coverage( {"key": "llm.load.llama.useFp16ForKVCache", "value": True}, {"key": "llm.load.numExperts", "value": 0}, {"key": "llm.load.seed", "value": {"checked": True, "value": 313}}, + {"key": "load.gpuStrictVramCap", "value": False}, ] }, } @@ -455,6 +458,6 @@ def test_kv_stack_prediction_config_conflict() -> None: # (this will most likely involve changing the data model code generation) # def test_nested_unknown_keys() -> None: # config = LOAD_CONFIG_EMBEDDING.copy() -# LOAD_CONFIG_EMBEDDING["gpuOffload"] = SC_GPU_CONFIG +# LOAD_CONFIG_EMBEDDING["gpu"] = SC_GPU_CONFIG # with pytest.raises(msgspec.ValidationError): # EmbeddingLoadModelConfigStrict._from_api_dict(config)
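
A minimal client-side sketch of the config key rename carried by this patch, assuming the `lms.llm()` helper and load-config dict shapes shown in sync_api.py and the generated models above: "gpu" replaces the former "gpuOffload" key, and "gpuStrictVramCap" is the new LLM-only load setting that _kv_config.py excludes from the embedding keymap. The model key and values below are placeholders for illustration only.

import lmstudio as lms

# Placeholder model key and illustrative values.
# "gpu" replaces the old "gpuOffload" key; "ratio" is the offload ratio
# nested under it (cf. NestedKeyField("gpu", "ratio") in _kv_config.py).
# "gpuStrictVramCap" is the new boolean load setting for LLM loads only.
model = lms.llm(
    "my-model-key",
    config={
        "gpu": {"ratio": 0.5},
        "gpuStrictVramCap": False,
        "contextLength": 4096,
    },
)
print(model.respond("Hello!"))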