diff --git a/src/lmstudio/_kv_config.py b/src/lmstudio/_kv_config.py
index a88ab34..a55e7b4 100644
--- a/src/lmstudio/_kv_config.py
+++ b/src/lmstudio/_kv_config.py
@@ -224,8 +224,8 @@ def _gpu_split_config_to_gpu_settings(
             **_COMMON_LLAMA_LOAD_KEYS,
             "evalBatchSize": ConfigField("evalBatchSize"),
             "flashAttention": ConfigField("flashAttention"),
-            "llamaKCacheQuantizationType": CheckboxField("llamaKCacheQuantizationType"),
-            "llamaVCacheQuantizationType": CheckboxField("llamaVCacheQuantizationType"),
+            "kCacheQuantizationType": CheckboxField("llamaKCacheQuantizationType"),
+            "vCacheQuantizationType": CheckboxField("llamaVCacheQuantizationType"),
             "useFp16ForKVCache": ConfigField("useFp16ForKVCache"),
         },
     },
diff --git a/tests/test_kv_config.py b/tests/test_kv_config.py
index ca852ce..8b472f0 100644
--- a/tests/test_kv_config.py
+++ b/tests/test_kv_config.py
@@ -328,15 +328,11 @@ def test_kv_stack_field_coverage(
         },
         {"key": "llm.load.llama.evalBatchSize", "value": 42},
         {"key": "llm.load.llama.flashAttention", "value": False},
-        {"key": "llm.load.llama.keepModelInMemory", "value": True},
         {
-            "key": "llm.load.llama.llamaKCacheQuantizationType",
+            "key": "llm.load.llama.kCacheQuantizationType",
             "value": {"checked": True, "value": "q8_0"},
         },
-        {
-            "key": "llm.load.llama.llamaVCacheQuantizationType",
-            "value": {"checked": True, "value": "f32"},
-        },
+        {"key": "llm.load.llama.keepModelInMemory", "value": True},
         {
             "key": "llm.load.llama.ropeFrequencyBase",
             "value": {"checked": True, "value": 10.0},
@@ -347,6 +343,10 @@
         },
         {"key": "llm.load.llama.tryMmap", "value": False},
         {"key": "llm.load.llama.useFp16ForKVCache", "value": True},
+        {
+            "key": "llm.load.llama.vCacheQuantizationType",
+            "value": {"checked": True, "value": "f32"},
+        },
         {"key": "llm.load.numExperts", "value": 0},
         {"key": "llm.load.offloadKVCacheToGpu", "value": False},
         {"key": "llm.load.seed", "value": {"checked": True, "value": 313}},
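
For reviewers unfamiliar with the checkbox wire format exercised by the test data above: `CheckboxField` values travel as a `{"checked": ..., "value": ...}` pair, where an unchecked box means the setting is unset. Below is a rough sketch of that translation; the helper names are hypothetical illustrations, not the SDK's actual implementation (which lives in `lmstudio._kv_config`):

```python
from typing import Any, Optional


def checkbox_to_client(raw: dict[str, Any]) -> Optional[Any]:
    # Hypothetical sketch: collapse a server-side checkbox pair into a
    # plain client value. Mirrors the {"checked": True, "value": "q8_0"}
    # shapes in the test data above; not the SDK's real translation code.
    return raw["value"] if raw.get("checked") else None


def client_to_checkbox(value: Optional[Any]) -> dict[str, Any]:
    # Inverse direction (also a sketch): None means the box is unchecked.
    return {"checked": value is not None, "value": value}


# The renamed K/V cache fields from the test data round-trip cleanly:
assert checkbox_to_client({"checked": True, "value": "q8_0"}) == "q8_0"
assert client_to_checkbox("f32") == {"checked": True, "value": "f32"}
```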