feat: add vllm cache

fridayL · fridayL · commit 91562b3cc900 · 2025-07-15T08:50:30.000Z
diff --git a/examples/core_memories/vllm_kv_cache_memory.py b/examples/core_memories/vllm_kv_cache_memory.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+"""
+Example demonstrating how to use VLLMKVCacheMemory with vLLM backend.
+This example shows how to use the new vLLM-compatible KV cache memory.
+"""
+
+from memos.configs.memory import MemoryConfigFactory
+from memos.memories.factory import MemoryFactory
+
+
+def main():
+    """Walk through the VLLMKVCacheMemory API against a vLLM server.
+
+    The first extraction is required: if it fails the function prints the error and
+    returns. Failures caught in later steps are printed and counted; the closing
+    banner reports how many steps failed.
+    """
+
+    print("=== VLLM KV Cache Memory Example ===\n")
+
+    # Number of steps that reported a failure; drives the closing banner.
+    failed_steps = 0
+
+    # 1. Create config for VLLMKVCacheMemory (using vLLM backend)
+    config = MemoryConfigFactory(
+        backend="vllm_kv_cache",  # Use the new vLLM KV cache backend
+        config={
+            "extractor_llm": {
+                "backend": "vllm",
+                "config": {
+                    "model_name_or_path": "/mnt/afs/models/hf_models/Qwen2.5-7B",
+                    "api_base": "http://localhost:8088/v1",
+                    "temperature": 0.7,
+                    "max_tokens": 1024,
+                    "model_schema": "memos.configs.llm.VLLMLLMConfig",
+                },
+            },
+        },
+    )
+
+    # 2. Instantiate VLLMKVCacheMemory using the factory
+    print("Initializing VLLM KV Cache Memory...")
+    vllm_kv_mem = MemoryFactory.from_config(config)
+    print("✓ VLLM KV Cache Memory initialized successfully.\n")
+
+    # 3. Extract a VLLMKVCacheItem from a prompt
+    print("===== Extract VLLMKVCacheItem =====")
+    system_prompt = [
+        {"role": "system", "content": "You are a helpful AI assistant."},
+        {"role": "user", "content": "What is MemOS?"},
+        {"role": "assistant", "content": "MemOS is a memory operating system for LLMs."},
+    ]
+
+    try:
+        cache_item = vllm_kv_mem.extract(system_prompt)
+        print("✓ KV cache item extracted successfully")
+        print(f"  ID: {cache_item.id}")
+        print(f"  Memory (prompt): {cache_item.memory[:100]}...")
+        print(f"  Metadata: {cache_item.metadata}")
+        print()
+    except Exception as e:
+        print(f"✗ Failed to extract KV cache item: {e}")
+        return
+
+    # 4. Add the extracted VLLMKVCacheItem
+    print("===== Add VLLMKVCacheItem =====")
+    vllm_kv_mem.add([cache_item])
+    all_items = vllm_kv_mem.get_all()
+    print(f"✓ Added cache item. Total items: {len(all_items)}")
+    print()
+
+    # IDs to preload in step 8; an ID is listed only once its item is stored.
+    preload_ids = [cache_item.id]
+
+    # 5. Get by id
+    print("===== Get VLLMKVCacheItem by id =====")
+    retrieved = vllm_kv_mem.get(cache_item.id)
+    if retrieved:
+        print(f"✓ Retrieved cache item: {retrieved.id}")
+        print(f"  Memory (prompt): {retrieved.memory[:100]}...")
+    else:
+        failed_steps += 1
+        print("✗ Failed to retrieve cache item")
+    print()
+
+    # 6. Get cache (returns prompt string for vLLM)
+    print("===== Get Cache (Prompt String) =====")
+    prompt_string = vllm_kv_mem.get_cache([cache_item.id])
+    if prompt_string:
+        print(f"✓ Retrieved prompt string: {prompt_string[:100]}...")
+        print("  This prompt can be used for vLLM generation with preloaded KV cache")
+    else:
+        failed_steps += 1
+        print("✗ Failed to retrieve prompt string")
+    print()
+
+    # 7. Extract another cache item for demonstration
+    print("===== Extract Another VLLMKVCacheItem =====")
+    another_prompt = [
+        {"role": "system", "content": "You are a coding assistant."},
+        {"role": "user", "content": "Write a Python function to calculate fibonacci numbers."},
+    ]
+
+    try:
+        cache_item2 = vllm_kv_mem.extract(another_prompt)
+        vllm_kv_mem.add([cache_item2])
+        preload_ids.append(cache_item2.id)
+        print(f"✓ Added second cache item. Total items: {len(vllm_kv_mem.get_all())}")
+        print()
+    except Exception as e:
+        failed_steps += 1
+        print(f"✗ Failed to extract second KV cache item: {e}")
+        print()
+
+    # 8. Preload KV cache on vLLM server
+    # Uses preload_ids rather than cache_item2.id directly: cache_item2 is unbound when
+    # step 7 failed, which would surface here as a misleading preload error.
+    print("===== Preload KV Cache on vLLM Server =====")
+    try:
+        vllm_kv_mem.preload_kv_cache(preload_ids)
+        print("✓ KV cache preloaded on vLLM server successfully")
+        print("  The server now has the KV cache ready for fast generation")
+    except Exception as e:
+        failed_steps += 1
+        print(f"✗ Failed to preload KV cache: {e}")
+    print()
+
+    # 9. Delete one item
+    print("===== Delete One VLLMKVCacheItem =====")
+    vllm_kv_mem.delete([cache_item.id])
+    remaining_items = vllm_kv_mem.get_all()
+    print(f"✓ Deleted cache item. Remaining items: {len(remaining_items)}")
+    print()
+
+    # 10. Dump and load
+    print("===== Dump and Load VLLMKVCacheMemory =====")
+    try:
+        vllm_kv_mem.dump("tmp/vllm_kv_mem")
+        print("✓ Memory dumped to 'tmp/vllm_kv_mem'")
+
+        # Clear memory and reload
+        vllm_kv_mem.delete_all()
+        vllm_kv_mem.load("tmp/vllm_kv_mem")
+        reloaded_items = vllm_kv_mem.get_all()
+        print(f"✓ Memory loaded from 'tmp/vllm_kv_mem': {len(reloaded_items)} items")
+    except Exception as e:
+        failed_steps += 1
+        print(f"✗ Failed to dump/load memory: {e}")
+    print()
+
+    # Do not claim success when any step above reported a failure.
+    if failed_steps:
+        print(f"=== Example completed with {failed_steps} failed step(s) ===")
+    else:
+        print("=== Example completed successfully ===")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/memos/api/config.py b/src/memos/api/config.py
@@ -79,7 +79,7 @@ def get_activation_config() -> dict[str, Any]:
     def get_activation_vllm_config() -> dict[str, Any]:
         """Get Ollama configuration."""
         return {
-            "backend": "kv_cache",
+            "backend": "vllm_kv_cache",
             "config": {
                 "memory_filename": "activation_memory.pickle",
                 "extractor_llm": {
@@ -121,6 +121,7 @@ def get_scheduler_config() -> dict[str, Any]:
                     "MOS_SCHEDULER_ENABLE_PARALLEL_DISPATCH", "true"
                 ).lower()
                 == "true",
+                "enable_act_memory_update": True,
             },
         }
 
diff --git a/src/memos/configs/mem_cube.py b/src/memos/configs/mem_cube.py
@@ -70,7 +70,7 @@ def validate_text_mem(cls, text_mem: MemoryConfigFactory) -> MemoryConfigFactory
     @classmethod
     def validate_act_mem(cls, act_mem: MemoryConfigFactory) -> MemoryConfigFactory:
         """Validate the act_mem field."""
-        allowed_backends = ["kv_cache", "uninitialized"]
+        allowed_backends = ["kv_cache", "vllm_kv_cache", "uninitialized"]
         if act_mem.backend not in allowed_backends:
             raise ConfigurationError(
                 f"GeneralMemCubeConfig requires act_mem backend to be one of {allowed_backends}, got '{act_mem.backend}'"
diff --git a/src/memos/configs/memory.py b/src/memos/configs/memory.py
@@ -181,6 +181,7 @@ class MemoryConfigFactory(BaseConfig):
         "general_text": GeneralTextMemoryConfig,
         "tree_text": TreeTextMemoryConfig,
         "kv_cache": KVCacheMemoryConfig,
+        "vllm_kv_cache": KVCacheMemoryConfig,  # Use same config as kv_cache
         "lora": LoRAMemoryConfig,
         "uninitialized": UninitializedMemoryConfig,
     }
diff --git a/src/memos/mem_os/product.py b/src/memos/mem_os/product.py
@@ -14,7 +14,6 @@
 from memos.mem_cube.general import GeneralMemCube
 from memos.mem_os.core import MOSCore
 from memos.mem_os.utils.format_utils import (
-    convert_activation_memory_to_serializable,
     convert_graph_to_tree_forworkmem,
     filter_nodes_by_tree_ids,
     remove_embedding_recursive,
@@ -903,12 +902,11 @@ def get_all(
             )
         elif memory_type == "para_mem":
             act_mem_params = self.mem_cubes[mem_cube_ids[0]].act_mem.get_all()
-            # Convert activation memory to serializable format
-            serializable_act_mem = convert_activation_memory_to_serializable(act_mem_params)
+            logger.info(f"act_mem_params: {act_mem_params}")
             reformat_memory_list.append(
                 {
                     "cube_id": "xxxxxxxxxxxxxxxx" if not mem_cube_ids else mem_cube_ids[0],
-                    "memories": serializable_act_mem,
+                    "memories": act_mem_params[0].model_dump(),
                 }
             )
         return reformat_memory_list
diff --git a/src/memos/memories/activation/item.py b/src/memos/memories/activation/item.py
@@ -35,3 +35,16 @@ class KVCacheItem(ActivationMemoryItem):
 
     model_config = ConfigDict(arbitrary_types_allowed=True)  # To allow DynamicCache as a field type
     records: KVCacheRecords = KVCacheRecords()
+
+
+class VLLMKVCacheItem(KVCacheItem):
+    """
+    KV cache item for the vLLM backend: ``memory`` holds a prompt string, not a DynamicCache.
+    vLLM builds the KV cache server-side when the prompt is preloaded, so only the text is kept.
+    """
+
+    # Redeclare ``memory`` as ``str`` (default "") so the item carries the prompt text itself.
+    memory: str = Field(
+        default="",
+        description="Prompt string used to preload KV cache in vLLM server",
+    )
diff --git a/src/memos/memories/activation/vllmkv.py b/src/memos/memories/activation/vllmkv.py
diff --git a/src/memos/memories/factory.py b/src/memos/memories/factory.py

Original file line number	Diff line number	Diff line change
`@@ -181,6 +181,7 @@ class MemoryConfigFactory(BaseConfig):`
`181`	`181`	`"general_text": GeneralTextMemoryConfig,`
`182`	`182`	`"tree_text": TreeTextMemoryConfig,`
`183`	`183`	`"kv_cache": KVCacheMemoryConfig,`
	`184`	`+ "vllm_kv_cache": KVCacheMemoryConfig, # Use same config as kv_cache`
`184`	`185`	`"lora": LoRAMemoryConfig,`
`185`	`186`	`"uninitialized": UninitializedMemoryConfig,`
`186`	`187`	`}`
Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,6 @@`
`14`	`14`	`from memos.mem_cube.general import GeneralMemCube`
`15`	`15`	`from memos.mem_os.core import MOSCore`
`16`	`16`	`from memos.mem_os.utils.format_utils import (`
`17`		`- convert_activation_memory_to_serializable,`
`18`	`17`	`convert_graph_to_tree_forworkmem,`
`19`	`18`	`filter_nodes_by_tree_ids,`
`20`	`19`	`remove_embedding_recursive,`
`@@ -903,12 +902,11 @@ def get_all(`
`903`	`902`	`)`
`904`	`903`	`elif memory_type == "para_mem":`
`905`	`904`	`act_mem_params = self.mem_cubes[mem_cube_ids[0]].act_mem.get_all()`
`906`		`- # Convert activation memory to serializable format`
`907`		`- serializable_act_mem = convert_activation_memory_to_serializable(act_mem_params)`
	`905`	`+ logger.info(f"act_mem_params: {act_mem_params}")`
`908`	`906`	`reformat_memory_list.append(`
`909`	`907`	`{`
`910`	`908`	`"cube_id": "xxxxxxxxxxxxxxxx" if not mem_cube_ids else mem_cube_ids[0],`
`911`		`- "memories": serializable_act_mem,`
	`909`	`+ "memories": act_mem_params[0].model_dump(),`
`912`	`910`	`}`
`913`	`911`	`)`
`914`	`912`	`return reformat_memory_list`