Skip to content

Commit e08e164

Browse files
authored
feat: complete multi modal (#562)
* fix: multi-modal memreader init error * fix: kwargs bug * feat: init examples for each multi-modal parser * feat: simple user_parser * feat: add multi-modal-parser example * feat: add multi-modal-parser example * feat: update user parser: only handle ChatCompletionUserMessageParam messages * feat: rewrite create source and parse fast for system parser * feat: rewrite create source and parse fast for system parser * feat: rewrite assistant parser * feat: add additional sources to assistant parser * feat: add concat fast-mode memories from multi parsers * refactor: fix name * refactor: fix name * refactor: fix name * refactor: fix name * refactor: fix name * refactor: fix name * feat: add fine process path-A in multi_modal_struct * feat: add fine process path-A in multi_modal_struct * feat: add compare simple&multimodal example * feat: add _process_transfer_multi_modal_data in multimodal * feat: add image type * feat: add tool role; update string/text/tool parser * feat: update file_content_parser and multimodal reader * feat: default mem-reader for api is now set to multimodal reader
1 parent 8724d58 commit e08e164

File tree

12 files changed

+1285
-720
lines changed

12 files changed

+1285
-720
lines changed

examples/mem_reader/multimodal_struct_reader.py

Lines changed: 764 additions & 688 deletions
Large diffs are not rendered by default.

src/memos/api/config.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -426,7 +426,7 @@ def get_embedder_config() -> dict[str, Any]:
426426
def get_reader_config() -> dict[str, Any]:
427427
"""Get reader configuration."""
428428
return {
429-
"backend": os.getenv("MEM_READER_BACKEND", "simple_struct"),
429+
"backend": os.getenv("MEM_READER_BACKEND", "multimodal_struct"),
430430
"config": {
431431
"chunk_type": os.getenv("MEM_READER_CHAT_CHUNK_TYPE", "default"),
432432
"chunk_length": int(os.getenv("MEM_READER_CHAT_CHUNK_TOKEN_SIZE", 1600)),

src/memos/mem_reader/read_multi_modal/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from .assistant_parser import AssistantParser
1717
from .base import BaseMessageParser
1818
from .file_content_parser import FileContentParser
19+
from .image_parser import ImageParser
1920
from .multi_modal_parser import MultiModalParser
2021
from .string_parser import StringParser
2122
from .system_parser import SystemParser
@@ -29,6 +30,7 @@
2930
"AssistantParser",
3031
"BaseMessageParser",
3132
"FileContentParser",
33+
"ImageParser",
3234
"MultiModalParser",
3335
"StringParser",
3436
"SystemParser",

src/memos/mem_reader/read_multi_modal/assistant_parser.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,10 @@ def parse_fast(
227227
# Combine all content parts
228228
content = " ".join(content_parts) if content_parts else ""
229229

230+
# If content is empty but we have tool_calls, audio, or refusal, still create memory
231+
if not content and not tool_calls and not audio and not refusal:
232+
return []
233+
230234
parts = [f"{role}: "]
231235
if chat_time:
232236
parts.append(f"[{chat_time}]: ")

src/memos/mem_reader/read_multi_modal/file_content_parser.py

Lines changed: 111 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,15 @@
55
from memos.embedders.base import BaseEmbedder
66
from memos.llms.base import BaseLLM
77
from memos.log import get_logger
8-
from memos.memories.textual.item import SourceMessage, TextualMemoryItem
8+
from memos.memories.textual.item import (
9+
SourceMessage,
10+
TextualMemoryItem,
11+
TreeNodeTextualMemoryMetadata,
12+
)
913
from memos.parsers.factory import ParserFactory
1014
from memos.types.openai_chat_completion_types import File
1115

12-
from .base import BaseMessageParser
16+
from .base import BaseMessageParser, _derive_key
1317

1418

1519
logger = get_logger(__name__)
@@ -121,7 +125,111 @@ def parse_fast(
121125
info: dict[str, Any],
122126
**kwargs,
123127
) -> list[TextualMemoryItem]:
124-
return []
128+
"""
129+
Parse file content part in fast mode.
130+
131+
Fast mode extracts file information and creates a memory item without parsing file content.
132+
Handles various file parameter scenarios:
133+
- file_data: base64 encoded data, URL, or plain text content
134+
- file_id: ID of an uploaded file
135+
- filename: name of the file
136+
137+
Args:
138+
message: File content part to parse (dict with "type": "file" and "file": {...})
139+
info: Dictionary containing user_id and session_id
140+
**kwargs: Additional parameters
141+
142+
Returns:
143+
List of TextualMemoryItem objects
144+
"""
145+
if not isinstance(message, dict):
146+
logger.warning(f"[FileContentParser] Expected dict, got {type(message)}")
147+
return []
148+
149+
# Extract file information
150+
file_info = message.get("file", {})
151+
if not isinstance(file_info, dict):
152+
logger.warning(f"[FileContentParser] Expected file dict, got {type(file_info)}")
153+
return []
154+
155+
# Extract file parameters (all are optional)
156+
file_data = file_info.get("file_data", "")
157+
file_id = file_info.get("file_id", "")
158+
filename = file_info.get("filename", "")
159+
160+
# Build content string based on available information
161+
content_parts = []
162+
163+
# Priority 1: If file_data is provided, use it (could be base64, URL, or plain text)
164+
if file_data:
165+
# In fast mode, we don't decode base64 or fetch URLs, just record the reference
166+
if isinstance(file_data, str):
167+
# Check if it looks like base64 (starts with data: or is long base64 string)
168+
if file_data.startswith("data:") or (
169+
len(file_data) > 100
170+
and all(
171+
c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
172+
for c in file_data[:100]
173+
)
174+
):
175+
content_parts.append(f"[File Data (base64/encoded): {len(file_data)} chars]")
176+
# Check if it looks like a URL
177+
elif file_data.startswith(("http://", "https://", "file://")):
178+
content_parts.append(f"[File URL: {file_data}]")
179+
else:
180+
# TODO: split into multiple memory items
181+
content_parts.append(file_data)
182+
else:
183+
content_parts.append(f"[File Data: {type(file_data).__name__}]")
184+
185+
# Priority 2: If file_id is provided, reference it
186+
if file_id:
187+
content_parts.append(f"[File ID: {file_id}]")
188+
189+
# Priority 3: If filename is provided, include it
190+
if filename:
191+
content_parts.append(f"[Filename: {filename}]")
192+
193+
# If no content can be extracted, create a placeholder
194+
if not content_parts:
195+
content_parts.append("[File: unknown]")
196+
197+
# Combine content parts
198+
content = " ".join(content_parts)
199+
200+
# Create source
201+
source = self.create_source(message, info)
202+
203+
# Extract info fields
204+
info_ = info.copy()
205+
user_id = info_.pop("user_id", "")
206+
session_id = info_.pop("session_id", "")
207+
208+
# For file content parts, default to LongTermMemory
209+
# (since we don't have role information at this level)
210+
memory_type = "LongTermMemory"
211+
212+
# Create memory item
213+
memory_item = TextualMemoryItem(
214+
memory=content,
215+
metadata=TreeNodeTextualMemoryMetadata(
216+
user_id=user_id,
217+
session_id=session_id,
218+
memory_type=memory_type,
219+
status="activated",
220+
tags=["mode:fast", "multimodal:file"],
221+
key=_derive_key(content),
222+
embedding=self.embedder.embed([content])[0],
223+
usage=[],
224+
sources=[source],
225+
background="",
226+
confidence=0.99,
227+
type="fact",
228+
info=info_,
229+
),
230+
)
231+
232+
return [memory_item]
125233

126234
def parse_fine(
127235
self,
src/memos/mem_reader/read_multi_modal/image_parser.py

Lines changed: 93 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,93 @@
1+
"""Parser for image_url content parts."""
2+
3+
from typing import Any
4+
5+
from memos.embedders.base import BaseEmbedder
6+
from memos.llms.base import BaseLLM
7+
from memos.log import get_logger
8+
from memos.memories.textual.item import SourceMessage, TextualMemoryItem
9+
from memos.types.openai_chat_completion_types import ChatCompletionContentPartImageParam
10+
11+
from .base import BaseMessageParser
12+
13+
14+
logger = get_logger(__name__)
15+
16+
17+
class ImageParser(BaseMessageParser):
    """Parser for ``image_url`` content parts.

    Converts an image content part (a dict carrying an ``image_url`` entry)
    to and from a ``SourceMessage``. No memory items are produced yet: both
    ``parse_fast`` and ``parse_fine`` currently return an empty list.
    """

    # Marker written in front of the URL in ``SourceMessage.content``; kept in
    # one place so ``create_source`` and ``rebuild_from_source`` cannot drift.
    _CONTENT_PREFIX = "[image_url]: "
    # Detail level used whenever the content part does not specify one.
    _DEFAULT_DETAIL = "auto"

    def __init__(self, embedder: BaseEmbedder, llm: BaseLLM | None = None):
        """
        Initialize ImageParser.

        Args:
            embedder: Embedder for generating embeddings
            llm: Optional LLM for fine mode processing
        """
        super().__init__(embedder, llm)

    def create_source(
        self,
        message: ChatCompletionContentPartImageParam,
        info: dict[str, Any],
    ) -> SourceMessage:
        """Create a SourceMessage from an image_url content part.

        Args:
            message: Image content part. A dict is read through its
                ``image_url`` entry; any other value is stringified.
            info: Request context; not used by this method.

        Returns:
            A SourceMessage of type ``"image"``. For dict input it also
            carries ``original_part``, ``url`` and ``detail``.
        """
        if not isinstance(message, dict):
            return SourceMessage(type="image", content=str(message))

        image_url = message.get("image_url", {})
        if isinstance(image_url, dict):
            # ``or`` treats an explicit None the same as a missing key, so the
            # content never contains the literal text "None".
            url = image_url.get("url") or ""
            detail = image_url.get("detail") or self._DEFAULT_DETAIL
        else:
            # ``image_url`` was supplied as a bare value rather than a dict.
            url = str(image_url)
            detail = self._DEFAULT_DETAIL

        return SourceMessage(
            type="image",
            content=f"{self._CONTENT_PREFIX}{url}",
            original_part=message,
            url=url,
            detail=detail,
        )

    def rebuild_from_source(
        self,
        source: SourceMessage,
    ) -> ChatCompletionContentPartImageParam:
        """Rebuild an image_url content part from a SourceMessage.

        Args:
            source: Source previously produced by ``create_source``.

        Returns:
            ``source.original_part`` when it is present and truthy; otherwise
            a new ``{"type": "image_url", "image_url": {...}}`` dict rebuilt
            from the ``url``/``detail`` attributes, falling back to the URL
            embedded in ``source.content``.
        """
        # Use original_part if available
        original_part = getattr(source, "original_part", None)
        if original_part:
            return original_part

        # Rebuild from source fields. Only a *leading* marker is stripped, so a
        # URL that happens to contain the marker text is left intact.
        url = getattr(source, "url", "") or (source.content or "").removeprefix(
            self._CONTENT_PREFIX
        )
        # A stored None must not leak out as the detail level.
        detail = getattr(source, "detail", None) or self._DEFAULT_DETAIL
        return {
            "type": "image_url",
            "image_url": {
                "url": url,
                "detail": detail,
            },
        }

    def parse_fast(
        self,
        message: ChatCompletionContentPartImageParam,
        info: dict[str, Any],
        **kwargs,
    ) -> list[TextualMemoryItem]:
        """Parse image_url in fast mode.

        Always returns an empty list: images are not processed in fast mode.

        Args:
            message: Image content part (ignored).
            info: Request context (ignored).
            **kwargs: Additional parameters (ignored).

        Returns:
            An empty list.
        """
        # In fast mode, images are not processed (they need vision models).
        # NOTE(review): the original author's note says they are handled later
        # in fine mode via process_transfer - confirm against the reader.
        return []

    def parse_fine(
        self,
        message: ChatCompletionContentPartImageParam,
        info: dict[str, Any],
        **kwargs,
    ) -> list[TextualMemoryItem]:
        """Parse image_url in fine mode.

        Placeholder for future vision model integration; currently returns an
        empty list.

        Args:
            message: Image content part (ignored).
            info: Request context (ignored).
            **kwargs: Additional parameters (ignored).

        Returns:
            An empty list.
        """
        # Fine mode processing would use vision models to extract text from images
        # For now, return empty list
        return []

src/memos/mem_reader/read_multi_modal/multi_modal_parser.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from .assistant_parser import AssistantParser
1616
from .base import BaseMessageParser
1717
from .file_content_parser import FileContentParser
18+
from .image_parser import ImageParser
1819
from .string_parser import StringParser
1920
from .system_parser import SystemParser
2021
from .text_content_parser import TextContentParser
@@ -55,7 +56,7 @@ def __init__(
5556
self.tool_parser = ToolParser(embedder, llm)
5657
self.text_content_parser = TextContentParser(embedder, llm)
5758
self.file_content_parser = FileContentParser(embedder, llm, parser)
58-
self.image_parser = None # future
59+
self.image_parser = ImageParser(embedder, llm)
5960
self.audio_parser = None # future
6061

6162
self.role_parsers = {
@@ -69,7 +70,12 @@ def __init__(
6970
"text": self.text_content_parser,
7071
"file": self.file_content_parser,
7172
"image": self.image_parser,
73+
"image_url": self.image_parser, # Support both "image" and "image_url"
7274
"audio": self.audio_parser,
75+
# Custom tool formats
76+
"tool_description": self.tool_parser,
77+
"tool_input": self.tool_parser,
78+
"tool_output": self.tool_parser,
7379
}
7480

7581
def _get_parser(self, message: Any) -> BaseMessageParser | None:

0 commit comments

Comments
 (0)