Skip to content

Commit 0be6fd7

Browse files
committed
feat: add image type
1 parent da1d15a commit 0be6fd7

File tree

6 files changed

+195
-5
lines changed

6 files changed

+195
-5
lines changed

src/memos/mem_reader/read_multi_modal/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from .assistant_parser import AssistantParser
1717
from .base import BaseMessageParser
1818
from .file_content_parser import FileContentParser
19+
from .image_parser import ImageParser
1920
from .multi_modal_parser import MultiModalParser
2021
from .string_parser import StringParser
2122
from .system_parser import SystemParser
@@ -29,6 +30,7 @@
2930
"AssistantParser",
3031
"BaseMessageParser",
3132
"FileContentParser",
33+
"ImageParser",
3234
"MultiModalParser",
3335
"StringParser",
3436
"SystemParser",

src/memos/mem_reader/read_multi_modal/assistant_parser.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,10 @@ def parse_fast(
227227
# Combine all content parts
228228
content = " ".join(content_parts) if content_parts else ""
229229

230+
# Skip the message only when there is nothing to store: no content, tool_calls, audio, or refusal
231+
if not content and not tool_calls and not audio and not refusal:
232+
return []
233+
230234
parts = [f"{role}: "]
231235
if chat_time:
232236
parts.append(f"[{chat_time}]: ")
src/memos/mem_reader/read_multi_modal/image_parser.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
"""Parser for image_url content parts."""
2+
3+
from typing import Any
4+
5+
from memos.embedders.base import BaseEmbedder
6+
from memos.llms.base import BaseLLM
7+
from memos.log import get_logger
8+
from memos.memories.textual.item import SourceMessage, TextualMemoryItem
9+
from memos.types.openai_chat_completion_types import ChatCompletionContentPartImageParam
10+
11+
from .base import BaseMessageParser
12+
13+
14+
logger = get_logger(__name__)
15+
16+
17+
class ImageParser(BaseMessageParser):
    """Parser for ``image_url`` content parts.

    The parser converts an image content part into a ``SourceMessage`` and
    back. Memory extraction is not implemented yet: both ``parse_fast`` and
    ``parse_fine`` currently return an empty list.
    """

    def __init__(self, embedder: BaseEmbedder, llm: BaseLLM | None = None):
        """
        Initialize ImageParser.

        Args:
            embedder: Embedder for generating embeddings
            llm: Optional LLM for fine mode processing
        """
        super().__init__(embedder, llm)

    def create_source(
        self,
        message: ChatCompletionContentPartImageParam,
        info: dict[str, Any],
    ) -> SourceMessage:
        """Create a SourceMessage from an image_url content part.

        Args:
            message: The content part. A dict is read through its
                ``image_url`` entry; anything else is stringified.
            info: Request metadata; not used by this method.

        Returns:
            A ``SourceMessage`` of type ``"image"``. For dict input it carries
            the URL (also embedded in ``content`` after the ``"[image_url]: "``
            prefix), the detail level and the original part.
        """
        if not isinstance(message, dict):
            return SourceMessage(type="image", content=str(message))

        # A missing *or* explicitly-None entry is treated as "no image data";
        # otherwise str(None) would leak the literal text "None" as the URL.
        image_url = message.get("image_url") or {}
        if isinstance(image_url, dict):
            url = image_url.get("url") or ""
            detail = image_url.get("detail") or "auto"
        else:
            # Bare (non-dict) value: use its string form as the URL.
            url = str(image_url)
            detail = "auto"

        return SourceMessage(
            type="image",
            content=f"[image_url]: {url}",
            original_part=message,
            url=url,
            detail=detail,
        )

    def rebuild_from_source(
        self,
        source: SourceMessage,
    ) -> ChatCompletionContentPartImageParam:
        """Rebuild an image_url content part from a SourceMessage.

        Args:
            source: Source previously produced by ``create_source`` (or any
                source exposing ``content`` and optionally ``url``,
                ``detail`` and ``original_part``).

        Returns:
            ``source.original_part`` when it is present and truthy; otherwise
            a new ``{"type": "image_url", "image_url": {...}}`` dict built
            from the source fields.
        """
        # Prefer the untouched original part when it was stored.
        original_part = getattr(source, "original_part", None)
        if original_part:
            return original_part

        # Fall back to the content text. Only the leading marker written by
        # create_source is removed, so a URL that happens to contain the
        # marker text is left intact.
        url = getattr(source, "url", None) or (source.content or "").removeprefix(
            "[image_url]: "
        )
        # An attribute that exists but is unset (None/empty) still means "auto".
        detail = getattr(source, "detail", None) or "auto"
        return {
            "type": "image_url",
            "image_url": {
                "url": url,
                "detail": detail,
            },
        }

    def parse_fast(
        self,
        message: ChatCompletionContentPartImageParam,
        info: dict[str, Any],
        **kwargs,
    ) -> list[TextualMemoryItem]:
        """Parse image_url in fast mode.

        Returns:
            Always an empty list: fast mode does not process images.
        """
        # Images need a vision model, so nothing is extracted in fast mode.
        return []

    def parse_fine(
        self,
        message: ChatCompletionContentPartImageParam,
        info: dict[str, Any],
        **kwargs,
    ) -> list[TextualMemoryItem]:
        """Parse image_url in fine mode.

        Placeholder for future vision model integration.

        Returns:
            Always an empty list for now.
        """
        # Fine mode would use a vision model to extract text from the image.
        return []

src/memos/mem_reader/read_multi_modal/multi_modal_parser.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from .assistant_parser import AssistantParser
1616
from .base import BaseMessageParser
1717
from .file_content_parser import FileContentParser
18+
from .image_parser import ImageParser
1819
from .string_parser import StringParser
1920
from .system_parser import SystemParser
2021
from .text_content_parser import TextContentParser
@@ -55,7 +56,7 @@ def __init__(
5556
self.tool_parser = ToolParser(embedder, llm)
5657
self.text_content_parser = TextContentParser(embedder, llm)
5758
self.file_content_parser = FileContentParser(embedder, llm, parser)
58-
self.image_parser = None # future
59+
self.image_parser = ImageParser(embedder, llm)
5960
self.audio_parser = None # future
6061

6162
self.role_parsers = {
@@ -69,7 +70,12 @@ def __init__(
6970
"text": self.text_content_parser,
7071
"file": self.file_content_parser,
7172
"image": self.image_parser,
73+
"image_url": self.image_parser, # Support both "image" and "image_url"
7274
"audio": self.audio_parser,
75+
# Custom tool formats
76+
"tool_description": self.tool_parser,
77+
"tool_input": self.tool_parser,
78+
"tool_output": self.tool_parser,
7379
}
7480

7581
def _get_parser(self, message: Any) -> BaseMessageParser | None:

src/memos/mem_reader/read_multi_modal/tool_parser.py

Lines changed: 84 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,52 @@ def __init__(self, embedder: BaseEmbedder, llm: BaseLLM | None = None):
2929

3030
def create_source(
3131
self,
32-
message: ChatCompletionToolMessageParam,
32+
message: ChatCompletionToolMessageParam | dict[str, Any],
3333
info: dict[str, Any],
3434
) -> SourceMessage:
35-
"""Create SourceMessage from tool message."""
35+
"""Create SourceMessage from tool message or custom tool format."""
3636
if not isinstance(message, dict):
3737
return SourceMessage(type="chat", role="tool")
3838

39+
# Handle custom tool formats (tool_description, tool_input, tool_output)
40+
msg_type = message.get("type", "")
41+
if msg_type == "tool_description":
42+
name = message.get("name", "")
43+
description = message.get("description", "")
44+
parameters = message.get("parameters", {})
45+
content = f"[tool_description] name={name}, description={description}, parameters={parameters}"
46+
return SourceMessage(
47+
type="tool_description",
48+
content=content,
49+
original_part=message,
50+
)
51+
elif msg_type == "tool_input":
52+
call_id = message.get("call_id", "")
53+
name = message.get("name", "")
54+
argument = message.get("argument", {})
55+
content = f"[tool_input] call_id={call_id}, name={name}, argument={argument}"
56+
return SourceMessage(
57+
type="tool_input",
58+
content=content,
59+
message_id=call_id,
60+
original_part=message,
61+
)
62+
elif msg_type == "tool_output":
63+
call_id = message.get("call_id", "")
64+
name = message.get("name", "")
65+
output = message.get("output", {})
66+
content = f"[tool_output] call_id={call_id}, name={name}, output={output}"
67+
return SourceMessage(
68+
type="tool_output",
69+
content=content,
70+
message_id=call_id,
71+
original_part=message,
72+
)
73+
74+
# Handle standard tool message
3975
content = _extract_text_from_content(message.get("content", ""))
4076
return SourceMessage(
41-
type="chat",
77+
type="tool",
4278
role="tool",
4379
chat_time=message.get("chat_time"),
4480
message_id=message.get("message_id"),
@@ -60,10 +96,54 @@ def rebuild_from_source(
6096

6197
def parse_fast(
    self,
    message: ChatCompletionToolMessageParam | dict[str, Any],
    info: dict[str, Any],
    **kwargs,
) -> list[TextualMemoryItem]:
    """Parse a tool message in fast mode.

    Args:
        message: A standard tool message or a custom tool part whose
            ``type`` is ``tool_description``, ``tool_input`` or
            ``tool_output``.
        info: Request metadata. ``user_id`` and ``session_id`` are lifted
            into the memory metadata; the remaining keys are stored as
            ``info``. The caller's dict is not modified.
        **kwargs: Forwarded to the parent ``parse_fast`` for standard tool
            messages.

    Returns:
        An empty list for non-dict input or empty source content, a single
        memory item for a custom tool part, or whatever the parent class
        returns for a standard tool message.
    """
    from memos.memories.textual.item import TreeNodeTextualMemoryMetadata

    from .base import _derive_key

    if not isinstance(message, dict):
        return []

    # Anything that is not one of the custom tool parts is a standard tool
    # message and is handled by the parent implementation.
    is_custom_part = message.get("type", "") in (
        "tool_description",
        "tool_input",
        "tool_output",
    )
    if not is_custom_part:
        return super().parse_fast(message, info, **kwargs)

    source = self.create_source(message, info)
    text = source.content or ""
    if not text:
        return []

    # Work on a copy so popping the identity fields leaves `info` untouched.
    remaining_info = info.copy()
    user_id = remaining_info.pop("user_id", "")
    session_id = remaining_info.pop("session_id", "")

    metadata = TreeNodeTextualMemoryMetadata(
        user_id=user_id,
        session_id=session_id,
        memory_type="LongTermMemory",
        status="activated",
        tags=["mode:fast"],
        key=_derive_key(text),
        embedding=self.embedder.embed([text])[0],
        usage=[],
        sources=[source],
        background="",
        confidence=0.99,
        type="fact",
        info=remaining_info,
    )
    return [TextualMemoryItem(memory=text, metadata=metadata)]
68148

69149
def parse_fine(

src/memos/mem_reader/read_multi_modal/utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,11 @@ def coerce_scene_data(scene_data: SceneDataInput, scene_type: str) -> list[Messa
9393
if not items:
9494
continue
9595

96+
# Keep string as-is (MessagesType supports str)
97+
if isinstance(items, str):
98+
complete_scene_data.append(items)
99+
continue
100+
96101
# ONLY add chat_time if it's a MessageList
97102
if not _is_message_list(items):
98103
complete_scene_data.append(items)

0 commit comments

Comments
 (0)