Skip to content

Commit 6f32006

Browse files
fridayL and CaralHsi authored
feat: add file_info for file parser (#651)
* feat: update memos headers * feat: headers add * feat: update search agent * feat: update mem story * feat: update mem scheduler * feat: update deepsearch mem code * feat: update deepsearch agent * feat: update test code * fix: remove dup config * feat: dock search pipeline * fix: code test * feat: add test scripts * feat: add test * feat: update need_raw process * fix: add initter * fix: change agent search func name * feat: update logs and defined * feat: update full text mem search * feat: cp plugin to dev * feat: add one recall for fulltext retrieval * fix: set default for fulltext search * feat: add langchain chunk * feat: fix playground for query * feat: update file content memory extract * feat: update code * feat: update import * code: reformat suffix * feat: update file_id * remove langchain-text-splitters==1.0.0 * feat: add requirement * feat: make test * feat: fix markdown * feat: fix simple chunker * feat: add file sources * feat: add concat doc source * add: file_info --------- Co-authored-by: CaralHsi <[email protected]>
1 parent 43faee0 commit 6f32006

File tree

3 files changed

+10
-6
lines changed

3 files changed

+10
-6
lines changed

src/memos/mem_reader/read_multi_modal/file_content_parser.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ def create_source(
170170
chunk_index: int | None = None,
171171
chunk_total: int | None = None,
172172
chunk_content: str | None = None,
173+
file_url_flag: bool = False,
173174
) -> SourceMessage:
174175
"""Create SourceMessage from file content part."""
175176
if isinstance(message, dict):
@@ -178,6 +179,7 @@ def create_source(
178179
"type": "file",
179180
"doc_path": file_info.get("filename") or file_info.get("file_id", ""),
180181
"content": chunk_content if chunk_content else file_info.get("file_data", ""),
182+
"file_info": file_info if file_url_flag else {},
181183
}
182184
# Add chunk ordering information if provided
183185
if chunk_index is not None:
@@ -202,10 +204,7 @@ def rebuild_from_source(
202204
# Rebuild from source fields
203205
return {
204206
"type": "file",
205-
"file": {
206-
"filename": source.doc_path or "",
207-
"file_data": source.content or "",
208-
},
207+
"file": source.file_info,
209208
}
210209

211210
def _parse_file(self, file_info: dict[str, Any]) -> str:
@@ -278,7 +277,7 @@ def parse_fast(
278277
file_data = file_info.get("file_data", "")
279278
file_id = file_info.get("file_id", "")
280279
filename = file_info.get("filename", "")
281-
280+
file_url_flag = False
282281
# Build content string based on available information
283282
content_parts = []
284283

@@ -297,6 +296,7 @@ def parse_fast(
297296
content_parts.append(f"[File Data (base64/encoded): {len(file_data)} chars]")
298297
# Check if it looks like a URL
299298
elif file_data.startswith(("http://", "https://", "file://")):
299+
file_url_flag = True
300300
content_parts.append(f"[File URL: {file_data}]")
301301
else:
302302
# TODO: split into multiple memory items
@@ -348,6 +348,7 @@ def parse_fast(
348348
chunk_index=chunk_idx,
349349
chunk_total=total_chunks,
350350
chunk_content=chunk_text,
351+
file_url_flag=file_url_flag,
351352
)
352353

353354
memory_item = TextualMemoryItem(
@@ -384,6 +385,7 @@ def parse_fast(
384385
chunk_index=None,
385386
chunk_total=0,
386387
chunk_content=content,
388+
file_url_flag=file_url_flag,
387389
)
388390
memory_item = TextualMemoryItem(
389391
memory=content,

src/memos/mem_reader/read_multi_modal/user_parser.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ def create_source(
8080
message_id=message_id,
8181
doc_path=file_info.get("filename") or file_info.get("file_id", ""),
8282
content=file_info.get("file_data", ""),
83+
file_info=file_info,
8384
)
8485
)
8586
elif part_type == "image_url":

src/memos/memories/textual/item.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ class SourceMessage(BaseModel):
2828
source is a chat turn.
2929
- content: Minimal reproducible snippet from the source. If omitted,
3030
upstream may fall back to `doc_path` / `url` / `message_id`.
31+
- file_info: File information for file source.
3132
- chat_time / message_id / doc_path: Locators for precisely pointing back
3233
to the original record (timestamp, message id, document path).
3334
- Extra fields: Allowed (`model_config.extra="allow"`) to carry arbitrary
@@ -40,7 +41,7 @@ class SourceMessage(BaseModel):
4041
message_id: str | None = None
4142
content: str | None = None
4243
doc_path: str | None = None
43-
44+
file_info: dict | None = None
4445
model_config = ConfigDict(extra="allow")
4546

4647

0 commit comments

Comments
 (0)