Skip to content

Commit 29f64df

Browse files
committed
feat: update file_content_parser fine
1 parent f714027 commit 29f64df

File tree

1 file changed

+164
-1
lines changed

1 file changed

+164
-1
lines changed

src/memos/mem_reader/read_multi_modal/file_content_parser.py

Lines changed: 164 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
"""Parser for file content parts (RawMessageList)."""
22

3+
import os
4+
35
from typing import Any
6+
from urllib.parse import urlparse
47

58
from memos.embedders.base import BaseEmbedder
69
from memos.llms.base import BaseLLM
@@ -237,4 +240,164 @@ def parse_fine(
237240
info: dict[str, Any],
238241
**kwargs,
239242
) -> list[TextualMemoryItem]:
240-
return []
243+
"""
244+
Parse file content part in fine mode.
245+
Fine mode downloads and parses file content, especially for URLs.
246+
Handles various file parameter scenarios:
247+
- file_data: URL (http://, https://, or @http://), base64 encoded data, or plain text content
248+
- file_id: ID of an uploaded file
249+
- filename: name of the file
250+
"""
251+
if not isinstance(message, dict):
252+
logger.warning(f"[FileContentParser] Expected dict, got {type(message)}")
253+
return []
254+
255+
# Extract file information
256+
file_info = message.get("file", {})
257+
if not isinstance(file_info, dict):
258+
logger.warning(f"[FileContentParser] Expected file dict, got {type(file_info)}")
259+
return []
260+
261+
# Extract file parameters (all are optional)
262+
file_data = file_info.get("file_data", "")
263+
file_id = file_info.get("file_id", "")
264+
filename = file_info.get("filename", "")
265+
266+
# Initialize parser if not already set
267+
if not self.parser:
268+
try:
269+
from memos.configs.parser import ParserConfigFactory
270+
271+
parser_config = ParserConfigFactory.model_validate(
272+
{
273+
"backend": "markitdown",
274+
"config": {},
275+
}
276+
)
277+
self.parser = ParserFactory.from_config(parser_config)
278+
except Exception as e:
279+
logger.warning(f"[FileContentParser] Failed to create parser: {e}")
280+
return []
281+
282+
parsed_text = ""
283+
temp_file_path = None
284+
285+
try:
286+
# Priority 1: If file_data is provided, process it
287+
if file_data:
288+
if isinstance(file_data, str):
289+
# Check if it's a URL (supports @http://, http://, https://)
290+
url_str = file_data
291+
if url_str.startswith("@"):
292+
url_str = url_str[1:] # Remove @ prefix if present
293+
294+
if url_str.startswith(("http://", "https://")):
295+
# Download and parse URL
296+
try:
297+
import requests
298+
299+
# Parse URL to check hostname
300+
parsed_url = urlparse(url_str)
301+
hostname = parsed_url.hostname or ""
302+
303+
logger.info(f"[FileContentParser] Downloading file from URL: {url_str}")
304+
response = requests.get(url_str, timeout=30)
305+
response.raise_for_status()
306+
307+
# Determine filename from URL or use provided filename
308+
if not filename:
309+
filename = os.path.basename(parsed_url.path) or "downloaded_file"
310+
311+
# Route based on hostname
312+
if hostname == "139.196.232.20":
313+
# Special handling for 139.196.232.20: directly use response text as markdown
314+
logger.info(
315+
f"[FileContentParser] Using direct markdown content for {hostname}"
316+
)
317+
parsed_text = response.text
318+
else:
319+
logger.warning("[FileContentParser] Outer url not implemented now.")
320+
except requests.RequestException as e:
321+
logger.error(
322+
f"[FileContentParser] Failed to download URL {url_str}: {e}"
323+
)
324+
parsed_text = f"[File URL download failed: {url_str}]"
325+
except Exception as e:
326+
logger.error(f"[FileContentParser] Error parsing downloaded file: {e}")
327+
parsed_text = f"[File parsing error: {e!s}]"
328+
329+
# Check if it's a local file path
330+
elif os.path.exists(file_data):
331+
logger.info("[FileContentParser] local file not implemented now.")
332+
# Check if it's base64 encoded data
333+
elif file_data.startswith("data:") or (
334+
len(file_data) > 100
335+
and all(
336+
c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
337+
for c in file_data[:100]
338+
)
339+
):
340+
logger.info("[FileContentParser] base64 not implemented now.")
341+
# Otherwise treat as plain text
342+
else:
343+
parsed_text = file_data
344+
345+
# Priority 2: If file_id is provided but no file_data, log a warning and emit a placeholder (file_id lookup is not implemented)
346+
elif file_id:
347+
logger.warning(f"[FileContentParser] File data not provided for file_id: {file_id}")
348+
parsed_text = f"[File ID: {file_id}]: File data not provided"
349+
350+
# If no content could be parsed, create a placeholder
351+
if not parsed_text:
352+
if filename:
353+
parsed_text = f"[File: {filename}]: File data not provided"
354+
else:
355+
parsed_text = "[File: unknown]: File data not provided"
356+
357+
except Exception as e:
358+
logger.error(f"[FileContentParser] Error in parse_fine: {e}")
359+
parsed_text = f"[File parsing error: {e!s}]"
360+
361+
finally:
362+
# Clean up temporary file
363+
if temp_file_path and os.path.exists(temp_file_path):
364+
try:
365+
os.unlink(temp_file_path)
366+
logger.debug(f"[FileContentParser] Cleaned up temporary file: {temp_file_path}")
367+
except Exception as e:
368+
logger.warning(
369+
f"[FileContentParser] Failed to delete temp file {temp_file_path}: {e}"
370+
)
371+
372+
# Create source
373+
source = self.create_source(message, info)
374+
375+
# Extract info fields
376+
info_ = info.copy()
377+
user_id = info_.pop("user_id", "")
378+
session_id = info_.pop("session_id", "")
379+
380+
# For file content parts, default to LongTermMemory
381+
memory_type = "LongTermMemory"
382+
383+
# Create memory item with parsed content
384+
memory_item = TextualMemoryItem(
385+
memory=parsed_text,
386+
metadata=TreeNodeTextualMemoryMetadata(
387+
user_id=user_id,
388+
session_id=session_id,
389+
memory_type=memory_type,
390+
status="activated",
391+
tags=["mode:fine", "multimodal:file"],
392+
key=_derive_key(parsed_text),
393+
embedding=self.embedder.embed([parsed_text])[0],
394+
usage=[],
395+
sources=[source],
396+
background="",
397+
confidence=0.99,
398+
type="fact",
399+
info=info_,
400+
),
401+
)
402+
403+
return [memory_item]

0 commit comments

Comments
 (0)