|
1 | 1 | """Parser for file content parts (RawMessageList).""" |
2 | 2 |
|
| 3 | +import os |
| 4 | + |
3 | 5 | from typing import Any |
| 6 | +from urllib.parse import urlparse |
4 | 7 |
|
5 | 8 | from memos.embedders.base import BaseEmbedder |
6 | 9 | from memos.llms.base import BaseLLM |
@@ -237,4 +240,164 @@ def parse_fine( |
237 | 240 | info: dict[str, Any], |
238 | 241 | **kwargs, |
239 | 242 | ) -> list[TextualMemoryItem]: |
240 | | - return [] |
| 243 | + """ |
| 244 | + Parse file content part in fine mode. |
| 245 | + Fine mode downloads and parses file content, especially for URLs. |
| 246 | + Handles various file parameter scenarios: |
| 247 | + - file_data: URL (http:// or https://, optionally prefixed with @), base64 encoded data, or plain text content |
| 248 | + - file_id: ID of an uploaded file |
| 249 | + - filename: name of the file |
| 250 | + """ |
| 251 | + if not isinstance(message, dict): |
| 252 | + logger.warning(f"[FileContentParser] Expected dict, got {type(message)}") |
| 253 | + return [] |
| 254 | + |
| 255 | + # Extract file information |
| 256 | + file_info = message.get("file", {}) |
| 257 | + if not isinstance(file_info, dict): |
| 258 | + logger.warning(f"[FileContentParser] Expected file dict, got {type(file_info)}") |
| 259 | + return [] |
| 260 | + |
| 261 | + # Extract file parameters (all are optional) |
| 262 | + file_data = file_info.get("file_data", "") |
| 263 | + file_id = file_info.get("file_id", "") |
| 264 | + filename = file_info.get("filename", "") |
| 265 | + |
| 266 | + # Initialize parser if not already set |
| 267 | + if not self.parser: |
| 268 | + try: |
| 269 | + from memos.configs.parser import ParserConfigFactory |
| 270 | + |
| 271 | + parser_config = ParserConfigFactory.model_validate( |
| 272 | + { |
| 273 | + "backend": "markitdown", |
| 274 | + "config": {}, |
| 275 | + } |
| 276 | + ) |
| 277 | + self.parser = ParserFactory.from_config(parser_config) |
| 278 | + except Exception as e: |
| 279 | + logger.warning(f"[FileContentParser] Failed to create parser: {e}") |
| 280 | + return [] |
| 281 | + |
| 282 | + parsed_text = "" |
| 283 | + temp_file_path = None |
| 284 | + |
| 285 | + try: |
| 286 | + # Priority 1: If file_data is provided, process it |
| 287 | + if file_data: |
| 288 | + if isinstance(file_data, str): |
| 289 | + # Check if it's a URL (http:// or https://, with an optional leading @) |
| 290 | + url_str = file_data |
| 291 | + if url_str.startswith("@"): |
| 292 | + url_str = url_str[1:] # Remove @ prefix if present |
| 293 | + |
| 294 | + if url_str.startswith(("http://", "https://")): |
| 295 | + # Download and parse URL |
| 296 | + try: |
| 297 | + import requests |
| 298 | + |
| 299 | + # Parse URL to check hostname |
| 300 | + parsed_url = urlparse(url_str) |
| 301 | + hostname = parsed_url.hostname or "" |
| 302 | + |
| 303 | + logger.info(f"[FileContentParser] Downloading file from URL: {url_str}") |
| 304 | + response = requests.get(url_str, timeout=30) |
| 305 | + response.raise_for_status() |
| 306 | + |
| 307 | + # Determine filename from URL or use provided filename |
| 308 | + if not filename: |
| 309 | + filename = os.path.basename(parsed_url.path) or "downloaded_file" |
| 310 | + |
| 311 | + # Route based on hostname |
| 312 | + if hostname == "139.196.232.20": |
| 313 | + # Special handling for 139.196.232.20: directly use response text as markdown |
| 314 | + logger.info( |
| 315 | + f"[FileContentParser] Using direct markdown content for {hostname}" |
| 316 | + ) |
| 317 | + parsed_text = response.text |
| 318 | + else: |
| 319 | + logger.warning("[FileContentParser] Outer url not implemented now.") |
| 320 | + except requests.RequestException as e: |
| 321 | + logger.error( |
| 322 | + f"[FileContentParser] Failed to download URL {url_str}: {e}" |
| 323 | + ) |
| 324 | + parsed_text = f"[File URL download failed: {url_str}]" |
| 325 | + except Exception as e: |
| 326 | + logger.error(f"[FileContentParser] Error parsing downloaded file: {e}") |
| 327 | + parsed_text = f"[File parsing error: {e!s}]" |
| 328 | + |
| 329 | + # Check if it's a local file path |
| 330 | + elif os.path.exists(file_data): |
| 331 | + logger.info("[FileContentParser] local file not implemented now.") |
| 332 | + # Check if it's base64 encoded data |
| 333 | + elif file_data.startswith("data:") or ( |
| 334 | + len(file_data) > 100 |
| 335 | + and all( |
| 336 | + c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" |
| 337 | + for c in file_data[:100] |
| 338 | + ) |
| 339 | + ): |
| 340 | + logger.info("[FileContentParser] base64 not implemented now.") |
| 341 | + # Otherwise treat as plain text |
| 342 | + else: |
| 343 | + parsed_text = file_data |
| 344 | + |
| 345 | + # Priority 2: If only file_id is provided (no file_data), emit a placeholder; lookup by file_id is not implemented |
| 346 | + elif file_id: |
| 347 | + logger.warning(f"[FileContentParser] File data not provided for file_id: {file_id}") |
| 348 | + parsed_text = f"[File ID: {file_id}]: File data not provided" |
| 349 | + |
| 350 | + # If no content could be parsed, create a placeholder |
| 351 | + if not parsed_text: |
| 352 | + if filename: |
| 353 | + parsed_text = f"[File: {filename}]: File data not provided" |
| 354 | + else: |
| 355 | + parsed_text = "[File: unknown]: File data not provided" |
| 356 | + |
| 357 | + except Exception as e: |
| 358 | + logger.error(f"[FileContentParser] Error in parse_fine: {e}") |
| 359 | + parsed_text = f"[File parsing error: {e!s}]" |
| 360 | + |
| 361 | + finally: |
| 362 | + # Clean up temporary file |
| 363 | + if temp_file_path and os.path.exists(temp_file_path): |
| 364 | + try: |
| 365 | + os.unlink(temp_file_path) |
| 366 | + logger.debug(f"[FileContentParser] Cleaned up temporary file: {temp_file_path}") |
| 367 | + except Exception as e: |
| 368 | + logger.warning( |
| 369 | + f"[FileContentParser] Failed to delete temp file {temp_file_path}: {e}" |
| 370 | + ) |
| 371 | + |
| 372 | + # Create source |
| 373 | + source = self.create_source(message, info) |
| 374 | + |
| 375 | + # Extract info fields |
| 376 | + info_ = info.copy() |
| 377 | + user_id = info_.pop("user_id", "") |
| 378 | + session_id = info_.pop("session_id", "") |
| 379 | + |
| 380 | + # For file content parts, default to LongTermMemory |
| 381 | + memory_type = "LongTermMemory" |
| 382 | + |
| 383 | + # Create memory item with parsed content |
| 384 | + memory_item = TextualMemoryItem( |
| 385 | + memory=parsed_text, |
| 386 | + metadata=TreeNodeTextualMemoryMetadata( |
| 387 | + user_id=user_id, |
| 388 | + session_id=session_id, |
| 389 | + memory_type=memory_type, |
| 390 | + status="activated", |
| 391 | + tags=["mode:fine", "multimodal:file"], |
| 392 | + key=_derive_key(parsed_text), |
| 393 | + embedding=self.embedder.embed([parsed_text])[0], |
| 394 | + usage=[], |
| 395 | + sources=[source], |
| 396 | + background="", |
| 397 | + confidence=0.99, |
| 398 | + type="fact", |
| 399 | + info=info_, |
| 400 | + ), |
| 401 | + ) |
| 402 | + |
| 403 | + return [memory_item] |
0 commit comments