fix: handle UTF-8 BOM in frontmatter parsing

phernandez · claude · phernandez · commit 85684f848f45 · 2025-12-24T14:28:50.000-06:00
Fixes #452 - Imported conversations not fully indexed Files with UTF-8 BOM (Byte Order Mark) at the start would fail frontmatter detection, causing: - Title to fall back to filename instead of frontmatter value - Permalink to be null in the database Added strip_bom() helper function and updated all frontmatter-related functions to strip BOM before processing: - has_frontmatter() - parse_frontmatter() - remove_frontmatter() - EntityParser.parse_markdown_content() Added comprehensive tests for BOM handling with various scenarios. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: phernandez <paul@basicmachines.co>
diff --git a/src/basic_memory/file_utils.py b/src/basic_memory/file_utils.py
@@ -69,6 +69,28 @@ async def compute_checksum(content: Union[str, bytes]) -> str:
         raise FileError(f"Failed to compute checksum: {e}")
 
 
+# UTF-8 BOM character that can appear at the start of files
+UTF8_BOM = '\ufeff'
+
+
+def strip_bom(content: str) -> str:
+    """Strip UTF-8 BOM from the start of content if present.
+
+    BOM (Byte Order Mark) characters can be present in files created on Windows
+    or copied from certain sources. They should be stripped before processing
+    frontmatter. See issue #452.
+
+    Args:
+        content: Content that may start with BOM
+
+    Returns:
+        Content with BOM removed if present
+    """
+    if content and content.startswith(UTF8_BOM):
+        return content[1:]
+    return content
+
+
 async def write_file_atomic(path: FilePath, content: str) -> None:
     """
     Write file with atomic operation using temporary file.
@@ -113,7 +135,8 @@ def has_frontmatter(content: str) -> bool:
     if not content:
         return False
 
-    content = content.strip()
+    # Strip BOM before checking for frontmatter markers
+    content = strip_bom(content).strip()
     if not content.startswith("---"):
         return False
 
@@ -134,6 +157,8 @@ def parse_frontmatter(content: str) -> Dict[str, Any]:
         ParseError: If frontmatter is invalid or parsing fails
     """
     try:
+        # Strip BOM before parsing frontmatter
+        content = strip_bom(content)
         if not content.strip().startswith("---"):
             raise ParseError("Content has no frontmatter")
 
@@ -175,7 +200,8 @@ def remove_frontmatter(content: str) -> str:
     Raises:
         ParseError: If content starts with frontmatter marker but is malformed
     """
-    content = content.strip()
+    # Strip BOM before processing
+    content = strip_bom(content).strip()
 
     # Return as-is if no frontmatter marker
     if not content.startswith("---"):
diff --git a/src/basic_memory/markdown/entity_parser.py b/src/basic_memory/markdown/entity_parser.py
@@ -227,6 +227,11 @@ async def parse_markdown_content(
         Returns:
             EntityMarkdown with parsed content
         """
+        # Strip BOM before parsing (can be present in files from Windows or certain sources)
+        # See issue #452
+        from basic_memory.file_utils import strip_bom
+        content = strip_bom(content)
+
         # Parse frontmatter with proper error handling for malformed YAML
         try:
             post = frontmatter.loads(content)
diff --git a/tests/importers/test_conversation_indexing.py b/tests/importers/test_conversation_indexing.py
@@ -0,0 +1,103 @@
+"""Test that imported conversations are properly indexed with correct permalink and title.
+
+This test verifies issue #452 - Imported conversations not indexed correctly.
+"""
+
+import pytest
+from pathlib import Path
+
+from basic_memory.config import ProjectConfig
+from basic_memory.importers.claude_conversations_importer import ClaudeConversationsImporter
+from basic_memory.markdown import EntityParser
+from basic_memory.markdown.markdown_processor import MarkdownProcessor
+from basic_memory.repository import EntityRepository
+from basic_memory.services import EntityService
+from basic_memory.services.search_service import SearchService
+from basic_memory.schemas.search import SearchQuery
+from basic_memory.sync.sync_service import SyncService
+
+
+@pytest.mark.asyncio
+async def test_imported_conversations_have_correct_permalink_and_title(
+    project_config: ProjectConfig,
+    sync_service: SyncService,
+    entity_service: EntityService,
+    entity_repository: EntityRepository,
+    search_service: SearchService,
+):
+    """Test that imported conversations have correct permalink and title after sync.
+
+    Issue #452: Imported conversations show permalink: null in search results
+    and title shows as filename instead of frontmatter title.
+    """
+    base_path = project_config.home
+
+    # Create parser and processor for importer
+    parser = EntityParser(base_path)
+    processor = MarkdownProcessor(parser)
+
+    # Create importer
+    importer = ClaudeConversationsImporter(base_path, processor)
+
+    # Sample conversation data
+    conversations = [{
+        'uuid': 'test-123',
+        'name': 'My Test Conversation Title',
+        'created_at': '2025-01-15T10:00:00Z',
+        'updated_at': '2025-01-15T11:00:00Z',
+        'chat_messages': [
+            {
+                'uuid': 'msg-1',
+                'sender': 'human',
+                'created_at': '2025-01-15T10:00:00Z',
+                'text': 'Hello world',
+                'content': [{'type': 'text', 'text': 'Hello world'}],
+                'attachments': []
+            },
+            {
+                'uuid': 'msg-2',
+                'sender': 'assistant',
+                'created_at': '2025-01-15T10:01:00Z',
+                'text': 'Hello!',
+                'content': [{'type': 'text', 'text': 'Hello!'}],
+                'attachments': []
+            }
+        ]
+    }]
+
+    # Run import
+    result = await importer.import_data(conversations, 'conversations')
+    assert result.success, f"Import failed: {result}"
+    assert result.conversations == 1
+
+    # Verify the file was created with correct content
+    conv_path = base_path / 'conversations' / '20250115-My_Test_Conversation_Title.md'
+    assert conv_path.exists(), f"Expected file at {conv_path}"
+
+    content = conv_path.read_text()
+    assert '---' in content, "File should have frontmatter markers"
+    assert 'title: My Test Conversation Title' in content, "File should have title in frontmatter"
+    assert 'permalink: conversations/20250115-My_Test_Conversation_Title' in content, "File should have permalink in frontmatter"
+
+    # Run sync to index the imported file
+    await sync_service.sync(base_path, project_config.name)
+
+    # Verify entity in database
+    entities = await entity_repository.find_all()
+    assert len(entities) == 1, f"Expected 1 entity, got {len(entities)}"
+
+    entity = entities[0]
+
+    # These are the key assertions for issue #452
+    assert entity.title == 'My Test Conversation Title', f"Title should be from frontmatter, got: {entity.title}"
+    assert entity.permalink == 'conversations/20250115-My_Test_Conversation_Title', f"Permalink should be from frontmatter, got: {entity.permalink}"
+
+    # Verify search index also has correct data
+    results = await search_service.search(SearchQuery(text='Test Conversation'))
+    assert len(results) >= 1, "Should find the conversation in search"
+
+    # Find our entity in search results
+    search_result = next((r for r in results if r.entity_id == entity.id), None)
+    assert search_result is not None, "Entity should be in search results"
+    assert search_result.title == 'My Test Conversation Title', f"Search title should be from frontmatter, got: {search_result.title}"
+    assert search_result.permalink == 'conversations/20250115-My_Test_Conversation_Title', f"Search permalink should not be null, got: {search_result.permalink}"
diff --git a/tests/utils/test_file_utils.py b/tests/utils/test_file_utils.py
@@ -214,3 +214,36 @@ def test_sanitize_for_filename_removes_invalid_characters():
 )
 def test_sanitize_for_folder_edge_cases(input_folder, expected):
     assert sanitize_for_folder(input_folder) == expected
+
+
+class TestBOMHandling:
+    """Test handling of Byte Order Mark (BOM) in frontmatter.
+
+    BOM characters can be present in files created on Windows or copied
+    from certain sources. They should not break frontmatter detection
+    or parsing. See issue #452.
+    """
+
+    def test_has_frontmatter_with_bom(self):
+        """Test that has_frontmatter handles BOM correctly."""
+        # Content with UTF-8 BOM
+        content_with_bom = '\ufeff---\ntitle: Test\n---\nContent'
+        assert has_frontmatter(content_with_bom), "Should detect frontmatter even with BOM"
+
+    def test_has_frontmatter_with_bom_and_windows_crlf(self):
+        """Test BOM with Windows line endings."""
+        content = '\ufeff---\r\ntitle: Test\r\n---\r\nContent'
+        assert has_frontmatter(content), "Should detect frontmatter with BOM and CRLF"
+
+    def test_parse_frontmatter_with_bom(self):
+        """Test that parse_frontmatter handles BOM correctly."""
+        content_with_bom = '\ufeff---\ntitle: Test Title\ntype: note\n---\nContent'
+        result = parse_frontmatter(content_with_bom)
+        assert result['title'] == 'Test Title'
+        assert result['type'] == 'note'
+
+    def test_remove_frontmatter_with_bom(self):
+        """Test that remove_frontmatter handles BOM correctly."""
+        content_with_bom = '\ufeff---\ntitle: Test\n---\nContent here'
+        result = remove_frontmatter(content_with_bom)
+        assert result == 'Content here'