Skip to content

Commit 85684f8

Browse files
phernandez and claude committed
fix: handle UTF-8 BOM in frontmatter parsing
Fixes #452 - Imported conversations not fully indexed

Files with a UTF-8 BOM (Byte Order Mark) at the start would fail frontmatter detection, causing:
- Title to fall back to the filename instead of the frontmatter value
- Permalink to be null in the database

Added a strip_bom() helper function and updated all frontmatter-related functions to strip the BOM before processing:
- has_frontmatter()
- parse_frontmatter()
- remove_frontmatter()
- EntityParser.parse_markdown_content()

Added comprehensive tests for BOM handling with various scenarios.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <[email protected]>
Signed-off-by: phernandez <[email protected]>
1 parent 14ce5a3 commit 85684f8

File tree

4 files changed

+169
-2
lines changed

4 files changed

+169
-2
lines changed

src/basic_memory/file_utils.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,28 @@ async def compute_checksum(content: Union[str, bytes]) -> str:
6969
raise FileError(f"Failed to compute checksum: {e}")
7070

7171

72+
# Byte order mark (U+FEFF) as it appears once a UTF-8 file has been decoded to str
UTF8_BOM = '\ufeff'


def strip_bom(content: str) -> str:
    """Return ``content`` without a leading byte order mark.

    Files created on Windows or copied from certain sources can begin with a
    BOM character. It has to be dropped before frontmatter is processed,
    otherwise the opening marker is not at the start of the text.
    See issue #452.

    Args:
        content: Text that may begin with a BOM.

    Returns:
        ``content`` minus its first character when that character is the BOM;
        otherwise ``content`` unchanged (including empty input).
    """
    has_leading_bom = bool(content) and content.startswith(UTF8_BOM)
    # Only the first character is removed; a BOM elsewhere is left in place.
    return content[1:] if has_leading_bom else content
92+
93+
7294
async def write_file_atomic(path: FilePath, content: str) -> None:
7395
"""
7496
Write file with atomic operation using temporary file.
@@ -113,7 +135,8 @@ def has_frontmatter(content: str) -> bool:
113135
if not content:
114136
return False
115137

116-
content = content.strip()
138+
# Strip BOM before checking for frontmatter markers
139+
content = strip_bom(content).strip()
117140
if not content.startswith("---"):
118141
return False
119142

@@ -134,6 +157,8 @@ def parse_frontmatter(content: str) -> Dict[str, Any]:
134157
ParseError: If frontmatter is invalid or parsing fails
135158
"""
136159
try:
160+
# Strip BOM before parsing frontmatter
161+
content = strip_bom(content)
137162
if not content.strip().startswith("---"):
138163
raise ParseError("Content has no frontmatter")
139164

@@ -175,7 +200,8 @@ def remove_frontmatter(content: str) -> str:
175200
Raises:
176201
ParseError: If content starts with frontmatter marker but is malformed
177202
"""
178-
content = content.strip()
203+
# Strip BOM before processing
204+
content = strip_bom(content).strip()
179205

180206
# Return as-is if no frontmatter marker
181207
if not content.startswith("---"):

src/basic_memory/markdown/entity_parser.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,11 @@ async def parse_markdown_content(
227227
Returns:
228228
EntityMarkdown with parsed content
229229
"""
230+
# Strip BOM before parsing (can be present in files from Windows or certain sources)
231+
# See issue #452
232+
from basic_memory.file_utils import strip_bom
233+
content = strip_bom(content)
234+
230235
# Parse frontmatter with proper error handling for malformed YAML
231236
try:
232237
post = frontmatter.loads(content)
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
"""Test that imported conversations are properly indexed with correct permalink and title.
2+
3+
This test verifies issue #452 - Imported conversations not indexed correctly.
4+
"""
5+
6+
import pytest
7+
from pathlib import Path
8+
9+
from basic_memory.config import ProjectConfig
10+
from basic_memory.importers.claude_conversations_importer import ClaudeConversationsImporter
11+
from basic_memory.markdown import EntityParser
12+
from basic_memory.markdown.markdown_processor import MarkdownProcessor
13+
from basic_memory.repository import EntityRepository
14+
from basic_memory.services import EntityService
15+
from basic_memory.services.search_service import SearchService
16+
from basic_memory.schemas.search import SearchQuery
17+
from basic_memory.sync.sync_service import SyncService
18+
19+
20+
@pytest.mark.asyncio
async def test_imported_conversations_have_correct_permalink_and_title(
    project_config: ProjectConfig,
    sync_service: SyncService,
    entity_service: EntityService,
    entity_repository: EntityRepository,
    search_service: SearchService,
):
    """Test that imported conversations have correct permalink and title after sync.

    Issue #452: Imported conversations show permalink: null in search results
    and title shows as filename instead of frontmatter title.

    The test imports one two-message conversation, checks the markdown file
    written by the importer, syncs the project, and then verifies that both
    the entity row and the search index carry the title and permalink from
    the frontmatter.

    NOTE(review): ``entity_service`` is requested but never referenced in the
    body; presumably it is kept for fixture setup - confirm before removing.
    """
    base_path = project_config.home

    # Create parser and processor for importer
    parser = EntityParser(base_path)
    processor = MarkdownProcessor(parser)

    # Create importer
    importer = ClaudeConversationsImporter(base_path, processor)

    # Sample conversation data
    conversations = [{
        'uuid': 'test-123',
        'name': 'My Test Conversation Title',
        'created_at': '2025-01-15T10:00:00Z',
        'updated_at': '2025-01-15T11:00:00Z',
        'chat_messages': [
            {
                'uuid': 'msg-1',
                'sender': 'human',
                'created_at': '2025-01-15T10:00:00Z',
                'text': 'Hello world',
                'content': [{'type': 'text', 'text': 'Hello world'}],
                'attachments': []
            },
            {
                'uuid': 'msg-2',
                'sender': 'assistant',
                'created_at': '2025-01-15T10:01:00Z',
                'text': 'Hello!',
                'content': [{'type': 'text', 'text': 'Hello!'}],
                'attachments': []
            }
        ]
    }]

    # Run import
    result = await importer.import_data(conversations, 'conversations')
    assert result.success, f"Import failed: {result}"
    assert result.conversations == 1

    # Verify the file was created with correct content
    conv_path = base_path / 'conversations' / '20250115-My_Test_Conversation_Title.md'
    assert conv_path.exists(), f"Expected file at {conv_path}"

    # Decode explicitly as UTF-8: without an encoding argument read_text() uses
    # the platform locale encoding, which would make these content checks
    # platform-dependent.
    content = conv_path.read_text(encoding='utf-8')
    assert '---' in content, "File should have frontmatter markers"
    assert 'title: My Test Conversation Title' in content, "File should have title in frontmatter"
    assert 'permalink: conversations/20250115-My_Test_Conversation_Title' in content, (
        "File should have permalink in frontmatter"
    )

    # Run sync to index the imported file
    await sync_service.sync(base_path, project_config.name)

    # Verify entity in database
    entities = await entity_repository.find_all()
    assert len(entities) == 1, f"Expected 1 entity, got {len(entities)}"

    entity = entities[0]

    # These are the key assertions for issue #452
    assert entity.title == 'My Test Conversation Title', (
        f"Title should be from frontmatter, got: {entity.title}"
    )
    assert entity.permalink == 'conversations/20250115-My_Test_Conversation_Title', (
        f"Permalink should be from frontmatter, got: {entity.permalink}"
    )

    # Verify search index also has correct data
    results = await search_service.search(SearchQuery(text='Test Conversation'))
    assert len(results) >= 1, "Should find the conversation in search"

    # Find our entity in search results
    search_result = next((r for r in results if r.entity_id == entity.id), None)
    assert search_result is not None, "Entity should be in search results"
    assert search_result.title == 'My Test Conversation Title', (
        f"Search title should be from frontmatter, got: {search_result.title}"
    )
    assert search_result.permalink == 'conversations/20250115-My_Test_Conversation_Title', (
        f"Search permalink should not be null, got: {search_result.permalink}"
    )

tests/utils/test_file_utils.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,3 +214,36 @@ def test_sanitize_for_filename_removes_invalid_characters():
214214
)
215215
def test_sanitize_for_folder_edge_cases(input_folder, expected):
216216
assert sanitize_for_folder(input_folder) == expected
217+
218+
219+
class TestBOMHandling:
    """Frontmatter helpers must tolerate a leading Byte Order Mark (BOM).

    Files created on Windows or copied from certain sources can start with a
    BOM character. Its presence must not break frontmatter detection or
    parsing. See issue #452.
    """

    def test_has_frontmatter_with_bom(self):
        """A BOM ahead of the opening marker does not hide the frontmatter."""
        # U+FEFF directly before the opening '---'
        text = '\ufeff---\ntitle: Test\n---\nContent'
        detected = has_frontmatter(text)
        assert detected, "Should detect frontmatter even with BOM"

    def test_has_frontmatter_with_bom_and_windows_crlf(self):
        """Detection also works when the BOM is combined with CRLF line endings."""
        text = '\ufeff---\r\ntitle: Test\r\n---\r\nContent'
        detected = has_frontmatter(text)
        assert detected, "Should detect frontmatter with BOM and CRLF"

    def test_parse_frontmatter_with_bom(self):
        """parse_frontmatter returns the frontmatter fields despite the BOM."""
        text = '\ufeff---\ntitle: Test Title\ntype: note\n---\nContent'
        parsed = parse_frontmatter(text)
        assert parsed['title'] == 'Test Title'
        assert parsed['type'] == 'note'

    def test_remove_frontmatter_with_bom(self):
        """remove_frontmatter returns only the body when a BOM is present."""
        text = '\ufeff---\ntitle: Test\n---\nContent here'
        body = remove_frontmatter(text)
        assert body == 'Content here'

0 commit comments

Comments
 (0)