Skip to content

Commit a7d7cc5

Browse files
phernandez and claude authored
fix: Normalize YAML frontmatter types to prevent AttributeError (#236) (#402)
Signed-off-by: phernandez <[email protected]> Co-authored-by: Claude <[email protected]>
1 parent a7e696b commit a7d7cc5

File tree

2 files changed

+321
-5
lines changed

2 files changed

+321
-5
lines changed

src/basic_memory/markdown/entity_parser.py

Lines changed: 86 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"""
55

66
from dataclasses import dataclass, field
7-
from datetime import datetime
7+
from datetime import date, datetime
88
from pathlib import Path
99
from typing import Any, Optional
1010

@@ -26,6 +26,82 @@
2626
md = MarkdownIt().use(observation_plugin).use(relation_plugin)
2727

2828

29+
def normalize_frontmatter_value(value: Any) -> Any:
    """Normalize a frontmatter value to types that are safe for string handling.

    PyYAML automatically converts various string-like values into native
    Python types:
    - Date strings ("2025-10-24") → datetime.date objects
    - Timestamps ("2025-10-24 14:30:00") → datetime.datetime objects
    - Numbers ("1.0") → int or float
    - Booleans ("true") → bool
    - Sequences/mappings → list / dict (and, for tags such as ``!!omap``,
      ``!!pairs`` or ``!!set``, tuples and sets)

    This can cause AttributeError when code expects strings and calls string
    methods like .strip() on these values (see GitHub issue #236).

    Normalization rules:
    - Dates/datetimes → ISO format strings
    - Booleans → strings ("True"/"False")
    - Numbers (int/float) → strings
    - Lists, tuples and sets → same container kind, items recursively normalized
      (tuple subclasses come back as plain tuples)
    - Dicts → preserved as dicts, values recursively normalized (keys untouched)
    - Strings, None and any other type → returned unchanged

    Args:
        value: The frontmatter value to normalize

    Returns:
        The normalized value safe for string operations

    Example:
        >>> from datetime import date
        >>> normalize_frontmatter_value(date(2025, 10, 24))
        '2025-10-24'
        >>> normalize_frontmatter_value([date(2025, 10, 24), "tag", 123])
        ['2025-10-24', 'tag', '123']
        >>> normalize_frontmatter_value(True)
        'True'
    """
    # bool must be tested before the numeric check: bool is a subclass of int.
    if isinstance(value, bool):
        return str(value)

    # datetime is a subclass of date, so one check covers both; each provides
    # its own isoformat() ("2025-10-24" vs "2025-10-24T14:30:00").
    if isinstance(value, date):
        return value.isoformat()

    if isinstance(value, (int, float)):
        return str(value)

    # Containers keep their kind; only their contents are normalized.
    if isinstance(value, list):
        return [normalize_frontmatter_value(item) for item in value]

    if isinstance(value, tuple):
        return tuple(normalize_frontmatter_value(item) for item in value)

    if isinstance(value, set):
        # Set members are hashable, and every normalized result of a hashable
        # input (str, None, tuple) stays hashable.
        return {normalize_frontmatter_value(item) for item in value}

    if isinstance(value, dict):
        return {key: normalize_frontmatter_value(val) for key, val in value.items()}

    # Strings, None and unrecognized types pass through untouched.
    return value
88+
89+
90+
def normalize_frontmatter_metadata(metadata: dict) -> dict:
    """Build a normalized copy of a frontmatter metadata mapping.

    Each top-level value is passed through ``normalize_frontmatter_value``
    so that non-string YAML scalars (for example dates) no longer trigger
    AttributeError in code that expects strings (GitHub issue #236).
    Keys are carried over unchanged and the input dict is not modified.

    Args:
        metadata: The frontmatter metadata dictionary

    Returns:
        A new dictionary with all values normalized
    """
    normalized = {}
    for name, raw_value in metadata.items():
        normalized[name] = normalize_frontmatter_value(raw_value)
    return normalized
103+
104+
29105
@dataclass
30106
class EntityContent:
31107
content: str
@@ -127,20 +203,25 @@ async def parse_file_content(self, absolute_path, file_content):
127203

128204
# Extract file stat info
129205
file_stats = absolute_path.stat()
130-
metadata = post.metadata
206+
207+
# Normalize frontmatter values to prevent AttributeError on date objects (issue #236)
208+
# PyYAML automatically converts date strings like "2025-10-24" to datetime.date objects
209+
# This normalization converts them back to ISO format strings to ensure compatibility
210+
# with code that expects string values
211+
metadata = normalize_frontmatter_metadata(post.metadata)
131212

132213
# Ensure required fields have defaults (issue #184, #387)
133214
# Handle title - use default if missing, None/null, empty, or string "None"
134-
title = post.metadata.get("title")
215+
title = metadata.get("title")
135216
if not title or title == "None":
136217
metadata["title"] = absolute_path.stem
137218
else:
138219
metadata["title"] = title
139220
# Handle type - use default if missing OR explicitly set to None/null
140-
entity_type = post.metadata.get("type")
221+
entity_type = metadata.get("type")
141222
metadata["type"] = entity_type if entity_type is not None else "note"
142223

143-
tags = parse_tags(post.metadata.get("tags", [])) # pyright: ignore
224+
tags = parse_tags(metadata.get("tags", [])) # pyright: ignore
144225
if tags:
145226
metadata["tags"] = tags
146227

Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
"""Test that YAML date parsing doesn't break frontmatter processing.
2+
3+
This test reproduces GitHub issue #236 from basic-memory-cloud where date fields
4+
in YAML frontmatter are automatically parsed as datetime.date objects by PyYAML,
5+
but later code expects strings and calls .strip() on them, causing AttributeError.
6+
"""
7+
8+
import pytest
9+
from pathlib import Path
10+
from basic_memory.markdown.entity_parser import EntityParser
11+
12+
13+
@pytest.fixture
def test_file_with_date(tmp_path):
    """Write a markdown note whose frontmatter has unquoted date scalars.

    Returns the path of the created file inside ``tmp_path``.
    """
    note_path = tmp_path / "test_note.md"
    markdown_text = """---
title: Test Note
date: 2025-10-24
created: 2025-10-24
tags:
  - python
  - testing
---

# Test Content

This file has date fields in frontmatter that PyYAML will parse as datetime.date objects.
"""
    note_path.write_text(markdown_text)
    return note_path
32+
33+
34+
@pytest.fixture
def test_file_with_date_in_tags(tmp_path):
    """Write a markdown note whose ``tags`` field is a single unquoted date.

    Returns the path of the created file inside ``tmp_path``.
    """
    note_path = tmp_path / "test_note_date_tags.md"
    markdown_text = """---
title: Test Note with Date Tags
tags: 2025-10-24
---

# Test Content

This file has a date value as tags, which will be parsed as datetime.date.
"""
    note_path.write_text(markdown_text)
    return note_path
49+
50+
51+
@pytest.fixture
def test_file_with_dates_in_tag_list(tmp_path):
    """Write a markdown note whose ``tags`` list mixes plain tags with a date.

    Returns the path of the created file inside ``tmp_path``.
    """
    note_path = tmp_path / "test_note_dates_in_list.md"
    markdown_text = """---
title: Test Note with Dates in Tags List
tags:
  - valid-tag
  - 2025-10-24
  - another-tag
---

# Test Content

This file has date values mixed into tags list.
"""
    note_path.write_text(markdown_text)
    return note_path
69+
70+
71+
@pytest.mark.asyncio
async def test_parse_file_with_date_fields(test_file_with_date, tmp_path):
    """Date scalars in frontmatter parse cleanly and come back as ISO strings."""
    # Parsing itself is the regression check: it must not raise AttributeError.
    parsed = await EntityParser(tmp_path).parse_file(test_file_with_date)
    frontmatter = parsed.frontmatter

    assert frontmatter.title == "Test Note"

    # "date" must be present and normalized to its ISO string form.
    date_value = frontmatter.metadata.get("date")
    assert date_value is not None
    assert isinstance(date_value, str), "Date should be converted to string"
    assert date_value == "2025-10-24", "Date should be in ISO format"

    # "created" gets the same treatment.
    created_value = frontmatter.metadata.get("created")
    assert created_value is not None
    assert isinstance(created_value, str), "Created date should be converted to string"
    assert created_value == "2025-10-24", "Created date should be in ISO format"
92+
93+
94+
@pytest.mark.asyncio
async def test_parse_file_with_date_as_tags(test_file_with_date_in_tags, tmp_path):
    """A bare date used as the whole ``tags`` value ends up as a string tag."""
    # Parsing must complete without AttributeError.
    parsed = await EntityParser(tmp_path).parse_file(test_file_with_date_in_tags)
    assert parsed.frontmatter.title == "Test Note with Date Tags"

    parsed_tags = parsed.frontmatter.tags
    assert parsed_tags is not None
    assert isinstance(parsed_tags, list)
    # The date is expected in its ISO string form among the tags.
    assert "2025-10-24" in parsed_tags
109+
110+
111+
@pytest.mark.asyncio
async def test_parse_file_with_dates_in_tag_list(test_file_with_dates_in_tag_list, tmp_path):
    """A date mixed into a ``tags`` list is kept as an ISO string tag."""
    # Parsing must complete without AttributeError.
    parsed = await EntityParser(tmp_path).parse_file(test_file_with_dates_in_tag_list)
    assert parsed.frontmatter.title == "Test Note with Dates in Tags List"

    parsed_tags = parsed.frontmatter.tags
    assert parsed_tags is not None
    assert isinstance(parsed_tags, list)

    # Two ordinary tags plus the date rendered as an ISO string.
    assert len(parsed_tags) == 3
    assert "valid-tag" in parsed_tags
    assert "another-tag" in parsed_tags
    assert "2025-10-24" in parsed_tags
131+
132+
133+
@pytest.mark.asyncio
async def test_parse_file_with_various_yaml_types(tmp_path):
    """Mixed YAML scalar and container types are normalized during parsing.

    Covers the broader form of GitHub #236: dates, numbers, booleans, lists
    and nested mappings in frontmatter must all be usable by code that
    expects strings.
    """
    note_path = tmp_path / "test_yaml_types.md"
    markdown_text = """---
title: Test YAML Types
date: 2025-10-24
priority: 1
completed: true
tags:
  - python
  - testing
metadata:
  author: Test User
  version: 1.0
---

# Test Content

This file has various YAML types that need to be normalized.
"""
    note_path.write_text(markdown_text)

    parsed = await EntityParser(tmp_path).parse_file(note_path)
    frontmatter = parsed.frontmatter

    # Reaching this point already shows parsing raised no AttributeError.
    assert frontmatter.title == "Test YAML Types"

    # Date → ISO string.
    date_value = frontmatter.metadata.get("date")
    assert isinstance(date_value, str)
    assert date_value == "2025-10-24"

    # Integer → string.
    priority_value = frontmatter.metadata.get("priority")
    assert isinstance(priority_value, str)
    assert priority_value == "1"

    # Boolean → string; str(True) is "True" regardless of the YAML spelling.
    completed_value = frontmatter.metadata.get("completed")
    assert isinstance(completed_value, str)
    assert completed_value == "True"

    # Lists stay lists and every item is a string.
    parsed_tags = frontmatter.tags
    assert isinstance(parsed_tags, list)
    assert all(isinstance(tag, str) for tag in parsed_tags)
    assert "python" in parsed_tags
    assert "testing" in parsed_tags

    # Nested mappings stay dicts with string values.
    nested = frontmatter.metadata.get("metadata")
    assert isinstance(nested, dict)
    assert isinstance(nested.get("author"), str)
    assert nested.get("author") == "Test User"
    assert isinstance(nested.get("version"), str)
    assert nested.get("version") in ("1.0", "1")
196+
197+
198+
@pytest.mark.asyncio
async def test_parse_file_with_datetime_objects(tmp_path):
    """Timestamps with a time component are normalized to strings as well.

    Complements the date-only tests: both the space-separated and the
    ``T``-separated timestamp spellings are exercised.
    """
    note_path = tmp_path / "test_datetime.md"

    # Both spellings (YYYY-MM-DD HH:MM:SS and YYYY-MM-DDTHH:MM:SS) are
    # expected to be loaded by PyYAML as datetime objects.
    markdown_text = """---
title: Test Datetime
created_at: 2025-10-24 14:30:00
updated_at: 2025-10-24T00:00:00
---

# Test Content

This file has datetime values in frontmatter that PyYAML will parse as datetime objects.
"""
    note_path.write_text(markdown_text)

    parsed = await EntityParser(tmp_path).parse_file(note_path)
    normalized = parsed.frontmatter.metadata

    created_at = normalized.get("created_at")
    assert isinstance(created_at, str), "Datetime should be converted to string"
    # Only the date and time substrings are checked, not the exact separator.
    assert "2025-10-24" in created_at and "14:30:00" in created_at, (
        f"Datetime with time should be normalized to ISO format, got: {created_at}"
    )

    updated_at = normalized.get("updated_at")
    assert isinstance(updated_at, str), "Datetime should be converted to string"
    # A midnight timestamp must still carry its time component.
    assert "2025-10-24" in updated_at and "00:00:00" in updated_at, (
        f"Datetime at midnight should be normalized to ISO format, got: {updated_at}"
    )

0 commit comments

Comments
 (0)