Skip to content

Commit 74e12eb

Browse files
authored
fix: Sanitize filenames and allow optional kebab case (#260)
Signed-off-by: Brandon Mayes <[email protected]>
1 parent 7a8b08d commit 74e12eb

File tree

7 files changed

+202
-48
lines changed

7 files changed

+202
-48
lines changed

src/basic_memory/config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,11 @@ class BasicMemoryConfig(BaseSettings):
7474
description="Whether to sync changes in real time. default (True)",
7575
)
7676

77+
kebab_filenames: bool = Field(
78+
default=False,
79+
description="Format for generated filenames. False preserves spaces and special chars, True converts them to hyphens for consistency with permalinks",
80+
)
81+
7782
# API connection configuration
7883
api_url: Optional[str] = Field(
7984
default=None,

src/basic_memory/file_utils.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import hashlib
44
from pathlib import Path
5+
import re
56
from typing import Any, Dict, Union
67

78
import yaml
@@ -233,3 +234,21 @@ async def update_frontmatter(path: FilePath, updates: Dict[str, Any]) -> str:
233234
error=str(e),
234235
)
235236
raise FileError(f"Failed to update frontmatter: {e}")
237+
238+
239+
def sanitize_for_filename(text: str, replacement: str = "-") -> str:
    """Sanitize a string so it is safe to use as a single filename component.

    POSIX and Windows path separators, plus the characters ``< > : " | ? *``,
    are replaced with ``replacement``. Runs of consecutive replacements are
    collapsed into one, and a leading/trailing replacement is removed.

    Args:
        text: The raw text (for example a note title) to sanitize.
        replacement: String substituted for each unsafe character. It may be
            empty (unsafe characters are simply dropped) or longer than one
            character.

    Returns:
        The sanitized text. This is an empty string when ``text`` consisted
        only of unsafe characters.
    """
    # A callable is used as the substitution so that `replacement` is always
    # inserted literally (a plain string would have backslashes interpreted
    # as regex template escapes).
    text = re.sub(r'[/\\<>:"|?*]', lambda _match: replacement, text)

    # Nothing to collapse or trim when unsafe characters are just dropped;
    # an empty pattern followed by "+" would also be an invalid regex.
    if not replacement:
        return text

    # Group the escaped token so "+" repeats the whole replacement rather
    # than only its last character.
    text = re.sub(f"(?:{re.escape(replacement)})+", lambda _match: replacement, text)

    # After collapsing, at most one replacement token can sit at either end.
    # Remove it as a whole token instead of as a set of characters.
    if text.startswith(replacement):
        text = text[len(replacement):]
    if text.endswith(replacement):
        text = text[: -len(replacement)]

    return text

src/basic_memory/schemas/base.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222

2323
from pydantic import BaseModel, BeforeValidator, Field, model_validator
2424

25+
from basic_memory.config import ConfigManager
26+
from basic_memory.file_utils import sanitize_for_filename
2527
from basic_memory.utils import generate_permalink
2628

2729

@@ -190,13 +192,35 @@ class Entity(BaseModel):
190192
default="text/markdown",
191193
)
192194

195+
@property
def safe_title(self) -> str:
    """Return the title in a form that is safe to use as a filename.

    Slashes (POSIX or Windows style) and a few other characters that are not
    safe in filenames are replaced, so a title of
    "Coupon Enable/Disable Feature" is written as
    "Coupon Enable-Disable Feature.md" rather than as "Disable Feature.md"
    inside a "Coupon Enable" directory.

    When the ``kebab_filenames`` setting is enabled, the sanitized title is
    additionally passed through the permalink transformation, e.g.
    "Coupon Enable/Disable Feature" -> "coupon-enable-disable-feature".
    """
    sanitized = sanitize_for_filename(self.title)

    # The setting is read on every access, after sanitizing.
    if not ConfigManager().config.kebab_filenames:
        return sanitized

    # split_extension=False asks generate_permalink to re-append whatever it
    # split off as a file extension.
    # NOTE(review): a title containing a dot presumably has its text after the
    # last "." treated as that extension - confirm this is intended.
    return generate_permalink(file_path=sanitized, split_extension=False)
215+
193216
@property
def file_path(self):
    """Build the relative file path for this entity.

    The path is the filesystem-safe title (``safe_title``), with a ``.md``
    suffix when the content type is "text/markdown", placed under ``folder``
    when a folder is set.
    """
    name = self.safe_title

    # Only markdown entities get the ".md" suffix; other content types keep
    # the sanitized title as-is.
    if self.content_type == "text/markdown":
        name = f"{name}.md"

    if self.folder:
        return f"{self.folder}/{name}"
    return name
200224

201225
@property
202226
def permalink(self) -> Permalink:

src/basic_memory/utils.py

Lines changed: 58 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def __str__(self) -> str: ...
2828
logging.getLogger("opentelemetry.sdk.metrics._internal.instrument").setLevel(logging.ERROR)
2929

3030

31-
def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
31+
def generate_permalink(file_path: Union[Path, str, PathLike], split_extension: bool = True) -> str:
3232
"""Generate a stable permalink from a file path.
3333
3434
Args:
@@ -51,53 +51,59 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
5151
# Convert Path to string if needed
5252
path_str = Path(str(file_path)).as_posix()
5353

54-
# Remove extension
55-
base = os.path.splitext(path_str)[0]
54+
# Remove extension (for now, possibly)
55+
(base, extension) = os.path.splitext(path_str)
5656

5757
# Check if we have CJK characters that should be preserved
58-
# CJK ranges: \u4e00-\u9fff (CJK Unified Ideographs), \u3000-\u303f (CJK symbols),
58+
# CJK ranges: \u4e00-\u9fff (CJK Unified Ideographs), \u3000-\u303f (CJK symbols),
5959
# \u3400-\u4dbf (CJK Extension A), \uff00-\uffef (Fullwidth forms)
6060
has_cjk_chars = any(
61-
'\u4e00' <= char <= '\u9fff' or
62-
'\u3000' <= char <= '\u303f' or
63-
'\u3400' <= char <= '\u4dbf' or
64-
'\uff00' <= char <= '\uffef'
61+
"\u4e00" <= char <= "\u9fff"
62+
or "\u3000" <= char <= "\u303f"
63+
or "\u3400" <= char <= "\u4dbf"
64+
or "\uff00" <= char <= "\uffef"
6565
for char in base
6666
)
67-
67+
6868
if has_cjk_chars:
6969
# For text with CJK characters, selectively transliterate only Latin accented chars
7070
result = ""
7171
for char in base:
72-
if ('\u4e00' <= char <= '\u9fff' or
73-
'\u3000' <= char <= '\u303f' or
74-
'\u3400' <= char <= '\u4dbf'):
72+
if (
73+
"\u4e00" <= char <= "\u9fff"
74+
or "\u3000" <= char <= "\u303f"
75+
or "\u3400" <= char <= "\u4dbf"
76+
):
7577
# Preserve CJK ideographs and symbols
7678
result += char
77-
elif ('\uff00' <= char <= '\uffef'):
79+
elif "\uff00" <= char <= "\uffef":
7880
# Remove Chinese fullwidth punctuation entirely (like ,!?)
7981
continue
8082
else:
8183
# Transliterate Latin accented characters to ASCII
8284
result += unidecode(char)
83-
85+
8486
# Insert hyphens between CJK and Latin character transitions
8587
# Match: CJK followed by Latin letter/digit, or Latin letter/digit followed by CJK
86-
result = re.sub(r'([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])([a-zA-Z0-9])', r'\1-\2', result)
87-
result = re.sub(r'([a-zA-Z0-9])([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])', r'\1-\2', result)
88-
88+
result = re.sub(
89+
r"([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])([a-zA-Z0-9])", r"\1-\2", result
90+
)
91+
result = re.sub(
92+
r"([a-zA-Z0-9])([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])", r"\1-\2", result
93+
)
94+
8995
# Insert dash between camelCase
9096
result = re.sub(r"([a-z0-9])([A-Z])", r"\1-\2", result)
91-
97+
9298
# Convert ASCII letters to lowercase, preserve CJK
9399
lower_text = "".join(c.lower() if c.isascii() and c.isalpha() else c for c in result)
94-
100+
95101
# Replace underscores with hyphens
96102
text_with_hyphens = lower_text.replace("_", "-")
97-
103+
98104
# Remove apostrophes entirely (don't replace with hyphens)
99105
text_no_apostrophes = text_with_hyphens.replace("'", "")
100-
106+
101107
# Replace unsafe chars with hyphens, but preserve CJK characters
102108
clean_text = re.sub(
103109
r"[^a-z0-9\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf/\-]", "-", text_no_apostrophes
@@ -129,7 +135,13 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
129135
segments = clean_text.split("/")
130136
clean_segments = [s.strip("-") for s in segments]
131137

132-
return "/".join(clean_segments)
138+
return_val = "/".join(clean_segments)
139+
140+
# Append file extension back, if necessary
141+
if not split_extension and extension:
142+
return_val += extension
143+
144+
return return_val
133145

134146

135147
def setup_logging(
@@ -229,79 +241,79 @@ def normalize_newlines(multiline: str) -> str:
229241
Returns:
230242
A string with normalized newlines native to the platform.
231243
"""
232-
return re.sub(r'\r\n?|\n', os.linesep, multiline)
244+
return re.sub(r"\r\n?|\n", os.linesep, multiline)
233245

234246

235247
def normalize_file_path_for_comparison(file_path: str) -> str:
    """Return a canonical form of a file path for conflict detection.

    Two paths that differ only in letter case, Unicode composition, separator
    style, or repeated slashes normalize to the same string.

    Args:
        file_path: The file path to normalize

    Returns:
        The lowercased, NFD-normalized path using single forward slashes
    """
    import unicodedata

    # Lowercase first, then decompose, so the comparison is case-insensitive
    # and independent of composed vs. decomposed Unicode forms.
    folded = unicodedata.normalize("NFD", file_path.lower())

    # Unify Windows separators with POSIX ones, then squeeze repeated slashes.
    return re.sub(r"/+", "/", folded.replace("\\", "/"))
264276

265277

266278
def detect_potential_file_conflicts(file_path: str, existing_paths: List[str]) -> List[str]:
    """Find existing paths that could collide with ``file_path``.

    An existing path is reported when it is not identical to ``file_path``
    and either:
    - it equals ``file_path`` after normalize_file_path_for_comparison, or
    - it produces the same permalink as ``file_path``.

    Args:
        file_path: The file path to check
        existing_paths: List of existing file paths to check against

    Returns:
        The conflicting existing paths, in the order they appear in
        ``existing_paths``
    """
    target_normalized = normalize_file_path_for_comparison(file_path)
    target_permalink = generate_permalink(file_path)

    def _clashes(candidate: str) -> bool:
        # Compare normalized paths first; the permalink is only generated
        # when that comparison does not already match.
        if normalize_file_path_for_comparison(candidate) == target_normalized:
            return True
        return generate_permalink(candidate) == target_permalink

    # Identical paths are the same file, not a conflict, so they are skipped.
    return [
        candidate
        for candidate in existing_paths
        if candidate != file_path and _clashes(candidate)
    ]
306318

307319

@@ -336,13 +348,13 @@ def validate_project_path(path: str, project_path: Path) -> bool:
336348

337349
def ensure_timezone_aware(dt: datetime) -> datetime:
338350
"""Ensure a datetime is timezone-aware using system timezone.
339-
351+
340352
If the datetime is naive, convert it to timezone-aware using the system's local timezone.
341353
If it's already timezone-aware, return it unchanged.
342-
354+
343355
Args:
344356
dt: The datetime to ensure is timezone-aware
345-
357+
346358
Returns:
347359
A timezone-aware datetime
348360
"""
@@ -351,4 +363,4 @@ def ensure_timezone_aware(dt: datetime) -> datetime:
351363
return dt.astimezone()
352364
else:
353365
# Already timezone-aware
354-
return dt
366+
return dt

test-int/mcp/test_write_note_integration.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99

1010
import pytest
1111
from fastmcp import Client
12+
from unittest.mock import patch
13+
14+
from basic_memory.config import ConfigManager
1215

1316

1417
@pytest.mark.asyncio
@@ -282,3 +285,64 @@ async def test_write_note_preserve_frontmatter(mcp_server, app):
282285
assert "# Created note" in response_text
283286
assert "file_path: test/Frontmatter Note.md" in response_text
284287
assert "permalink: test/frontmatter-note" in response_text
288+
289+
290+
@pytest.mark.asyncio
async def test_write_note_kebab_filenames_basic(mcp_server):
    """Test note creation with kebab_filenames=True and invalid filename characters."""

    config = ConfigManager().config
    curr_config_val = config.kebab_filenames
    config.kebab_filenames = True

    # The config object is mutated in place, so restore it in a finally block:
    # otherwise a failing assertion would leave kebab_filenames enabled for
    # whatever runs next against the same config object.
    try:
        with patch.object(ConfigManager, "config", config):
            async with Client(mcp_server) as client:
                result = await client.call_tool(
                    "write_note",
                    {
                        "title": "My Note: With/Invalid|Chars?",
                        "folder": "my-folder",
                        "content": "Testing kebab-case and invalid characters.",
                        "tags": "kebab,invalid,filename",
                    },
                )

                assert len(result.content) == 1
                response_text = result.content[0].text

                # File path and permalink should be kebab-case and sanitized
                assert "file_path: my-folder/my-note-with-invalid-chars.md" in response_text
                assert "permalink: my-folder/my-note-with-invalid-chars" in response_text
    finally:
        # Restore original config value
        config.kebab_filenames = curr_config_val
319+
320+
321+
@pytest.mark.asyncio
async def test_write_note_kebab_filenames_repeat_invalid(mcp_server):
    """Test note creation with multiple invalid and repeated characters."""

    config = ConfigManager().config
    curr_config_val = config.kebab_filenames
    config.kebab_filenames = True

    # The config object is mutated in place, so restore it in a finally block:
    # otherwise a failing assertion would leave kebab_filenames enabled for
    # whatever runs next against the same config object.
    try:
        with patch.object(ConfigManager, "config", config):
            async with Client(mcp_server) as client:
                result = await client.call_tool(
                    "write_note",
                    {
                        "title": 'Crazy<>:"|?*Note/Name',
                        "folder": "my-folder",
                        "content": "Should be fully kebab-case and safe.",
                        "tags": "crazy,filename,test",
                    },
                )

                assert len(result.content) == 1
                response_text = result.content[0].text

                # Runs of invalid characters should collapse to a single hyphen
                assert "file_path: my-folder/crazy-note-name.md" in response_text
                assert "permalink: my-folder/crazy-note-name" in response_text
    finally:
        # Restore original config value
        config.kebab_filenames = curr_config_val

0 commit comments

Comments
 (0)