fix: Sanitize filenames and allow optional kebab case (#260)

bdmayes · web-flow · commit 74e12eb782fa · 2025-08-27T19:29:01.000-04:00
Signed-off-by: Brandon Mayes <5610870+bdmayes@users.noreply.github.com>
diff --git a/src/basic_memory/config.py b/src/basic_memory/config.py
@@ -74,6 +74,11 @@ class BasicMemoryConfig(BaseSettings):
         description="Whether to sync changes in real time. default (True)",
     )
 
+    kebab_filenames: bool = Field(
+        default=False,
+        description="Format for generated filenames. False preserves spaces and special chars, True converts them to hyphens for consistency with permalinks",
+    )
+
     # API connection configuration
     api_url: Optional[str] = Field(
         default=None,
diff --git a/src/basic_memory/file_utils.py b/src/basic_memory/file_utils.py
@@ -2,6 +2,7 @@
 
 import hashlib
 from pathlib import Path
+import re
 from typing import Any, Dict, Union
 
 import yaml
@@ -233,3 +234,21 @@ async def update_frontmatter(path: FilePath, updates: Dict[str, Any]) -> str:
             error=str(e),
         )
         raise FileError(f"Failed to update frontmatter: {e}")
+
+
+def sanitize_for_filename(text: str, replacement: str = "-") -> str:
+    """
+    Sanitize a string for use in a filename.
+    Replaces path separators and other problematic characters
+    with `replacement` (a hyphen by default).
+    """
+    # replace both POSIX and Windows path separators
+    text = re.sub(r"[/\\]", replacement, text)
+
+    # replace some other problematic chars
+    text = re.sub(r'[<>:"|?*]', replacement, text)
+
+    # compress multiple, repeated replacements
+    text = re.sub(f"{re.escape(replacement)}+", replacement, text)
+
+    return text.strip(replacement)
diff --git a/src/basic_memory/schemas/base.py b/src/basic_memory/schemas/base.py
@@ -22,6 +22,8 @@
 
 from pydantic import BaseModel, BeforeValidator, Field, model_validator
 
+from basic_memory.config import ConfigManager
+from basic_memory.file_utils import sanitize_for_filename
 from basic_memory.utils import generate_permalink
 
 
@@ -190,13 +192,35 @@ class Entity(BaseModel):
         default="text/markdown",
     )
 
+    @property
+    def safe_title(self) -> str:
+        """
+        A sanitized version of the title, which is safe for use on the filesystem. For example,
+        a title of "Coupon Enable/Disable Feature" should create the file as "Coupon Enable-Disable Feature.md"
+        instead of creating a file named "Disable Feature.md" beneath the "Coupon Enable" directory.
+
+        Replaces POSIX and/or Windows style slashes as well as a few other characters that are not safe for filenames.
+        If kebab_filenames is True, then behavior is consistent with transformation used when generating permalink
+        strings (e.g. "Coupon Enable/Disable Feature" -> "coupon-enable-disable-feature").
+        """
+        fixed_title = sanitize_for_filename(self.title)
+
+        app_config = ConfigManager().config
+        use_kebab_case = app_config.kebab_filenames
+
+        if use_kebab_case:
+            fixed_title = generate_permalink(file_path=fixed_title, split_extension=False)
+
+        return fixed_title
+
     @property
     def file_path(self):
         """Get the file path for this entity based on its permalink."""
+        safe_title = self.safe_title
         if self.content_type == "text/markdown":
-            return f"{self.folder}/{self.title}.md" if self.folder else f"{self.title}.md"
+            return f"{self.folder}/{safe_title}.md" if self.folder else f"{safe_title}.md"
         else:
-            return f"{self.folder}/{self.title}" if self.folder else self.title
+            return f"{self.folder}/{safe_title}" if self.folder else safe_title
 
     @property
     def permalink(self) -> Permalink:
diff --git a/src/basic_memory/utils.py b/src/basic_memory/utils.py
@@ -28,7 +28,7 @@ def __str__(self) -> str: ...
 logging.getLogger("opentelemetry.sdk.metrics._internal.instrument").setLevel(logging.ERROR)
 
 
-def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
+def generate_permalink(file_path: Union[Path, str, PathLike], split_extension: bool = True) -> str:
     """Generate a stable permalink from a file path.
 
     Args:
@@ -51,53 +51,59 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
     # Convert Path to string if needed
     path_str = Path(str(file_path)).as_posix()
 
-    # Remove extension
-    base = os.path.splitext(path_str)[0]
+    # Split off the extension; it is re-appended below when split_extension is False
+    (base, extension) = os.path.splitext(path_str)
 
     # Check if we have CJK characters that should be preserved
-    # CJK ranges: \u4e00-\u9fff (CJK Unified Ideographs), \u3000-\u303f (CJK symbols), 
+    # CJK ranges: \u4e00-\u9fff (CJK Unified Ideographs), \u3000-\u303f (CJK symbols),
     # \u3400-\u4dbf (CJK Extension A), \uff00-\uffef (Fullwidth forms)
     has_cjk_chars = any(
-        '\u4e00' <= char <= '\u9fff' or 
-        '\u3000' <= char <= '\u303f' or 
-        '\u3400' <= char <= '\u4dbf' or
-        '\uff00' <= char <= '\uffef'
+        "\u4e00" <= char <= "\u9fff"
+        or "\u3000" <= char <= "\u303f"
+        or "\u3400" <= char <= "\u4dbf"
+        or "\uff00" <= char <= "\uffef"
         for char in base
     )
-    
+
     if has_cjk_chars:
         # For text with CJK characters, selectively transliterate only Latin accented chars
         result = ""
         for char in base:
-            if ('\u4e00' <= char <= '\u9fff' or 
-                '\u3000' <= char <= '\u303f' or 
-                '\u3400' <= char <= '\u4dbf'):
+            if (
+                "\u4e00" <= char <= "\u9fff"
+                or "\u3000" <= char <= "\u303f"
+                or "\u3400" <= char <= "\u4dbf"
+            ):
                 # Preserve CJK ideographs and symbols
                 result += char
-            elif ('\uff00' <= char <= '\uffef'):
+            elif "\uff00" <= char <= "\uffef":
                 # Remove Chinese fullwidth punctuation entirely (like ，！？)
                 continue
             else:
                 # Transliterate Latin accented characters to ASCII
                 result += unidecode(char)
-        
+
         # Insert hyphens between CJK and Latin character transitions
         # Match: CJK followed by Latin letter/digit, or Latin letter/digit followed by CJK
-        result = re.sub(r'([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])([a-zA-Z0-9])', r'\1-\2', result)
-        result = re.sub(r'([a-zA-Z0-9])([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])', r'\1-\2', result)
-        
+        result = re.sub(
+            r"([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])([a-zA-Z0-9])", r"\1-\2", result
+        )
+        result = re.sub(
+            r"([a-zA-Z0-9])([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])", r"\1-\2", result
+        )
+
         # Insert dash between camelCase
         result = re.sub(r"([a-z0-9])([A-Z])", r"\1-\2", result)
-        
+
         # Convert ASCII letters to lowercase, preserve CJK
         lower_text = "".join(c.lower() if c.isascii() and c.isalpha() else c for c in result)
-        
+
         # Replace underscores with hyphens
         text_with_hyphens = lower_text.replace("_", "-")
-        
+
         # Remove apostrophes entirely (don't replace with hyphens)
         text_no_apostrophes = text_with_hyphens.replace("'", "")
-        
+
         # Replace unsafe chars with hyphens, but preserve CJK characters
         clean_text = re.sub(
             r"[^a-z0-9\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf/\-]", "-", text_no_apostrophes
@@ -129,7 +135,13 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
     segments = clean_text.split("/")
     clean_segments = [s.strip("-") for s in segments]
 
-    return "/".join(clean_segments)
+    return_val = "/".join(clean_segments)
+
+    # Append file extension back, if necessary
+    if not split_extension and extension:
+        return_val += extension
+
+    return return_val
 
 
 def setup_logging(
@@ -229,79 +241,79 @@ def normalize_newlines(multiline: str) -> str:
     Returns:
         A string with normalized newlines native to the platform.
     """
-    return re.sub(r'\r\n?|\n', os.linesep, multiline)
+    return re.sub(r"\r\n?|\n", os.linesep, multiline)
 
 
 def normalize_file_path_for_comparison(file_path: str) -> str:
     """Normalize a file path for conflict detection.
-    
+
     This function normalizes file paths to help detect potential conflicts:
     - Converts to lowercase for case-insensitive comparison
     - Normalizes Unicode characters
     - Handles path separators consistently
-    
+
     Args:
         file_path: The file path to normalize
-        
+
     Returns:
         Normalized file path for comparison purposes
     """
     import unicodedata
-    
+
     # Convert to lowercase for case-insensitive comparison
     normalized = file_path.lower()
-    
+
     # Normalize Unicode characters (NFD normalization)
-    normalized = unicodedata.normalize('NFD', normalized)
-    
+    normalized = unicodedata.normalize("NFD", normalized)
+
     # Replace path separators with forward slashes
-    normalized = normalized.replace('\\', '/')
-    
+    normalized = normalized.replace("\\", "/")
+
     # Remove multiple slashes
-    normalized = re.sub(r'/+', '/', normalized)
-    
+    normalized = re.sub(r"/+", "/", normalized)
+
     return normalized
 
 
 def detect_potential_file_conflicts(file_path: str, existing_paths: List[str]) -> List[str]:
     """Detect potential conflicts between a file path and existing paths.
-    
+
     This function checks for various types of conflicts:
     - Case sensitivity differences
     - Unicode normalization differences
     - Path separator differences
     - Permalink generation conflicts
-    
+
     Args:
         file_path: The file path to check
         existing_paths: List of existing file paths to check against
-        
+
     Returns:
         List of existing paths that might conflict with the given file path
     """
     conflicts = []
-    
+
     # Normalize the input file path
     normalized_input = normalize_file_path_for_comparison(file_path)
     input_permalink = generate_permalink(file_path)
-    
+
     for existing_path in existing_paths:
         # Skip identical paths
         if existing_path == file_path:
             continue
-            
+
         # Check for case-insensitive path conflicts
         normalized_existing = normalize_file_path_for_comparison(existing_path)
         if normalized_input == normalized_existing:
             conflicts.append(existing_path)
             continue
-            
+
         # Check for permalink conflicts
         existing_permalink = generate_permalink(existing_path)
         if input_permalink == existing_permalink:
             conflicts.append(existing_path)
             continue
-    
+
     return conflicts
 
 
@@ -336,13 +348,13 @@ def validate_project_path(path: str, project_path: Path) -> bool:
 
 def ensure_timezone_aware(dt: datetime) -> datetime:
     """Ensure a datetime is timezone-aware using system timezone.
-    
+
     If the datetime is naive, convert it to timezone-aware using the system's local timezone.
     If it's already timezone-aware, return it unchanged.
-    
+
     Args:
         dt: The datetime to ensure is timezone-aware
-        
+
     Returns:
         A timezone-aware datetime
     """
@@ -351,4 +363,4 @@ def ensure_timezone_aware(dt: datetime) -> datetime:
         return dt.astimezone()
     else:
         # Already timezone-aware
-        return dt
+        return dt
diff --git a/test-int/mcp/test_write_note_integration.py b/test-int/mcp/test_write_note_integration.py
@@ -9,6 +9,9 @@
 
 import pytest
 from fastmcp import Client
+from unittest.mock import patch
+
+from basic_memory.config import ConfigManager
 
 
 @pytest.mark.asyncio
@@ -282,3 +285,64 @@ async def test_write_note_preserve_frontmatter(mcp_server, app):
         assert "# Created note" in response_text
         assert "file_path: test/Frontmatter Note.md" in response_text
         assert "permalink: test/frontmatter-note" in response_text
+
+
+@pytest.mark.asyncio
+async def test_write_note_kebab_filenames_basic(mcp_server):
+    """Test note creation with kebab_filenames=True and invalid filename characters."""
+
+    config = ConfigManager().config
+    curr_config_val = config.kebab_filenames
+    config.kebab_filenames = True
+
+    with patch.object(ConfigManager, "config", config):
+        async with Client(mcp_server) as client:
+            result = await client.call_tool(
+                "write_note",
+                {
+                    "title": "My Note: With/Invalid|Chars?",
+                    "folder": "my-folder",
+                    "content": "Testing kebab-case and invalid characters.",
+                    "tags": "kebab,invalid,filename",
+                },
+            )
+
+            assert len(result.content) == 1
+            response_text = result.content[0].text
+
+            # File path and permalink should be kebab-case and sanitized
+            assert "file_path: my-folder/my-note-with-invalid-chars.md" in response_text
+            assert "permalink: my-folder/my-note-with-invalid-chars" in response_text
+
+    # Restore original config value
+    config.kebab_filenames = curr_config_val
+
+
+@pytest.mark.asyncio
+async def test_write_note_kebab_filenames_repeat_invalid(mcp_server):
+    """Test note creation with multiple invalid and repeated characters."""
+
+    config = ConfigManager().config
+    curr_config_val = config.kebab_filenames
+    config.kebab_filenames = True
+
+    with patch.object(ConfigManager, "config", config):
+        async with Client(mcp_server) as client:
+            result = await client.call_tool(
+                "write_note",
+                {
+                    "title": 'Crazy<>:"|?*Note/Name',
+                    "folder": "my-folder",
+                    "content": "Should be fully kebab-case and safe.",
+                    "tags": "crazy,filename,test",
+                },
+            )
+
+            assert len(result.content) == 1
+            response_text = result.content[0].text
+
+            assert "file_path: my-folder/crazy-note-name.md" in response_text
+            assert "permalink: my-folder/crazy-note-name" in response_text
+
+    # Restore original config value
+    config.kebab_filenames = curr_config_val
diff --git a/tests/api/test_knowledge_router.py b/tests/api/test_knowledge_router.py
diff --git a/tests/utils/test_file_utils.py b/tests/utils/test_file_utils.py