feat: add encoding option to text file operations

Yoshihiro Takahara · Yoshihiro Takahara · commit f982f84b3ff9 · 2024-12-15T11:29:24.000+09:00
- Remove automatic encoding detection
- Add encoding parameter to API (default: utf-8)
- Add tests for encoding error handling
diff --git a/README.md b/README.md
@@ -47,6 +47,7 @@ MCP Text Editor Server is designed to facilitate safe and efficient line-based t
 - Read multiple ranges from multiple files in a single operation
 - Line-based patch application with correct handling of line number shifts
 - Edit text file contents with conflict detection
+- Flexible character encoding support (utf-8, shift_jis, latin1, etc.)
 - Support for multiple file operations
 - Proper handling of concurrent edits with hash-based validation
 - Memory-efficient processing of large files
@@ -143,6 +144,7 @@ Parameters:
 - `file_path`: Path to the text file
 - `line_start`/`start`: Line number to start from (1-based)
 - `line_end`/`end`: Line number to end at (inclusive, null for end of file)
+- `encoding`: Character encoding used to decode the text file (default: "utf-8"), e.g., "shift_jis" or "latin1"
 
 **Single Range Response:**
 
@@ -238,6 +240,7 @@ Important Notes:
 3. Patches must not overlap within the same file
 4. Line numbers are 1-based
 5. If original content ends with newline, ensure patch content also ends with newline
+6. The `encoding` given here must match the one used when the file was read with get_text_file_contents, since hashes are computed from the decoded text
 
 **Success Response:**
 
@@ -295,6 +298,7 @@ result = await edit_text_file_contents({
         {
             "path": "file.txt",
             "hash": contents["file.txt"][0]["hash"],
+            "encoding": "utf-8",  # Optional, defaults to "utf-8"
             "patches": [
                 {
                     "line_start": 5,
@@ -325,6 +329,7 @@ The server handles various error cases:
 - Hash mismatches (concurrent edit detection)
 - Invalid patch ranges
 - Overlapping patches
+- Encoding errors (when a file cannot be decoded with the specified encoding)
 - Line number out of bounds
 
 ## Security Considerations
diff --git a/src/mcp_text_editor/server.py b/src/mcp_text_editor/server.py
@@ -67,7 +67,12 @@ def get_tool_description(self) -> Tool:
                             },
                             "required": ["file_path", "ranges"],
                         },
-                    }
+                    },
+                    "encoding": {
+                        "type": "string",
+                        "description": "Text encoding (default: 'utf-8')",
+                        "default": "utf-8",
+                    },
                 },
                 "required": ["files"],
             },
@@ -81,7 +86,10 @@ async def run_tool(self, arguments: Dict[str, Any]) -> Sequence[TextContent]:
                 raise RuntimeError("Missing required argument: 'files'")
 
             # Handle request
-            result = await self.editor.read_multiple_ranges(arguments["files"])
+            encoding = arguments.get("encoding", "utf-8")
+            result = await self.editor.read_multiple_ranges(
+                arguments["files"], encoding=encoding
+            )
             response = result
 
             return [TextContent(type="text", text=json.dumps(response, indent=2))]
@@ -130,14 +138,23 @@ def get_tool_description(self) -> Tool:
                                                 "default": None,
                                             },
                                             "contents": {"type": "string"},
+                                            "range_hash": {
+                                                "type": "string",
+                                                "description": "Hash of the content being replaced (required except for new files and append operations)",
+                                            },
                                         },
                                         "required": ["contents"],
                                     },
                                 },
                             },
                             "required": ["path", "file_hash", "patches"],
                         },
-                    }
+                    },
+                    "encoding": {
+                        "type": "string",
+                        "description": "Text encoding (default: 'utf-8')",
+                        "default": "utf-8",
+                    },
                 },
                 "required": ["files"],
             },
@@ -189,8 +206,9 @@ async def run_tool(self, arguments: Dict[str, Any]) -> Sequence[TextContent]:
                         }
                         continue
 
+                    encoding = arguments.get("encoding", "utf-8")
                     result = await self.editor.edit_file_contents(
-                        file_path, file_hash, patches
+                        file_path, file_hash, patches, encoding=encoding
                     )
                     results[file_path] = result
                 except Exception as e:
diff --git a/src/mcp_text_editor/text_editor.py b/src/mcp_text_editor/text_editor.py
@@ -48,68 +48,6 @@ def _validate_file_path(self, file_path: str) -> None:
         if ".." in file_path:
             raise ValueError("Path traversal not allowed")
 
-    def _detect_encoding(self, file_path: str) -> str:
-        """
-        Detect file encoding with Shift-JIS prioritized.
-
-        Args:
-            file_path (str): Path to the file
-
-        Returns:
-            str: Detected encoding, falls back to utf-8 if detection fails.
-        """
-
-        def try_decode(data: bytes, encoding: str) -> bool:
-            """Try to decode data with the given encoding."""
-            try:
-                data.decode(encoding)
-                return True
-            except UnicodeDecodeError:
-                return False
-
-        # Read file content for encoding detection
-        try:
-            with open(file_path, "rb") as f:
-                raw_data = f.read()
-
-                # Try encodings in order of priority
-                if try_decode(raw_data, "shift_jis"):
-                    return "shift_jis"
-                if try_decode(raw_data, "utf-8"):
-                    return "utf-8"
-
-                # As a last resort, use chardet
-                try:
-                    import chardet
-
-                    result = chardet.detect(raw_data)
-                    encoding = result.get("encoding") or ""
-                    encoding = encoding.lower()
-
-                    if encoding:
-                        # Map encoding aliases
-                        if encoding in [
-                            "shift_jis",
-                            "shift-jis",
-                            "shiftjis",
-                            "sjis",
-                            "csshiftjis",
-                        ]:
-                            return "shift_jis"
-                        if encoding in ["ascii"]:
-                            return "utf-8"
-                        # Try detected encoding
-                        if try_decode(raw_data, encoding):
-                            return encoding
-                except ImportError:
-                    pass
-
-                # Fall back to UTF-8
-                return "utf-8"
-
-        except (IOError, OSError, UnicodeDecodeError):
-            return "utf-8"
-
     @staticmethod
     def calculate_hash(content: str) -> str:
         """
@@ -123,26 +61,49 @@ def calculate_hash(content: str) -> str:
         """
         return hashlib.sha256(content.encode()).hexdigest()
 
-    async def _read_file(self, file_path: str) -> Tuple[List[str], str, int]:
-        """Read file and return lines, content, and total lines."""
+    async def _read_file(
+        self, file_path: str, encoding: str = "utf-8"
+    ) -> Tuple[List[str], str, int]:
+        """Read file and return lines, content, and total lines.
+
+        Args:
+            file_path (str): Path to the file to read
+            encoding (str, optional): File encoding. Defaults to "utf-8"
+
+        Returns:
+            Tuple[List[str], str, int]: Lines, content, and total line count
+
+        Raises:
+            FileNotFoundError: If file not found
+            UnicodeDecodeError: If file cannot be decoded with specified encoding
+        """
         self._validate_file_path(file_path)
-        encoding = self._detect_encoding(file_path)
         try:
             with open(file_path, "r", encoding=encoding) as f:
                 lines = f.readlines()
             file_content = "".join(lines)
             return lines, file_content, len(lines)
         except FileNotFoundError as err:
             raise FileNotFoundError(f"File not found: {file_path}") from err
+        except UnicodeDecodeError as err:
+            raise UnicodeDecodeError(
+                encoding,
+                err.object,
+                err.start,
+                err.end,
+                f"Failed to decode file '{file_path}' with {encoding} encoding",
+            ) from err
 
     async def read_multiple_ranges(
-        self, ranges: List[FileRanges]
+        self, ranges: List[FileRanges], encoding: str = "utf-8"
     ) -> Dict[str, Dict[str, Any]]:
         result: Dict[str, Dict[str, Any]] = {}
 
         for file_range in ranges:
             file_path = file_range["file_path"]
-            lines, file_content, total_lines = await self._read_file(file_path)
+            lines, file_content, total_lines = await self._read_file(
+                file_path, encoding=encoding
+            )
             file_hash = self.calculate_hash(file_content)
             result[file_path] = {"ranges": [], "file_hash": file_hash}
 
@@ -187,9 +148,15 @@ async def read_multiple_ranges(
         return result
 
     async def read_file_contents(
-        self, file_path: str, line_start: int = 1, line_end: Optional[int] = None
+        self,
+        file_path: str,
+        line_start: int = 1,
+        line_end: Optional[int] = None,
+        encoding: str = "utf-8",
     ) -> Tuple[str, int, int, str, int, int]:
-        lines, file_content, total_lines = await self._read_file(file_path)
+        lines, file_content, total_lines = await self._read_file(
+            file_path, encoding=encoding
+        )
         line_start = max(1, line_start) - 1
         line_end = total_lines if line_end is None else min(line_end, total_lines)
 
@@ -203,7 +170,7 @@ async def read_file_contents(
         selected_lines = lines[line_start:line_end]
         content = "".join(selected_lines)
         content_hash = self.calculate_hash(content)
-        content_size = len(content.encode(self._detect_encoding(file_path)))
+        content_size = len(content.encode(encoding))
 
         return (
             content,
@@ -215,7 +182,11 @@ async def read_file_contents(
         )
 
     async def edit_file_contents(
-        self, file_path: str, expected_hash: str, patches: List[Dict[str, Any]]
+        self,
+        file_path: str,
+        expected_hash: str,
+        patches: List[Dict[str, Any]],
+        encoding: str = "utf-8",
     ) -> Dict[str, Any]:
         """
         Edit file contents with hash-based conflict detection and multiple patches.
@@ -294,7 +265,7 @@ async def edit_file_contents(
             else:
                 # Read current file content and verify hash
                 current_content, _, _, current_hash, total_lines, _ = (
-                    await self.read_file_contents(file_path)
+                    await self.read_file_contents(file_path, encoding=encoding)
                 )
 
                 if current_hash != expected_hash:
@@ -400,11 +371,6 @@ async def edit_file_contents(
 
             # Write the final content back to file
             final_content = "".join(lines)
-            encoding = (
-                "utf-8"
-                if not os.path.exists(file_path)
-                else self._detect_encoding(file_path)
-            )
             with open(file_path, "w", encoding=encoding) as f:
                 f.write(final_content)
 
diff --git a/tests/test_text_editor.py b/tests/test_text_editor.py