Skip to content

Commit f982f84

Browse files
author
Yoshihiro Takahara
committed
feat: add encoding option to text file operations
- Remove automatic encoding detection
- Add encoding parameter to API (default: utf-8)
- Add tests for encoding error handling
1 parent 6a45d21 commit f982f84

File tree

4 files changed

+108
-786
lines changed

4 files changed

+108
-786
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ MCP Text Editor Server is designed to facilitate safe and efficient line-based t
4747
- Read multiple ranges from multiple files in a single operation
4848
- Line-based patch application with correct handling of line number shifts
4949
- Edit text file contents with conflict detection
50+
- Flexible character encoding support (utf-8, shift_jis, latin1, etc.)
5051
- Support for multiple file operations
5152
- Proper handling of concurrent edits with hash-based validation
5253
- Memory-efficient processing of large files
@@ -143,6 +144,7 @@ Parameters:
143144
- `file_path`: Path to the text file
144145
- `line_start`/`start`: Line number to start from (1-based)
145146
- `line_end`/`end`: Line number to end at (inclusive, null for end of file)
147+
- `encoding`: File encoding (default: "utf-8"). Specify the encoding of the text file (e.g., "shift_jis", "latin1")
146148

147149
**Single Range Response:**
148150

@@ -238,6 +240,7 @@ Important Notes:
238240
3. Patches must not overlap within the same file
239241
4. Line numbers are 1-based
240242
5. If original content ends with newline, ensure patch content also ends with newline
243+
6. File encoding must match the encoding used in get_text_file_contents
241244

242245
**Success Response:**
243246

@@ -295,6 +298,7 @@ result = await edit_text_file_contents({
295298
{
296299
"path": "file.txt",
297300
"hash": contents["file.txt"][0]["hash"],
301+
"encoding": "utf-8", # Optional, defaults to "utf-8"
298302
"patches": [
299303
{
300304
"line_start": 5,
@@ -325,6 +329,7 @@ The server handles various error cases:
325329
- Hash mismatches (concurrent edit detection)
326330
- Invalid patch ranges
327331
- Overlapping patches
332+
- Encoding errors (when file cannot be decoded with specified encoding)
328333
- Line number out of bounds
329334

330335
## Security Considerations

src/mcp_text_editor/server.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,12 @@ def get_tool_description(self) -> Tool:
6767
},
6868
"required": ["file_path", "ranges"],
6969
},
70-
}
70+
},
71+
"encoding": {
72+
"type": "string",
73+
"description": "Text encoding (default: 'utf-8')",
74+
"default": "utf-8",
75+
},
7176
},
7277
"required": ["files"],
7378
},
@@ -81,7 +86,10 @@ async def run_tool(self, arguments: Dict[str, Any]) -> Sequence[TextContent]:
8186
raise RuntimeError("Missing required argument: 'files'")
8287

8388
# Handle request
84-
result = await self.editor.read_multiple_ranges(arguments["files"])
89+
encoding = arguments.get("encoding", "utf-8")
90+
result = await self.editor.read_multiple_ranges(
91+
arguments["files"], encoding=encoding
92+
)
8593
response = result
8694

8795
return [TextContent(type="text", text=json.dumps(response, indent=2))]
@@ -130,14 +138,23 @@ def get_tool_description(self) -> Tool:
130138
"default": None,
131139
},
132140
"contents": {"type": "string"},
141+
"range_hash": {
142+
"type": "string",
143+
"description": "Hash of the content being replaced (required except for new files and append operations)",
144+
},
133145
},
134146
"required": ["contents"],
135147
},
136148
},
137149
},
138150
"required": ["path", "file_hash", "patches"],
139151
},
140-
}
152+
},
153+
"encoding": {
154+
"type": "string",
155+
"description": "Text encoding (default: 'utf-8')",
156+
"default": "utf-8",
157+
},
141158
},
142159
"required": ["files"],
143160
},
@@ -189,8 +206,9 @@ async def run_tool(self, arguments: Dict[str, Any]) -> Sequence[TextContent]:
189206
}
190207
continue
191208

209+
encoding = arguments.get("encoding", "utf-8")
192210
result = await self.editor.edit_file_contents(
193-
file_path, file_hash, patches
211+
file_path, file_hash, patches, encoding=encoding
194212
)
195213
results[file_path] = result
196214
except Exception as e:

src/mcp_text_editor/text_editor.py

Lines changed: 43 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -48,68 +48,6 @@ def _validate_file_path(self, file_path: str) -> None:
4848
if ".." in file_path:
4949
raise ValueError("Path traversal not allowed")
5050

51-
def _detect_encoding(self, file_path: str) -> str:
52-
"""
53-
Detect file encoding with Shift-JIS prioritized.
54-
55-
Args:
56-
file_path (str): Path to the file
57-
58-
Returns:
59-
str: Detected encoding, falls back to utf-8 if detection fails.
60-
"""
61-
62-
def try_decode(data: bytes, encoding: str) -> bool:
63-
"""Try to decode data with the given encoding."""
64-
try:
65-
data.decode(encoding)
66-
return True
67-
except UnicodeDecodeError:
68-
return False
69-
70-
# Read file content for encoding detection
71-
try:
72-
with open(file_path, "rb") as f:
73-
raw_data = f.read()
74-
75-
# Try encodings in order of priority
76-
if try_decode(raw_data, "shift_jis"):
77-
return "shift_jis"
78-
if try_decode(raw_data, "utf-8"):
79-
return "utf-8"
80-
81-
# As a last resort, use chardet
82-
try:
83-
import chardet
84-
85-
result = chardet.detect(raw_data)
86-
encoding = result.get("encoding") or ""
87-
encoding = encoding.lower()
88-
89-
if encoding:
90-
# Map encoding aliases
91-
if encoding in [
92-
"shift_jis",
93-
"shift-jis",
94-
"shiftjis",
95-
"sjis",
96-
"csshiftjis",
97-
]:
98-
return "shift_jis"
99-
if encoding in ["ascii"]:
100-
return "utf-8"
101-
# Try detected encoding
102-
if try_decode(raw_data, encoding):
103-
return encoding
104-
except ImportError:
105-
pass
106-
107-
# Fall back to UTF-8
108-
return "utf-8"
109-
110-
except (IOError, OSError, UnicodeDecodeError):
111-
return "utf-8"
112-
11351
@staticmethod
11452
def calculate_hash(content: str) -> str:
11553
"""
@@ -123,26 +61,49 @@ def calculate_hash(content: str) -> str:
12361
"""
12462
return hashlib.sha256(content.encode()).hexdigest()
12563

126-
async def _read_file(self, file_path: str) -> Tuple[List[str], str, int]:
127-
"""Read file and return lines, content, and total lines."""
64+
async def _read_file(
65+
self, file_path: str, encoding: str = "utf-8"
66+
) -> Tuple[List[str], str, int]:
67+
"""Read file and return lines, content, and total lines.
68+
69+
Args:
70+
file_path (str): Path to the file to read
71+
encoding (str, optional): File encoding. Defaults to "utf-8"
72+
73+
Returns:
74+
Tuple[List[str], str, int]: Lines, content, and total line count
75+
76+
Raises:
77+
FileNotFoundError: If file not found
78+
UnicodeDecodeError: If file cannot be decoded with specified encoding
79+
"""
12880
self._validate_file_path(file_path)
129-
encoding = self._detect_encoding(file_path)
13081
try:
13182
with open(file_path, "r", encoding=encoding) as f:
13283
lines = f.readlines()
13384
file_content = "".join(lines)
13485
return lines, file_content, len(lines)
13586
except FileNotFoundError as err:
13687
raise FileNotFoundError(f"File not found: {file_path}") from err
88+
except UnicodeDecodeError as err:
89+
raise UnicodeDecodeError(
90+
encoding,
91+
err.object,
92+
err.start,
93+
err.end,
94+
f"Failed to decode file '{file_path}' with {encoding} encoding",
95+
) from err
13796

13897
async def read_multiple_ranges(
139-
self, ranges: List[FileRanges]
98+
self, ranges: List[FileRanges], encoding: str = "utf-8"
14099
) -> Dict[str, Dict[str, Any]]:
141100
result: Dict[str, Dict[str, Any]] = {}
142101

143102
for file_range in ranges:
144103
file_path = file_range["file_path"]
145-
lines, file_content, total_lines = await self._read_file(file_path)
104+
lines, file_content, total_lines = await self._read_file(
105+
file_path, encoding=encoding
106+
)
146107
file_hash = self.calculate_hash(file_content)
147108
result[file_path] = {"ranges": [], "file_hash": file_hash}
148109

@@ -187,9 +148,15 @@ async def read_multiple_ranges(
187148
return result
188149

189150
async def read_file_contents(
190-
self, file_path: str, line_start: int = 1, line_end: Optional[int] = None
151+
self,
152+
file_path: str,
153+
line_start: int = 1,
154+
line_end: Optional[int] = None,
155+
encoding: str = "utf-8",
191156
) -> Tuple[str, int, int, str, int, int]:
192-
lines, file_content, total_lines = await self._read_file(file_path)
157+
lines, file_content, total_lines = await self._read_file(
158+
file_path, encoding=encoding
159+
)
193160
line_start = max(1, line_start) - 1
194161
line_end = total_lines if line_end is None else min(line_end, total_lines)
195162

@@ -203,7 +170,7 @@ async def read_file_contents(
203170
selected_lines = lines[line_start:line_end]
204171
content = "".join(selected_lines)
205172
content_hash = self.calculate_hash(content)
206-
content_size = len(content.encode(self._detect_encoding(file_path)))
173+
content_size = len(content.encode(encoding))
207174

208175
return (
209176
content,
@@ -215,7 +182,11 @@ async def read_file_contents(
215182
)
216183

217184
async def edit_file_contents(
218-
self, file_path: str, expected_hash: str, patches: List[Dict[str, Any]]
185+
self,
186+
file_path: str,
187+
expected_hash: str,
188+
patches: List[Dict[str, Any]],
189+
encoding: str = "utf-8",
219190
) -> Dict[str, Any]:
220191
"""
221192
Edit file contents with hash-based conflict detection and multiple patches.
@@ -294,7 +265,7 @@ async def edit_file_contents(
294265
else:
295266
# Read current file content and verify hash
296267
current_content, _, _, current_hash, total_lines, _ = (
297-
await self.read_file_contents(file_path)
268+
await self.read_file_contents(file_path, encoding=encoding)
298269
)
299270

300271
if current_hash != expected_hash:
@@ -400,11 +371,6 @@ async def edit_file_contents(
400371

401372
# Write the final content back to file
402373
final_content = "".join(lines)
403-
encoding = (
404-
"utf-8"
405-
if not os.path.exists(file_path)
406-
else self._detect_encoding(file_path)
407-
)
408374
with open(file_path, "w", encoding=encoding) as f:
409375
f.write(final_content)
410376

0 commit comments

Comments
 (0)