Skip to content

Commit b45b887

Browse files
authored
fix: resolve multiple security vulnerabilities in #268 (#272)
* fix: jinja2 template injection
* fix: milvus collection name injection
* fix: restrict debug mode and host binding via env vars
* refactor: replace SHA-1 with SHA-256 for better security
* fix: path traversal vulnerability & pdf memory leak
1 parent dcd5880 commit b45b887

File tree

5 files changed

+256
-37
lines changed

5 files changed

+256
-37
lines changed

servers/corpus/src/corpus.py

Lines changed: 96 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,48 @@
1414
from tqdm import tqdm
1515
from ultrarag.server import UltraRAG_MCP_Server
1616

17+
18+
def _validate_path(user_path: str, allowed_base: Optional[str] = None) -> Path:
    """Validate and sanitize file path to prevent path traversal attacks.

    The path is resolved to an absolute path (``..`` components collapsed and
    symlinks followed). When ``allowed_base`` is given, the resolved path must
    lie inside the resolved base directory. Independently of that, a path
    that uses ``..`` components to end up under ``/etc/`` or ``/proc/`` is
    rejected.

    Args:
        user_path: User-provided file path
        allowed_base: Optional base directory to restrict paths to

    Returns:
        Resolved and validated Path object

    Raises:
        ValueError: If path traversal is detected or path is invalid
    """
    try:
        # Resolve the path to absolute; this also normalizes away any "..".
        safe_path = Path(user_path).resolve()

        # If allowed_base is provided, ensure path is within it.
        if allowed_base:
            base_path = Path(allowed_base).resolve()
            try:
                # relative_to() raises ValueError when safe_path is not
                # located under base_path.
                safe_path.relative_to(base_path)
            except ValueError:
                raise ValueError(
                    f"Path traversal detected: '{user_path}' is outside allowed directory '{allowed_base}'"
                ) from None

        # Additional safety: refuse ".." traversal into system directories.
        # The check is done on path *components* rather than on the raw
        # string, so ordinary file names that merely contain consecutive
        # dots (e.g. "report..v2.pdf") are not misreported as traversal.
        uses_parent_component = ".." in Path(user_path).parts
        if uses_parent_component and str(safe_path).startswith(("/etc/", "/proc/")):
            raise ValueError(f"Path traversal detected: '{user_path}' contains '..'")

        return safe_path
    except (OSError, RuntimeError) as e:
        # resolve() can fail at the OS level; older Python versions signal
        # symlink loops with RuntimeError. Callers only handle ValueError, so
        # normalize both. ValueErrors raised above propagate unchanged.
        raise ValueError(f"Invalid path: {user_path}") from e
58+
1759
app = UltraRAG_MCP_Server("corpus")
1860

1961

@@ -171,7 +213,15 @@ async def build_text_corpus(
171213
PMLIKE_EXT = [".pdf", ".xps", ".oxps", ".epub", ".mobi", ".fb2"]
172214
DOCX_EXT = [".docx"]
173215

174-
in_path = os.path.abspath(parse_file_path)
216+
# Validate and sanitize path to prevent path traversal
217+
try:
218+
safe_path = _validate_path(parse_file_path)
219+
in_path = str(safe_path)
220+
except ValueError as e:
221+
err_msg = f"Invalid file path: {e}"
222+
app.logger.error(err_msg)
223+
raise ToolError(err_msg)
224+
175225
if not os.path.exists(in_path):
176226
err_msg = f"Input path not found: {in_path}"
177227
app.logger.error(err_msg)
@@ -224,6 +274,7 @@ def process_one_file(fp: str) -> None:
224274
app.logger.error(err_msg)
225275
raise ToolError(err_msg)
226276
try:
277+
doc = None
227278
with suppress_stdout():
228279
doc = pymupdf.open(fp)
229280
texts = []
@@ -235,6 +286,13 @@ def process_one_file(fp: str) -> None:
235286
content = "\n\n".join(texts)
236287
except Exception as e:
237288
app.logger.warning(f"PDF read failed: {fp} | {e}")
289+
finally:
290+
# Ensure PDF document is closed to prevent memory leaks
291+
if doc is not None:
292+
try:
293+
doc.close()
294+
except Exception:
295+
pass
238296
else:
239297
warn_msg = f"Unsupported file type, skip: {fp}"
240298
app.logger.warning(warn_msg)
@@ -291,13 +349,28 @@ async def build_image_corpus(
291349
app.logger.error(err_msg)
292350
raise ToolError(err_msg)
293351

294-
in_path = os.path.abspath(parse_file_path)
352+
# Validate and sanitize path to prevent path traversal
353+
try:
354+
safe_path = _validate_path(parse_file_path)
355+
in_path = str(safe_path)
356+
except ValueError as e:
357+
err_msg = f"Invalid file path: {e}"
358+
app.logger.error(err_msg)
359+
raise ToolError(err_msg)
360+
295361
if not os.path.exists(in_path):
296362
err_msg = f"Input path not found: {in_path}"
297363
app.logger.error(err_msg)
298364
raise ToolError(err_msg)
299365

300-
corpus_jsonl = os.path.abspath(image_corpus_save_path)
366+
# Validate output path
367+
try:
368+
safe_output_path = _validate_path(image_corpus_save_path)
369+
corpus_jsonl = str(safe_output_path)
370+
except ValueError as e:
371+
err_msg = f"Invalid output path: {e}"
372+
app.logger.error(err_msg)
373+
raise ToolError(err_msg)
301374
out_root = os.path.dirname(corpus_jsonl) or os.getcwd()
302375
base_img_dir = os.path.join(out_root, "image")
303376
os.makedirs(base_img_dir, exist_ok=True)
@@ -329,6 +402,7 @@ async def build_image_corpus(
329402
out_img_dir = os.path.join(base_img_dir, stem)
330403
os.makedirs(out_img_dir, exist_ok=True)
331404

405+
doc = None
332406
try:
333407
with suppress_stdout():
334408
doc = pymupdf.open(pdf_path)
@@ -337,6 +411,9 @@ async def build_image_corpus(
337411
app.logger.warning(warn_msg)
338412
continue
339413

414+
if doc is None:
415+
continue
416+
340417
if getattr(doc, "is_encrypted", False):
341418
try:
342419
doc.authenticate("")
@@ -393,6 +470,13 @@ async def build_image_corpus(
393470
}
394471
)
395472
gid += 1
473+
474+
# Ensure PDF document is closed to prevent memory leaks
475+
if doc is not None:
476+
try:
477+
doc.close()
478+
except Exception:
479+
pass
396480

397481
_save_jsonl(valid_rows, corpus_jsonl)
398482
info_msg = (
@@ -429,7 +513,15 @@ async def mineru_parse(
429513
app.logger.error(err_msg)
430514
raise ToolError(err_msg)
431515

432-
in_path = os.path.abspath(parse_file_path)
516+
# Validate and sanitize path to prevent path traversal
517+
try:
518+
safe_path = _validate_path(parse_file_path)
519+
in_path = str(safe_path)
520+
except ValueError as e:
521+
err_msg = f"Invalid file path: {e}"
522+
app.logger.error(err_msg)
523+
raise ToolError(err_msg)
524+
433525
if not os.path.exists(in_path):
434526
err_msg = f"Input path not found: {in_path}"
435527
app.logger.error(err_msg)

0 commit comments

Comments
 (0)