1414from tqdm import tqdm
1515from ultrarag .server import UltraRAG_MCP_Server
1616
17+
def _validate_path(user_path: str, allowed_base: Optional[str] = None) -> Path:
    """Resolve a user-supplied path and reject path-traversal attempts.

    The path is made absolute with ``Path.resolve()`` (symlinks and ``..``
    components are collapsed).  Two checks are then applied:

    * If ``allowed_base`` is given (and non-empty), the resolved path must be
      located inside the resolved base directory.
    * If the user-supplied path contains a ``..`` *component* and the resolved
      path lands under ``/etc/`` or ``/proc/``, it is rejected.

    Note that without ``allowed_base`` a relative path such as ``../data`` is
    accepted as long as it does not resolve into one of those two directories.

    Args:
        user_path: User-provided file path.
        allowed_base: Optional base directory to restrict paths to.

    Returns:
        Resolved and validated Path object.

    Raises:
        ValueError: If path traversal is detected or the path cannot be
            interpreted/resolved (wrong type, OS error, symlink loop).
    """
    try:
        raw_path = Path(user_path)
        safe_path = raw_path.resolve()
        base_path = Path(allowed_base).resolve() if allowed_base else None
    except (OSError, RuntimeError, TypeError) as e:
        # Normalise every resolution failure to ValueError so callers only
        # need a single except clause.  A ValueError raised by resolve()
        # itself (e.g. embedded null byte) propagates unchanged.
        raise ValueError(f"Invalid path: {user_path}") from e

    if base_path is not None:
        try:
            # relative_to() raises ValueError when safe_path is not located
            # under base_path.
            safe_path.relative_to(base_path)
        except ValueError:
            raise ValueError(
                f"Path traversal detected: '{user_path}' is outside "
                f"allowed directory '{allowed_base}'"
            ) from None

    # Look for ".." as a real path component rather than as a substring, so
    # that legitimate names such as "report..final.txt" are not rejected.
    has_parent_ref = ".." in raw_path.parts
    if has_parent_ref and str(safe_path).startswith(("/etc/", "/proc/")):
        raise ValueError(f"Path traversal detected: '{user_path}' contains '..'")

    return safe_path
58+
1759app = UltraRAG_MCP_Server ("corpus" )
1860
1961
@@ -171,7 +213,15 @@ async def build_text_corpus(
171213 PMLIKE_EXT = [".pdf" , ".xps" , ".oxps" , ".epub" , ".mobi" , ".fb2" ]
172214 DOCX_EXT = [".docx" ]
173215
174- in_path = os .path .abspath (parse_file_path )
216+ # Validate and sanitize path to prevent path traversal
217+ try :
218+ safe_path = _validate_path (parse_file_path )
219+ in_path = str (safe_path )
220+ except ValueError as e :
221+ err_msg = f"Invalid file path: { e } "
222+ app .logger .error (err_msg )
223+ raise ToolError (err_msg )
224+
175225 if not os .path .exists (in_path ):
176226 err_msg = f"Input path not found: { in_path } "
177227 app .logger .error (err_msg )
@@ -224,6 +274,7 @@ def process_one_file(fp: str) -> None:
224274 app .logger .error (err_msg )
225275 raise ToolError (err_msg )
226276 try :
277+ doc = None
227278 with suppress_stdout ():
228279 doc = pymupdf .open (fp )
229280 texts = []
@@ -235,6 +286,13 @@ def process_one_file(fp: str) -> None:
235286 content = "\n \n " .join (texts )
236287 except Exception as e :
237288 app .logger .warning (f"PDF read failed: { fp } | { e } " )
289+ finally :
290+ # Ensure PDF document is closed to prevent memory leaks
291+ if doc is not None :
292+ try :
293+ doc .close ()
294+ except Exception :
295+ pass
238296 else :
239297 warn_msg = f"Unsupported file type, skip: { fp } "
240298 app .logger .warning (warn_msg )
@@ -291,13 +349,28 @@ async def build_image_corpus(
291349 app .logger .error (err_msg )
292350 raise ToolError (err_msg )
293351
294- in_path = os .path .abspath (parse_file_path )
352+ # Validate and sanitize path to prevent path traversal
353+ try :
354+ safe_path = _validate_path (parse_file_path )
355+ in_path = str (safe_path )
356+ except ValueError as e :
357+ err_msg = f"Invalid file path: { e } "
358+ app .logger .error (err_msg )
359+ raise ToolError (err_msg )
360+
295361 if not os .path .exists (in_path ):
296362 err_msg = f"Input path not found: { in_path } "
297363 app .logger .error (err_msg )
298364 raise ToolError (err_msg )
299365
300- corpus_jsonl = os .path .abspath (image_corpus_save_path )
366+ # Validate output path
367+ try :
368+ safe_output_path = _validate_path (image_corpus_save_path )
369+ corpus_jsonl = str (safe_output_path )
370+ except ValueError as e :
371+ err_msg = f"Invalid output path: { e } "
372+ app .logger .error (err_msg )
373+ raise ToolError (err_msg )
301374 out_root = os .path .dirname (corpus_jsonl ) or os .getcwd ()
302375 base_img_dir = os .path .join (out_root , "image" )
303376 os .makedirs (base_img_dir , exist_ok = True )
@@ -329,6 +402,7 @@ async def build_image_corpus(
329402 out_img_dir = os .path .join (base_img_dir , stem )
330403 os .makedirs (out_img_dir , exist_ok = True )
331404
405+ doc = None
332406 try :
333407 with suppress_stdout ():
334408 doc = pymupdf .open (pdf_path )
@@ -337,6 +411,9 @@ async def build_image_corpus(
337411 app .logger .warning (warn_msg )
338412 continue
339413
414+ if doc is None :
415+ continue
416+
340417 if getattr (doc , "is_encrypted" , False ):
341418 try :
342419 doc .authenticate ("" )
@@ -393,6 +470,13 @@ async def build_image_corpus(
393470 }
394471 )
395472 gid += 1
473+
474+ # Ensure PDF document is closed to prevent memory leaks
475+ if doc is not None :
476+ try :
477+ doc .close ()
478+ except Exception :
479+ pass
396480
397481 _save_jsonl (valid_rows , corpus_jsonl )
398482 info_msg = (
@@ -429,7 +513,15 @@ async def mineru_parse(
429513 app .logger .error (err_msg )
430514 raise ToolError (err_msg )
431515
432- in_path = os .path .abspath (parse_file_path )
516+ # Validate and sanitize path to prevent path traversal
517+ try :
518+ safe_path = _validate_path (parse_file_path )
519+ in_path = str (safe_path )
520+ except ValueError as e :
521+ err_msg = f"Invalid file path: { e } "
522+ app .logger .error (err_msg )
523+ raise ToolError (err_msg )
524+
433525 if not os .path .exists (in_path ):
434526 err_msg = f"Input path not found: { in_path } "
435527 app .logger .error (err_msg )
0 commit comments