@@ -100,11 +100,10 @@ async def perform_document_conversion(
100100 if not file_path :
101101 return None
102102
103- conversion_success = False
104103 conversion_msg = ""
105104
106105 # 首先尝试使用简单的PDF转换器(对于PDF文件)
107- if file_path .lower ().endswith (".pdf" ) and PYPDF2_AVAILABLE and not extract_images :
106+ if file_path .lower ().endswith (".pdf" ) and PYPDF2_AVAILABLE :
108107 try :
109108 simple_converter = SimplePdfConverter ()
110109 conversion_result = simple_converter .convert_pdf_to_markdown (file_path )
@@ -119,40 +118,40 @@ async def perform_document_conversion(
119118 conversion_msg += (
120119 f"\n Pages extracted: { conversion_result ['pages_extracted' ]} "
121120 )
122- conversion_success = True
121+
123122 else :
124123 conversion_msg = f"\n [WARNING] PDF conversion failed: { conversion_result ['error' ]} "
125124 except Exception as conv_error :
126125 conversion_msg = f"\n [WARNING] PDF conversion error: { str (conv_error )} "
127126
128127 # 如果简单转换失败,尝试使用docling(支持图片提取)
129- if not conversion_success and DOCLING_AVAILABLE :
130- try :
131- converter = DoclingConverter ()
132- if converter .is_supported_format (file_path ):
133- conversion_result = converter .convert_to_markdown (
134- file_path , extract_images = extract_images
135- )
136- if conversion_result ["success" ]:
137- conversion_msg = (
138- "\n [INFO] Document converted to Markdown (docling)"
139- )
140- conversion_msg += (
141- f"\n Markdown file: { conversion_result ['output_file' ]} "
142- )
143- conversion_msg += f"\n Conversion time: { conversion_result ['duration' ]:.2f} seconds"
144- if conversion_result .get ("images_extracted" , 0 ) > 0 :
145- conversion_msg += f"\n Images extracted: { conversion_result ['images_extracted' ]} "
146- images_dir = os .path .join (
147- os .path .dirname (conversion_result ["output_file" ]), "images"
148- )
149- conversion_msg += f"\n Images saved to: { images_dir } "
150- else :
151- conversion_msg = f"\n [WARNING] Docling conversion failed: { conversion_result ['error' ]} "
152- except Exception as conv_error :
153- conversion_msg = (
154- f"\n [WARNING] Docling conversion error: { str (conv_error )} "
155- )
128+ # if not conversion_success and DOCLING_AVAILABLE:
129+ # try:
130+ # converter = DoclingConverter()
131+ # if converter.is_supported_format(file_path):
132+ # conversion_result = converter.convert_to_markdown(
133+ # file_path, extract_images=extract_images
134+ # )
135+ # if conversion_result["success"]:
136+ # conversion_msg = (
137+ # "\n [INFO] Document converted to Markdown (docling)"
138+ # )
139+ # conversion_msg += (
140+ # f"\n Markdown file: {conversion_result['output_file']}"
141+ # )
142+ # conversion_msg += f"\n Conversion time: {conversion_result['duration']:.2f} seconds"
143+ # if conversion_result.get("images_extracted", 0) > 0:
144+ # conversion_msg += f"\n Images extracted: {conversion_result['images_extracted']}"
145+ # images_dir = os.path.join(
146+ # os.path.dirname(conversion_result["output_file"]), "images"
147+ # )
148+ # conversion_msg += f"\n Images saved to: {images_dir}"
149+ # else:
150+ # conversion_msg = f"\n [WARNING] Docling conversion failed: {conversion_result['error']}"
151+ # except Exception as conv_error:
152+ # conversion_msg = (
153+ # f"\n [WARNING] Docling conversion error: {str(conv_error)}"
154+ # )
156155
157156 return conversion_msg if conversion_msg else None
158157
@@ -1235,110 +1234,110 @@ async def move_file_to(
12351234 return msg + f"[ERROR] Move failed!\n Error: { result ['error' ]} "
12361235
12371236
1238- @mcp .tool ()
1239- async def convert_document_to_markdown (
1240- file_path : str , output_path : Optional [str ] = None , extract_images : bool = True
1241- ) -> str :
1242- """
1243- Convert a document to Markdown format with image extraction support.
1244-
1245- Supports both local files and URLs. Uses docling for advanced conversion with image extraction,
1246- or falls back to PyPDF2 for simple PDF text extraction.
1247-
1248- Args:
1249- file_path: Path to the input document file or URL (supports PDF, DOCX, PPTX, HTML, TXT, MD)
1250- output_path: Path for the output Markdown file (optional, auto-generated if not provided)
1251- extract_images: Whether to extract images from the document (default: True)
1252-
1253- Returns:
1254- Status message about the conversion operation with preview of converted content
1255-
1256- Examples:
1257- - "convert_document_to_markdown('paper.pdf')"
1258- - "convert_document_to_markdown('https://example.com/doc.pdf', 'output.md')"
1259- - "convert_document_to_markdown('presentation.pptx', extract_images=False)"
1260- """
1261- # 检查是否为URL
1262- is_url_input = False
1263- try :
1264- parsed = urlparse (file_path )
1265- is_url_input = parsed .scheme in ("http" , "https" )
1266- except Exception :
1267- is_url_input = False
1268-
1269- # 检查文件是否存在(如果不是URL)
1270- if not is_url_input and not os .path .exists (file_path ):
1271- return f"[ERROR] Input file not found: { file_path } "
1272-
1273- # 检查是否是PDF文件,优先使用简单转换器(仅对本地文件)
1274- if (
1275- not is_url_input
1276- and file_path .lower ().endswith (".pdf" )
1277- and PYPDF2_AVAILABLE
1278- and not extract_images
1279- ):
1280- try :
1281- simple_converter = SimplePdfConverter ()
1282- result = simple_converter .convert_pdf_to_markdown (file_path , output_path )
1283- except Exception as e :
1284- return f"[ERROR] PDF conversion error: { str (e )} "
1285- elif DOCLING_AVAILABLE :
1286- try :
1287- converter = DoclingConverter ()
1288-
1289- # 检查文件格式是否支持
1290- if not is_url_input and not converter .is_supported_format (file_path ):
1291- supported_formats = [".pdf" , ".docx" , ".pptx" , ".html" , ".md" , ".txt" ]
1292- return f"[ERROR] Unsupported file format. Supported formats: { ', ' .join (supported_formats )} "
1293- elif is_url_input and not file_path .lower ().endswith (
1294- (".pdf" , ".docx" , ".pptx" , ".html" , ".md" , ".txt" )
1295- ):
1296- return f"[ERROR] Unsupported URL format: { file_path } "
1297-
1298- # 执行转换(支持图片提取)
1299- result = converter .convert_to_markdown (
1300- file_path , output_path , extract_images
1301- )
1302- except Exception as e :
1303- return f"[ERROR] Docling conversion error: { str (e )} "
1304- else :
1305- return (
1306- "[ERROR] No conversion tools available. Please install docling or PyPDF2."
1307- )
1308-
1309- if result ["success" ]:
1310- msg = "[SUCCESS] Document converted successfully!\n "
1311- msg += f" Input: { result ['input_file' ]} \n "
1312- msg += f" Output file: { result ['output_file' ]} \n "
1313- msg += f" Conversion time: { result ['duration' ]:.2f} seconds\n "
1314-
1315- if result ["input_size" ] > 0 :
1316- msg += f" Original size: { result ['input_size' ] / 1024 :.1f} KB\n "
1317- msg += f" Markdown size: { result ['output_size' ] / 1024 :.1f} KB\n "
1318-
1319- # 显示图片提取信息
1320- if extract_images and "images_extracted" in result :
1321- images_count = result ["images_extracted" ]
1322- if images_count > 0 :
1323- msg += f" Images extracted: { images_count } \n "
1324- msg += f" Images saved to: { os .path .join (os .path .dirname (result ['output_file' ]), 'images' )} \n "
1325- else :
1326- msg += " No images found in document\n "
1327-
1328- # 显示Markdown内容的前几行作为预览
1329- content_lines = result ["markdown_content" ].split ("\n " )
1330- preview_lines = content_lines [:5 ]
1331- if len (content_lines ) > 5 :
1332- preview_lines .append ("..." )
1333-
1334- msg += "\n [PREVIEW] First few lines of converted Markdown:\n "
1335- for line in preview_lines :
1336- msg += f" { line } \n "
1337- else :
1338- msg = "[ERROR] Conversion failed!\n "
1339- msg += f" Error: { result ['error' ]} "
1340-
1341- return msg
1237+ # @mcp.tool()
1238+ # async def convert_document_to_markdown(
1239+ # file_path: str, output_path: Optional[str] = None, extract_images: bool = True
1240+ # ) -> str:
1241+ # """
1242+ # Convert a document to Markdown format with image extraction support.
1243+
1244+ # Supports both local files and URLs. Uses docling for advanced conversion with image extraction,
1245+ # or falls back to PyPDF2 for simple PDF text extraction.
1246+
1247+ # Args:
1248+ # file_path: Path to the input document file or URL (supports PDF, DOCX, PPTX, HTML, TXT, MD)
1249+ # output_path: Path for the output Markdown file (optional, auto-generated if not provided)
1250+ # extract_images: Whether to extract images from the document (default: True)
1251+
1252+ # Returns:
1253+ # Status message about the conversion operation with preview of converted content
1254+
1255+ # Examples:
1256+ # - "convert_document_to_markdown('paper.pdf')"
1257+ # - "convert_document_to_markdown('https://example.com/doc.pdf', 'output.md')"
1258+ # - "convert_document_to_markdown('presentation.pptx', extract_images=False)"
1259+ # """
1260+ # # 检查是否为URL
1261+ # is_url_input = False
1262+ # try:
1263+ # parsed = urlparse(file_path)
1264+ # is_url_input = parsed.scheme in ("http", "https")
1265+ # except Exception:
1266+ # is_url_input = False
1267+
1268+ # # 检查文件是否存在(如果不是URL)
1269+ # if not is_url_input and not os.path.exists(file_path):
1270+ # return f"[ERROR] Input file not found: {file_path}"
1271+
1272+ # # 检查是否是PDF文件,优先使用简单转换器(仅对本地文件)
1273+ # if (
1274+ # not is_url_input
1275+ # and file_path.lower().endswith(".pdf")
1276+ # and PYPDF2_AVAILABLE
1277+ # and not extract_images
1278+ # ):
1279+ # try:
1280+ # simple_converter = SimplePdfConverter()
1281+ # result = simple_converter.convert_pdf_to_markdown(file_path, output_path)
1282+ # except Exception as e:
1283+ # return f"[ERROR] PDF conversion error: {str(e)}"
1284+ # elif DOCLING_AVAILABLE:
1285+ # try:
1286+ # converter = DoclingConverter()
1287+
1288+ # # 检查文件格式是否支持
1289+ # if not is_url_input and not converter.is_supported_format(file_path):
1290+ # supported_formats = [".pdf", ".docx", ".pptx", ".html", ".md", ".txt"]
1291+ # return f"[ERROR] Unsupported file format. Supported formats: {', '.join(supported_formats)}"
1292+ # elif is_url_input and not file_path.lower().endswith(
1293+ # (".pdf", ".docx", ".pptx", ".html", ".md", ".txt")
1294+ # ):
1295+ # return f"[ERROR] Unsupported URL format: {file_path}"
1296+
1297+ # # 执行转换(支持图片提取)
1298+ # result = converter.convert_to_markdown(
1299+ # file_path, output_path, extract_images
1300+ # )
1301+ # except Exception as e:
1302+ # return f"[ERROR] Docling conversion error: {str(e)}"
1303+ # else:
1304+ # return (
1305+ # "[ERROR] No conversion tools available. Please install docling or PyPDF2."
1306+ # )
1307+
1308+ # if result["success"]:
1309+ # msg = "[SUCCESS] Document converted successfully!\n"
1310+ # msg += f" Input: {result['input_file']}\n"
1311+ # msg += f" Output file: {result['output_file']}\n"
1312+ # msg += f" Conversion time: {result['duration']:.2f} seconds\n"
1313+
1314+ # if result["input_size"] > 0:
1315+ # msg += f" Original size: {result['input_size'] / 1024:.1f} KB\n"
1316+ # msg += f" Markdown size: {result['output_size'] / 1024:.1f} KB\n"
1317+
1318+ # # 显示图片提取信息
1319+ # if extract_images and "images_extracted" in result:
1320+ # images_count = result["images_extracted"]
1321+ # if images_count > 0:
1322+ # msg += f" Images extracted: {images_count}\n"
1323+ # msg += f" Images saved to: {os.path.join(os.path.dirname(result['output_file']), 'images')}\n"
1324+ # else:
1325+ # msg += " No images found in document\n"
1326+
1327+ # # 显示Markdown内容的前几行作为预览
1328+ # content_lines = result["markdown_content"].split("\n")
1329+ # preview_lines = content_lines[:5]
1330+ # if len(content_lines) > 5:
1331+ # preview_lines.append("...")
1332+
1333+ # msg += "\n[PREVIEW] First few lines of converted Markdown:\n"
1334+ # for line in preview_lines:
1335+ # msg += f" {line}\n"
1336+ # else:
1337+ # msg = "[ERROR] Conversion failed!\n"
1338+ # msg += f" Error: {result['error']}"
1339+
1340+ # return msg
13421341
13431342
13441343if __name__ == "__main__" :
0 commit comments