Skip to content

Commit 645375e

Browse files
committed
fix file transfer bugs
1 parent 7690d37 commit 645375e

File tree

1 file changed

+133
-134
lines changed

1 file changed

+133
-134
lines changed

tools/pdf_downloader.py

Lines changed: 133 additions & 134 deletions
Original file line number | Diff line number | Diff line change
@@ -100,11 +100,10 @@ async def perform_document_conversion(
100100
if not file_path:
101101
return None
102102

103-
conversion_success = False
104103
conversion_msg = ""
105104

106105
# 首先尝试使用简单的PDF转换器(对于PDF文件)
107-
if file_path.lower().endswith(".pdf") and PYPDF2_AVAILABLE and not extract_images:
106+
if file_path.lower().endswith(".pdf") and PYPDF2_AVAILABLE:
108107
try:
109108
simple_converter = SimplePdfConverter()
110109
conversion_result = simple_converter.convert_pdf_to_markdown(file_path)
@@ -119,40 +118,40 @@ async def perform_document_conversion(
119118
conversion_msg += (
120119
f"\n Pages extracted: {conversion_result['pages_extracted']}"
121120
)
122-
conversion_success = True
121+
123122
else:
124123
conversion_msg = f"\n [WARNING] PDF conversion failed: {conversion_result['error']}"
125124
except Exception as conv_error:
126125
conversion_msg = f"\n [WARNING] PDF conversion error: {str(conv_error)}"
127126

128127
# 如果简单转换失败,尝试使用docling(支持图片提取)
129-
if not conversion_success and DOCLING_AVAILABLE:
130-
try:
131-
converter = DoclingConverter()
132-
if converter.is_supported_format(file_path):
133-
conversion_result = converter.convert_to_markdown(
134-
file_path, extract_images=extract_images
135-
)
136-
if conversion_result["success"]:
137-
conversion_msg = (
138-
"\n [INFO] Document converted to Markdown (docling)"
139-
)
140-
conversion_msg += (
141-
f"\n Markdown file: {conversion_result['output_file']}"
142-
)
143-
conversion_msg += f"\n Conversion time: {conversion_result['duration']:.2f} seconds"
144-
if conversion_result.get("images_extracted", 0) > 0:
145-
conversion_msg += f"\n Images extracted: {conversion_result['images_extracted']}"
146-
images_dir = os.path.join(
147-
os.path.dirname(conversion_result["output_file"]), "images"
148-
)
149-
conversion_msg += f"\n Images saved to: {images_dir}"
150-
else:
151-
conversion_msg = f"\n [WARNING] Docling conversion failed: {conversion_result['error']}"
152-
except Exception as conv_error:
153-
conversion_msg = (
154-
f"\n [WARNING] Docling conversion error: {str(conv_error)}"
155-
)
128+
# if not conversion_success and DOCLING_AVAILABLE:
129+
# try:
130+
# converter = DoclingConverter()
131+
# if converter.is_supported_format(file_path):
132+
# conversion_result = converter.convert_to_markdown(
133+
# file_path, extract_images=extract_images
134+
# )
135+
# if conversion_result["success"]:
136+
# conversion_msg = (
137+
# "\n [INFO] Document converted to Markdown (docling)"
138+
# )
139+
# conversion_msg += (
140+
# f"\n Markdown file: {conversion_result['output_file']}"
141+
# )
142+
# conversion_msg += f"\n Conversion time: {conversion_result['duration']:.2f} seconds"
143+
# if conversion_result.get("images_extracted", 0) > 0:
144+
# conversion_msg += f"\n Images extracted: {conversion_result['images_extracted']}"
145+
# images_dir = os.path.join(
146+
# os.path.dirname(conversion_result["output_file"]), "images"
147+
# )
148+
# conversion_msg += f"\n Images saved to: {images_dir}"
149+
# else:
150+
# conversion_msg = f"\n [WARNING] Docling conversion failed: {conversion_result['error']}"
151+
# except Exception as conv_error:
152+
# conversion_msg = (
153+
# f"\n [WARNING] Docling conversion error: {str(conv_error)}"
154+
# )
156155

157156
return conversion_msg if conversion_msg else None
158157

@@ -1235,110 +1234,110 @@ async def move_file_to(
12351234
return msg + f"[ERROR] Move failed!\n Error: {result['error']}"
12361235

12371236

1238-
@mcp.tool()
1239-
async def convert_document_to_markdown(
1240-
file_path: str, output_path: Optional[str] = None, extract_images: bool = True
1241-
) -> str:
1242-
"""
1243-
Convert a document to Markdown format with image extraction support.
1244-
1245-
Supports both local files and URLs. Uses docling for advanced conversion with image extraction,
1246-
or falls back to PyPDF2 for simple PDF text extraction.
1247-
1248-
Args:
1249-
file_path: Path to the input document file or URL (supports PDF, DOCX, PPTX, HTML, TXT, MD)
1250-
output_path: Path for the output Markdown file (optional, auto-generated if not provided)
1251-
extract_images: Whether to extract images from the document (default: True)
1252-
1253-
Returns:
1254-
Status message about the conversion operation with preview of converted content
1255-
1256-
Examples:
1257-
- "convert_document_to_markdown('paper.pdf')"
1258-
- "convert_document_to_markdown('https://example.com/doc.pdf', 'output.md')"
1259-
- "convert_document_to_markdown('presentation.pptx', extract_images=False)"
1260-
"""
1261-
# 检查是否为URL
1262-
is_url_input = False
1263-
try:
1264-
parsed = urlparse(file_path)
1265-
is_url_input = parsed.scheme in ("http", "https")
1266-
except Exception:
1267-
is_url_input = False
1268-
1269-
# 检查文件是否存在(如果不是URL)
1270-
if not is_url_input and not os.path.exists(file_path):
1271-
return f"[ERROR] Input file not found: {file_path}"
1272-
1273-
# 检查是否是PDF文件,优先使用简单转换器(仅对本地文件)
1274-
if (
1275-
not is_url_input
1276-
and file_path.lower().endswith(".pdf")
1277-
and PYPDF2_AVAILABLE
1278-
and not extract_images
1279-
):
1280-
try:
1281-
simple_converter = SimplePdfConverter()
1282-
result = simple_converter.convert_pdf_to_markdown(file_path, output_path)
1283-
except Exception as e:
1284-
return f"[ERROR] PDF conversion error: {str(e)}"
1285-
elif DOCLING_AVAILABLE:
1286-
try:
1287-
converter = DoclingConverter()
1288-
1289-
# 检查文件格式是否支持
1290-
if not is_url_input and not converter.is_supported_format(file_path):
1291-
supported_formats = [".pdf", ".docx", ".pptx", ".html", ".md", ".txt"]
1292-
return f"[ERROR] Unsupported file format. Supported formats: {', '.join(supported_formats)}"
1293-
elif is_url_input and not file_path.lower().endswith(
1294-
(".pdf", ".docx", ".pptx", ".html", ".md", ".txt")
1295-
):
1296-
return f"[ERROR] Unsupported URL format: {file_path}"
1297-
1298-
# 执行转换(支持图片提取)
1299-
result = converter.convert_to_markdown(
1300-
file_path, output_path, extract_images
1301-
)
1302-
except Exception as e:
1303-
return f"[ERROR] Docling conversion error: {str(e)}"
1304-
else:
1305-
return (
1306-
"[ERROR] No conversion tools available. Please install docling or PyPDF2."
1307-
)
1308-
1309-
if result["success"]:
1310-
msg = "[SUCCESS] Document converted successfully!\n"
1311-
msg += f" Input: {result['input_file']}\n"
1312-
msg += f" Output file: {result['output_file']}\n"
1313-
msg += f" Conversion time: {result['duration']:.2f} seconds\n"
1314-
1315-
if result["input_size"] > 0:
1316-
msg += f" Original size: {result['input_size'] / 1024:.1f} KB\n"
1317-
msg += f" Markdown size: {result['output_size'] / 1024:.1f} KB\n"
1318-
1319-
# 显示图片提取信息
1320-
if extract_images and "images_extracted" in result:
1321-
images_count = result["images_extracted"]
1322-
if images_count > 0:
1323-
msg += f" Images extracted: {images_count}\n"
1324-
msg += f" Images saved to: {os.path.join(os.path.dirname(result['output_file']), 'images')}\n"
1325-
else:
1326-
msg += " No images found in document\n"
1327-
1328-
# 显示Markdown内容的前几行作为预览
1329-
content_lines = result["markdown_content"].split("\n")
1330-
preview_lines = content_lines[:5]
1331-
if len(content_lines) > 5:
1332-
preview_lines.append("...")
1333-
1334-
msg += "\n[PREVIEW] First few lines of converted Markdown:\n"
1335-
for line in preview_lines:
1336-
msg += f" {line}\n"
1337-
else:
1338-
msg = "[ERROR] Conversion failed!\n"
1339-
msg += f" Error: {result['error']}"
1340-
1341-
return msg
1237+
# @mcp.tool()
1238+
# async def convert_document_to_markdown(
1239+
# file_path: str, output_path: Optional[str] = None, extract_images: bool = True
1240+
# ) -> str:
1241+
# """
1242+
# Convert a document to Markdown format with image extraction support.
1243+
1244+
# Supports both local files and URLs. Uses docling for advanced conversion with image extraction,
1245+
# or falls back to PyPDF2 for simple PDF text extraction.
1246+
1247+
# Args:
1248+
# file_path: Path to the input document file or URL (supports PDF, DOCX, PPTX, HTML, TXT, MD)
1249+
# output_path: Path for the output Markdown file (optional, auto-generated if not provided)
1250+
# extract_images: Whether to extract images from the document (default: True)
1251+
1252+
# Returns:
1253+
# Status message about the conversion operation with preview of converted content
1254+
1255+
# Examples:
1256+
# - "convert_document_to_markdown('paper.pdf')"
1257+
# - "convert_document_to_markdown('https://example.com/doc.pdf', 'output.md')"
1258+
# - "convert_document_to_markdown('presentation.pptx', extract_images=False)"
1259+
# """
1260+
# # 检查是否为URL
1261+
# is_url_input = False
1262+
# try:
1263+
# parsed = urlparse(file_path)
1264+
# is_url_input = parsed.scheme in ("http", "https")
1265+
# except Exception:
1266+
# is_url_input = False
1267+
1268+
# # 检查文件是否存在(如果不是URL)
1269+
# if not is_url_input and not os.path.exists(file_path):
1270+
# return f"[ERROR] Input file not found: {file_path}"
1271+
1272+
# # 检查是否是PDF文件,优先使用简单转换器(仅对本地文件)
1273+
# if (
1274+
# not is_url_input
1275+
# and file_path.lower().endswith(".pdf")
1276+
# and PYPDF2_AVAILABLE
1277+
# and not extract_images
1278+
# ):
1279+
# try:
1280+
# simple_converter = SimplePdfConverter()
1281+
# result = simple_converter.convert_pdf_to_markdown(file_path, output_path)
1282+
# except Exception as e:
1283+
# return f"[ERROR] PDF conversion error: {str(e)}"
1284+
# elif DOCLING_AVAILABLE:
1285+
# try:
1286+
# converter = DoclingConverter()
1287+
1288+
# # 检查文件格式是否支持
1289+
# if not is_url_input and not converter.is_supported_format(file_path):
1290+
# supported_formats = [".pdf", ".docx", ".pptx", ".html", ".md", ".txt"]
1291+
# return f"[ERROR] Unsupported file format. Supported formats: {', '.join(supported_formats)}"
1292+
# elif is_url_input and not file_path.lower().endswith(
1293+
# (".pdf", ".docx", ".pptx", ".html", ".md", ".txt")
1294+
# ):
1295+
# return f"[ERROR] Unsupported URL format: {file_path}"
1296+
1297+
# # 执行转换(支持图片提取)
1298+
# result = converter.convert_to_markdown(
1299+
# file_path, output_path, extract_images
1300+
# )
1301+
# except Exception as e:
1302+
# return f"[ERROR] Docling conversion error: {str(e)}"
1303+
# else:
1304+
# return (
1305+
# "[ERROR] No conversion tools available. Please install docling or PyPDF2."
1306+
# )
1307+
1308+
# if result["success"]:
1309+
# msg = "[SUCCESS] Document converted successfully!\n"
1310+
# msg += f" Input: {result['input_file']}\n"
1311+
# msg += f" Output file: {result['output_file']}\n"
1312+
# msg += f" Conversion time: {result['duration']:.2f} seconds\n"
1313+
1314+
# if result["input_size"] > 0:
1315+
# msg += f" Original size: {result['input_size'] / 1024:.1f} KB\n"
1316+
# msg += f" Markdown size: {result['output_size'] / 1024:.1f} KB\n"
1317+
1318+
# # 显示图片提取信息
1319+
# if extract_images and "images_extracted" in result:
1320+
# images_count = result["images_extracted"]
1321+
# if images_count > 0:
1322+
# msg += f" Images extracted: {images_count}\n"
1323+
# msg += f" Images saved to: {os.path.join(os.path.dirname(result['output_file']), 'images')}\n"
1324+
# else:
1325+
# msg += " No images found in document\n"
1326+
1327+
# # 显示Markdown内容的前几行作为预览
1328+
# content_lines = result["markdown_content"].split("\n")
1329+
# preview_lines = content_lines[:5]
1330+
# if len(content_lines) > 5:
1331+
# preview_lines.append("...")
1332+
1333+
# msg += "\n[PREVIEW] First few lines of converted Markdown:\n"
1334+
# for line in preview_lines:
1335+
# msg += f" {line}\n"
1336+
# else:
1337+
# msg = "[ERROR] Conversion failed!\n"
1338+
# msg += f" Error: {result['error']}"
1339+
1340+
# return msg
13421341

13431342

13441343
if __name__ == "__main__":

0 commit comments

Comments (0)