diff --git a/user/apps/mineru-mcp-dragonos/.gitignore b/user/apps/mineru-mcp-dragonos/.gitignore new file mode 100644 index 000000000..1ac354611 --- /dev/null +++ b/user/apps/mineru-mcp-dragonos/.gitignore @@ -0,0 +1,3 @@ +/target +Cargo.lock +/install/ \ No newline at end of file diff --git a/user/apps/mineru-mcp-dragonos/.mineru.env b/user/apps/mineru-mcp-dragonos/.mineru.env new file mode 100644 index 000000000..0e1a46bd3 --- /dev/null +++ b/user/apps/mineru-mcp-dragonos/.mineru.env @@ -0,0 +1,2 @@ +export MCP_PORT=8080 +export MINERU_API_KEY=your-api-key \ No newline at end of file diff --git a/user/apps/mineru-mcp-dragonos/Cargo.toml b/user/apps/mineru-mcp-dragonos/Cargo.toml new file mode 100644 index 000000000..beaaa0ed1 --- /dev/null +++ b/user/apps/mineru-mcp-dragonos/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "mineru-mcp-dragonos" +version = "0.1.0" +edition = "2021" +description = "no" +authors = [ "yuming " ] + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +axum = "0.7" +bytes = "1.10.1" +chrono = { version = "0.4", features = ["serde"] } +reqwest = { version = "0.12.12", default-features = false, features = ["json", "rustls-tls"] } +rmcp = { version = "0.14.0", features = ["transport-streamable-http-server"] } +schemars = "1.2.0" +serde = { version = "1.0.226", features = ["derive"] } +serde_json = "1.0.145" +thiserror = "1.0.69" +tokio = { version = "1.47.1", features = ["fs", "macros", "rt-multi-thread", "time", "sync"] } +tokio-util = { version = "0.7", features = ["rt"] } +tracing = "0.1.41" 
+tracing-subscriber = { version = "0.3.20", features = ["env-filter", "fmt"] } +uuid = { version = "1.18.1", features = ["v4"] } +walkdir = "2.5.0" +zip = "0.6.6" + +[dev-dependencies] +tempfile = "3.23.0" +wiremock = "0.6.5" \ No newline at end of file diff --git a/user/apps/mineru-mcp-dragonos/Makefile b/user/apps/mineru-mcp-dragonos/Makefile new file mode 100644 index 000000000..7522ea16c --- /dev/null +++ b/user/apps/mineru-mcp-dragonos/Makefile @@ -0,0 +1,56 @@ +TOOLCHAIN= +RUSTFLAGS= + +ifdef DADK_CURRENT_BUILD_DIR +# 如果是在dadk中编译,那么安装到dadk的安装目录中 + INSTALL_DIR = $(DADK_CURRENT_BUILD_DIR) +else +# 如果是在本地编译,那么安装到当前目录下的install目录中 + INSTALL_DIR = ./install +endif + +ifeq ($(ARCH), x86_64) + export RUST_TARGET=x86_64-unknown-linux-musl +else ifeq ($(ARCH), riscv64) + export RUST_TARGET=riscv64gc-unknown-linux-gnu +else +# 默认为x86_64,用于本地编译 + export RUST_TARGET=x86_64-unknown-linux-musl +endif + +run: + RUSTFLAGS=$(RUSTFLAGS) cargo $(TOOLCHAIN) run --target $(RUST_TARGET) + +build: + RUSTFLAGS=$(RUSTFLAGS) cargo $(TOOLCHAIN) build --target $(RUST_TARGET) + +clean: + RUSTFLAGS=$(RUSTFLAGS) cargo $(TOOLCHAIN) clean --target $(RUST_TARGET) + +test: + RUSTFLAGS=$(RUSTFLAGS) cargo $(TOOLCHAIN) test --target $(RUST_TARGET) + +doc: + RUSTFLAGS=$(RUSTFLAGS) cargo $(TOOLCHAIN) doc --target $(RUST_TARGET) + +fmt: + RUSTFLAGS=$(RUSTFLAGS) cargo $(TOOLCHAIN) fmt + +fmt-check: + RUSTFLAGS=$(RUSTFLAGS) cargo $(TOOLCHAIN) fmt --check + +run-release: + RUSTFLAGS=$(RUSTFLAGS) cargo $(TOOLCHAIN) run --target $(RUST_TARGET) --release + +build-release: + RUSTFLAGS=$(RUSTFLAGS) cargo $(TOOLCHAIN) build --target $(RUST_TARGET) --release + +clean-release: + RUSTFLAGS=$(RUSTFLAGS) cargo $(TOOLCHAIN) clean --target $(RUST_TARGET) --release + +test-release: + RUSTFLAGS=$(RUSTFLAGS) cargo $(TOOLCHAIN) test --target $(RUST_TARGET) --release + +.PHONY: install +install: + RUSTFLAGS=$(RUSTFLAGS) cargo $(TOOLCHAIN) install --target $(RUST_TARGET) --path . 
--no-track --root $(INSTALL_DIR) --force diff --git a/user/apps/mineru-mcp-dragonos/Prompt.md b/user/apps/mineru-mcp-dragonos/Prompt.md new file mode 100644 index 000000000..99203ccb7 --- /dev/null +++ b/user/apps/mineru-mcp-dragonos/Prompt.md @@ -0,0 +1,90 @@ +你是资深 Rust 工程师。请在一个全新的 Cargo 项目中实现一个 MCP stdio server(用于 DragonOS 环境),功能等价于 “mineru-mcp”。 + +硬性要求 +- 语言:Rust(tokio async) +- MCP SDK:使用官方 Rust SDK rmcp,stdio transport +- 提供两个 tools: + 1) parse_documents + 2) get_ocr_languages +- 代码需可编译、可运行、可测试:提供单元/集成测试(不依赖真实 MinerU 线上服务和真实 API key) + +MCP 工具规格 +1) parse_documents +- 入参(用 JSON schema 暴露给 MCP): + - file_sources: string (一个或多个来源,逗号/空格/换行分隔;每个来源要么是 URL,要么是本地文件路径) + - enable_ocr: bool = false + - language: string = "ch" + - page_ranges: string? (仅远程 URL/远程上传模式支持) +- 行为: + - 解析 file_sources,分成 urls 与 local_paths + - 如果 USE_LOCAL_API=true:忽略 urls,只处理 local_paths(行为需与 mineru-mcp 一致) + - 如果 USE_LOCAL_API=false:同时处理 urls 与 local_paths + - 对每个 source 执行 MinerU 解析链路(见“MinerU API 规格”) + - 下载 full_zip_url 的 zip,解压到 OUTPUT_DIR 下独立目录 + - 在解压目录中递归查找 md(优先:与输入文件名同名;否则第一个 .md),读取内容 +- 返回值(结构化 JSON,便于测试): + { + "results": [ + { + "source": "...", + "mode": "remote_url|remote_upload|local_api", + "task_id": "...?" , + "batch_id": "...?" , + "markdown": "...", + "output_dir": "...", + "assets": ["images/xxx.jpg", ...] 
+ } + ] + } +返回值要求(强兼容模式):parse_documents 必须返回 JSON,且 JSON 的字段结构必须与官方 Python mineru-mcp 一致。请先阅读找到 parse_documents 的返回值结构(字段名/层级/类型),在 Rust 中用 serde 定义对应 struct 并严格序列化一致;测试用例需断言返回 JSON 的字段结构与样例一致。 + +2) get_ocr_languages +- 返回 MinerU 支持的 OCR 语言列表(至少包含 ch/en 等常用项),并附上 PaddleOCR 多语言列表链接: + https://www.paddleocr.ai/latest/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html + +环境变量(需实现读取与默认值) +- MINERU_API_BASE 默认 https://mineru.net +- MINERU_API_KEY 必填(远程模式下) +- OUTPUT_DIR 默认 ./downloads +- USE_LOCAL_API 默认 false +- LOCAL_MINERU_API_BASE 默认 http://localhost:8080 + +MinerU API 规格(远程) +A) URL 模式 +- POST {MINERU_API_BASE}/api/v4/extract/task + body: { url, model_version, is_ocr?, enable_formula?, enable_table?, language?, page_ranges? } +- GET {MINERU_API_BASE}/api/v4/extract/task/{task_id} + 轮询直到 state=done,取 data.full_zip_url + +B) 本地文件上传模式 +- POST {MINERU_API_BASE}/api/v4/file-urls/batch + body: { files:[{name,data_id?,is_ocr?,page_ranges?}], model_version, enable_formula?, enable_table?, language? 
} + -> 返回 batch_id + file_urls[] +- PUT file_urls[i] 上传文件 bytes +- GET {MINERU_API_BASE}/api/v4/extract-results/batch/{batch_id} + 轮询每个文件直到 state=done,取 full_zip_url + +实现建议 +- HTTP client 用 reqwest;加 timeout、重试(对 5xx/网络错误),轮询间隔可配置(默认 2s),最大等待 10min +- zip 解压用 zip crate;目录操作用 std::fs / walkdir +- 日志用 tracing + +测试要求(关键) +- 使用 wiremock/httpmock 在测试里模拟 MinerU API: + - mock POST /api/v4/extract/task -> 返回 task_id + - mock GET /api/v4/extract/task/{id} -> 先返回 running 再返回 done + full_zip_url + - mock GET full_zip_url -> 返回你在测试里动态生成的 zip(包含 1 个 markdown 文件和 images/ 目录) +- 测试 parse_documents 能正确: + - 解析多个 file_sources + - 轮询并下载 zip + - 解压并找到 md + - 返回结构化 results,markdown 字段匹配预期 +- 提供 README:如何设置 env、如何 cargo run、如何与 MCP client 对接 + +交付物 +- Cargo.toml / src/main.rs(或模块化) +- tests/…(可直接 cargo test) +- README.md + +----- +参考的仓库:[mineru-mcp](https://github.com/linxule/mineru-mcp?tab=readme-ov-file) \ No newline at end of file diff --git a/user/apps/mineru-mcp-dragonos/README.md b/user/apps/mineru-mcp-dragonos/README.md new file mode 100644 index 000000000..acdb5cc3a --- /dev/null +++ b/user/apps/mineru-mcp-dragonos/README.md @@ -0,0 +1,71 @@ +# mineru-mcp (Rust) + +该项目在 **Rust + Tokio** 上实现 MinerU MCP stdio server,功能对齐官方 `mineru-mcp` 的 `parse_documents` 与 `get_ocr_languages`。 + +## 功能 + +- MCP stdio server(基于 `rmcp`) +- 支持 URL 与本地文件解析 +- 可选择远程 MinerU API 或本地部署 API +- 自动下载解析结果 zip、解压并读取 Markdown + +## 环境变量 + +| 变量 | 默认值 | 说明 | +| --- | --- | --- | +| `MINERU_API_BASE` | `https://mineru.net` | 远程 MinerU API 基址 | +| `MINERU_API_KEY` | (必填,远程模式) | 远程 API Key | +| `OUTPUT_DIR` | `./downloads` | 解压输出目录 | +| `USE_LOCAL_API` | `false` | 是否启用本地 API | +| `LOCAL_MINERU_API_BASE` | `http://localhost:8080` | 本地 API 基址 | +| `MINERU_POLL_INTERVAL_SECS` | `2` | 轮询间隔(秒) | +| `MINERU_MAX_WAIT_SECS` | `600` | 最大等待时间(秒) | + +## 运行 + +```bash +cd user/apps/mineru-mcp-dragonos +export MINERU_API_KEY=your-api-key +cargo run +``` + +默认通过 stdio transport 提供 MCP 服务,可直接被 MCP client 启动/托管。 + +### MCP Client 
对接示例(Claude Desktop) + +```json +{ + "mcpServers": { + "mineru": { + "command": "cargo", + "args": ["run", "--manifest-path", "Availiable_Mcp/mineru-mcp/Cargo.toml"], + "env": { + "MINERU_API_KEY": "your-api-key" + } + } + } +} +``` + +## 工具 + +### parse_documents + +入参: + +- `file_sources`: 以逗号/空格/换行分隔的 URL 或本地路径 +- `enable_ocr`: 是否启用 OCR(默认 `false`) +- `language`: 语言(默认 `ch`) +- `page_ranges`: 页码范围(可选) + +返回:与官方 Python `mineru-mcp` 保持一致的 JSON 结构(单结果或批量结果)。 + +### get_ocr_languages + +返回 OCR 语言列表,并附带 PaddleOCR 多语言支持链接。 + +## 测试 + +```bash +cargo test +``` diff --git a/user/apps/mineru-mcp-dragonos/check_health.sh b/user/apps/mineru-mcp-dragonos/check_health.sh new file mode 100755 index 000000000..df25cbb17 --- /dev/null +++ b/user/apps/mineru-mcp-dragonos/check_health.sh @@ -0,0 +1,148 @@ +#!/bin/bash +# MinerU MCP DragonOS 健康检查脚本 +# 用法: ./check_health.sh [port] + +set -e + +# 配置 +PORT="${1:-8080}" +HOST="127.0.0.1" +URL="http://${HOST}:${PORT}/health" +TIMEOUT=5 + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# 辅助函数 +print_green() { printf '%b\n' "${GREEN}$1${NC}"; } +print_red() { printf '%b\n' "${RED}$1${NC}"; } +print_yellow() { printf '%b\n' "${YELLOW}$1${NC}"; } + +echo "=== MinerU MCP DragonOS 健康检查 ===" +echo "目标: ${URL}" +echo "" + +# 检查端口是否在监听 +check_port() { + ss -tlnp 2>/dev/null | grep -q ":${PORT} " +} + +# 发送健康检查请求 +check_health() { + # 使用 --noproxy 绕过系统代理 + local response http_code body + response=$(curl --noproxy '*' -s -w "\n%{http_code}" --max-time "${TIMEOUT}" "${URL}" 2>&1) + http_code=$(echo "$response" | tail -n 1) + body=$(echo "$response" | sed '$d') + + if [ "${http_code}" = "200" ]; then + print_green "✓ HTTP 状态码: ${http_code}" + echo "" + echo "响应内容:" + echo "$body" | python3 -m json.tool 2>/dev/null || echo "$body" + echo "" + validate_response "$body" + return $? 
+ else + print_red "✗ HTTP 状态码: ${http_code}" + echo "响应: ${body}" + return 1 + fi +} + +# 验证响应内容 +validate_response() { + local body="$1" + local errors=0 + + echo "字段验证:" + + # 检查 status 字段 + local status + status=$(echo "$body" | python3 -c "import sys,json; print(json.load(sys.stdin).get('status',''))" 2>/dev/null) + if [ "$status" = "ok" ]; then + print_green " ✓ status: ok" + else + print_red " ✗ status: ${status} (期望: ok)" + ((errors++)) + fi + + # 检查 server 字段 + local server + server=$(echo "$body" | python3 -c "import sys,json; print(json.load(sys.stdin).get('server',''))" 2>/dev/null) + if [ "$server" = "mineru-mcp-dragonos" ]; then + print_green " ✓ server: ${server}" + else + print_yellow " ? server: ${server} (期望: mineru-mcp-dragonos)" + fi + + # 检查 api_mode 字段 + local api_mode + api_mode=$(echo "$body" | python3 -c "import sys,json; print(json.load(sys.stdin).get('api_mode',''))" 2>/dev/null) + if [ "$api_mode" = "local" ] || [ "$api_mode" = "remote" ]; then + print_green " ✓ api_mode: ${api_mode}" + else + print_red " ✗ api_mode: ${api_mode} (期望: local 或 remote)" + ((errors++)) + fi + + # 检查 has_api_key 字段 + local has_api_key + has_api_key=$(echo "$body" | python3 -c "import sys,json; print(json.load(sys.stdin).get('has_api_key',''))" 2>/dev/null) + if [ "$has_api_key" = "True" ] || [ "$has_api_key" = "true" ]; then + print_green " ✓ has_api_key: true" + else + print_yellow " ? has_api_key: false" + fi + + # 检查 version 字段 + local version + version=$(echo "$body" | python3 -c "import sys,json; print(json.load(sys.stdin).get('version',''))" 2>/dev/null) + if [ -n "$version" ]; then + print_green " ✓ version: ${version}" + else + print_red " ✗ version: 缺失" + ((errors++)) + fi + + return "$errors" +} + +# 主逻辑 +main() { + # 1. 检查端口 + printf "检查端口 %s... 
" "$PORT" + if check_port; then + local pid + pid=$(ss -tlnp 2>/dev/null | grep ":${PORT} " | grep -oP 'pid=\K[0-9]+' | head -1) + print_green "已监听 (pid: ${pid:-unknown})" + else + print_red "未监听" + echo "" + print_red "错误: 服务未在端口 ${PORT} 启动" + echo "请先启动服务: source .mineru.env && cargo run --release" + exit 1 + fi + echo "" + + # 2. 发送健康检查请求 + echo "发送健康检查请求..." + if check_health; then + echo "" + print_green "========================================" + print_green "✓ 健康检查通过" + print_green "========================================" + exit 0 + else + echo "" + print_red "========================================" + print_red "✗ 健康检查失败" + print_red "========================================" + exit 1 + fi +} + +main \ No newline at end of file diff --git a/user/apps/mineru-mcp-dragonos/src/lib.rs b/user/apps/mineru-mcp-dragonos/src/lib.rs new file mode 100644 index 000000000..4a261b7e4 --- /dev/null +++ b/user/apps/mineru-mcp-dragonos/src/lib.rs @@ -0,0 +1,1088 @@ +use bytes::Bytes; +use rmcp::{ + ErrorData as McpError, Json, ServerHandler, + handler::server::tool::ToolRouter, + handler::server::wrapper::Parameters, + model::{ServerCapabilities, ServerInfo}, + tool, tool_handler, tool_router, + transport::streamable_http_server::{ + StreamableHttpServerConfig, + StreamableHttpService, + session::local::LocalSessionManager, + }, +}; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use std::{ + collections::{HashMap, HashSet}, + env, + path::{Path, PathBuf}, + sync::Arc, + time::{Duration, Instant}, +}; +use thiserror::Error; +use tokio::time::sleep; +use tokio_util::sync::CancellationToken; +use tracing::{error, info, warn}; +use uuid::Uuid; +use walkdir::WalkDir; + +// HTTP 相关 +use axum::{ + extract::State, + response::Json as AxumJson, + routing::get, + Router, +}; +use std::sync::atomic::{AtomicBool, Ordering}; + +#[derive(Debug, Clone)] +pub struct Settings { + mineru_api_base: String, + mineru_api_key: Option, + output_dir: PathBuf, + 
use_local_api: bool, + local_mineru_api_base: String, + poll_interval: Duration, + max_wait: Duration, +} + +impl Settings { + pub fn from_env() -> Self { + let mineru_api_base = + env::var("MINERU_API_BASE").unwrap_or_else(|_| "https://mineru.net".to_string()); + let mineru_api_key = env::var("MINERU_API_KEY").ok().filter(|v| !v.is_empty()); + let output_dir = env::var("OUTPUT_DIR").unwrap_or_else(|_| "./downloads".to_string()); + let use_local_api = env::var("USE_LOCAL_API") + .ok() + .map(|value| matches!(value.to_lowercase().as_str(), "true" | "1" | "yes")) + .unwrap_or(false); + let local_mineru_api_base = env::var("LOCAL_MINERU_API_BASE") + .unwrap_or_else(|_| "http://localhost:8080".to_string()); + let poll_interval = env::var("MINERU_POLL_INTERVAL_SECS") + .ok() + .and_then(|v| v.parse::().ok()) + .map(Duration::from_secs) + .unwrap_or(Duration::from_secs(2)); + let max_wait = env::var("MINERU_MAX_WAIT_SECS") + .ok() + .and_then(|v| v.parse::().ok()) + .map(Duration::from_secs) + .unwrap_or(Duration::from_secs(600)); + + Self { + mineru_api_base, + mineru_api_key, + output_dir: PathBuf::from(output_dir), + use_local_api, + local_mineru_api_base, + poll_interval, + max_wait, + } + } +} + +#[derive(Debug, Error)] +pub enum MineruError { + #[error("missing MINERU_API_KEY for remote requests")] + MissingApiKey, + #[error("http error: {0}")] + Http(#[from] reqwest::Error), + #[error("io error: {0}")] + Io(#[from] std::io::Error), + #[error("zip error: {0}")] + Zip(#[from] zip::result::ZipError), + #[error("response error: {0}")] + Response(String), + #[error("timeout waiting for task completion")] + Timeout, + #[error("missing markdown content in output directory")] + MissingMarkdown, +} + +#[derive(Debug, Deserialize)] +struct ApiResponse { + code: Option, + data: Option, + msg: Option, +} + +#[derive(Debug, Deserialize)] +struct TaskCreateData { + task_id: String, +} + +#[derive(Debug, Deserialize)] +struct TaskStatusData { + state: String, + full_zip_url: 
Option, + err_msg: Option, +} + +#[derive(Debug, Deserialize)] +struct BatchCreateData { + batch_id: String, + file_urls: Vec, +} + +#[derive(Debug, Deserialize)] +struct BatchStatusData { + extract_result: Vec, +} + +#[derive(Debug, Deserialize, Clone)] +struct BatchExtractResult { + file_name: String, + state: String, + full_zip_url: Option, + err_msg: Option, +} + +#[derive(Debug, Serialize, Deserialize, JsonSchema)] +pub struct ParseDocumentsParams { + pub file_sources: String, + #[serde(default)] + pub enable_ocr: bool, + #[serde(default = "default_language")] + pub language: String, + pub page_ranges: Option, +} + +fn default_language() -> String { + "ch".to_string() +} + +#[derive(Debug, Serialize, Deserialize, JsonSchema)] +pub struct ParseDocumentsResponse { + pub status: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub content: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub extract_path: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub error_message: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub message: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub results: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub summary: Option, +} + +#[derive(Debug, Serialize, Deserialize, JsonSchema)] +pub struct ParseDocumentsSummary { + pub total_files: usize, + pub success_count: usize, + pub error_count: usize, +} + +#[derive(Debug, Serialize, Deserialize, JsonSchema, Clone)] +pub struct ParseDocumentsResultItem { + pub filename: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub source_url: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub source_path: Option, + pub status: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub content: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub extract_path: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub error_message: Option, +} + 
+#[derive(Debug, Serialize, Deserialize, JsonSchema)] +pub struct LanguageResponse { + pub status: String, + pub languages: Vec, + pub link: String, +} + +#[derive(Debug, Serialize, Deserialize, JsonSchema)] +pub struct LanguageInfo { + pub name: String, + pub description: String, + pub code: String, +} + +#[derive(Clone)] +pub struct MineruServer { + settings: Settings, + client: reqwest::Client, + tool_router: ToolRouter, + healthy: Arc, +} + +#[derive(Debug, Serialize)] +pub struct HealthCheckResponse { + pub status: String, + pub server: String, + pub timestamp: String, + pub api_mode: String, + pub api_base: String, + pub has_api_key: bool, + pub version: &'static str, +} + +#[tool_router] +impl MineruServer { + pub fn new(settings: Settings) -> Result { + let client = reqwest::Client::builder() + .timeout(Duration::from_secs(30)) + .no_proxy() + .build()?; + Ok(Self { + settings, + client, + tool_router: Self::tool_router(), + healthy: Arc::new(AtomicBool::new(true)), + }) + } + + #[tool(description = "解析文档(支持本地文件和URL,自动读取内容)")] + pub async fn parse_documents( + &self, + params: Parameters, + ) -> Result, McpError> { + let params = params.0; + let mut sources = parse_sources(¶ms.file_sources); + if sources.is_empty() { + return Ok(Json(ParseDocumentsResponse { + status: "error".to_string(), + content: None, + extract_path: None, + error_message: Some("未提供有效的文件路径或URL".to_string()), + message: None, + results: None, + summary: None, + })); + } + + let mut seen = HashSet::new(); + sources.retain(|item| seen.insert(item.to_lowercase())); + + let (url_paths, file_paths) = split_sources(&sources); + + let mut results = Vec::new(); + + if self.settings.use_local_api { + if file_paths.is_empty() { + return Ok(Json(ParseDocumentsResponse { + status: "warning".to_string(), + content: None, + extract_path: None, + error_message: None, + message: Some( + "在本地API模式下,无法处理URL,且未提供有效的本地文件路径".to_string(), + ), + results: None, + summary: None, + })); + } + + info!("使用本地API处理 
{} 个文件", file_paths.len()); + for path in file_paths { + results.push(self.handle_local_file(&path, ¶ms).await); + } + } else { + if !url_paths.is_empty() { + info!("使用远程API处理 {} 个URL", url_paths.len()); + for url in url_paths { + results.push(self.handle_remote_url(&url, ¶ms).await); + } + } + + if !file_paths.is_empty() { + info!("使用远程API处理 {} 个本地文件", file_paths.len()); + let batch_results = self.handle_remote_files(&file_paths, ¶ms).await; + results.extend(batch_results); + } + } + + if results.is_empty() { + return Ok(Json(ParseDocumentsResponse { + status: "error".to_string(), + content: None, + extract_path: None, + error_message: Some("未处理任何文件".to_string()), + message: None, + results: None, + summary: None, + })); + } + + if results.len() == 1 { + let result = results.into_iter().next().expect("single result exists"); + return Ok(Json(ParseDocumentsResponse { + status: result.status, + content: result.content, + extract_path: result.extract_path, + error_message: result.error_message, + message: None, + results: None, + summary: None, + })); + } + + let success_count = results + .iter() + .filter(|item| item.status == "success") + .count(); + let error_count = results.iter().filter(|item| item.status == "error").count(); + let total_count = results.len(); + + let overall_status = if success_count == 0 { + "error" + } else if error_count > 0 { + "partial_success" + } else { + "success" + }; + + Ok(Json(ParseDocumentsResponse { + status: overall_status.to_string(), + content: None, + extract_path: None, + error_message: None, + message: None, + results: Some(results), + summary: Some(ParseDocumentsSummary { + total_files: total_count, + success_count, + error_count, + }), + })) + } + + #[tool(description = "获取OCR支持的语言列表")] + pub async fn get_ocr_languages(&self) -> Result, McpError> { + let languages = vec![ + LanguageInfo { + name: "中文".to_string(), + description: "Chinese & English".to_string(), + code: "ch".to_string(), + }, + LanguageInfo { + name: 
"英文".to_string(), + description: "English".to_string(), + code: "en".to_string(), + }, + LanguageInfo { + name: "日文".to_string(), + description: "Japanese".to_string(), + code: "japan".to_string(), + }, + LanguageInfo { + name: "韩文".to_string(), + description: "Korean".to_string(), + code: "korean".to_string(), + }, + LanguageInfo { + name: "法文".to_string(), + description: "French".to_string(), + code: "fr".to_string(), + }, + ]; + + Ok(Json(LanguageResponse { + status: "success".to_string(), + languages, + link: "https://www.paddleocr.ai/latest/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html" + .to_string(), + })) + } +} + +#[tool_handler] +impl ServerHandler for MineruServer { + fn get_info(&self) -> ServerInfo { + ServerInfo { + instructions: Some( + "MCP server for MinerU document parsing: parse_documents and get_ocr_languages" + .to_string(), + ), + capabilities: ServerCapabilities::builder().enable_tools().build(), + ..Default::default() + } + } +} + +impl MineruServer { + async fn handle_remote_url( + &self, + url: &str, + params: &ParseDocumentsParams, + ) -> ParseDocumentsResultItem { + let filename = source_filename(url, true); + match self + .process_remote_url(url, params) + .await + .map(|extracted| ParseDocumentsResultItem { + filename: filename.clone(), + source_url: Some(url.to_string()), + source_path: None, + status: "success".to_string(), + content: Some(extracted.markdown), + extract_path: Some(extracted.output_dir), + error_message: None, + }) { + Ok(item) => item, + Err(err) => { + error!("处理URL失败: {err}"); + ParseDocumentsResultItem { + filename, + source_url: Some(url.to_string()), + source_path: None, + status: "error".to_string(), + content: None, + extract_path: None, + error_message: Some(err.to_string()), + } + } + } + } + + async fn handle_remote_files( + &self, + file_paths: &[String], + params: &ParseDocumentsParams, + ) -> Vec { + let mut results = Vec::new(); + let mut existing_files = Vec::new(); + + for path in 
file_paths { + if Path::new(path).exists() { + existing_files.push(path.clone()); + } else { + results.push(ParseDocumentsResultItem { + filename: source_filename(path, false), + source_url: None, + source_path: Some(path.clone()), + status: "error".to_string(), + content: None, + extract_path: None, + error_message: Some(format!("文件不存在: {path}")), + }); + } + } + + if existing_files.is_empty() { + return results; + } + + match self.process_remote_files(&existing_files, params).await { + Ok(processed) => { + results.extend(processed); + } + Err(err) => { + error!("处理本地文件失败: {err}"); + for path in existing_files { + results.push(ParseDocumentsResultItem { + filename: source_filename(&path, false), + source_url: None, + source_path: Some(path.clone()), + status: "error".to_string(), + content: None, + extract_path: None, + error_message: Some(err.to_string()), + }); + } + } + } + + results + } + + async fn handle_local_file( + &self, + path: &str, + params: &ParseDocumentsParams, + ) -> ParseDocumentsResultItem { + let filename = source_filename(path, false); + if !Path::new(path).exists() { + return ParseDocumentsResultItem { + filename, + source_url: None, + source_path: Some(path.to_string()), + status: "error".to_string(), + content: None, + extract_path: None, + error_message: Some(format!("文件不存在: {path}")), + }; + } + + match self.process_local_file(path, params).await { + Ok(extracted) => ParseDocumentsResultItem { + filename, + source_url: None, + source_path: Some(path.to_string()), + status: "success".to_string(), + content: Some(extracted.markdown), + extract_path: Some(extracted.output_dir), + error_message: None, + }, + Err(err) => ParseDocumentsResultItem { + filename, + source_url: None, + source_path: Some(path.to_string()), + status: "error".to_string(), + content: None, + extract_path: None, + error_message: Some(err.to_string()), + }, + } + } + + async fn process_remote_url( + &self, + url: &str, + params: &ParseDocumentsParams, + ) -> Result { + 
let task_id = self + .submit_url_task(&self.settings.mineru_api_base, url, params, true) + .await?; + let status = self + .poll_task(&self.settings.mineru_api_base, &task_id, true) + .await?; + let full_zip_url = status + .full_zip_url + .ok_or_else(|| MineruError::Response("未返回 full_zip_url".to_string()))?; + self.download_and_extract(&full_zip_url, url).await + } + + async fn process_remote_files( + &self, + file_paths: &[String], + params: &ParseDocumentsParams, + ) -> Result, MineruError> { + let batch = self + .submit_file_batch(&self.settings.mineru_api_base, file_paths, params, true) + .await?; + self.upload_files(&batch.file_urls, file_paths).await?; + let results = self + .poll_batch(&self.settings.mineru_api_base, &batch.batch_id, true) + .await?; + + let mut results_by_name: HashMap = HashMap::new(); + for item in results.extract_result { + results_by_name.insert(item.file_name.clone(), item); + } + + let mut items = Vec::new(); + for path in file_paths { + let filename = source_filename(path, false); + let status = results_by_name.get(&filename).cloned(); + match status { + Some(result) if result.state == "done" => { + let full_zip_url = result + .full_zip_url + .ok_or_else(|| MineruError::Response("未返回 full_zip_url".to_string()))?; + match self.download_and_extract(&full_zip_url, path).await { + Ok(extracted) => items.push(ParseDocumentsResultItem { + filename, + source_url: None, + source_path: Some(path.clone()), + status: "success".to_string(), + content: Some(extracted.markdown), + extract_path: Some(extracted.output_dir), + error_message: None, + }), + Err(err) => items.push(ParseDocumentsResultItem { + filename, + source_url: None, + source_path: Some(path.clone()), + status: "error".to_string(), + content: None, + extract_path: None, + error_message: Some(err.to_string()), + }), + } + } + Some(result) => { + let message = result.err_msg.unwrap_or_else(|| "文件处理失败".to_string()); + items.push(ParseDocumentsResultItem { + filename, + source_url: 
None, + source_path: Some(path.clone()), + status: "error".to_string(), + content: None, + extract_path: None, + error_message: Some(message), + }); + } + None => items.push(ParseDocumentsResultItem { + filename, + source_url: None, + source_path: Some(path.clone()), + status: "error".to_string(), + content: None, + extract_path: None, + error_message: Some("未找到批量结果".to_string()), + }), + } + } + + Ok(items) + } + + async fn process_local_file( + &self, + file_path: &str, + params: &ParseDocumentsParams, + ) -> Result { + let batch = self + .submit_file_batch( + &self.settings.local_mineru_api_base, + &[file_path.to_string()], + params, + false, + ) + .await?; + self.upload_files(&batch.file_urls, &[file_path.to_string()]) + .await?; + let results = self + .poll_batch(&self.settings.local_mineru_api_base, &batch.batch_id, false) + .await?; + let filename = source_filename(file_path, false); + let result = results + .extract_result + .into_iter() + .find(|item| item.file_name == filename) + .ok_or_else(|| MineruError::Response("未找到批量结果".to_string()))?; + if result.state != "done" { + return Err(MineruError::Response( + result.err_msg.unwrap_or_else(|| "本地解析失败".to_string()), + )); + } + let full_zip_url = result + .full_zip_url + .ok_or_else(|| MineruError::Response("未返回 full_zip_url".to_string()))?; + self.download_and_extract(&full_zip_url, file_path).await + } + + async fn submit_url_task( + &self, + base_url: &str, + url: &str, + params: &ParseDocumentsParams, + require_auth: bool, + ) -> Result { + let endpoint = format!("{base_url}/api/v4/extract/task"); + let mut body = serde_json::json!({ + "url": url, + "model_version": "pipeline", + }); + if params.enable_ocr { + body["is_ocr"] = serde_json::json!(true); + } + if let Some(page_ranges) = ¶ms.page_ranges { + body["page_ranges"] = serde_json::json!(page_ranges); + } + body["language"] = serde_json::json!(params.language.clone()); + + let response: ApiResponse = self + .request_json(reqwest::Method::POST, 
&endpoint, Some(body), require_auth) + .await?; + let data = extract_api_data(response)?; + Ok(data.task_id) + } + + async fn poll_task( + &self, + base_url: &str, + task_id: &str, + require_auth: bool, + ) -> Result { + let endpoint = format!("{base_url}/api/v4/extract/task/{task_id}"); + let start = Instant::now(); + loop { + let response: ApiResponse = self + .request_json(reqwest::Method::GET, &endpoint, None, require_auth) + .await?; + let data = extract_api_data(response)?; + match data.state.as_str() { + "done" => return Ok(data), + "failed" => { + return Err(MineruError::Response( + data.err_msg.unwrap_or_else(|| "任务失败".to_string()), + )); + } + _ => {} + } + + if start.elapsed() > self.settings.max_wait { + return Err(MineruError::Timeout); + } + sleep(self.settings.poll_interval).await; + } + } + + async fn submit_file_batch( + &self, + base_url: &str, + file_paths: &[String], + params: &ParseDocumentsParams, + require_auth: bool, + ) -> Result { + let files: Vec<_> = file_paths + .iter() + .map(|path| { + serde_json::json!({ + "name": source_filename(path, false), + "is_ocr": params.enable_ocr, + "page_ranges": params.page_ranges, + }) + }) + .collect(); + let body = serde_json::json!({ + "files": files, + "model_version": "pipeline", + "language": params.language, + }); + let endpoint = format!("{base_url}/api/v4/file-urls/batch"); + let response: ApiResponse = self + .request_json(reqwest::Method::POST, &endpoint, Some(body), require_auth) + .await?; + extract_api_data(response) + } + + async fn upload_files( + &self, + file_urls: &[String], + file_paths: &[String], + ) -> Result<(), MineruError> { + if file_urls.len() != file_paths.len() { + return Err(MineruError::Response( + "file_urls 与 file_paths 数量不一致".to_string(), + )); + } + for (url, path) in file_urls.iter().zip(file_paths.iter()) { + let bytes = tokio::fs::read(path).await?; + self.request_bytes_with_retry(reqwest::Method::PUT, url, Bytes::from(bytes)) + .await?; + } + Ok(()) + } + + async 
fn poll_batch( + &self, + base_url: &str, + batch_id: &str, + require_auth: bool, + ) -> Result { + let endpoint = format!("{base_url}/api/v4/extract-results/batch/{batch_id}"); + let start = Instant::now(); + loop { + let response: ApiResponse = self + .request_json(reqwest::Method::GET, &endpoint, None, require_auth) + .await?; + let data = extract_api_data(response)?; + let done = data + .extract_result + .iter() + .all(|item| item.state == "done" || item.state == "failed"); + if done { + return Ok(data); + } + if start.elapsed() > self.settings.max_wait { + return Err(MineruError::Timeout); + } + sleep(self.settings.poll_interval).await; + } + } + + async fn download_and_extract( + &self, + zip_url: &str, + source: &str, + ) -> Result { + let response = self + .request_with_retry(reqwest::Method::GET, zip_url, None, false) + .await?; + let bytes = response.bytes().await?; + let output_dir = self.create_output_dir(source)?; + extract_zip(&bytes, &output_dir)?; + let markdown_path = find_markdown(&output_dir, source)?; + let markdown = tokio::fs::read_to_string(markdown_path).await?; + Ok(ExtractedContent { + markdown, + output_dir: output_dir.to_string_lossy().into_owned(), + }) + } + + fn create_output_dir(&self, source: &str) -> Result { + let base = &self.settings.output_dir; + let stem = Path::new(source) + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or("output"); + let dir = base.join(format!("{stem}-{}", Uuid::new_v4())); + std::fs::create_dir_all(&dir)?; + Ok(dir) + } + + async fn request_json Deserialize<'de>>( + &self, + method: reqwest::Method, + url: &str, + body: Option, + require_auth: bool, + ) -> Result { + let response = self + .request_with_retry(method, url, body, require_auth) + .await?; + let response = response.error_for_status()?; + let parsed = response.json::().await?; + Ok(parsed) + } + + async fn request_bytes_with_retry( + &self, + method: reqwest::Method, + url: &str, + body: Bytes, + ) -> Result { + let mut attempts = 0; + let 
max_attempts = 3; + loop { + attempts += 1; + let request = self.client.request(method.clone(), url).body(body.clone()); + match request.send().await { + Ok(response) => { + if response.status().is_server_error() && attempts < max_attempts { + sleep(Duration::from_millis(200 * attempts)).await; + continue; + } + return Ok(response.error_for_status()?); + } + Err(err) => { + if attempts >= max_attempts { + return Err(MineruError::Http(err)); + } + sleep(Duration::from_millis(200 * attempts)).await; + } + } + } + } + + async fn request_with_retry( + &self, + method: reqwest::Method, + url: &str, + body: Option, + require_auth: bool, + ) -> Result { + let mut attempts = 0; + let max_attempts = 3; + loop { + attempts += 1; + let mut request = self.client.request(method.clone(), url); + if require_auth { + let api_key = self + .settings + .mineru_api_key + .clone() + .ok_or(MineruError::MissingApiKey)?; + request = request.bearer_auth(api_key); + } + if let Some(body) = body.clone() { + if !body.is_null() { + request = request.json(&body); + } + } + match request.send().await { + Ok(response) => { + if response.status().is_server_error() && attempts < max_attempts { + sleep(Duration::from_millis(200 * attempts)).await; + continue; + } + return Ok(response); + } + Err(err) => { + if attempts >= max_attempts { + return Err(MineruError::Http(err)); + } + sleep(Duration::from_millis(200 * attempts)).await; + } + } + } + } +} + +struct ExtractedContent { + markdown: String, + output_dir: String, +} + +fn parse_sources(input: &str) -> Vec { + input + .split(|c: char| c == ',' || c.is_whitespace()) + .map(str::trim) + .filter(|item| !item.is_empty()) + .map(|item| item.trim_matches('"').trim_matches('\'').to_string()) + .collect() +} + +fn split_sources(sources: &[String]) -> (Vec, Vec) { + let mut urls = Vec::new(); + let mut files = Vec::new(); + for source in sources { + if source.starts_with("http://") || source.starts_with("https://") { + urls.push(source.clone()); + } 
else { + files.push(source.clone()); + } + } + (urls, files) +} + +fn source_filename(source: &str, is_url: bool) -> String { + let raw = if is_url { + source.split('/').last().unwrap_or(source) + } else { + Path::new(source) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(source) + }; + raw.split('?').next().unwrap_or(raw).to_string() +} + +fn extract_zip(bytes: &[u8], output_dir: &Path) -> Result<(), MineruError> { + let reader = std::io::Cursor::new(bytes); + let mut archive = zip::ZipArchive::new(reader)?; + for i in 0..archive.len() { + let mut file = archive.by_index(i)?; + let outpath = output_dir.join(file.name()); + if file.name().ends_with('/') { + std::fs::create_dir_all(&outpath)?; + } else { + if let Some(parent) = outpath.parent() { + std::fs::create_dir_all(parent)?; + } + let mut outfile = std::fs::File::create(&outpath)?; + std::io::copy(&mut file, &mut outfile)?; + } + } + Ok(()) +} + +fn extract_api_data(response: ApiResponse) -> Result { + if let Some(code) = response.code { + if code != 0 { + return Err(MineruError::Response( + response + .msg + .unwrap_or_else(|| format!("MinerU API错误: {code}")), + )); + } + } + response + .data + .ok_or_else(|| MineruError::Response("缺少响应数据".to_string())) +} + +fn find_markdown(output_dir: &Path, source: &str) -> Result { + let stem = Path::new(source) + .file_stem() + .and_then(|s| s.to_str()) + .map(|s| s.to_lowercase()); + let mut fallback = None; + for entry in WalkDir::new(output_dir).into_iter().filter_map(Result::ok) { + if entry.file_type().is_file() { + let path = entry.path(); + if path.extension().and_then(|s| s.to_str()) == Some("md") { + if let Some(stem_name) = path.file_stem().and_then(|s| s.to_str()) { + if let Some(expected) = &stem { + if stem_name.to_lowercase() == *expected { + return Ok(path.to_path_buf()); + } + } + } + if fallback.is_none() { + fallback = Some(path.to_path_buf()); + } + } + } + } + fallback.ok_or(MineruError::MissingMarkdown) +} + +fn init_tracing() { + let 
filter = + tracing_subscriber::EnvFilter::try_from_default_env().unwrap_or_else(|_| "info".into()); + tracing_subscriber::fmt().with_env_filter(filter).init(); +} + +async fn health_handler(State(server): State) -> AxumJson { + let is_healthy = server.healthy.load(Ordering::Relaxed); + AxumJson(HealthCheckResponse { + status: if is_healthy { "ok".to_string() } else { "error".to_string() }, + server: "mineru-mcp-dragonos".to_string(), + timestamp: chrono::Utc::now().to_rfc3339(), + api_mode: if server.settings.use_local_api { "local".to_string() } else { "remote".to_string() }, + api_base: if server.settings.use_local_api { + server.settings.local_mineru_api_base.clone() + } else { + server.settings.mineru_api_base.clone() + }, + has_api_key: server.settings.mineru_api_key.is_some(), + version: env!("CARGO_PKG_VERSION"), + }) +} + +async fn root_handler() -> &'static str { + "MinerU MCP DragonOS Server\n\nEndpoints:\n GET /health - Health check\n POST /mcp - MCP JSON-RPC requests\n GET /mcp - SSE event stream" +} + +/// 启动 HTTP/SSE MCP 服务器 +pub async fn run() -> Result<(), Box> { + init_tracing(); + let settings = Settings::from_env(); + if !settings.use_local_api && settings.mineru_api_key.is_none() { + warn!("MINERU_API_KEY 未设置,远程解析将失败"); + } + + let ct = CancellationToken::new(); + let port = env::var("MCP_PORT") + .ok() + .and_then(|v| v.parse::().ok()) + .unwrap_or(8080); + + // 创建 HTTP/SSE MCP 服务 + let mcp_service: StreamableHttpService = + StreamableHttpService::new( + || { + let s = Settings::from_env(); + MineruServer::new(s).map_err(|e| std::io::Error::other(e.to_string())) + }, + Arc::new(LocalSessionManager::default()), + StreamableHttpServerConfig { + stateful_mode: true, + sse_keep_alive: Some(Duration::from_secs(15)), + cancellation_token: ct.child_token(), + ..Default::default() + }, + ); + + // 创建一个用于健康检查的 server 实例 + let health_server = MineruServer::new(settings)?; + + // 统一的 Axum 路由 + let app = Router::new() + .route("/health", 
get(health_handler)) + .route("/", get(root_handler)) + .with_state(health_server) + .nest_service("/mcp", mcp_service); + + let listener = tokio::net::TcpListener::bind(format!("0.0.0.0:{}", port)).await?; + info!("MCP HTTP/SSE 服务启动在 http://0.0.0.0:{}", port); + info!("端点: GET /health, POST /mcp, GET /mcp"); + + axum::serve(listener, app) + .with_graceful_shutdown(async move { ct.cancelled().await }) + .await?; + + Ok(()) +} diff --git a/user/apps/mineru-mcp-dragonos/src/main.rs b/user/apps/mineru-mcp-dragonos/src/main.rs new file mode 100644 index 000000000..e11c1e80d --- /dev/null +++ b/user/apps/mineru-mcp-dragonos/src/main.rs @@ -0,0 +1,4 @@ +#[tokio::main] +async fn main() -> Result<(), Box> { + mineru_mcp_dragonos::run().await +} diff --git a/user/apps/mineru-mcp-dragonos/tests/parse_documents.rs b/user/apps/mineru-mcp-dragonos/tests/parse_documents.rs new file mode 100644 index 000000000..06e51433e --- /dev/null +++ b/user/apps/mineru-mcp-dragonos/tests/parse_documents.rs @@ -0,0 +1,133 @@ +use mineru_mcp::{MineruServer, ParseDocumentsParams, Settings}; +use rmcp::handler::server::wrapper::Parameters; +use std::{io::Write, path::Path}; +use tempfile::tempdir; +use wiremock::{ + Mock, MockServer, ResponseTemplate, + matchers::{method, path}, +}; + +fn build_zip_bytes() -> Vec { + let cursor = std::io::Cursor::new(Vec::new()); + let mut zip = zip::ZipWriter::new(cursor); + let options = zip::write::FileOptions::default(); + + zip.start_file("document.md", options).unwrap(); + zip.write_all(b"Hello from mineru").unwrap(); + zip.add_directory("images/", options).unwrap(); + zip.start_file("images/pic.png", options).unwrap(); + zip.write_all(b"fake").unwrap(); + let cursor = zip.finish().unwrap(); + cursor.into_inner() +} + +#[tokio::test] +async fn parse_documents_handles_url_and_file() { + let server = MockServer::start().await; + let zip_bytes = build_zip_bytes(); + + Mock::given(method("POST")) + .and(path("/api/v4/extract/task")) + 
.respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "code": 0, + "data": {"task_id": "task-1"} + }))) + .mount(&server) + .await; + + Mock::given(method("GET")) + .and(path("/api/v4/extract/task/task-1")) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "code": 0, + "data": {"state": "done", "full_zip_url": format!("{}/download/result.zip", server.uri())} + }))) + .expect(1) + .mount(&server) + .await; + + Mock::given(method("POST")) + .and(path("/api/v4/file-urls/batch")) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "code": 0, + "data": { + "batch_id": "batch-1", + "file_urls": [format!("{}/upload/1", server.uri())] + } + }))) + .mount(&server) + .await; + + Mock::given(method("PUT")) + .and(path("/upload/1")) + .respond_with(ResponseTemplate::new(200)) + .mount(&server) + .await; + + Mock::given(method("GET")) + .and(path("/api/v4/extract-results/batch/batch-1")) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "code": 0, + "data": { + "extract_result": [ + { + "file_name": "local.pdf", + "state": "done", + "full_zip_url": format!("{}/download/result.zip", server.uri()) + } + ] + } + }))) + .expect(1) + .mount(&server) + .await; + + Mock::given(method("GET")) + .and(path("/download/result.zip")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(zip_bytes.clone())) + .expect(2) + .mount(&server) + .await; + + let temp = tempdir().unwrap(); + let output_dir = temp.path().join("downloads"); + std::fs::create_dir_all(&output_dir).unwrap(); + let local_path = temp.path().join("local.pdf"); + std::fs::write(&local_path, b"dummy").unwrap(); + + unsafe { + std::env::set_var("MINERU_API_BASE", server.uri()); + std::env::set_var("MINERU_API_KEY", "test-key"); + std::env::set_var("OUTPUT_DIR", output_dir.to_string_lossy().to_string()); + std::env::set_var("USE_LOCAL_API", "false"); + std::env::set_var("MINERU_POLL_INTERVAL_SECS", "1"); + 
std::env::set_var("MINERU_MAX_WAIT_SECS", "10"); + } + + let server_impl = MineruServer::new(Settings::from_env()).unwrap(); + let params = ParseDocumentsParams { + file_sources: format!("https://example.com/test.pdf {}", local_path.display()), + enable_ocr: false, + language: "ch".to_string(), + page_ranges: None, + }; + + let response = server_impl + .parse_documents(Parameters(params)) + .await + .unwrap(); + + let response = response.0; + assert_eq!(response.status, "success"); + let summary = response.summary.expect("summary"); + assert_eq!(summary.total_files, 2); + assert_eq!(summary.success_count, 2); + assert_eq!(summary.error_count, 0); + let results = response.results.expect("results"); + assert_eq!(results.len(), 2); + for item in results { + assert_eq!(item.status, "success"); + assert!(item.content.as_deref() == Some("Hello from mineru")); + let extract_path = item.extract_path.expect("extract_path"); + assert!(Path::new(&extract_path).exists()); + } +} diff --git a/user/dadk/config/all/mineru_mcp-0.1.0.toml b/user/dadk/config/all/mineru_mcp-0.1.0.toml new file mode 100644 index 000000000..6d45fc37f --- /dev/null +++ b/user/dadk/config/all/mineru_mcp-0.1.0.toml @@ -0,0 +1,36 @@ +# 用户程序名称 +name = "mineru_mcp" +# 版本号 +version = "0.1.0" +# 用户程序描述信息 +description = "MineRU MCP Application for DragonOS" +# (可选)默认: false 是否只构建一次,如果为true,DADK会在构建成功后,将构建结果缓存起来,下次构建时,直接使用缓存的构建结果 +build-once = false +# (可选) 默认: false 是否只安装一次,如果为true,DADK会在安装成功后,不再重复安装 +install-once = false +# 目标架构 +# 可选值:"x86_64", "aarch64", "riscv64" +target-arch = ["x86_64"] +# 任务源 +[task-source] +# 构建类型 +# 可选值:"build-from_source", "install-from-prebuilt" +type = "build-from-source" +# 构建来源 +# "build_from_source" 可选值:"git", "local", "archive" +# "install_from_prebuilt" 可选值:"local", "archive" +source = "local" +# 路径或URL +source-path = "user/apps/mineru-mcp-dragonos" +# 构建相关信息 +[build] +# (可选)构建命令 +build-command = "make install" +# 安装相关信息 +[install] +# (可选)安装到DragonOS的路径 +in-dragonos-path = 
"/root/" +# 清除相关信息 +[clean] +# (可选)清除命令 +clean-command = "make clean" diff --git a/user/dadk/config/sets/default/mineru_mcp-0.1.0.toml b/user/dadk/config/sets/default/mineru_mcp-0.1.0.toml new file mode 120000 index 000000000..6d0c57402 --- /dev/null +++ b/user/dadk/config/sets/default/mineru_mcp-0.1.0.toml @@ -0,0 +1 @@ +../../all/mineru_mcp-0.1.0.toml \ No newline at end of file