
Commit 508675a

NanmiCoder and claude committed
feat(api): add WebUI API server with built frontend
- Add FastAPI server with WebSocket support for real-time logs
- Add crawler management API endpoints (start/stop/status)
- Add data browsing API endpoints (list files, preview, download)
- Include pre-built WebUI assets for serving frontend

API endpoints:
- POST /api/crawler/start - Start crawler task
- POST /api/crawler/stop - Stop crawler task
- GET /api/crawler/status - Get crawler status
- WS /api/ws/logs - Real-time log streaming
- GET /api/data/files - List data files
- GET /api/data/stats - Get data statistics

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent eb66e57 commit 508675a

20 files changed (+1467 -1 lines)
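
For orientation, below is a minimal client sketch against the endpoints this commit adds, assuming the server runs on localhost:8080 (the default in api/main.py below). The request body for /api/crawler/start is illustrative only: the real CrawlerStartRequest schema lives in api/schemas.py, which is part of this commit but not shown in this excerpt.

# Hedged client sketch - endpoint paths match the commit message; the start
# payload fields are assumptions, not the actual CrawlerStartRequest schema.
import requests

BASE = "http://localhost:8080/api"

print(requests.get(f"{BASE}/health").json())           # {"status": "ok"}
print(requests.get(f"{BASE}/crawler/status").json())   # current crawler status

# Start a crawl (field names below are guesses for illustration)
resp = requests.post(f"{BASE}/crawler/start", json={
    "platform": "xhs",          # assumed field
    "crawler_type": "search",   # assumed field
    "keywords": "美食",          # assumed field
})
print(resp.status_code, resp.json())

# Stop it again and inspect recent logs
print(requests.post(f"{BASE}/crawler/stop").json())
print(requests.get(f"{BASE}/crawler/logs", params={"limit": 20}).json())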

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -178,4 +178,4 @@ docs/.vitepress/cache
 agent_zone
 debug_tools
 
-database/*.db
+database/*.db

api/__init__.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2025 [email protected]
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/__init__.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# Disclaimer: this code is for learning and research purposes only. Users must observe the following principles:
# 1. It must not be used for any commercial purpose.
# 2. Usage must comply with the target platform's terms of service and robots.txt rules.
# 3. Do not crawl at large scale or disrupt the platform's operation.
# 4. Keep request rates reasonable to avoid placing unnecessary load on the target platform.
# 5. It must not be used for any illegal or improper purpose.
#
# See the LICENSE file in the project root for the full license terms.
# By using this code you agree to abide by the principles above and all terms of the LICENSE.

# WebUI API Module for MediaCrawler

api/main.py

Lines changed: 186 additions & 0 deletions
@@ -0,0 +1,186 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2025 [email protected]
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/main.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# Disclaimer: this code is for learning and research purposes only. Users must observe the following principles:
# 1. It must not be used for any commercial purpose.
# 2. Usage must comply with the target platform's terms of service and robots.txt rules.
# 3. Do not crawl at large scale or disrupt the platform's operation.
# 4. Keep request rates reasonable to avoid placing unnecessary load on the target platform.
# 5. It must not be used for any illegal or improper purpose.
#
# See the LICENSE file in the project root for the full license terms.
# By using this code you agree to abide by the principles above and all terms of the LICENSE.

"""
MediaCrawler WebUI API Server
Start command: uvicorn api.main:app --port 8080 --reload
Or: python -m api.main
"""
import asyncio
import os
import subprocess
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse

from .routers import crawler_router, data_router, websocket_router

app = FastAPI(
    title="MediaCrawler WebUI API",
    description="API for controlling MediaCrawler from WebUI",
    version="1.0.0"
)

# Directory containing the built webui static files
WEBUI_DIR = os.path.join(os.path.dirname(__file__), "webui")

# CORS configuration - allow access from the frontend dev server
app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        "http://localhost:5173",  # Vite dev server
        "http://localhost:3000",  # fallback port
        "http://127.0.0.1:5173",
        "http://127.0.0.1:3000",
    ],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Register routers
app.include_router(crawler_router, prefix="/api")
app.include_router(data_router, prefix="/api")
app.include_router(websocket_router, prefix="/api")


@app.get("/")
async def serve_frontend():
    """Serve the frontend page"""
    index_path = os.path.join(WEBUI_DIR, "index.html")
    if os.path.exists(index_path):
        return FileResponse(index_path)
    return {
        "message": "MediaCrawler WebUI API",
        "version": "1.0.0",
        "docs": "/docs",
        "note": "WebUI not found, please build it first: cd webui && npm run build"
    }


@app.get("/api/health")
async def health_check():
    return {"status": "ok"}


@app.get("/api/env/check")
async def check_environment():
    """Check whether the MediaCrawler environment is configured correctly"""
    try:
        # Run `uv run main.py --help` to probe the environment
        process = await asyncio.create_subprocess_exec(
            "uv", "run", "main.py", "--help",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            cwd="."  # project root
        )
        stdout, stderr = await asyncio.wait_for(
            process.communicate(),
            timeout=30.0  # 30-second timeout
        )

        if process.returncode == 0:
            return {
                "success": True,
                "message": "MediaCrawler 环境配置正确",
                "output": stdout.decode("utf-8", errors="ignore")[:500]  # keep only the first 500 characters
            }
        else:
            error_msg = stderr.decode("utf-8", errors="ignore") or stdout.decode("utf-8", errors="ignore")
            return {
                "success": False,
                "message": "环境检测失败",
                "error": error_msg[:500]
            }
    except asyncio.TimeoutError:
        return {
            "success": False,
            "message": "环境检测超时",
            "error": "命令执行超过30秒"
        }
    except FileNotFoundError:
        return {
            "success": False,
            "message": "未找到 uv 命令",
            "error": "请确保已安装 uv 并配置到系统 PATH"
        }
    except Exception as e:
        return {
            "success": False,
            "message": "环境检测出错",
            "error": str(e)
        }


@app.get("/api/config/platforms")
async def get_platforms():
    """Get the list of supported platforms"""
    return {
        "platforms": [
            {"value": "xhs", "label": "小红书", "icon": "book-open"},
            {"value": "dy", "label": "抖音", "icon": "music"},
            {"value": "ks", "label": "快手", "icon": "video"},
            {"value": "bili", "label": "哔哩哔哩", "icon": "tv"},
            {"value": "wb", "label": "微博", "icon": "message-circle"},
            {"value": "tieba", "label": "百度贴吧", "icon": "messages-square"},
            {"value": "zhihu", "label": "知乎", "icon": "help-circle"},
        ]
    }


@app.get("/api/config/options")
async def get_config_options():
    """Get all configuration options"""
    return {
        "login_types": [
            {"value": "qrcode", "label": "二维码登录"},
            {"value": "cookie", "label": "Cookie登录"},
        ],
        "crawler_types": [
            {"value": "search", "label": "搜索模式"},
            {"value": "detail", "label": "详情模式"},
            {"value": "creator", "label": "创作者模式"},
        ],
        "save_options": [
            {"value": "json", "label": "JSON 文件"},
            {"value": "csv", "label": "CSV 文件"},
            {"value": "excel", "label": "Excel 文件"},
            {"value": "sqlite", "label": "SQLite 数据库"},
            {"value": "db", "label": "MySQL 数据库"},
            {"value": "mongodb", "label": "MongoDB 数据库"},
        ],
    }


# Mount static assets - must be placed after all routes
if os.path.exists(WEBUI_DIR):
    assets_dir = os.path.join(WEBUI_DIR, "assets")
    if os.path.exists(assets_dir):
        app.mount("/assets", StaticFiles(directory=assets_dir), name="assets")
    # Mount the logos directory
    logos_dir = os.path.join(WEBUI_DIR, "logos")
    if os.path.exists(logos_dir):
        app.mount("/logos", StaticFiles(directory=logos_dir), name="logos")
    # Mount other static files (e.g. vite.svg)
    app.mount("/static", StaticFiles(directory=WEBUI_DIR), name="webui-static")


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8080)
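
The commit message also advertises a WebSocket endpoint (WS /api/ws/logs) served by websocket_router, whose implementation is not included in this excerpt. A hedged consumer sketch, assuming the path from the commit message and that each frame is a single text/JSON log entry:

# Hedged sketch: the path comes from the commit message; the frame format is an
# assumption, since api/routers/websocket.py is not shown in this excerpt.
import asyncio
import websockets  # pip install websockets

async def tail_logs():
    async with websockets.connect("ws://localhost:8080/api/ws/logs") as ws:
        async for message in ws:   # presumably one log entry per frame
            print(message)

if __name__ == "__main__":
    asyncio.run(tail_logs())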

api/routers/__init__.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2025 [email protected]
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/routers/__init__.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# Disclaimer: this code is for learning and research purposes only. Users must observe the following principles:
# 1. It must not be used for any commercial purpose.
# 2. Usage must comply with the target platform's terms of service and robots.txt rules.
# 3. Do not crawl at large scale or disrupt the platform's operation.
# 4. Keep request rates reasonable to avoid placing unnecessary load on the target platform.
# 5. It must not be used for any illegal or improper purpose.
#
# See the LICENSE file in the project root for the full license terms.
# By using this code you agree to abide by the principles above and all terms of the LICENSE.

from .crawler import router as crawler_router
from .data import router as data_router
from .websocket import router as websocket_router

__all__ = ["crawler_router", "data_router", "websocket_router"]

api/routers/crawler.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2025 [email protected]
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/routers/crawler.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# Disclaimer: this code is for learning and research purposes only. Users must observe the following principles:
# 1. It must not be used for any commercial purpose.
# 2. Usage must comply with the target platform's terms of service and robots.txt rules.
# 3. Do not crawl at large scale or disrupt the platform's operation.
# 4. Keep request rates reasonable to avoid placing unnecessary load on the target platform.
# 5. It must not be used for any illegal or improper purpose.
#
# See the LICENSE file in the project root for the full license terms.
# By using this code you agree to abide by the principles above and all terms of the LICENSE.

from fastapi import APIRouter, HTTPException

from ..schemas import CrawlerStartRequest, CrawlerStatusResponse
from ..services import crawler_manager

router = APIRouter(prefix="/crawler", tags=["crawler"])


@router.post("/start")
async def start_crawler(request: CrawlerStartRequest):
    """Start a crawler task"""
    success = await crawler_manager.start(request)
    if not success:
        # Handle concurrent/duplicate requests: if the process is already running, return 400 instead of 500
        if crawler_manager.process and crawler_manager.process.poll() is None:
            raise HTTPException(status_code=400, detail="Crawler is already running")
        raise HTTPException(status_code=500, detail="Failed to start crawler")

    return {"status": "ok", "message": "Crawler started successfully"}


@router.post("/stop")
async def stop_crawler():
    """Stop the crawler task"""
    success = await crawler_manager.stop()
    if not success:
        # Handle concurrent/duplicate requests: if the process has exited or does not exist, return 400 instead of 500
        if not crawler_manager.process or crawler_manager.process.poll() is not None:
            raise HTTPException(status_code=400, detail="No crawler is running")
        raise HTTPException(status_code=500, detail="Failed to stop crawler")

    return {"status": "ok", "message": "Crawler stopped successfully"}


@router.get("/status", response_model=CrawlerStatusResponse)
async def get_crawler_status():
    """Get crawler status"""
    return crawler_manager.get_status()


@router.get("/logs")
async def get_logs(limit: int = 100):
    """Get the most recent logs"""
    logs = crawler_manager.logs[-limit:] if limit > 0 else crawler_manager.logs
    return {"logs": [log.model_dump() for log in logs]}
