Skip to content

Commit 59727d6

Browse files
committed
refactor: 更新文档,优化批量上传和解析脚本说明;移除不再支持的转换功能
1 parent d56cec5 commit 59727d6

File tree

5 files changed

+3
-188
lines changed

5 files changed

+3
-188
lines changed

docs/advanced/document-processing.md

Lines changed: 1 addition & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ docker compose up paddlex --build
6060

6161
## 批量处理脚本
6262

63-
系统提供便捷的批量处理脚本,支持文件上传和解析操作
63+
系统提供便捷的批量处理脚本,用于高效批量上传文档
6464

6565
### 文件上传脚本
6666

@@ -90,26 +90,6 @@ uv run scripts/batch_upload.py upload \
9090

9191
提示:系统按“内容哈希”进行去重;同一知识库已存在相同内容的文件会被拒绝(409)。
9292

93-
### 文件解析脚本
94-
95-
使用 `scripts/batch_upload.py trans` 将文件解析为 Markdown:
96-
97-
```bash
98-
# 批量解析文档
99-
uv run scripts/batch_upload.py trans \
100-
--db-id your_kb_id \
101-
--directory path/to/your/data \
102-
--output-dir path/to/output_markdown \
103-
--pattern "*.docx" \
104-
--base-url http://127.0.0.1:5050/api \
105-
--username your_username \
106-
--password your_password \
107-
--concurrency 4 \
108-
--recursive
109-
```
110-
111-
**输出结果**: 解析后的 Markdown 文件将保存到指定输出目录。
112-
11393
### 脚本功能
11494

11595
- **进度跟踪**: 实时显示处理进度

docs/changelog/faq.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232

3333
- 批量上传与转换示例?
3434
- 上传入库:`uv run scripts/batch_upload.py upload --db-id <id> --directory <dir> --username <u> --password <p> --base-url http://127.0.0.1:5050/api`
35-
- 转 Markdown:`uv run scripts/batch_upload.py trans --db-id <id> --directory <dir> --username <u> --password <p>`
3635
- 参考:高级配置 → 文档解析
3736

3837
- 登录失败被锁定?

docs/intro/knowledge-base.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ LightRAG 知识库可在知识库详情中可视化,但不支持在侧边栏
5858
### 批量脚本
5959

6060
- 上传并入库:参见 `scripts/batch_upload.py upload`
61-
- 转换为 Markdown:参见 `scripts/batch_upload.py trans`
6261

6362
## 知识图谱
6463

scripts/batch_upload.py

Lines changed: 0 additions & 165 deletions
Original file line numberDiff line numberDiff line change
@@ -230,74 +230,6 @@ def save_processed_files(record_file: pathlib.Path, processed_files: set[str]):
230230
console.print(f"[bold red]Error: Could not save processed files record: {e}[/bold red]")
231231

232232

233-
async def convert_to_markdown(
234-
client: httpx.AsyncClient,
235-
base_url: str,
236-
db_id: str,
237-
server_file_path: str,
238-
) -> str | None:
239-
"""Calls the file-to-markdown conversion endpoint."""
240-
try:
241-
response = await client.post(
242-
f"{base_url}/knowledge/files/markdown",
243-
json={"db_id": db_id, "file_path": server_file_path},
244-
timeout=600, # 10 minutes timeout for conversion
245-
)
246-
response.raise_for_status()
247-
result = response.json()
248-
if result.get("status") == "success":
249-
return result.get("markdown_content")
250-
else:
251-
console.print(f"[bold red]Failed to convert {server_file_path}: {result.get('message')}[/bold red]")
252-
return None
253-
except httpx.HTTPStatusError as e:
254-
console.print(
255-
f"[bold red]Failed to convert {server_file_path}: {e.response.status_code} - {e.response.text}[/bold red]"
256-
)
257-
return None
258-
except httpx.RequestError as e:
259-
console.print(f"[bold red]Request failed for {server_file_path}: {e}[/bold red]")
260-
return None
261-
262-
263-
async def trans_worker(
264-
semaphore: asyncio.Semaphore,
265-
client: httpx.AsyncClient,
266-
base_url: str,
267-
db_id: str,
268-
file_path: pathlib.Path,
269-
output_dir: pathlib.Path,
270-
progress: Progress,
271-
task_id: int,
272-
):
273-
"""A worker task that uploads a file and converts it to markdown."""
274-
async with semaphore:
275-
# 1. Upload file
276-
server_file_path = await upload_file(client, base_url, db_id, file_path)
277-
if not server_file_path:
278-
progress.update(task_id, advance=1, postfix=f"[red]Upload failed for {file_path.name}[/red]")
279-
return file_path, "upload_failed"
280-
281-
# 2. Convert file to markdown
282-
markdown_content = await convert_to_markdown(client, base_url, db_id, server_file_path)
283-
if not markdown_content:
284-
progress.update(task_id, advance=1, postfix=f"[yellow]Conversion failed for {file_path.name}[/yellow]")
285-
return file_path, "conversion_failed"
286-
287-
# 3. Save markdown content to output directory
288-
try:
289-
output_path = output_dir / file_path.with_suffix(".md").name
290-
output_path.parent.mkdir(parents=True, exist_ok=True)
291-
with open(output_path, "w", encoding="utf-8") as f:
292-
f.write(markdown_content)
293-
progress.update(task_id, advance=1, postfix=f"[green]Converted {file_path.name}[/green]")
294-
return file_path, "success"
295-
except OSError as e:
296-
console.print(f"[bold red]Error saving markdown for {file_path.name}: {e}[/bold red]")
297-
progress.update(task_id, advance=1, postfix=f"[red]Save failed for {file_path.name}[/red]")
298-
return file_path, "save_failed"
299-
300-
301233
@app.command()
302234
def upload(
303235
db_id: str = typer.Option(..., help="The ID of the knowledge base."),
@@ -447,91 +379,6 @@ async def run():
447379
asyncio.run(run())
448380

449381

450-
@app.command()
451-
def trans(
452-
db_id: str = typer.Option(..., help="The ID of the knowledge base (for temporary file upload)."),
453-
directory: pathlib.Path = typer.Option(
454-
..., help="The directory containing files to convert.", exists=True, file_okay=False
455-
),
456-
output_dir: pathlib.Path = typer.Option("output_markdown", help="The directory to save converted markdown files."),
457-
pattern: str = typer.Option("*.docx", help="The glob pattern for files to convert (e.g., '*.pdf', '*.docx')."),
458-
base_url: str = typer.Option("http://127.0.0.1:5050/api", help="The base URL of the API server."),
459-
username: str = typer.Option(..., help="Admin username for login."),
460-
password: str = typer.Option(..., help="Admin password for login."),
461-
concurrency: int = typer.Option(4, help="The number of concurrent conversion tasks."),
462-
recursive: bool = typer.Option(False, "--recursive", "-r", help="Search for files recursively in subdirectories."),
463-
):
464-
"""
465-
Batch convert files to Markdown format.
466-
"""
467-
console.print(f"[bold green]Starting batch conversion for files in: {directory}[/bold green]")
468-
output_dir.mkdir(parents=True, exist_ok=True)
469-
470-
# Discover files
471-
glob_method = directory.rglob if recursive else directory.glob
472-
files_to_convert = list(glob_method(pattern))
473-
if not files_to_convert:
474-
console.print(f"[bold yellow]No files found in '{directory}' matching '{pattern}'. Aborting.[/bold yellow]")
475-
raise typer.Exit()
476-
477-
# 过滤掉macos的隐藏文件
478-
files_to_convert = [f for f in files_to_convert if not f.name.startswith("._")]
479-
480-
console.print(f"Found {len(files_to_convert)} files to convert.")
481-
482-
async def run():
483-
async with httpx.AsyncClient() as client:
484-
# Login
485-
token = await login(client, base_url, username, password)
486-
if not token:
487-
raise typer.Exit(code=1)
488-
489-
client.headers = {"Authorization": f"Bearer {token}"}
490-
491-
# Setup concurrency and tasks
492-
semaphore = asyncio.Semaphore(concurrency)
493-
tasks = []
494-
495-
with Progress(
496-
SpinnerColumn(),
497-
TextColumn("[progress.description]{task.description}"),
498-
BarColumn(),
499-
TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
500-
TimeElapsedColumn(),
501-
TextColumn("{task.fields[postfix]}"),
502-
console=console,
503-
transient=True,
504-
) as progress:
505-
task_id = progress.add_task("[bold blue]Converting...", total=len(files_to_convert), postfix="")
506-
507-
for file_path in files_to_convert:
508-
task = asyncio.create_task(
509-
trans_worker(semaphore, client, base_url, db_id, file_path, output_dir, progress, task_id)
510-
)
511-
tasks.append(task)
512-
513-
results = await asyncio.gather(*tasks)
514-
515-
# Summarize results
516-
successful_files = []
517-
failed_files = []
518-
519-
for file_path, status in results:
520-
if status == "success":
521-
successful_files.append(file_path)
522-
else:
523-
failed_files.append((file_path, status))
524-
525-
console.print("[bold green]Batch conversion complete.[/bold green]")
526-
console.print(f" - [green]Successful:[/green] {len(successful_files)}")
527-
console.print(f" - [red]Failed:[/red] {len(failed_files)}")
528-
if failed_files:
529-
for f, status in failed_files:
530-
console.print(f" - {f} (Reason: {status})")
531-
532-
asyncio.run(run())
533-
534-
535382
"""
536383
# Example for upload
537384
uv run scripts/batch_upload.py upload \
@@ -544,18 +391,6 @@ async def run():
544391
--concurrency 4 \
545392
--recursive \
546393
--record-file scripts/tmp/batch_processed_files.txt
547-
548-
# Example for trans
549-
uv run scripts/batch_upload.py trans \
550-
--db-id your_kb_id \
551-
--directory path/to/your/data \
552-
--output-dir path/to/output_markdown \
553-
--pattern "*.docx" \
554-
--base-url http://127.0.0.1:5050/api \
555-
--username your_username \
556-
--password your_password \
557-
--concurrency 4 \
558-
--recursive
559394
"""
560395
if __name__ == "__main__":
561396
app()

src/storage/minio/client.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,10 @@ def __init__(self):
5151
host_ip = host_ip.split("://")[-1]
5252
host_ip = host_ip.rstrip("/")
5353
self.public_endpoint = f"{host_ip}:9000"
54+
logger.debug(f"Docker MinIOClient public_endpoint: {self.public_endpoint}")
5455
else:
5556
self.public_endpoint = "localhost:9000"
57+
logger.debug(f"Default_client: {self.public_endpoint}")
5658

5759
@property
5860
def client(self) -> Minio:

0 commit comments

Comments
 (0)