@@ -230,74 +230,6 @@ def save_processed_files(record_file: pathlib.Path, processed_files: set[str]):
230230 console .print (f"[bold red]Error: Could not save processed files record: { e } [/bold red]" )
231231
232232
async def convert_to_markdown(
    client: httpx.AsyncClient,
    base_url: str,
    db_id: str,
    server_file_path: str,
) -> str | None:
    """Call the file-to-markdown conversion endpoint.

    Posts ``db_id`` and ``server_file_path`` as a JSON body to
    ``{base_url}/knowledge/files/markdown`` and returns the
    ``markdown_content`` field of the reply when its ``status`` field is
    ``"success"``.

    Args:
        client: HTTP client used to send the request.
        base_url: API base URL; the endpoint path is appended directly to it.
        db_id: Knowledge base ID sent as ``db_id`` in the request body.
        server_file_path: Path sent as ``file_path`` in the request body.

    Returns:
        The value of ``markdown_content`` from a successful reply, or ``None``
        when the request fails, the server returns an error status, the body
        is not valid JSON, or the reply does not report success. Every
        failure is printed to the console; none is raised to the caller.
    """
    try:
        response = await client.post(
            # The path must follow the base URL with no whitespace in between,
            # otherwise the request URL is malformed.
            f"{base_url}/knowledge/files/markdown",
            json={"db_id": db_id, "file_path": server_file_path},
            timeout=600,  # 10 minutes timeout for conversion
        )
        response.raise_for_status()
        result = response.json()
    except httpx.HTTPStatusError as e:
        console.print(
            f"[bold red]Failed to convert { server_file_path } : { e .response .status_code } - { e .response .text } [/bold red]"
        )
        return None
    except httpx.RequestError as e:
        console.print(f"[bold red]Request failed for { server_file_path } : { e } [/bold red]")
        return None
    except ValueError as e:
        # A body that is not valid JSON makes response.json() raise a
        # ValueError subclass; report it like the other failures instead of
        # letting it escape and abort the caller's whole batch.
        console.print(f"[bold red]Invalid JSON response for {server_file_path}: {e}[/bold red]")
        return None

    # Only a JSON object can carry the status/markdown_content fields.
    if isinstance(result, dict) and result.get("status") == "success":
        return result.get("markdown_content")

    message = result.get("message") if isinstance(result, dict) else result
    console.print(f"[bold red]Failed to convert { server_file_path } : { message } [/bold red]")
    return None
261-
262-
async def trans_worker(
    semaphore: asyncio.Semaphore,
    client: httpx.AsyncClient,
    base_url: str,
    db_id: str,
    file_path: pathlib.Path,
    output_dir: pathlib.Path,
    progress: Progress,
    task_id: int,
):
    """Upload one file, convert it to markdown and save the result.

    The whole pipeline runs while holding ``semaphore``. Whatever the
    outcome, the progress task ``task_id`` is advanced by one and its
    ``postfix`` field is set to a short status message.

    Returns:
        A ``(file_path, status)`` tuple where ``status`` is one of
        ``"success"``, ``"upload_failed"``, ``"conversion_failed"`` or
        ``"save_failed"``.
    """

    def report(postfix: str, status: str):
        # Advance the shared progress bar and build the worker's result.
        progress.update(task_id, advance=1, postfix=postfix)
        return file_path, status

    async with semaphore:
        # Step 1: push the local file to the server.
        remote_path = await upload_file(client, base_url, db_id, file_path)
        if not remote_path:
            return report(f"[red]Upload failed for { file_path .name } [/red]", "upload_failed")

        # Step 2: ask the server to convert the uploaded file.
        md_text = await convert_to_markdown(client, base_url, db_id, remote_path)
        if not md_text:
            return report(f"[yellow]Conversion failed for { file_path .name } [/yellow]", "conversion_failed")

        # Step 3: write the markdown next to the other outputs, named after
        # the source file with a ".md" suffix.
        try:
            target = output_dir / file_path.with_suffix(".md").name
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(md_text, encoding="utf-8")
            return report(f"[green]Converted { file_path .name } [/green]", "success")
        except OSError as e:
            console.print(f"[bold red]Error saving markdown for { file_path .name } : { e } [/bold red]")
            return report(f"[red]Save failed for { file_path .name } [/red]", "save_failed")
299-
300-
301233@app .command ()
302234def upload (
303235 db_id : str = typer .Option (..., help = "The ID of the knowledge base." ),
@@ -447,91 +379,6 @@ async def run():
447379 asyncio .run (run ())
448380
449381
# NOTE: the docstring below is kept short and unchanged on purpose; it is
# presumably what the CLI shows as this command's help text (verify against
# the Typer version in use). Implementation notes live in comments instead.
@app.command()
def trans(
    db_id: str = typer.Option(..., help="The ID of the knowledge base (for temporary file upload)."),
    directory: pathlib.Path = typer.Option(
        ..., help="The directory containing files to convert.", exists=True, file_okay=False
    ),
    output_dir: pathlib.Path = typer.Option("output_markdown", help="The directory to save converted markdown files."),
    pattern: str = typer.Option("*.docx", help="The glob pattern for files to convert (e.g., '*.pdf', '*.docx')."),
    base_url: str = typer.Option("http://127.0.0.1:5050/api", help="The base URL of the API server."),
    username: str = typer.Option(..., help="Admin username for login."),
    password: str = typer.Option(..., help="Admin password for login."),
    # min=1: a value of 0 would create Semaphore(0) and block every worker forever.
    concurrency: int = typer.Option(4, min=1, help="The number of concurrent conversion tasks."),
    recursive: bool = typer.Option(False, "--recursive", "-r", help="Search for files recursively in subdirectories."),
):
    """
    Batch convert files to Markdown format.
    """
    console.print(f"[bold green]Starting batch conversion for files in: { directory } [/bold green]")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Discover files. macOS "._*" hidden files are dropped *before* the
    # emptiness check so a directory holding only such files aborts early
    # instead of logging in and converting nothing.
    glob_method = directory.rglob if recursive else directory.glob
    files_to_convert = [f for f in glob_method(pattern) if not f.name.startswith("._")]
    if not files_to_convert:
        console.print(f"[bold yellow]No files found in '{ directory } ' matching '{ pattern } '. Aborting.[/bold yellow]")
        raise typer.Exit()

    console.print(f"Found { len (files_to_convert )} files to convert.")

    async def run():
        async with httpx.AsyncClient() as client:
            # Login; without a token nothing else can work, so exit non-zero.
            token = await login(client, base_url, username, password)
            if not token:
                raise typer.Exit(code=1)

            # The header value must not carry trailing whitespace after the token.
            client.headers = {"Authorization": f"Bearer {token}"}

            # The semaphore caps how many workers run their pipeline at once.
            semaphore = asyncio.Semaphore(concurrency)

            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
                TimeElapsedColumn(),
                TextColumn("{task.fields[postfix]}"),
                console=console,
                transient=True,
            ) as progress:
                task_id = progress.add_task("[bold blue]Converting...", total=len(files_to_convert), postfix="")

                tasks = [
                    asyncio.create_task(
                        trans_worker(semaphore, client, base_url, db_id, file_path, output_dir, progress, task_id)
                    )
                    for file_path in files_to_convert
                ]

                # return_exceptions=True: an unexpected error in one worker is
                # recorded as that file's failure instead of aborting the run
                # and losing the summary for every other file.
                results = await asyncio.gather(*tasks, return_exceptions=True)

            # Summarize results. gather() preserves task order, so results
            # line up one-to-one with files_to_convert.
            successful_files = []
            failed_files = []

            for file_path, outcome in zip(files_to_convert, results):
                if isinstance(outcome, BaseException):
                    failed_files.append((file_path, f"error: {outcome!r}"))
                    continue
                _, status = outcome
                if status == "success":
                    successful_files.append(file_path)
                else:
                    failed_files.append((file_path, status))

            console.print("[bold green]Batch conversion complete.[/bold green]")
            console.print(f" - [green]Successful:[/green] { len (successful_files )} ")
            console.print(f" - [red]Failed:[/red] { len (failed_files )} ")
            if failed_files:
                for f, status in failed_files:
                    console.print(f" - { f } (Reason: { status } )")

    asyncio.run(run())
533-
534-
535382"""
536383# Example for upload
537384uv run scripts/batch_upload.py upload \
@@ -544,18 +391,6 @@ async def run():
544391 --concurrency 4 \
545392 --recursive \
546393 --record-file scripts/tmp/batch_processed_files.txt
547-
548- # Example for trans
549- uv run scripts/batch_upload.py trans \
550- --db-id your_kb_id \
551- --directory path/to/your/data \
552- --output-dir path/to/output_markdown \
553- --pattern "*.docx" \
554- --base-url http://127.0.0.1:5050/api \
555- --username your_username \
556- --password your_password \
557- --concurrency 4 \
558- --recursive
559394"""
560395if __name__ == "__main__" :
561396 app ()
0 commit comments