33import argparse
44import logging
55import sys
6+ import threading
7+ import time
68from pathlib import Path
79
10+ import shtab
11+ from tqdm import tqdm
12+
813from docproc .doc .loaders import get_supported_extensions
914
1015logging .basicConfig (
1318logger = logging .getLogger (__name__ )
1419
1520
def _run_init_config():
    """Populate ~/.config/docproc/docproc.yml from a .env file (one-time setup).

    Selects the first AI provider whose API key is present in the environment
    (Azure, then OpenAI, then Anthropic, falling back to a local Ollama
    default), wires in a pgvector database when DATABASE_URL is set, and
    writes the assembled configuration as YAML.

    Returns:
        0 on success (exit-code convention shared with main()).
    """
    import os

    import yaml
    from dotenv import load_dotenv

    parser = argparse.ArgumentParser(prog="docproc init-config")
    parser.add_argument("--env", type=str, default=".env", help="Path to .env file")
    # sys.argv[1] is the "init-config" subcommand itself; parse only what follows.
    args = parser.parse_args(sys.argv[2:])
    load_dotenv(args.env)

    cfg_dir = Path.home() / ".config" / "docproc"
    cfg_dir.mkdir(parents=True, exist_ok=True)
    # NOTE(review): an existing docproc.yml is silently overwritten — confirm
    # that is intended for a "one-time" command.
    out_path = cfg_dir / "docproc.yml"

    raw = {}
    if os.getenv("AZURE_OPENAI_API_KEY"):
        # Hoist the repeated lookup; keep getenv's missing-vs-empty semantics
        # (the "gpt-4o" default applies only when the variable is unset).
        deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT")
        default_model = deployment if deployment is not None else "gpt-4o"
        raw["ai_providers"] = [
            {
                "provider": "azure",
                "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
                "base_url": os.getenv("AZURE_OPENAI_ENDPOINT"),
                "default_model": default_model,
                "default_vision_model": default_model,
                "extra": {
                    "azure_deployment": deployment,
                    "azure_embedding_deployment": os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT"),
                    "azure_vision_endpoint": os.getenv("AZURE_VISION_ENDPOINT"),
                },
            }
        ]
        raw["primary_ai"] = "azure"
    elif os.getenv("OPENAI_API_KEY"):
        raw["ai_providers"] = [{"provider": "openai", "api_key": os.getenv("OPENAI_API_KEY")}]
        raw["primary_ai"] = "openai"
    elif os.getenv("ANTHROPIC_API_KEY"):
        raw["ai_providers"] = [{"provider": "anthropic", "api_key": os.getenv("ANTHROPIC_API_KEY")}]
        raw["primary_ai"] = "anthropic"
    else:
        # No API key found: default to a local Ollama instance.
        raw["ai_providers"] = [
            {"provider": "ollama", "base_url": "http://localhost:11434", "default_vision_model": "llava"}
        ]
        raw["primary_ai"] = "ollama"
    if os.getenv("DATABASE_URL"):
        raw["database"] = {"provider": "pgvector", "connection_string": os.getenv("DATABASE_URL")}
    else:
        raw["database"] = {"provider": "memory"}
    raw["rag"] = {"backend": "clara", "top_k": 5}
    raw["ingest"] = {"use_vision": True, "use_llm_refine": True}
    # Explicit utf-8 for consistency with the rest of the file's file writes.
    with open(out_path, "w", encoding="utf-8") as f:
        yaml.dump(raw, f, default_flow_style=False, sort_keys=False)
    logger.info("Wrote %s", out_path)
    return 0
72+
73+
1674def parse_args ():
1775 parser = argparse .ArgumentParser (
1876 description = "Extract document to markdown (vision + optional LLM refine)"
@@ -26,7 +84,31 @@ def parse_args():
2684 return parser .parse_args ()
2785
2886
def _get_completion_parser():
    """Build the argparse parser that shtab uses to generate shell completions.

    Mirrors the main extract command's arguments; the three path-valued
    options are tagged with ``shtab.FILE`` so shells complete filenames
    for them.
    """
    completion_parser = argparse.ArgumentParser(prog="docproc")
    # (flags, help text) for every option that should complete to a filename.
    file_options = (
        (("--file", "-f"), "Input document"),
        (("-o", "--output"), "Output .md path"),
        (("--config",), "Config file path"),
    )
    for flags, help_text in file_options:
        action = completion_parser.add_argument(*flags, help=help_text)
        action.complete = shtab.FILE
    completion_parser.add_argument("-v", "--verbose", action="store_true")
    return completion_parser
95+
96+
def _run_completions():
    """Emit a shell completion script to stdout.

    Usage: ``docproc completions [bash|zsh]``; any other (or missing)
    shell name falls back to bash.
    """
    requested = sys.argv[2] if len(sys.argv) > 2 else "bash"
    shell = requested if requested in ("bash", "zsh") else "bash"
    script = shtab.complete(_get_completion_parser(), shell=shell)
    print(script)
    return 0
105+
106+
29107def main ():
108+ if len (sys .argv ) > 1 and sys .argv [1 ] == "init-config" :
109+ return _run_init_config ()
110+ if len (sys .argv ) > 1 and sys .argv [1 ] == "completions" :
111+ return _run_completions ()
30112 args = parse_args ()
31113 if args .verbose :
32114 logging .getLogger ().setLevel (logging .DEBUG )
@@ -60,15 +142,76 @@ def main():
60142 else :
61143 load_config ()
62144
63- def progress (page : int , total : int , message : str ):
64- logger .info ("%s (%d/%d)" , message , page , total )
145+ # Single-line UX: suppress all logs during extraction
146+ _log = logging .getLogger
147+ _quiet = [_log ("httpx" ), _log ("httpcore" ), _log ("openai" ), _log ("docproc.extractors.vision_llm" )]
148+ _saved = [g .level for g in _quiet ]
149+ for g in _quiet :
150+ g .setLevel (logging .WARNING )
151+
152+ pbar : tqdm | None = None
153+ spin_idx = [0 ]
154+ SPINNER = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"
155+ _C = "\033 [36m"
156+ _G = "\033 [32m"
157+ _Y = "\033 [33m"
158+ _R = "\033 [0m"
159+ stop_spinner = threading .Event ()
160+ spinner_thread : threading .Thread | None = None
161+
162+ def spinner_loop ():
163+ while not stop_spinner .wait (0.08 ):
164+ if pbar is None :
165+ continue
166+ spin_idx [0 ] = (spin_idx [0 ] + 1 ) % len (SPINNER )
167+ pbar .set_description_str (f"{ _C } docproc { SPINNER [spin_idx [0 ]]} { _R } " )
168+ pbar .refresh ()
65169
66- full_text = extract_document_to_text (
67- input_path , progress_callback = progress
68- )
69- Path (output_path ).parent .mkdir (parents = True , exist_ok = True )
70- Path (output_path ).write_text (full_text , encoding = "utf-8" )
71- logger .info ("Wrote %s" , output_path )
170+ def progress (page : int , total : int , message : str ):
171+ nonlocal pbar , spinner_thread
172+ if total == 1 and "Refining" in message :
173+ if pbar is not None :
174+ pbar .n = pbar .total
175+ pbar .set_postfix_str ("refining…" , refresh = False )
176+ pbar .refresh ()
177+ return
178+ if pbar is None :
179+ pbar = tqdm (
180+ total = max (1 , total ),
181+ unit = "" ,
182+ desc = f"{ _C } docproc { SPINNER [0 ]} { _R } " ,
183+ bar_format = f"{{desc}} { _G } {{bar}}{ _R } { _Y } {{n_fmt}}/{{total_fmt}}{ _R } {{postfix}}" ,
184+ dynamic_ncols = True ,
185+ leave = False ,
186+ mininterval = 0.2 ,
187+ maxinterval = 0.5 ,
188+ )
189+ spinner_thread = threading .Thread (target = spinner_loop , daemon = True )
190+ spinner_thread .start ()
191+ # Only advance; parallel batches complete out of order
192+ new_n = min (page , pbar .total - 1 ) if pbar .total else page
193+ if new_n > pbar .n :
194+ pbar .n = new_n
195+ pbar .set_postfix_str (message [:40 ].strip (), refresh = False )
196+ pbar .refresh ()
197+
198+ try :
199+ full_text = extract_document_to_text (
200+ input_path , progress_callback = progress
201+ )
202+ finally :
203+ stop_spinner .set ()
204+ if spinner_thread is not None :
205+ spinner_thread .join (timeout = 0.5 )
206+ for g , level in zip (_quiet , _saved ):
207+ g .setLevel (level )
208+ if pbar is not None :
209+ pbar .close ()
210+ out = Path (output_path )
211+ out .parent .mkdir (parents = True , exist_ok = True )
212+ out .write_text (full_text , encoding = "utf-8" )
213+ full_path = str (out .resolve ())
214+ sys .stderr .write (f"\r \033 [K{ _G } ✓ Wrote{ _R } { full_path } \n " )
72215 return 0
73216 except Exception as e :
74217 logger .error ("Failed to extract: %s" , e , exc_info = args .verbose )
0 commit comments