Skip to content

Commit 0b1f7d2

Browse files
CLI: global config, tqdm UX, vision dedupe+batch, shell completions
- Config: ~/.config/docproc/docproc.yml in search path; docproc init-config populates from .env
- CLI: tqdm progress bar, single-line output, quiet logs (httpx/openai/vision_llm), colors, full output path
- Spinner: background thread so spinner rotates constantly during extraction
- Vision: dedupe images by content hash (skip logos/repeats); batch 4-5 images per request; 3 batches in parallel; timeout per image
- Completions: docproc completions [bash|zsh]; shtab + tqdm deps
- Progress: bar only advances (no backward jump); refine phase shows 100% + refining

Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 3443794 commit 0b1f7d2

File tree

6 files changed

+429
-37
lines changed

6 files changed

+429
-37
lines changed

docproc/bin/cli.py

Lines changed: 151 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,13 @@
33
import argparse
44
import logging
55
import sys
6+
import threading
7+
import time
68
from pathlib import Path
79

10+
import shtab
11+
from tqdm import tqdm
12+
813
from docproc.doc.loaders import get_supported_extensions
914

1015
logging.basicConfig(
@@ -13,6 +18,59 @@
1318
logger = logging.getLogger(__name__)
1419

1520

21+
def _run_init_config():
    """Populate ~/.config/docproc/docproc.yml from a .env file (one-time).

    Parses ``docproc init-config [--env PATH]`` from ``sys.argv[2:]``, loads
    that .env file into the process environment, and writes a config built
    from the resulting environment variables. The AI provider is chosen by
    the first of AZURE_OPENAI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY that
    is set; if none is set, a local Ollama entry is written instead. The
    database section uses pgvector when DATABASE_URL is set, otherwise the
    in-memory provider. An existing config file is overwritten.

    Because the file can contain API keys and a database connection string,
    it is created owner-read/write only (mode 0o600) and written as UTF-8.

    Returns:
        0 after the file has been written.
    """
    import os
    import yaml
    from dotenv import load_dotenv

    parser = argparse.ArgumentParser(prog="docproc init-config")
    parser.add_argument("--env", type=str, default=".env", help="Path to .env file")
    args = parser.parse_args(sys.argv[2:])
    load_dotenv(args.env)

    cfg_dir = Path.home() / ".config" / "docproc"
    cfg_dir.mkdir(parents=True, exist_ok=True)
    out_path = cfg_dir / "docproc.yml"

    # Key insertion order is the order written to the file (sort_keys=False).
    raw = {}
    if os.getenv("AZURE_OPENAI_API_KEY"):
        raw["ai_providers"] = [
            {
                "provider": "azure",
                "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
                "base_url": os.getenv("AZURE_OPENAI_ENDPOINT"),
                "default_model": os.getenv("AZURE_OPENAI_DEPLOYMENT", "gpt-4o"),
                "default_vision_model": os.getenv("AZURE_OPENAI_DEPLOYMENT", "gpt-4o"),
                "extra": {
                    "azure_deployment": os.getenv("AZURE_OPENAI_DEPLOYMENT"),
                    "azure_embedding_deployment": os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT"),
                    "azure_vision_endpoint": os.getenv("AZURE_VISION_ENDPOINT"),
                },
            }
        ]
        raw["primary_ai"] = "azure"
    elif os.getenv("OPENAI_API_KEY"):
        raw["ai_providers"] = [{"provider": "openai", "api_key": os.getenv("OPENAI_API_KEY")}]
        raw["primary_ai"] = "openai"
    elif os.getenv("ANTHROPIC_API_KEY"):
        raw["ai_providers"] = [{"provider": "anthropic", "api_key": os.getenv("ANTHROPIC_API_KEY")}]
        raw["primary_ai"] = "anthropic"
    else:
        raw["ai_providers"] = [
            {"provider": "ollama", "base_url": "http://localhost:11434", "default_vision_model": "llava"}
        ]
        raw["primary_ai"] = "ollama"

    if os.getenv("DATABASE_URL"):
        raw["database"] = {"provider": "pgvector", "connection_string": os.getenv("DATABASE_URL")}
    else:
        raw["database"] = {"provider": "memory"}
    raw["rag"] = {"backend": "clara", "top_k": 5}
    raw["ingest"] = {"use_vision": True, "use_llm_refine": True}

    # Create the file owner-only instead of with the umask default, since it
    # may hold secrets.
    fd = os.open(out_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        # The mode passed to os.open only applies when the file is created;
        # tighten a pre-existing file too, before any secret is written.
        try:
            os.chmod(out_path, 0o600)
        except OSError as exc:
            # Best effort: some filesystems do not support permission bits.
            logger.warning("Could not restrict permissions on %s: %s", out_path, exc)
        yaml.dump(raw, f, default_flow_style=False, sort_keys=False)
    logger.info("Wrote %s", out_path)
    return 0
72+
73+
1674
def parse_args():
1775
parser = argparse.ArgumentParser(
1876
description="Extract document to markdown (vision + optional LLM refine)"
@@ -26,7 +84,31 @@ def parse_args():
2684
return parser.parse_args()
2785

2886

87+
def _get_completion_parser():
    """Build the argparse parser that shtab reads to generate completions.

    Declares the options --file/-f, -o/--output, --config and -v/--verbose.
    The three path-valued options are tagged with ``shtab.FILE`` so the
    generated script completes them as file names.

    Returns:
        The configured ``argparse.ArgumentParser`` (prog ``docproc``).
    """
    completion_parser = argparse.ArgumentParser(prog="docproc")

    input_opt = completion_parser.add_argument("--file", "-f", help="Input document")
    input_opt.complete = shtab.FILE

    output_opt = completion_parser.add_argument("-o", "--output", help="Output .md path")
    output_opt.complete = shtab.FILE

    config_opt = completion_parser.add_argument("--config", help="Config file path")
    config_opt.complete = shtab.FILE

    completion_parser.add_argument("-v", "--verbose", action="store_true")
    return completion_parser
95+
96+
97+
def _run_completions():
    """Print a shell completion script. Usage: docproc completions [bash|zsh].

    The shell name is read from ``sys.argv[2]``. A missing argument, or any
    value other than "bash" or "zsh", falls back to "bash".

    Returns:
        0 after the script has been printed to stdout.
    """
    if len(sys.argv) > 2:
        requested = sys.argv[2]
    else:
        requested = "bash"
    # Unrecognized shell names are not an error; they get the bash script.
    target_shell = requested if requested in ("bash", "zsh") else "bash"
    script = shtab.complete(_get_completion_parser(), shell=target_shell)
    print(script)
    return 0
105+
106+
29107
def main():
108+
if len(sys.argv) > 1 and sys.argv[1] == "init-config":
109+
return _run_init_config()
110+
if len(sys.argv) > 1 and sys.argv[1] == "completions":
111+
return _run_completions()
30112
args = parse_args()
31113
if args.verbose:
32114
logging.getLogger().setLevel(logging.DEBUG)
@@ -60,15 +142,76 @@ def main():
60142
else:
61143
load_config()
62144

63-
def progress(page: int, total: int, message: str):
64-
logger.info("%s (%d/%d)", message, page, total)
145+
# Single-line UX: suppress all logs during extraction
146+
_log = logging.getLogger
147+
_quiet = [_log("httpx"), _log("httpcore"), _log("openai"), _log("docproc.extractors.vision_llm")]
148+
_saved = [g.level for g in _quiet]
149+
for g in _quiet:
150+
g.setLevel(logging.WARNING)
151+
152+
pbar: tqdm | None = None
153+
spin_idx = [0]
154+
SPINNER = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"
155+
_C = "\033[36m"
156+
_G = "\033[32m"
157+
_Y = "\033[33m"
158+
_R = "\033[0m"
159+
stop_spinner = threading.Event()
160+
spinner_thread: threading.Thread | None = None
161+
162+
def spinner_loop():
163+
while not stop_spinner.wait(0.08):
164+
if pbar is None:
165+
continue
166+
spin_idx[0] = (spin_idx[0] + 1) % len(SPINNER)
167+
pbar.set_description_str(f"{_C}docproc {SPINNER[spin_idx[0]]}{_R}")
168+
pbar.refresh()
65169

66-
full_text = extract_document_to_text(
67-
input_path, progress_callback=progress
68-
)
69-
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
70-
Path(output_path).write_text(full_text, encoding="utf-8")
71-
logger.info("Wrote %s", output_path)
170+
def progress(page: int, total: int, message: str):
171+
nonlocal pbar, spinner_thread
172+
if total == 1 and "Refining" in message:
173+
if pbar is not None:
174+
pbar.n = pbar.total
175+
pbar.set_postfix_str("refining…", refresh=False)
176+
pbar.refresh()
177+
return
178+
if pbar is None:
179+
pbar = tqdm(
180+
total=max(1, total),
181+
unit="",
182+
desc=f"{_C}docproc {SPINNER[0]}{_R}",
183+
bar_format=f"{{desc}} {_G}{{bar}}{_R} {_Y}{{n_fmt}}/{{total_fmt}}{_R} {{postfix}}",
184+
dynamic_ncols=True,
185+
leave=False,
186+
mininterval=0.2,
187+
maxinterval=0.5,
188+
)
189+
spinner_thread = threading.Thread(target=spinner_loop, daemon=True)
190+
spinner_thread.start()
191+
# Only advance; parallel batches complete out of order
192+
new_n = min(page, pbar.total - 1) if pbar.total else page
193+
if new_n > pbar.n:
194+
pbar.n = new_n
195+
pbar.set_postfix_str(message[:40].strip(), refresh=False)
196+
pbar.refresh()
197+
198+
try:
199+
full_text = extract_document_to_text(
200+
input_path, progress_callback=progress
201+
)
202+
finally:
203+
stop_spinner.set()
204+
if spinner_thread is not None:
205+
spinner_thread.join(timeout=0.5)
206+
for g, level in zip(_quiet, _saved):
207+
g.setLevel(level)
208+
if pbar is not None:
209+
pbar.close()
210+
out = Path(output_path)
211+
out.parent.mkdir(parents=True, exist_ok=True)
212+
out.write_text(full_text, encoding="utf-8")
213+
full_path = str(out.resolve())
214+
sys.stderr.write(f"\r\033[K{_G}✓ Wrote{_R} {full_path}\n")
72215
return 0
73216
except Exception as e:
74217
logger.error("Failed to extract: %s", e, exc_info=args.verbose)

docproc/config/loader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def load_config(path: Optional[str] = None) -> DocProcConfig:
3535
os.getenv("DOCPROC_CONFIG"),
3636
"docproc.yaml",
3737
"docproc.yml",
38+
os.path.expanduser("~/.config/docproc/docproc.yml"),
3839
os.path.expanduser("~/.docproc.yaml"),
3940
]
4041
config_path = None

0 commit comments

Comments
 (0)