Skip to content

Commit 27ae831

Browse files
committed
Remove README CLI section
1 parent 57e1171 commit 27ae831

File tree

4 files changed

+608
-12
lines changed

4 files changed

+608
-12
lines changed

bib_dedupe/cli.py

Lines changed: 369 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,376 @@
1-
import click
1+
"""Command-line interface for :mod:`bib_dedupe`."""
22

3-
import bib_dedupe.debug
3+
from __future__ import annotations
44

5+
import argparse
6+
import sys
7+
from dataclasses import dataclass
8+
from importlib import metadata
9+
from pathlib import Path
10+
from typing import Optional, Sequence
511

6-
@click.group()
7-
def main() -> None:
8-
"""A simple CLI for bib_dedupe."""
9-
pass
12+
import pandas as pd
1013

14+
from bib_dedupe import cluster as _cluster
15+
from bib_dedupe import maybe_cases
16+
from bib_dedupe.bib_dedupe import (
17+
block,
18+
export_maybe,
19+
import_maybe,
20+
match,
21+
merge,
22+
prep,
23+
)
1124

12-
@main.command()
13-
@click.pass_context
14-
def debug(ctx: click.Context) -> None:
15-
bib_dedupe.debug.debug()
1625

26+
class CLIError(Exception):
    """Signal a user-facing command-line usage error."""
28+
29+
30+
@dataclass
class RuntimeOptions:
    """Flags shared by most subcommands and forwarded to library calls.

    cpu: number of CPUs requested (-1 selects the library default).
    verbosity_level: resolved verbosity, or None to use the library default.
    """

    cpu: int
    verbosity_level: Optional[int]
36+
37+
38+
DataFrame = pd.DataFrame
39+
40+
41+
def read_df(path: Path) -> DataFrame:
    """Read a dataframe from *path*, picking the parser by file extension.

    Supports ``.csv``, ``.parquet``/``.pq`` and ``.json``. Raises CLIError
    for a missing file, unreadable content, or an unsupported extension.
    """

    if not path.exists():
        raise CLIError(f"Input file not found: {path}")

    ext = path.suffix.lower()
    try:
        if ext == ".csv":
            # keep_default_na=False keeps empty cells as "" instead of NaN.
            return pd.read_csv(path, keep_default_na=False, low_memory=False)
        if ext in (".parquet", ".pq"):
            try:
                return pd.read_parquet(path)
            except ImportError as exc:  # pragma: no cover - depends on optional backend
                raise CLIError(
                    "Parquet support requires an optional dependency such as 'pyarrow'."
                ) from exc
        if ext == ".json":
            return pd.read_json(path)
    except ValueError as exc:
        raise CLIError(f"Failed to read {path}: {exc}") from exc

    raise CLIError(
        "Unsupported file extension for input. Supported extensions: .csv, .parquet, .json"
    )
66+
67+
68+
def _ensure_output_path(path: Path) -> Path:
69+
if path.suffix:
70+
return path
71+
return path.with_suffix(".csv")
72+
73+
74+
def write_df(df: DataFrame, path: Path) -> None:
    """Write *df* to *path*, choosing the format from the file suffix.

    A missing suffix defaults to ``.csv``; parent directories are created.
    Raises CLIError on parquet backend failures or unsupported suffixes.
    """

    target = _ensure_output_path(path)
    target.parent.mkdir(parents=True, exist_ok=True)

    ext = target.suffix.lower()
    if ext == ".csv":
        df.to_csv(target, index=False)
        return
    if ext in (".parquet", ".pq"):
        try:
            df.to_parquet(target, index=False)
        except Exception as exc:  # pragma: no cover - backend availability specific
            raise CLIError(f"Failed to write parquet file {target}: {exc}") from exc
        return
    if ext == ".json":
        df.to_json(target, orient="records", indent=2)
        return

    raise CLIError(
        "Unsupported file extension for output. Supported extensions: .csv, .parquet, .json"
    )
97+
98+
99+
def _resolve_verbosity(args: argparse.Namespace) -> Optional[int]:
100+
verbosity_level = getattr(args, "verbosity_level", None)
101+
quiet = getattr(args, "quiet", False)
102+
verbose = getattr(args, "verbose", False)
103+
104+
if quiet and verbose:
105+
raise CLIError("--quiet and --verbose cannot be used together.")
106+
107+
if quiet:
108+
if verbosity_level is not None:
109+
raise CLIError("--quiet cannot be combined with --verbosity-level.")
110+
return 0
111+
if verbose:
112+
if verbosity_level is not None:
113+
raise CLIError("--verbose cannot be combined with --verbosity-level.")
114+
return 2
115+
return verbosity_level
116+
117+
118+
def _collect_runtime_options(args: argparse.Namespace) -> RuntimeOptions:
    """Bundle the shared --cpu and verbosity flags into RuntimeOptions."""
    return RuntimeOptions(
        cpu=getattr(args, "cpu", -1),
        verbosity_level=_resolve_verbosity(args),
    )
122+
123+
124+
def _describe_maybe_cases() -> str:
    """Report where the maybe-cases file was written, if it exists."""
    export_path = Path(maybe_cases.MAYBE_CASES_FILEPATH)
    if not export_path.exists():
        return "No maybe cases were exported."
    return f"Maybe cases exported to: {export_path.resolve()}"
129+
130+
131+
def run_merge(args: argparse.Namespace) -> int:
    """Run the end-to-end deduplication workflow for the ``merge`` command.

    Three modes, selected by flags:
    * ``--export-maybe``: prep/block/match, export borderline pairs for
      manual review, then exit without merging.
    * ``--import-maybe``: prep/block/match, overlay the reviewed decisions,
      cluster connected duplicates, and merge with those clusters.
    * default: hand the raw records straight to ``merge``.

    Returns the process exit code (0 on success).
    """

    options = _collect_runtime_options(args)
    records_df = read_df(Path(args.input))
    n_input = len(records_df)

    # Only populated on the prep/block/match paths; consumed by --stats below.
    matched_df: Optional[DataFrame] = None
    duplicate_id_sets: Optional[list] = None
    pairs_df: Optional[DataFrame] = None

    if args.export_maybe:
        prep_df = prep(records_df, verbosity_level=options.verbosity_level, cpu=options.cpu)
        pairs_df = block(prep_df, verbosity_level=options.verbosity_level, cpu=options.cpu)
        matched_df = match(pairs_df, verbosity_level=options.verbosity_level, cpu=options.cpu)
        export_maybe(
            records_df,
            matched_df=matched_df,
            verbosity_level=options.verbosity_level,
        )
        print(_describe_maybe_cases())
        print("Review the exported maybe cases and rerun with --import-maybe to apply decisions.")
        # Deliberately no merge: the user must review the export first.
        return 0

    if args.import_maybe:
        prep_df = prep(records_df, verbosity_level=options.verbosity_level, cpu=options.cpu)
        pairs_df = block(prep_df, verbosity_level=options.verbosity_level, cpu=options.cpu)
        matched_df = match(pairs_df, verbosity_level=options.verbosity_level, cpu=options.cpu)
        # Overlay the manually reviewed maybe decisions on the match results.
        matched_df = import_maybe(matched_df, verbosity_level=options.verbosity_level)
        duplicate_id_sets = _cluster.get_connected_components(matched_df)
        merged_df = merge(
            records_df,
            duplicate_id_sets=duplicate_id_sets,
            verbosity_level=options.verbosity_level,
        )
    else:
        # No manual decisions: merge() is given only the raw records.
        merged_df = merge(
            records_df,
            verbosity_level=options.verbosity_level,
        )

    write_df(merged_df, Path(args.output))

    if args.stats:
        stats_lines = [
            f"Input records: {n_input}",
            f"Merged records: {len(merged_df)}",
        ]
        if pairs_df is not None:
            stats_lines.append(f"Blocked pairs: {len(pairs_df)}")
        # NOTE(review): assumes match() labels pairs in a "duplicate_label"
        # column with values "duplicate"/"maybe" -- confirm against the library.
        if matched_df is not None and "duplicate_label" in matched_df.columns:
            label_counts = matched_df["duplicate_label"].value_counts()
            n_true = int(label_counts.get("duplicate", 0))
            n_maybe = int(label_counts.get("maybe", 0))
            stats_lines.append(f"Confirmed matches: {n_true}")
            stats_lines.append(f"Maybe matches: {n_maybe}")
        print(" | ".join(stats_lines))

    return 0
188+
189+
190+
def run_prep(args: argparse.Namespace) -> int:
    """Handle the ``prep`` subcommand: preprocess records for blocking."""
    options = _collect_runtime_options(args)
    prepared_df = prep(
        read_df(Path(args.input)),
        verbosity_level=options.verbosity_level,
        cpu=options.cpu,
    )
    write_df(prepared_df, Path(args.output))
    return 0
196+
197+
198+
def run_block(args: argparse.Namespace) -> int:
    """Handle the ``block`` subcommand: generate candidate pairs."""
    options = _collect_runtime_options(args)
    candidate_pairs = block(
        read_df(Path(args.input)),
        verbosity_level=options.verbosity_level,
        cpu=options.cpu,
    )
    write_df(candidate_pairs, Path(args.output))
    return 0
204+
205+
206+
def run_match(args: argparse.Namespace) -> int:
    """Handle the ``match`` subcommand: classify candidate pairs.

    With --export-maybe the original records (--records) are required so
    borderline cases can be exported for manual review after matching.
    """
    options = _collect_runtime_options(args)
    scored_df = match(
        read_df(Path(args.input)),
        verbosity_level=options.verbosity_level,
        cpu=options.cpu,
    )
    write_df(scored_df, Path(args.output))

    if args.export_maybe:
        if not args.records:
            raise CLIError("--export-maybe requires --records to provide the original records.")
        original_records = read_df(Path(args.records))
        export_maybe(
            original_records,
            matched_df=scored_df,
            verbosity_level=options.verbosity_level,
        )
        print(_describe_maybe_cases())

    return 0
224+
225+
226+
def run_export_maybe(args: argparse.Namespace) -> int:
    """Handle ``export-maybe``: write borderline cases for manual review."""
    options = _collect_runtime_options(args)
    export_maybe(
        read_df(Path(args.records)),
        matched_df=read_df(Path(args.matches)),
        verbosity_level=options.verbosity_level,
    )
    print(_describe_maybe_cases())
    return 0
237+
238+
239+
def run_import_maybe(args: argparse.Namespace) -> int:
    """Handle ``import-maybe``: apply reviewed decisions to match results.

    Writes the updated matches to --output when given; otherwise streams
    them as CSV to stdout.
    """
    options = _collect_runtime_options(args)
    matches_df = read_df(Path(args.input))
    updated = import_maybe(matches_df, verbosity_level=options.verbosity_level)

    if args.output:
        write_df(updated, Path(args.output))
    else:
        updated.to_csv(sys.stdout, index=False)
    return 0
249+
250+
251+
def run_version(_: argparse.Namespace) -> int:
    """Print the installed ``bib-dedupe`` version, or 'unknown'."""
    try:
        print(metadata.version("bib-dedupe"))
    except metadata.PackageNotFoundError:
        # Running from a source checkout without installation.
        print("unknown")
    return 0
258+
259+
260+
def build_parser() -> argparse.ArgumentParser:
    """Assemble the argparse command tree for the bib-dedupe CLI."""
    root = argparse.ArgumentParser(
        prog="bib-dedupe",
        description="Deduplicate bibliographic records from the command line.",
    )

    # Shared flag groups, attached to subcommands via `parents=`.
    verbosity_flags = argparse.ArgumentParser(add_help=False)
    verbosity_flags.add_argument("--verbosity-level", type=int, help="Override verbosity level.")
    verbosity_flags.add_argument("-q", "--quiet", action="store_true", help="Silence verbose output.")
    verbosity_flags.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity level.")

    cpu_flags = argparse.ArgumentParser(add_help=False)
    cpu_flags.add_argument("--cpu", type=int, default=-1, help="Number of CPUs to use (default: -1 for auto).")

    commands = root.add_subparsers(dest="command")

    p_merge = commands.add_parser(
        "merge",
        parents=[verbosity_flags, cpu_flags],
        help="Run the full deduplication workflow and write merged records.",
    )
    p_merge.add_argument("-i", "--input", required=True, help="Input records file path.")
    p_merge.add_argument("-o", "--output", required=True, help="Output file path for merged records.")
    p_merge.add_argument("--stats", action="store_true", help="Print a short statistics summary.")
    p_merge.add_argument(
        "--export-maybe",
        action="store_true",
        help="Export potential duplicates for manual review and exit.",
    )
    p_merge.add_argument(
        "--import-maybe",
        action="store_true",
        help="Re-import maybe decisions before merging.",
    )
    p_merge.set_defaults(func=run_merge)

    p_prep = commands.add_parser(
        "prep",
        parents=[verbosity_flags, cpu_flags],
        help="Preprocess records before blocking.",
    )
    p_prep.add_argument("-i", "--input", required=True, help="Input records file path.")
    p_prep.add_argument("-o", "--output", required=True, help="Output file path for prepared records.")
    p_prep.set_defaults(func=run_prep)

    p_block = commands.add_parser(
        "block",
        parents=[verbosity_flags, cpu_flags],
        help="Generate candidate record pairs for matching.",
    )
    p_block.add_argument("-i", "--input", required=True, help="Input preprocessed records file path.")
    p_block.add_argument("-o", "--output", required=True, help="Output file path for blocked pairs.")
    p_block.set_defaults(func=run_block)

    p_match = commands.add_parser(
        "match",
        parents=[verbosity_flags, cpu_flags],
        help="Score candidate pairs and classify matches.",
    )
    p_match.add_argument("-i", "--input", required=True, help="Input candidate pairs file path.")
    p_match.add_argument("-o", "--output", required=True, help="Output file path for match decisions.")
    p_match.add_argument(
        "--export-maybe",
        action="store_true",
        help="Export maybe cases immediately after matching.",
    )
    p_match.add_argument(
        "--records",
        help="Records file path required when using --export-maybe.",
    )
    p_match.set_defaults(func=run_match)

    p_export = commands.add_parser(
        "export-maybe",
        parents=[verbosity_flags],
        help="Export maybe cases for manual review.",
    )
    p_export.add_argument("--records", required=True, help="Path to the records file.")
    p_export.add_argument("--matches", required=True, help="Path to the matches file.")
    p_export.set_defaults(func=run_export_maybe)

    p_import = commands.add_parser(
        "import-maybe",
        parents=[verbosity_flags],
        help="Apply manual maybe decisions to match results.",
    )
    p_import.add_argument("-i", "--input", required=True, help="Matches file containing maybe decisions.")
    p_import.add_argument("-o", "--output", help="Optional output path for updated matches.")
    p_import.set_defaults(func=run_import_maybe)

    p_version = commands.add_parser("version", help="Print the installed bib-dedupe version.")
    p_version.set_defaults(func=run_version)

    return root
354+
355+
356+
def main(argv: Optional[Sequence[str]] = None) -> int:
    """CLI entry point; returns the process exit code.

    Exit codes: 0 on success (or when help is shown), 2 for usage errors
    (CLIError), 1 for unexpected failures.
    """
    parser = build_parser()
    parsed = parser.parse_args(argv)

    # No subcommand selected: show help instead of failing.
    if not hasattr(parsed, "func"):
        parser.print_help()
        return 0

    try:
        return parsed.func(parsed)
    except CLIError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 2
    except Exception as exc:  # pragma: no cover - safeguard for unexpected failures
        print(f"Unexpected error: {exc}", file=sys.stderr)
        return 1
372+
373+
374+
# Script entry: exit with main()'s return code.
if __name__ == "__main__":  # pragma: no cover - manual execution
    sys.exit(main())
17376

18-
if __name__ == "__main__":
19-
main()

0 commit comments

Comments
 (0)