|
1 | | -import click |
| 1 | +"""Command-line interface for :mod:`bib_dedupe`.""" |
2 | 2 |
|
3 | | -import bib_dedupe.debug |
| 3 | +from __future__ import annotations |
4 | 4 |
|
| 5 | +import argparse |
| 6 | +import sys |
| 7 | +from dataclasses import dataclass |
| 8 | +from importlib import metadata |
| 9 | +from pathlib import Path |
| 10 | +from typing import Optional, Sequence |
5 | 11 |
|
6 | | -@click.group() |
7 | | -def main() -> None: |
8 | | - """A simple CLI for bib_dedupe.""" |
9 | | - pass |
| 12 | +import pandas as pd |
10 | 13 |
|
| 14 | +from bib_dedupe import cluster as _cluster |
| 15 | +from bib_dedupe import maybe_cases |
| 16 | +from bib_dedupe.bib_dedupe import ( |
| 17 | + block, |
| 18 | + export_maybe, |
| 19 | + import_maybe, |
| 20 | + match, |
| 21 | + merge, |
| 22 | + prep, |
| 23 | +) |
11 | 24 |
|
12 | | -@main.command() |
13 | | -@click.pass_context |
14 | | -def debug(ctx: click.Context) -> None: |
15 | | - bib_dedupe.debug.debug() |
16 | 25 |
|
class CLIError(Exception):
    """Signal a user-facing command-line error (reported, not traced)."""
| 28 | + |
| 29 | + |
@dataclass
class RuntimeOptions:
    """Bundle of the runtime knobs shared by all subcommands."""

    cpu: int  # number of worker CPUs; -1 lets the library auto-detect
    verbosity_level: Optional[int]  # None means "use the library default"
| 36 | + |
| 37 | + |
| 38 | +DataFrame = pd.DataFrame |
| 39 | + |
| 40 | + |
| 41 | +def read_df(path: Path) -> DataFrame: |
| 42 | + """Load a dataframe from *path* based on its file extension.""" |
| 43 | + |
| 44 | + if not path.exists(): |
| 45 | + raise CLIError(f"Input file not found: {path}") |
| 46 | + |
| 47 | + suffix = path.suffix.lower() |
| 48 | + try: |
| 49 | + if suffix == ".csv": |
| 50 | + return pd.read_csv(path, keep_default_na=False, low_memory=False) |
| 51 | + if suffix in {".parquet", ".pq"}: |
| 52 | + try: |
| 53 | + return pd.read_parquet(path) |
| 54 | + except ImportError as exc: # pragma: no cover - depends on optional backend |
| 55 | + raise CLIError( |
| 56 | + "Parquet support requires an optional dependency such as 'pyarrow'." |
| 57 | + ) from exc |
| 58 | + if suffix == ".json": |
| 59 | + return pd.read_json(path) |
| 60 | + except ValueError as exc: |
| 61 | + raise CLIError(f"Failed to read {path}: {exc}") from exc |
| 62 | + |
| 63 | + raise CLIError( |
| 64 | + "Unsupported file extension for input. Supported extensions: .csv, .parquet, .json" |
| 65 | + ) |
| 66 | + |
| 67 | + |
| 68 | +def _ensure_output_path(path: Path) -> Path: |
| 69 | + if path.suffix: |
| 70 | + return path |
| 71 | + return path.with_suffix(".csv") |
| 72 | + |
| 73 | + |
| 74 | +def write_df(df: DataFrame, path: Path) -> None: |
| 75 | + """Persist *df* to *path*, inferring the format from the file suffix.""" |
| 76 | + |
| 77 | + output_path = _ensure_output_path(path) |
| 78 | + output_path.parent.mkdir(parents=True, exist_ok=True) |
| 79 | + |
| 80 | + suffix = output_path.suffix.lower() |
| 81 | + if suffix == ".csv": |
| 82 | + df.to_csv(output_path, index=False) |
| 83 | + return |
| 84 | + if suffix in {".parquet", ".pq"}: |
| 85 | + try: |
| 86 | + df.to_parquet(output_path, index=False) |
| 87 | + except Exception as exc: # pragma: no cover - backend availability specific |
| 88 | + raise CLIError(f"Failed to write parquet file {output_path}: {exc}") from exc |
| 89 | + return |
| 90 | + if suffix == ".json": |
| 91 | + df.to_json(output_path, orient="records", indent=2) |
| 92 | + return |
| 93 | + |
| 94 | + raise CLIError( |
| 95 | + "Unsupported file extension for output. Supported extensions: .csv, .parquet, .json" |
| 96 | + ) |
| 97 | + |
| 98 | + |
| 99 | +def _resolve_verbosity(args: argparse.Namespace) -> Optional[int]: |
| 100 | + verbosity_level = getattr(args, "verbosity_level", None) |
| 101 | + quiet = getattr(args, "quiet", False) |
| 102 | + verbose = getattr(args, "verbose", False) |
| 103 | + |
| 104 | + if quiet and verbose: |
| 105 | + raise CLIError("--quiet and --verbose cannot be used together.") |
| 106 | + |
| 107 | + if quiet: |
| 108 | + if verbosity_level is not None: |
| 109 | + raise CLIError("--quiet cannot be combined with --verbosity-level.") |
| 110 | + return 0 |
| 111 | + if verbose: |
| 112 | + if verbosity_level is not None: |
| 113 | + raise CLIError("--verbose cannot be combined with --verbosity-level.") |
| 114 | + return 2 |
| 115 | + return verbosity_level |
| 116 | + |
| 117 | + |
def _collect_runtime_options(args: argparse.Namespace) -> RuntimeOptions:
    """Bundle the shared ``--cpu`` and verbosity options for library calls."""

    return RuntimeOptions(
        cpu=getattr(args, "cpu", -1),
        verbosity_level=_resolve_verbosity(args),
    )
| 122 | + |
| 123 | + |
def _describe_maybe_cases() -> str:
    """Return a one-line status message about the maybe-cases export file."""

    export_path = Path(maybe_cases.MAYBE_CASES_FILEPATH)
    if not export_path.exists():
        return "No maybe cases were exported."
    return f"Maybe cases exported to: {export_path.resolve()}"
| 129 | + |
| 130 | + |
def run_merge(args: argparse.Namespace) -> int:
    """Run the full deduplication workflow for the ``merge`` subcommand.

    Three modes, selected by flags on *args*:
      * ``--export-maybe``: run prep/block/match, export borderline
        ("maybe") cases for manual review, and exit without merging.
      * ``--import-maybe``: run prep/block/match, apply previously
        reviewed maybe decisions, then merge using the resulting clusters.
      * default: call ``merge`` directly on the input records
        (presumably the library runs its own internal pipeline then —
        TODO confirm against bib_dedupe.merge's documentation).

    Returns 0 on success; CLIError propagates to main() for reporting.
    """
    options = _collect_runtime_options(args)
    records_df = read_df(Path(args.input))
    # Captured before merging so --stats can report the reduction.
    n_input = len(records_df)

    # Populated only on the pipeline branches below; --stats checks for None.
    matched_df: Optional[DataFrame] = None
    duplicate_id_sets: Optional[list] = None
    pairs_df: Optional[DataFrame] = None

    if args.export_maybe:
        prep_df = prep(records_df, verbosity_level=options.verbosity_level, cpu=options.cpu)
        pairs_df = block(prep_df, verbosity_level=options.verbosity_level, cpu=options.cpu)
        matched_df = match(pairs_df, verbosity_level=options.verbosity_level, cpu=options.cpu)
        export_maybe(
            records_df,
            matched_df=matched_df,
            verbosity_level=options.verbosity_level,
        )
        print(_describe_maybe_cases())
        print("Review the exported maybe cases and rerun with --import-maybe to apply decisions.")
        # Deliberately stop here: merging happens on a later --import-maybe run.
        return 0

    if args.import_maybe:
        prep_df = prep(records_df, verbosity_level=options.verbosity_level, cpu=options.cpu)
        pairs_df = block(prep_df, verbosity_level=options.verbosity_level, cpu=options.cpu)
        matched_df = match(pairs_df, verbosity_level=options.verbosity_level, cpu=options.cpu)
        # Overlay the manually reviewed decisions onto the automatic matches.
        matched_df = import_maybe(matched_df, verbosity_level=options.verbosity_level)
        duplicate_id_sets = _cluster.get_connected_components(matched_df)
        merged_df = merge(
            records_df,
            duplicate_id_sets=duplicate_id_sets,
            verbosity_level=options.verbosity_level,
        )
    else:
        merged_df = merge(
            records_df,
            verbosity_level=options.verbosity_level,
        )

    write_df(merged_df, Path(args.output))

    if args.stats:
        stats_lines = [
            f"Input records: {n_input}",
            f"Merged records: {len(merged_df)}",
        ]
        # pairs_df/matched_df are only available on the --import-maybe branch
        # (the --export-maybe branch returned earlier).
        if pairs_df is not None:
            stats_lines.append(f"Blocked pairs: {len(pairs_df)}")
        if matched_df is not None and "duplicate_label" in matched_df.columns:
            label_counts = matched_df["duplicate_label"].value_counts()
            n_true = int(label_counts.get("duplicate", 0))
            n_maybe = int(label_counts.get("maybe", 0))
            stats_lines.append(f"Confirmed matches: {n_true}")
            stats_lines.append(f"Maybe matches: {n_maybe}")
        print(" | ".join(stats_lines))

    return 0
| 188 | + |
| 189 | + |
def run_prep(args: argparse.Namespace) -> int:
    """Run the ``prep`` subcommand: preprocess records and write the result."""

    opts = _collect_runtime_options(args)
    prepared = prep(
        read_df(Path(args.input)),
        verbosity_level=opts.verbosity_level,
        cpu=opts.cpu,
    )
    write_df(prepared, Path(args.output))
    return 0
| 196 | + |
| 197 | + |
def run_block(args: argparse.Namespace) -> int:
    """Run the ``block`` subcommand: produce candidate pairs for matching."""

    opts = _collect_runtime_options(args)
    candidate_pairs = block(
        read_df(Path(args.input)),
        verbosity_level=opts.verbosity_level,
        cpu=opts.cpu,
    )
    write_df(candidate_pairs, Path(args.output))
    return 0
| 204 | + |
| 205 | + |
def run_match(args: argparse.Namespace) -> int:
    """Run the ``match`` subcommand, optionally exporting maybe cases."""

    opts = _collect_runtime_options(args)
    candidate_pairs = read_df(Path(args.input))
    decisions = match(candidate_pairs, verbosity_level=opts.verbosity_level, cpu=opts.cpu)
    write_df(decisions, Path(args.output))

    if args.export_maybe:
        # Exporting maybe cases needs the original records next to the decisions.
        if not args.records:
            raise CLIError("--export-maybe requires --records to provide the original records.")
        export_maybe(
            read_df(Path(args.records)),
            matched_df=decisions,
            verbosity_level=opts.verbosity_level,
        )
        print(_describe_maybe_cases())

    return 0
| 224 | + |
| 225 | + |
def run_export_maybe(args: argparse.Namespace) -> int:
    """Run the ``export-maybe`` subcommand from existing records and matches."""

    opts = _collect_runtime_options(args)
    export_maybe(
        read_df(Path(args.records)),
        matched_df=read_df(Path(args.matches)),
        verbosity_level=opts.verbosity_level,
    )
    print(_describe_maybe_cases())
    return 0
| 237 | + |
| 238 | + |
def run_import_maybe(args: argparse.Namespace) -> int:
    """Run the ``import-maybe`` subcommand and emit the updated matches."""

    opts = _collect_runtime_options(args)
    updated = import_maybe(read_df(Path(args.input)), verbosity_level=opts.verbosity_level)

    if args.output:
        write_df(updated, Path(args.output))
    else:
        # No output path given: stream CSV to stdout for shell pipelines.
        updated.to_csv(sys.stdout, index=False)
    return 0
| 249 | + |
| 250 | + |
def run_version(_: argparse.Namespace) -> int:
    """Print the installed ``bib-dedupe`` distribution version."""

    try:
        print(metadata.version("bib-dedupe"))
    except metadata.PackageNotFoundError:
        # Running from a source checkout without an installed distribution.
        print("unknown")
    return 0
| 258 | + |
| 259 | + |
def build_parser() -> argparse.ArgumentParser:
    """Construct the top-level ``bib-dedupe`` parser with all subcommands."""

    parser = argparse.ArgumentParser(
        prog="bib-dedupe",
        description="Deduplicate bibliographic records from the command line.",
    )

    # Shared option groups, attached to subcommands via ``parents=``.
    common = argparse.ArgumentParser(add_help=False)
    common.add_argument("--verbosity-level", type=int, help="Override verbosity level.")
    common.add_argument("-q", "--quiet", action="store_true", help="Silence verbose output.")
    common.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity level.")

    cpu = argparse.ArgumentParser(add_help=False)
    cpu.add_argument("--cpu", type=int, default=-1, help="Number of CPUs to use (default: -1 for auto).")

    subparsers = parser.add_subparsers(dest="command")

    def add_io(sub: argparse.ArgumentParser, input_help: str, output_help: str) -> None:
        # Every file-transforming subcommand takes the same -i/-o pair.
        sub.add_argument("-i", "--input", required=True, help=input_help)
        sub.add_argument("-o", "--output", required=True, help=output_help)

    merge_cmd = subparsers.add_parser(
        "merge",
        parents=[common, cpu],
        help="Run the full deduplication workflow and write merged records.",
    )
    add_io(merge_cmd, "Input records file path.", "Output file path for merged records.")
    merge_cmd.add_argument("--stats", action="store_true", help="Print a short statistics summary.")
    merge_cmd.add_argument(
        "--export-maybe",
        action="store_true",
        help="Export potential duplicates for manual review and exit.",
    )
    merge_cmd.add_argument(
        "--import-maybe",
        action="store_true",
        help="Re-import maybe decisions before merging.",
    )
    merge_cmd.set_defaults(func=run_merge)

    prep_cmd = subparsers.add_parser(
        "prep",
        parents=[common, cpu],
        help="Preprocess records before blocking.",
    )
    add_io(prep_cmd, "Input records file path.", "Output file path for prepared records.")
    prep_cmd.set_defaults(func=run_prep)

    block_cmd = subparsers.add_parser(
        "block",
        parents=[common, cpu],
        help="Generate candidate record pairs for matching.",
    )
    add_io(block_cmd, "Input preprocessed records file path.", "Output file path for blocked pairs.")
    block_cmd.set_defaults(func=run_block)

    match_cmd = subparsers.add_parser(
        "match",
        parents=[common, cpu],
        help="Score candidate pairs and classify matches.",
    )
    add_io(match_cmd, "Input candidate pairs file path.", "Output file path for match decisions.")
    match_cmd.add_argument(
        "--export-maybe",
        action="store_true",
        help="Export maybe cases immediately after matching.",
    )
    match_cmd.add_argument(
        "--records",
        help="Records file path required when using --export-maybe.",
    )
    match_cmd.set_defaults(func=run_match)

    export_cmd = subparsers.add_parser(
        "export-maybe",
        parents=[common],
        help="Export maybe cases for manual review.",
    )
    export_cmd.add_argument("--records", required=True, help="Path to the records file.")
    export_cmd.add_argument("--matches", required=True, help="Path to the matches file.")
    export_cmd.set_defaults(func=run_export_maybe)

    import_cmd = subparsers.add_parser(
        "import-maybe",
        parents=[common],
        help="Apply manual maybe decisions to match results.",
    )
    import_cmd.add_argument("-i", "--input", required=True, help="Matches file containing maybe decisions.")
    import_cmd.add_argument("-o", "--output", help="Optional output path for updated matches.")
    import_cmd.set_defaults(func=run_import_maybe)

    version_cmd = subparsers.add_parser("version", help="Print the installed bib-dedupe version.")
    version_cmd.set_defaults(func=run_version)

    return parser
| 354 | + |
| 355 | + |
def main(argv: Optional[Sequence[str]] = None) -> int:
    """CLI entry point: parse *argv* and dispatch to the chosen subcommand.

    Returns a process exit code: 0 on success (or bare invocation, which
    prints help), 2 for usage errors, 1 for unexpected failures.
    """

    parser = build_parser()
    args = parser.parse_args(argv)

    # No subcommand selected: show help instead of failing.
    handler = getattr(args, "func", None)
    if handler is None:
        parser.print_help()
        return 0

    try:
        return handler(args)
    except CLIError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 2
    except Exception as exc:  # pragma: no cover - safeguard for unexpected failures
        print(f"Unexpected error: {exc}", file=sys.stderr)
        return 1
| 372 | + |
| 373 | + |
if __name__ == "__main__":  # pragma: no cover - manual execution
    raise SystemExit(main())
17 | 376 |
|
18 | | -if __name__ == "__main__": |
19 | | - main() |
|
0 commit comments