Skip to content

Commit 9e43cbb

Browse files
author
Gerit Wagner
committed
revise cli
1 parent 576b174 commit 9e43cbb

File tree

3 files changed

+185
-7
lines changed

3 files changed

+185
-7
lines changed

bib_dedupe/bib_dedupe.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@
1313
import bib_dedupe.maybe_cases
1414
import bib_dedupe.merge
1515
import bib_dedupe.prep
16-
import bib_dedupe.sim
1716
from bib_dedupe import verbose_print
17+
from bib_dedupe.constants.fields import ORIGIN
1818

1919

2020
def prep(
@@ -148,6 +148,7 @@ def merge(
148148
matched_df: typing.Optional[pd.DataFrame] = None,
149149
duplicate_id_sets: typing.Optional[list] = None,
150150
verbosity_level: typing.Optional[int] = None,
151+
origin_column: str = ORIGIN,
151152
) -> pd.DataFrame:
152153
"""
153154
Merges duplicate records in the given dataframe.
@@ -174,7 +175,9 @@ def merge(
174175
matched_df = match(blocked_df)
175176
duplicate_id_sets = bib_dedupe.cluster.get_connected_components(matched_df)
176177

177-
return bib_dedupe.merge.merge(records_df, duplicate_id_sets=duplicate_id_sets)
178+
return bib_dedupe.merge.merge(
179+
records_df, duplicate_id_sets=duplicate_id_sets, origin_column=origin_column
180+
)
178181

179182

180183
def _download_file_from_github(url: str, local_path: str) -> None:

bib_dedupe/cli.py

Lines changed: 180 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
1+
#! /usr/bin/env python
12
"""Command-line interface for :mod:`bib_dedupe`."""
23
from __future__ import annotations
34

45
import argparse
6+
import ast
7+
import json
8+
import re
59
import sys
10+
import typing
611
from dataclasses import dataclass
712
from importlib import metadata
813
from pathlib import Path
@@ -20,6 +25,15 @@
2025
from bib_dedupe.bib_dedupe import merge
2126
from bib_dedupe.bib_dedupe import prep
2227

28+
# --- colrev availability check (exit otherwise) ------------------------------
29+
try:
30+
import colrev.loader.load_utils as _colrev_load
31+
import colrev.writer.write_utils as _colrev_write
32+
33+
_HAS_COLREV = True
34+
except Exception: # broad on purpose: any import issue means it's unavailable
35+
_HAS_COLREV = False
36+
2337

2438
class CLIError(Exception):
2539
"""Raised for command-line usage errors."""
@@ -36,6 +50,20 @@ class RuntimeOptions:
3650
DataFrame = pd.DataFrame
3751

3852

53+
# ----------------------------- I/O helpers --------------------------------- #
54+
def read_records_colrev(path: Path) -> DataFrame:
    """Load bibliographic records via colrev and return a pandas DataFrame.

    The colrev loader yields a dict keyed by record ID; those IDs become the
    DataFrame index (``orient="index"``), one row per record.

    Raises:
        CLIError: if colrev fails to load the file or the loaded records
            cannot be converted to a DataFrame.
    """
    try:
        records = _colrev_load.load(filename=str(path))
    except Exception as exc:
        raise CLIError(f"Failed to load records via colrev from {path}: {exc}") from exc
    try:
        # Dict keys (record IDs) become the index; fields become columns.
        return pd.DataFrame.from_dict(records, orient="index")
    except Exception as exc:
        raise CLIError(f"Failed to convert colrev records to DataFrame: {exc}") from exc
65+
66+
3967
def read_df(path: Path) -> DataFrame:
4068
"""Load a dataframe from *path* based on its file extension."""
4169

@@ -69,6 +97,139 @@ def _ensure_output_path(path: Path) -> Path:
6997
return path.with_suffix(".csv")
7098

7199

100+
def parse_colrev_origin(value: typing.Any) -> typing.List[str]:
    """Return a clean list like ['dblp.bib/000591', 'files.bib/000037'] from messy inputs."""

    def _ordered_unique(items: typing.Iterable[str]) -> typing.List[str]:
        # Keep first-seen order while dropping blanks and repeats.
        seen: set = set()
        unique: typing.List[str] = []
        for item in items:
            if item and item not in seen:
                seen.add(item)
                unique.append(item)
        return unique

    # A list just needs stringifying, trimming and deduping.
    if isinstance(value, list):
        return _ordered_unique(str(entry).strip() for entry in value)

    # Anything that is neither list nor string carries no origins.
    if not isinstance(value, str):
        return []

    text = value.strip()
    if not text:
        return []

    # Collapse the broken joiners that bib exports produce into plain commas.
    for glitch in ("'];['", "';['", "']['", "];[", "] ; [", ";"):
        text = text.replace(glitch, ",")

    # Drop any trailing commas left behind.
    while text.endswith(","):
        text = text[:-1].strip()

    # Peel matching outer quotes/braces/brackets, however deeply wrapped
    # (handles "'[...]'", "{[...]}", "{{[...]}}", etc.).
    while len(text) >= 2:
        head, tail = text[0], text[-1]
        quoted = head in "'\"" and tail == head
        wrapped = (head, tail) in (("{", "}"), ("[", "]"))
        if not (quoted or wrapped):
            break
        text = text[1:-1].strip()

    # Prefer a real parser; recurse so list results get normalized/deduped.
    for loader in (json.loads, ast.literal_eval):
        try:
            parsed = loader(text)
        except Exception:
            continue
        if isinstance(parsed, list):
            return parse_colrev_origin(parsed)

    # Last resort: harvest tokens shaped like "source.bib/000123".
    tokens = re.findall(r"[A-Za-z0-9._-]+(?:\.bib)?/[A-Za-z0-9._-]+", text)
    return _ordered_unique(tokens)
161+
162+
163+
def maybe_cast_literal(value):
    """Parse list/dict-ish strings (incl. BibTeX double-braced) into Python objects."""
    # Non-strings are passed through untouched.
    if not isinstance(value, str):
        return value

    candidate = value.strip()

    # A trailing comma is a common .bib export artifact — drop it.
    if candidate.endswith(","):
        candidate = candidate[:-1].strip()

    # Unwrap a fully quoted container, e.g. "'[ ... ]'" or '"{ ... }"',
    # but only when the inside actually looks like a literal container.
    if len(candidate) >= 2 and candidate[0] == candidate[-1] and candidate[0] in "'\"":
        inner = candidate[1:-1].strip()
        inner_is_container = (inner[:1] == "[" and inner[-1:] == "]") or (
            inner[:1] == "{" and inner[-1:] == "}"
        )
        if inner_is_container:
            candidate = inner

    # BibTeX-style double braces {{ ... }} → peel exactly one brace pair.
    if candidate.startswith("{{") and candidate.endswith("}}"):
        candidate = candidate[1:-1].strip()

    # Bail out early when it cannot be a list/dict/set literal.
    braced = candidate.startswith("{") and candidate.endswith("}")
    bracketed = candidate.startswith("[") and candidate.endswith("]")
    if not (braced or bracketed):
        return value

    # Safely parse a Python literal (handles single quotes etc.); never eval().
    try:
        return ast.literal_eval(candidate)
    except Exception:
        return value
199+
200+
201+
def write_records_colrev(df: DataFrame, path: Path) -> None:
    """Persist bibliographic records via colrev writer based on *path* suffix.

    Converts *df* to a dict of records keyed by record ID, normalizes the
    ``colrev_origin`` field, casts literal-looking string values, drops
    empty/NaN scalar fields, then hands the records to the colrev writer.

    Raises:
        CLIError: if any step of the conversion or write fails.
    """
    output_path = _ensure_output_path(path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        # One dict per record, keyed by the DataFrame index (record IDs).
        records = df.to_dict(orient="index")

        for rec_id, record in records.items():
            # Normalize colrev_origin
            if "colrev_origin" in record:
                record["colrev_origin"] = parse_colrev_origin(record["colrev_origin"])
                # keep origin in sync for CoLRev
                # NOTE(review): 'origin' is popped again below before writing —
                # confirm this transient copy is actually required.
                if isinstance(record["colrev_origin"], list):
                    record["origin"] = record["colrev_origin"]

            # Cast any list/dict-looking string values into real objects.
            for key in list(record.keys()):
                record[key] = maybe_cast_literal(record[key])

            # Drop scalar fields that are NaN, empty, or the string "nan";
            # list/dict values are kept regardless.
            for key in list(record.keys()):
                if isinstance(record[key], (list, dict)):
                    continue
                if pd.isnull(record[key]) or record[key] == "" or record[key] == "nan":
                    records[rec_id].pop(key)
            records[rec_id].pop("origin", None)  # remove 'origin' if present

        _colrev_write.write_file(records, filename=str(output_path))
    except Exception as exc:
        raise CLIError(
            f"Failed to write records via colrev to {output_path}: {exc}"
        ) from exc
231+
232+
72233
def write_df(df: DataFrame, path: Path) -> None:
73234
"""Persist *df* to *path*, inferring the format from the file suffix."""
74235

@@ -96,6 +257,7 @@ def write_df(df: DataFrame, path: Path) -> None:
96257
)
97258

98259

260+
# ----------------------------- options helpers ------------------------------ #
99261
def _resolve_verbosity(args: argparse.Namespace) -> Optional[int]:
100262
verbosity_level = getattr(args, "verbosity_level", None)
101263
quiet = getattr(args, "quiet", False)
@@ -128,9 +290,10 @@ def _describe_maybe_cases() -> str:
128290
return "No maybe cases were exported."
129291

130292

293+
# ----------------------------- commands ------------------------------------- #
131294
def run_merge(args: argparse.Namespace) -> int:
132295
options = _collect_runtime_options(args)
133-
records_df = read_df(Path(args.input))
296+
records_df = read_records_colrev(Path(args.input))
134297
n_input = len(records_df)
135298

136299
matched_df: Optional[DataFrame] = None
@@ -174,14 +337,16 @@ def run_merge(args: argparse.Namespace) -> int:
174337
records_df,
175338
duplicate_id_sets=duplicate_id_sets,
176339
verbosity_level=options.verbosity_level,
340+
origin_column="colrev_origin",
177341
)
178342
else:
179343
merged_df = merge(
180344
records_df,
181345
verbosity_level=options.verbosity_level,
346+
origin_column="colrev_origin",
182347
)
183348

184-
write_df(merged_df, Path(args.output))
349+
write_records_colrev(merged_df, Path(args.output))
185350

186351
if args.stats:
187352
stats_lines = [
@@ -203,6 +368,7 @@ def run_merge(args: argparse.Namespace) -> int:
203368

204369
def run_prep(args: argparse.Namespace) -> int:
205370
options = _collect_runtime_options(args)
371+
# prep produces an internal artifact used by block; keep non-colrev IO here
206372
records_df = read_df(Path(args.input))
207373
prepared_df = prep(
208374
records_df, verbosity_level=options.verbosity_level, cpu=options.cpu
@@ -234,7 +400,7 @@ def run_match(args: argparse.Namespace) -> int:
234400
raise CLIError(
235401
"--export-maybe requires --records to provide the original records."
236402
)
237-
records_df = read_df(Path(args.records))
403+
records_df = read_records_colrev(Path(args.records))
238404
export_maybe(
239405
records_df,
240406
matched_df=matched_df,
@@ -247,7 +413,7 @@ def run_match(args: argparse.Namespace) -> int:
247413

248414
def run_export_maybe(args: argparse.Namespace) -> int:
249415
options = _collect_runtime_options(args)
250-
records_df = read_df(Path(args.records))
416+
records_df = read_records_colrev(Path(args.records))
251417
matches_df = read_df(Path(args.matches))
252418
export_maybe(
253419
records_df,
@@ -279,6 +445,7 @@ def run_version(_: argparse.Namespace) -> int:
279445
return 0
280446

281447

448+
# ----------------------------- CLI wiring ----------------------------------- #
282449
def build_parser() -> argparse.ArgumentParser:
283450
parser = argparse.ArgumentParser(
284451
prog="bib-dedupe",
@@ -415,6 +582,15 @@ def build_parser() -> argparse.ArgumentParser:
415582

416583

417584
def main(argv: Optional[Sequence[str]] = None) -> int:
585+
# Enforce colrev requirement globally (exit otherwise)
586+
if not _HAS_COLREV:
587+
print(
588+
"Error: 'colrev' is required but not installed. "
589+
"Please install it (e.g., `pip install colrev`) and retry.",
590+
file=sys.stderr,
591+
)
592+
return 2
593+
418594
parser = build_parser()
419595
args = parser.parse_args(argv)
420596

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
"""Integration tests for the bib-dedupe command line interface."""
2-
32
from __future__ import annotations
43

54
import os

0 commit comments

Comments
 (0)