1+ #! /usr/bin/env python
12"""Command-line interface for :mod:`bib_dedupe`."""
23from __future__ import annotations
34
45import argparse
6+ import ast
7+ import json
8+ import re
59import sys
10+ import typing
611from dataclasses import dataclass
712from importlib import metadata
813from pathlib import Path
2025from bib_dedupe .bib_dedupe import merge
2126from bib_dedupe .bib_dedupe import prep
2227
28+ # --- colrev availability check (exit otherwise) ------------------------------
29+ try :
30+ import colrev .loader .load_utils as _colrev_load
31+ import colrev .writer .write_utils as _colrev_write
32+
33+ _HAS_COLREV = True
34+ except Exception : # broad on purpose: any import issue means it's unavailable
35+ _HAS_COLREV = False
36+
2337
2438class CLIError (Exception ):
2539 """Raised for command-line usage errors."""
@@ -36,6 +50,20 @@ class RuntimeOptions:
3650DataFrame = pd .DataFrame
3751
3852
53+ # ----------------------------- I/O helpers --------------------------------- #
def read_records_colrev(path: Path) -> DataFrame:
    """Read bibliographic records through colrev's loader into a DataFrame.

    The loaded record dict (keyed by record ID) becomes the DataFrame index.

    Raises:
        CLIError: if colrev cannot load *path*, or if the loaded records
            cannot be converted into a pandas DataFrame.
    """
    try:
        loaded = _colrev_load.load(filename=str(path))
    except Exception as exc:
        raise CLIError(f"Failed to load records via colrev from {path}: {exc}") from exc

    try:
        frame = pd.DataFrame.from_dict(loaded, orient="index")
    except Exception as exc:
        raise CLIError(f"Failed to convert colrev records to DataFrame: {exc}") from exc
    return frame
66+
3967def read_df (path : Path ) -> DataFrame :
4068 """Load a dataframe from *path* based on its file extension."""
4169
@@ -69,6 +97,139 @@ def _ensure_output_path(path: Path) -> Path:
6997 return path .with_suffix (".csv" )
7098
7199
def _unique_in_order(items: typing.Iterable[typing.Any]) -> typing.List[str]:
    """Stringify and strip *items*, dropping empties and duplicates (order kept)."""
    seen: typing.Set[str] = set()
    out: typing.List[str] = []
    for item in items:
        token = str(item).strip()
        if token and token not in seen:
            seen.add(token)
            out.append(token)
    return out


def parse_colrev_origin(value: typing.Any) -> typing.List[str]:
    """Return a clean list like ['dblp.bib/000591', 'files.bib/000037'] from messy inputs.

    Accepts an actual list (normalized and deduplicated), or a string that may
    carry bib-export artifacts: broken joiners (``'];['``), semicolon
    separators, trailing commas, and layers of quotes/braces/brackets.
    Anything that is neither a list nor a string yields ``[]``.
    """
    # Already a list -> normalize & dedupe.
    if isinstance(value, list):
        return _unique_in_order(value)

    # Not a string -> nothing usable.
    if not isinstance(value, str):
        return []

    s = value.strip()
    if not s:
        return []

    # Fix common broken joiners/separators from bib exports.
    s = (
        s.replace("'];['", ",")
        .replace("';['", ",")
        .replace("']['", ",")
        .replace("];[", ",")
        .replace("] ; [", ",")
        .replace(";", ",")
    )

    # Strip trailing commas.
    while s.endswith(","):
        s = s[:-1].strip()

    # Repeatedly strip outer quotes/braces/brackets
    # (handles "'[...]'", "{[...]}", "{{[...]}}", etc.).
    while len(s) >= 2 and (
        (s[0] in "'\"" and s[-1] == s[0])
        or (s[0] == "{" and s[-1] == "}")
        or (s[0] == "[" and s[-1] == "]")
    ):
        s = s[1:-1].strip()

    # Try proper parsers first.
    for parser in (json.loads, ast.literal_eval):
        try:
            parsed = parser(s)
        except Exception:
            continue
        if isinstance(parsed, list):
            return _unique_in_order(parsed)

    # Last resort: extract tokens that look like "source.bib/000123".
    candidates = re.findall(r"[A-Za-z0-9._-]+(?:\.bib)?/[A-Za-z0-9._-]+", s)
    return _unique_in_order(candidates)
161+
162+
163+ def maybe_cast_literal (value ):
164+ """Parse list/dict-ish strings (incl. BibTeX double-braced) into Python objects."""
165+ if not isinstance (value , str ):
166+ return value
167+
168+ s = value .strip ()
169+
170+ # Trim a trailing comma (common in .bib exports)
171+ if s .endswith ("," ):
172+ s = s [:- 1 ].strip ()
173+
174+ # Unwrap quotes if the whole thing is quoted: "'[... ]'" or '"{ ... }"'
175+ if len (s ) >= 2 and s [0 ] in ("'" , '"' ) and s [- 1 ] == s [0 ]:
176+ inner = s [1 :- 1 ].strip ()
177+ # Only unwrap if the inner looks like a literal container
178+ if (inner .startswith ("[" ) and inner .endswith ("]" )) or (
179+ inner .startswith ("{" ) and inner .endswith ("}" )
180+ ):
181+ s = inner
182+
183+ # Handle BibTeX-style double braces: {{ ... }}
184+ if s .startswith ("{{" ) and s .endswith ("}}" ):
185+ s = s [1 :- 1 ].strip ()
186+
187+ # If it doesn't look like a literal, skip work
188+ if not (
189+ (s .startswith ("{" ) and s .endswith ("}" ))
190+ or (s .startswith ("[" ) and s .endswith ("]" ))
191+ ):
192+ return value
193+
194+ # Safely parse Python literal (handles single quotes etc.). Avoid eval().
195+ try :
196+ return ast .literal_eval (s )
197+ except Exception :
198+ return value
199+
200+
def write_records_colrev(df: DataFrame, path: Path) -> None:
    """Persist bibliographic records via the colrev writer.

    The output format is inferred from the (normalized) *path* suffix. Before
    writing, each record is cleaned:

    * ``colrev_origin`` is normalized into a list of origin tokens,
    * stringified list/dict values are cast back into Python objects,
    * empty/NaN scalar fields are dropped (containers are kept as-is),
    * any ``origin`` key is removed before handing off to colrev.

    Raises:
        CLIError: if record conversion or the colrev write fails.
    """
    output_path = _ensure_output_path(path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        records = df.to_dict(orient="index")

        for record in records.values():
            # Normalize colrev_origin into a clean list
            # (parse_colrev_origin always returns a list).
            if "colrev_origin" in record:
                record["colrev_origin"] = parse_colrev_origin(record["colrev_origin"])

            # Re-materialize list/dict values that survived as strings.
            for key in list(record.keys()):
                record[key] = maybe_cast_literal(record[key])

            # Drop empty scalar fields; skip containers, for which
            # pd.isnull would be ambiguous.
            for key in list(record.keys()):
                value = record[key]
                if isinstance(value, (list, dict)):
                    continue
                if pd.isnull(value) or value == "" or value == "nan":
                    record.pop(key)

            # Fix: an earlier revision mirrored colrev_origin into 'origin'
            # and then unconditionally popped it again -- the dead assignment
            # is removed; the pop stays in case the input carries an
            # 'origin' column.
            record.pop("origin", None)

        _colrev_write.write_file(records, filename=str(output_path))
    except Exception as exc:
        raise CLIError(
            f"Failed to write records via colrev to {output_path}: {exc}"
        ) from exc
231+
232+
72233def write_df (df : DataFrame , path : Path ) -> None :
73234 """Persist *df* to *path*, inferring the format from the file suffix."""
74235
@@ -96,6 +257,7 @@ def write_df(df: DataFrame, path: Path) -> None:
96257 )
97258
98259
260+ # ----------------------------- options helpers ------------------------------ #
99261def _resolve_verbosity (args : argparse .Namespace ) -> Optional [int ]:
100262 verbosity_level = getattr (args , "verbosity_level" , None )
101263 quiet = getattr (args , "quiet" , False )
@@ -128,9 +290,10 @@ def _describe_maybe_cases() -> str:
128290 return "No maybe cases were exported."
129291
130292
293+ # ----------------------------- commands ------------------------------------- #
131294def run_merge (args : argparse .Namespace ) -> int :
132295 options = _collect_runtime_options (args )
133- records_df = read_df (Path (args .input ))
296+ records_df = read_records_colrev (Path (args .input ))
134297 n_input = len (records_df )
135298
136299 matched_df : Optional [DataFrame ] = None
@@ -174,14 +337,16 @@ def run_merge(args: argparse.Namespace) -> int:
174337 records_df ,
175338 duplicate_id_sets = duplicate_id_sets ,
176339 verbosity_level = options .verbosity_level ,
340+ origin_column = "colrev_origin" ,
177341 )
178342 else :
179343 merged_df = merge (
180344 records_df ,
181345 verbosity_level = options .verbosity_level ,
346+ origin_column = "colrev_origin" ,
182347 )
183348
184- write_df (merged_df , Path (args .output ))
349+ write_records_colrev (merged_df , Path (args .output ))
185350
186351 if args .stats :
187352 stats_lines = [
@@ -203,6 +368,7 @@ def run_merge(args: argparse.Namespace) -> int:
203368
204369def run_prep (args : argparse .Namespace ) -> int :
205370 options = _collect_runtime_options (args )
371+ # prep produces an internal artifact used by block; keep non-colrev IO here
206372 records_df = read_df (Path (args .input ))
207373 prepared_df = prep (
208374 records_df , verbosity_level = options .verbosity_level , cpu = options .cpu
@@ -234,7 +400,7 @@ def run_match(args: argparse.Namespace) -> int:
234400 raise CLIError (
235401 "--export-maybe requires --records to provide the original records."
236402 )
237- records_df = read_df (Path (args .records ))
403+ records_df = read_records_colrev (Path (args .records ))
238404 export_maybe (
239405 records_df ,
240406 matched_df = matched_df ,
@@ -247,7 +413,7 @@ def run_match(args: argparse.Namespace) -> int:
247413
248414def run_export_maybe (args : argparse .Namespace ) -> int :
249415 options = _collect_runtime_options (args )
250- records_df = read_df (Path (args .records ))
416+ records_df = read_records_colrev (Path (args .records ))
251417 matches_df = read_df (Path (args .matches ))
252418 export_maybe (
253419 records_df ,
@@ -279,6 +445,7 @@ def run_version(_: argparse.Namespace) -> int:
279445 return 0
280446
281447
448+ # ----------------------------- CLI wiring ----------------------------------- #
282449def build_parser () -> argparse .ArgumentParser :
283450 parser = argparse .ArgumentParser (
284451 prog = "bib-dedupe" ,
@@ -415,6 +582,15 @@ def build_parser() -> argparse.ArgumentParser:
415582
416583
417584def main (argv : Optional [Sequence [str ]] = None ) -> int :
585+ # Enforce colrev requirement globally (exit otherwise)
586+ if not _HAS_COLREV :
587+ print (
588+ "Error: 'colrev' is required but not installed. "
589+ "Please install it (e.g., `pip install colrev`) and retry." ,
590+ file = sys .stderr ,
591+ )
592+ return 2
593+
418594 parser = build_parser ()
419595 args = parser .parse_args (argv )
420596
0 commit comments