Skip to content

Commit 4526d60

Browse files
author
Gerit Wagner
committed
cli: debug
1 parent 4f7e521 commit 4526d60

File tree

1 file changed

+84
-51
lines changed

1 file changed

+84
-51
lines changed

bib_dedupe/debug.py

Lines changed: 84 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,64 +1,97 @@
11
#! /usr/bin/env python
2-
"""Debug for dedupe"""
2+
"""
3+
Debug for dedupe — pick a CSV in the current directory (interactive),
4+
then load records via CoLRev and continue with prep/block/match.
5+
"""
6+
from __future__ import annotations
7+
38
from pathlib import Path
49

10+
import colrev.loader.load_utils
11+
import inquirer
512
import pandas as pd
613

7-
from bib_dedupe import verbose_print
14+
import bib_dedupe.cluster
815
from bib_dedupe.bib_dedupe import block
916
from bib_dedupe.bib_dedupe import match
1017
from bib_dedupe.bib_dedupe import prep
11-
from bib_dedupe.dedupe_benchmark import DedupeBenchmarker
1218

1319

14-
def debug() -> None:
20+
def _select_from_list(
21+
prompt: str, choices: list[str], multi: bool = False
22+
) -> list[str] | str:
23+
"""inquirer-first selection helper; falls back to terminal input."""
1524
try:
16-
df_blocks = pd.read_csv("blocks_FN_list.csv")
17-
except (pd.errors.EmptyDataError, FileNotFoundError):
18-
df_blocks = pd.DataFrame()
25+
if multi:
26+
q = [inquirer.Checkbox("sel", message=prompt, choices=choices)]
27+
ans = inquirer.prompt(q)
28+
if not ans or "sel" not in ans or not ans["sel"]:
29+
raise RuntimeError("Nothing selected.")
30+
return ans["sel"]
31+
else:
32+
q = [inquirer.List("sel", message=prompt, choices=choices)]
33+
ans = inquirer.prompt(q)
34+
if not ans or "sel" not in ans:
35+
raise RuntimeError("Nothing selected.")
36+
return ans["sel"]
37+
except ModuleNotFoundError:
38+
print("Optional dependency missing: `inquirer` (pip install inquirer).")
39+
for i, c in enumerate(choices, start=1):
40+
print(f"{i}: {c}")
41+
if multi:
42+
raw = input(f"{prompt} (comma-separated numbers): ").strip()
43+
idxs = [int(x) for x in raw.split(",") if x.strip()]
44+
return [choices[i - 1] for i in idxs]
45+
idx = int(input(f"{prompt} (number): ").strip())
46+
return choices[idx - 1]
1947

20-
try:
21-
df_matches = pd.read_csv("matches_FN_list.csv")
22-
except (pd.errors.EmptyDataError, FileNotFoundError):
23-
df_matches = pd.DataFrame()
2448

25-
try:
26-
df_matches_fp = pd.read_csv("matches_FP_list.csv")
27-
except (pd.errors.EmptyDataError, FileNotFoundError):
28-
df_matches_fp = pd.DataFrame()
29-
30-
dedupe_benchmark = DedupeBenchmarker(benchmark_path=Path.cwd())
31-
32-
records_df = dedupe_benchmark.get_records_for_dedupe()
33-
records_df = prep(records_df)
34-
35-
while True:
36-
id_pair = input("id_pair:")
37-
38-
try:
39-
if "case" in df_blocks.columns and id_pair in df_blocks["case"].values:
40-
print(f"ID pair {id_pair} found in blocks_FN_list.csv")
41-
if "case" in df_matches.columns and id_pair in df_matches["case"].values:
42-
print(f"ID pair {id_pair} found in matches_FN_list.csv")
43-
if (
44-
"case" in df_matches_fp.columns
45-
and id_pair in df_matches_fp["case"].values
46-
):
47-
print(f"ID pair {id_pair} found in matches_FP_list.csv")
48-
49-
id1, id2 = id_pair.split(";")
50-
selected_prepared_records_df = records_df[
51-
records_df["ID"].apply(lambda x: id1 == x or id2 == x)
52-
]
53-
54-
if selected_prepared_records_df.empty:
55-
print("selected_prepared_records_df is empty. Continuing...")
56-
continue
57-
58-
actual_blocked_df = block(
59-
records_df=selected_prepared_records_df, verbosity_level=2
60-
)
61-
matches = match(actual_blocked_df, verbosity_level=2)
62-
verbose_print.p_printer.pprint(matches)
63-
except ValueError as exc:
64-
print(exc)
49+
def _select_csv_in_cwd() -> Path:
50+
csv_files = sorted(Path(".").glob("*.csv"))
51+
if not csv_files:
52+
raise FileNotFoundError("No *.csv files found in the current directory.")
53+
selected = _select_from_list(
54+
"Select a CSV file", [p.name for p in csv_files], multi=False
55+
)
56+
return Path(str(selected))
57+
58+
59+
def _load_records_df_via_colrev(filename: Path) -> pd.DataFrame:
    """Load ``filename`` with the CoLRev loader and return it as a DataFrame.

    The loaded mapping is turned into one row per entry. The resulting frame
    always has a fresh positional index.
    """
    loaded = colrev.loader.load_utils.load(filename=str(filename))
    frame = pd.DataFrame.from_dict(loaded, orient="index")

    # Records that already carry an ID column keep it; the index is dropped.
    if "ID" in frame.columns:
        return frame.reset_index(drop=True)

    # Otherwise the index (the keys of the loaded mapping) becomes the ID.
    return frame.reset_index().rename(columns={"index": "ID"})
70+
71+
72+
def _select_components(records_df: pd.DataFrame) -> list[str]:
73+
if "component" not in records_df.columns:
74+
raise KeyError(
75+
"No 'component' column found. "
76+
"If this CSV is a matches/component list, it must contain a 'component' column."
77+
)
78+
79+
comps = sorted(records_df["component"].dropna().astype(str).unique().tolist())
80+
selected = _select_from_list("Select component(s) to debug", comps, multi=True)
81+
return list(selected)
82+
83+
84+
def debug() -> None:
    """Interactively run prep/block/match on selected components of a CSV.

    The user picks a CSV file in the current directory and one or more values
    of its 'component' column. The matching records are passed through
    ``prep``, ``block`` and ``match``; the match result and the connected
    components computed from it are printed.
    """
    selected_csv = _select_csv_in_cwd()
    records_df = _load_records_df_via_colrev(selected_csv)

    selected_components = _select_components(records_df)
    # The selection consists of string labels, so the column is compared as
    # strings too; otherwise non-string component values would never match.
    # Missing values are excluded so they cannot match a literal "nan" label.
    component_col = records_df["component"]
    in_selection = component_col.notna() & component_col.astype(str).isin(
        selected_components
    )
    subset_df = records_df[in_selection].copy()
    if subset_df.empty:
        print("No records found for the selected component(s).")
        return

    prep_df = prep(subset_df)
    pairs_df = block(records_df=prep_df)
    matched_df = match(pairs_df, verbosity_level=2)
    print(matched_df)

    duplicate_id_sets = bib_dedupe.cluster.get_connected_components(matched_df)
    print(duplicate_id_sets)

0 commit comments

Comments
 (0)