|
1 | 1 | #! /usr/bin/env python |
2 | | -"""Debug for dedupe""" |
| 2 | +""" |
| 3 | +Debug for dedupe — pick a CSV in the current directory (interactive), |
| 4 | +then load records via CoLRev and continue with prep/block/match. |
| 5 | +""" |
| 6 | +from __future__ import annotations |
| 7 | + |
3 | 8 | from pathlib import Path |
4 | 9 |
|
| 10 | +import colrev.loader.load_utils |
| 11 | +import inquirer |
5 | 12 | import pandas as pd |
6 | 13 |
|
7 | | -from bib_dedupe import verbose_print |
| 14 | +import bib_dedupe.cluster |
8 | 15 | from bib_dedupe.bib_dedupe import block |
9 | 16 | from bib_dedupe.bib_dedupe import match |
10 | 17 | from bib_dedupe.bib_dedupe import prep |
11 | | -from bib_dedupe.dedupe_benchmark import DedupeBenchmarker |
12 | 18 |
|
13 | 19 |
|
14 | | -def debug() -> None: |
def _select_from_list(
    prompt: str, choices: list[str], multi: bool = False
) -> list[str] | str:
    """Ask the user to pick one or several entries from ``choices``.

    ``inquirer`` is used when it can be imported; otherwise the choices are
    printed with 1-based numbers and the answer is read with ``input``.

    Args:
        prompt: Message shown to the user.
        choices: The selectable values.
        multi: When true, several entries may be selected.

    Returns:
        A list of the selected entries when ``multi`` is true, otherwise the
        single selected entry.

    Raises:
        RuntimeError: If nothing was selected.
        ValueError: If a typed answer is not a number (terminal fallback).
        IndexError: If a typed number is outside ``1..len(choices)``
            (terminal fallback).
    """
    try:
        # Imported here so that a missing package really surfaces as
        # ModuleNotFoundError inside this try block.
        # NOTE(review): a module-level `import inquirer` would still fail
        # first when the package is absent; it has to be guarded as well for
        # the fallback to be reachable from this file.
        import inquirer
    except ModuleNotFoundError:
        return _select_via_input(prompt, choices, multi)

    question_type = inquirer.Checkbox if multi else inquirer.List
    answers = inquirer.prompt(
        [question_type("sel", message=prompt, choices=choices)]
    )
    # An empty checkbox selection counts as "nothing selected"; a single
    # selection only needs the key to be present.
    if not answers or "sel" not in answers or (multi and not answers["sel"]):
        raise RuntimeError("Nothing selected.")
    return answers["sel"]


def _select_via_input(
    prompt: str, choices: list[str], multi: bool
) -> list[str] | str:
    """Terminal fallback for ``_select_from_list``: pick entries by number."""
    print("Optional dependency missing: `inquirer` (pip install inquirer).")
    for number, choice in enumerate(choices, start=1):
        print(f"{number}: {choice}")

    def pick(token: str) -> str:
        number = int(token)
        # Reject 0 and negatives explicitly: `choices[number - 1]` would
        # otherwise wrap around and silently return an entry from the end.
        if not 1 <= number <= len(choices):
            raise IndexError(
                f"Selection {number} is out of range (1-{len(choices)})."
            )
        return choices[number - 1]

    if multi:
        raw = input(f"{prompt} (comma-separated numbers): ").strip()
        picked = [pick(token) for token in raw.split(",") if token.strip()]
        if not picked:
            # Same outcome as an empty selection in the inquirer path.
            raise RuntimeError("Nothing selected.")
        return picked

    return pick(input(f"{prompt} (number): ").strip())
19 | 47 |
|
20 | | - try: |
21 | | - df_matches = pd.read_csv("matches_FN_list.csv") |
22 | | - except (pd.errors.EmptyDataError, FileNotFoundError): |
23 | | - df_matches = pd.DataFrame() |
24 | 48 |
|
25 | | - try: |
26 | | - df_matches_fp = pd.read_csv("matches_FP_list.csv") |
27 | | - except (pd.errors.EmptyDataError, FileNotFoundError): |
28 | | - df_matches_fp = pd.DataFrame() |
29 | | - |
30 | | - dedupe_benchmark = DedupeBenchmarker(benchmark_path=Path.cwd()) |
31 | | - |
32 | | - records_df = dedupe_benchmark.get_records_for_dedupe() |
33 | | - records_df = prep(records_df) |
34 | | - |
35 | | - while True: |
36 | | - id_pair = input("id_pair:") |
37 | | - |
38 | | - try: |
39 | | - if "case" in df_blocks.columns and id_pair in df_blocks["case"].values: |
40 | | - print(f"ID pair {id_pair} found in blocks_FN_list.csv") |
41 | | - if "case" in df_matches.columns and id_pair in df_matches["case"].values: |
42 | | - print(f"ID pair {id_pair} found in matches_FN_list.csv") |
43 | | - if ( |
44 | | - "case" in df_matches_fp.columns |
45 | | - and id_pair in df_matches_fp["case"].values |
46 | | - ): |
47 | | - print(f"ID pair {id_pair} found in matches_FP_list.csv") |
48 | | - |
49 | | - id1, id2 = id_pair.split(";") |
50 | | - selected_prepared_records_df = records_df[ |
51 | | - records_df["ID"].apply(lambda x: id1 == x or id2 == x) |
52 | | - ] |
53 | | - |
54 | | - if selected_prepared_records_df.empty: |
55 | | - print("selected_prepared_records_df is empty. Continuing...") |
56 | | - continue |
57 | | - |
58 | | - actual_blocked_df = block( |
59 | | - records_df=selected_prepared_records_df, verbosity_level=2 |
60 | | - ) |
61 | | - matches = match(actual_blocked_df, verbosity_level=2) |
62 | | - verbose_print.p_printer.pprint(matches) |
63 | | - except ValueError as exc: |
64 | | - print(exc) |
def _select_csv_in_cwd() -> Path:
    """Let the user choose one ``*.csv`` entry from the current directory.

    Returns:
        The chosen file name as a relative ``Path``.

    Raises:
        FileNotFoundError: If the current directory has no ``*.csv`` entries.
    """
    names = [entry.name for entry in sorted(Path(".").glob("*.csv"))]
    if len(names) == 0:
        raise FileNotFoundError("No *.csv files found in the current directory.")

    choice = _select_from_list("Select a CSV file", names, multi=False)
    # The helper is typed as returning str or list[str]; normalise via str().
    return Path(str(choice))
| 57 | + |
| 58 | + |
def _load_records_df_via_colrev(filename: Path) -> pd.DataFrame:
    """Load ``filename`` with CoLRev's loader and return it as a DataFrame.

    The loader result is passed to ``DataFrame.from_dict(orient="index")``,
    so each top-level key becomes one row.
    NOTE(review): presumably the keys are record IDs - confirm against the
    CoLRev loader.

    Returns:
        A DataFrame with an ``ID`` column: the existing one if the loaded
        data already has it, otherwise one created from the index.
    """
    loaded = colrev.loader.load_utils.load(filename=str(filename))
    frame = pd.DataFrame.from_dict(loaded, orient="index")

    if "ID" in frame.columns:
        # Keep the existing ID column and simply discard the index.
        return frame.reset_index(drop=True)

    # No ID column yet: promote the index to one.
    return frame.reset_index().rename(columns={"index": "ID"})
| 70 | + |
| 71 | + |
def _select_components(records_df: pd.DataFrame) -> list[str]:
    """Ask the user which values of the ``component`` column to debug.

    Missing values are ignored; the remaining values are offered as sorted,
    de-duplicated strings.

    Returns:
        The selected component labels (as strings).

    Raises:
        KeyError: If ``records_df`` has no ``component`` column.
    """
    has_component = "component" in records_df.columns
    if not has_component:
        message = (
            "No 'component' column found. "
            "If this CSV is a matches/component list, it must contain a 'component' column."
        )
        raise KeyError(message)

    labels = records_df["component"].dropna().astype(str)
    options = sorted(set(labels))
    picked = _select_from_list("Select component(s) to debug", options, multi=True)
    return list(picked)
| 82 | + |
| 83 | + |
def debug() -> None:
    """Interactively re-run the dedupe steps on selected components.

    The user picks a CSV file in the current directory and one or more
    values of its ``component`` column. The matching rows are passed through
    ``prep``, ``block`` and ``match``; the match result and the output of
    ``bib_dedupe.cluster.get_connected_components`` are printed.
    """
    selected_csv = _select_csv_in_cwd()
    records_df = _load_records_df_via_colrev(selected_csv)

    selected_components = _select_components(records_df)
    # The selected labels are strings (they are built with astype(str)), so
    # compare on the string form of the column; a non-string column would
    # otherwise match nothing. Missing values are excluded explicitly so
    # they cannot match a literal "nan" label after the cast.
    component_col = records_df["component"]
    in_selection = component_col.notna() & component_col.astype(str).isin(
        selected_components
    )
    subset_df = records_df[in_selection].copy()

    prep_df = prep(subset_df)
    pairs_df = block(records_df=prep_df)
    matched_df = match(pairs_df, verbosity_level=2)
    print(matched_df)

    duplicate_id_sets = bib_dedupe.cluster.get_connected_components(matched_df)
    print(duplicate_id_sets)
0 commit comments