Skip to content

Commit f16c4e4

Browse files
committed
Skip incomplete extraction directories when resolving active data path
Prevents the backend from trying to load from a partially completed extraction (e.g. missing FAISS index after a failed pipeline run).
1 parent 32556bf commit f16c4e4

File tree

1 file changed

+30
-5
lines changed

1 file changed

+30
-5
lines changed

src/lean_explore/config.py

Lines changed: 30 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -60,24 +60,49 @@ def _get_timestamped_directories(data_directory: pathlib.Path) -> list[pathlib.P
6060
return timestamped_directories
6161

6262

63+
_REQUIRED_EXTRACTION_FILES = [
64+
"lean_explore.db",
65+
"informalization_faiss.index",
66+
"informalization_faiss_ids_map.json",
67+
"bm25_ids_map.json",
68+
"bm25_name_raw",
69+
"bm25_name_spaced",
70+
]
71+
72+
73+
def _is_complete_extraction(directory: pathlib.Path) -> bool:
74+
"""Check whether an extraction directory contains all required files.
75+
76+
Args:
77+
directory: Path to a timestamped extraction directory.
78+
79+
Returns:
80+
True if all required files and directories are present.
81+
"""
82+
return all((directory / name).exists() for name in _REQUIRED_EXTRACTION_FILES)
83+
84+
6385
def _resolve_active_data_path(
6486
data_directory: pathlib.Path, active_version: str
6587
) -> pathlib.Path:
6688
"""Resolve the active data path using the best available source.
6789
6890
Priority:
6991
1. DATA_DIRECTORY if it contains lean_explore.db directly
70-
2. Most recent timestamped extraction directory (YYYYMMDD_HHMMSS)
92+
2. Most recent complete timestamped extraction directory (YYYYMMDD_HHMMSS)
7193
3. DATA_DIRECTORY / ACTIVE_VERSION as fallback
94+
95+
Only directories that contain all required extraction files are considered
96+
complete. Incomplete extractions (e.g. from a failed pipeline run) are
97+
skipped.
7298
"""
7399
if (data_directory / "lean_explore.db").exists():
74100
return data_directory
75101

76102
timestamped_dirs = _get_timestamped_directories(data_directory)
77-
if timestamped_dirs:
78-
latest = timestamped_dirs[0]
79-
if (latest / "lean_explore.db").exists():
80-
return latest
103+
for directory in timestamped_dirs:
104+
if _is_complete_extraction(directory):
105+
return directory
81106

82107
return data_directory / active_version
83108

0 commit comments

Comments
 (0)