@@ -60,24 +60,49 @@ def _get_timestamped_directories(data_directory: pathlib.Path) -> list[pathlib.P
6060 return timestamped_directories
6161
6262
# Names that must exist inside an extraction directory for it to count as
# complete. Only existence is tested, so an entry may be a file or a directory.
_REQUIRED_EXTRACTION_FILES = [
    "lean_explore.db",
    "informalization_faiss.index",
    "informalization_faiss_ids_map.json",
    "bm25_ids_map.json",
    "bm25_name_raw",
    "bm25_name_spaced",
]


def _is_complete_extraction(directory: pathlib.Path) -> bool:
    """Report whether every required extraction entry exists in a directory.

    Args:
        directory: Path to a timestamped extraction directory.

    Returns:
        True when each name in ``_REQUIRED_EXTRACTION_FILES`` exists directly
        under ``directory``; False as soon as one of them is missing.
    """
    for required_name in _REQUIRED_EXTRACTION_FILES:
        if not directory.joinpath(required_name).exists():
            # One missing entry is enough to reject the directory.
            return False
    return True
83+
84+
def _resolve_active_data_path(
    data_directory: pathlib.Path, active_version: str
) -> pathlib.Path:
    """Pick the directory that should be used as the active data path.

    Candidates are tried in this order:
        1. ``data_directory`` itself, if it directly contains lean_explore.db.
        2. The first complete timestamped extraction directory
           (YYYYMMDD_HHMMSS) in the order returned by
           ``_get_timestamped_directories`` (expected to be most recent
           first).
        3. ``data_directory / active_version`` as the fallback.

    A timestamped directory counts as complete only when
    ``_is_complete_extraction`` accepts it, so incomplete extractions (e.g.
    from a failed pipeline run) are skipped instead of being selected.

    Args:
        data_directory: Root directory that holds the data.
        active_version: Sub-directory name used for the fallback path.

    Returns:
        The resolved data path. The fallback path is returned without
        checking that it exists.
    """
    root_database = data_directory / "lean_explore.db"
    if root_database.exists():
        return data_directory

    # Lazy scan: stop at the first directory that passes the completeness check.
    complete_candidates = (
        candidate
        for candidate in _get_timestamped_directories(data_directory)
        if _is_complete_extraction(candidate)
    )
    first_complete = next(complete_candidates, None)
    if first_complete is not None:
        return first_complete

    return data_directory / active_version
83108
0 commit comments