Skip to content

Commit 1c4acea

Browse files
committed
migration : added docstring + type hints
1 parent 0c2fca1 commit 1c4acea

File tree

1 file changed

+112
-14
lines changed

1 file changed

+112
-14
lines changed

chebai/preprocessing/migration/chebi_data_migration.py

Lines changed: 112 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,30 @@
1010

1111

1212
class ChebiDataMigration:
13+
"""
14+
A class to handle migration of ChEBI dataset to a new structure.
15+
16+
Attributes:
17+
__MODULE_PATH (str): The path to the module containing ChEBI classes.
18+
__DATA_ROOT_DIR (str): The root directory for data.
19+
_chebi_cls (_ChEBIDataExtractor): The ChEBI class instance.
20+
_chebi_version (int): The version of the ChEBI dataset.
21+
_single_class (int, optional): The ID of a single class to predict.
22+
_class_name (str): The name of the ChEBI class.
23+
"""
24+
1325
__MODULE_PATH: str = "chebai.preprocessing.datasets.chebi"
1426
__DATA_ROOT_DIR: str = "data"
1527

1628
def __init__(self, class_name: str, chebi_version: int, single_class: int = None):
17-
# Chebi class instance according to new data structure
29+
"""
30+
Initialize the ChebiDataMigration class.
31+
32+
Args:
33+
class_name (str): The name of the ChEBI class.
34+
chebi_version (int): The version of the ChEBI dataset.
35+
single_class (int, optional): The ID of the single class to predict.
36+
"""
1837
self._chebi_cls: Type[_ChEBIDataExtractor] = self._dynamic_import_chebi_cls(
1938
class_name, chebi_version, single_class
2039
)
@@ -26,12 +45,26 @@ def __init__(self, class_name: str, chebi_version: int, single_class: int = None
2645
def _dynamic_import_chebi_cls(
2746
cls, class_name: str, chebi_version: int, single_class: int
2847
) -> Type[_ChEBIDataExtractor]:
48+
"""
49+
Dynamically import the ChEBI class.
50+
51+
Args:
52+
class_name (str): The name of the ChEBI class.
53+
chebi_version (int): The version of the ChEBI dataset.
54+
single_class (int): The ID of the single class to predict.
55+
56+
Returns:
57+
_ChEBIDataExtractor: An instance of the dynamically imported class.
58+
"""
2959
class_name = class_name.strip()
3060
module = __import__(cls.__MODULE_PATH, fromlist=[class_name])
3161
_class = getattr(module, class_name)
3262
return _class(**{"chebi_version": chebi_version, "single_class": single_class})
3363

34-
def migrate(self):
64+
def migrate(self) -> None:
65+
"""
66+
Start the migration process for the ChEBI dataset.
67+
"""
3568
os.makedirs(self._chebi_cls.base_dir, exist_ok=True)
3669
print("Migration started.....")
3770
self._migrate_old_raw_data()
@@ -43,7 +76,10 @@ def migrate(self):
4376
self._chebi_cls.setup_processed()
4477
print("Migration completed.....")
4578

46-
def _migrate_old_raw_data(self):
79+
def _migrate_old_raw_data(self) -> None:
80+
"""
81+
Migrate old raw data files to the new data folder structure.
82+
"""
4783
print("-" * 50)
4884
print("Migrating old raw Data....")
4985

@@ -66,15 +102,17 @@ def _migrate_old_raw_data(self):
66102
self._old_raw_dir, old_splits_file_names
67103
)
68104

69-
# data_df.to_pickle(data_file_path)
70105
self._chebi_cls.save_processed(data_df, "data.pkl")
71106
print(f"File {data_file_path} saved to new data-folder structure")
72107

73108
split_file = os.path.join(self._chebi_cls.processed_dir_main, "splits.csv")
74109
split_ass_df.to_csv(split_file) # overwrites the files with same name
75110
print(f"File {split_file} saved to new data-folder structure")
76111

77-
def _migrate_old_processed_data(self):
112+
def _migrate_old_processed_data(self) -> None:
113+
"""
114+
Migrate old processed data files to the new data folder structure.
115+
"""
78116
print("-" * 50)
79117
print("Migrating old processed data.....")
80118

@@ -99,9 +137,19 @@ def _migrate_old_processed_data(self):
99137
def _combine_pt_splits(
100138
self, old_dir: str, old_splits_file_names: Dict[str, str]
101139
) -> pd.DataFrame:
140+
"""
141+
Combine old `.pt` split files into a single DataFrame.
142+
143+
Args:
144+
old_dir (str): The directory containing the old split files.
145+
old_splits_file_names (Dict[str, str]): A dictionary of split names and file names.
146+
147+
Returns:
148+
pd.DataFrame: The combined DataFrame.
149+
"""
102150
self._check_if_old_splits_exists(old_dir, old_splits_file_names)
103151

104-
print("Combinig `.pt` splits...")
152+
print("Combining `.pt` splits...")
105153
df_list: List[pd.DataFrame] = []
106154
for split, file_name in old_splits_file_names.items():
107155
file_path = os.path.join(old_dir, file_name)
@@ -113,6 +161,16 @@ def _combine_pt_splits(
113161
def _combine_pkl_splits(
114162
self, old_dir: str, old_splits_file_names: Dict[str, str]
115163
) -> Tuple[pd.DataFrame, pd.DataFrame]:
164+
"""
165+
Combine old `.pkl` split files into a single DataFrame and create split assignments.
166+
167+
Args:
168+
old_dir (str): The directory containing the old split files.
169+
old_splits_file_names (Dict[str, str]): A dictionary of split names and file names.
170+
171+
Returns:
172+
Tuple[pd.DataFrame, pd.DataFrame]: The combined DataFrame and split assignments DataFrame.
173+
"""
116174
self._check_if_old_splits_exists(old_dir, old_splits_file_names)
117175

118176
df_list: List[pd.DataFrame] = []
@@ -135,18 +193,41 @@ def _combine_pkl_splits(
135193
return combined_df, combined_split_assignment
136194

137195
@staticmethod
138-
def _check_if_old_splits_exists(old_dir, old_splits_file_names):
196+
def _check_if_old_splits_exists(
197+
old_dir: str, old_splits_file_names: Dict[str, str]
198+
) -> None:
199+
"""
200+
Check if the old split files exist in the specified directory.
201+
202+
Args:
203+
old_dir (str): The directory containing the old split files.
204+
old_splits_file_names (Dict[str, str]): A dictionary of split names and file names.
205+
206+
Raises:
207+
FileNotFoundError: If any of the split files do not exist.
208+
"""
139209
if any(
140210
not os.path.isfile(os.path.join(old_dir, file))
141211
for file in old_splits_file_names.values()
142212
):
143213
raise FileNotFoundError(
144-
f"One of the split {old_splits_file_names.values()} doesn't exists "
214+
f"One of the split {old_splits_file_names.values()} doesn't exist "
145215
f"in old data-folder structure: {old_dir}"
146216
)
147217

148218
@staticmethod
149-
def _copy_file(old_file_dir, new_file_dir, file_name):
219+
def _copy_file(old_file_dir: str, new_file_dir: str, file_name: str) -> None:
220+
"""
221+
Copy a file from the old directory to the new directory.
222+
223+
Args:
224+
old_file_dir (str): The directory containing the old file.
225+
new_file_dir (str): The directory to copy the file to.
226+
file_name (str): The name of the file to copy.
227+
228+
Raises:
229+
FileNotFoundError: If the file does not exist in the old directory.
230+
"""
150231
os.makedirs(new_file_dir, exist_ok=True)
151232
new_file_path = os.path.join(new_file_dir, file_name)
152233
if os.path.isfile(new_file_path):
@@ -156,22 +237,34 @@ def _copy_file(old_file_dir, new_file_dir, file_name):
156237
old_file_path = os.path.join(old_file_dir, file_name)
157238
if not os.path.isfile(old_file_path):
158239
raise FileNotFoundError(
159-
f"File {old_file_path} doesn't exists in old data-folder structure"
240+
f"File {old_file_path} doesn't exist in old data-folder structure"
160241
)
161242

162243
shutil.copy2(os.path.abspath(old_file_path), os.path.abspath(new_file_path))
163244
print(f"Copied from {old_file_path} to {new_file_path}")
164245

165246
@property
166-
def _old_base_dir(self):
247+
def _old_base_dir(self) -> str:
248+
"""
249+
Get the base directory for the old data structure.
250+
251+
Returns:
252+
str: The base directory for the old data.
253+
"""
167254
return os.path.join(
168255
self.__DATA_ROOT_DIR,
169256
self._chebi_cls._name,
170257
f"chebi_v{self._chebi_cls.chebi_version}",
171258
)
172259

173260
@property
174-
def _old_processed_dir(self):
261+
def _old_processed_dir(self) -> str:
262+
"""
263+
Get the processed directory for the old data structure.
264+
265+
Returns:
266+
str: The processed directory for the old data.
267+
"""
175268
res = os.path.join(
176269
self._old_base_dir,
177270
"processed",
@@ -183,8 +276,13 @@ def _old_processed_dir(self):
183276
return os.path.join(res, f"single_{self._chebi_cls.single_class}")
184277

185278
@property
186-
def _old_raw_dir(self):
187-
"""name of dir where the raw data is stored"""
279+
def _old_raw_dir(self) -> str:
280+
"""
281+
Get the raw directory for the old data structure.
282+
283+
Returns:
284+
str: The raw directory for the old data.
285+
"""
188286
return os.path.join(self._old_base_dir, "raw")
189287

190288

0 commit comments

Comments
 (0)