1010
1111
1212class ChebiDataMigration :
13+ """
14+ A class to handle migration of ChEBI dataset to a new structure.
15+
16+ Attributes:
17+ __MODULE_PATH (str): The path to the module containing ChEBI classes.
18+ __DATA_ROOT_DIR (str): The root directory for data.
19+ _chebi_cls (_ChEBIDataExtractor): The ChEBI class instance.
20+ _chebi_version (int): The version of the ChEBI dataset.
21+ _single_class (int, optional): The ID of a single class to predict.
22+ _class_name (str): The name of the ChEBI class.
23+ """
24+
1325 __MODULE_PATH : str = "chebai.preprocessing.datasets.chebi"
1426 __DATA_ROOT_DIR : str = "data"
1527
1628 def __init__ (self , class_name : str , chebi_version : int , single_class : int = None ):
17- # Chebi class instance according to new data structure
29+ """
30+ Initialize the ChebiDataMigration class.
31+
32+ Args:
33+ class_name (str): The name of the ChEBI class.
34+ chebi_version (int): The version of the ChEBI dataset.
35+ single_class (int, optional): The ID of the single class to predict.
36+ """
1837 self ._chebi_cls : Type [_ChEBIDataExtractor ] = self ._dynamic_import_chebi_cls (
1938 class_name , chebi_version , single_class
2039 )
@@ -26,12 +45,26 @@ def __init__(self, class_name: str, chebi_version: int, single_class: int = None
2645 def _dynamic_import_chebi_cls (
2746 cls , class_name : str , chebi_version : int , single_class : int
2847 ) -> Type [_ChEBIDataExtractor ]:
48+ """
49+ Dynamically import the ChEBI class.
50+
51+ Args:
52+ class_name (str): The name of the ChEBI class.
53+ chebi_version (int): The version of the ChEBI dataset.
54+ single_class (int): The ID of the single class to predict.
55+
56+ Returns:
57+ _ChEBIDataExtractor: An instance of the dynamically imported class.
58+ """
2959 class_name = class_name .strip ()
3060 module = __import__ (cls .__MODULE_PATH , fromlist = [class_name ])
3161 _class = getattr (module , class_name )
3262 return _class (** {"chebi_version" : chebi_version , "single_class" : single_class })
3363
34- def migrate (self ):
64+ def migrate (self ) -> None :
65+ """
66+ Start the migration process for the ChEBI dataset.
67+ """
3568 os .makedirs (self ._chebi_cls .base_dir , exist_ok = True )
3669 print ("Migration started....." )
3770 self ._migrate_old_raw_data ()
@@ -43,7 +76,10 @@ def migrate(self):
4376 self ._chebi_cls .setup_processed ()
4477 print ("Migration completed....." )
4578
46- def _migrate_old_raw_data (self ):
79+ def _migrate_old_raw_data (self ) -> None :
80+ """
81+ Migrate old raw data files to the new data folder structure.
82+ """
4783 print ("-" * 50 )
4884 print ("Migrating old raw Data...." )
4985
@@ -66,15 +102,17 @@ def _migrate_old_raw_data(self):
66102 self ._old_raw_dir , old_splits_file_names
67103 )
68104
69- # data_df.to_pickle(data_file_path)
70105 self ._chebi_cls .save_processed (data_df , "data.pkl" )
71106 print (f"File { data_file_path } saved to new data-folder structure" )
72107
73108 split_file = os .path .join (self ._chebi_cls .processed_dir_main , "splits.csv" )
74109 split_ass_df .to_csv (split_file ) # overwrites the files with same name
75110 print (f"File { split_file } saved to new data-folder structure" )
76111
77- def _migrate_old_processed_data (self ):
112+ def _migrate_old_processed_data (self ) -> None :
113+ """
114+ Migrate old processed data files to the new data folder structure.
115+ """
78116 print ("-" * 50 )
79117 print ("Migrating old processed data....." )
80118
@@ -99,9 +137,19 @@ def _migrate_old_processed_data(self):
99137 def _combine_pt_splits (
100138 self , old_dir : str , old_splits_file_names : Dict [str , str ]
101139 ) -> pd .DataFrame :
140+ """
141+ Combine old `.pt` split files into a single DataFrame.
142+
143+ Args:
144+ old_dir (str): The directory containing the old split files.
145+ old_splits_file_names (Dict[str, str]): A dictionary of split names and file names.
146+
147+ Returns:
148+ pd.DataFrame: The combined DataFrame.
149+ """
102150 self ._check_if_old_splits_exists (old_dir , old_splits_file_names )
103151
104- print ("Combinig `.pt` splits..." )
152+ print ("Combining `.pt` splits..." )
105153 df_list : List [pd .DataFrame ] = []
106154 for split , file_name in old_splits_file_names .items ():
107155 file_path = os .path .join (old_dir , file_name )
@@ -113,6 +161,16 @@ def _combine_pt_splits(
113161 def _combine_pkl_splits (
114162 self , old_dir : str , old_splits_file_names : Dict [str , str ]
115163 ) -> Tuple [pd .DataFrame , pd .DataFrame ]:
164+ """
165+ Combine old `.pkl` split files into a single DataFrame and create split assignments.
166+
167+ Args:
168+ old_dir (str): The directory containing the old split files.
169+ old_splits_file_names (Dict[str, str]): A dictionary of split names and file names.
170+
171+ Returns:
172+ Tuple[pd.DataFrame, pd.DataFrame]: The combined DataFrame and split assignments DataFrame.
173+ """
116174 self ._check_if_old_splits_exists (old_dir , old_splits_file_names )
117175
118176 df_list : List [pd .DataFrame ] = []
@@ -135,18 +193,41 @@ def _combine_pkl_splits(
135193 return combined_df , combined_split_assignment
136194
137195 @staticmethod
138- def _check_if_old_splits_exists (old_dir , old_splits_file_names ):
196+ def _check_if_old_splits_exists (
197+ old_dir : str , old_splits_file_names : Dict [str , str ]
198+ ) -> None :
199+ """
200+ Check if the old split files exist in the specified directory.
201+
202+ Args:
203+ old_dir (str): The directory containing the old split files.
204+ old_splits_file_names (Dict[str, str]): A dictionary of split names and file names.
205+
206+ Raises:
207+ FileNotFoundError: If any of the split files do not exist.
208+ """
139209 if any (
140210 not os .path .isfile (os .path .join (old_dir , file ))
141211 for file in old_splits_file_names .values ()
142212 ):
143213 raise FileNotFoundError (
144- f"One of the split { old_splits_file_names .values ()} doesn't exists "
214+ f"One of the split { old_splits_file_names .values ()} doesn't exist "
145215 f"in old data-folder structure: { old_dir } "
146216 )
147217
148218 @staticmethod
149- def _copy_file (old_file_dir , new_file_dir , file_name ):
219+ def _copy_file (old_file_dir : str , new_file_dir : str , file_name : str ) -> None :
220+ """
221+ Copy a file from the old directory to the new directory.
222+
223+ Args:
224+ old_file_dir (str): The directory containing the old file.
225+ new_file_dir (str): The directory to copy the file to.
226+ file_name (str): The name of the file to copy.
227+
228+ Raises:
229+ FileNotFoundError: If the file does not exist in the old directory.
230+ """
150231 os .makedirs (new_file_dir , exist_ok = True )
151232 new_file_path = os .path .join (new_file_dir , file_name )
152233 if os .path .isfile (new_file_path ):
@@ -156,22 +237,34 @@ def _copy_file(old_file_dir, new_file_dir, file_name):
156237 old_file_path = os .path .join (old_file_dir , file_name )
157238 if not os .path .isfile (old_file_path ):
158239 raise FileNotFoundError (
159- f"File { old_file_path } doesn't exists in old data-folder structure"
240+ f"File { old_file_path } doesn't exist in old data-folder structure"
160241 )
161242
162243 shutil .copy2 (os .path .abspath (old_file_path ), os .path .abspath (new_file_path ))
163244 print (f"Copied from { old_file_path } to { new_file_path } " )
164245
165246 @property
166- def _old_base_dir (self ):
247+ def _old_base_dir (self ) -> str :
248+ """
249+ Get the base directory for the old data structure.
250+
251+ Returns:
252+ str: The base directory for the old data.
253+ """
167254 return os .path .join (
168255 self .__DATA_ROOT_DIR ,
169256 self ._chebi_cls ._name ,
170257 f"chebi_v{ self ._chebi_cls .chebi_version } " ,
171258 )
172259
173260 @property
174- def _old_processed_dir (self ):
261+ def _old_processed_dir (self ) -> str :
262+ """
263+ Get the processed directory for the old data structure.
264+
265+ Returns:
266+ str: The processed directory for the old data.
267+ """
175268 res = os .path .join (
176269 self ._old_base_dir ,
177270 "processed" ,
@@ -183,8 +276,13 @@ def _old_processed_dir(self):
183276 return os .path .join (res , f"single_{ self ._chebi_cls .single_class } " )
184277
185278 @property
186- def _old_raw_dir (self ):
187- """name of dir where the raw data is stored"""
279+ def _old_raw_dir (self ) -> str :
280+ """
281+ Get the raw directory for the old data structure.
282+
283+ Returns:
284+ str: The raw directory for the old data.
285+ """
188286 return os .path .join (self ._old_base_dir , "raw" )
189287
190288
0 commit comments