1414 MONGODB_READING_BATCH_SIZE ,
1515 METADATA_FILE_NAME ,
1616 DATA_FILES_PATH ,
17- INIT_SYNC_CURRENT_SKIP_FILE_NAME ,
17+ # INIT_SYNC_CURRENT_SKIP_FILE_NAME,
18+ # added the two new files to save the initial sync status and last parquet file number
19+ INIT_SYNC_STATUS_FILE_NAME ,
20+ LAST_PARQUET_FILE_NUMBER ,
1821 INIT_SYNC_LAST_ID_FILE_NAME ,
1922 INIT_SYNC_MAX_ID_FILE_NAME ,
2023)
2124import schema_utils
2225from utils import get_parquet_full_path_filename , to_string , get_table_dir
2326from push_file_to_lz import push_file_to_lz
24- from flags import set_init_flag , clear_init_flag
27+ # not required, as init_sync status is now stored in LZ
28+ #from flags import set_init_flag, clear_init_flag
2529from file_utils import FileType , read_from_file , write_to_file , delete_file
2630
2731
2832def init_sync (collection_name : str ):
2933 logger = logging .getLogger (f"{ __name__ } [{ collection_name } ]" )
30- # skip init_sync if there's already parquet files and no current_skip/last_id file
31- table_dir = get_table_dir (collection_name )
32- current_skip_file_path = os .path .join (table_dir , INIT_SYNC_CURRENT_SKIP_FILE_NAME )
33- last_id_file_path = os .path .join (table_dir , INIT_SYNC_LAST_ID_FILE_NAME )
34- # needs to exclude the situation of cache or temp parquet files exist but
35- # not normal numbered parquet files, in which case we shouldn't skip init sync
36- if (
37- not os .path .exists (last_id_file_path )
38- and os .path .exists (table_dir )
39- and any (
40- file .endswith (".parquet" ) and os .path .splitext (file )[0 ].isnumeric ()
41- for file in os .listdir (table_dir )
42- )
43- ):
34+
35+ # detect if there's a init_sync_stat file in LZ, and get its value
36+ init_sync_stat_flag = read_from_file (
37+ collection_name , INIT_SYNC_STATUS_FILE_NAME , FileType .PICKLE
38+ )
39+ if init_sync_stat_flag == "Y" :
4440 logger .info (
4541 f"init sync for collection { collection_name } has already finished previously. Skipping init sync this time."
4642 )
4743 return
44+
45+ # detect if there's a last_id file, and restore last_id from it
46+ last_id = read_from_file (
47+ collection_name , INIT_SYNC_LAST_ID_FILE_NAME , FileType .PICKLE
48+ )
49+ if (init_sync_stat_flag == "N" and last_id ):
50+ logger .info (
51+ f"interrupted init sync detected, continuing with previous _id={ last_id } "
52+ )
53+ # skip old logic with LZ file for init_sync_stat
54+ # skip init_sync if there's already parquet files and no current_skip/last_id file
55+ #table_dir = get_table_dir(collection_name)
56+ #current_skip_file_path = os.path.join(table_dir, INIT_SYNC_CURRENT_SKIP_FILE_NAME)
57+ #last_id_file_path = os.path.join(table_dir, INIT_SYNC_LAST_ID_FILE_NAME)
58+ # needs to exclude the situation of cache or temp parquet files exist but
59+ # not normal numbered parquet files, in which case we shouldn't skip init sync
60+ # if (
61+ # not os.path.exists(last_id_file_path)
62+ # and os.path.exists(table_dir)
63+ # and any(
64+ # file.endswith(".parquet") and os.path.splitext(file)[0].isnumeric()
65+ # for file in os.listdir(table_dir)
66+ # )
67+ # ):
68+
4869 logger .info (f"begin init sync for { collection_name } " )
49- set_init_flag (collection_name )
70+
71+ # begin by writing init_sync_stat file with "N" as value
72+ #set_init_flag(collection_name)
73+ if not init_sync_stat_flag :
74+ # writing init_sync_stat file with "N"
75+ init_sync_stat_flag = "N"
76+ logger .info (f"writing init sync stat file with as 'N' for { collection_name } " )
77+ write_to_file (
78+ init_sync_stat_flag , collection_name , INIT_SYNC_STATUS_FILE_NAME , FileType .PICKLE
79+ )
80+
5081 db_name = os .getenv ("MONGO_DB_NAME" )
5182 logger .debug (f"db_name={ db_name } " )
5283 logger .debug (f"collection={ collection_name } " )
@@ -77,14 +108,15 @@ def init_sync(collection_name: str):
77108
78109 columns_to_convert_to_str = None
79110
111+ # moved to the beginning to check whether the initial sync is already completed
80112 # detect if there's a last_id file, and restore last_id from it
81- last_id = read_from_file (
82- collection_name , INIT_SYNC_LAST_ID_FILE_NAME , FileType .PICKLE
83- )
84- if last_id :
85- logger .info (
86- f"interrupted init sync detected, continuing with previous _id={ last_id } "
87- )
113+ # last_id = read_from_file(
114+ # collection_name, INIT_SYNC_LAST_ID_FILE_NAME, FileType.PICKLE
115+ # )
116+ # if last_id:
117+ # logger.info(
118+ # f"interrupted init sync detected, continuing with previous _id={last_id}"
119+ # )
88120
89121 while last_id is None or last_id < max_id :
90122 # for debug only
@@ -128,7 +160,16 @@ def init_sync(collection_name: str):
128160 logger .info (f"TIME: trans took { trans_end_time - read_end_time :.2f} seconds" )
129161
130162 logger .debug ("creating parquet file..." )
131- parquet_full_path_filename = get_parquet_full_path_filename (collection_name )
163+ # changed to get last parquet file number from LZ for resilience
164+ #parquet_full_path_filename = get_parquet_full_path_filename(collection_name)
165+ last_parquet_file_num = read_from_file (
166+ collection_name , LAST_PARQUET_FILE_NUMBER , FileType .PICKLE
167+ )
168+ if not last_parquet_file_num :
169+ last_parquet_file_num = 0
170+
171+ parquet_full_path_filename = get_parquet_full_path_filename (collection_name , last_parquet_file_num )
172+
132173 logger .info (f"writing parquet file: { parquet_full_path_filename } " )
133174 batch_df .to_parquet (parquet_full_path_filename , index = False )
134175 write_end_time = time .time ()
@@ -139,11 +180,10 @@ def init_sync(collection_name: str):
139180 metadata_json_path = os .path .join (
140181 os .path .dirname (os .path .abspath (__file__ )), METADATA_FILE_NAME
141182 )
142- #Diana 143
143183 logger .info ("writing metadata file to LZ" )
144184 push_file_to_lz (metadata_json_path , collection_name )
185+ # write the current batch to LZ
145186 push_start_time = time .time ()
146- #Diana 147
147187 logger .info ("writing parquet file to LZ" )
148188 push_file_to_lz (parquet_full_path_filename , collection_name )
149189 push_end_time = time .time ()
@@ -156,12 +196,26 @@ def init_sync(collection_name: str):
156196 write_to_file (
157197 last_id , collection_name , INIT_SYNC_LAST_ID_FILE_NAME , FileType .PICKLE
158198 )
199+ # write last parquet file number to file
200+ last_parquet_file_num += 1
201+ logger .info (f"writing last parquet number into file: { last_parquet_file_num } " )
202+ write_to_file (
203+ last_parquet_file_num ,
204+ collection_name ,
205+ LAST_PARQUET_FILE_NUMBER ,
206+ FileType .PICKLE ,
207+ )
159208
160209 # delete last_id file, as init sync is complete
161210 logger .info ("removing the last_id file" )
162211 delete_file (collection_name , INIT_SYNC_LAST_ID_FILE_NAME )
163212
164- clear_init_flag (collection_name )
213+ # mark init_sync status as complete ("Y")
214+ logger .info ("Setting init_sync_stat flag as Y" )
215+ init_sync_stat_flag = "Y"
216+ write_to_file (
217+ init_sync_stat_flag , collection_name , INIT_SYNC_STATUS_FILE_NAME , FileType .PICKLE
218+ )
165219 logger .info (f"init sync completed for collection { collection_name } " )
166220
167221
0 commit comments