Skip to content

Commit e1b0431

Browse files
authored
Merge pull request #8 from mongodb-partners/new_listening
Added resilience by storing init_sync_stat and last_parquet_file num…
2 parents bbf4a3f + 45fd585 commit e1b0431

File tree

6 files changed

+167
-67
lines changed

6 files changed

+167
-67
lines changed

constants.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,12 @@
3333

3434
TEMP_PREFIX_DURING_INIT = "Temp_"
3535

36-
INIT_SYNC_CURRENT_SKIP_FILE_NAME = "init_sync_current_skip"
36+
#INIT_SYNC_CURRENT_SKIP_FILE_NAME = "init_sync_current_skip"
37+
# added the two new files to save the initial sync status and last parquet file number
38+
39+
INIT_SYNC_STATUS_FILE_NAME = "_init_sync_status.pkl"
40+
41+
LAST_PARQUET_FILE_NUMBER = "_last_created_parquet.pkl"
3742

3843
INIT_SYNC_LAST_ID_FILE_NAME = "_last_id.pkl"
3944

flags.py

Lines changed: 0 additions & 17 deletions
This file was deleted.

init_sync.py

Lines changed: 82 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -14,39 +14,70 @@
1414
MONGODB_READING_BATCH_SIZE,
1515
METADATA_FILE_NAME,
1616
DATA_FILES_PATH,
17-
INIT_SYNC_CURRENT_SKIP_FILE_NAME,
17+
# INIT_SYNC_CURRENT_SKIP_FILE_NAME,
18+
# added the two new files to save the initial sync status and last parquet file number
19+
INIT_SYNC_STATUS_FILE_NAME,
20+
LAST_PARQUET_FILE_NUMBER,
1821
INIT_SYNC_LAST_ID_FILE_NAME,
1922
INIT_SYNC_MAX_ID_FILE_NAME,
2023
)
2124
import schema_utils
2225
from utils import get_parquet_full_path_filename, to_string, get_table_dir
2326
from push_file_to_lz import push_file_to_lz
24-
from flags import set_init_flag, clear_init_flag
27+
# not required as now init_sync stat is stored in LZ
28+
#from flags import set_init_flag, clear_init_flag
2529
from file_utils import FileType, read_from_file, write_to_file, delete_file
2630

2731

2832
def init_sync(collection_name: str):
2933
logger = logging.getLogger(f"{__name__}[{collection_name}]")
30-
# skip init_sync if there's already parquet files and no current_skip/last_id file
31-
table_dir = get_table_dir(collection_name)
32-
current_skip_file_path = os.path.join(table_dir, INIT_SYNC_CURRENT_SKIP_FILE_NAME)
33-
last_id_file_path = os.path.join(table_dir, INIT_SYNC_LAST_ID_FILE_NAME)
34-
# needs to exclude the situation of cache or temp parquet files exist but
35-
# not normal numbered parquet files, in which case we shouldn't skip init sync
36-
if (
37-
not os.path.exists(last_id_file_path)
38-
and os.path.exists(table_dir)
39-
and any(
40-
file.endswith(".parquet") and os.path.splitext(file)[0].isnumeric()
41-
for file in os.listdir(table_dir)
42-
)
43-
):
34+
35+
# detect if there's a init_sync_stat file in LZ, and get its value
36+
init_sync_stat_flag = read_from_file(
37+
collection_name, INIT_SYNC_STATUS_FILE_NAME, FileType.PICKLE
38+
)
39+
if init_sync_stat_flag == "Y":
4440
logger.info(
4541
f"init sync for collection {collection_name} has already finished previously. Skipping init sync this time."
4642
)
4743
return
44+
45+
# detect if there's a last_id file, and restore last_id from it
46+
last_id = read_from_file(
47+
collection_name, INIT_SYNC_LAST_ID_FILE_NAME, FileType.PICKLE
48+
)
49+
if (init_sync_stat_flag == "N" and last_id):
50+
logger.info(
51+
f"interrupted init sync detected, continuing with previous _id={last_id}"
52+
)
53+
# skip old logic with LZ file for init_sync_stat
54+
# skip init_sync if there's already parquet files and no current_skip/last_id file
55+
#table_dir = get_table_dir(collection_name)
56+
#current_skip_file_path = os.path.join(table_dir, INIT_SYNC_CURRENT_SKIP_FILE_NAME)
57+
#last_id_file_path = os.path.join(table_dir, INIT_SYNC_LAST_ID_FILE_NAME)
58+
# needs to exclude the situation of cache or temp parquet files exist but
59+
# not normal numbered parquet files, in which case we shouldn't skip init sync
60+
# if (
61+
# not os.path.exists(last_id_file_path)
62+
# and os.path.exists(table_dir)
63+
# and any(
64+
# file.endswith(".parquet") and os.path.splitext(file)[0].isnumeric()
65+
# for file in os.listdir(table_dir)
66+
# )
67+
# ):
68+
4869
logger.info(f"begin init sync for {collection_name}")
49-
set_init_flag(collection_name)
70+
71+
# begin by writing init_sync_stat file with "N" as value
72+
#set_init_flag(collection_name)
73+
if not init_sync_stat_flag:
74+
# writing init_sync_stat file with "N"
75+
init_sync_stat_flag = "N"
76+
logger.info(f"writing init sync stat file with as 'N' for {collection_name}")
77+
write_to_file(
78+
init_sync_stat_flag, collection_name, INIT_SYNC_STATUS_FILE_NAME, FileType.PICKLE
79+
)
80+
5081
db_name = os.getenv("MONGO_DB_NAME")
5182
logger.debug(f"db_name={db_name}")
5283
logger.debug(f"collection={collection_name}")
@@ -77,14 +108,15 @@ def init_sync(collection_name: str):
77108

78109
columns_to_convert_to_str = None
79110

111+
# moved to the beginning to check whether the initial sync is completed
80112
# detect if there's a last_id file, and restore last_id from it
81-
last_id = read_from_file(
82-
collection_name, INIT_SYNC_LAST_ID_FILE_NAME, FileType.PICKLE
83-
)
84-
if last_id:
85-
logger.info(
86-
f"interrupted init sync detected, continuing with previous _id={last_id}"
87-
)
113+
# last_id = read_from_file(
114+
# collection_name, INIT_SYNC_LAST_ID_FILE_NAME, FileType.PICKLE
115+
# )
116+
# if last_id:
117+
# logger.info(
118+
# f"interrupted init sync detected, continuing with previous _id={last_id}"
119+
# )
88120

89121
while last_id is None or last_id < max_id:
90122
# for debug only
@@ -128,7 +160,16 @@ def init_sync(collection_name: str):
128160
logger.info(f"TIME: trans took {trans_end_time-read_end_time:.2f} seconds")
129161

130162
logger.debug("creating parquet file...")
131-
parquet_full_path_filename = get_parquet_full_path_filename(collection_name)
163+
# changed to get last parquet file number from LZ for resilience
164+
#parquet_full_path_filename = get_parquet_full_path_filename(collection_name)
165+
last_parquet_file_num = read_from_file(
166+
collection_name, LAST_PARQUET_FILE_NUMBER, FileType.PICKLE
167+
)
168+
if not last_parquet_file_num:
169+
last_parquet_file_num = 0
170+
171+
parquet_full_path_filename = get_parquet_full_path_filename(collection_name, last_parquet_file_num)
172+
132173
logger.info(f"writing parquet file: {parquet_full_path_filename}")
133174
batch_df.to_parquet(parquet_full_path_filename, index=False)
134175
write_end_time = time.time()
@@ -139,11 +180,10 @@ def init_sync(collection_name: str):
139180
metadata_json_path = os.path.join(
140181
os.path.dirname(os.path.abspath(__file__)), METADATA_FILE_NAME
141182
)
142-
#Diana 143
143183
logger.info("writing metadata file to LZ")
144184
push_file_to_lz(metadata_json_path, collection_name)
185+
# write the current batch to LZ
145186
push_start_time = time.time()
146-
#Diana 147
147187
logger.info("writing parquet file to LZ")
148188
push_file_to_lz(parquet_full_path_filename, collection_name)
149189
push_end_time = time.time()
@@ -156,12 +196,26 @@ def init_sync(collection_name: str):
156196
write_to_file(
157197
last_id, collection_name, INIT_SYNC_LAST_ID_FILE_NAME, FileType.PICKLE
158198
)
199+
# write last parquet file number to file
200+
last_parquet_file_num += 1
201+
logger.info(f"writing last parquet number into file: {last_parquet_file_num}")
202+
write_to_file(
203+
last_parquet_file_num,
204+
collection_name,
205+
LAST_PARQUET_FILE_NUMBER,
206+
FileType.PICKLE,
207+
)
159208

160209
# delete last_id file, as init sync is complete
161210
logger.info("removing the last_id file")
162211
delete_file(collection_name, INIT_SYNC_LAST_ID_FILE_NAME)
163212

164-
clear_init_flag(collection_name)
213+
# set init_sync_stat flag as complete = "Y"
214+
logger.info("Setting init_sync_stat flag as Y")
215+
init_sync_stat_flag = "Y"
216+
write_to_file(
217+
init_sync_stat_flag, collection_name, INIT_SYNC_STATUS_FILE_NAME, FileType.PICKLE
218+
)
165219
logger.info(f"init sync completed for collection {collection_name}")
166220

167221

listening.py

Lines changed: 59 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,15 @@
1515
DATA_FILES_PATH,
1616
DELTA_SYNC_CACHE_PARQUET_FILE_NAME,
1717
DELTA_SYNC_RESUME_TOKEN_FILE_NAME,
18+
# added the two new files to save the initial sync status and last parquet file number
19+
INIT_SYNC_STATUS_FILE_NAME,
20+
LAST_PARQUET_FILE_NUMBER,
1821
DTYPE_KEY,
1922
TYPE_KEY,
2023
)
21-
from utils import to_string, get_parquet_full_path_filename, get_table_dir
24+
from utils import to_string, get_parquet_full_path_filename, get_temp_parquet_full_path_filename, get_table_dir
2225
from push_file_to_lz import push_file_to_lz
23-
from flags import get_init_flag
26+
#from flags import get_init_flag
2427
from init_sync import init_sync
2528
import schemas
2629
import schema_utils
@@ -57,10 +60,16 @@ def listening(collection_name: str):
5760
Thread(target=init_sync, args=(collection_name,)).start()
5861

5962
logger.info(f"start listening to change stream for collection {collection_name}")
63+
init_sync_stat_flag = None
6064
for change in cursor:
61-
init_flag = get_init_flag(collection_name)
65+
# init_flag = get_init_flag(collection_name)
66+
if not init_sync_stat_flag == "Y":
67+
init_sync_stat_flag = read_from_file(
68+
collection_name, INIT_SYNC_STATUS_FILE_NAME, FileType.PICKLE
69+
)
6270
# do post init flush if this is the first iteration after init is done
63-
if not init_flag and not post_init_flush_done:
71+
#if not init_flag and not post_init_flush_done:
72+
if init_sync_stat_flag == "Y" and not post_init_flush_done:
6473
__post_init_flush(collection_name, logger)
6574
post_init_flush_done = True
6675
# logger.debug(type(change))
@@ -79,7 +88,7 @@ def listening(collection_name: str):
7988
# process df according to internal schema
8089
schema_utils.process_dataframe(collection_name, df)
8190

82-
if init_flag:
91+
if not init_sync_stat_flag == "Y":
8392
logger.debug(
8493
f"collection {collection_name} still initializing, use UPSERT instead of INSERT"
8594
)
@@ -101,33 +110,43 @@ def listening(collection_name: str):
101110
logger.info(f"last_sync_time when first record added: {last_sync_time}")
102111

103112

104-
if init_flag:
113+
if not init_sync_stat_flag == "Y":
105114
if (accumulative_df is not None
106115
and (
107116
(accumulative_df.shape[0] >= int(os.getenv("DELTA_SYNC_BATCH_SIZE")))
108117
)
109118
):
110119
prefix = TEMP_PREFIX_DURING_INIT
111-
parquet_full_path_filename = get_parquet_full_path_filename(
120+
# changed to a diff method just for temp as temp continues in local
121+
# parquet_full_path_filename = get_parquet_full_path_filename(
122+
parquet_full_path_filename = get_temp_parquet_full_path_filename(
112123
collection_name, prefix=prefix
113124
)
114125

115-
logger.info(f"writing parquet file: {parquet_full_path_filename}")
126+
logger.info(f"writing TEMP parquet file: {parquet_full_path_filename}")
116127
accumulative_df.to_parquet(parquet_full_path_filename)
117128
accumulative_df = None
118-
119-
if not init_flag:
129+
else:
120130
if (accumulative_df is not None
121131
and (
122132
(accumulative_df.shape[0] >= int(os.getenv("DELTA_SYNC_BATCH_SIZE")))
123-
# or (time.time() - last_sync_time >= TIME_THRESHOLD_IN_SEC)
124133
or ((time.time() - last_sync_time) >= time_threshold_in_sec)
125134
)
126135
):
127136
prefix = ""
128-
parquet_full_path_filename = get_parquet_full_path_filename(
129-
collection_name, prefix=prefix
130-
)
137+
# parquet_full_path_filename = get_parquet_full_path_filename(
138+
# collection_name, prefix=prefix
139+
# )
140+
# changed to get last parquet file number from LZ for resilience
141+
#parquet_full_path_filename = get_parquet_full_path_filename(collection_name)
142+
last_parquet_file_num = read_from_file(
143+
collection_name, LAST_PARQUET_FILE_NUMBER, FileType.PICKLE
144+
)
145+
if not last_parquet_file_num:
146+
last_parquet_file_num = 0
147+
148+
parquet_full_path_filename = get_parquet_full_path_filename(collection_name, last_parquet_file_num)
149+
131150
logger.info(f"writing parquet file: {parquet_full_path_filename}")
132151
accumulative_df.to_parquet(parquet_full_path_filename)
133152
accumulative_df = None
@@ -143,7 +162,15 @@ def listening(collection_name: str):
143162
DELTA_SYNC_RESUME_TOKEN_FILE_NAME,
144163
FileType.PICKLE,
145164
)
146-
165+
# write last parquet file number to file
166+
last_parquet_file_num += 1
167+
logger.info(f"writing last parquet number into file: {last_parquet_file_num}")
168+
write_to_file(
169+
last_parquet_file_num,
170+
collection_name,
171+
LAST_PARQUET_FILE_NUMBER,
172+
FileType.PICKLE,
173+
)
147174

148175
def __post_init_flush(table_name: str, logger):
149176
if not logger:
@@ -163,7 +190,14 @@ def __post_init_flush(table_name: str, logger):
163190
)
164191
for temp_parquet_filename in temp_parquet_filename_list:
165192
temp_parquet_full_path = os.path.join(table_dir, temp_parquet_filename)
166-
new_parquet_full_path = get_parquet_full_path_filename(table_name)
193+
# changed to get last parquet file number from LZ for resilience
194+
#new_parquet_full_path = get_parquet_full_path_filename(table_name)
195+
last_parquet_file_num = read_from_file(
196+
table_name, LAST_PARQUET_FILE_NUMBER, FileType.PICKLE
197+
)
198+
if not last_parquet_file_num:
199+
last_parquet_file_num = 0
200+
new_parquet_full_path = get_parquet_full_path_filename(table_name, last_parquet_file_num)
167201
logger.debug("renaming temp parquet file")
168202
logger.debug(f"old name: {temp_parquet_full_path}")
169203
logger.debug(f"new name: {new_parquet_full_path}")
@@ -172,3 +206,12 @@ def __post_init_flush(table_name: str, logger):
172206
)
173207
os.rename(temp_parquet_full_path, new_parquet_full_path)
174208
push_file_to_lz(new_parquet_full_path, table_name)
209+
# write last parquet file number to file
210+
last_parquet_file_num += 1
211+
logger.info(f"writing last parquet number into file: {last_parquet_file_num}")
212+
write_to_file(
213+
last_parquet_file_num,
214+
table_name,
215+
LAST_PARQUET_FILE_NUMBER,
216+
FileType.PICKLE,
217+
)

schema_persistence.py

Whitespace-only changes.

utils.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,32 @@ def get_table_dir(table_name: str) -> str:
1010
table_dir = os.path.join(current_dir, DATA_FILES_PATH, table_name + os.sep)
1111
os.makedirs(table_dir, exist_ok=True)
1212
return table_dir
13-
14-
def get_parquet_full_path_filename(table_name: str, prefix: str = "") -> str:
13+
# changed to build the next parquet file name from the last parquet number
# persisted in the LZ (caller passes 0 when no parquet file exists yet)
def get_parquet_full_path_filename(table_name: str, parquet_filename_int_list: int, prefix: str = "") -> str:
    """Return the full path of the NEXT parquet file for *table_name*.

    The caller supplies the number of the last parquet file created
    (restored from the LZ for resilience); this function returns the
    path for that number + 1 inside the table's data directory.

    NOTE(review): despite its name, ``parquet_filename_int_list`` is a
    plain ``int`` (the last file number), kept unchanged here so existing
    callers are not broken — consider renaming to ``last_file_num``.

    :param table_name: collection/table whose data directory is used
    :param parquet_filename_int_list: number of the last parquet file (0 if none)
    :param prefix: optional filename prefix (e.g. temp prefix during init)
    :return: absolute path of the next parquet file to write
    """
    table_dir = get_table_dir(table_name)
    # previous behavior scanned the local directory for the highest
    # numbered parquet file; numbering now comes from the LZ-persisted
    # counter so a restarted process continues where it left off
    return os.path.join(table_dir, prefix + __num_to_filename(parquet_filename_int_list + 1))
27+
28+
# temp files live only on local disk, so numbering still scans the directory
def get_temp_parquet_full_path_filename(table_name: str, prefix: str = "") -> str:
    """Return the full path of the next TEMP parquet file for *table_name*.

    Temp parquet files are never pushed to the LZ, so the next file
    number is derived locally: scan the table directory for files named
    ``<prefix><number>.parquet`` and use the highest number + 1 (or 1
    when none exist).

    :param table_name: collection/table whose data directory is used
    :param prefix: filename prefix identifying temp files
    :return: absolute path of the next temp parquet file to write
    """
    table_dir = get_table_dir(table_name)
    existing_numbers = []
    for filename in os.listdir(table_dir):
        stem, ext = os.path.splitext(filename)
        stem = stem.removeprefix(prefix)
        # only count properly numbered parquet files for this prefix
        if ext == ".parquet" and stem.isnumeric():
            existing_numbers.append(int(stem))
    next_num = max(existing_numbers) + 1 if existing_numbers else 1
    return os.path.join(table_dir, prefix + __num_to_filename(next_num))
2641

0 commit comments

Comments
 (0)