Commit 8a33b12

Merge pull request #13 from mongodb-partners/test-changes-march

Data types and schema enforcement changes

2 parents 4f9697f + 3f6f2dd

File tree

10 files changed: +458 additions, -177 deletions


.DS_Store

Binary file (0 bytes) not shown.

.gitignore

Lines changed: 5 additions & 1 deletion

@@ -1,6 +1,10 @@
 __pycache__/
 .venv/
 .env
+venv
 .vscode/
 data_files/
-*.log
+*.log
+*.log.1
+*.log.*
+*.ipynb

app.py

Lines changed: 5 additions & 2 deletions

@@ -6,12 +6,15 @@
 
 def create_app():
     app = Flask(__name__)
-    Thread(target=mirror).start()
+    thread_name=Thread(target=mirror).start()
 
     @app.route("/")
     def home_page():
+        import threading
+        for thread in threading.enumerate():
+            print(thread.name)
         return "The MongoDB Fabric Mirroring Service is running..."
-
+
     return app
 
 app = create_app()
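Note on the new line 9 above: `Thread.start()` returns `None`, so `thread_name` never actually holds the thread object. A minimal sketch of how a named, usable handle could be kept instead (the `mirror` stub and the thread name are illustrative, not the repo's code):

```python
from threading import Thread

def mirror():  # stand-in for the service's real mirror() worker
    ...

# Create first, start second: start() returns None, but the Thread object
# keeps both the handle and the name that threading.enumerate() will report.
mirror_thread = Thread(target=mirror, name="fabric-mirroring")
mirror_thread.start()
print(mirror_thread.name)  # -> "fabric-mirroring" rather than "Thread-1"
```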

constants.py

Lines changed: 1 addition & 0 deletions

@@ -52,6 +52,7 @@
 
 COLUMN_RENAMING_FILE_NAME = "_column_renaming.pkl"
 
+CONVERSION_LOG_FILE_NAME = "_conversion_log.txt"
 # dict keys for schema
 TYPE_KEY = "type"
 DTYPE_KEY = "dtype"
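This constant names the per-table conversion log that the new `append_to_file` helper in file_utils.py below appears intended to create and extend.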

file_utils.py

Lines changed: 53 additions & 12 deletions

@@ -2,7 +2,6 @@
 import pickle
 from enum import Enum
 from typing import Any
-
 import utils
 from push_file_to_lz import push_file_to_lz, get_file_from_lz, delete_file_from_lz
 
@@ -23,25 +22,48 @@ class FileType(Enum):
 }
 
 
+# def read_from_file(table_name: str, file_name: str, file_type: FileType):
+#     table_path = utils.get_table_dir(table_name)
+#     file_full_path = os.path.join(table_path, file_name)
+#     print("File path ; ", file_full_path)
+#     # always read from LZ first
+#     if get_file_from_lz(table_name, file_name) : # and os.path.exists(file_full_path):
+#         with open(
+#             file_full_path, FILETYPE_TO_READ_MODE_MAP.get(file_type, "r")
+#         ) as file:
+#             if file_type == FileType.PICKLE:
+#                 obj = pickle.load(file)
+#             elif file_type == FileType.TEXT:
+#                 obj = file.read()
+#             else:
+#                 obj = None
+#         return obj
+#     else:
+#         return None
+
+
 def read_from_file(table_name: str, file_name: str, file_type: FileType):
     table_path = utils.get_table_dir(table_name)
     file_full_path = os.path.join(table_path, file_name)
     # always read from LZ first
-    if get_file_from_lz(table_name, file_name) and os.path.exists(file_full_path):
-        with open(
-            file_full_path, FILETYPE_TO_READ_MODE_MAP.get(file_type, "r")
-        ) as file:
-            if file_type == FileType.PICKLE:
-                obj = pickle.load(file)
-            elif file_type == FileType.TEXT:
-                obj = file.read()
-            else:
-                obj = None
+    response_status_code, file_content = get_file_from_lz(table_name, file_name)
+    if response_status_code == 200:
+        if file_type == FileType.PICKLE:
+            obj = pickle.loads(file_content.content)
+            print("Type of object: ", isinstance(obj, bytes))
+            # Check if the result is itself a pickled object (nested)
+            if isinstance(obj, bytes):
+                obj = pickle.loads(obj)
+                print("Unpickled object: ", obj)
             return obj
+
+        elif file_type == FileType.TEXT:
+            return file_content.content.decode('utf-8')
+        else:
+            return None
     else:
         return None
 
-
 def write_to_file(obj: Any, table_name: str, file_name: str, file_type: FileType):
     table_path = utils.get_table_dir(table_name)
     file_full_path = os.path.join(table_path, file_name)
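Two assumptions are baked into the rewritten read path: `get_file_from_lz` now returns a `(status_code, response)` pair whose `.content` attribute carries the raw bytes (a requests-style response), and a payload may arrive pickled twice. A self-contained illustration of why the `isinstance(obj, bytes)` guard unwraps a second layer:

```python
import pickle

payload = {"last_id": "65f0abc123"}  # hypothetical checkpoint value
once = pickle.dumps(payload)         # normal pickling -> bytes
twice = pickle.dumps(once)           # accidental double pickling

obj = pickle.loads(twice)            # first loads() only peels the outer layer
if isinstance(obj, bytes):           # same guard as read_from_file above
    obj = pickle.loads(obj)          # second loads() recovers the real object
assert obj == payload
```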
@@ -54,6 +76,25 @@ def write_to_file(obj: Any, table_name: str, file_name: str, file_type: FileType
     push_file_to_lz(file_full_path, table_name)
 
 
+def append_to_file(obj: Any, table_name: str, file_name: str, file_type: FileType):
+    table_path = utils.get_table_dir(table_name)
+    file_full_path = os.path.join(table_path, file_name)
+    append_mode = "ab" if file_type == FileType.PICKLE else "a"
+
+    # Create file if it doesn't exist
+    if not os.path.exists(file_full_path):
+        with open(file_full_path, FILETYPE_TO_WRITE_MODE_MAP.get(file_type, "w")) as f:
+            f.write(f"\n{'Column Name':<20} | {'Original Value':<20} | {'Converting Value':<20}\n{'-'*70}\n")
+
+    with open(file_full_path, append_mode) as file:
+        if file_type == FileType.PICKLE:
+            pickle.dump(obj, file)
+        elif file_type == FileType.TEXT:
+            file.write(obj)
+    # write to LZ
+    # push_file_to_lz(file_full_path, table_name)
+
+
 def delete_file(table_name: str, file_name: str):
     file_full_path = os.path.join(utils.get_table_dir(table_name), file_name)
     os.remove(file_full_path)
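Paired with the new `CONVERSION_LOG_FILE_NAME` constant, `append_to_file` looks intended to build a human-readable, per-table log of type conversions. A usage sketch; the table name and the logged values are made up:

```python
from constants import CONVERSION_LOG_FILE_NAME  # "_conversion_log.txt"
from file_utils import append_to_file, FileType

# The first call creates the file and writes the column header; every call
# then appends one formatted row.
append_to_file(
    f"{'balance':<20} | {'Decimal128(10.50)':<20} | {'10.50':<20}\n",
    "customers",
    CONVERSION_LOG_FILE_NAME,
    FileType.TEXT,
)
```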

init_sync.py

Lines changed: 17 additions & 8 deletions

@@ -1,3 +1,4 @@
+import pprint
 import pymongo
 from pymongo.collection import Collection
 import time
@@ -6,8 +7,9 @@
 import shutil
 import logging
 import glob
-from bson import ObjectId
+from bson import ObjectId, Decimal128
 import pickle
+import numpy as np
 
 from constants import (
     TYPES_TO_CONVERT_TO_STR,
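The new `Decimal128` and `numpy` imports point at explicit type handling before the Parquet write, since pyarrow cannot serialize `bson.Decimal128` values directly. A hedged sketch of the kind of conversion this enables (column name, values, and output path are invented, not the repo's actual code):

```python
import pandas as pd
from bson import Decimal128

batch_df = pd.DataFrame({"balance": [Decimal128("10.50"), Decimal128("7.25")]})

# Convert Decimal128 cells to plain strings so to_parquet() can serialize them.
batch_df["balance"] = batch_df["balance"].apply(
    lambda v: str(v.to_decimal()) if isinstance(v, Decimal128) else v
)
batch_df.to_parquet("example.parquet", index=False)
```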
@@ -137,6 +139,7 @@ def init_sync(collection_name: str):
 
         read_start_time = time.time()
         batch_df = pd.DataFrame(list(batch_cursor))
+
         read_end_time = time.time()
         if enable_perf_timer:
             logger.info(f"TIME: read took {read_end_time-read_start_time:.2f} seconds")
@@ -171,17 +174,21 @@ def init_sync(collection_name: str):
         parquet_full_path_filename = get_parquet_full_path_filename(collection_name, last_parquet_file_num)
 
         logger.info(f"writing parquet file: {parquet_full_path_filename}")
+
         batch_df.to_parquet(parquet_full_path_filename, index=False)
-        write_end_time = time.time()
+        # os.remove("temp.csv")
+        write_end_time = time.time()
         if enable_perf_timer:
             logger.info(f"TIME: write took {write_end_time-trans_end_time:.2f} seconds")
-        if not last_id:
+
+        #>>># changes to remove write metadata.json here as it will now be written as the first file in mongodb_generic_mirroring.py - 6Mar2025
+        # if not last_id:
             # do not copy, but send the template file directly
-            metadata_json_path = os.path.join(
-                os.path.dirname(os.path.abspath(__file__)), METADATA_FILE_NAME
-            )
-            logger.info("writing metadata file to LZ")
-            push_file_to_lz(metadata_json_path, collection_name)
+            # metadata_json_path = os.path.join(
+            #     os.path.dirname(os.path.abspath(__file__)), METADATA_FILE_NAME
+            # )
+            # logger.info("writing metadata file to LZ")
+            # push_file_to_lz(metadata_json_path, collection_name)
         # write the current batch to LZ
         push_start_time = time.time()
         logger.info("writing parquet file to LZ")
@@ -205,6 +212,8 @@ def init_sync(collection_name: str):
             LAST_PARQUET_FILE_NUMBER,
             FileType.PICKLE,
         )
+        #>>># added sleep to ensure that Fabric picks up one file at a time - 14Mar2025
+        time.sleep(30)
 
     # delete last_id file, as init sync is complete
     logger.info("removing the last_id file")
