12 | 12 | import tempfile |
13 | 13 | import time |
14 | 14 | from pathlib import Path |
15 | | -from typing import Dict, List, Any, Optional, Union, Tuple |
| 15 | +from typing import Dict, Any, Optional, Union, Tuple |
16 | 16 |
17 | 17 | from cosmotech_api import WorkspaceApi |
18 | 18 | from openpyxl import load_workbook |
19 | 19 |
| 20 | +from cosmotech.coal.utils.decorator import timed |
20 | 21 | from cosmotech.coal.utils.logger import LOGGER |
21 | 22 | from cosmotech.orchestrator.utils.translate import T |
22 | 23 | from cosmotech.coal.cosmotech_api.connection import get_api_client |
23 | 24 |
24 | 25 |
| 26 | +def process_xls(target_file) -> Dict[str, Any]: |
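| | +    """Parse an Excel workbook into {sheet_name: [row dicts]}, decoding JSON-encoded cells and dropping empty ones.""" |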
| 27 | + content = {} |
| 28 | + |
| 29 | + LOGGER.debug(T("coal.services.dataset.processing_excel").format(file_name=target_file)) |
| 30 | + wb = load_workbook(target_file, data_only=True) |
| 31 | + |
| 32 | + for sheet_name in wb.sheetnames: |
| 33 | + sheet = wb[sheet_name] |
| 34 | + content[sheet_name] = list() |
| 35 | + headers = next(sheet.iter_rows(max_row=1, values_only=True)) |
| 36 | + |
| 37 | + row_count = 0 |
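| | +        # data rows start at row 2; row 1 supplied the headers |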
| 38 | + for r in sheet.iter_rows(min_row=2, values_only=True): |
| 39 | + row = {k: v for k, v in zip(headers, r)} |
| 40 | + new_row = dict() |
| 41 | + |
| 42 | + for key, value in row.items(): |
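| | +                # decode JSON-encoded cells, falling back to the raw value |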
| 43 | + try: |
| 44 | +                    converted_value = json.loads(value) |
| 45 | + except (json.decoder.JSONDecodeError, TypeError): |
| 46 | + converted_value = value |
| 47 | + |
| 48 | + if converted_value is not None: |
| 49 | + new_row[key] = converted_value |
| 50 | + |
| 51 | + if new_row: |
| 52 | + content[sheet_name].append(new_row) |
| 53 | + row_count += 1 |
| 54 | + |
| 55 | + LOGGER.debug( |
| 56 | + T("coal.services.dataset.sheet_processed").format(sheet_name=sheet_name, rows=row_count) |
| 57 | + ) |
| 58 | + return content |
| 59 | + |
| 60 | + |
| 61 | +def process_csv(target_file) -> Dict[str, Any]: |
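| | +    """Parse a CSV file into {file stem: [row dicts]}, decoding JSON-encoded cells and dropping empty ones.""" |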
| 62 | + content = {} |
| 63 | + |
| 64 | + LOGGER.debug(T("coal.services.dataset.processing_csv").format(file_name=target_file)) |
| 65 | + with open(target_file, "r") as file: |
| 66 | + current_filename = os.path.basename(target_file)[: -len(".csv")] |
| 67 | + content[current_filename] = list() |
| 68 | + |
| 69 | + row_count = 0 |
| 70 | + for csv_row in csv.DictReader(file): |
| 71 | + csv_row: dict |
| 72 | + new_row = dict() |
| 73 | + |
| 74 | + for key, value in csv_row.items(): |
| 75 | + try: |
| 76 | +                    # try to parse JSON-encoded cells into Python objects |
| 77 | +                    converted_value = json.loads(value) |
| 78 | +                except (json.decoder.JSONDecodeError, TypeError): |
| 79 | + converted_value = value |
| 80 | + |
| 81 | + if converted_value == "": |
| 82 | + converted_value = None |
| 83 | + |
| 84 | + if converted_value is not None: |
| 85 | + new_row[key] = converted_value |
| 86 | + |
| 87 | + content[current_filename].append(new_row) |
| 88 | + row_count += 1 |
| 89 | + |
| 90 | + LOGGER.debug( |
| 91 | + T("coal.services.dataset.csv_processed").format(file_name=current_filename, rows=row_count) |
| 92 | + ) |
| 93 | + return content |
| 94 | + |
| 95 | + |
| 96 | +def process_json(target_file) -> Dict[str, Any]: |
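| | +    """Load a JSON file as {file name: parsed content}.""" |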
| 97 | + content = {} |
| 98 | + LOGGER.debug(T("coal.services.dataset.processing_json").format(file_name=target_file)) |
| 99 | + with open(target_file, "r") as _file: |
| 100 | + current_filename = os.path.basename(target_file) |
| 101 | + content[current_filename] = json.load(_file) |
| 102 | + |
| 103 | +    if isinstance(content[current_filename], (dict, list)): |
| 104 | +        item_count = len(content[current_filename]) |
| 105 | +    else: |
| 106 | +        item_count = 1 |
| 109 | + |
| 110 | + LOGGER.debug( |
| 111 | + T("coal.services.dataset.json_processed").format(file_name=current_filename, items=item_count) |
| 112 | + ) |
| 113 | + return content |
| 114 | + |
| 115 | + |
| 116 | +def process_txt(target_file) -> Dict[str, Any]: |
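| | +    """Read a text file as {file name: full text}.""" |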
| 117 | + content = {} |
| 118 | + LOGGER.debug(T("coal.services.dataset.processing_text").format(file_name=target_file)) |
| 119 | + with open(target_file, "r") as _file: |
| 120 | + current_filename = os.path.basename(target_file) |
| 121 | +        content[current_filename] = _file.read() |
| 122 | + |
| 123 | +    line_count = len(content[current_filename].splitlines()) |
| 124 | + LOGGER.debug( |
| 125 | + T("coal.services.dataset.text_processed").format(file_name=current_filename, lines=line_count) |
| 126 | + ) |
| 127 | + return content |
| 128 | + |
| 129 | + |
| 130 | +def read_file(file_name, file): |
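| | +    """Dispatch a downloaded file to the parser matching its extension, timing the call.""" |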
| 131 | +    @timed(f"process {file_name}", debug=True) |
| 132 | + def timed_read_file(file_name, file): |
| 133 | + content = {} |
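| | +        # extensions are matched as substrings, so ".xlsx" also routes to the Excel parser |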
| 134 | + if ".xls" in file_name: |
| 135 | + content.update(process_xls(file)) |
| 136 | + elif ".csv" in file_name: |
| 137 | + content.update(process_csv(file)) |
| 138 | + elif ".json" in file_name: |
| 139 | + content.update(process_json(file)) |
| 140 | + else: |
| 141 | + content.update(process_txt(file)) |
| 142 | + return content |
| 143 | + return timed_read_file(file_name, file) |
| 144 | + |
| 145 | + |
25 | 146 | def download_file_dataset( |
26 | 147 | organization_id: str, |
27 | 148 | workspace_id: str, |
@@ -105,109 +226,8 @@ def download_file_dataset( |
105 | 226 | ) |
106 | 227 | ) |
107 | 228 |
108 | | - if not read_files: |
109 | | - continue |
110 | | - |
111 | | - # Process file based on type |
112 | | - process_start = time.time() |
113 | | - |
114 | | - if ".xls" in _file_name: |
115 | | - LOGGER.debug(T("coal.services.dataset.processing_excel").format(file_name=target_file)) |
116 | | - wb = load_workbook(target_file, data_only=True) |
117 | | - |
118 | | - for sheet_name in wb.sheetnames: |
119 | | - sheet = wb[sheet_name] |
120 | | - content[sheet_name] = list() |
121 | | - headers = next(sheet.iter_rows(max_row=1, values_only=True)) |
122 | | - |
123 | | - def item(_row: tuple) -> dict: |
124 | | - return {k: v for k, v in zip(headers, _row)} |
125 | | - |
126 | | - row_count = 0 |
127 | | - for r in sheet.iter_rows(min_row=2, values_only=True): |
128 | | - row = item(r) |
129 | | - new_row = dict() |
130 | | - |
131 | | - for key, value in row.items(): |
132 | | - try: |
133 | | - converted_value = json.load(io.StringIO(value)) |
134 | | - except (json.decoder.JSONDecodeError, TypeError): |
135 | | - converted_value = value |
136 | | - |
137 | | - if converted_value is not None: |
138 | | - new_row[key] = converted_value |
139 | | - |
140 | | - if new_row: |
141 | | - content[sheet_name].append(new_row) |
142 | | - row_count += 1 |
143 | | - |
144 | | - LOGGER.debug( |
145 | | - T("coal.services.dataset.sheet_processed").format(sheet_name=sheet_name, rows=row_count) |
146 | | - ) |
147 | | - |
148 | | - elif ".csv" in _file_name: |
149 | | - LOGGER.debug(T("coal.services.dataset.processing_csv").format(file_name=target_file)) |
150 | | - with open(target_file, "r") as file: |
151 | | - current_filename = os.path.basename(target_file)[: -len(".csv")] |
152 | | - content[current_filename] = list() |
153 | | - |
154 | | - row_count = 0 |
155 | | - for csv_row in csv.DictReader(file): |
156 | | - csv_row: dict |
157 | | - new_row = dict() |
158 | | - |
159 | | - for key, value in csv_row.items(): |
160 | | - try: |
161 | | - # Try to convert any json row to dict object |
162 | | - converted_value = json.load(io.StringIO(value)) |
163 | | - except json.decoder.JSONDecodeError: |
164 | | - converted_value = value |
165 | | - |
166 | | - if converted_value == "": |
167 | | - converted_value = None |
168 | | - |
169 | | - if converted_value is not None: |
170 | | - new_row[key] = converted_value |
171 | | - |
172 | | - content[current_filename].append(new_row) |
173 | | - row_count += 1 |
174 | | - |
175 | | - LOGGER.debug( |
176 | | - T("coal.services.dataset.csv_processed").format(file_name=current_filename, rows=row_count) |
177 | | - ) |
178 | | - |
179 | | - elif ".json" in _file_name: |
180 | | - LOGGER.debug(T("coal.services.dataset.processing_json").format(file_name=target_file)) |
181 | | - with open(target_file, "r") as _file: |
182 | | - current_filename = os.path.basename(target_file) |
183 | | - content[current_filename] = json.load(_file) |
184 | | - |
185 | | - if isinstance(content[current_filename], dict): |
186 | | - item_count = len(content[current_filename]) |
187 | | - elif isinstance(content[current_filename], list): |
188 | | - item_count = len(content[current_filename]) |
189 | | - else: |
190 | | - item_count = 1 |
191 | | - |
192 | | - LOGGER.debug( |
193 | | - T("coal.services.dataset.json_processed").format(file_name=current_filename, items=item_count) |
194 | | - ) |
195 | | - |
196 | | - else: |
197 | | - LOGGER.debug(T("coal.services.dataset.processing_text").format(file_name=target_file)) |
198 | | - with open(target_file, "r") as _file: |
199 | | - current_filename = os.path.basename(target_file) |
200 | | - content[current_filename] = "\n".join(line for line in _file) |
201 | | - |
202 | | - line_count = content[current_filename].count("\n") + 1 |
203 | | - LOGGER.debug( |
204 | | - T("coal.services.dataset.text_processed").format(file_name=current_filename, lines=line_count) |
205 | | - ) |
206 | | - |
207 | | - process_time = time.time() - process_start |
208 | | - LOGGER.debug( |
209 | | - T("coal.common.timing.operation_completed").format(operation=f"process {_file_name}", time=process_time) |
210 | | - ) |
| 229 | +        if read_files: |
| 230 | +            content.update(read_file(_file_name, target_file)) |
211 | 231 |
212 | 232 | elapsed_time = time.time() - start_time |
213 | 233 | LOGGER.info(T("coal.common.timing.operation_completed").format(operation="File download", time=elapsed_time)) |