formatting

bidyapati-p · bidyapati-p · commit ff581d4bdb19 · 2025-12-15T10:32:59.000+05:30
diff --git a/sygra/utils/constants.py b/sygra/utils/constants.py
@@ -105,10 +105,14 @@
 DATASET_JOIN_TYPE = "join_type"
 PRIMARY_KEY = "primary_key"
 JOIN_KEY = "join_key"
-JOIN_TYPE_VSTACK = "vstack" # verticle stacking with common columns, variables will not have alias prefix and sink should be single
+JOIN_TYPE_VSTACK = "vstack"  # vertical stacking with common columns, variables will not have alias prefix and sink should be single
 # below all are for horizontal concat
-JOIN_TYPE_PRIMARY = "primary" # when joining horizontally, this dataset will be primary
-JOIN_TYPE_SEQUENTIAL = "sequential" # merge column sequentially from secondary, if less rotate to index 0
-JOIN_TYPE_RANDOM = "random" # pick random and join at each primary dataset record in horizontal way(add column)
-JOIN_TYPE_CROSS = "cross" # Each primary will join the secondary record(MxN)
-JOIN_TYPE_COLUMN = "column" # join like RDBMS column based inner join
+JOIN_TYPE_PRIMARY = "primary"  # when joining horizontally, this dataset will be primary
+JOIN_TYPE_SEQUENTIAL = (
+    "sequential"  # merge column sequentially from secondary, if less rotate to index 0
+)
+JOIN_TYPE_RANDOM = (
+    "random"  # pick random and join at each primary dataset record in horizontal way(add column)
+)
+JOIN_TYPE_CROSS = "cross"  # Each primary will join the secondary record(MxN)
+JOIN_TYPE_COLUMN = "column"  # join like RDBMS column based inner join
diff --git a/tests/core/test_base_task_executor.py b/tests/core/test_base_task_executor.py
@@ -8,8 +8,8 @@
 import json
 from unittest.mock import MagicMock, Mock, mock_open, patch
 
-import pytest
 import pandas as pd
+import pytest
 
 from sygra.core.base_task_executor import BaseTaskExecutor
 from sygra.core.dataset.dataset_config import OutputType
@@ -623,89 +623,253 @@ def test_output_sink_jsonl_reading(
         dummy_instance.execute()
         mock_write.assert_called_once()
 
+
 def test_validate_data_config_rule1_success_flow(dummy_instance):
     # success flow in source and sink
-    src_config_list = [{"alias": "ds1", "join_type": "primary", "type": "servicenow", "table": "incident", "limit": 10},
-                       {"alias": "ds2", "join_type": "sequential", "type": "servicenow", "table": "request", "limit": 10},
-                       {"alias": "ds3", "join_type": "random", "type": "servicenow", "table": "problem", "limit": 10}]
-    sink_config_list = [{"alias": "ds1", "type": "servicenow", "table": "incident", "operation": "insert"}]
+    src_config_list = [
+        {
+            "alias": "ds1",
+            "join_type": "primary",
+            "type": "servicenow",
+            "table": "incident",
+            "limit": 10,
+        },
+        {
+            "alias": "ds2",
+            "join_type": "sequential",
+            "type": "servicenow",
+            "table": "request",
+            "limit": 10,
+        },
+        {
+            "alias": "ds3",
+            "join_type": "random",
+            "type": "servicenow",
+            "table": "problem",
+            "limit": 10,
+        },
+    ]
+    sink_config_list = [
+        {"alias": "ds1", "type": "servicenow", "table": "incident", "operation": "insert"}
+    ]
     validated = dummy_instance.validate_data_config(src_config_list, sink_config_list)
-    assert validated == True
+    assert validated
+
 
 def test_validate_data_config_rule1_missing_join_type(dummy_instance):
     # missing join type in source
-    src_config_list = [{"alias": "ds1", "join_type": "primary", "type": "servicenow", "table": "incident", "limit": 10},
-                       {"alias": "ds2", "type": "servicenow", "table": "request", "limit": 10},
-                       {"alias": "ds3", "join_type": "random", "type": "servicenow", "table": "problem", "limit": 10}]
-    sink_config_list = [{"alias": "ds1", "type": "servicenow", "table": "incident", "operation": "insert"}]
+    src_config_list = [
+        {
+            "alias": "ds1",
+            "join_type": "primary",
+            "type": "servicenow",
+            "table": "incident",
+            "limit": 10,
+        },
+        {"alias": "ds2", "type": "servicenow", "table": "request", "limit": 10},
+        {
+            "alias": "ds3",
+            "join_type": "random",
+            "type": "servicenow",
+            "table": "problem",
+            "limit": 10,
+        },
+    ]
+    sink_config_list = [
+        {"alias": "ds1", "type": "servicenow", "table": "incident", "operation": "insert"}
+    ]
     validated = dummy_instance.validate_data_config(src_config_list, sink_config_list)
-    assert validated == False
+    assert not validated
+
 
 def test_validate_data_config_rule1_missing_alias(dummy_instance):
     # missing alias in source
-    src_config_list = [{"alias": "ds1", "join_type": "primary", "type": "servicenow", "table": "incident", "limit": 10},
-                       {"join_type": "sequential", "type": "servicenow", "table": "request", "limit": 10},
-                       {"alias": "ds3", "join_type": "random", "type": "servicenow", "table": "problem", "limit": 10}]
-    sink_config_list = [{"alias": "ds1", "type": "servicenow", "table": "incident", "operation": "insert"}]
+    src_config_list = [
+        {
+            "alias": "ds1",
+            "join_type": "primary",
+            "type": "servicenow",
+            "table": "incident",
+            "limit": 10,
+        },
+        {"join_type": "sequential", "type": "servicenow", "table": "request", "limit": 10},
+        {
+            "alias": "ds3",
+            "join_type": "random",
+            "type": "servicenow",
+            "table": "problem",
+            "limit": 10,
+        },
+    ]
+    sink_config_list = [
+        {"alias": "ds1", "type": "servicenow", "table": "incident", "operation": "insert"}
+    ]
     validated = dummy_instance.validate_data_config(src_config_list, sink_config_list)
-    assert validated == False
+    assert not validated
 
     # missing alias in sink
-    src_config_list = [{"alias": "ds1", "join_type": "primary", "type": "servicenow", "table": "incident", "limit": 10},
-                       {"alias": "ds2", "join_type": "sequential", "type": "servicenow", "table": "request", "limit": 10},
-                       {"alias": "ds3", "join_type": "random", "type": "servicenow", "table": "problem", "limit": 10}]
+    src_config_list = [
+        {
+            "alias": "ds1",
+            "join_type": "primary",
+            "type": "servicenow",
+            "table": "incident",
+            "limit": 10,
+        },
+        {
+            "alias": "ds2",
+            "join_type": "sequential",
+            "type": "servicenow",
+            "table": "request",
+            "limit": 10,
+        },
+        {
+            "alias": "ds3",
+            "join_type": "random",
+            "type": "servicenow",
+            "table": "problem",
+            "limit": 10,
+        },
+    ]
     sink_config_list = [{"type": "servicenow", "table": "incident", "operation": "insert"}]
     validated = dummy_instance.validate_data_config(src_config_list, sink_config_list)
-    assert validated == False
+    assert not validated
+
 
 def test_validate_data_config_rule2_vstack_success(dummy_instance):
     # all source should be vstack
-    src_config_list = [{"alias": "ds1", "join_type": "vstack", "type": "servicenow", "table": "incident", "limit": 10},
-                       {"alias": "ds2", "join_type": "vstack", "type": "servicenow", "table": "request", "limit": 10},
-                       {"alias": "ds3", "join_type": "vstack", "type": "servicenow", "table": "problem", "limit": 10}]
-    sink_config_list = [{"alias": "ds1", "type": "servicenow", "table": "incident", "operation": "insert"}]
+    src_config_list = [
+        {
+            "alias": "ds1",
+            "join_type": "vstack",
+            "type": "servicenow",
+            "table": "incident",
+            "limit": 10,
+        },
+        {
+            "alias": "ds2",
+            "join_type": "vstack",
+            "type": "servicenow",
+            "table": "request",
+            "limit": 10,
+        },
+        {
+            "alias": "ds3",
+            "join_type": "vstack",
+            "type": "servicenow",
+            "table": "problem",
+            "limit": 10,
+        },
+    ]
+    sink_config_list = [
+        {"alias": "ds1", "type": "servicenow", "table": "incident", "operation": "insert"}
+    ]
     validated = dummy_instance.validate_data_config(src_config_list, sink_config_list)
-    assert validated == True
+    assert validated
+
 
 def test_validate_data_config_rule2_vstack_failure(dummy_instance):
     # some source are non vstack
-    src_config_list = [{"alias": "ds1", "join_type": "vstack", "type": "servicenow", "table": "incident", "limit": 10},
-                       {"alias": "ds2", "join_type": "primary", "type": "servicenow", "table": "request", "limit": 10},
-                       {"alias": "ds3", "join_type": "random", "type": "servicenow", "table": "problem", "limit": 10}]
-    sink_config_list = [{"alias": "ds1", "type": "servicenow", "table": "incident", "operation": "insert"}]
+    src_config_list = [
+        {
+            "alias": "ds1",
+            "join_type": "vstack",
+            "type": "servicenow",
+            "table": "incident",
+            "limit": 10,
+        },
+        {
+            "alias": "ds2",
+            "join_type": "primary",
+            "type": "servicenow",
+            "table": "request",
+            "limit": 10,
+        },
+        {
+            "alias": "ds3",
+            "join_type": "random",
+            "type": "servicenow",
+            "table": "problem",
+            "limit": 10,
+        },
+    ]
+    sink_config_list = [
+        {"alias": "ds1", "type": "servicenow", "table": "incident", "operation": "insert"}
+    ]
     validated = dummy_instance.validate_data_config(src_config_list, sink_config_list)
-    assert validated == False
+    assert not validated
+
 
 def test_rename_dataframe(dummy_instance):
-    test_df = pd.DataFrame([{"roll":1, "name": "John", "marks": 123.5}, {"roll":2, "name": "Johny", "marks": 152.5}])
+    test_df = pd.DataFrame(
+        [{"roll": 1, "name": "John", "marks": 123.5}, {"roll": 2, "name": "Johny", "marks": 152.5}]
+    )
     final_df = dummy_instance._rename_dataframe(test_df, "student")
     new_columns = list(final_df.columns)
-    assert "student->roll" in new_columns and  "student->name" in new_columns and  "student->marks" in new_columns
+    assert (
+        "student->roll" in new_columns
+        and "student->name" in new_columns
+        and "student->marks" in new_columns
+    )
+
 
 def test_repeat_to_merge_sequentially(dummy_instance):
     # horizontal merge with different columns
     # test 1 : both df has same rows
-    primary_df = pd.DataFrame([{"roll": 1, "name": "John", "marks": 123.5}, {"roll": 2, "name": "Johny", "marks": 152.5}])
-    secondary_df = pd.DataFrame([{"class": 5, "sports": "cricket"}, {"class": 6, "sports": "football"}])
+    primary_df = pd.DataFrame(
+        [{"roll": 1, "name": "John", "marks": 123.5}, {"roll": 2, "name": "Johny", "marks": 152.5}]
+    )
+    secondary_df = pd.DataFrame(
+        [{"class": 5, "sports": "cricket"}, {"class": 6, "sports": "football"}]
+    )
     merged_df = dummy_instance._repeat_to_merge_sequentially(primary_df, secondary_df)
-    assert len(merged_df) == 2 and merged_df.iloc[0]["class"] == 5 and merged_df.iloc[1]["class"] == 6
+    assert (
+        len(merged_df) == 2 and merged_df.iloc[0]["class"] == 5 and merged_df.iloc[1]["class"] == 6
+    )
 
     # test 2 : secondary has less rows (need rotation with same data)
-    primary_df = pd.DataFrame([{"roll": 1, "name": "John", "marks": 123.5}, {"roll": 2, "name": "Johny", "marks": 152.5}])
+    primary_df = pd.DataFrame(
+        [{"roll": 1, "name": "John", "marks": 123.5}, {"roll": 2, "name": "Johny", "marks": 152.5}]
+    )
     secondary_df = pd.DataFrame([{"class": 5, "sports": "cricket"}])
     merged_df = dummy_instance._repeat_to_merge_sequentially(primary_df, secondary_df)
-    assert len(merged_df) == 2 and merged_df.iloc[0]["class"] == 5 and merged_df.iloc[1]["class"] == 5
+    assert (
+        len(merged_df) == 2 and merged_df.iloc[0]["class"] == 5 and merged_df.iloc[1]["class"] == 5
+    )
 
     # test 3 : secondary has more rows (truncation needed)
-    primary_df = pd.DataFrame([{"roll": 1, "name": "John", "marks": 123.5}, {"roll": 2, "name": "Johny", "marks": 152.5}])
-    secondary_df = pd.DataFrame([{"class": 5, "sports": "cricket"}, {"class": 6, "sports": "football"}, {"class": 7, "sports": "tennis"}])
+    primary_df = pd.DataFrame(
+        [{"roll": 1, "name": "John", "marks": 123.5}, {"roll": 2, "name": "Johny", "marks": 152.5}]
+    )
+    secondary_df = pd.DataFrame(
+        [
+            {"class": 5, "sports": "cricket"},
+            {"class": 6, "sports": "football"},
+            {"class": 7, "sports": "tennis"},
+        ]
+    )
     merged_df = dummy_instance._repeat_to_merge_sequentially(primary_df, secondary_df)
-    assert len(merged_df) == 2 and merged_df.iloc[0]["class"] == 5 and merged_df.iloc[1]["class"] == 6
+    assert (
+        len(merged_df) == 2 and merged_df.iloc[0]["class"] == 5 and merged_df.iloc[1]["class"] == 6
+    )
+
 
 def test_shuffle_and_extend(dummy_instance):
     # random merge from secondary by keeping primary rows same
-    primary_df = pd.DataFrame([{"roll": 1, "name": "John", "marks": 123.5}, {"roll": 2, "name": "Johny", "marks": 152.5}])
-    secondary_df = pd.DataFrame([{"class": 5, "sports": "cricket"}, {"class": 6, "sports": "football"}, {"class": 7, "sports": "tennis"}])
+    primary_df = pd.DataFrame(
+        [{"roll": 1, "name": "John", "marks": 123.5}, {"roll": 2, "name": "Johny", "marks": 152.5}]
+    )
+    secondary_df = pd.DataFrame(
+        [
+            {"class": 5, "sports": "cricket"},
+            {"class": 6, "sports": "football"},
+            {"class": 7, "sports": "tennis"},
+        ]
+    )
     merged_df = dummy_instance._shuffle_and_extend(primary_df, secondary_df)
     # 2 records but new column can have value from any record(secondary)
-    assert len(merged_df) == 2 and (merged_df.iloc[0]["class"] == 5 or merged_df.iloc[0]["class"] == 6 or merged_df.iloc[0]["class"] == 7)
+    assert len(merged_df) == 2 and (
+        merged_df.iloc[0]["class"] == 5
+        or merged_df.iloc[0]["class"] == 6
+        or merged_df.iloc[0]["class"] == 7
+    )