
Commit 479a7a9

NRL-1186 Lightweight framework for ETL process
1 parent 90ab98c commit 479a7a9

File tree

5 files changed (+100 additions, -104 deletions)


terraform/account-wide-infrastructure/modules/glue/src/transforms.py renamed to terraform/account-wide-infrastructure/modules/glue/src/__init__.py

File renamed without changes.
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+import logging
+
+from awsglue.context import GlueContext
+from pyspark.sql import SparkSession
+
+
+class GlueContextSingleton:
+    """Singleton for GlueContext and SparkSession"""
+
+    _instance = None
+
+    def __new__(cls, spark_context):
+        if not cls._instance:
+            cls._instance = super().__new__(cls)
+            cls._instance.spark = SparkSession.builder.getOrCreate()
+            cls._instance.context = GlueContext(spark_context)
+        return cls._instance
+
+
+class LoggerSingleton:
+    """Singleton for logger"""
+
+    _instance = None
+
+    def __new__(cls):
+        if not cls._instance:
+            cls._instance = super().__new__(cls)
+            cls._instance.logger = logging.getLogger("ETLLogger")
+            cls._instance.logger.setLevel(logging.INFO)
+        return cls._instance
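The two classes above cache a single GlueContext/SparkSession and a single logger per process. A minimal usage sketch (the sc variable mirrors the job script later in this commit; the assertion and log message are illustrative only, not part of the diff):

from pyspark.context import SparkContext

sc = SparkContext()

# Repeated construction returns the same cached object, so the GlueContext
# and SparkSession are only created once per job run.
assert GlueContextSingleton(sc) is GlueContextSingleton(sc)

logger = LoggerSingleton().logger
logger.info("Singletons initialised once per process.")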
Lines changed: 19 additions & 104 deletions
@@ -1,111 +1,26 @@
-from awsglue.context import GlueContext
-from awsglue.dynamicframe import DynamicFrame
+import sys
 
-# from awsglue.job import Job
+from awsglue.utils import getResolvedOptions
 from pyspark.context import SparkContext
 
-# from pyspark.sql import DataFrame
+# Get arguments from AWS Glue job
+args = getResolvedOptions(
+    sys.argv, ["JOB_NAME", "SOURCE_PATH", "TARGET_PATH", "PARTITION_COLS"]
+)
 
+# Start Glue context
+sc = SparkContext()
 
-def create_glue_context():
-    # Initialize the SparkContext and GlueContext
-    sc = SparkContext()
-    glueContext = GlueContext(sc)
+partition_cols = args["PARTITION_COLS"].split(",") if "PARTITION_COLS" in args else []
 
-    return glueContext
+# Initialize ETL process
+etl_job = ETLTemplate(
+    spark_context=sc,
+    source_path=args["SOURCE_PATH"],
+    target_path=args["TARGET_PATH"],
+    partition_cols=partition_cols,
+    transformations=[placeholder],
+)
 
-
-def load_data_from_s3(
-    glueContext, s3_path: str, file_type: str = "json", format_options: dict = {}
-):
-    """
-    Loads data from S3 into a Glue DynamicFrame.
-    """
-    if file_type == "json":
-        return glueContext.create_dynamic_frame.from_options(
-            connection_type="s3",
-            connection_options={"paths": [s3_path]},
-            format=file_type,
-        )
-    else:
-        raise ValueError(f"Unsupported file_type: {file_type}")
-
-
-def transform_data(dynamic_frame: DynamicFrame) -> DynamicFrame:
-    """
-    Example transformation function. Modify this to suit your transformation logic.
-    """
-    # Convert DynamicFrame to DataFrame to leverage Spark SQL operations if needed
-    df = dynamic_frame.toDF()
-
-    # Perform any necessary transformations using Spark DataFrame API
-    df_transformed = df.filter(df["x"] == "placeholder")
-
-    # Convert DataFrame back to DynamicFrame for Glue compatibility
-    transformed_dynamic_frame = DynamicFrame.fromDF(
-        df_transformed, dynamic_frame.glue_ctx, "transformed_dynamic_frame"
-    )
-
-    return transformed_dynamic_frame
-
-
-def write_data_to_s3(
-    dynamic_frame: DynamicFrame,
-    s3_path: str,
-    file_type: str = "csv",
-    partition_keys: list = None,
-):
-    """
-    Writes a DynamicFrame to S3 with partitioning support for scalability.
-    """
-    if file_type == "csv":
-        dynamic_frame.toDF().write.option("header", "true").mode(
-            "overwrite"
-        ).partitionBy(*partition_keys).csv(s3_path)
-    elif file_type == "parquet":
-        dynamic_frame.toDF().write.mode("overwrite").partitionBy(
-            *partition_keys
-        ).parquet(s3_path)
-    elif file_type == "json":
-        dynamic_frame.toDF().write.mode("overwrite").partitionBy(*partition_keys).json(
-            s3_path
-        )
-    else:
-        raise ValueError(f"Unsupported file_type: {file_type}")
-
-
-def handle_error(exception: Exception):
-    # Custom error handling for logging
-    raise exception
-
-
-def main():
-    try:
-        # Initialize Glue Context
-        glueContext = create_glue_context()
-
-        # Example paths and configurations
-        input_path = "s3://source-data-bucket/input-data/"  # probs worth using one bucket and different folders? Cuts costs
-        output_path = "s3://target-data-bucket/output-data/"
-
-        # Load data from S3 (adjust format if needed)
-        dynamic_frame = load_data_from_s3(glueContext, input_path, format="json")
-
-        # Transform data
-        transformed_dynamic_frame = transform_data(dynamic_frame)
-
-        # Write the transformed data back to S3, partitioned by 'date'
-        write_data_to_s3(
-            transformed_dynamic_frame,
-            output_path,
-            format="csv",
-            partition_keys=["date"],
-        )
-
-    except Exception as e:
-        handle_error(e)
-
-
-# Entry point for Glue job
-if __name__ == "__main__":
-    main()
+# Run the job
+etl_job.run()
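As committed, this hunk references ETLTemplate and placeholder without importing them, so unless those names are provided elsewhere (for example via the package's __init__.py) the job would raise a NameError at runtime. A hedged sketch of the missing imports, assuming the class and the stub live in sibling modules of this Glue package (module names below are guesses, not taken from the diff):

# Hypothetical module paths; adjust to wherever ETLTemplate and placeholder actually live.
from etl_template import ETLTemplate
from transforms import placeholder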
Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+class ETLTemplate:
+    def __init__(
+        self,
+        spark_context,
+        source_path,
+        target_path,
+        partition_cols=None,
+        transformations=[],
+    ):
+        """Initialize Glue context, Spark session, logger, and paths"""
+        self.glue_context = GlueContextSingleton(spark_context).context
+        self.spark = GlueContextSingleton(spark_context).spark
+        self.logger = LoggerSingleton().logger
+        self.source_path = source_path
+        self.target_path = target_path
+        self.partition_cols = partition_cols
+        self.transformations = transformations
+
+    def run(self):
+        """Runs ETL"""
+        try:
+            self.logger.info("ETL Process started.")
+            df = self.extract()
+            self.logger.info(f"Data extracted from {self.source_path}.")
+            df = self.transform(df)
+            self.logger.info("Data transformed successfully.")
+            self.load(df)
+            self.logger.info(f"Data loaded into {self.target_path}.")
+        except Exception as e:
+            self.logger.error(f"ETL process failed: {e}")
+            raise e
+
+    def extract(self):
+        """Extract JSON data from S3"""
+        self.logger.info(f"Extracting data from {self.source_path} as JSON")
+        return self.spark.read.json(self.source_path)
+
+    def transform(self, dataframe):
+        """Apply a list of transformations on the dataframe"""
+        for transformation in self.transformations:
+            self.logger.info(f"Applying transformation: {transformation.__name__}")
+            dataframe = transformation(dataframe)
+        return dataframe
+
+    def load(self, dataframe):
+        """Load transformed data into Parquet format"""
+        self.logger.info(f"Loading data into {self.target_path} as Parquet")
+        dataframe.write.mode("overwrite").partitionBy(*self.partition_cols).parquet(
+            self.target_path
+        )
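The transformations argument of ETLTemplate is a list of plain callables, each taking and returning a Spark DataFrame, applied in order by transform(). A short sketch of wiring a real transform into the template (function name, bucket paths, and partition column below are made up for illustration):

def drop_incomplete_rows(dataframe):
    # Example transform: keep only rows with no null values.
    return dataframe.dropna()

etl_job = ETLTemplate(
    spark_context=sc,                           # SparkContext created by the job script
    source_path="s3://example-bucket/input/",   # assumed source path
    target_path="s3://example-bucket/output/",  # assumed target path
    partition_cols=["date"],                    # assumed partition column
    transformations=[drop_incomplete_rows],
)
etl_job.run()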
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+def placeholder(): ...
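Note that placeholder takes no arguments as committed, so ETLTemplate.transform() would raise a TypeError when it calls transformation(dataframe) on it. A stub matching the expected signature could look like this (a sketch, not the committed code):

def placeholder(dataframe):
    # No-op transform: return the DataFrame unchanged.
    return dataframe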
