 from pyspark.sql import SparkSession  # type: ignore
+from pyspark.sql import DataFrame  # type: ignore
 import pyspark.sql.functions as F  # type: ignore
 import argparse
 import os
 import logging
+from typing import List, Union, Optional, Tuple
 
 logging.basicConfig(
     level=logging.INFO,
     handlers=[
         logging.StreamHandler()
     ]
 )
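+# Read a required environment variable, failing fast with a clear error
+# when it is not set.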
+def get_required_env(env_name: str) -> str:
+    env_value = os.getenv(env_name)
+    if env_value is None:
+        raise ValueError(f"Environment variable {env_name} is not set")
+    return env_value
+
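+# CLI arguments: the S3 destination path and the column to drop from auth_user.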
 def get_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser()
     parser.add_argument("--savepath", type=str, required=True, help="S3 path where the ingested data will be stored")
@@ -21,7 +29,7 @@ def get_args() -> argparse.Namespace:
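+# Build a SparkSession wired up for Delta Lake, S3 access, and the MySQL JDBC driver.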
 def get_spark_session(S3_ACCESS_KEY: str, S3_SECRET_KEY: str, S3_ENDPOINT: str) -> SparkSession:
 
     spark = SparkSession.builder \
-        .appName("incremental_table_ingestion") \
+        .appName("full_table_ingestion") \
         .config("spark.jars", "/opt/spark/jars/hadoop-aws-3.3.4.jar,/opt/spark/jars/aws-java-sdk-bundle-1.12.375.jar,/opt/spark/jars/delta-spark_2.12-3.2.1.jar,/opt/spark/jars/delta-storage-3.2.1.jar,/opt/spark/jars/delta-kernel-api-3.2.1.jar,/opt/spark/jars/mysql-connector-j-8.3.0.jar") \
         .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
         .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
@@ -33,26 +41,26 @@ def get_spark_session(S3_ACCESS_KEY: str, S3_SECRET_KEY: str, S3_ENDPOINT: str)
         .getOrCreate()
     return spark
 
-###################################################################################
-#                             GET MYSQL CREDENTIALS                               #
-###################################################################################
-def main() -> None:
-    MYSQL_DATABASE = os.getenv("MYSQL_DATABASE")
-    MYSQL_HOST = os.getenv("MYSQL_HOST")
-    MYSQL_PORT = os.getenv("MYSQL_PORT")
-    MYSQL_USER = os.getenv("MYSQL_USER")
-    MYSQL_SECRET = os.getenv("MYSQL_SECRET")
-    jdbc_url = f"jdbc:mysql://{MYSQL_HOST}:{MYSQL_PORT}/{MYSQL_DATABASE}"
-
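+# Stamp each row with the ingestion timestamp and the name of the source table.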
+def add_ingestion_metadata_column(df: DataFrame, table: str) -> DataFrame:
+    tmp_df = df.withColumn("ingestion_date", F.current_timestamp()).withColumn("source_name", F.lit(table))
+    return tmp_df
 
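+# Derive year/month/day columns from the given timestamp column; they are used
+# as the Delta partition keys when the table is written out.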
+def add_date_partition_columns(df: DataFrame, column_name: str) -> DataFrame:
+    df = df.withColumn("year", F.year(F.col(column_name)))\
+        .withColumn("month", F.month(F.col(column_name)))\
+        .withColumn("day", F.dayofmonth(F.col(column_name)))
+    return df
 
-###################################################################################
-#                               GET S3 CREDENTIALS                                #
-###################################################################################
-    S3_ACCESS_KEY = str(os.getenv("S3_ACCESS_KEY"))
-    S3_SECRET_KEY = str(os.getenv("S3_SECRET_KEY"))
-    S3_ENDPOINT = str(os.getenv("S3_ENDPOINT"))
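+# Ingest each configured MySQL table over JDBC and write it to S3 as a
+# date-partitioned Delta table.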
+def main() -> None:
+    MYSQL_DATABASE = get_required_env("MYSQL_DATABASE")
+    MYSQL_HOST = get_required_env("MYSQL_HOST")
+    MYSQL_PORT = get_required_env("MYSQL_PORT")
+    MYSQL_USER = get_required_env("MYSQL_USER")
+    MYSQL_SECRET = get_required_env("MYSQL_SECRET")
+    jdbc_url = f"jdbc:mysql://{MYSQL_HOST}:{MYSQL_PORT}/{MYSQL_DATABASE}"
 
+    S3_ACCESS_KEY = get_required_env("S3_ACCESS_KEY")
+    S3_SECRET_KEY = get_required_env("S3_SECRET_KEY")
+    S3_ENDPOINT = get_required_env("S3_ENDPOINT")
     args = get_args()
     S3_SAVEPATH = args.savepath
     undesired_column = args.undesired_column
@@ -65,15 +73,15 @@ def main() -> None:
6573 "auth_userprofile" ,
6674 "student_userattribute" ,
6775 "organizations_organization" ,
76+ "organizations_historicalorganization"
6877 "auth_user"
6978 ]
7079
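+    # Create the SparkSession once and reuse it for all tables.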
+    spark = get_spark_session(S3_ACCESS_KEY=S3_ACCESS_KEY, S3_SECRET_KEY=S3_SECRET_KEY, S3_ENDPOINT=S3_ENDPOINT)
     for table in TABLES:
 
         logging.info(f"getting table {table}")
         try:
-
-            spark = get_spark_session(S3_ACCESS_KEY=S3_ACCESS_KEY,S3_SECRET_KEY=S3_SECRET_KEY,S3_ENDPOINT=S3_ENDPOINT)
 
             df = spark.read.format("jdbc") \
                 .option("url", jdbc_url) \
@@ -85,18 +93,20 @@ def main() -> None:
             if table == "auth_user":
                 df = df.drop(undesired_column)
 
-            df = df.withColumn("ingestion_date", F.current_timestamp()) \
-                .withColumn("source_name", F.lit(table))
+            df = add_ingestion_metadata_column(df=df, table=table)
+            df = add_date_partition_columns(df, "ingestion_date")
             if table == "auth_user" and undesired_column and undesired_column in df.columns:
                 raise Exception("The undesired column is still present in the DataFrame")
+
             output_path = f"{S3_SAVEPATH}/{table}"
 
-            df.write.format("delta").mode("append").save(output_path)
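+            # Write as a Delta table partitioned by the ingestion date columns added above.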
+            df.write.format("delta").mode("append").partitionBy("year", "month", "day").save(output_path)
 
             logging.info(f"Data saved as Delta table to {output_path}")
 
         except Exception as e:
             logging.error(f"Pipeline failed: {e}")
+
     spark.stop()
 
102112