1 | | -from awsglue.context import GlueContext |
2 | | -from awsglue.dynamicframe import DynamicFrame |
| 1 | +import sys |
3 | 2 |
4 | | -# from awsglue.job import Job |
| 3 | +from awsglue.utils import getResolvedOptions |
5 | 4 | from pyspark.context import SparkContext |
6 | 5 |
7 | | -# from pyspark.sql import DataFrame |
| 6 | +# Get arguments from AWS Glue job |
| 7 | +args = getResolvedOptions( |
| 8 | + sys.argv, ["JOB_NAME", "SOURCE_PATH", "TARGET_PATH", "PARTITION_COLS"] |
| 9 | +) |
8 | 10 |
| 11 | +# Start Glue context |
| 12 | +sc = SparkContext() |
9 | 13 |
10 | | -def create_glue_context(): |
11 | | - # Initialize the SparkContext and GlueContext |
12 | | - sc = SparkContext() |
13 | | - glueContext = GlueContext(sc) |
| 14 | +partition_cols = args["PARTITION_COLS"].split(",") if "PARTITION_COLS" in args else [] |
14 | 15 |
15 | | - return glueContext |
| 16 | +# Initialize ETL process |
| 17 | +etl_job = ETLTemplate( |
| 18 | + spark_context=sc, |
| 19 | + source_path=args["SOURCE_PATH"], |
| 20 | + target_path=args["TARGET_PATH"], |
| 21 | + partition_cols=partition_cols, |
| 22 | + transformations=[placeholder], |
| 23 | +) |
16 | 24 |
17 | | - |
18 | | -def load_data_from_s3( |
19 | | - glueContext, s3_path: str, file_type: str = "json", format_options: dict = {} |
20 | | -): |
21 | | - """ |
22 | | - Loads data from S3 into a Glue DynamicFrame. |
23 | | - """ |
24 | | - if file_type == "json": |
25 | | - return glueContext.create_dynamic_frame.from_options( |
26 | | - connection_type="s3", |
27 | | - connection_options={"paths": [s3_path]}, |
28 | | - format=file_type, |
29 | | - ) |
30 | | - else: |
31 | | - raise ValueError(f"Unsupported file_type: {file_type}") |
32 | | - |
33 | | - |
34 | | -def transform_data(dynamic_frame: DynamicFrame) -> DynamicFrame: |
35 | | - """ |
36 | | - Example transformation function. Modify this to suit your transformation logic. |
37 | | - """ |
38 | | - # Convert DynamicFrame to DataFrame to leverage Spark SQL operations if needed |
39 | | - df = dynamic_frame.toDF() |
40 | | - |
41 | | - # Perform any necessary transformations using Spark DataFrame API |
42 | | - df_transformed = df.filter(df["x"] == "placeholder") |
43 | | - |
44 | | - # Convert DataFrame back to DynamicFrame for Glue compatibility |
45 | | - transformed_dynamic_frame = DynamicFrame.fromDF( |
46 | | - df_transformed, dynamic_frame.glue_ctx, "transformed_dynamic_frame" |
47 | | - ) |
48 | | - |
49 | | - return transformed_dynamic_frame |
50 | | - |
51 | | - |
52 | | -def write_data_to_s3( |
53 | | - dynamic_frame: DynamicFrame, |
54 | | - s3_path: str, |
55 | | - file_type: str = "csv", |
56 | | - partition_keys: list = None, |
57 | | -): |
58 | | - """ |
59 | | - Writes a DynamicFrame to S3 with partitioning support for scalability. |
60 | | - """ |
61 | | - if file_type == "csv": |
62 | | - dynamic_frame.toDF().write.option("header", "true").mode( |
63 | | - "overwrite" |
64 | | - ).partitionBy(*partition_keys).csv(s3_path) |
65 | | - elif file_type == "parquet": |
66 | | - dynamic_frame.toDF().write.mode("overwrite").partitionBy( |
67 | | - *partition_keys |
68 | | - ).parquet(s3_path) |
69 | | - elif file_type == "json": |
70 | | - dynamic_frame.toDF().write.mode("overwrite").partitionBy(*partition_keys).json( |
71 | | - s3_path |
72 | | - ) |
73 | | - else: |
74 | | - raise ValueError(f"Unsupported file_type: {file_type}") |
75 | | - |
76 | | - |
77 | | -def handle_error(exception: Exception): |
78 | | - # Custom error handling for logging |
79 | | - raise exception |
80 | | - |
81 | | - |
82 | | -def main(): |
83 | | - try: |
84 | | - # Initialize Glue Context |
85 | | - glueContext = create_glue_context() |
86 | | - |
87 | | - # Example paths and configurations |
88 | | - input_path = "s3://source-data-bucket/input-data/" # probs worth using one bucket and different folders? Cuts costs |
89 | | - output_path = "s3://target-data-bucket/output-data/" |
90 | | - |
91 | | - # Load data from S3 (adjust format if needed) |
92 | | - dynamic_frame = load_data_from_s3(glueContext, input_path, format="json") |
93 | | - |
94 | | - # Transform data |
95 | | - transformed_dynamic_frame = transform_data(dynamic_frame) |
96 | | - |
97 | | - # Write the transformed data back to S3, partitioned by 'date' |
98 | | - write_data_to_s3( |
99 | | - transformed_dynamic_frame, |
100 | | - output_path, |
101 | | - format="csv", |
102 | | - partition_keys=["date"], |
103 | | - ) |
104 | | - |
105 | | - except Exception as e: |
106 | | - handle_error(e) |
107 | | - |
108 | | - |
109 | | -# Entry point for Glue job |
110 | | -if __name__ == "__main__": |
111 | | - main() |
| 25 | +# Run the job |
| 26 | +etl_job.run() |
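Note: the new script constructs `ETLTemplate` and passes a `placeholder` transformation, but neither is imported or defined in the file as committed, so the job would raise a `NameError` at runtime; also, `PARTITION_COLS` is listed as a required argument to `getResolvedOptions`, so the `"PARTITION_COLS" in args` guard is effectively always true. Below is a minimal sketch of what the missing pieces might look like, assuming only the interface implied by the call site (a constructor taking `spark_context`, `source_path`, `target_path`, `partition_cols`, and `transformations`, plus a `run()` method) and assuming JSON input and Parquet output; it is illustrative, not the repository's actual implementation.

```python
from awsglue.context import GlueContext


def placeholder(df):
    """Hypothetical no-op transformation; swap in real filtering/cleaning logic."""
    return df


class ETLTemplate:
    """Sketch of the helper class the job script instantiates (interface inferred from the call site)."""

    def __init__(self, spark_context, source_path, target_path,
                 partition_cols=None, transformations=None):
        self.glue_context = GlueContext(spark_context)
        self.source_path = source_path
        self.target_path = target_path
        self.partition_cols = partition_cols or []
        self.transformations = transformations or []

    def run(self):
        # Read the source data from S3 into a DynamicFrame (JSON input assumed).
        dyf = self.glue_context.create_dynamic_frame.from_options(
            connection_type="s3",
            connection_options={"paths": [self.source_path]},
            format="json",
        )

        # Apply each transformation to the underlying Spark DataFrame.
        df = dyf.toDF()
        for transform in self.transformations:
            df = transform(df)

        # Write back to S3, partitioned if partition columns were supplied (Parquet output assumed).
        writer = df.write.mode("overwrite")
        if self.partition_cols:
            writer = writer.partitionBy(*self.partition_cols)
        writer.parquet(self.target_path)
```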