# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

"""
This script is used by the AWS Glue _getting started with crawlers and jobs_
scenario to perform extract, transform, and load (ETL) operations on sample
flight data. As part of the example, it is uploaded to an Amazon Simple
Storage Service (Amazon S3) bucket so that AWS Glue can access it.
"""

# AWS Glue job scripts import the built-in transforms with a wildcard, so
# names like ApplyMapping (used below) look undefined to static analysis;
# the undefined-variable check is disabled for this file.
# pylint: disable=undefined-variable

import sys

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

"""
These custom arguments must be passed in the Arguments parameter of the
StartJobRun request, with each key prefixed by '--':
    --input_database The name of a metadata database that is contained in your
                     AWS Glue Data Catalog and that contains tables that describe
                     the data to be processed.
    --input_table The name of a table in the database that describes the data to
                  be processed.
    --output_bucket_url An S3 bucket that receives the transformed output data.
"""
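
# For example, a caller might start this job with the AWS SDK for Python
# (boto3). This is a minimal sketch, kept as a comment because it runs
# outside the job itself; the job name and argument values below are
# placeholders, not values defined by this example.
#
#     import boto3
#
#     glue_client = boto3.client("glue")
#     glue_client.start_job_run(
#         JobName="doc-example-job",
#         Arguments={
#             "--input_database": "doc-example-database",
#             "--input_table": "doc-example-table",
#             "--output_bucket_url": "s3://doc-example-bucket/output/",
#         },
#     )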
args = getResolvedOptions(
    sys.argv, ["JOB_NAME", "input_database", "input_table", "output_bucket_url"]
)

# Set up the Spark and AWS Glue contexts and initialize the job so that
# AWS Glue can track its run state (required for features like job bookmarks).
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)

# Script generated for node S3 Flight Data. Reads the source data described
# by the Data Catalog table into an AWS Glue DynamicFrame.
S3FlightData_node1 = glueContext.create_dynamic_frame.from_catalog(
    database=args["input_database"],
    table_name=args["input_table"],
    transformation_ctx="S3FlightData_node1",
)
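
# During development, you might inspect the frame before transforming it.
# A minimal sketch, left commented out so the production job stays quiet:
#
#     S3FlightData_node1.printSchema()
#     print(f"Record count: {S3FlightData_node1.count()}")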

# This mapping performs two main functions:
# 1. It simplifies the output by removing most of the fields from the data.
# 2. It renames some fields and narrows some types. For example, `fl_date` is
#    renamed to `flight_date`, and `month` is narrowed from long to tinyint.
# Each tuple is (source field, source type, target field, target type).
ApplyMapping_node2 = ApplyMapping.apply(
    frame=S3FlightData_node1,
    mappings=[
        ("year", "long", "year", "long"),
        ("month", "long", "month", "tinyint"),
        ("day_of_month", "long", "day", "tinyint"),
        ("fl_date", "string", "flight_date", "string"),
        ("carrier", "string", "carrier", "string"),
        ("fl_num", "long", "flight_num", "long"),
        ("origin_city_name", "string", "origin_city_name", "string"),
        ("origin_state_abr", "string", "origin_state_abr", "string"),
        ("dest_city_name", "string", "dest_city_name", "string"),
        ("dest_state_abr", "string", "dest_state_abr", "string"),
        ("dep_time", "long", "departure_time", "long"),
        ("wheels_off", "long", "wheels_off", "long"),
        ("wheels_on", "long", "wheels_on", "long"),
        ("arr_time", "long", "arrival_time", "long"),
        ("mon", "string", "mon", "string"),
    ],
    transformation_ctx="ApplyMapping_node2",
)
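
# To spot-check the transformed records, the DynamicFrame can be converted to
# a Spark DataFrame. A minimal sketch, commented out, for debugging only:
#
#     ApplyMapping_node2.toDF().show(5)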

# Script generated for node Revised Flight Data. Writes the transformed
# records to the output bucket as JSON, without partitioning.
RevisedFlightData_node3 = glueContext.write_dynamic_frame.from_options(
    frame=ApplyMapping_node2,
    connection_type="s3",
    format="json",
    connection_options={"path": args["output_bucket_url"], "partitionKeys": []},
    transformation_ctx="RevisedFlightData_node3",
)

# Commit the job to signal successful completion and update its run state.
job.commit()