Skip to content

Commit fc8e9e1

Browse files
committed
updated Java file
1 parent 60d6417 commit fc8e9e1

File tree

2 files changed

+86
-7
lines changed

2 files changed

+86
-7
lines changed

javav2/example_code/glue/src/main/java/com/example/glue/scenario/GlueScenario.java

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -57,15 +57,15 @@
5757
* environment, including your credentials.
5858
* <p>
5959
* For more information, see the following documentation topic:
60-
* <p>
60+
*
6161
* https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/get-started.html
62-
* <p>
62+
*
6363
* To set up the resources, see this documentation topic:
64-
* <p>
64+
*
6565
* https://docs.aws.amazon.com/glue/latest/ug/tutorial-add-crawler.html
66-
* <p>
66+
*
6767
* This example performs the following tasks:
68-
* <p>
68+
*
6969
* 1. Create a database.
7070
* 2. Create a crawler.
7171
* 3. Get a crawler.
@@ -88,7 +88,7 @@ public static void main(String[] args) throws InterruptedException {
8888
final String usage = """
8989
9090
Usage:
91-
<iam> <s3Path> <cron> <dbName> <crawlerName> <jobName>\s
91+
<iam> <s3Path> <cron> <dbName> <crawlerName> <jobName> <scriptLocation> <locationUri> <bucketNameSc>\s
9292
9393
Where:
9494
iam - The ARN of the IAM role that has AWS Glue and S3 permissions.\s
@@ -98,7 +98,7 @@ public static void main(String[] args) throws InterruptedException {
9898
crawlerName - The name of the crawler.\s
9999
jobName - The name you assign to this job definition.
100100
scriptLocation - The Amazon S3 path to a script that runs a job.
101-
locationUri - The location of the database
101+
locationUri - The location of the database (you can find this file in resources folder).
102102
bucketNameSc - The Amazon S3 bucket name used when creating a job
103103
""";
104104

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

"""
This script is used by the AWS Glue _getting started with crawlers and jobs_ scenario to
perform extract, transform, and load (ETL) operations on sample flight data.
As part of the example, it is uploaded to an Amazon Simple Storage Service (Amazon S3)
bucket so that AWS Glue can access it.
"""

# pylint: disable=undefined-variable

import sys

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

"""
These custom arguments must be passed as Arguments to the StartJobRun request.
    --input_database    The name of a metadata database that is contained in your
                        AWS Glue Data Catalog and that contains tables that describe
                        the data to be processed.
    --input_table       The name of a table in the database that describes the data to
                        be processed.
    --output_bucket_url An S3 bucket that receives the transformed output data.
"""
# Resolve the job arguments supplied via the StartJobRun request.
args = getResolvedOptions(
    sys.argv, ["JOB_NAME", "input_database", "input_table", "output_bucket_url"]
)

# Standard Glue job bootstrap: Spark context -> Glue context -> job handle.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)

# Script generated for node S3 Flight Data.
# Reads the source data described by the Data Catalog table into a DynamicFrame.
S3FlightData_node1 = glueContext.create_dynamic_frame.from_catalog(
    database=args["input_database"],
    table_name=args["input_table"],
    transformation_ctx="S3FlightData_node1",
)

# This mapping performs two main functions:
# 1. It simplifies the output by removing most of the fields from the data.
# 2. It renames some fields. For example, `fl_date` is renamed to `flight_date`.
ApplyMapping_node2 = ApplyMapping.apply(
    frame=S3FlightData_node1,
    mappings=[
        ("year", "long", "year", "long"),
        ("month", "long", "month", "tinyint"),
        ("day_of_month", "long", "day", "tinyint"),
        ("fl_date", "string", "flight_date", "string"),
        ("carrier", "string", "carrier", "string"),
        ("fl_num", "long", "flight_num", "long"),
        ("origin_city_name", "string", "origin_city_name", "string"),
        ("origin_state_abr", "string", "origin_state_abr", "string"),
        ("dest_city_name", "string", "dest_city_name", "string"),
        ("dest_state_abr", "string", "dest_state_abr", "string"),
        ("dep_time", "long", "departure_time", "long"),
        ("wheels_off", "long", "wheels_off", "long"),
        ("wheels_on", "long", "wheels_on", "long"),
        ("arr_time", "long", "arrival_time", "long"),
        ("mon", "string", "mon", "string"),
    ],
    transformation_ctx="ApplyMapping_node2",
)

# Script generated for node Revised Flight Data.
# Writes the transformed records to the output bucket as JSON (no partitioning).
RevisedFlightData_node3 = glueContext.write_dynamic_frame.from_options(
    frame=ApplyMapping_node2,
    connection_type="s3",
    format="json",
    connection_options={"path": args["output_bucket_url"], "partitionKeys": []},
    transformation_ctx="RevisedFlightData_node3",
)

# Commit the job bookmark state so Glue records this run as complete.
job.commit()

0 commit comments

Comments
 (0)