Skip to content

Commit e326add

Browse files
lachezar-nmboehm7
authored andcommitted
[SYSTEMDS-1780] Final resource optimizer for AWS EMR
Closes #2135.
1 parent c929843 commit e326add

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+5175
-1226
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,3 +146,7 @@ src/test/scripts/functions/pipelines/intermediates/classification/*
146146

147147
venv
148148
venv/*
149+
150+
# resource optimization
151+
scripts/resource/output
152+
*.pem

docs/api/java/org/apache/sysds/resource/class-use/CloudUtils.InstanceType.html

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<html lang="en">
44
<head>
55
<!-- Generated by javadoc -->
6-
<title>Uses of Class org.apache.sysds.resource.CloudUtils.InstanceType (Apache SystemDS 3.3.0-SNAPSHOT API)</title>
6+
<title>Uses of Class org.apache.sysds.resource.CloudUtils.InstanceFamily (Apache SystemDS 3.3.0-SNAPSHOT API)</title>
77
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
88
<link rel="stylesheet" type="text/css" href="../../../../../stylesheet.css" title="Style">
99
<link rel="stylesheet" type="text/css" href="../../../../../jquery/jquery-ui.min.css" title="Style">
@@ -94,7 +94,7 @@
9494
</header>
9595
<main role="main">
9696
<div class="header">
97-
<h2 title="Uses of Class org.apache.sysds.resource.CloudUtils.InstanceType" class="title">Uses of Class<br>org.apache.sysds.resource.CloudUtils.InstanceType</h2>
97+
<h2 title="Uses of Class org.apache.sysds.resource.CloudUtils.InstanceType" class="title">Uses of Class<br>org.apache.sysds.resource.CloudUtils.InstanceFamily</h2>
9898
</div>
9999
<div class="classUseContainer">
100100
<ul class="blockList">

pom.xml

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@
6464
<maven-shade-plugin.version>3.5.0</maven-shade-plugin.version>
6565
<maven-compiler-plugin.version>3.11.0</maven-compiler-plugin.version>
6666
<maven-antrun-plugin.version>3.1.0</maven-antrun-plugin.version>
67+
<!-- aws-java-sdk-bundle version should align with hadoop-aws version -->
68+
<!-- aws-java-sdk-bundle.version>1.12.367</aws-java-sdk-bundle.version -->
6769
<!-- Set java compile level via argument, ex: 1.8 1.9 10 11-->
6870
<java.level>11</java.level>
6971
<java.version>{java.level}</java.version>
@@ -274,7 +276,7 @@
274276
<manifest>
275277
<addClasspath>true</addClasspath>
276278
<classpathPrefix>lib/</classpathPrefix>
277-
<mainClass>org.apache.sysds.api.ropt.Executor</mainClass>
279+
<mainClass>org.apache.sysds.resource.ResourceOptimizer</mainClass>
278280
</manifest>
279281
<manifestEntries>
280282
<Class-Path>SystemDS.jar ${project.artifactId}-${project.version}.jar</Class-Path>
@@ -413,6 +415,18 @@
413415
<goal>run</goal>
414416
</goals>
415417
</execution>
418+
<execution>
419+
<id>rename-ropt-jar</id>
420+
<phase>package</phase>
421+
<configuration>
422+
<target name="rename test JAR">
423+
<copy file="${project.build.directory}/${project.artifactId}-${project.version}-ropt.jar" tofile="${project.build.directory}/ResourceOptimizer.jar" />
424+
</target>
425+
</configuration>
426+
<goals>
427+
<goal>run</goal>
428+
</goals>
429+
</execution>
416430
</executions>
417431
</plugin>
418432

@@ -1337,6 +1351,18 @@
13371351
</exclusions>
13381352
</dependency>
13391353

1354+
<!--dependency>
1355+
<groupId>org.apache.hadoop</groupId>
1356+
<artifactId>hadoop-aws</artifactId>
1357+
<version>${hadoop.version}</version>
1358+
</dependency-->
1359+
1360+
<!--dependency>
1361+
<groupId>com.amazonaws</groupId>
1362+
<artifactId>aws-java-sdk-bundle</artifactId>
1363+
<version>${aws-java-sdk-bundle.version}</version>
1364+
</dependency-->
1365+
13401366
<dependency>
13411367
<groupId>commons-logging</groupId>
13421368
<artifactId>commons-logging</artifactId>

scripts/resource/README.md

Lines changed: 169 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
Region,Fee Ratio,EBS Price
2+
af-south-1,0.195918367,0.1047
3+
ap-east-1,0.181818182,0.1056
4+
ap-northeast-1,0.193548387,0.096
5+
ap-northeast-2,0.203389831,0.0912
6+
ap-northeast-3,0.193548387,0.096
7+
ap-south-1,0.237623762,0.0912
8+
ap-south-2,0.237623762,0.0912
9+
ap-southeast-1,0.2,0.096
10+
ap-southeast-2,0.2,0.096
11+
ap-southeast-3,0.2,0.096
12+
ap-southeast-4,0.2,0.096
13+
ap-southeast-5,0.235294118,0.0864
14+
ca-central-1,0.224299065,0.088
15+
ca-west-1,0.224299065,0.088
16+
eu-central-1,0.208695652,0.0952
17+
eu-central-2,0.18972332,0.1142
18+
eu-north-1,0.235294118,0.0836
19+
eu-south-1,0.214285714,0.0924
20+
eu-south-2,0.224299065,0.088
21+
eu-west-1,0.224299065,0.088
22+
eu-west-2,0.216216216,0.0928
23+
eu-west-3,0.214285714,0.0928
24+
il-central-1,0.213333333,0.1056
25+
me-central-1,0.204255319,0.0968
26+
me-south-1,0.204255319,0.0968
27+
sa-east-1,0.156862745,0.152
28+
us-east-1,0.25,0.08
29+
us-east-2,0.25,0.08
30+
us-west-1,0.214285714,0.096
31+
us-west-2,0.25,0.08

scripts/resource/bin/systemds-ropt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#!/usr/bin/env bash
# Launcher for the SystemDS resource optimizer.
# Runs ${SYSTEMDS_ROOT}/target/ResourceOptimizer.jar, forwarding all
# command-line arguments and appending the default options file
# via the -options argument.
#
# Required environment:
#   SYSTEMDS_ROOT - root directory of the SystemDS checkout/build

# Fail early with a clear message instead of silently resolving
# the paths below against the filesystem root
: "${SYSTEMDS_ROOT:?SYSTEMDS_ROOT must be set to the SystemDS root directory}"

ROPT_JAR_FILE="${SYSTEMDS_ROOT}/target/ResourceOptimizer.jar"
DEFAULT_PROPERTIES="${SYSTEMDS_ROOT}/scripts/resource/options.properties"

if [ ! -f "$ROPT_JAR_FILE" ]; then
    echo "Error: resource optimizer jar not found at $ROPT_JAR_FILE" >&2
    exit 1
fi

java -jar "$ROPT_JAR_FILE" "$@" -options "$DEFAULT_PROPERTIES"
7+

scripts/resource/ec2_stats.csv

Lines changed: 373 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
#-------------------------------------------------------------
2+
#
3+
# Licensed to the Apache Software Foundation (ASF) under one
4+
# or more contributor license agreements. See the NOTICE file
5+
# distributed with this work for additional information
6+
# regarding copyright ownership. The ASF licenses this file
7+
# to you under the Apache License, Version 2.0 (the
8+
# "License"); you may not use this file except in compliance
9+
# with the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing,
14+
# software distributed under the License is distributed on an
15+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
# KIND, either express or implied. See the License for the
17+
# specific language governing permissions and limitations
18+
# under the License.
19+
#
20+
#-------------------------------------------------------------
21+
22+
# Configurations for EMR launch
23+
24+
# User-defined configurations --------------------------------
25+
26+
# Program specific --------------------------------
27+
28+
# URI address of the SystemDS jar file on S3
29+
SYSTEMDS_JAR_URI=
30+
# DML script path (use s3a:// URI schema for remote scripts in S3)
31+
SYSTEMDS_PROGRAM=s3://systemds-testing/dml_scripts/Algorithm_L2SVM.dml
32+
# Set the file path arguments with the adapted URI address
33+
# for the actual file location and always s3a:// schema
34+
# comma separated values
35+
SYSTEMDS_ARGS=
36+
# comma separated key=value pairs
37+
SYSTEMDS_NVARGS=m=200000,n=10000
38+
#Y=s3://systemds-testing/data/Y.csv,B=s3a://systemds-testing/data/B.csv
39+
40+
# AWS specific -------------------------
41+
42+
# Inspect the version differences before changing to a version different from 7.3.0
43+
EMR_VERSION="emr-7.3.0"
44+
# output file of the resource optimization: hardware configurations
45+
INSTANCE_CONFIGS=
46+
# output file of the resource optimization: Spark configurations
47+
SPARK_CONFIGS=
48+
# existing SSH key (not created automatically)
49+
KEYPAIR_NAME=
50+
# Choose the same region that was used when executing the resource optimizer
51+
REGION=us-east-1
52+
# Optionally provide a (single) security group ID to be attached as an additional group to the master node
53+
# If the value is empty, the option won't be used, AWS won't attach an additional group, and SSH access may be blocked
54+
# Multiple additional groups are not supported by the launch script and this one is attached to the master only
55+
SECURITY_GROUP_ID=
56+
# Provide already created names
57+
# or desired names for generation with 'generate_instance_profile.sh'
58+
INSTANCE_PROFILE_NAME=
59+
IAM_ROLE_NAME=
60+
# Desired subnet to be used by the cluster, if not defined a default one will be used
61+
TARGET_SUBNET=
62+
# S3 folder URI for landing of log files
63+
LOG_URI=
64+
65+
# Execution specific -------------------------
66+
67+
# (number) - if 0 the cluster will be terminated automatically after program execution
68+
# - if greater than 0 the cluster will be terminated automatically after the given number of seconds in idle state
69+
# - if less than 0 no automatic termination rules will be applied
70+
AUTO_TERMINATION_TIME=-1
71+
72+
# Automatic configurations (read only for users) -------------
73+
74+
# Current EMR Cluster ID
75+
CLUSTER_ID=
76+
# Public DNS name of the master node in the current cluster
77+
CLUSTER_URL=
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#!/usr/bin/env bash
2+
#-------------------------------------------------------------
3+
#
4+
# Licensed to the Apache Software Foundation (ASF) under one
5+
# or more contributor license agreements. See the NOTICE file
6+
# distributed with this work for additional information
7+
# regarding copyright ownership. The ASF licenses this file
8+
# to you under the Apache License, Version 2.0 (the
9+
# "License"); you may not use this file except in compliance
10+
# with the License. You may obtain a copy of the License at
11+
#
12+
# http://www.apache.org/licenses/LICENSE-2.0
13+
#
14+
# Unless required by applicable law or agreed to in writing,
15+
# software distributed under the License is distributed on an
16+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17+
# KIND, either express or implied. See the License for the
18+
# specific language governing permissions and limitations
19+
# under the License.
20+
#
21+
#-------------------------------------------------------------
22+
23+
# Launches an EMR cluster via the AWS CLI using the options from cluster.env
# and, if a step definition was generated, submits a step running
# $SYSTEMDS_PROGRAM. Stores the resulting CLUSTER_ID and CLUSTER_URL
# through set_config.

# exit in case of error or unbound var
set -euo pipefail

# get file directory to allow finding the file with the utils
SCRIPT_DIR="$(dirname "$(realpath "$0")")"

# cluster.env is looked up in the current working directory,
# cluster_utils.sh next to this script
source cluster.env
source "$SCRIPT_DIR/cluster_utils.sh"

if [ -n "$TARGET_SUBNET" ]; then
    SUBNET=$TARGET_SUBNET
else
    # Get the first available subnet in the default VPC of the configured region
    SUBNET=$(aws ec2 describe-subnets --region "$REGION" \
        --filter "Name=defaultForAz,Values=true" --query "Subnets[0].SubnetId" --output text)
fi

# generate the step definition into STEP variable
generate_step_definition

# Attach the additional master security group only if one is configured;
# an empty SECURITY_GROUP_ID must not produce an empty group entry
ADDITIONAL_SECURITY_GROUP=""
if [ -n "$SECURITY_GROUP_ID" ]; then
    ADDITIONAL_SECURITY_GROUP='"AdditionalMasterSecurityGroups": ["'"${SECURITY_GROUP_ID}"'"],'
fi

# NOTE(review): the instance profile is hard-coded here although the launch
# configuration also defines INSTANCE_PROFILE_NAME - confirm which is intended
EC2_ATTRIBUTES='{
    "KeyName":"'"${KEYPAIR_NAME}"'",
    "InstanceProfile":"EMR_EC2_DefaultRole",
    '"${ADDITIONAL_SECURITY_GROUP}"'
    "SubnetId": "'"${SUBNET}"'"
}'

echo -e "\nLaunching EMR cluster via AWS CLI and adding a step to run $SYSTEMDS_PROGRAM with SystemDS"
# The three optional arguments are intentionally left unquoted: they must
# expand to nothing when disabled and to separate words when enabled
CLUSTER_INFO=$(aws emr create-cluster \
    --applications Name=AmazonCloudWatchAgent Name=Spark \
    --ec2-attributes "$EC2_ATTRIBUTES" \
    --service-role EMR_DefaultRole \
    --enable-debugging \
    --release-label "$EMR_VERSION" \
    --log-uri "$LOG_URI" \
    --name "SystemDS cluster" \
    --instance-groups "file://$INSTANCE_CONFIGS" \
    --configurations "file://$SPARK_CONFIGS" \
    --scale-down-behavior TERMINATE_AT_TASK_COMPLETION \
    --no-termination-protected \
    $( [ -n "$STEP" ] && echo "--steps $STEP" ) \
    $( [ "$AUTO_TERMINATION_TIME" = 0 ] && echo "--auto-terminate" ) \
    $( [ "$AUTO_TERMINATION_TIME" -gt 0 ] && echo "--auto-termination-policy IdleTimeout=$AUTO_TERMINATION_TIME" ) \
    --region "$REGION")

# -r prints the raw string, so no quote stripping is needed
CLUSTER_ID=$(jq -r .ClusterId <<< "$CLUSTER_INFO")
echo "Cluster successfully initialized with cluster ID: ${CLUSTER_ID}"
set_config "CLUSTER_ID" "$CLUSTER_ID"

# Wait for cluster to start
echo -e "\nWaiting for cluster to enter running state..."
aws emr wait cluster-running --cluster-id "$CLUSTER_ID" --region "$REGION"

CLUSTER_URL=$(aws emr describe-cluster --cluster-id "$CLUSTER_ID" --region "$REGION" \
    | jq -r .Cluster.MasterPublicDnsName)
set_config "CLUSTER_URL" "$CLUSTER_URL"

echo "...launching process has finished and the cluster is now in state running."

if [ "$AUTO_TERMINATION_TIME" = 0 ]; then
    echo -e "\nImmediate automatic termination was enabled so the cluster will terminate directly after the step completion"
elif [ "$AUTO_TERMINATION_TIME" -gt 0 ]; then
    echo -e "\nDelayed automatic termination was enabled so the cluster will terminate $AUTO_TERMINATION_TIME seconds after entering idle state"
else
    echo -e "\nAutomatic termination was not enabled so you should manually terminate the cluster"
fi
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#!/usr/bin/env bash
2+
#-------------------------------------------------------------
3+
#
4+
# Licensed to the Apache Software Foundation (ASF) under one
5+
# or more contributor license agreements. See the NOTICE file
6+
# distributed with this work for additional information
7+
# regarding copyright ownership. The ASF licenses this file
8+
# to you under the Apache License, Version 2.0 (the
9+
# "License"); you may not use this file except in compliance
10+
# with the License. You may obtain a copy of the License at
11+
#
12+
# http://www.apache.org/licenses/LICENSE-2.0
13+
#
14+
# Unless required by applicable law or agreed to in writing,
15+
# software distributed under the License is distributed on an
16+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17+
# KIND, either express or implied. See the License for the
18+
# specific language governing permissions and limitations
19+
# under the License.
20+
#
21+
#-------------------------------------------------------------
22+
23+
# Adds a step running $SYSTEMDS_PROGRAM to the already launched EMR cluster
# identified by CLUSTER_ID (read from cluster.env). If AUTO_TERMINATION_TIME
# is 0, waits for the step to complete and then terminates the cluster.

# exit in case of error or unbound var
set -euo pipefail

# get file directory to allow finding the file with the utils
SCRIPT_DIR="$(dirname "$(realpath "$0")")"

# cluster.env is looked up in the current working directory,
# cluster_utils.sh next to this script
source cluster.env
source "$SCRIPT_DIR/cluster_utils.sh"

# generate the step definition into STEP variable
generate_step_definition
if [ -z "$STEP" ]; then
    echo "Error: Empty step definition, probably due to empty SYSTEMDS_PROGRAM option." >&2
    exit 1
fi

echo "Adding a step to run $SYSTEMDS_PROGRAM with SystemDS"
# STEP stays unquoted to keep the original word splitting; its exact format
# is produced by generate_step_definition, which is defined in cluster_utils.sh
STEP_INFO=$(aws emr add-steps --cluster-id "$CLUSTER_ID" --region "$REGION" --steps $STEP)

if [ "$AUTO_TERMINATION_TIME" = 0 ]; then
    # reduce the JSON array of step IDs to the bare ID string
    STEP_ID=$(echo "$STEP_INFO" | jq .StepIds | tr -d '"' | tr -d ']' | tr -d '[' | tr -d '[:space:]')
    echo "Waiting for the step to finish before termination (immediate automatic termination enabled)"
    aws emr wait step-complete --cluster-id "$CLUSTER_ID" --step-id "$STEP_ID" --region "$REGION"
    echo "The step has finished and now the cluster will be immediately terminated"
    # pass the region explicitly so the call targets the same region as the calls above
    aws emr terminate-clusters --cluster-ids "$CLUSTER_ID" --region "$REGION"
elif [ "$AUTO_TERMINATION_TIME" -gt 0 ]; then
    echo "Delayed automatic termination will apply only in case this option was set on cluster launch."
    echo "You should manually track the step completion"
else
    echo "Automatic termination was not enabled so you should manually track the step completion and terminate the cluster"
fi
54+
55+

0 commit comments

Comments
 (0)