Skip to content

Commit 0a7821b

Browse files
authored
Refactor how we launch spot instances (#366)
* use create_instances for spot instances too
* deprecate --ec2-spot-request-duration
* remove unused date utilities for translating durations into expirations
* add changelog
1 parent 7dde875 commit 0a7821b

File tree

6 files changed

+43
-148
lines changed

6 files changed

+43
-148
lines changed

CHANGES.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,13 @@
88

99
* [#348]: Bumped default Spark to 3.2; dropped support for Python 3.6; added CI build for Python 3.10.
1010
* [#361]: Migrated from AdoptOpenJDK, which is deprecated, to Adoptium OpenJDK.
11-
* [#362]: Improved Flintrock's ability to cleanup after launch failures.
11+
* [#362][#366]: Improved Flintrock's ability to clean up after launch failures.
12+
* [#366]: Deprecated `--ec2-spot-request-duration`, which is not needed for one-time spot instances launched using the RunInstances API.
1213

1314
[#348]: https://github.com/nchammas/flintrock/pull/348
1415
[#361]: https://github.com/nchammas/flintrock/pull/361
1516
[#362]: https://github.com/nchammas/flintrock/pull/362
17+
[#366]: https://github.com/nchammas/flintrock/pull/366
1618

1719
## [2.0.0] - 2021-06-10
1820

flintrock/config.yaml.template

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ providers:
3535
# ami: ami-61bbf104 # CentOS 7, us-east-1
3636
# user: centos
3737
# spot-price: <price>
38-
# spot-request-duration: 7d # duration a spot request is valid, supports d/h/m/s (e.g. 4d 3h 2m 1s)
3938
# vpc-id: <id>
4039
# subnet-id: <id>
4140
# placement-group: <name>

flintrock/ec2.py

Lines changed: 25 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
)
2828
from .ssh import generate_ssh_key_pair
2929
from .services import SecurityGroupRule
30-
from .util import duration_to_expiration
3130

3231
logger = logging.getLogger('flintrock.ec2')
3332

@@ -275,7 +274,6 @@ def add_slaves(
275274
identity_file: str,
276275
num_slaves: int,
277276
spot_price: float,
278-
spot_request_duration: str,
279277
min_root_ebs_size_gb: int,
280278
tags: list,
281279
assume_yes: bool,
@@ -321,7 +319,6 @@ def add_slaves(
321319
num_instances=num_slaves,
322320
region=self.region,
323321
spot_price=spot_price,
324-
spot_request_valid_until=duration_to_expiration(spot_request_duration),
325322
ami=self.master_instance.image_id,
326323
assume_yes=assume_yes,
327324
key_name=self.master_instance.key_name,
@@ -704,7 +701,6 @@ def _create_instances(
704701
num_instances,
705702
region,
706703
spot_price,
707-
spot_request_valid_until,
708704
ami,
709705
assume_yes,
710706
key_name,
@@ -724,7 +720,6 @@ def _create_instances(
724720
ec2 = boto3.resource(service_name='ec2', region_name=region)
725721

726722
cluster_instances = []
727-
spot_requests = []
728723
common_launch_specs = {
729724
'ImageId': ami,
730725
'KeyName': key_name,
@@ -733,7 +728,6 @@ def _create_instances(
733728
'Placement': {
734729
'AvailabilityZone': availability_zone,
735730
'Tenancy': tenancy,
736-
'GroupName': placement_group,
737731
},
738732
'SecurityGroupIds': security_group_ids,
739733
'SubnetId': subnet_id,
@@ -748,80 +742,36 @@ def _create_instances(
748742
],
749743
}
750744

745+
if spot_price:
746+
common_launch_specs.update({
747+
'InstanceMarketOptions': {
748+
'MarketType': 'spot',
749+
'SpotOptions': {
750+
'SpotInstanceType': 'one-time',
751+
'MaxPrice': str(spot_price),
752+
'InstanceInterruptionBehavior': 'terminate',
753+
},
754+
}
755+
})
756+
else:
757+
common_launch_specs.update({
758+
'InstanceInitiatedShutdownBehavior': instance_initiated_shutdown_behavior,
759+
})
760+
# This can't be part of the previous update because we need a deep merge.
761+
common_launch_specs['Placement'].update({
762+
'GroupName': placement_group,
763+
})
764+
751765
try:
752-
if spot_price:
753-
user_data = base64.b64encode(user_data.encode('utf-8')).decode()
754-
logger.info("Requesting {c} spot instances at a max price of ${p}...".format(
755-
c=num_instances, p=spot_price))
756-
client = ec2.meta.client
757-
spot_requests = client.request_spot_instances(
758-
SpotPrice=str(spot_price),
759-
InstanceCount=num_instances,
760-
ValidUntil=spot_request_valid_until,
761-
LaunchSpecification=common_launch_specs,
762-
)['SpotInstanceRequests']
763-
764-
request_ids = [r['SpotInstanceRequestId'] for r in spot_requests]
765-
pending_request_ids = request_ids
766-
767-
while pending_request_ids:
768-
logger.info("{grant} of {req} instances granted. Waiting...".format(
769-
grant=num_instances - len(pending_request_ids),
770-
req=num_instances))
771-
time.sleep(30)
772-
spot_requests = client.describe_spot_instance_requests(
773-
SpotInstanceRequestIds=request_ids)['SpotInstanceRequests']
774-
775-
failed_requests = [r for r in spot_requests if r['State'] == 'failed']
776-
if failed_requests:
777-
failure_reasons = {r['Status']['Code'] for r in failed_requests}
778-
raise Error(
779-
"The spot request failed for the following reason{s}: {reasons}"
780-
.format(
781-
s='' if len(failure_reasons) == 1 else 's',
782-
reasons=', '.join(failure_reasons)))
783-
784-
pending_request_ids = [
785-
r['SpotInstanceRequestId'] for r in spot_requests
786-
if r['State'] == 'open']
787-
788-
logger.info("All {c} instances granted.".format(c=num_instances))
789-
790-
cluster_instances = list(
791-
ec2.instances.filter(
792-
Filters=[
793-
{'Name': 'instance-id', 'Values': [r['InstanceId'] for r in spot_requests]}
794-
]))
795-
else:
796-
cluster_instances = ec2.create_instances(
797-
MinCount=num_instances,
798-
MaxCount=num_instances,
799-
# Shutdown Behavior is specific to on-demand instances.
800-
InstanceInitiatedShutdownBehavior=instance_initiated_shutdown_behavior,
801-
**common_launch_specs,
802-
)
766+
cluster_instances = ec2.create_instances(
767+
MinCount=num_instances,
768+
MaxCount=num_instances,
769+
**common_launch_specs,
770+
)
803771
return cluster_instances
804772
except (Exception, KeyboardInterrupt) as e:
805773
if not isinstance(e, KeyboardInterrupt):
806774
print(e, file=sys.stderr)
807-
if spot_requests:
808-
request_ids = [r['SpotInstanceRequestId'] for r in spot_requests]
809-
if any([r['State'] != 'active' for r in spot_requests]):
810-
print("Canceling spot instance requests...", file=sys.stderr)
811-
client.cancel_spot_instance_requests(
812-
SpotInstanceRequestIds=request_ids)
813-
# Make sure we have the latest information on any launched spot instances.
814-
spot_requests = client.describe_spot_instance_requests(
815-
SpotInstanceRequestIds=request_ids)['SpotInstanceRequests']
816-
instance_ids = [
817-
r['InstanceId'] for r in spot_requests
818-
if 'InstanceId' in r]
819-
if instance_ids:
820-
cluster_instances = list(
821-
ec2.instances.filter(
822-
Filters=[
823-
{'Name': 'instance-id', 'Values': instance_ids}
824-
]))
825775
raise InterruptedEC2Operation(instances=cluster_instances) from e
826776

827777

@@ -842,7 +792,6 @@ def launch(
842792
user,
843793
security_groups,
844794
spot_price=None,
845-
spot_request_duration=None,
846795
min_root_ebs_size_gb,
847796
vpc_id,
848797
subnet_id,
@@ -916,7 +865,6 @@ def launch(
916865
common_instance_spec = {
917866
'region': region,
918867
'spot_price': spot_price,
919-
'spot_request_valid_until': duration_to_expiration(spot_request_duration),
920868
'ami': ami,
921869
'assume_yes': assume_yes,
922870
'key_name': key_name,

flintrock/flintrock.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,11 @@
3939
else:
4040
THIS_DIR = os.path.dirname(os.path.realpath(__file__))
4141

42+
EC2_SPOT_REQUEST_DURATION_DEPRECATION_MESSAGE = (
43+
"Deprecation: --ec2-spot-request-duration is deprecated. One-time spot instances do "
44+
"not support a request duration. "
45+
"For more information see: https://github.com/nchammas/flintrock/pull/366"
46+
)
4247

4348
logger = logging.getLogger('flintrock.flintrock')
4449

@@ -345,8 +350,8 @@ def cli(cli_context, config, provider, debug):
345350
help="Additional security groups names to assign to the instances. "
346351
"You can specify this option multiple times.")
347352
@click.option('--ec2-spot-price', type=float)
348-
@click.option('--ec2-spot-request-duration', default='7d',
349-
help="Duration a spot request is valid (e.g. 3d 2h 1m).")
353+
@click.option('--ec2-spot-request-duration',
354+
help="(DEPRECATED) Duration a spot request is valid (e.g. 3d 2h 1m).")
350355
@click.option('--ec2-min-root-ebs-size-gb', type=int, default=30)
351356
@click.option('--ec2-vpc-id', default='', help="Leave empty for default VPC.")
352357
@click.option('--ec2-subnet-id', default='')
@@ -414,6 +419,9 @@ def launch(
414419
"""
415420
Launch a new cluster.
416421
"""
422+
if ec2_spot_request_duration:
423+
logger.warning(EC2_SPOT_REQUEST_DURATION_DEPRECATION_MESSAGE)
424+
417425
provider = cli_context.obj['provider']
418426
services = []
419427

@@ -511,7 +519,6 @@ def launch(
511519
user=ec2_user,
512520
security_groups=ec2_security_groups,
513521
spot_price=ec2_spot_price,
514-
spot_request_duration=ec2_spot_request_duration,
515522
min_root_ebs_size_gb=ec2_min_root_ebs_size_gb,
516523
vpc_id=ec2_vpc_id,
517524
subnet_id=ec2_subnet_id,
@@ -787,8 +794,8 @@ def stop(cli_context, cluster_name, ec2_region, ec2_vpc_id, assume_yes):
787794
help="Path to SSH .pem file for accessing nodes.")
788795
@click.option('--ec2-user')
789796
@click.option('--ec2-spot-price', type=float)
790-
@click.option('--ec2-spot-request-duration', default='7d',
791-
help="Duration a spot request is valid (e.g. 3d 2h 1m).")
797+
@click.option('--ec2-spot-request-duration',
798+
help="(DEPRECATED) Duration a spot request is valid (e.g. 3d 2h 1m).")
792799
@click.option('--ec2-min-root-ebs-size-gb', type=int, default=30)
793800
@click.option('--assume-yes/--no-assume-yes', default=False)
794801
@click.option('--ec2-tag', 'ec2_tags',
@@ -816,6 +823,8 @@ def add_slaves(
816823
Flintrock will configure new slaves based on information queried
817824
automatically from the master.
818825
"""
826+
if ec2_spot_request_duration:
827+
logger.warning(EC2_SPOT_REQUEST_DURATION_DEPRECATION_MESSAGE)
819828
provider = cli_context.obj['provider']
820829

821830
option_requires(
@@ -842,7 +851,6 @@ def add_slaves(
842851
provider_options = {
843852
'min_root_ebs_size_gb': ec2_min_root_ebs_size_gb,
844853
'spot_price': ec2_spot_price,
845-
'spot_request_duration': ec2_spot_request_duration,
846854
'tags': ec2_tags
847855
}
848856
else:

flintrock/util.py

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
import os
22
import sys
3-
from datetime import datetime, timedelta, timezone
4-
from decimal import Decimal
53

64
FROZEN = getattr(sys, 'frozen', False)
75

@@ -20,47 +18,6 @@ def get_subprocess_env() -> dict:
2018
return env
2119

2220

23-
def duration_to_timedelta(duration_string):
24-
"""
25-
Convert a time duration string (e.g. 3h 4m 10s) into a timedelta
26-
"""
27-
28-
duration_string = duration_string.lower()
29-
30-
total_seconds = Decimal('0')
31-
32-
prev_num = []
33-
for character in duration_string:
34-
if character.isalpha():
35-
if prev_num:
36-
num = Decimal(''.join(prev_num))
37-
if character == 'd':
38-
total_seconds += num * 60 * 60 * 24
39-
elif character == 'h':
40-
total_seconds += num * 60 * 60
41-
elif character == 'm':
42-
total_seconds += num * 60
43-
elif character == 's':
44-
total_seconds += num
45-
prev_num = []
46-
47-
elif character.isnumeric() or character == '.':
48-
prev_num.append(character)
49-
50-
return timedelta(seconds=float(total_seconds))
51-
52-
53-
def duration_to_expiration(duration_string):
54-
default_duration = timedelta(days=7)
55-
56-
if not duration_string:
57-
expiration = datetime.now(tz=timezone.utc) + default_duration
58-
else:
59-
expiration = datetime.now(tz=timezone.utc) + duration_to_timedelta(duration_string)
60-
61-
return expiration
62-
63-
6421
def spark_hadoop_build_version(hadoop_version: str) -> str:
6522
"""
6623
Given a Hadoop version, determine the Hadoop build of Spark to use.

tests/test_util.py

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,4 @@
1-
from datetime import datetime, timedelta, timezone
2-
from flintrock.util import (
3-
duration_to_timedelta,
4-
duration_to_expiration,
5-
spark_hadoop_build_version,
6-
)
7-
from freezegun import freeze_time
8-
9-
10-
def test_duration_to_timedelta():
11-
assert duration_to_timedelta('1d') == timedelta(days=1)
12-
assert duration_to_timedelta('3d2h1m') == timedelta(days=3, hours=2, minutes=1)
13-
assert duration_to_timedelta('4d 2h 1m 5s') == timedelta(days=4, hours=2, minutes=1, seconds=5)
14-
assert duration_to_timedelta('36h') == timedelta(hours=36)
15-
assert duration_to_timedelta('7d') == timedelta(days=7)
16-
17-
18-
@freeze_time("2012-01-14")
19-
def test_duration_to_expiration():
20-
assert duration_to_expiration('5m') == datetime.now(tz=timezone.utc) + timedelta(minutes=5)
1+
from flintrock.util import spark_hadoop_build_version
212

223

234
def test_spark_hadoop_build_version():

0 commit comments

Comments (0)