Skip to content

Commit 269933e

Browse files
authored
planner: add support for compact and spot policies (#85)
* planner: add support for compaction policy * planner: add support for spot policy
1 parent 2f39e04 commit 269933e

File tree

15 files changed

+237
-46
lines changed

15 files changed

+237
-46
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ running [Faasm](https://github.com/faasm/faasm) cluster.
88
To install `faasmctl` you need a working `pip` (virtual-)environment. Then:
99

1010
```bash
11-
pip install faasmctl==0.42.0
11+
pip install faasmctl==0.43.0
1212
```
1313

1414
## Usage

faasmctl/tasks/deploy.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,16 +59,16 @@ def dist_tests(ctx, mount_source=None, ini_file=None):
5959
- mount_source (str): path to the Faasm's source code checkout
6060
- ini_file (str): optional path to a running cluster
6161
"""
62-
if not mount_source:
63-
raise RuntimeError(
64-
"When deploying a dist-tests cluster, you need to"
65-
" specify the --mount-source"
66-
)
67-
6862
# If the user provided a path to the ini_file, it means that we are
6963
# deploying the dist-tests on top of an existing cluster. Otherwise start
7064
# a new compose cluster
7165
if ini_file is None:
66+
if not mount_source:
67+
raise RuntimeError(
68+
"When deploying a dist-tests cluster, you need to"
69+
" specify the --mount-source"
70+
)
71+
7272
ini_file = compose(ctx, workers=0, mount_source=mount_source, clean=False)
7373

7474
# Second, start the dist-test-server

faasmctl/tasks/logs.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,6 @@ def get_k8s_logs(s, follow, ini_file, last_restart):
4242

4343
k8s_cmd = [
4444
"logs",
45-
# TODO: there seems to be a divergence between our time and AKSs
46-
# "--since-time={}".format(last_restart),
4745
"-f" if follow else "",
4846
service_to_k8s_str[service],
4947
"--tail=-1",

faasmctl/tasks/monitor.py

Lines changed: 75 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,9 @@ def stop_container():
4545
)
4646

4747

48-
def get_apps_to_be_migrated(registered_workers, in_flight_apps, worker_occupation):
48+
def get_apps_to_be_migrated(
49+
planner_policy, registered_workers, in_flight_apps, worker_occupation
50+
):
4951
"""
5052
Helper method that, given the current worker occupation, works out all the
5153
apps that could be migrated if they checked for a migration opportunity
@@ -56,12 +58,17 @@ def get_apps_to_be_migrated(registered_workers, in_flight_apps, worker_occupatio
5658
worker_occupation_file_path = "/tmp/worker_occupation{}.csv".format(file_suffix)
5759
with open(worker_occupation_file_path, "w") as fh:
5860
fh.write("WorkerIp,Slots\n")
59-
for ip in worker_occupation:
60-
total_slots = [w for w in registered_workers.hosts if w.ip == ip][0].slots
61-
for i in range(len(worker_occupation[ip]), total_slots):
62-
worker_occupation[ip].append("-1")
63-
64-
fh.write("{},{}\n".format(ip, ",".join(worker_occupation[ip])))
61+
try:
62+
for ip in worker_occupation:
63+
total_slots = [w for w in registered_workers.hosts if w.ip == ip][
64+
0
65+
].slots
66+
for i in range(len(worker_occupation[ip]), total_slots):
67+
worker_occupation[ip].append("-1")
68+
69+
fh.write("{},{}\n".format(ip, ",".join(worker_occupation[ip])))
70+
except IndexError:
71+
return []
6572

6673
# Start container in the background
6774
docker_cmd = [
@@ -81,8 +88,8 @@ def get_apps_to_be_migrated(registered_workers, in_flight_apps, worker_occupatio
8188
docker_cmd = [
8289
"docker exec",
8390
get_ctr_name(),
84-
"bash -c '/build/faasm/bin/is_app_migratable {} {}'".format(
85-
app.appId, worker_occupation_file_path
91+
"bash -c '/build/faasm/bin/is_app_migratable {} {} {}'".format(
92+
planner_policy, app.appId, worker_occupation_file_path
8693
),
8794
]
8895
docker_cmd = " ".join(docker_cmd)
@@ -91,7 +98,7 @@ def get_apps_to_be_migrated(registered_workers, in_flight_apps, worker_occupatio
9198
# App can not be migrated
9299
continue
93100
elif out.returncode == 0:
94-
to_be_migrated_apps.append(app.appId)
101+
to_be_migrated_apps.append(app)
95102
else:
96103
# stop_container()
97104
# Survive downstream binary errors, but report a warning
@@ -111,7 +118,7 @@ def get_apps_to_be_migrated(registered_workers, in_flight_apps, worker_occupatio
111118
orig_num_migrations = -1
112119

113120

114-
def print_planner_resources():
121+
def print_planner_resources(policy):
115122
"""
116123
Helper method to visualise the state of the planner
117124
"""
@@ -121,13 +128,19 @@ def color_text(color, text="X"):
121128
num1 = str(color)
122129
return f"\033[38;5;{num1}m{text}\033[0;0m"
123130

124-
def print_line(host_msg, worker_occupation):
131+
def print_line(host_msg, worker_occupation, next_evicted_vm_ips=[]):
132+
is_evicted = host_msg.ip in next_evicted_vm_ips
133+
125134
line = "{}\t".format(host_msg.ip)
135+
if is_evicted:
136+
line = "{}\t".format(color_text(1, text=host_msg.ip))
137+
126138
used_slots = host_msg.usedSlots
127139
occupation = (
128140
worker_occupation[host_msg.ip] if host_msg.ip in worker_occupation else []
129141
)
130142
if used_slots != len(occupation):
143+
# TODO: FIXME: this is a symptom of a problem!!
131144
print(
132145
"Expected {} used slots for host {} but got {}!".format(
133146
used_slots,
@@ -145,11 +158,25 @@ def print_line(host_msg, worker_occupation):
145158
line += " [ ]"
146159
print(line)
147160

161+
def get_app_color(app, policy):
    """
    Work out the terminal color code (0-255) used to print an app

    For any policy other than "compact", the color is derived from the app
    id. For the "compact" (multi-tenant) policy the color is derived from
    the app's `subType` field, which is used as the user id; if the app has
    no such attribute, a user id of 0 is used.
    """
    # User ids only matter for the COMPACT (multi-tenant) policy
    if policy != "compact":
        return app.appId % 256

    tenant_id = getattr(app, "subType", 0)
    return (tenant_id * 10) % 256
174+
148175
def print_apps_legend(in_flight_apps):
149176
num_apps_per_line = NUM_APPS_PER_LINE
150177
line = ""
151178
for i, app in enumerate(in_flight_apps.apps):
152-
app_color = app.appId % 256
179+
app_color = get_app_color(app, policy)
153180
app_text = color_text(app_color, "App ID: {}".format(app.appId))
154181
if i == 0:
155182
line = app_text
@@ -163,9 +190,24 @@ def print_apps_legend(in_flight_apps):
163190
def print_migration_opportunities(apps_to_be_migrated):
164191
num_apps_per_line = NUM_APPS_PER_LINE
165192
line = ""
166-
for i, app_id in enumerate(apps_to_be_migrated):
167-
app_color = app_id % 256
168-
app_text = color_text(app_color, "App ID: {}".format(app_id))
193+
for i, app in enumerate(apps_to_be_migrated):
194+
app_color = get_app_color(app, policy)
195+
app_text = color_text(app_color, "App ID: {}".format(app.appId))
196+
if i == 0:
197+
line = app_text
198+
elif i % num_apps_per_line == 0:
199+
print(line)
200+
line = app_text
201+
else:
202+
line += "\t{}".format(app_text)
203+
print(line)
204+
205+
def print_frozen_apps(frozen_apps):
206+
num_apps_per_line = NUM_APPS_PER_LINE
207+
line = ""
208+
for i, app in enumerate(frozen_apps):
209+
app_color = get_app_color(app, policy)
210+
app_text = color_text(app_color, "App ID: {}".format(app.appId))
169211
if i == 0:
170212
line = app_text
171213
elif i % num_apps_per_line == 0:
@@ -178,6 +220,7 @@ def print_migration_opportunities(apps_to_be_migrated):
178220
header = "============== PLANNER RESOURCES ==============="
179221
divide = "------------------------------------------------"
180222
div_mg = "*********** MIGRATION OPPORTUNITIES ************"
223+
div_fa = "***************** FROZEN APPS ******************"
181224
div_al = "************* APP ID COLOR LEGEND **************"
182225
footer = "================================================"
183226

@@ -192,7 +235,7 @@ def print_migration_opportunities(apps_to_be_migrated):
192235
worker_occupation = {}
193236
worker_occupation_ids = {}
194237
for app in in_flight_apps.apps:
195-
app_color = app.appId % 256
238+
app_color = get_app_color(app, policy)
196239
for ip in app.hostIps:
197240
if ip not in worker_occupation:
198241
worker_occupation[ip] = []
@@ -204,10 +247,14 @@ def print_migration_opportunities(apps_to_be_migrated):
204247
if orig_num_migrations < 0:
205248
orig_num_migrations = in_flight_apps.numMigrations
206249

250+
# Work out the frozen apps
251+
next_evicted_vm_ips = in_flight_apps.nextEvictedVmIps
252+
frozen_apps = [app for app in in_flight_apps.frozenApps]
253+
207254
# Work out the existing migration opportunities
208255
registered_workers = get_available_hosts()
209256
apps_to_be_migrated = get_apps_to_be_migrated(
210-
registered_workers, in_flight_apps, worker_occupation_ids
257+
policy, registered_workers, in_flight_apps, worker_occupation_ids
211258
)
212259

213260
# -------------
@@ -217,7 +264,7 @@ def print_migration_opportunities(apps_to_be_migrated):
217264
print(header)
218265
# Print registered worker occupation
219266
for worker in registered_workers.hosts:
220-
print_line(worker, worker_occupation)
267+
print_line(worker, worker_occupation, next_evicted_vm_ips)
221268

222269
# Print migration opportunities (if any)
223270
if len(apps_to_be_migrated) > 0:
@@ -226,6 +273,13 @@ def print_migration_opportunities(apps_to_be_migrated):
226273
print(divide)
227274
print_migration_opportunities(apps_to_be_migrated)
228275

276+
# Print frozen apps (if any)
277+
if len(frozen_apps) > 0:
278+
print(divide)
279+
print(div_fa)
280+
print(divide)
281+
print_frozen_apps(frozen_apps)
282+
229283
# Print app-to-color legend (if any)
230284
if len(in_flight_apps.apps) > 0:
231285
print(divide)
@@ -253,11 +307,11 @@ def signal_handler(sig, frame):
253307

254308

255309
@task
256-
def planner(ctx, poll_period_sec=2):
310+
def planner(ctx, policy="bin-pack", poll_period_sec=2):
257311
"""
258312
Monitor the in-flight apps and host occupation in the planner
259313
"""
260314
signal(SIGINT, signal_handler)
261315
while True:
262-
print_planner_resources()
316+
print_planner_resources(policy)
263317
sleep(poll_period_sec)

faasmctl/tasks/restart.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,31 @@
44
BACKEND_INI_STRING,
55
get_faasm_ini_file,
66
get_faasm_ini_value,
7-
update_faasm_ini_vaule,
7+
update_faasm_ini_value,
88
)
9+
from faasmctl.util.restart import replica as do_restart_replica
910
from faasmctl.util.time import get_time_rfc3339
1011
from invoke import task
1112

1213

14+
@task
def replica(ctx, name, ini_file=None):
    """
    Restart an individual replica by name

    The meaning of name here will depend on whether we are using a compose
    or a k8s backend.

    Parameters:
    - name (str): name of the replica to restart
    - ini_file (str): path to the cluster's INI file

    NOTE(review): ini_file is accepted but not forwarded to the restart
    helper below - confirm whether the helper resolves the INI file itself
    """
    do_restart_replica(name)
24+
1325
@task(default=True, iterable=["s"])
1426
def restart(ctx, s, ini_file=None):
1527
"""
1628
Restart a running service in the cluster
1729
1830
Parameters:
19-
- s (str, repeateble): service to get the logs from
31+
- s (str, repeatable): service to restart
2032
- ini_file (str): path to the cluster's INI file
2133
"""
2234
if not ini_file:
@@ -30,5 +42,5 @@ def restart(ctx, s, ini_file=None):
3042
)
3143

3244
# Update the last restart value
33-
update_faasm_ini_vaule(ini_file, "Faasm", "last_restart", get_time_rfc3339())
45+
update_faasm_ini_value(ini_file, "Faasm", "last_restart", get_time_rfc3339())
3446
run_compose_cmd(ini_file, "restart {}".format(" ".join(s)))

faasmctl/tasks/scale.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
BACKEND_INI_STRING,
99
get_faasm_ini_file,
1010
get_faasm_ini_value,
11-
update_faasm_ini_vaule,
11+
update_faasm_ini_value,
1212
)
1313
from invoke import task
1414

@@ -43,5 +43,5 @@ def scale(ctx, service, replicas, ini_file=None):
4343
worker_ips = "{}".format(
4444
",".join(get_container_ips_from_compose(faasm_checkout, cluster_name))
4545
)
46-
update_faasm_ini_vaule(ini_file, "Faasm", "worker_names", worker_names)
47-
update_faasm_ini_vaule(ini_file, "Faasm", "worker_ips", worker_ips)
46+
update_faasm_ini_value(ini_file, "Faasm", "worker_names", worker_names)
47+
update_faasm_ini_value(ini_file, "Faasm", "worker_ips", worker_ips)

faasmctl/util/compose.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from faasmctl.util.config import get_faasm_ini_value
1+
from faasmctl.util.config import get_faasm_ini_value, update_faasm_ini_value
22
from faasmctl.util.deploy import generate_ini_file
33
from faasmctl.util.docker import get_docker_tag
44
from faasmctl.util.faasm import FAASM_DOCKER_REGISTRY
@@ -368,3 +368,32 @@ def get_container_ips_from_compose(faasm_checkout, cluster_name):
368368
)
369369
container_ips.append(c_ip)
370370
return container_ips
371+
372+
373+
def restart_ctr_by_name(ini_file, ctr_names):
    """
    Restart a list of worker containers, by name, using `docker restart`

    Once the containers have been restarted, the `worker_names` and
    `worker_ips` entries of the INI file are overwritten with the values
    reported by the compose helpers.

    Parameters:
    - ini_file (str): path to the cluster's INI file
    - ctr_names (list): names of the containers to restart, each of them
      must be listed in the INI file's `worker_names` entry

    Raises:
    - RuntimeError: if any requested name is not a known worker, or if the
      `docker restart` command exits with a non-zero return code
    """
    all_ctr_names = get_faasm_ini_value(ini_file, "Faasm", "worker_names").split(",")

    if not all(ctr_name in all_ctr_names for ctr_name in ctr_names):
        print(
            "Requested to restart a ctr list "
            "({}) not a subset of the worker list: {}".format(ctr_names, all_ctr_names)
        )
        raise RuntimeError("Unrecognised container name!")

    # Pass the arguments as a list (no shell) so that container names are
    # never interpreted by a shell
    out = run(["docker", "restart", *ctr_names], capture_output=True)
    if out.returncode != 0:
        # Raise explicitly instead of asserting: assert statements are
        # removed when python runs with -O, which would hide the failure
        raise RuntimeError(
            "Error restarting docker container: {}".format(
                out.stderr.decode("utf-8", errors="replace").strip()
            )
        )

    # Update the container names and ips
    faasm_checkout = get_faasm_ini_value(ini_file, "Faasm", "working_dir")
    cluster_name = get_faasm_ini_value(ini_file, "Faasm", "cluster_name")

    # Get the names and the ips directly from docker as the ones in the INI
    # file are now stale after the restart
    new_ctr_names = get_container_names_from_compose(faasm_checkout, cluster_name)
    new_ctr_ips = get_container_ips_from_compose(faasm_checkout, cluster_name)

    update_faasm_ini_value(ini_file, "Faasm", "worker_names", ",".join(new_ctr_names))
    update_faasm_ini_value(ini_file, "Faasm", "worker_ips", ",".join(new_ctr_ips))

faasmctl/util/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def get_faasm_ini_value(ini_file, section, key):
2424
return config[section].get(key, "")
2525

2626

27-
def update_faasm_ini_vaule(ini_file, section, key, new_value):
27+
def update_faasm_ini_value(ini_file, section, key, new_value):
2828
if not exists(ini_file):
2929
raise RuntimeError("Did not find faasm config at: {}".format(ini_file))
3030

faasmctl/util/faasm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from os import environ
22

3-
FAASM_VERSION = "0.17.0"
3+
FAASM_VERSION = "0.26.0"
44

55

66
def get_version():

faasmctl/util/invoke.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,22 @@ def invoke_and_await(url, json_msg, expected_num_messages):
101101
"""
102102
poll_period = 2
103103

104-
# The first invocation returns an appid to poll for the message
105-
response = post(url, data=json_msg, timeout=None)
104+
# The first invocation returns an appid to poll for the message. If there
105+
# are not enough slots, this POST will fail. In general, we want to
106+
# tolerate this a number of times (for example, to accommodate for dynamic
107+
# cluster sizes)
108+
109+
num_retries = 10
110+
sleep_period_secs = 0.5
111+
112+
for i in range(num_retries):
113+
response = post(url, data=json_msg, timeout=None)
114+
if response.status_code == 500 and response.text == "No available hosts":
115+
print("No available hosts, retrying... {}/{}".format(i + 1, num_retries))
116+
sleep(sleep_period_secs)
117+
continue
118+
break
119+
106120
if response.status_code != 200:
107121
print(
108122
"POST request failed (code: {}): {}".format(

0 commit comments

Comments
 (0)