Skip to content

Commit 1fb5623

Browse files
authored
Fixes registry step in e2e-CI (#1870)
* Fixes on waiting services * extra script to install nodejs on ubuntu * Keeps .python-version (from pyenv) when make clean * added node version in make info * Downgrades flower due to issues in latest release
1 parent 981bc8d commit 1fb5623

File tree

4 files changed

+133
-89
lines changed

4 files changed

+133
-89
lines changed

Makefile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -452,11 +452,12 @@ info: ## displays setup information
452452
@echo ' DIRECTOR_API_VERSION : ${DIRECTOR_API_VERSION}'
453453
@echo ' STORAGE_API_VERSION : ${STORAGE_API_VERSION}'
454454
@echo ' WEBSERVER_API_VERSION : ${WEBSERVER_API_VERSION}'
455-
# tools version
455+
# dev tools version
456456
@echo ' make : $(shell make --version 2>&1 | head -n 1)'
457457
@echo ' jq : $(shell jq --version)'
458458
@echo ' awk : $(shell awk -W version 2>&1 | head -n 1)'
459459
@echo ' python : $(shell python3 --version)'
460+
@echo ' node : $(shell node --version 2> /dev/null || echo ERROR nodejs missing)'
460461

461462

462463
define show-meta
@@ -498,7 +499,7 @@ endif
498499

499500
.PHONY: clean clean-images clean-venv clean-all clean-more
500501

501-
_git_clean_args := -dxf -e .vscode -e TODO.md -e .venv
502+
_git_clean_args := -dxf -e .vscode -e TODO.md -e .venv -e .python-version
502503
_running_containers = $(shell docker ps -aq)
503504

504505
.check-clean:

scripts/install_nodejs_14.bash

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash
#
# Install Node.js 14 on Ubuntu
#
# Requirements for development machines
#
# Usage: bash scripts/install_nodejs_14.bash
# Every privileged step is run through sudo, so the script can be started
# by a regular user.
#
set -o errexit
set -o nounset
set -o pipefail
IFS=$'\n\t'

# apt-get (rather than apt) has a stable command-line interface for scripts
sudo apt-get update

# Script to install the NodeSource Node.js 14.x repo onto a Debian or Ubuntu system
# -f makes curl fail on HTTP errors, so an error page is never piped into a root shell
curl -fsSL https://deb.nodesource.com/setup_14.x | sudo bash -

# Verify new source
cat /etc/apt/sources.list.d/nodesource.list

# Installs nodejs (needs root, like the steps above)
sudo apt-get install -y nodejs

services/docker-compose-ops.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,9 @@ services:
7171
- simcore_default
7272

7373
flower:
74-
image: mher/flower:latest
74+
# NOTE: latest with DIGEST 9bcc31818a1c7 is broken!
75+
# SEE https://github.com/mher/flower/issues/1029
76+
image: mher/flower:0.9.5
7577
init: true
7678
restart: always
7779
environment:
Lines changed: 106 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -1,68 +1,66 @@
1+
import json
12
import logging
2-
from pdb import Pdb
3+
import os
34
import sys
45
import time
6+
from datetime import datetime
57
from pathlib import Path
6-
from typing import List
8+
from pdb import Pdb
79
from pprint import pformat
10+
from typing import Dict, List
811

912
import docker
1013
import yaml
14+
from tenacity import (
15+
RetryError,
16+
Retrying,
17+
before_sleep_log,
18+
stop_after_attempt,
19+
wait_fixed,
20+
)
1121

1222
logger = logging.getLogger(__name__)
1323

1424
current_dir = Path(sys.argv[0] if __name__ == "__main__" else __file__).resolve().parent
1525

16-
WAIT_TIME_SECS = 20
17-
RETRY_COUNT = 7
26+
WAIT_BEFORE_RETRY = 10
27+
MAX_RETRY_COUNT = 10
1828
MAX_WAIT_TIME = 240
1929

20-
# https://docs.docker.com/engine/swarm/how-swarm-mode-works/swarm-task-states/
21-
pre_states = ["NEW", "PENDING", "ASSIGNED", "PREPARING", "STARTING"]
22-
23-
failed_states = [
24-
"COMPLETE",
25-
"FAILED",
26-
"SHUTDOWN",
27-
"REJECTED",
28-
"ORPHANED",
29-
"REMOVE",
30-
"CREATED",
31-
]
32-
# UTILS --------------------------------
30+
# SEE https://docs.docker.com/engine/swarm/how-swarm-mode-works/swarm-task-states/
3331

32+
PRE_STATES = [
33+
"new", # The task was initialized.
34+
"pending", # Resources for the task were allocated.
35+
"assigned", # Docker assigned the task to nodes.
36+
"accepted", # The task was accepted by a worker node. If a worker node rejects the task, the state changes to REJECTED.
37+
"preparing", # Docker is preparing the task.
38+
"starting", # Docker is starting the task.
39+
]
3440

35-
def get_tasks_summary(tasks):
36-
msg = ""
37-
for t in tasks:
38-
t["Status"].setdefault("Err", "")
39-
msg += "- task ID:{ID}, STATE: {Status[State]}, ERROR: '{Status[Err]}' \n".format(
40-
**t
41-
)
42-
return msg
43-
41+
RUNNING_STATE = "running" # The task is executing.
4442

45-
def get_failed_tasks_logs(service, docker_client):
46-
failed_logs = ""
47-
for t in service.tasks():
48-
if t["Status"]["State"].upper() in failed_states:
49-
cid = t["Status"]["ContainerStatus"]["ContainerID"]
50-
failed_logs += "{2} {0} - {1} BEGIN {2}\n".format(
51-
service.name, t["ID"], "=" * 10
52-
)
53-
if cid:
54-
container = docker_client.containers.get(cid)
55-
failed_logs += container.logs().decode("utf-8")
56-
else:
57-
failed_logs += " log unavailable. container does not exists\n"
58-
failed_logs += "{2} {0} - {1} END {2}\n".format(
59-
service.name, t["ID"], "=" * 10
60-
)
43+
FAILED_STATES = [
44+
"complete", # The task exited without an error code.
45+
"failed", # The task exited with an error code.
46+
"shutdown", # Docker requested the task to shut down.
47+
"rejected", # The worker node rejected the task.
48+
"orphaned", # The node was down for too long.
49+
"remove", # The task is not terminal but the associated service was removed or scaled down.
50+
]
6151

62-
return failed_logs
6352

53+
def get_tasks_summary(service_tasks: List[Dict]) -> str:
    """Return a printable, multi-line summary of the given service tasks.

    Each task contributes one newline-terminated line with its ID, creation
    and update timestamps, desired state and current state. The error message
    is appended only when the task status carries a non-empty ``Err`` entry.

    Args:
        service_tasks: task dictionaries; each must provide the keys ``ID``,
            ``CreatedAt``, ``UpdatedAt``, ``DesiredState`` and ``Status``
            (with at least ``State``).

    Returns:
        The concatenated summary lines, or an empty string for no tasks.
    """
    lines = []
    for task in service_tasks:
        status: Dict = task["Status"]
        line = (
            f"- task ID:{task['ID']}, CREATED: {task['CreatedAt']}, "
            f"UPDATED: {task['UpdatedAt']}, DESIRED_STATE: {task['DesiredState']}, "
            f"STATE: {status['State']}"
        )
        # 'Err' is optional in the status and only reported when non-empty
        error = status.get("Err")
        if error:
            line += f", ERROR: {error}"
        lines.append(line + "\n")

    return "".join(lines)
6664

6765

6866
def osparc_simcore_root_dir() -> Path:
@@ -100,52 +98,74 @@ def ops_services() -> List[str]:
10098
return [x for x in dc_specs["services"].keys()]
10199

102100

103-
def wait_for_services() -> None:
104-
# get all services
105-
services = core_services() + ops_services()
101+
def to_datetime(datetime_str: str) -> datetime:
    """Parse a UTC timestamp such as '2020-10-09T12:28:14.771034099Z'.

    - The T separates the date portion from the time-of-day portion.
    - The Z on the end means UTC.
    - The fractional part may carry up to nanosecond precision; ``datetime``
      only stores microseconds, so digits beyond the sixth are dropped.

    The fractional part may also be shorter than six digits or missing
    altogether (e.g. '2020-10-09T12:28:14Z'); both forms are accepted.

    Args:
        datetime_str: timestamp in the format described above.

    Returns:
        A naive ``datetime`` (no tzinfo attached) holding the parsed value.

    Raises:
        ValueError: if the string does not match the expected format.
    """
    timestamp = datetime_str.strip()
    # Drop the UTC designator explicitly instead of relying on truncation,
    # which left it in place for short fractional parts
    if timestamp.endswith(("Z", "z")):
        timestamp = timestamp[:-1]

    seconds_part, _, fraction = timestamp.partition(".")
    parsed = datetime.strptime(seconds_part, "%Y-%m-%dT%H:%M:%S")

    if fraction:
        # keep microsecond precision: truncate extra digits, right-pad short ones
        microseconds = int(fraction[:6].ljust(6, "0"))
        parsed = parsed.replace(microsecond=microseconds)

    return parsed
110+
111+
112+
def by_service_creation(service):
    """Sort key: the creation time of a service, parsed from its 'CreatedAt' attribute."""
    return to_datetime(service.attrs["CreatedAt"])
115+
116+
117+
def wait_for_services() -> int:
    """Wait until every expected service runs all of its replicas.

    The expected services are those returned by ``core_services()`` and
    ``ops_services()``. Deployed services are matched on the last
    ``_``-separated token of their name and are checked in creation order.
    For each service the task list is polled up to ``MAX_RETRY_COUNT`` times,
    ``WAIT_BEFORE_RETRY`` seconds apart, until the number of tasks in the
    running state equals the replica count in the service spec.

    Returns:
        ``os.EX_OK`` when all services are up, or ``os.EX_SOFTWARE`` as soon
        as one service does not reach its replica count within the retries.

    Raises:
        AssertionError: if no expected service is deployed, or if the number
            of deployed services differs from the number of expected ones.
    """
    expected_services = core_services() + ops_services()

    client = docker.from_env()
    started_services = sorted(
        [
            s
            for s in client.services.list()
            if s.name.split("_")[-1] in expected_services
        ],
        key=by_service_creation,
    )

    assert len(started_services), "no services started!"
    # every fragment with placeholders must be an f-string, otherwise the
    # braces are printed literally
    assert len(expected_services) == len(started_services), (
        "Some services are missing or unexpected:\n"
        f"expected: {len(expected_services)} {expected_services}\n"
        f"got: {len(started_services)} {[s.name for s in started_services]}"
    )

    for service in started_services:

        # NOTE(review): assumes every service is deployed in replicated mode;
        # a spec without 'Replicated' would raise KeyError here -- confirm.
        expected_replicas = service.attrs["Spec"]["Mode"]["Replicated"]["Replicas"]
        print(f"Service: {service.name} expects {expected_replicas} replicas", "-" * 10)

        try:
            for attempt in Retrying(
                stop=stop_after_attempt(MAX_RETRY_COUNT),
                wait=wait_fixed(WAIT_BEFORE_RETRY),
            ):
                with attempt:
                    service_tasks: List[Dict] = service.tasks()  # freeze
                    print(get_tasks_summary(service_tasks))

                    #
                    # NOTE: a service could set 'ready' as desired-state instead of 'running' if
                    # it constantly breaks and the swarm decides to "stop trying".
                    #
                    valid_replicas = sum(
                        task["Status"]["State"] == RUNNING_STATE
                        for task in service_tasks
                    )
                    # a failing assertion marks this attempt as failed and triggers a retry
                    assert valid_replicas == expected_replicas
        except RetryError:
            print(
                f"ERROR: Service {service.name} failed to start {expected_replicas} replica/s"
            )
            print(json.dumps(service.attrs, indent=1))
            return os.EX_SOFTWARE

    return os.EX_OK
146168

147169

148170
if __name__ == "__main__":
149-
# get retry parameters
150-
# wait for the services
151171
sys.exit(wait_for_services())

0 commit comments

Comments
 (0)