Commit 3e80ce4
Is784/instrumentation metrics (#912)
Implements instrumentation needed for #784:

- adds Prometheus and Grafana to the tool stack
- monitors traffic in the core microservices: webserver and storage
- monitors database usage
- makes the log level configurable via the environment variable ``*_LOG_LEVEL`` (for storage and webserver)
- redesigns the docker-compose fixtures to resolve more reliable paths and environment variables
1 parent 31ff224 commit 3e80ce4

49 files changed, +530 -175 lines changed


.env-devel

Lines changed: 7 additions & 2 deletions
@@ -1,5 +1,5 @@
 #
-# - Keep it alfphabetical order
+# - Keep it alfphabetical order and grouped by prefix
 # - To expose: export $(grep -v '^#' .env | xargs -0)
 #
 
@@ -8,6 +8,9 @@ BF_API_SECRET=none
 
 DOCKER_IMAGE_TAG=latest
 
+GF_SECURITY_ADMIN_PASSWORD=z43
+GF_SMTP_HOST=mail.speag.com:25
+
 MAINTENANCE_PASSWORD=z43
 
 PUBLISHED_HOST_NAME=localhost
@@ -19,6 +22,9 @@ POSTGRES_DB=simcoredb
 POSTGRES_HOST=postgres
 POSTGRES_PORT=5432
 
+# ensure consistency with POSTGRES_ variables: postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}
+POSTGRES_EXPORTER_DATA_SOURCE_NAME=postgresql://simcore:simcore@postgres:5432/simcoredb?sslmode=disable
+
 RABBIT_HOST=rabbit
 RABBIT_PORT=5672
 RABBITMQ_USER=simcore
@@ -44,6 +50,5 @@ SMTP_HOST=mail.speag.com
 SMTP_PORT=25
 
 WEBSERVER_LOGIN_REGISTRATION_INVITATION_REQUIRED=1
-
 # python3 -c "from cryptography.fernet import Fernet; print(Fernet.generate_key())"
 WEBSERVER_SESSION_SECRET_KEY=REPLACE ME with a key of at least length 32.
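The comment above asks that POSTGRES_EXPORTER_DATA_SOURCE_NAME stays consistent with the individual POSTGRES_* variables. A minimal sketch of how the DSN could be derived from those variables instead of being duplicated by hand (the helper below is illustrative only and is not part of this commit; defaults mirror the values in this .env file):

import os

def postgres_exporter_dsn() -> str:
    """Builds the exporter DSN from the POSTGRES_* variables (illustrative helper, not in the repo)."""
    user = os.environ.get("POSTGRES_USER", "simcore")
    password = os.environ.get("POSTGRES_PASSWORD", "simcore")
    host = os.environ.get("POSTGRES_HOST", "postgres")
    port = os.environ.get("POSTGRES_PORT", "5432")
    db = os.environ.get("POSTGRES_DB", "simcoredb")
    return f"postgresql://{user}:{password}@{host}:{port}/{db}?sslmode=disable"

print(postgres_exporter_dsn())
# postgresql://simcore:simcore@postgres:5432/simcoredb?sslmode=disable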

Makefile

Lines changed: 6 additions & 8 deletions
@@ -135,17 +135,15 @@ up-devel: up-swarm-devel
 
 up-swarm: .env docker-swarm-check
 	${DOCKER} swarm init
-	${DOCKER_COMPOSE} -f services/docker-compose.yml \
-		-f services/docker-compose-tools.yml \
-		config > $(TEMPCOMPOSE).tmp-compose.yml ;
-	${DOCKER} stack deploy -c $(TEMPCOMPOSE).tmp-compose.yml ${SWARM_STACK_NAME}
+	${DOCKER_COMPOSE} -f services/docker-compose.yml -f services/docker-compose-tools.yml config > $(TEMPCOMPOSE).tmp-compose.yml ;
+	@cat $(TEMPCOMPOSE).tmp-compose.yml
+	@${DOCKER} stack deploy -c $(TEMPCOMPOSE).tmp-compose.yml ${SWARM_STACK_NAME}
 
 up-swarm-devel: .env docker-swarm-check $(CLIENT_WEB_OUTPUT)
 	${DOCKER} swarm init
-	${DOCKER_COMPOSE} -f services/docker-compose.yml -f services/docker-compose.devel.yml \
-		-f services/docker-compose-tools.yml \
-		config > $(TEMPCOMPOSE).tmp-compose.yml
-	${DOCKER} stack deploy -c $(TEMPCOMPOSE).tmp-compose.yml ${SWARM_STACK_NAME}
+	${DOCKER_COMPOSE} -f services/docker-compose.yml -f services/docker-compose.devel.yml -f services/docker-compose-tools.yml config > $(TEMPCOMPOSE).tmp-compose.yml
+	@cat $(TEMPCOMPOSE).tmp-compose.yml
+	@${DOCKER} stack deploy -c $(TEMPCOMPOSE).tmp-compose.yml ${SWARM_STACK_NAME}
 
 .PHONY: up-webclient-devel
 # target: up-webclient-devel: – init swarm and deploys all core and tool services up in development mode. Then it stops the webclient service and starts it again with the watcher attached.

ops/travis/system-testing/build_and_run

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ before_script() {
 
 script() {
    # wait for a minute to let the swarm warm up...
+   docker service ls --format "table"
    pytest -v ops/travis/system-testing/tests
 }

ops/travis/system-testing/tests/test_swarm_runs.py

Lines changed: 22 additions & 7 deletions
@@ -7,6 +7,7 @@
 import asyncio
 import datetime
 import logging
+import re
 import sys
 import urllib
 from pathlib import Path
@@ -140,19 +141,33 @@ def test_all_services_up(docker_client, services_docker_compose, tools_docker_co
     for name in service_names:
         assert any( name in s.name for s in running_services ), f"{name} not in {running_services}"
 
-
 async def test_core_service_running(core_service_name, docker_client, loop):
     """
     NOTE: Assumes `make up-swarm` executed
     NOTE: loop fixture makes this test async
     """
-    running_services = docker_client.services.list()
+    SERVICE_NAMES_PATTERN = re.compile(r'([\w^_]+)_([-\w]+)')
+    # Matches strings as
+    #  services_director
+    #  services_postgres-exporter
+    #  services_postgres_exporter
+
+    # maps service names in docker-compose with actual services
+    running_services = {}
+    expected_prefix = None
+    for service in docker_client.services.list():
+        match = SERVICE_NAMES_PATTERN.match(service.name)
+        assert match, f"Could not match service name {service.name}"
+        prefix, service_name = match.groups()
+        running_services[service_name] = service
+        if expected_prefix:
+            assert prefix == expected_prefix
+        else:
+            expected_prefix = prefix
 
     # find the service
-    running_service = [s for s in running_services if core_service_name in s.name]
-    assert len(running_service) == 1
-
-    running_service = running_service[0]
+    assert core_service_name in running_services
+    running_service = running_services[core_service_name]
 
     # Every service in the fixture runs a single task, but they might have failed!
     #
@@ -203,5 +218,5 @@ async def test_check_serve_root(docker_client, services_docker_compose, tools_do
         pytest.fail("{} not found in main index.html".format(search))
     except urllib.error.HTTPError as err:
         pytest.fail("The server could not fulfill the request.\nError code {}".format(err.code))
-    except urllib.error.URLError as e:
+    except urllib.error.URLError as err:
         pytest.fail("Failed reaching the server..\nError reason {}".format(err.reason))

packages/service-library/requirements/_base.in

Lines changed: 1 addition & 0 deletions
@@ -12,3 +12,4 @@ aiopg[sa]
 ujson
 werkzeug
 jsonschema
+prometheus_client # TODO: add as optional service-library[monitoring]
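The TODO above suggests eventually shipping the monitoring dependency as an optional extra. A rough sketch of what that could look like in the package's setup.py (distribution name and dependency list assumed for illustration, not taken from this commit):

from setuptools import setup, find_packages

setup(
    name="simcore-service-library",  # assumed distribution name
    packages=find_packages(),
    install_requires=["aiohttp", "jsonschema"],  # core deps abbreviated for the sketch
    extras_require={
        # would allow: pip install "simcore-service-library[monitoring]"
        "monitoring": ["prometheus_client"],
    },
)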

packages/service-library/requirements/_base.txt

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ lazy-object-proxy==1.4.1 # via openapi-core
 multidict==4.5.2 # via aiohttp, yarl
 openapi-core==0.8.0
 openapi-spec-validator==0.2.7 # via openapi-core
+prometheus-client==0.7.1
 psycopg2-binary==2.8.2
 pyrsistent==0.15.2 # via jsonschema
 pyyaml==5.1

packages/service-library/requirements/_test.txt

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ multidict==4.5.2
 openapi-core==0.8.0
 openapi-spec-validator==0.2.7
 pluggy==0.12.0 # via pytest
+prometheus-client==0.7.1
 psycopg2-binary==2.8.2
 py==1.8.0 # via pytest
 pylint==2.3.1
Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
+"""
+
+UNDER DEVELOPMENT for issue #784
+
+Based on https://github.com/amitsaha/aiohttp-prometheus
+
+Clients:
+ - https://github.com/prometheus/client_python
+ - TODO: see https://github.com/claws/aioprometheus
+"""
+
+import prometheus_client
+from aiohttp import web
+from prometheus_client import Counter, Gauge, Histogram, CONTENT_TYPE_LATEST
+import time
+
+
+# https://prometheus.io/docs/concepts/metric_types/#counter
+
+def middleware_factory(app_name):
+    @web.middleware
+    async def middleware_handler(request, handler):
+        try:
+            request['start_time'] = time.time()
+            request.app['REQUEST_IN_PROGRESS'].labels(
+                app_name, request.path, request.method).inc()
+
+            resp = await handler(request)
+
+        except web.HTTPException as ee:
+            # Captures raised reponses (success/failures accounted with resp.status)
+            resp = ee
+            raise
+        finally:
+            # metrics on the same request
+            resp_time = time.time() - request['start_time']
+            request.app['REQUEST_LATENCY'].labels(
+                app_name, request.path).observe(resp_time)
+
+            request.app['REQUEST_IN_PROGRESS'].labels(
+                app_name, request.path, request.method).dec()
+
+            request.app['REQUEST_COUNT'].labels(
+                app_name, request.method, request.path, resp.status).inc()
+
+        return resp
+    return middleware_handler
+
+async def metrics(_request):
+    # TODO: NOT async!
+    # prometheus_client access to a singleton registry!
+    resp = web.Response(body=prometheus_client.generate_latest())
+    resp.content_type = CONTENT_TYPE_LATEST
+    return resp
+
+
+def setup_monitoring(app, app_name):
+
+    # NOTE: prometheus_client registers metrics in globals
+    # tests might fail when fixtures get re-created
+
+    # Total number of requests processed
+    app['REQUEST_COUNT'] = Counter(
+        'http_requests_total', 'Total Request Count',
+        ['app_name', 'method', 'endpoint', 'http_status']
+    )
+
+    # Latency of a request in seconds
+    app['REQUEST_LATENCY'] = Histogram(
+        'http_request_latency_seconds', 'Request latency',
+        ['app_name', 'endpoint']
+    )
+
+    # Number of requests in progress
+    app['REQUEST_IN_PROGRESS']=Gauge(
+        'http_requests_in_progress_total', 'Requests in progress',
+        ['app_name', 'endpoint', 'method']
+    )
+
+    app.middlewares.insert(0, middleware_factory(app_name))
+
+    # FIXME: this in the front-end has to be protected!
+    app.router.add_get("/metrics", metrics)
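A minimal usage sketch of the setup_monitoring helper defined above in an aiohttp service (assuming the module is importable as servicelib.monitoring; the app name and port are placeholders):

from aiohttp import web

from servicelib.monitoring import setup_monitoring  # assumed import path

app = web.Application()
setup_monitoring(app, "webserver")  # installs the metrics middleware and exposes GET /metrics

web.run_app(app, port=8080)

Prometheus can then scrape GET /metrics to collect the http_requests_total, http_request_latency_seconds and http_requests_in_progress_total series registered above.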

packages/simcore-sdk/tests/fixtures/storage.py

Lines changed: 0 additions & 1 deletion
@@ -4,7 +4,6 @@
 
 import pytest
 import requests
-from pytest_docker import docker_ip, docker_services # pylint:disable=W0611
 
 log = logging.getLogger(__name__)

packages/simcore-sdk/tests/node_ports/docker-compose.yml

Lines changed: 3 additions & 0 deletions
@@ -1,6 +1,7 @@
 version: '3.7'
 services:
   #TODO: use same integration test framework as in web server instead of fixed docker-compose file
+  #FIXME: use new config filter to generate these docker-compose. Lack of sync produces issues
   apihub:
     image: ${DOCKER_REGISTRY:-itisfoundation}/apihub:${DOCKER_IMAGE_TAG:-latest}
     init: true
@@ -27,6 +28,8 @@ services:
       - S3_SECURE=0
       - BF_API_SECRET="none"
      - BF_API_KEY="none"
+      - STORAGE_LOGLEVEL=INFO
+      - STORAGE_MONITORING_ENABLED=1
     restart: always
     depends_on:
       - postgres
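STORAGE_LOGLEVEL corresponds to the configurable log-level mechanism described in the commit message. A sketch of how a service could translate such a variable into the standard logging setup (illustrative only; the actual wiring inside the storage service is not shown in this diff):

import logging
import os

# Falls back to WARNING when the variable is unset or not a valid level name
level_name = os.environ.get("STORAGE_LOGLEVEL", "WARNING").upper()
logging.basicConfig(level=getattr(logging, level_name, logging.WARNING))

logging.getLogger(__name__).info("storage log level set to %s", level_name)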
