Skip to content

Commit 64e91e8

Browse files
authored
Add local observability stack with OTel, Prometheus, and Grafana (#75)
* Add observability stack configuration * Fix Prometheus remote write and OTLP port conflicts
1 parent e0da8c8 commit 64e91e8

File tree

9 files changed

+375
-0
lines changed

9 files changed

+375
-0
lines changed

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,16 @@ The frontend uses Next.js App Router architecture:
102102
docker compose --env-file .env.development -f docker-compose.dev.yml up
103103
```
104104

105+
To launch the observability stack alongside your application, start a second
106+
Compose project in a separate terminal:
107+
108+
```bash
109+
docker compose -f docker-compose.observability.yml up
110+
```
111+
112+
This brings up Prometheus, Grafana, Loki, Tempo, and the OpenTelemetry
113+
Collector with their configuration mounted from the `ops/` directory.
114+
105115
### Production
106116

107117
```bash

docker-compose.observability.yml

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
# Local observability stack: Prometheus, Loki, Tempo, the OpenTelemetry
# Collector, and Grafana, all attached to one bridge network.
#
# Start with:
#   docker compose -f docker-compose.observability.yml up
#
# Optional environment overrides (defaults keep the previous behaviour):
#   GRAFANA_PORT            host port for the Grafana UI   (default 3000)
#   GRAFANA_ADMIN_USER      Grafana admin login            (default admin)
#   GRAFANA_ADMIN_PASSWORD  Grafana admin password         (default admin)
version: "3.9"

services:
  # Metrics store. The remote-write receiver flag is what allows the
  # collector to push samples to /api/v1/write.
  prometheus:
    image: prom/prometheus:v2.49.1
    restart: unless-stopped
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--web.enable-lifecycle"
      - "--web.enable-remote-write-receiver"
    ports:
      - "9090:9090"
    volumes:
      - ./ops/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./ops/prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
      - prometheus_data:/prometheus
    networks:
      - observability

  # Log store.
  loki:
    image: grafana/loki:2.9.3
    restart: unless-stopped
    command: ["-config.file=/etc/loki/local-config.yaml"]
    ports:
      - "3100:3100"
    volumes:
      - ./ops/loki/local-config.yaml:/etc/loki/local-config.yaml:ro
      - loki_data:/loki
    networks:
      - observability

  # Trace store. Only the HTTP query port is published; the OTLP receiver
  # ports stay internal to the network.
  tempo:
    image: grafana/tempo:2.4.1
    restart: unless-stopped
    command: ["-config.file=/etc/tempo/tempo.yaml"]
    ports:
      - "3200:3200"
    volumes:
      - ./ops/tempo/tempo.yaml:/etc/tempo/tempo.yaml:ro
      - tempo_data:/tmp/tempo
    networks:
      - observability

  # Single OTLP entry point for applications: 4317 (gRPC), 4318 (HTTP),
  # and 8888 for the collector's own metrics.
  otel-collector:
    image: otel/opentelemetry-collector-contrib:0.87.0
    restart: unless-stopped
    command: ["--config=/etc/otelcol/config.yaml"]
    depends_on:
      - prometheus
      - loki
      - tempo
    ports:
      - "4317:4317"
      - "4318:4318"
      - "8888:8888"
    volumes:
      - ./ops/otel/otelcol-config.yaml:/etc/otelcol/config.yaml:ro
    networks:
      - observability

  # Dashboards UI, provisioned from ops/grafana.
  grafana:
    image: grafana/grafana:10.4.2
    restart: unless-stopped
    depends_on:
      - prometheus
      - loki
      - tempo
    ports:
      # NOTE(review): a Next.js dev server also commonly listens on 3000;
      # set GRAFANA_PORT if the application already publishes that port.
      - "${GRAFANA_PORT:-3000}:3000"
    environment:
      # Defaults are for local use only; override via the environment
      # rather than editing this file.
      GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}"
      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-admin}"
      GF_PATHS_PROVISIONING: /etc/grafana/provisioning
    volumes:
      - grafana_data:/var/lib/grafana
      - ./ops/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./ops/grafana/dashboards:/var/lib/grafana/dashboards:ro
    networks:
      - observability

volumes:
  prometheus_data:
  loki_data:
  tempo_data:
  grafana_data:

networks:
  observability:
    driver: bridge
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
# Grafana dashboard provisioning: a single file-based provider that loads
# dashboards from /var/lib/grafana/dashboards into the root folder of org 1.
apiVersion: 1

providers:
  - name: default
    type: file
    orgId: 1
    # Empty folder name places dashboards at the top level.
    folder: ""
    # Provisioned dashboards may be edited in the UI and are removed from
    # Grafana when their file disappears.
    editable: true
    disableDeletion: false
    # Seconds between scans of the dashboards directory.
    updateIntervalSeconds: 30
    options:
      path: /var/lib/grafana/dashboards
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
# Grafana data source provisioning: Prometheus (default), Loki, and Tempo,
# each reached through the Grafana backend (access: proxy) by service name.
apiVersion: 1

datasources:
  - name: Prometheus
    uid: prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    jsonData:
      httpMethod: POST

  - name: Loki
    uid: loki
    type: loki
    access: proxy
    url: http://loki:3100

  - name: Tempo
    uid: tempo
    type: tempo
    access: proxy
    url: http://tempo:3200
    jsonData:
      httpMethod: POST
      # Service graph metrics are read from the Prometheus data source.
      serviceMap:
        datasourceUid: prometheus
      # Trace-to-logs link. The `{key: ...}` tag objects below are the V2
      # schema, so the block must be named tracesToLogsV2; the legacy
      # `tracesToLogs` key expects a list of plain strings.
      tracesToLogsV2:
        datasourceUid: loki
        # Widen the log query window by one hour on each side of the span.
        spanStartTimeShift: "-1h"
        spanEndTimeShift: "1h"
        filterByTraceID: true
        filterBySpanID: false
        tags:
          - key: service.name
          - key: service.namespace
      # Trace-to-metrics link against Prometheus.
      tracesToMetrics:
        datasourceUid: prometheus
        tags:
          - key: service.name
          - key: service.namespace
    secureJsonData: {}

ops/loki/local-config.yaml

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
# Single-process Loki for local use: no auth, in-memory ring, and all data
# (index, chunks, compactor state) on the local filesystem under /loki.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  ring:
    instance_addr: 127.0.0.1
    kvstore:
      store: inmemory
  replication_factor: 1
  path_prefix: /loki

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    active_index_directory: /loki/index
    cache_location: /loki/cache
    shared_store: filesystem
  filesystem:
    directory: /loki/chunks

limits_config:
  # NOTE(review): structured metadata is normally paired with the tsdb
  # index and schema v13 — confirm this flag is wanted with boltdb-shipper
  # and v11.
  allow_structured_metadata: true
  # Seven days; only enforced because the compactor below has
  # retention_enabled set.
  retention_period: 168h

chunk_store_config:
  max_look_back_period: 0s

compactor:
  working_directory: /loki/compactor
  shared_store: filesystem
  # Without this flag the compactor only compacts the index and
  # limits_config.retention_period is never applied.
  retention_enabled: true
  compactor_ring:
    kvstore:
      store: inmemory

ops/otel/otelcol-config.yaml

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
# OpenTelemetry Collector: receives OTLP over gRPC (4317) and HTTP (4318),
# stamps every resource with environment/namespace attributes, then fans
# out traces to Tempo, metrics to Prometheus (remote write), and logs to
# Loki.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

processors:
  batch:
    send_batch_size: 8192
    timeout: 5s

  # Applied to all three pipelines; upsert overwrites any value the
  # application set for these keys.
  resource:
    attributes:
      - key: environment
        value: local
        action: upsert
      - key: service.namespace
        value: paform
        action: upsert

  # Logs pipeline only: hint telling the Loki exporter which resource
  # attributes to promote to Loki labels. Kept separate from `resource`
  # so the hint attribute is not attached to traces or metrics.
  resource/loki:
    attributes:
      - key: loki.resource.labels
        value: "service.name, service.namespace, environment"
        action: insert

exporters:
  # Defined for ad-hoc debugging; not referenced by any pipeline below.
  # `verbosity` replaces the deprecated `loglevel` option.
  logging:
    verbosity: normal

  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true

  prometheusremotewrite:
    endpoint: http://prometheus:9090/api/v1/write

  loki:
    endpoint: http://loki:3100/loki/api/v1/push
    # Only the exporter's built-in default labels can be toggled here;
    # attribute-derived labels come from the resource/loki hint above.
    default_labels_enabled:
      job: true
      instance: true

service:
  telemetry:
    logs:
      level: info
    metrics:
      level: normal
      address: 0.0.0.0:8888

  pipelines:
    traces:
      receivers: [otlp]
      processors: [resource, batch]
      exporters: [otlp/tempo]
    metrics:
      receivers: [otlp]
      processors: [resource, batch]
      exporters: [prometheusremotewrite]
    logs:
      receivers: [otlp]
      processors: [resource, resource/loki, batch]
      exporters: [loki]

ops/prometheus/alerts.yml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
# Prometheus alerting rules, grouped by the service they watch.
groups:
  - name: backend-alerts
    rules:
      # Share of 5xx responses among all backend requests, 5-minute rate,
      # must stay above 5% for 5 minutes before firing.
      - alert: BackendHighErrorRate
        expr: |
          sum(rate(http_requests_total{job="backend",status_code=~"5.."}[5m]))
          /
          sum(rate(http_requests_total{job="backend"}[5m]))
          > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High 5xx error rate on backend'
          description: |
            The backend service has more than 5% 5xx responses over the last 5 minutes.

      # p95 of the request-duration histogram above 0.75 s for 10 minutes.
      - alert: ApiP95LatencyHigh
        expr: |
          histogram_quantile(
            0.95,
            sum(rate(http_request_duration_seconds_bucket{job="backend"}[5m])) by (le)
          ) > 0.75
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: 'Backend p95 latency is elevated'
          description: |
            The 95th percentile API latency has been above 750ms for more than 10 minutes.

  - name: frontend-alerts
    rules:
      # p95 of the LCP histogram (10-minute rate) above 2.5 s for 15 minutes.
      # NOTE(review): this selects job="frontend" — confirm a series with
      # that job label is actually produced (scraped or pushed).
      - alert: FrontendLCPDegraded
        expr: |
          histogram_quantile(
            0.95,
            sum(rate(web_vitals_lcp_seconds_bucket{job="frontend"}[10m])) by (le)
          ) > 2.5
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Frontend LCP is above target'
          description: |
            The frontend Largest Contentful Paint 95th percentile has exceeded 2.5 seconds for 15 minutes.

ops/prometheus/prometheus.yml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
# Prometheus server configuration: 15 s scrape/evaluation cadence, one
# rules file, and three static scrape jobs.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# No Alertmanager is wired up; the target list is intentionally empty.
alerting:
  alertmanagers:
    - static_configs:
        - targets: []

rule_files: [/etc/prometheus/alerts.yml]

scrape_configs:
  # The collector's own telemetry endpoint.
  - job_name: otel-collector
    scrape_interval: 15s
    metrics_path: /metrics
    static_configs:
      - targets: ["otel-collector:8888"]

  # NOTE(review): `backend` is not a service in
  # docker-compose.observability.yml — confirm the name resolves from the
  # observability network.
  - job_name: backend
    scrape_interval: 15s
    metrics_path: /metrics
    static_configs:
      - targets: ["backend:8000"]

  # NOTE(review): no `node-exporter` service is defined in
  # docker-compose.observability.yml — confirm where it runs.
  - job_name: node-exporter
    scrape_interval: 30s
    static_configs:
      - targets: ["node-exporter:9100"]

ops/tempo/tempo.yaml

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
# Single-binary Tempo for local use: OTLP in on 4317/4318, HTTP API on
# 3200, blocks and WAL on the local filesystem under /tmp/tempo, and the
# metrics generator pushing span metrics / service graphs to Prometheus.
server:
  http_listen_port: 3200
  log_level: info

distributor:
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: 0.0.0.0:4317
        http:
          endpoint: 0.0.0.0:4318

ingester:
  # Cut blocks early (5 minutes or ~1 MB) so traces become queryable from
  # the backend quickly.
  max_block_duration: 5m
  max_block_bytes: 1000000

compactor:
  compaction:
    # Seven days.
    block_retention: 168h

storage:
  trace:
    backend: local
    block:
      bloom_filter_false_positive: 0.05
      # Tempo 2.x names for the v2-block options formerly spelled
      # `index_downsample_bytes` and `encoding`.
      v2_index_downsample_bytes: 1000
      v2_encoding: zstd
    wal:
      path: /tmp/tempo/wal
    local:
      path: /tmp/tempo/blocks

# New-style overrides: per-tenant defaults live under `defaults`, and the
# generator processors are enabled at metrics_generator.processors.
overrides:
  defaults:
    metrics_generator:
      processors:
        - service-graphs
        - span-metrics

metrics_generator:
  # The generator needs a WAL directory and a remote-write target; the
  # Prometheus endpoint matches the one the collector pushes to.
  storage:
    path: /tmp/tempo/generator/wal
    remote_write:
      - url: http://prometheus:9090/api/v1/write
  # Per-processor settings use the singular `processor` key with
  # underscore names.
  processor:
    span_metrics:
      dimensions: [service.name, service.namespace]
    service_graphs:
      dimensions: [service.name, service.namespace]

0 commit comments

Comments
 (0)