@@ -0,0 +1,79 @@
{
"uid": "monitor-dashboard",
"title": "Monitor",
"timezone": "browser",
"schemaVersion": 39,
"version": 1,
"refresh": "10s",
"panels": [
{
"id": 1,
"type": "timeseries",
"title": "HIBP notify total requests (rate)",
"gridPos": { "x": 0, "y": 0, "w": 24, "h": 8 },
"targets": [
{
"refId": "A",
"expr": "sum(rate(hibp_notify_requests_total[1m]))",
"legendFormat": "requests/s"
}
]
},
{
"id": 2,
"type": "timeseries",
"title": "HIBP notify failures (rate)",
"gridPos": { "x": 0, "y": 8, "w": 24, "h": 8 },
"targets": [
{
"refId": "B",
"expr": "sum(rate(hibp_notify_request_failures_total[1m]))",
"legendFormat": "failures/s"
}
]
},
{
"id": 3,
"type": "timeseries",
"title": "HIBP notify failures by error (rate)",
"gridPos": { "x": 0, "y": 16, "w": 24, "h": 10 },
"targets": [
{
"refId": "C",
"expr": "sum by (error) (rate(hibp_notify_request_failures_total[1m]))",
"legendFormat": "{{error}}"
}
]
},
{
"id": 4,
"type": "barchart",
"title": "Failures by error (increase, last 15m)",
"gridPos": { "x": 0, "y": 26, "w": 12, "h": 8 },
"targets": [
{
"refId": "D",
"expr": "sum by (error) (increase(hibp_notify_request_failures_total[15m]))",
"legendFormat": "{{error}}"
}
]
},
{
"id": 5,
"type": "stat",
"title": "Failure rate %",
"gridPos": { "x": 12, "y": 26, "w": 12, "h": 8 },
"targets": [
{
"refId": "E",
"expr": "100 * (sum(rate(hibp_notify_request_failures_total[5m])) / clamp_min(sum(rate(hibp_notify_requests_total[5m])), 1e-9))"
}
],
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"orientation": "horizontal",
"textMode": "auto"
}
}
]
}
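The `hibp_notify_*` series these panels query are app-emitted OpenTelemetry counters, exported through the collector's Prometheus endpoint. As a rough sketch of the producing side (the call sites, helper names, and `error` attribute values below are assumptions; only the metric names come from the PromQL above):

```ts
import { metrics } from "@opentelemetry/api";

// The collector's Prometheus exporter appends the `_total` suffix to
// monotonic counters, so these surface as hibp_notify_requests_total etc.
const meter = metrics.getMeter("monitor");
const notifyRequests = meter.createCounter("hibp_notify_requests");
const notifyFailures = meter.createCounter("hibp_notify_request_failures");

// Hypothetical wrapper around the actual notify call.
export async function notifySubscribers(sendNotification: () => Promise<void>) {
  notifyRequests.add(1);
  try {
    await sendNotification();
  } catch (err) {
    // The `error` attribute is what the `sum by (error)` panels group on.
    notifyFailures.add(1, { error: err instanceof Error ? err.name : "unknown" });
    throw err;
  }
}
```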
9 changes: 9 additions & 0 deletions .docker/otel/grafana/provisioning/dashboards/dashboards.yaml
@@ -0,0 +1,9 @@
apiVersion: 1

providers:
  - name: Local Dashboards
    type: file
    disableDeletion: false
    editable: true
    options:
      path: /etc/grafana/provisioning/dashboards-json
22 changes: 22 additions & 0 deletions .docker/otel/grafana/provisioning/datasources/datasources.yaml
@@ -0,0 +1,22 @@
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: true

  - name: Tempo
    type: tempo
    access: proxy
    url: http://tempo:3200
    editable: true
    jsonData:
      tracesToLogsV2:
        datasourceUid: prometheus
      nodeGraph:
        enabled: true
      search:
        hide: false
25 changes: 25 additions & 0 deletions .docker/otel/otel-collector-config.yaml
@@ -0,0 +1,25 @@
receivers:
  otlp:
    protocols:
      http:
        endpoint: '0.0.0.0:4318' # OTLP/HTTP port for receiving traces, metrics, and logs

exporters:
  debug:

  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true

  prometheus:
    endpoint: 0.0.0.0:9464

service:
  pipelines:
    traces:
      receivers: [otlp]
      exporters: [debug, otlp/tempo]
    metrics:
      receivers: [otlp]
      exporters: [debug, prometheus]
7 changes: 7 additions & 0 deletions .docker/otel/prometheus.yaml
@@ -0,0 +1,7 @@
global:
  scrape_interval: 5s

scrape_configs:
  - job_name: otelcol
    static_configs:
      - targets: ["otel-collector:9464"]
15 changes: 15 additions & 0 deletions .docker/otel/tempo.yaml
@@ -0,0 +1,15 @@
server:
  http_listen_port: 3200

distributor:
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: 0.0.0.0:4317

storage:
  trace:
    backend: local
    local:
      path: /tmp/tempo/traces
5 changes: 5 additions & 0 deletions .env.local.example
@@ -64,3 +64,8 @@ GCP_PUBSUB_SUBSCRIPTION_NAME=hibp-cron
PUBSUB_HOST=localhost
PUBSUB_PORT=8085
PUBSUB_EMULATOR_HOST=localhost:8085

# OpenTelemetry configuration
OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
OTEL_SERVICE_NAME=monitor
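
These variables are read by the OpenTelemetry Node SDK and OTLP exporters at startup, so the collector endpoint and service name never need to be hard-coded. A minimal sketch of such a bootstrap (the file name and exact wiring are assumptions, not the repo's actual entry point):

```ts
// instrumentation.ts — hypothetical bootstrap
import { NodeSDK } from "@opentelemetry/sdk-node";
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
import { OTLPMetricExporter } from "@opentelemetry/exporter-metrics-otlp-http";
import { PeriodicExportingMetricReader } from "@opentelemetry/sdk-metrics";
import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node";

// With OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318, the exporters
// default to .../v1/traces and .../v1/metrics on the collector, and
// OTEL_SERVICE_NAME=monitor names the resource.
const sdk = new NodeSDK({
  traceExporter: new OTLPTraceExporter(),
  metricReader: new PeriodicExportingMetricReader({
    exporter: new OTLPMetricExporter(),
  }),
  instrumentations: [getNodeAutoInstrumentations()],
});
sdk.start();
```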
3 changes: 3 additions & 0 deletions .github/dependabot.yml
@@ -70,6 +70,9 @@ updates:
- "stylelint"
- "stylelint-scss"
- "stylelint-config-recommended-scss"
otel:
patterns:
- "@opentelemetry/*"
- package-ecosystem: "docker"
directory: "/"
schedule:
13 changes: 13 additions & 0 deletions README.md
@@ -318,6 +318,19 @@ To use the strings in code, you need to obtain a `ReactLocalization` instance, w

We use GCP Cloud Run for dev review – official stage and production apps are built by the Dockerfile and GitHub Actions. Everything merged into `main` deploys automatically to stage. The ADR for preview deployment can be found [here](https://github.com/mozilla/blurts-server/blob/main/docs/adr/0008-preview-deployment.md)

## Observability

We use OpenTelemetry for manual and auto-instrumentation of app code. We use Sentry for error tracking and some alerting; other alerts are configured on metrics through Grafana ([Yardstick](https://yardstick.mozilla.org/)). Error-level logs are automatically captured and sent to Sentry, and trace IDs are forwarded to Sentry as well; those IDs can be searched in [Yardstick](https://yardstick.mozilla.org/) for more detailed trace data.
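
For the manual side, spans are created against `@opentelemetry/api` so they nest alongside the auto-instrumented ones. A minimal sketch (the tracer, span, and function names are illustrative, not the app's actual ones):

```ts
import { trace, SpanStatusCode } from "@opentelemetry/api";

const tracer = trace.getTracer("monitor");

// Hypothetical unit of work wrapped in a manual span.
export async function scanForBreaches(run: () => Promise<void>) {
  return tracer.startActiveSpan("scanForBreaches", async (span) => {
    try {
      await run(); // auto-instrumented HTTP/DB calls become child spans
    } catch (err) {
      span.setStatus({ code: SpanStatusCode.ERROR });
      throw err;
    } finally {
      span.end();
    }
  });
}
```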

The infrastructure for viewing traces and metrics locally is set up automatically when you follow the [docker-compose setup](#docker-compose-setup) instructions. It starts four services:

- OTel Collector (receives metrics, traces, and logs over OTLP)
- Tempo (stores traces for Grafana, pushed to it by the collector; in the GCP environment we use Cloud Trace)
- Prometheus (scrapes metrics from the collector for Grafana; in the GCP environment we use Google Managed Prometheus)
- Grafana (visualization)

To view metrics locally, visit [Grafana](http://localhost:3000/d/monitor-dashboard/monitor?orgId=1); some default dashboard panels are seeded. To see traces, open the Explore pane in Grafana and select the Tempo datasource. Note that data doesn't propagate immediately, so wait a minute if expected activity isn't showing up.
> **Review comment (Collaborator):** praise: Thank you for this write-up! It was helpful in explaining the process and made the setup pretty painless.


_**TODO:** add full deploy process similar to Relay_

_**TODO:** consider whether we can re-enable Heroku Review Apps_
54 changes: 54 additions & 0 deletions docker-compose.yml
@@ -40,6 +40,60 @@ services:
      start_interval: 10s
    networks:
      - shared

  otel-collector:
    image: otel/opentelemetry-collector:0.142.0
    restart: always
    command: ["--config=/etc/otel-collector-config.yaml"]
    volumes:
      - ./.docker/otel/otel-collector-config.yaml:/etc/otel-collector-config.yaml
    networks:
      - shared
    ports:
      - "4318:4318" # OTLP HTTP receiver
      - "9464:9464" # Prometheus scrape endpoint

  tempo:
    image: grafana/tempo:2.4.1
    command: ["-config.file=/etc/tempo.yaml"]
    volumes:
      - ./.docker/otel/tempo.yaml:/etc/tempo.yaml:ro
    ports:
      - "3200:3200" # Tempo query endpoint
    networks:
      - shared

  prometheus:
    image: prom/prometheus:v2.49.1
    command: ["--config.file=/etc/prometheus/prometheus.yaml"]
    volumes:
      - ./.docker/otel/prometheus.yaml:/etc/prometheus/prometheus.yaml:ro
    ports:
      - "9090:9090"
    networks:
      - shared
    depends_on:
      - otel-collector

  grafana:
    image: grafana/grafana:10.3.3
    environment:
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      # Skip the initial admin password change UI
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      - ./.docker/otel/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./.docker/otel/grafana/provisioning/dashboards-json:/etc/grafana/provisioning/dashboards-json:ro
    ports:
      - "3000:3000"
    networks:
      - shared
    depends_on:
      - tempo
      - prometheus

networks:
  shared: