Skip to content

Commit 386b05c

Browse files
authored
feat: otel tracing (#494)
1 parent 332b690 commit 386b05c

31 files changed

+10361
-7111
lines changed

.docker/docker-compose-infra.yml

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -153,12 +153,23 @@ services:
153153
- IMGPROXY_USE_ETAG=true
154154
- IMGPROXY_ENABLE_WEBP_DETECTION=true
155155

156-
# Optional for rate-limiting
157-
redis:
158-
image: redis:6.2-alpine
159-
restart: always
160-
ports:
161-
- '6379:6379'
156+
# Optional for rate-limiting
157+
# redis:
158+
# image: redis:6.2-alpine
159+
# restart: always
160+
# ports:
161+
# - '6379:6379'
162+
163+
# Optional for tracing
164+
# otel:
165+
# extends:
166+
# service: otel-collector
167+
# file: ./.docker/docker-compose-monitoring.yml
168+
#
169+
# jaeger:
170+
# extends:
171+
# service: jaeger
172+
# file: ./.docker/docker-compose-monitoring.yml
162173

163174
configs:
164175
init.sql:

.docker/docker-compose-monitoring.yml

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,4 +43,27 @@ services:
4343
- GF_SECURITY_ADMIN_PASSWORD=grafana
4444
volumes:
4545
- ../monitoring/grafana/config:/etc/grafana/provisioning
46-
- ../monitoring/grafana/dashboards:/var/lib/grafana/dashboards
46+
- ../monitoring/grafana/dashboards:/var/lib/grafana/dashboards
47+
48+
jaeger:
49+
image: jaegertracing/all-in-one:1.57.0
50+
ports:
51+
- "16686:16686" # Jaeger UI
52+
- "14250:14250" # GRPC
53+
- "14268:14268" # HTTP
54+
- "14269:14269" # Admin HTTP (health check, metrics)
55+
- "6831:6831/udp" # UDP
56+
- "6832:6832/udp" # UDP
57+
- "5778:5778" # HTTP
58+
59+
otel-collector:
60+
image: otel/opentelemetry-collector-contrib:0.100.0
61+
ports:
62+
- "4317:4317" # OTLP gRPC receiver
63+
- "4318:4318" # OTLP HTTP receiver
64+
- "55680:55680" # Legacy OTLP gRPC receiver port (superseded by 4317)
65+
command: [ "--config=/etc/otel/otel-collector-config.yml" ]
66+
depends_on:
67+
- jaeger
68+
volumes:
69+
- ../monitoring/otel/config:/etc/otel

docker-compose.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,4 +99,14 @@ services:
9999
# prometheus:
100100
# extends:
101101
# service: prometheus
102+
# file: ./.docker/docker-compose-monitoring.yml
103+
#
104+
# otel:
105+
# extends:
106+
# service: otel-collector
107+
# file: ./.docker/docker-compose-monitoring.yml
108+
#
109+
# jaeger:
110+
# extends:
111+
# service: jaeger
102112
# file: ./.docker/docker-compose-monitoring.yml
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
CREATE FUNCTION tenants_delete_notify_trigger ()
2+
RETURNS TRIGGER
3+
AS $$
4+
BEGIN
5+
PERFORM
6+
pg_notify('tenants_update', '"' || OLD.id || '"');
7+
RETURN NULL;
8+
END;
9+
$$
10+
LANGUAGE plpgsql;
11+
CREATE TRIGGER tenants_delete_notify_trigger
12+
AFTER DELETE ON tenants
13+
FOR EACH ROW
14+
EXECUTE PROCEDURE tenants_delete_notify_trigger ();
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
2+
ALTER TABLE tenants ADD COLUMN tracing_mode text NOT NULL DEFAULT 'basic';
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
receivers:
2+
otlp:
3+
protocols:
4+
grpc:
5+
http:
6+
7+
processors:
8+
memory_limiter:
9+
check_interval: 1s
10+
limit_percentage: 70
11+
spike_limit_percentage: 20
12+
batch:
13+
send_batch_size: 10000
14+
timeout: 10s
15+
tail_sampling/storage:
16+
decision_wait: 10s
17+
expected_new_traces_per_sec: 10000
18+
num_traces: 50000
19+
policies:
20+
[
21+
# Exclude probe URLs (health checks, metrics, and other internal endpoints)
22+
{
23+
name: exclude-urls,
24+
type: string_attribute,
25+
string_attribute: { key: http.route, values: [ \/health.*, \/metrics, \/tenants, \/version, \/status ], enabled_regex_matching: true, invert_match: true }
26+
},
27+
# All server errors (HTTP 5xx) are sampled
28+
{
29+
name: error-status-codes,
30+
type: numeric_attribute,
31+
numeric_attribute: { key: http.status_code, min_value: 500, max_value: 599 }
32+
},
33+
# Always sample high latency traces that are not uploads
34+
{
35+
name: high-latency-excluding-uploads,
36+
type: and,
37+
and: {
38+
and_sub_policy:
39+
[
40+
{
41+
type: latency,
42+
latency: { threshold_ms: 5000 }
43+
},
44+
# Exclude upload operations
45+
{
46+
type: string_attribute,
47+
string_attribute: {
48+
key: http.operation,
49+
values: [ .*upload.* ],
50+
enabled_regex_matching: true,
51+
invert_match: true
52+
}
53+
}
54+
]
55+
}
56+
},
57+
# Always sample high latency uploads
58+
{
59+
name: high-latency-uploads,
60+
type: and,
61+
and: {
62+
and_sub_policy:
63+
[
64+
{
65+
type: latency,
66+
latency: { threshold_ms: 300000 }
67+
},
68+
# Only upload operations
69+
{
70+
type: string_attribute,
71+
string_attribute: {
72+
key: http.operation,
73+
values: [ .*upload.* ],
74+
enabled_regex_matching: true,
75+
}
76+
}
77+
]
78+
}
79+
},
80+
# Sample traces for tenants with default mode
81+
# Default mode is the mode where the trace.mode attribute is set to basic
82+
# and only 5% of successful (2xx-3xx) traces are sampled for each tenant
83+
{
84+
name: sampling-basic-tenants,
85+
type: and,
86+
and: {
87+
and_sub_policy:
88+
[
89+
{
90+
# must have tenant.ref attribute
91+
name: has-tenant-ref,
92+
type: string_attribute,
93+
string_attribute:
94+
{
95+
key: tenant.ref,
96+
values: [ .* ],
97+
enabled_regex_matching: true
98+
},
99+
},
100+
{
101+
# trace.mode = basic
102+
name: trace-mode-default,
103+
type: string_attribute,
104+
string_attribute:
105+
{
106+
key: trace.mode,
107+
values: [ basic ],
108+
},
109+
},
110+
{
111+
name: success-status-codes,
112+
type: numeric_attribute,
113+
numeric_attribute: { key: http.status_code, min_value: 200, max_value: 399 }
114+
},
115+
{
116+
name: basic-sampling,
117+
type: probabilistic,
118+
probabilistic: {
119+
sampling_percentage: 5
120+
}
121+
}
122+
]
123+
}
124+
},
125+
126+
# Sample traces for tenants with premium mode
127+
# Premium mode (trace.mode = full) samples 100% of successful traces for each tenant
128+
{
129+
name: sampling-premium-tenants,
130+
type: and,
131+
and: {
132+
and_sub_policy:
133+
[
134+
{
135+
# must have tenant.ref attribute
136+
name: has-tenant-ref,
137+
type: string_attribute,
138+
string_attribute:
139+
{
140+
key: tenant.ref,
141+
values: [ .* ],
142+
enabled_regex_matching: true
143+
},
144+
},
145+
{
146+
# trace.mode = full (premium tenants)
147+
name: trace-mode-default,
148+
type: string_attribute,
149+
string_attribute:
150+
{
151+
key: trace.mode,
152+
values: [ full ],
153+
},
154+
},
155+
{
156+
name: success-status-codes,
157+
type: numeric_attribute,
158+
numeric_attribute: { key: http.status_code, min_value: 200, max_value: 399 }
159+
},
160+
{
161+
name: full-sampling,
162+
type: probabilistic,
163+
probabilistic: {
164+
sampling_percentage: 100
165+
}
166+
}
167+
]
168+
}
169+
}
170+
]
171+
172+
exporters:
173+
otlp/jaeger:
174+
endpoint: "jaeger:4317"
175+
tls:
176+
insecure: true
177+
178+
service:
179+
pipelines:
180+
traces:
181+
receivers: [otlp]
182+
processors: [memory_limiter, tail_sampling/storage, batch]
183+
exporters: [otlp/jaeger]

0 commit comments

Comments
 (0)