Skip to content

Commit e23365f

Browse files
authored
* Move monitoring from components to resources * Add ns-sourcegraph as default namespace * Update prometheus to use dns service discovery * release: sourcegraph@4.5.0
1 parent 20b216e commit e23365f

File tree

85 files changed

+827
-1274
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

85 files changed

+827
-1274
lines changed

base/monitoring/cadvisor/cadvisor.DaemonSet.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ spec:
2626
serviceAccountName: cadvisor
2727
containers:
2828
- name: cadvisor
29-
image: index.docker.io/sourcegraph/cadvisor:4.4.2@sha256:4c3af0c4fd9ea4425d38f7d1a784833c5fd542542cdbb81292044773e686fa60
29+
image: index.docker.io/sourcegraph/cadvisor:4.5.0@sha256:5117f2bc817c16fb129acb6f9b070af8f1be09d3d9a8f88e3297f7adfff9af0d
3030
args:
3131
# Kubernetes-specific flags below (other flags are baked into the Docker image)
3232
#

base/monitoring/grafana/grafana.StatefulSet.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ spec:
2626
spec:
2727
containers:
2828
- name: grafana
29-
image: index.docker.io/sourcegraph/grafana:4.4.2@sha256:69777c3a895a03eee035c173c91c0f25893285118c06e51a67728ec4259e2296
29+
image: index.docker.io/sourcegraph/grafana:4.5.0@sha256:f70a7f79c5c90cab0d5cfb8f3dbca4dc60ed390b045aff1a86079c87bfe9a8af
3030
terminationMessagePolicy: FallbackToLogsOnError
3131
ports:
3232
- containerPort: 3370
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
apiVersion: kustomize.config.k8s.io/v1beta1
2+
kind: Kustomization
3+
resources:
4+
- grafana.ServiceAccount.yaml

base/monitoring/jaeger/jaeger.Deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ spec:
3030
spec:
3131
containers:
3232
- name: jaeger
33-
image: index.docker.io/sourcegraph/jaeger-all-in-one:insiders@sha256:462ef3b4a5fa9227f04c2f4bc2968970fad0fcc9efbaf89adaad0ef98a24b53f
33+
image: index.docker.io/sourcegraph/jaeger-all-in-one:4.5.0@sha256:461476b01968324a0d8cb43a0176713e006f99cdb1f2efc3ab2210fd0bb812c2
3434
args: ["--memory.max-traces=20000"]
3535
ports:
3636
- containerPort: 5775

base/monitoring/node-exporter/node-exporter.DaemonSet.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ spec:
2424
spec:
2525
containers:
2626
- name: node-exporter
27-
image: index.docker.io/sourcegraph/node-exporter:4.4.2@sha256:fa8e5700b7762fffe0674e944762f44bb787a7e44d97569fe55348260453bf80
27+
image: index.docker.io/sourcegraph/node-exporter:4.5.0@sha256:fa8e5700b7762fffe0674e944762f44bb787a7e44d97569fe55348260453bf80
2828
imagePullPolicy: IfNotPresent
2929
resources:
3030
limits:

base/monitoring/otel-collector/otel-agent.DaemonSet.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ spec:
2626
spec:
2727
containers:
2828
- name: otel-agent
29-
image: index.docker.io/sourcegraph/opentelemetry-collector:4.4.2@sha256:f0723c96c973258ad3123ddc479261bb8f5827bbac1d091b6a683fde55334413
29+
image: index.docker.io/sourcegraph/opentelemetry-collector:4.5.0@sha256:12f3fc137edea8319ebf574e15e6c27c19fb0b7ca17165973f98c8d8c342ca1d
3030
command:
3131
- "/bin/otelcol-sourcegraph"
3232
- "--config=/etc/otel-agent/config.yaml"

base/monitoring/otel-collector/otel-collector.Deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ spec:
2626
spec:
2727
containers:
2828
- name: otel-collector
29-
image: index.docker.io/sourcegraph/opentelemetry-collector:4.4.2@sha256:f0723c96c973258ad3123ddc479261bb8f5827bbac1d091b6a683fde55334413
29+
image: index.docker.io/sourcegraph/opentelemetry-collector:4.5.0@sha256:12f3fc137edea8319ebf574e15e6c27c19fb0b7ca17165973f98c8d8c342ca1d
3030
command:
3131
- "/bin/otelcol-sourcegraph"
3232
# To use a custom configuration, edit otel-collector.ConfigMap.yaml

base/monitoring/prometheus/prometheus.ConfigMap.yaml

Lines changed: 108 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ data:
1010
prometheus.yml: |
1111
# Prometheus global config
1212
global:
13-
scrape_interval: 30s
13+
scrape_interval: 30s
1414
evaluation_interval: 30s
1515
# scrape_timeout is set to the global default (10s).
1616
@@ -19,47 +19,36 @@ data:
1919
alertmanagers:
2020
# bundled alertmanager, started by prom-wrapper
2121
- static_configs:
22-
- targets: ['127.0.0.1:9093']
22+
- targets: ["127.0.0.1:9093"]
2323
path_prefix: /alertmanager
2424
# add more alertmanagers here
2525
2626
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
2727
rule_files:
28-
- '/sg_config_prometheus/*_rules.yml'
29-
- '/sg_prometheus_add_ons/*_rules.yml'
28+
- "/sg_config_prometheus/*_rules.yml"
29+
- "/sg_prometheus_add_ons/*_rules.yml"
3030
3131
# Configure targets to scrape
3232
scrape_configs:
33-
3433
# Scrape prometheus itself for metrics.
35-
- job_name: 'builtin-prometheus'
34+
- job_name: "builtin-prometheus"
3635
static_configs:
37-
- targets: ['127.0.0.1:9092']
36+
- targets: ["127.0.0.1:9092"]
3837
39-
- job_name: 'builtin-alertmanager'
38+
- job_name: "builtin-alertmanager"
4039
metrics_path: /alertmanager/metrics
4140
static_configs:
42-
- targets: ['127.0.0.1:9093']
41+
- targets: ["127.0.0.1:9093"]
4342
44-
- job_name: 'sourcegraph-services'
45-
relabel_configs:
46-
- source_labels: [__address__]
47-
target_label: instance
48-
regex: (.*)\.(.*)
49-
replacement: ${1}_${2}
50-
metric_relabel_configs:
51-
- source_labels: [container_label_io_kubernetes_pod_namespace]
52-
regex: kube-system
53-
action: drop
54-
file_sd_configs:
55-
- files:
56-
- '/sg_prometheus_add_ons/*_targets.yml'
57-
58-
- job_name: 'cadvisor'
43+
##########################################################################################
44+
# cadvisor
45+
##########################################################################################
46+
47+
- job_name: "kubernetes-pods"
5948
dns_sd_configs:
6049
- names:
61-
- 'cadvisor.default.svc.cluster.local'
62-
- 'cadvisor.ns-sourcegraph.svc.cluster.local'
50+
- "cadvisor.default.svc.cluster.local"
51+
- "cadvisor.ns-sourcegraph.svc.cluster.local"
6352
type: A
6453
port: 48080
6554
relabel_configs:
@@ -73,59 +62,97 @@ data:
7362
- source_labels: [container_label_io_kubernetes_pod_namespace]
7463
regex: kube-system
7564
action: drop
76-
- source_labels: [container_label_io_kubernetes_container_name, container_label_io_kubernetes_pod_name]
65+
- source_labels:
66+
[
67+
container_label_io_kubernetes_container_name,
68+
container_label_io_kubernetes_pod_name,
69+
]
7770
regex: (.+)
7871
action: replace
7972
target_label: name
80-
separator: '-'
81-
# - source_labels: [container_label_io_kubernetes_pod_namespace]
82-
# regex: ^$|ns-sourcegraph # ACTION: replace ns-sourcegraph with your namespace
83-
# action: keep
84-
85-
- job_name: 'sourcegraph-statefulsets'
73+
separator: "-"
74+
75+
##########################################################################################
76+
# sourcegraph-services
77+
##########################################################################################
78+
79+
- job_name: "sourcegraph-services"
80+
relabel_configs:
81+
- source_labels: [__address__]
82+
target_label: instance
83+
regex: (.*)\.(.*)
84+
replacement: ${1}_${2}
85+
metric_relabel_configs:
86+
- source_labels: [container_label_io_kubernetes_pod_namespace]
87+
regex: kube-system
88+
action: drop
89+
file_sd_configs:
90+
- files:
91+
- "/sg_prometheus_add_ons/*_targets.yml"
92+
93+
- job_name: "sourcegraph-statefulsets"
8694
dns_sd_configs:
8795
- names:
88-
- 'symbols.default.svc.cluster.local'
89-
- 'symbols.ns-sourcegraph.svc.cluster.local'
90-
- 'searcher.default.svc.cluster.local'
91-
- 'searcher.ns-sourcegraph.svc.cluster.local'
92-
- 'gitserver.default.svc.cluster.local'
93-
- 'gitserver.ns-sourcegraph.svc.cluster.local'
94-
- 'sourcegraph-frontend.default.svc.cluster.local'
95-
- 'sourcegraph-frontend.ns-sourcegraph.svc.cluster.local'
96-
type: A
97-
port: 6060
98-
- names:
99-
- 'indexed-search.default.svc.cluster.local'
100-
- 'indexed-search.ns-sourcegraph.svc.cluster.local'
101-
type: A
102-
port: 6070
103-
- names:
104-
- 'indexed-search-indexer.default.svc.cluster.local'
105-
- 'indexed-search-indexer.ns-sourcegraph.svc.cluster.local'
106-
type: A
107-
port: 6072
96+
- "symbols.default.svc.cluster.local"
97+
- "symbols.ns-sourcegraph.svc.cluster.local"
98+
- "symbols.$SG_NAMESPACE.svc.cluster.local"
99+
- "searcher.default.svc.cluster.local"
100+
- "searcher.ns-sourcegraph.svc.cluster.local"
101+
- "searcher.$SG_NAMESPACE.svc.cluster.local"
102+
- "gitserver.default.svc.cluster.local"
103+
- "gitserver.ns-sourcegraph.svc.cluster.local"
104+
- "gitserver.$SG_NAMESPACE.svc.cluster.local"
105+
- "sourcegraph-frontend.default.svc.cluster.local"
106+
- "sourcegraph-frontend.ns-sourcegraph.svc.cluster.local"
107+
- "sourcegraph-frontend.$SG_NAMESPACE.svc.cluster.local"
108+
- "indexed-search.default.svc.cluster.local"
109+
- "indexed-search.ns-sourcegraph.svc.cluster.local"
110+
- "indexed-search.$SG_NAMESPACE.svc.cluster.local"
111+
- "indexed-search-indexer.default.svc.cluster.local"
112+
- "indexed-search-indexer.ns-sourcegraph.svc.cluster.local"
113+
- "indexed-search-indexer.$SG_NAMESPACE.svc.cluster.local"
114+
type: SRV
108115
relabel_configs:
116+
- source_labels: [__meta_dns_srv_record_target]
117+
target_label: __address__
118+
regex: (.*)\.
119+
replacement: ${1}:6060
120+
- source_labels: [__meta_dns_srv_record_target]
121+
target_label: __address__
122+
regex: ^(indexed-search.*)\.
123+
replacement: ${1}:6070
124+
- source_labels: [__meta_dns_srv_record_target]
125+
target_label: __address__
126+
regex: (.*)\.(indexed-search-indexer.*)\.
127+
replacement: ${1}.${2}:6072
128+
- source_labels: [__meta_dns_srv_record_port]
129+
target_label: __meta_dns_srv_record_port
130+
replacement: 6060
131+
- source_labels: [__address__]
132+
regex: ^(indexed-search).*$
133+
target_label: __meta_dns_srv_record_port
134+
replacement: 6070
109135
- source_labels: [__meta_dns_name]
110-
target_label: service_name
136+
target_label: job
111137
regex: (.*)\..*\..*\..*\..*
112138
replacement: ${1}
139+
- source_labels: [__meta_dns_srv_record_target]
140+
regex: (.*)\.(.*)\..*\..*\..*\..*\..*
141+
target_label: instance
142+
replacement: ${2}_${1}
143+
metric_relabel_configs:
144+
- source_labels: [container_label_io_kubernetes_pod_namespace]
145+
regex: kube-system
146+
action: drop
147+
- source_labels: [__address__]
148+
target_label: instance
149+
regex: (.*)\:.*
150+
replacement: $1:6060
151+
- source_labels: [__address__]
152+
target_label: instance
153+
regex: (.*)\.(.*)\..*\..*\..*\..*\..*
154+
replacement: ${2}_${1}
113155
114-
# Extra rules
115-
extra_rules.yml: |
116-
groups:
117-
- name: container.rules
118-
rules:
119-
- record: container:process_cpu_seconds_total:ratio_rate5m
120-
expr: sum by (instance) (rate(process_cpu_seconds_total[5m])) / engine_daemon_engine_cpus_cpus
121-
- record: container:process_cpu_seconds_total:sum
122-
expr: sum by (instance) (irate(process_cpu_seconds_total[1m]))
123-
- record: container:process_resident_memory_bytes:max
124-
expr: max by (instance) (process_resident_memory_bytes)
125-
- record: container:process_virtual_memory_bytes:max
126-
expr: max by (instance) (process_virtual_memory_bytes)
127-
128-
# List of static targets
129156
prometheus_targets.yml: |
130157
- labels:
131158
nodename: "sourcegraph-services"
@@ -202,40 +229,15 @@ data:
202229
job: otel-collector
203230
targets:
204231
- otel-collector:8888
205-
206-
# Add new targets based on replica count of symbols
207-
symbols_targets.yml: |
208-
- labels:
209-
nodename: "sourcegraph-services"
210-
job: symbols
211-
targets:
212-
- symbols-0.symbols:6060
213-
214-
# Add new targets based on replica count of searcher
215-
searcher_targets.yml: |
216-
- labels:
217-
nodename: "sourcegraph-services"
218-
job: searcher
219-
targets:
220-
- searcher-0.searcher:6060
221-
222-
# Add new targets based on replica count of gitserver
223-
gitserver_targets.yml: |
224-
- labels:
225-
nodename: "sourcegraph-services"
226-
job: gitserver
227-
targets:
228-
- gitserver-0.gitserver:6060
229-
230-
# Add new targets based on replica count of indexed-search
231-
indexed-search_targets.yml: |
232-
- labels:
233-
nodename: "sourcegraph-services"
234-
job: zoekt-indexserver
235-
targets:
236-
- indexed-search-0.indexed-search:6072
237-
- labels:
238-
nodename: "sourcegraph-services"
239-
job: zoekt-webserver
240-
targets:
241-
- indexed-search-0.indexed-search:6070
232+
extra_rules.yml: |
233+
groups:
234+
- name: container.rules
235+
rules:
236+
- record: container:process_cpu_seconds_total:ratio_rate5m
237+
expr: sum by (instance) (rate(process_cpu_seconds_total[5m])) / engine_daemon_engine_cpus_cpus
238+
- record: container:process_cpu_seconds_total:sum
239+
expr: sum by (instance) (irate(process_cpu_seconds_total[1m]))
240+
- record: container:process_resident_memory_bytes:max
241+
expr: max by (instance) (process_resident_memory_bytes)
242+
- record: container:process_virtual_memory_bytes:max
243+
expr: max by (instance) (process_virtual_memory_bytes)

base/monitoring/prometheus/prometheus.Deployment.yaml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,10 @@ spec:
2525
spec:
2626
containers:
2727
- name: prometheus
28-
image: index.docker.io/sourcegraph/prometheus:4.4.2@sha256:d833d00a39937cf700f276f816dc789615d6396979418a7d9362386513b1fc9d
28+
image: index.docker.io/sourcegraph/prometheus:4.5.0@sha256:4fe9a5fdee206b1aac9d32afb31ad57e1882394aad9e7e9f719a1b2741afcae5
2929
terminationMessagePolicy: FallbackToLogsOnError
3030
env:
31-
- name: MY_POD_NAMESPACE
31+
- name: SG_NAMESPACE
3232
valueFrom:
3333
fieldRef:
3434
fieldPath: metadata.namespace
@@ -70,7 +70,6 @@ spec:
7070
runAsUser: 100
7171
fsGroup: 100
7272
fsGroupChangePolicy: "OnRootMismatch"
73-
# serviceAccountName: prometheus
7473
volumes:
7574
- name: data
7675
persistentVolumeClaim:

0 commit comments

Comments
 (0)