Skip to content

Commit cc9f91a

Browse files
committed
feat(observability): 增强服务实例标识与监控面板
- 在Prometheus配置中添加service_id标签用于区分服务实例
- 为OTEL资源添加service_instance_id属性
- 改进Consul注册地址探测逻辑,优先使用容器网卡IP
- 更新Grafana仪表板以支持按实例筛选和展示
1 parent dff769f commit cc9f91a

File tree

4 files changed

+147
-58
lines changed

4 files changed

+147
-58
lines changed

mock/s3/deployments/observability/grafana/dashboards/mock-s3-services-metrics.json

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,20 @@
2828
"text": "All",
2929
"value": "$__all"
3030
}
31+
},
32+
{
33+
"name": "instance",
34+
"type": "query",
35+
"query": "label_values(system_cpu_usage_percent_percent{service=~\"$service\"}, instance)",
36+
"refresh": 1,
37+
"includeAll": true,
38+
"allValue": ".*",
39+
"multi": true,
40+
"current": {
41+
"selected": true,
42+
"text": "All",
43+
"value": "$__all"
44+
}
3145
}
3246
]
3347
},
@@ -38,8 +52,8 @@
3852
"type": "timeseries",
3953
"targets": [
4054
{
41-
"expr": "system_cpu_usage_percent_percent{service=~\"$service\"}",
42-
"legendFormat": "{{service}} CPU",
55+
"expr": "system_cpu_usage_percent_percent{service=~\"$service\",instance=~\"$instance\"}",
56+
"legendFormat": "{{service}} {{instance}} CPU",
4357
"refId": "A"
4458
}
4559
],
@@ -74,8 +88,8 @@
7488
"type": "timeseries",
7589
"targets": [
7690
{
77-
"expr": "system_memory_usage_percent_percent{service=~\"$service\"}",
78-
"legendFormat": "{{service}} Memory",
91+
"expr": "system_memory_usage_percent_percent{service=~\"$service\",instance=~\"$instance\"}",
92+
"legendFormat": "{{service}} {{instance}} Memory",
7993
"refId": "A"
8094
}
8195
],
@@ -110,7 +124,7 @@
110124
"type": "stat",
111125
"targets": [
112126
{
113-
"expr": "system_cpu_usage_percent_percent{service=~\"$service\"}",
127+
"expr": "system_cpu_usage_percent_percent{service=~\"$service\",instance=~\"$instance\"}",
114128
"legendFormat": "{{service}}",
115129
"refId": "A"
116130
}
@@ -158,7 +172,7 @@
158172
"type": "stat",
159173
"targets": [
160174
{
161-
"expr": "system_memory_usage_percent_percent{service=~\"$service\"}",
175+
"expr": "system_memory_usage_percent_percent{service=~\"$service\",instance=~\"$instance\"}",
162176
"legendFormat": "{{service}}",
163177
"refId": "A"
164178
}
@@ -206,8 +220,8 @@
206220
"type": "timeseries",
207221
"targets": [
208222
{
209-
"expr": "system_network_qps_per_second{service=~\"$service\"}",
210-
"legendFormat": "{{service}} QPS",
223+
"expr": "system_network_qps_per_second{service=~\"$service\",instance=~\"$instance\"}",
224+
"legendFormat": "{{service}} {{instance}} QPS",
211225
"refId": "A"
212226
}
213227
],
@@ -241,7 +255,7 @@
241255
"type": "stat",
242256
"targets": [
243257
{
244-
"expr": "system_machine_online_status{service=~\"$service\"}",
258+
"expr": "system_machine_online_status{service=~\"$service\",instance=~\"$instance\"}",
245259
"legendFormat": "{{service}}",
246260
"refId": "A"
247261
}
@@ -351,28 +365,28 @@
351365
"type": "table",
352366
"targets": [
353367
{
354-
"expr": "system_cpu_usage_percent_percent{service=~\"$service\"}",
368+
"expr": "system_cpu_usage_percent_percent{service=~\"$service\",instance=~\"$instance\"}",
355369
"legendFormat": "",
356370
"refId": "A",
357371
"instant": true,
358372
"format": "table"
359373
},
360374
{
361-
"expr": "system_memory_usage_percent_percent{service=~\"$service\"}",
375+
"expr": "system_memory_usage_percent_percent{service=~\"$service\",instance=~\"$instance\"}",
362376
"legendFormat": "",
363377
"refId": "B",
364378
"instant": true,
365379
"format": "table"
366380
},
367381
{
368-
"expr": "system_network_qps_per_second{service=~\"$service\"}",
382+
"expr": "system_network_qps_per_second{service=~\"$service\",instance=~\"$instance\"}",
369383
"legendFormat": "",
370384
"refId": "C",
371385
"instant": true,
372386
"format": "table"
373387
},
374388
{
375-
"expr": "system_machine_online_status{service=~\"$service\"}",
389+
"expr": "system_machine_online_status{service=~\"$service\",instance=~\"$instance\"}",
376390
"legendFormat": "",
377391
"refId": "D",
378392
"instant": true,
@@ -389,14 +403,14 @@
389403
"options": {
390404
"excludeByName": {
391405
"Time": true,
392-
"instance": true,
393406
"job": true,
394407
"node": true,
395408
"otel_scope_name": true,
396409
"__name__": true
397410
},
398411
"renameByName": {
399412
"service": "Service",
413+
"instance": "Instance",
400414
"Value #A": "CPU (%)",
401415
"Value #B": "Memory (%)",
402416
"Value #C": "Network QPS",
@@ -494,8 +508,8 @@
494508
"type": "piechart",
495509
"targets": [
496510
{
497-
"expr": "topk(5, system_cpu_usage_percent_percent{service=~\"$service\"})",
498-
"legendFormat": "{{service}}",
511+
"expr": "topk(5, system_cpu_usage_percent_percent{service=~\"$service\",instance=~\"$instance\"})",
512+
"legendFormat": "{{service}} {{instance}}",
499513
"refId": "A"
500514
}
501515
],
@@ -534,8 +548,8 @@
534548
"type": "heatmap",
535549
"targets": [
536550
{
537-
"expr": "avg_over_time(system_cpu_usage_percent_percent{service=~\"$service\"}[5m])",
538-
"legendFormat": "{{service}}",
551+
"expr": "avg_over_time(system_cpu_usage_percent_percent{service=~\"$service\",instance=~\"$instance\"}[5m])",
552+
"legendFormat": "{{service}} {{instance}}",
539553
"refId": "A"
540554
}
541555
],
@@ -560,4 +574,4 @@
560574
},
561575
"schemaVersion": 36,
562576
"version": 1
563-
}
577+
}

mock/s3/deployments/observability/prometheus.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,9 @@ scrape_configs:
3333
relabel_configs:
3434
- source_labels: [__meta_consul_service]
3535
target_label: service
36+
- source_labels: [__meta_consul_service_id]
37+
target_label: service_id
3638
- source_labels: [__meta_consul_node]
3739
target_label: node
3840

39-
# 存储配置 - 命令行参数将在docker-compose中配置
41+
# 存储配置 - 命令行参数将在docker-compose中配置

mock/s3/shared/observability/providers.go

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
package observability
22

33
import (
4-
"context"
5-
"fmt"
6-
"mocks3/shared/observability/config"
4+
"context"
5+
"fmt"
6+
"mocks3/shared/observability/config"
7+
"os"
78

8-
"go.opentelemetry.io/otel"
9+
"go.opentelemetry.io/otel"
910
"go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp"
1011
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
1112
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
@@ -182,11 +183,21 @@ func (p *Providers) Shutdown(ctx context.Context) error {
182183

183184
// createResource 创建OTEL资源
184185
func createResource(config *config.ObservabilityConfig) (*resource.Resource, error) {
185-
return resource.New(context.Background(),
186-
resource.WithAttributes(
187-
semconv.ServiceName(config.ServiceName),
188-
semconv.ServiceVersion(config.ServiceVersion),
189-
semconv.DeploymentEnvironment(config.Environment),
190-
),
191-
)
186+
// Derive a stable service instance id
187+
instanceID := os.Getenv("INSTANCE_ID")
188+
if instanceID == "" {
189+
if h, err := os.Hostname(); err == nil && h != "" {
190+
instanceID = h
191+
} else {
192+
instanceID = "unknown-instance"
193+
}
194+
}
195+
return resource.New(context.Background(),
196+
resource.WithAttributes(
197+
semconv.ServiceName(config.ServiceName),
198+
semconv.ServiceVersion(config.ServiceVersion),
199+
semconv.DeploymentEnvironment(config.Environment),
200+
semconv.ServiceInstanceID(instanceID),
201+
),
202+
)
192203
}

mock/s3/shared/server/service_bootstrap.go

Lines changed: 89 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
package server
22

33
import (
4-
"context"
5-
"fmt"
6-
"mocks3/shared/observability"
7-
"net/http"
8-
"os"
9-
"os/signal"
10-
"syscall"
11-
"time"
4+
"context"
5+
"fmt"
6+
"mocks3/shared/observability"
7+
"net/http"
8+
"net"
9+
"os"
10+
"os/signal"
11+
"syscall"
12+
"time"
1213

1314
"github.com/gin-gonic/gin"
1415
"mocks3/shared/middleware/consul"
@@ -173,7 +174,7 @@ func (sb *ServiceBootstrap) setupObservability() error {
173174

174175
// setupConsulRegistration 设置Consul服务注册
175176
func (sb *ServiceBootstrap) setupConsulRegistration() error {
176-
ctx := context.Background()
177+
ctx := context.Background()
177178

178179
// 检查配置是否支持Consul
179180
consulConfig, ok := sb.Config.(ConsulServiceConfig)
@@ -190,19 +191,23 @@ func (sb *ServiceBootstrap) setupConsulRegistration() error {
190191

191192
sb.ConsulClient = consulClient
192193

193-
// 注册服务到Consul
194-
// 使用hostname作为注册地址,而不是绑定地址"0.0.0.0"
195-
var registerAddress string
196-
if sb.Config.GetHost() == "0.0.0.0" {
197-
// 如果绑定地址是0.0.0.0,使用hostname进行注册
198-
hostname, err := os.Hostname()
199-
if err != nil {
200-
return fmt.Errorf("failed to get hostname for Consul registration: %w", err)
201-
}
202-
registerAddress = hostname
203-
} else {
204-
registerAddress = sb.Config.GetHost()
205-
}
194+
// 注册服务到Consul
195+
// 优先使用可达的容器/主机实例IP地址进行注册,确保多实例下目标唯一
196+
var registerAddress string
197+
if sb.Config.GetHost() == "0.0.0.0" {
198+
// 允许通过环境变量覆盖对外公布地址
199+
if envAddr := os.Getenv("ADVERTISE_ADDR"); envAddr != "" {
200+
registerAddress = envAddr
201+
} else {
202+
ip, err := detectAdvertiseAddr()
203+
if err != nil {
204+
return fmt.Errorf("failed to detect advertise address: %w", err)
205+
}
206+
registerAddress = ip
207+
}
208+
} else {
209+
registerAddress = sb.Config.GetHost()
210+
}
206211

207212
err = consul.RegisterService(ctx, consulClient,
208213
sb.Config.GetServiceName(),
@@ -212,12 +217,69 @@ func (sb *ServiceBootstrap) setupConsulRegistration() error {
212217
return fmt.Errorf("failed to register service with Consul: %w", err)
213218
}
214219

215-
sb.Logger.Info(ctx, "Service registered with Consul successfully",
216-
observability.String("consul_addr", consulConfig.GetConsulAddress()),
217-
observability.String("service_name", sb.Config.GetServiceName()),
218-
observability.String("register_address", registerAddress))
220+
sb.Logger.Info(ctx, "Service registered with Consul successfully",
221+
observability.String("consul_addr", consulConfig.GetConsulAddress()),
222+
observability.String("service_name", sb.Config.GetServiceName()),
223+
observability.String("register_address", registerAddress))
219224

220-
return nil
225+
return nil
226+
}
227+
228+
// detectAdvertiseAddr 自动探测一个非回环的IPv4地址,优先选择常见容器网卡
229+
func detectAdvertiseAddr() (string, error) {
230+
// 优先尝试常见的容器网卡名称
231+
preferredIfaces := []string{"eth0", "ens3", "ens4", "en0"}
232+
for _, name := range preferredIfaces {
233+
ifi, err := net.InterfaceByName(name)
234+
if err == nil && (ifi.Flags&net.FlagUp) != 0 {
235+
addrs, err := ifi.Addrs()
236+
if err == nil {
237+
if ip := firstIPv4(addrs); ip != "" {
238+
return ip, nil
239+
}
240+
}
241+
}
242+
}
243+
244+
// 回退:遍历所有网卡,取第一个非回环且Up的IPv4
245+
ifaces, err := net.Interfaces()
246+
if err != nil {
247+
return "", err
248+
}
249+
for _, ifi := range ifaces {
250+
if (ifi.Flags&net.FlagUp) == 0 || (ifi.Flags&net.FlagLoopback) != 0 {
251+
continue
252+
}
253+
addrs, err := ifi.Addrs()
254+
if err != nil {
255+
continue
256+
}
257+
if ip := firstIPv4(addrs); ip != "" {
258+
return ip, nil
259+
}
260+
}
261+
return "", fmt.Errorf("no non-loopback IPv4 address found")
262+
}
263+
264+
// firstIPv4 returns the textual form of the most suitable IPv4 address in
// addrs, or "" when addrs contains no usable IPv4 address.
//
// Only *net.IPNet and *net.IPAddr entries are inspected; any other net.Addr
// implementation is ignored. Loopback and unspecified (0.0.0.0) addresses are
// never returned. Link-local addresses (169.254.0.0/16) are not routable
// beyond the local segment and make a poor address to advertise to other
// hosts, so one is returned only when addrs offers no other IPv4 address.
func firstIPv4(addrs []net.Addr) string {
	linkLocal := "" // first link-local candidate, used only as a last resort
	for _, a := range addrs {
		var ip net.IP
		switch v := a.(type) {
		case *net.IPNet:
			ip = v.IP
		case *net.IPAddr:
			ip = v.IP
		}
		// To4 yields nil for a nil IP and for addresses that are not IPv4.
		ip4 := ip.To4()
		if ip4 == nil || ip4.IsLoopback() || ip4.IsUnspecified() {
			continue
		}
		if ip4.IsLinkLocalUnicast() {
			if linkLocal == "" {
				linkLocal = ip4.String()
			}
			continue
		}
		return ip4.String()
	}
	return linkLocal
}
222284

223285
// setupErrorInjection 设置错误注入中间件

0 commit comments

Comments
 (0)