Skip to content

Commit 699a122

Browse files
colegatacole02aknuds1
authored
Use OTel library from tracing while keeping Jaeger exporter config (#11249)
* Migrate to OTel tracing Signed-off-by: Oleg Zaytsev <[email protected]> * Apply suggestions from code review Co-authored-by: Taylor C <[email protected]> * Update OTel case in pkg/util/spanlogger/spanlogger.go Co-authored-by: Arve Knudsen <[email protected]> * Update docs/sources/mimir/configure/configure-tracing.md Co-authored-by: Arve Knudsen <[email protected]> --------- Signed-off-by: Oleg Zaytsev <[email protected]> Co-authored-by: Taylor C <[email protected]> Co-authored-by: Arve Knudsen <[email protected]>
1 parent 351392c commit 699a122

File tree

110 files changed

+1184
-2537
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

110 files changed

+1184
-2537
lines changed

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -456,6 +456,11 @@ lint: check-makefiles
456456
"github.com/twmb/franz-go/pkg/kgo.{AllowAutoTopicCreation}" \
457457
./pkg/... ./cmd/... ./tools/... ./integration/...
458458

459+
# We don't use opentracing anymore.
460+
faillint -paths \
461+
"github.com/opentracing/opentracing-go,github.com/opentracing/opentracing-go/log,github.com/uber/jaeger-client-go,github.com/opentracing-contrib/go-stdlib/nethttp" \
462+
./pkg/... ./cmd/... ./tools/... ./integration/...
463+
459464
# Ensure lines are sorted after lint:sorted directives.
460465
go run ./tools/lint-sorted/ \
461466
-path ./pkg \

cmd/mimir-continuous-test/main.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ import (
1616
"github.com/grafana/dskit/tracing"
1717
"github.com/prometheus/client_golang/prometheus"
1818
"github.com/prometheus/client_golang/prometheus/collectors"
19-
jaegercfg "github.com/uber/jaeger-client-go/config"
2019

2120
"github.com/grafana/mimir/pkg/continuoustest"
2221
"github.com/grafana/mimir/pkg/util/instrumentation"
@@ -44,7 +43,7 @@ func main() {
4443
level.Warn(util_log.Logger).Log("msg", "The mimir-continuous-test binary you are using is deprecated. Please use the Mimir binary module `mimir -target=continuous-test`.")
4544

4645
// Setting the environment variable JAEGER_AGENT_HOST enables tracing.
47-
if trace, err := tracing.NewFromEnv("mimir-continuous-test", jaegercfg.MaxTagValueLength(16e3)); err != nil {
46+
if trace, err := tracing.NewOTelFromJaegerEnv("mimir-continuous-test"); err != nil {
4847
level.Error(util_log.Logger).Log("msg", "Failed to setup tracing", "err", err.Error())
4948
} else {
5049
defer trace.Close()

cmd/mimir/main.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ import (
2222
"github.com/grafana/dskit/tracing"
2323
"github.com/pkg/errors"
2424
"github.com/prometheus/client_golang/prometheus"
25-
jaegercfg "github.com/uber/jaeger-client-go/config"
2625
"gopkg.in/yaml.v3"
2726

2827
"github.com/grafana/mimir/pkg/mimir"
@@ -193,7 +192,7 @@ func main() {
193192
}
194193

195194
// Setting the environment variable JAEGER_AGENT_HOST enables tracing.
196-
if trace, err := tracing.NewFromEnv(name, jaegercfg.MaxTagValueLength(16e3)); err != nil {
195+
if trace, err := tracing.NewOTelFromJaegerEnv(name); err != nil {
197196
level.Error(util_log.Logger).Log("msg", "Failed to setup tracing", "err", err.Error())
198197
} else {
199198
defer trace.Close()

cmd/query-tee/main.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ import (
1919
"github.com/prometheus/client_golang/prometheus"
2020
"github.com/prometheus/client_golang/prometheus/collectors"
2121
"github.com/prometheus/common/model"
22-
jaegercfg "github.com/uber/jaeger-client-go/config"
2322

2423
"github.com/grafana/mimir/pkg/util/instrumentation"
2524
util_log "github.com/grafana/mimir/pkg/util/log"
@@ -87,7 +86,7 @@ func initTracing() io.Closer {
8786
name = "query-tee"
8887
}
8988

90-
trace, err := tracing.NewFromEnv(name, jaegercfg.MaxTagValueLength(16e3))
89+
trace, err := tracing.NewOTelFromJaegerEnv(name)
9190
if err != nil {
9291
level.Error(util_log.Logger).Log("msg", "Failed to setup tracing", "err", err.Error())
9392
return nil

docs/sources/mimir/configure/configure-tracing.md

Lines changed: 17 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -11,36 +11,29 @@ weight: 100
1111

1212
# Configure Grafana Mimir tracing
1313

14-
Grafana Mimir uses [Jaeger](https://www.jaegertracing.io/) to implement distributed
15-
tracing. Jaeger is a valuable tool for troubleshooting the behavior of
16-
Grafana Mimir in production.
14+
Distributed tracing is a valuable tool for troubleshooting the behavior of Grafana Mimir in production.
15+
16+
Grafana Mimir is transitioning from [Jaeger](https://www.jaegertracing.io/) to [OpenTelemetry](https://opentelemetry.io/docs/languages/go/getting-started/) to implement distributed tracing.
17+
During this transition, Mimir uses OTel libraries, but performs the configuration using Jaeger environment variables.
1718

1819
## Dependencies
1920

20-
Set up Jaeger deployment to collect and store traces from Grafana Mimir. A
21-
deployment includes either the Jaeger all-in-one binary, or a distributed
22-
system of agents, collectors, and queriers. If you run Grafana Mimir on Kubernetes, refer to [Jaeger
23-
Kubernetes](https://github.com/jaegertracing/jaeger-kubernetes).
21+
Set up a Jaeger deployment to collect and store traces from Grafana Mimir.
22+
A deployment includes either the Jaeger all-in-one binary or a distributed system of agents, collectors, and queriers.
23+
If you run Grafana Mimir on Kubernetes, refer to [Jaeger Kubernetes](https://github.com/jaegertracing/jaeger-kubernetes).
2424

2525
## Configuration
2626

2727
To configure Grafana Mimir to send traces, perform the following steps:
2828

29-
1. Set the `JAEGER_AGENT_HOST` environment variable in all components to point
30-
to the Jaeger agent.
29+
1. Set the `JAEGER_AGENT_HOST` environment variable in all components to point to the Jaeger agent.
3130
1. Enable sampling in the appropriate components:
32-
- The ingester and ruler self-initiate traces and should have sampling
33-
explicitly enabled.
34-
- Sampling for the distributor and query-frontend can be enabled in Grafana Mimir
35-
or in an upstream service, like a proxy or gateway running in front of Grafana Mimir.
36-
37-
To enable sampling in Grafana Mimir components you can specify either
38-
`JAEGER_SAMPLER_MANAGER_HOST_PORT` for remote sampling, or
39-
`JAEGER_SAMPLER_TYPE` and `JAEGER_SAMPLER_PARAM` to manually set sampling
40-
configuration. Refer to [Jaeger Client Go
41-
documentation](https://github.com/jaegertracing/jaeger-client-go#environment-variables)
42-
for the full list of environment variables you can configure.
43-
44-
Note that you must specify one of `JAEGER_AGENT_HOST` or
45-
`JAEGER_SAMPLER_MANAGER_HOST_PORT` in each component for Jaeger to be enabled,
46-
even if you plan to use the default values.
31+
- The ingester and ruler self-initiate traces, so you should explicitly enable sampling in these components.
32+
- You can enable sampling for the distributor and query-frontend in Grafana Mimir or in an upstream service, like a proxy or gateway running in front of Grafana Mimir.
33+
34+
To enable sampling in Grafana Mimir components, you can specify either `JAEGER_SAMPLER_MANAGER_HOST_PORT` for remote sampling, or `JAEGER_SAMPLER_TYPE` and `JAEGER_SAMPLER_PARAM` to manually set sampling configuration.
35+
Refer to [Jaeger Client Go documentation](https://github.com/jaegertracing/jaeger-client-go#environment-variables) for the full list of environment variables you can configure.
36+
37+
{{< admonition type="note" >}}
38+
You must specify one of `JAEGER_AGENT_HOST` or `JAEGER_SAMPLER_MANAGER_HOST_PORT` in each component for Jaeger to be enabled, even if you plan to use the default values.
39+
{{< /admonition >}}

go.mod

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@ require (
2626
github.com/minio/minio-go/v7 v7.0.91
2727
github.com/mitchellh/go-wordwrap v1.0.1
2828
github.com/oklog/ulid v1.3.1 // indirect
29-
github.com/opentracing-contrib/go-grpc v0.1.2
30-
github.com/opentracing-contrib/go-stdlib v1.1.0
31-
github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b
29+
github.com/opentracing-contrib/go-grpc v0.1.2 // indirect
30+
github.com/opentracing-contrib/go-stdlib v1.1.0 // indirect
31+
github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b // indirect
3232
github.com/pkg/errors v0.9.1
3333
github.com/prometheus/alertmanager v0.28.1
3434
github.com/prometheus/client_golang v1.22.0
@@ -39,7 +39,7 @@ require (
3939
github.com/sirupsen/logrus v1.9.3
4040
github.com/spf13/afero v1.11.0
4141
github.com/stretchr/testify v1.10.0
42-
github.com/uber/jaeger-client-go v2.30.0+incompatible
42+
github.com/uber/jaeger-client-go v2.30.0+incompatible // indirect
4343
go.uber.org/atomic v1.11.0
4444
go.uber.org/goleak v1.3.0
4545
golang.org/x/crypto v0.38.0
@@ -85,7 +85,10 @@ require (
8585
github.com/twmb/franz-go/plugin/kprom v1.1.0
8686
github.com/xlab/treeprint v1.2.0
8787
go.opentelemetry.io/collector/pdata v1.31.0
88+
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0
89+
go.opentelemetry.io/contrib/propagators/jaeger v1.35.0
8890
go.opentelemetry.io/otel v1.35.0
91+
go.opentelemetry.io/otel/sdk v1.35.0
8992
go.opentelemetry.io/otel/trace v1.35.0
9093
go.opentelemetry.io/proto/otlp v1.5.0
9194
go.uber.org/multierr v1.11.0
@@ -177,13 +180,10 @@ require (
177180
go.opentelemetry.io/collector/processor v1.31.0 // indirect
178181
go.opentelemetry.io/contrib/bridges/otelzap v0.10.0 // indirect
179182
go.opentelemetry.io/contrib/detectors/gcp v1.35.0 // indirect
180-
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 // indirect
181183
go.opentelemetry.io/contrib/instrumentation/net/http/httptrace/otelhttptrace v0.60.0 // indirect
182-
go.opentelemetry.io/contrib/propagators/jaeger v1.35.0 // indirect
183184
go.opentelemetry.io/contrib/samplers/jaegerremote v0.29.0 // indirect
184185
go.opentelemetry.io/otel/exporters/jaeger v1.17.0 // indirect
185186
go.opentelemetry.io/otel/log v0.11.0 // indirect
186-
go.opentelemetry.io/otel/sdk v1.35.0 // indirect
187187
go.opentelemetry.io/otel/sdk/metric v1.35.0 // indirect
188188
gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc // indirect
189189
gopkg.in/mail.v2 v2.3.1 // indirect
@@ -311,7 +311,7 @@ require (
311311
go.etcd.io/etcd/client/v3 v3.5.4 // indirect
312312
go.mongodb.org/mongo-driver v1.14.0 // indirect
313313
go.opentelemetry.io/collector/semconv v0.125.0
314-
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect
314+
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0
315315
go.opentelemetry.io/otel/metric v1.35.0 // indirect
316316
go.uber.org/zap v1.27.0 // indirect
317317
golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8 // indirect

pkg/alertmanager/alertmanager.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ import (
6363
commoncfg "github.com/prometheus/common/config"
6464
"github.com/prometheus/common/model"
6565
"github.com/prometheus/common/route"
66+
"go.opentelemetry.io/otel"
6667
"go.uber.org/atomic"
6768
"golang.org/x/time/rate"
6869

@@ -84,6 +85,8 @@ const (
8485
silencesStateKeyPrefix = "sil:"
8586
)
8687

88+
var tracer = otel.Tracer("pkg/alertmanager")
89+
8790
// Config configures an Alertmanager.
8891
type Config struct {
8992
UserID string

pkg/alertmanager/alertmanager_client.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"github.com/pkg/errors"
1717
"github.com/prometheus/client_golang/prometheus"
1818
"github.com/prometheus/client_golang/prometheus/promauto"
19+
"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
1920
"google.golang.org/grpc"
2021
"google.golang.org/grpc/health/grpc_health_v1"
2122

@@ -111,6 +112,8 @@ func dialAlertmanagerClient(cfg grpcclient.Config, inst ring.InstanceDesc, reque
111112
if err != nil {
112113
return nil, err
113114
}
115+
opts = append(opts, grpc.WithStatsHandler(otelgrpc.NewClientHandler()))
116+
114117
// nolint:staticcheck // grpc.Dial() has been deprecated; we'll address it before upgrading to gRPC 2.
115118
conn, err := grpc.Dial(inst.Addr, opts...)
116119
if err != nil {

pkg/alertmanager/distributor.go

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ import (
2323
"github.com/grafana/dskit/services"
2424
"github.com/grafana/dskit/tenant"
2525
"github.com/grafana/dskit/user"
26-
"github.com/opentracing/opentracing-go"
2726
"github.com/pkg/errors"
2827
"github.com/prometheus/client_golang/prometheus"
2928

@@ -198,8 +197,8 @@ func (d *Distributor) doAll(userID string, w http.ResponseWriter, r *http.Reques
198197

199198
results, err := replicationSet.Do(r.Context(), 0, func(ctx context.Context, instance *ring.InstanceDesc) (any, error) {
200199
ctx = user.InjectOrgID(ctx, userID)
201-
sp, ctx := opentracing.StartSpanFromContext(ctx, "Distributor.doAll")
202-
defer sp.Finish()
200+
ctx, sp := tracer.Start(ctx, "Distributor.doAll")
201+
defer sp.End()
203202

204203
resp, err := d.doRequest(ctx, *instance, &httpgrpc.HTTPRequest{
205204
Method: r.Method,
@@ -259,8 +258,8 @@ func (d *Distributor) doQuorum(userID string, w http.ResponseWriter, r *http.Req
259258
err = ring.DoBatchWithOptions(r.Context(), RingOp, d.alertmanagerRing, []uint32{shardByUser(userID)}, func(am ring.InstanceDesc, _ []int) error {
260259
// Use a background context to make sure all alertmanagers get the request even if we return early.
261260
localCtx := user.InjectOrgID(context.Background(), userID)
262-
sp, localCtx := opentracing.StartSpanFromContext(localCtx, "Distributor.doQuorum")
263-
defer sp.Finish()
261+
localCtx, sp := tracer.Start(localCtx, "Distributor.doQuorum")
262+
defer sp.End()
264263

265264
resp, err := d.doRequest(localCtx, am, &httpgrpc.HTTPRequest{
266265
Method: r.Method,
@@ -327,8 +326,8 @@ func (d *Distributor) doUnary(userID string, w http.ResponseWriter, r *http.Requ
327326
Headers: httpToHttpgrpcHeaders(r.Header),
328327
}
329328

330-
sp, ctx := opentracing.StartSpanFromContext(r.Context(), "Distributor.doUnary")
331-
defer sp.Finish()
329+
ctx, sp := tracer.Start(r.Context(), "Distributor.doUnary")
330+
defer sp.End()
332331
// Until we have a mechanism to combine the results from multiple alertmanagers,
333332
// we forward the request to only one of the alertmanagers.
334333
amDesc := replicationSet.Instances[rand.Intn(len(replicationSet.Instances))]

pkg/blockbuilder/blockbuilder.go

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,13 @@ import (
1515
"github.com/grafana/dskit/backoff"
1616
"github.com/grafana/dskit/runutil"
1717
"github.com/grafana/dskit/services"
18-
otgrpc "github.com/opentracing-contrib/go-grpc"
19-
"github.com/opentracing/opentracing-go"
2018
"github.com/prometheus/client_golang/prometheus"
2119
"github.com/prometheus/prometheus/tsdb"
2220
"github.com/thanos-io/objstore"
2321
"github.com/twmb/franz-go/pkg/kadm"
2422
"github.com/twmb/franz-go/pkg/kgo"
23+
"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
24+
"go.opentelemetry.io/otel"
2525
"go.uber.org/atomic"
2626
"google.golang.org/grpc"
2727

@@ -35,6 +35,8 @@ import (
3535
"github.com/grafana/mimir/pkg/util/validation"
3636
)
3737

38+
var tracer = otel.Tracer("pkg/blockbuilder")
39+
3840
type BlockBuilder struct {
3941
services.Service
4042

@@ -128,17 +130,18 @@ func newWithSchedulerClient(
128130
}
129131

130132
func (b *BlockBuilder) makeSchedulerClient() (schedulerpb.SchedulerClient, *grpc.ClientConn, error) {
131-
dialOpts, err := b.cfg.SchedulerConfig.GRPCClientConfig.DialOption(
132-
[]grpc.UnaryClientInterceptor{otgrpc.OpenTracingClientInterceptor(opentracing.GlobalTracer())},
133+
opts, err := b.cfg.SchedulerConfig.GRPCClientConfig.DialOption(
134+
nil,
133135
nil,
134136
util.NewInvalidClusterValidationReporter(b.cfg.SchedulerConfig.GRPCClientConfig.ClusterValidation.Label, b.blockBuilderMetrics.invalidClusterValidation, b.logger),
135137
)
136138
if err != nil {
137139
return nil, nil, err
138140
}
141+
opts = append(opts, grpc.WithStatsHandler(otelgrpc.NewClientHandler()))
139142

140143
// nolint:staticcheck // grpc.Dial() has been deprecated; we'll address it before upgrading to gRPC 2.
141-
conn, err := grpc.Dial(b.cfg.SchedulerConfig.Address, dialOpts...)
144+
conn, err := grpc.Dial(b.cfg.SchedulerConfig.Address, opts...)
142145
if err != nil {
143146
return nil, nil, err
144147
}
@@ -242,7 +245,7 @@ func (b *BlockBuilder) consumeJob(ctx context.Context, key schedulerpb.JobKey, s
242245
b.blockBuilderMetrics.consumeJobDuration.WithLabelValues(success).Observe(time.Since(start).Seconds())
243246
}(time.Now())
244247

245-
sp, ctx := spanlogger.NewWithLogger(ctx, b.logger, "BlockBuilder.consumeJob")
248+
sp, ctx := spanlogger.New(ctx, b.logger, tracer, "BlockBuilder.consumeJob")
246249
defer sp.Finish()
247250

248251
logger := log.With(sp, "partition", spec.Partition, "job_id", key.Id, "job_epoch", key.Epoch)
@@ -445,7 +448,7 @@ func PartitionStateFromLag(logger log.Logger, lag kadm.GroupMemberLag, fallbackM
445448
// consumePartition consumes records from the given partition until the cycleEnd timestamp.
446449
// If the partition is lagging behind, it takes care of consuming it in sections.
447450
func (b *BlockBuilder) consumePartition(ctx context.Context, partition int32, state PartitionState, cycleEndTime time.Time, cycleEndOffset int64, logger log.Logger) (finalState PartitionState, err error) {
448-
sp, ctx := spanlogger.NewWithLogger(ctx, logger, "BlockBuilder.consumePartition")
451+
sp, ctx := spanlogger.New(ctx, logger, tracer, "BlockBuilder.consumePartition")
449452
defer sp.Finish()
450453

451454
logger = log.With(sp, "partition", partition, "cycle_end", cycleEndTime, "cycle_end_offset", cycleEndOffset)

0 commit comments

Comments
 (0)