Skip to content
Draft
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
8b48ad9
Add mechanism to detect leaked references to mimirpb buffers.
tcard Nov 21, 2025
56fd9b0
Simplify test.
tcard Nov 21, 2025
0b57a52
Plumb ingester with ref leaks detection.
tcard Nov 21, 2025
f27be92
Fix lint, test.
tcard Nov 21, 2025
86258b7
More fixing.
tcard Nov 21, 2025
71f14aa
Fix racy test
tcard Nov 21, 2025
f1ab684
make doc
tcard Nov 21, 2025
f6a6b41
CHANGELOG.
tcard Nov 21, 2025
ef1d54e
about-versioning.md
tcard Nov 21, 2025
69ca33a
Revert "mimirpb: Clone metadata fields rather than maintaining refere…
tcard Nov 21, 2025
9d6d2d0
Make test deterministic.
tcard Nov 24, 2025
efc7fec
Fix mimirpb build.
tcard Nov 24, 2025
9012300
Reapply "mimirpb: Clone metadata fields rather than maintaining refer…
tcard Nov 24, 2025
a6eb3e6
Lint.
tcard Nov 24, 2025
a6e1c78
make license
tcard Nov 24, 2025
de97dd5
Fixes from Cursor review.
tcard Nov 24, 2025
da56796
Apply suggestions from code review
tcard Nov 25, 2025
7ca73a3
pct -> percentage
tcard Nov 26, 2025
d9afc39
Don't rely on GC; use mmap/munmap instead.
tcard Nov 25, 2025
466a6de
Don't instrument empty buffers.
tcard Nov 26, 2025
628ee24
Atomic ref count.
tcard Nov 26, 2025
2dbb9a6
make reference-help
tcard Nov 26, 2025
4180411
Merge branch 'main' into instrument-ref-leaks
tcard Nov 27, 2025
de52a8a
Move things around to ensure no mmap leak.
tcard Dec 2, 2025
4b0636e
Reword description.
tcard Dec 2, 2025
c10028f
Register default global codec on init.
tcard Dec 2, 2025
25fef18
Don't wrap encoding.CodecV2.
tcard Dec 2, 2025
a47d9c2
Fewer public globals.
tcard Dec 2, 2025
49fc9a2
Document that pool won't be used, and make it explicitly so.
tcard Dec 2, 2025
b39485d
Wait before munmapping.
tcard Dec 2, 2025
8bff504
Consistent naming.
tcard Dec 2, 2025
e5f7446
Fix YAML unmarshaling.
tcard Dec 2, 2025
375561b
Propagate InstrumentRefLeaksBeforeReusePeriod.
tcard Dec 2, 2025
325f3c6
Forgot to mprotect.
tcard Dec 3, 2025
a3ff570
Merge branch 'main' of github.com:grafana/mimir into instrument-ref-l…
tcard Dec 3, 2025
739133c
Missing FreeBuffer in test.
tcard Dec 4, 2025
fc76511
blockbuilder: defer WriteRequest.FreeBuffer
tcard Dec 4, 2025
5785b01
Fix test build.
tcard Dec 4, 2025
f44250c
Add max-inflight-instrumented-bytes.
tcard Dec 4, 2025
ff97b8c
Pass inflightInstrumentedBytes around.
tcard Dec 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
* [ENHANCEMENT] OTLP: Add metric `cortex_distributor_otlp_requests_by_content_type_total` to track content type (json or proto) of OTLP packets. #13525
* [ENHANCEMENT] OTLP: Add experimental metric `cortex_distributor_otlp_array_lengths` to better understand the layout of OTLP packets in practice. #13525
* [ENHANCEMENT] Ruler: gRPC errors without details are classified as `operator` errors, and rule evaluation failures (such as duplicate labelsets) are classified as `user` errors. #13586
* [ENHANCEMENT] Add experimental flag `common.instrument-reference-leaks-percentage` to leaked references to gRPC buffers. #13609
* [BUGFIX] Compactor: Fix potential concurrent map writes. #13053
* [BUGFIX] Query-frontend: Fix issue where queries sometimes fail with `failed to receive query result stream message: rpc error: code = Canceled desc = context canceled` if remote execution is enabled. #13084
* [BUGFIX] Query-frontend: Fix issue where query stats, such as series read, did not include the parameters to the `histogram_quantile` and `histogram_fraction` functions if remote execution was enabled. #13084
Expand Down
22 changes: 22 additions & 0 deletions cmd/mimir/config-descriptor.json
Original file line number Diff line number Diff line change
Expand Up @@ -20769,6 +20769,28 @@
],
"fieldValue": null,
"fieldDefaultValue": null
},
{
"kind": "field",
"name": "instrument_ref_leaks_percentage",
"required": false,
"desc": "Percentage [0-100] of request or message buffers to instrument for reference leaks. Set to 0 to disable.",
"fieldValue": null,
"fieldDefaultValue": 0,
"fieldFlag": "common.instrument-reference-leaks-percentage",
"fieldType": "float",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "instrument_ref_leaks_before_reuse_period",
"required": false,
"desc": "Period after a buffer instrumented for referenced leaks is nominally freed until the buffer is uninstrumented and effectively freed to be reused. After this period, any lingering references to the buffer may potentially be dereferenced again with no detection.",
"fieldValue": null,
"fieldDefaultValue": 120000000000,
"fieldFlag": "common.instrument-reference-leaks-before-reuse-period",
"fieldType": "duration",
"fieldCategory": "experimental"
}
],
"fieldValue": null,
Expand Down
4 changes: 4 additions & 0 deletions cmd/mimir/help-all.txt.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -857,6 +857,10 @@ Usage of ./cmd/mimir/mimir:
TSDB WAL segments files max size (bytes). (default 134217728)
-common.client-cluster-validation.label string
[experimental] Primary cluster validation label.
-common.instrument-reference-leaks-before-reuse-period duration
[experimental] Period after a buffer instrumented for referenced leaks is nominally freed until the buffer is uninstrumented and effectively freed to be reused. After this period, any lingering references to the buffer may potentially be dereferenced again with no detection. (default 2m0s)
-common.instrument-reference-leaks-percentage float
[experimental] Percentage [0-100] of request or message buffers to instrument for reference leaks. Set to 0 to disable.
-common.storage.azure.account-key string
Azure storage account key. If unset, Azure managed identities will be used for authentication instead.
-common.storage.azure.account-name string
Expand Down
4 changes: 4 additions & 0 deletions docs/sources/mimir/configure/about-versioning.md
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,10 @@ The following features are currently experimental:
- Assuming that a gRPC client configuration can be reached via `-<grpc-client-config-path>`, cluster validation label is configured via: `-<grpc-client-config-path>.cluster-validation.label`.
- The cluster validation label of all gRPC clients can be configured via `-common.client-cluster-validation.label`.
- Requests with invalid cluster validation labels are tracked via the `cortex_client_invalid_cluster_validation_label_requests_total` metric.
- Common
- Instrument a fraction of pooled objects for references that outlive their lifetime.
- Only implemented for objects embedding `mimirpb.BufferHolder`.
- Flag: `-common.instrument-reference-leaks-percentage`
- Preferred available zone for querying ingesters and store-gateways
- `-querier.prefer-availability-zone`
- Memberlist zone-aware routing
Expand Down
12 changes: 12 additions & 0 deletions docs/sources/mimir/configure/configuration-parameters/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,18 @@ client_cluster_validation:
# (experimental) Primary cluster validation label.
# CLI flag: -common.client-cluster-validation.label
[label: <string> | default = ""]

# (experimental) Percentage [0-100] of request or message buffers to instrument
# for reference leaks. Set to 0 to disable.
# CLI flag: -common.instrument-reference-leaks-percentage
[instrument_ref_leaks_percentage: <float> | default = 0]

# (experimental) Period after a buffer instrumented for referenced leaks is
# nominally freed until the buffer is uninstrumented and effectively freed to be
# reused. After this period, any lingering references to the buffer may
# potentially be dereferenced again with no detection.
# CLI flag: -common.instrument-reference-leaks-before-reuse-period
[instrument_ref_leaks_before_reuse_period: <duration> | default = 2m]
```

### server
Expand Down
2 changes: 2 additions & 0 deletions operations/mimir/mimir-flags-defaults.json
Original file line number Diff line number Diff line change
Expand Up @@ -1475,6 +1475,8 @@
"common.storage.swift.request-timeout": 5000000000,
"common.storage.filesystem.dir": "",
"common.client-cluster-validation.label": "",
"common.instrument-reference-leaks-percentage": 0,
"common.instrument-reference-leaks-before-reuse-period": 120000000000,
"timeseries-unmarshal-caching-optimization-enabled": true,
"cost-attribution.eviction-interval": 1200000000000,
"cost-attribution.registry-path": "",
Expand Down
69 changes: 69 additions & 0 deletions pkg/ingester/ingester_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9136,6 +9136,44 @@ func TestIngesterMetadataMetrics(t *testing.T) {

}

func TestIngesterNoRW2MetadataRefLeaks(t *testing.T) {
reg := prometheus.NewPedanticRegistry()
cfg := defaultIngesterTestConfig(t)
cfg.MetadataRetainPeriod = 20 * time.Second

ing, r, err := prepareIngesterWithBlocksStorageAndLimits(t, cfg, defaultLimitsTestConfig(), nil, "", reg)
require.NoError(t, err)
startAndWaitHealthy(t, ing, r)

syms := util_test.NewSymbolTableBuilder(nil)
orig := makeTestRW2WriteRequest(syms)
buf, err := orig.Marshal()
require.NoError(t, err)

var wr mimirpb.PreallocWriteRequest
wr.UnmarshalFromRW2 = true
wr.SkipNormalizeMetadataMetricName = true
wr.SkipDeduplicateMetadata = true
err = mimirpb.Unmarshal(buf, &wr)
require.NoError(t, err)

ctx := user.InjectOrgID(context.Background(), userID)
_, err = ing.Push(ctx, &wr.WriteRequest)
require.NoError(t, err)

meta, err := ing.MetricsMetadata(ctx, &client.MetricsMetadataRequest{
Metric: "test_metric_total",
Limit: 1, LimitPerMetric: 1,
})
require.NoError(t, err)
require.Equal(t, []*mimirpb.MetricMetadata{{
MetricFamilyName: "test_metric_total",
Type: mimirpb.COUNTER,
Help: "test_metric_help",
Unit: "test_metric_unit",
}}, meta.Metadata)
}

func TestIngesterSendsOnlySeriesWithData(t *testing.T) {
ing, r, err := prepareIngesterWithBlocksStorageAndLimits(t, defaultIngesterTestConfig(t), defaultLimitsTestConfig(), nil, "", nil)
require.NoError(t, err)
Expand Down Expand Up @@ -12255,3 +12293,34 @@ func TestIngester_NotifyPreCommit(t *testing.T) {
// As there are three users, fsync should have been called at least three times
assert.GreaterOrEqual(t, fsyncCountAfter-fsyncCountBefore, uint64(3))
}

func makeTestRW2WriteRequest(syms *util_test.SymbolTableBuilder) *mimirpb.WriteRequest {
req := &mimirpb.WriteRequest{
TimeseriesRW2: []mimirpb.TimeSeriesRW2{
{
LabelsRefs: []uint32{syms.GetSymbol("__name__"), syms.GetSymbol("test_metric_total"), syms.GetSymbol("job"), syms.GetSymbol("test_job")},
Samples: []mimirpb.Sample{
{
Value: 123.456,
TimestampMs: 1234567890,
},
},
Exemplars: []mimirpb.ExemplarRW2{
{
Value: 123.456,
Timestamp: 1234567890,
LabelsRefs: []uint32{syms.GetSymbol("__name__"), syms.GetSymbol("test_metric_total"), syms.GetSymbol("traceID"), syms.GetSymbol("1234567890abcdef")},
},
},
Metadata: mimirpb.MetadataRW2{
Type: mimirpb.METRIC_TYPE_COUNTER,
HelpRef: syms.GetSymbol("test_metric_help"),
UnitRef: syms.GetSymbol("test_metric_unit"),
},
},
},
}
req.SymbolsRW2 = syms.GetSymbols()

return req
}
39 changes: 31 additions & 8 deletions pkg/mimir/mimir.go
Original file line number Diff line number Diff line change
Expand Up @@ -713,11 +713,21 @@ func UnmarshalCommonYAML(value *yaml.Node, inheriters ...CommonConfigInheriter)
for name, loc := range inheritance.ClientClusterValidation {
specificClusterValidationLocations[name] = loc
}
specificInstrumentRefLeaksPercentageLocations := specificLocationsUnmarshaler{}
for name, loc := range inheritance.InstrumentRefLeaksPercentage {
specificInstrumentRefLeaksPercentageLocations[name] = loc
}
specificInstrumentRefLeaksBeforeReusePeriodLocations := specificLocationsUnmarshaler{}
for name, loc := range inheritance.InstrumentRefLeaksBeforeReusePeriod {
specificInstrumentRefLeaksBeforeReusePeriodLocations[name] = loc
}

common := configWithCustomCommonUnmarshaler{
Common: &commonConfigUnmarshaler{
Storage: &specificStorageLocations,
ClientClusterValidation: &specificClusterValidationLocations,
Storage: &specificStorageLocations,
ClientClusterValidation: &specificClusterValidationLocations,
InstrumentRefLeaksPercentage: &specificInstrumentRefLeaksPercentageLocations,
InstrumentRefLeaksBeforeReusePeriod: &specificInstrumentRefLeaksBeforeReusePeriodLocations,
},
}

Expand Down Expand Up @@ -785,19 +795,25 @@ func inheritFlags(log log.Logger, orig flagext.RegisteredFlagsTracker, dest flag
}

type CommonConfig struct {
Storage bucket.StorageBackendConfig `yaml:"storage"`
ClientClusterValidation clusterutil.ClusterValidationConfig `yaml:"client_cluster_validation" category:"experimental"`
Storage bucket.StorageBackendConfig `yaml:"storage"`
ClientClusterValidation clusterutil.ClusterValidationConfig `yaml:"client_cluster_validation" category:"experimental"`
InstrumentRefLeaksPercentage float64 `yaml:"instrument_ref_leaks_percentage" category:"experimental"`
InstrumentRefLeaksBeforeReusePeriod time.Duration `yaml:"instrument_ref_leaks_before_reuse_period" category:"experimental"`
}

type CommonConfigInheritance struct {
Storage map[string]*bucket.StorageBackendConfig
ClientClusterValidation map[string]*clusterutil.ClusterValidationConfig
Storage map[string]*bucket.StorageBackendConfig
ClientClusterValidation map[string]*clusterutil.ClusterValidationConfig
InstrumentRefLeaksPercentage map[string]*float64
InstrumentRefLeaksBeforeReusePeriod map[string]*time.Duration
}

// RegisterFlags registers flag.
func (c *CommonConfig) RegisterFlags(f *flag.FlagSet) {
c.Storage.RegisterFlagsWithPrefix("common.storage.", f)
c.ClientClusterValidation.RegisterFlagsWithPrefix("common.client-cluster-validation.", f)
f.Float64Var(&c.InstrumentRefLeaksPercentage, "common.instrument-reference-leaks-percentage", 0, `Percentage [0-100] of request or message buffers to instrument for reference leaks. Set to 0 to disable.`)
f.DurationVar(&c.InstrumentRefLeaksBeforeReusePeriod, "common.instrument-reference-leaks-before-reuse-period", 2*time.Minute, `Period after a buffer instrumented for referenced leaks is nominally freed until the buffer is uninstrumented and effectively freed to be reused. After this period, any lingering references to the buffer may potentially be dereferenced again with no detection.`)
}

// configWithCustomCommonUnmarshaler unmarshals config with custom unmarshaler for the `common` field.
Expand All @@ -812,8 +828,10 @@ type configWithCustomCommonUnmarshaler struct {

// commonConfigUnmarshaler will unmarshal each field of the common config into specific locations.
type commonConfigUnmarshaler struct {
Storage *specificLocationsUnmarshaler `yaml:"storage"`
ClientClusterValidation *specificLocationsUnmarshaler `yaml:"client_cluster_validation"`
Storage *specificLocationsUnmarshaler `yaml:"storage"`
ClientClusterValidation *specificLocationsUnmarshaler `yaml:"client_cluster_validation"`
InstrumentRefLeaksPercentage *specificLocationsUnmarshaler `yaml:"instrument_ref_leaks_percentage"`
InstrumentRefLeaksBeforeReusePeriod *specificLocationsUnmarshaler `yaml:"instrument_ref_leaks_before_reuse_period"`
}

// specificLocationsUnmarshaler will unmarshal yaml into specific locations.
Expand Down Expand Up @@ -909,6 +927,11 @@ func New(cfg Config, reg prometheus.Registerer) (*Mimir, error) {

setUpGoRuntimeMetrics(cfg, reg)

mimirpb.CustomCodecConfig{
InstrumentRefLeaksPct: cfg.Common.InstrumentRefLeaksPercentage,
WaitBeforeReuseInstrumentedBuffer: cfg.Common.InstrumentRefLeaksBeforeReusePeriod,
}.RegisterGlobally()

if cfg.TenantFederation.Enabled && cfg.Ruler.TenantFederation.Enabled {
util_log.WarnExperimentalUse("ruler.tenant-federation")
}
Expand Down
12 changes: 12 additions & 0 deletions pkg/mimir/mimir_config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ func TestCommonConfigCanBeExtended(t *testing.T) {
args := []string{
"-common.storage.backend", "s3",
"-common.client-cluster-validation.label", "client-cluster",
"-common.instrument-reference-leaks-percentage", "13.37",
"-common.instrument-reference-leaks-before-reuse-period", "20h",
}
require.NoError(t, fs.Parse(args))

Expand All @@ -36,6 +38,10 @@ func TestCommonConfigCanBeExtended(t *testing.T) {

// Mimir's inheritance should still work.
checkAllClusterValidationLabels(t, cfg, "client-cluster")

// Non-inherited flags still work.
require.Equal(t, 13.37, cfg.MimirConfig.Common.InstrumentRefLeaksPercentage)
require.Equal(t, 20*time.Hour, cfg.MimirConfig.Common.InstrumentRefLeaksBeforeReusePeriod)
})

t.Run("yaml inheritance", func(t *testing.T) {
Expand All @@ -45,6 +51,8 @@ common:
backend: s3
client_cluster_validation:
label: client-cluster
instrument_ref_leaks_percentage: 13.37
instrument_ref_leaks_before_reuse_period: 20h
`

var cfg customExtendedConfig
Expand All @@ -60,6 +68,10 @@ common:

// Mimir's inheritance should still work.
checkAllClusterValidationLabels(t, cfg, "client-cluster")

// Non-inherited flags should still work.
require.Equal(t, 13.37, cfg.MimirConfig.Common.InstrumentRefLeaksPercentage)
require.Equal(t, 20*time.Hour, cfg.MimirConfig.Common.InstrumentRefLeaksBeforeReusePeriod)
})
}

Expand Down
Loading
Loading