Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions go.work.sum
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,12 @@ github.com/grafana/e2e v0.1.1/go.mod h1:RpNLgae5VT+BUHvPE+/zSypmOXKwEu4t+tnEMS1A
github.com/grafana/grafana-cloud-migration-snapshot v1.2.0 h1:FCUWASPPzGGbF2jTutR5i3rmoQdmnC4bypwJswdW3fI=
github.com/grafana/grafana-cloud-migration-snapshot v1.2.0/go.mod h1:bd6Cm06EK0MzRO5ahUpbDz1SxNOKu+fzladbaRPHZPY=
github.com/grafana/grafana/pkg/apimachinery v0.0.0-20240701135906-559738ce6ae1/go.mod h1:DkxMin+qOh1Fgkxfbt+CUfBqqsCQJMG9op8Os/irBPA=
github.com/grafana/grafana-azure-sdk-go/v2 v2.1.0 h1:lajVqTWaE96MpbjZToj7EshvqgRWOfYNkD4MbIZizaY=
github.com/grafana/grafana-azure-sdk-go/v2 v2.1.0/go.mod h1:aKlFPE36IDa8qccRg3KbgZX3MQ5xymS3RelT4j6kkVU=
github.com/grafana/grafana-plugin-sdk-go v0.235.0/go.mod h1:6n9LbrjGL3xAATntYVNcIi90G9BVHRJjzHKz5FXVfWw=
github.com/grafana/grafana/pkg/apimachinery v0.0.0-20240701135906-559738ce6ae1/go.mod h1:DkxMin+qOh1Fgkxfbt+CUfBqqsCQJMG9op8Os/irBPA=
github.com/grafana/prometheus-alertmanager v0.25.1-0.20240422145632-c33c6b5b6e6b h1:HCbWyVL6vi7gxyO76gQksSPH203oBJ1MJ3JcG1OQlsg=
github.com/grafana/prometheus-alertmanager v0.25.1-0.20240422145632-c33c6b5b6e6b/go.mod h1:01sXtHoRwI8W324IPAzuxDFOmALqYLCOhvSC2fUHWXc=
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7 h1:pdN6V1QBWetyv/0+wjACpqVH+eVULgEjkurDLq3goeM=
github.com/grpc-ecosystem/grpc-opentracing v0.0.0-20180507213350-8e809c8a8645 h1:MJG/KsmcqMwFAkh8mTnAwhyKoB+sTAnY4CACC110tbU=
github.com/grpc-ecosystem/grpc-opentracing v0.0.0-20180507213350-8e809c8a8645/go.mod h1:6iZfnjpejD4L/4DwD7NryNaJyCQdzwWwH2MWhCA90Kw=
Expand Down Expand Up @@ -793,15 +799,24 @@ go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.23.1 h1:ZqR
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.23.1/go.mod h1:D7ynngPWlGJrqyGSDOdscuv7uqttfCE3jcBvffDv9y4=
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.23.1 h1:q/Nj5/2TZRIt6PderQ9oU0M00fzoe8UZuINGw6ETGTw=
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.23.1/go.mod h1:DTE9yAu6r08jU3xa68GiSeI7oRcSEQ2RpKbbQGO+dWM=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.26.0/go.mod h1:z46paqbJ9l7c9fIPCXTqTGwhQZ5XoTIsfeFYWboizjs=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.26.0/go.mod h1:wnJIG4fOqyynOnnQF/eQb4/16VlX2EJAHhHgqIqWfAo=
go.opentelemetry.io/otel/exporters/prometheus v0.37.0 h1:NQc0epfL0xItsmGgSXgfbH2C1fq2VLXkZoDFsfRNHpc=
go.opentelemetry.io/otel/exporters/prometheus v0.37.0/go.mod h1:hB8qWjsStK36t50/R0V2ULFb4u95X/Q6zupXLgvjTh8=
go.opentelemetry.io/otel/exporters/prometheus v0.46.0 h1:I8WIFXR351FoLJYuloU4EgXbtNX2URfU/85pUPheIEQ=
go.opentelemetry.io/otel/exporters/prometheus v0.46.0/go.mod h1:ztwVUHe5DTR/1v7PeuGRnU5Bbd4QKYwApWmuutKsJSs=
go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.23.1 h1:C8r95vDR125t815KD+b1tI0Fbc1pFnwHTBxkbIZ6Szc=
go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.23.1/go.mod h1:Qr0qomr64jentMtOjWMbtYeJMSuMSlsPEjmnRA2sWZ4=
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.24.0 h1:s0PHtIkN+3xrbDOpt2M8OTG92cWqUESvzh2MxiR5xY8=
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.24.0/go.mod h1:hZlFbDbRt++MMPCCfSJfmhkGIWnX1h3XjkfxZUjLrIA=
go.opentelemetry.io/otel/metric v1.26.0/go.mod h1:SY+rHOI4cEawI9a7N1A4nIg/nTQXe1ccCNWYOJUrpX4=
go.opentelemetry.io/otel/sdk v1.26.0/go.mod h1:0p8MXpqLeJ0pzcszQQN4F0S5FVjBLgypeGSngLsmirs=
go.opentelemetry.io/otel/sdk/metric v0.39.0 h1:Kun8i1eYf48kHH83RucG93ffz0zGV1sh46FAScOTuDI=
go.opentelemetry.io/otel/sdk/metric v0.39.0/go.mod h1:piDIRgjcK7u0HCL5pCA4e74qpK/jk3NiUoAHATVAmiI=
go.opentelemetry.io/otel/sdk/metric v1.26.0 h1:cWSks5tfriHPdWFnl+qpX3P681aAYqlZHcAyHw5aU9Y=
go.opentelemetry.io/otel/sdk/metric v1.26.0/go.mod h1:ClMFFknnThJCksebJwz7KIyEDHO+nTB6gK8obLy8RyE=
go.opentelemetry.io/otel/trace v1.26.0/go.mod h1:4iDxvGDQuUkHve82hJJ8UqrwswHYsZuWCBllGV2U2y0=
go.opentelemetry.io/proto/otlp v1.2.0/go.mod h1:gGpR8txAl5M03pDhMC79G6SdqNV26naRm/KDsgaHD8A=
go.uber.org/automaxprocs v1.5.3 h1:kWazyxZUrS3Gs4qUpbwo5kEIMGe/DAvi5Z4tl2NW4j8=
go.uber.org/tools v0.0.0-20190618225709-2cfd321de3ee h1:0mgffUl7nfd+FpvXMVz4IDEaUSmT1ysygQC7qYo7sG4=
golang.org/x/arch v0.4.0 h1:A8WCeEWhLwPBKNbFi5Wv5UTCBx5zzubnXDlMOFAzFMc=
Expand All @@ -825,6 +840,7 @@ google.golang.org/genproto/googleapis/api v0.0.0-20240318140521-94a12d6c2237/go.
google.golang.org/genproto/googleapis/api v0.0.0-20240604185151-ef581f913117/go.mod h1:OimBR/bc1wPO9iV4NC2bpyjy3VnAwZh5EBPQdtaE5oo=
google.golang.org/genproto/googleapis/bytestream v0.0.0-20240325203815-454cdb8f5daa h1:wBkzraZsSqhj1M4L/nMrljUU6XasJkgHvUsq8oRGwF0=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240318140521-94a12d6c2237/go.mod h1:WtryC6hu0hhx87FDGxWCDptyssuo68sk10vYjF+T9fY=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240513163218-0867130af1f8/go.mod h1:I7Y+G38R2bu5j1aLzfFmQfTcU/WnFuqDwLZAbvKTKpM=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240604185151-ef581f913117/go.mod h1:EfXuqaE1J41VCDicxHzUDm+8rk+7ZdXzHV0IhO/I6s0=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY=
google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0 h1:M1YKkFIboKNieVO5DLUEVzQfGwJD30Nv2jfUgzb5UcE=
Expand Down
1 change: 0 additions & 1 deletion pkg/apiserver/rest/dualwriter_mode1_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,6 @@ func TestMode1_Get(t *testing.T) {
tt.setupStorageFn(m, tt.input)
}

p := prometheus.NewRegistry()
dw := NewDualWriter(Mode1, ls, us, p)

obj, err := dw.Get(context.Background(), tt.input, &metav1.GetOptions{})
Expand Down
158 changes: 98 additions & 60 deletions pkg/apiserver/rest/dualwriter_mode3.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@ package rest

import (
"context"
"errors"
"time"

apierrors "k8s.io/apimachinery/pkg/api/errors"
metainternalversion "k8s.io/apimachinery/pkg/apis/meta/internalversion"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
Expand All @@ -21,114 +22,151 @@ type DualWriterMode3 struct {
// newDualWriterMode3 returns a new DualWriter in mode 3.
// Mode 3 represents writing to LegacyStorage and Storage and reading from Storage.
func newDualWriterMode3(legacy LegacyStorage, storage Storage, dwm *dualWriterMetrics) *DualWriterMode3 {
return &DualWriterMode3{Legacy: legacy, Storage: storage, Log: klog.NewKlogr().WithName("DualWriterMode3"), dualWriterMetrics: dwm}
return &DualWriterMode3{Legacy: legacy, Storage: storage, Log: klog.NewKlogr().WithName("DualWriterMode3").WithValues("mode", mode3Str), dualWriterMetrics: dwm}
}

// Mode returns the mode of the dual writer.
func (d *DualWriterMode3) Mode() DualWriterMode {
return Mode3
}

const mode3Str = "3"

// Create overrides the behavior of the generic DualWriter and writes to LegacyStorage and Storage.
func (d *DualWriterMode3) Create(ctx context.Context, obj runtime.Object, createValidation rest.ValidateObjectFunc, options *metav1.CreateOptions) (runtime.Object, error) {
log := klog.FromContext(ctx)
var method = "create"
log := d.Log.WithValues("kind", options.Kind, "method", method)
ctx = klog.NewContext(ctx, log)

startStorage := time.Now()
created, err := d.Storage.Create(ctx, obj, createValidation, options)
if err != nil {
log.Error(err, "unable to create object in storage")
d.recordLegacyDuration(true, mode3Str, options.Kind, method, startStorage)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Broken Storage Performance Monitoring

When Storage.Create fails, the code incorrectly calls recordLegacyDuration instead of recordStorageDuration. This records the storage operation's duration under the wrong metric, making it impossible to accurately monitor storage performance and distinguish between legacy and storage failures.

Fix in Cursor Fix in Web

return created, err
}
d.recordStorageDuration(false, mode3Str, options.Kind, method, startStorage)

if _, err := d.Legacy.Create(ctx, obj, createValidation, options); err != nil {
log.WithValues("object", created).Error(err, "unable to create object in legacy storage")
}
return created, nil
go func() {
ctx, cancel := context.WithTimeoutCause(ctx, time.Second*10, errors.New("legacy create timeout"))
defer cancel()

startLegacy := time.Now()
_, errObjectSt := d.Legacy.Create(ctx, obj, createValidation, options)
d.recordLegacyDuration(errObjectSt != nil, mode3Str, options.Kind, method, startLegacy)
}()

return created, err
}

// Get overrides the behavior of the generic DualWriter and retrieves an object from Storage.
func (d *DualWriterMode3) Get(ctx context.Context, name string, options *metav1.GetOptions) (runtime.Object, error) {
return d.Storage.Get(ctx, name, &metav1.GetOptions{})
var method = "get"
log := d.Log.WithValues("kind", options.Kind, "name", name, "method", method)
ctx = klog.NewContext(ctx, log)

startStorage := time.Now()
res, err := d.Storage.Get(ctx, name, options)
if err != nil {
log.Error(err, "unable to get object in storage")
}
d.recordStorageDuration(err != nil, mode3Str, options.Kind, method, startStorage)

return res, err
}

func (d *DualWriterMode3) Delete(ctx context.Context, name string, deleteValidation rest.ValidateObjectFunc, options *metav1.DeleteOptions) (runtime.Object, bool, error) {
log := d.Log.WithValues("name", name)
// List overrides the behavior of the generic DualWriter and reads only from Unified Store.
func (d *DualWriterMode3) List(ctx context.Context, options *metainternalversion.ListOptions) (runtime.Object, error) {
var method = "list"
log := d.Log.WithValues("kind", options.Kind, "resourceVersion", options.ResourceVersion, "method", method)
ctx = klog.NewContext(ctx, log)

deleted, async, err := d.Storage.Delete(ctx, name, deleteValidation, options)
startStorage := time.Now()
res, err := d.Storage.List(ctx, options)
if err != nil {
if !apierrors.IsNotFound(err) {
log.Error(err, "could not delete from unified store")
return deleted, async, err
}
log.Error(err, "unable to list object in storage")
}
d.recordStorageDuration(err != nil, mode3Str, options.Kind, method, startStorage)

_, _, errLS := d.Legacy.Delete(ctx, name, deleteValidation, options)
if errLS != nil {
if !apierrors.IsNotFound(errLS) {
log.WithValues("deleted", deleted).Error(errLS, "could not delete from legacy store")
}
return res, err
}

func (d *DualWriterMode3) Delete(ctx context.Context, name string, deleteValidation rest.ValidateObjectFunc, options *metav1.DeleteOptions) (runtime.Object, bool, error) {
var method = "delete"
log := d.Log.WithValues("name", name, "kind", options.Kind, "method", method)
ctx = klog.NewContext(ctx, d.Log)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Delete Operations Drop Critical Logging Context

The Delete function passes d.Log to klog.NewContext instead of the enriched log variable that includes name, kind, and method values. This causes the context to lose important logging metadata, making it harder to trace and debug delete operations.

Fix in Cursor Fix in Web


startStorage := time.Now()
res, async, err := d.Storage.Delete(ctx, name, deleteValidation, options)
if err != nil {
log.Error(err, "unable to delete object in storage")
d.recordStorageDuration(true, mode3Str, options.Kind, method, startStorage)
return res, async, err
}
d.recordStorageDuration(false, mode3Str, name, method, startStorage)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Incorrect Metric Labels Break Monitoring

In the success path of Delete, the code passes name (the object name) instead of options.Kind as the third parameter to recordStorageDuration. This causes metrics to be labeled with object names rather than resource kinds, breaking metric aggregation and making monitoring ineffective.

Fix in Cursor Fix in Web


return deleted, async, err
go func() {
startLegacy := time.Now()
ctx, cancel := context.WithTimeoutCause(ctx, time.Second*10, errors.New("legacy delete timeout"))
defer cancel()
_, _, err := d.Legacy.Delete(ctx, name, deleteValidation, options)
d.recordLegacyDuration(err != nil, mode3Str, options.Kind, method, startLegacy)
}()

return res, async, err
}

// Update overrides the behavior of the generic DualWriter and writes first to Storage and then to LegacyStorage.
func (d *DualWriterMode3) Update(ctx context.Context, name string, objInfo rest.UpdatedObjectInfo, createValidation rest.ValidateObjectFunc, updateValidation rest.ValidateObjectUpdateFunc, forceAllowCreate bool, options *metav1.UpdateOptions) (runtime.Object, bool, error) {
log := d.Log.WithValues("name", name)
var method = "update"
log := d.Log.WithValues("name", name, "kind", options.Kind, "method", method)
ctx = klog.NewContext(ctx, log)
old, err := d.Storage.Get(ctx, name, &metav1.GetOptions{})
if err != nil {
log.WithValues("object", old).Error(err, "could not get object to update")
return nil, false, err
}

updated, err := objInfo.UpdatedObject(ctx, old)
startStorage := time.Now()
res, async, err := d.Storage.Update(ctx, name, objInfo, createValidation, updateValidation, forceAllowCreate, options)
if err != nil {
log.WithValues("object", updated).Error(err, "could not update or create object")
return nil, false, err
}
objInfo = &updateWrapper{
upstream: objInfo,
updated: updated,
log.Error(err, "unable to update in storage")
d.recordLegacyDuration(true, mode3Str, options.Kind, method, startStorage)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Metric Mix-up Corrupts Storage Observability

When Storage.Update fails, the code incorrectly calls recordLegacyDuration instead of recordStorageDuration. This records the storage operation's duration under the wrong metric, corrupting observability data and preventing accurate monitoring of storage vs legacy performance.

Fix in Cursor Fix in Web

return res, async, err
}
d.recordStorageDuration(false, mode3Str, options.Kind, method, startStorage)

obj, created, err := d.Storage.Update(ctx, name, objInfo, createValidation, updateValidation, forceAllowCreate, options)
if err != nil {
log.WithValues("object", obj).Error(err, "could not write to US")
return obj, created, err
}
go func() {
ctx, cancel := context.WithTimeoutCause(ctx, time.Second*10, errors.New("legacy update timeout"))

_, _, errLeg := d.Legacy.Update(ctx, name, &updateWrapper{
upstream: objInfo,
updated: obj,
}, createValidation, updateValidation, forceAllowCreate, options)
if errLeg != nil {
log.Error(errLeg, "could not update object in legacy store")
}
return obj, created, err
startLegacy := time.Now()
defer cancel()
_, _, errObjectSt := d.Legacy.Update(ctx, name, objInfo, createValidation, updateValidation, forceAllowCreate, options)
d.recordLegacyDuration(errObjectSt != nil, mode3Str, options.Kind, method, startLegacy)
}()

return res, async, err
}

// DeleteCollection overrides the behavior of the generic DualWriter and deletes from both LegacyStorage and Storage.
func (d *DualWriterMode3) DeleteCollection(ctx context.Context, deleteValidation rest.ValidateObjectFunc, options *metav1.DeleteOptions, listOptions *metainternalversion.ListOptions) (runtime.Object, error) {
log := d.Log.WithValues("kind", options.Kind, "resourceVersion", listOptions.ResourceVersion)
var method = "delete-collection"
log := d.Log.WithValues("kind", options.Kind, "resourceVersion", listOptions.ResourceVersion, "method", method)
ctx = klog.NewContext(ctx, log)

deleted, err := d.Storage.DeleteCollection(ctx, deleteValidation, options, listOptions)
startStorage := time.Now()
res, err := d.Storage.DeleteCollection(ctx, deleteValidation, options, listOptions)
if err != nil {
log.Error(err, "failed to delete collection successfully from Storage")
log.Error(err, "unable to delete collection in storage")
d.recordStorageDuration(true, mode3Str, options.Kind, method, startStorage)
return res, err
}
d.recordStorageDuration(false, mode3Str, options.Kind, method, startStorage)

if deleted, err := d.Legacy.DeleteCollection(ctx, deleteValidation, options, listOptions); err != nil {
log.WithValues("deleted", deleted).Error(err, "failed to delete collection successfully from LegacyStorage")
}
go func() {
startLegacy := time.Now()
ctx, cancel := context.WithTimeoutCause(ctx, time.Second*10, errors.New("legacy deletecollection timeout"))
defer cancel()
_, err := d.Legacy.DeleteCollection(ctx, deleteValidation, options, listOptions)
d.recordStorageDuration(err != nil, mode3Str, options.Kind, method, startLegacy)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Legacy Metrics Pollute Storage Data

In the DeleteCollection goroutine that executes the legacy operation, the code calls recordStorageDuration instead of recordLegacyDuration. This misattributes legacy operation metrics to storage, making it impossible to accurately monitor and compare the performance of legacy vs storage systems.

Fix in Cursor Fix in Web

}()

return deleted, err
}

func (d *DualWriterMode3) List(ctx context.Context, options *metainternalversion.ListOptions) (runtime.Object, error) {
//TODO: implement List
klog.Error("List not implemented")
return nil, nil
return res, err
}

func (d *DualWriterMode3) Destroy() {
Expand Down
Loading
Loading