From 11155f01d4870e2d89fda0e3f46870519a4488e2 Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Fri, 31 Oct 2025 07:35:40 +0545 Subject: [PATCH 01/35] feat: trace watchable messages Signed-off-by: Shreemaan Abhishek --- api/v1alpha1/envoygateway_metrics_types.go | 49 ++++++++ api/v1alpha1/envoygateway_types.go | 1 + api/v1alpha1/envoyproxy_metric_types.go | 2 + api/v1alpha1/zz_generated.deepcopy.go | 66 +++++++++++ go.mod | 3 +- internal/cmd/server.go | 6 + internal/gatewayapi/resource/resource.go | 8 ++ internal/gatewayapi/runner/runner.go | 39 +++++- internal/globalratelimit/runner/runner.go | 6 +- .../globalratelimit/runner/runner_test.go | 6 +- internal/message/types.go | 17 ++- internal/message/watchutil_test.go | 17 ++- internal/provider/kubernetes/controller.go | 13 +- internal/traces/register.go | 112 ++++++++++++++++++ internal/xds/runner/runner.go | 29 ++++- internal/xds/runner/runner_test.go | 18 ++- site/content/en/latest/api/extension_types.md | 49 ++++++++ 17 files changed, 413 insertions(+), 28 deletions(-) create mode 100644 internal/traces/register.go diff --git a/api/v1alpha1/envoygateway_metrics_types.go b/api/v1alpha1/envoygateway_metrics_types.go index 62aeec39519..f3b0438dbca 100644 --- a/api/v1alpha1/envoygateway_metrics_types.go +++ b/api/v1alpha1/envoygateway_metrics_types.go @@ -15,6 +15,15 @@ type EnvoyGatewayMetrics struct { Prometheus *EnvoyGatewayPrometheusProvider `json:"prometheus,omitempty"` } +// EnvoyGatewayMetrics defines control plane push/pull metrics configurations. +type EnvoyGatewayTraces struct { + // Sink defines the metric sink where metrics are sent to. + Sink EnvoyGatewayTraceSink `json:"sink,omitempty"` + // Disable disables the traces. + // TODO: implement disability + Disable bool `json:"enable,omitempty"` +} + // EnvoyGatewayMetricSink defines control plane // metric sinks where metrics are sent to. type EnvoyGatewayMetricSink struct { @@ -28,6 +37,46 @@ type EnvoyGatewayMetricSink struct { OpenTelemetry *EnvoyGatewayOpenTelemetrySink `json:"openTelemetry,omitempty"` } +// EnvoyGatewayTraceSink defines control plane +// trace sinks where traces are sent to. +type EnvoyGatewayTraceSink struct { + // Type defines the trace sink type. + // EG control plane currently supports OpenTelemetry. + // +kubebuilder:validation:Enum=OpenTelemetry + // +kubebuilder:default=OpenTelemetry + Type TraceSinkType `json:"type"` // TODO: is this even needed? + // OpenTelemetry defines the configuration for OpenTelemetry sink. + // It's required if the sink type is OpenTelemetry. + OpenTelemetry *EnvoyGatewayOpenTelemetrySink `json:"openTelemetry,omitempty"` +} + +type EnvoyGatewayTracingSink struct { + // Host define the sink service hostname. + Host string `json:"host"` + // Protocol define the sink service protocol. + // +kubebuilder:validation:Enum=grpc;http + Protocol string `json:"protocol"` + // Port defines the port the sink service is exposed on. + // + // +optional + // +kubebuilder:validation:Minimum=0 + // +kubebuilder:default=4319 + Port int32 `json:"port,omitempty"` + // ExportInterval configures the intervening time between exports for a + // Sink. This option overrides any value set for the + // OTEL_METRIC_EXPORT_INTERVAL environment variable. + // If ExportInterval is less than or equal to zero, 60 seconds + // is used as the default. + ExportInterval *gwapiv1.Duration `json:"exportInterval,omitempty"` + // ExportTimeout configures the time a Sink waits for an export to + // complete before canceling it. This option overrides any value set for the + // OTEL_METRIC_EXPORT_TIMEOUT environment variable. + // If ExportTimeout is less than or equal to zero, 30 seconds + // is used as the default. + ExportTimeout *gwapiv1.Duration `json:"exportTimeout,omitempty"` + //TODO sampling rate +} + type EnvoyGatewayOpenTelemetrySink struct { // Host define the sink service hostname. Host string `json:"host"` diff --git a/api/v1alpha1/envoygateway_types.go b/api/v1alpha1/envoygateway_types.go index 42dd3adfc18..0172c299d01 100644 --- a/api/v1alpha1/envoygateway_types.go +++ b/api/v1alpha1/envoygateway_types.go @@ -168,6 +168,7 @@ type LeaderElection struct { type EnvoyGatewayTelemetry struct { // Metrics defines metrics configuration for envoy gateway. Metrics *EnvoyGatewayMetrics `json:"metrics,omitempty"` + Traces *EnvoyGatewayTraces `json:"traces,omitempty"` } // EnvoyGatewayLogging defines logging for Envoy Gateway. diff --git a/api/v1alpha1/envoyproxy_metric_types.go b/api/v1alpha1/envoyproxy_metric_types.go index 320b7436caa..e1c031c8fc7 100644 --- a/api/v1alpha1/envoyproxy_metric_types.go +++ b/api/v1alpha1/envoyproxy_metric_types.go @@ -6,9 +6,11 @@ package v1alpha1 type MetricSinkType string +type TraceSinkType string const ( MetricSinkTypeOpenTelemetry MetricSinkType = "OpenTelemetry" + TraceSinkTypeOpenTelemetry TraceSinkType = "OpenTelemetry" ) type ProxyMetrics struct { diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index c06d4d02c98..c64e03e0284 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -2296,6 +2296,11 @@ func (in *EnvoyGatewayTelemetry) DeepCopyInto(out *EnvoyGatewayTelemetry) { *out = new(EnvoyGatewayMetrics) (*in).DeepCopyInto(*out) } + if in.Traces != nil { + in, out := &in.Traces, &out.Traces + *out = new(EnvoyGatewayTraces) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayTelemetry. @@ -2328,6 +2333,67 @@ func (in *EnvoyGatewayTopologyInjector) DeepCopy() *EnvoyGatewayTopologyInjector return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EnvoyGatewayTraceSink) DeepCopyInto(out *EnvoyGatewayTraceSink) { + *out = *in + if in.OpenTelemetry != nil { + in, out := &in.OpenTelemetry, &out.OpenTelemetry + *out = new(EnvoyGatewayOpenTelemetrySink) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayTraceSink. +func (in *EnvoyGatewayTraceSink) DeepCopy() *EnvoyGatewayTraceSink { + if in == nil { + return nil + } + out := new(EnvoyGatewayTraceSink) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EnvoyGatewayTraces) DeepCopyInto(out *EnvoyGatewayTraces) { + *out = *in + in.Sink.DeepCopyInto(&out.Sink) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayTraces. +func (in *EnvoyGatewayTraces) DeepCopy() *EnvoyGatewayTraces { + if in == nil { + return nil + } + out := new(EnvoyGatewayTraces) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EnvoyGatewayTracingSink) DeepCopyInto(out *EnvoyGatewayTracingSink) { + *out = *in + if in.ExportInterval != nil { + in, out := &in.ExportInterval, &out.ExportInterval + *out = new(v1.Duration) + **out = **in + } + if in.ExportTimeout != nil { + in, out := &in.ExportTimeout, &out.ExportTimeout + *out = new(v1.Duration) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayTracingSink. +func (in *EnvoyGatewayTracingSink) DeepCopy() *EnvoyGatewayTracingSink { + if in == nil { + return nil + } + out := new(EnvoyGatewayTracingSink) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *EnvoyJSONPatchConfig) DeepCopyInto(out *EnvoyJSONPatchConfig) { *out = *in diff --git a/go.mod b/go.mod index fafae13a4be..a08fd505bbf 100644 --- a/go.mod +++ b/go.mod @@ -54,6 +54,8 @@ require ( go.opentelemetry.io/otel v1.38.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.38.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.35.0 go.opentelemetry.io/otel/exporters/prometheus v0.60.0 go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.38.0 go.opentelemetry.io/otel/metric v1.38.0 @@ -289,7 +291,6 @@ require ( go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.61.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.37.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 // indirect go.opentelemetry.io/otel/trace v1.38.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect diff --git a/internal/cmd/server.go b/internal/cmd/server.go index 5498ddaa3cd..812cb7b6cf9 100644 --- a/internal/cmd/server.go +++ b/internal/cmd/server.go @@ -24,6 +24,7 @@ import ( "github.com/envoyproxy/gateway/internal/message" "github.com/envoyproxy/gateway/internal/metrics" providerrunner "github.com/envoyproxy/gateway/internal/provider/runner" + "github.com/envoyproxy/gateway/internal/traces" xdsrunner "github.com/envoyproxy/gateway/internal/xds/runner" ) @@ -213,6 +214,11 @@ func startRunners(ctx context.Context, cfg *config.Server) (err error) { // It provides metrics endpoints for monitoring. runner: metrics.New(cfg), }, + { + // Start the Metrics Server + // It provides metrics endpoints for monitoring. + runner: traces.New(cfg), + }, } // Start all runners diff --git a/internal/gatewayapi/resource/resource.go b/internal/gatewayapi/resource/resource.go index f06fbfe2915..8c7f3ca839a 100644 --- a/internal/gatewayapi/resource/resource.go +++ b/internal/gatewayapi/resource/resource.go @@ -6,6 +6,7 @@ package resource import ( + "context" "sort" certificatesv1b1 "k8s.io/api/certificates/v1beta1" @@ -208,6 +209,13 @@ func (r *Resources) GetEndpointSlicesForBackend(svcNamespace, svcName, backendKi // ControllerResources holds all the GatewayAPI resources per GatewayClass type ControllerResources []*Resources +// ControllerResourcesContext wraps ControllerResources with trace context +// for propagating spans across async message boundaries +type ControllerResourcesContext struct { + Resources *ControllerResources + Context context.Context +} + // DeepCopy creates a new ControllerResources. // It is handwritten since the tooling was unable to copy into a new slice func (c *ControllerResources) DeepCopy() *ControllerResources { diff --git a/internal/gatewayapi/runner/runner.go b/internal/gatewayapi/runner/runner.go index 7fad17a928d..f71ae394929 100644 --- a/internal/gatewayapi/runner/runner.go +++ b/internal/gatewayapi/runner/runner.go @@ -16,6 +16,8 @@ import ( "github.com/docker/docker/pkg/fileutils" "github.com/telepresenceio/watchable" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" kerrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" @@ -46,6 +48,8 @@ const ( hmacSecretKey = "hmac-secret" ) +var tracer = otel.Tracer("envoy-gateway/gateway-api") + type Config struct { config.Server ProviderResources *message.ProviderResources @@ -122,18 +126,39 @@ func (r *Runner) startWasmCache(ctx context.Context) { r.wasmCache.Start(ctx) } -func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *resource.ControllerResources]) { +func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *resource.ControllerResourcesContext]) { message.HandleSubscription(message.Metadata{Runner: r.Name(), Message: message.ProviderResourcesMessageName}, sub, - func(update message.Update[string, *resource.ControllerResources], errChan chan error) { + func(update message.Update[string, *resource.ControllerResourcesContext], errChan chan error) { + + parentCtx := context.Background() + if update.Value != nil && update.Value.Context != nil { + parentCtx = update.Value.Context + } + + _, span := tracer.Start(parentCtx, "Runner.subscribeAndTranslate") + defer span.End() + r.Logger.Info("received an update") - val := update.Value + valWrapper := update.Value // There is only 1 key which is the controller name // so when a delete is triggered, delete all keys - if update.Delete || val == nil { + if update.Delete || valWrapper == nil || valWrapper.Resources == nil { + span.AddEvent("delete_all_keys") r.deleteAllKeys() return } + val := valWrapper.Resources + + // Add span attributes for observability + span.SetAttributes( + attribute.String("controller.key", update.Key), + attribute.Bool("update.delete", update.Delete), + ) + if val != nil { + span.SetAttributes(attribute.Int("resources.count", len(*val))) + } + // Initialize keysToDelete with tracked keys (mark and sweep approach) keysToDelete := r.keyCache.copy() @@ -207,7 +232,11 @@ func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *re r.Logger.Error(err, "unable to validate xds ir, skipped sending it") errChan <- err } else { - r.XdsIR.Store(key, val) + m := message.XdsIRWithContext{ + XdsIR: val, + Context: parentCtx, + } + r.XdsIR.Store(key, &m) xdsIRCount++ } } diff --git a/internal/globalratelimit/runner/runner.go b/internal/globalratelimit/runner/runner.go index 3cd4a33ea84..f9cfc8a4c46 100644 --- a/internal/globalratelimit/runner/runner.go +++ b/internal/globalratelimit/runner/runner.go @@ -132,12 +132,12 @@ func buildXDSResourceFromCache(rateLimitConfigsCache map[string][]cachetype.Reso return xdsResourcesToUpdate } -func (r *Runner) translateFromSubscription(ctx context.Context, c <-chan watchable.Snapshot[string, *ir.Xds]) { +func (r *Runner) translateFromSubscription(ctx context.Context, c <-chan watchable.Snapshot[string, *message.XdsIRWithContext]) { // rateLimitConfigsCache is a cache of the rate limit config, which is keyed by the xdsIR key. rateLimitConfigsCache := map[string][]cachetype.Resource{} message.HandleSubscription(message.Metadata{Runner: r.Name(), Message: message.XDSIRMessageName}, c, - func(update message.Update[string, *ir.Xds], errChan chan error) { + func(update message.Update[string, *message.XdsIRWithContext], errChan chan error) { r.Logger.Info("received a notification") if update.Delete { @@ -145,7 +145,7 @@ func (r *Runner) translateFromSubscription(ctx context.Context, c <-chan watchab r.updateSnapshot(ctx, buildXDSResourceFromCache(rateLimitConfigsCache)) } else { // Translate to ratelimit xDS Config. - rvt, err := r.translate(update.Value) + rvt, err := r.translate(update.Value.XdsIR) if err != nil { r.Logger.Error(err, "failed to translate an updated xds-ir to ratelimit xDS Config") errChan <- err diff --git a/internal/globalratelimit/runner/runner_test.go b/internal/globalratelimit/runner/runner_test.go index 932131fc70f..848567143c2 100644 --- a/internal/globalratelimit/runner/runner_test.go +++ b/internal/globalratelimit/runner/runner_test.go @@ -230,7 +230,11 @@ func Test_subscribeAndTranslate(t *testing.T) { xdsIR.Delete(xds.Key) continue } - xdsIR.Store(xds.Key, xds.Value) + m := message.XdsIRWithContext{ + XdsIR: xds.Value, + Context: context.Background(), + } + xdsIR.Store(xds.Key, &m) } diff := "" diff --git a/internal/message/types.go b/internal/message/types.go index 1c033678568..48c459d1ef7 100644 --- a/internal/message/types.go +++ b/internal/message/types.go @@ -6,6 +6,8 @@ package message import ( + "context" + "github.com/telepresenceio/watchable" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" @@ -20,8 +22,8 @@ import ( // ProviderResources message type ProviderResources struct { // GatewayAPIResources is a map from a GatewayClass name to - // a group of gateway API and other related resources. - GatewayAPIResources watchable.Map[string, *resource.ControllerResources] + // a group of gateway API and other related resources with trace context. + GatewayAPIResources watchable.Map[string, *resource.ControllerResourcesContext] // GatewayAPIStatuses is a group of gateway api // resource statuses maps. @@ -40,7 +42,9 @@ func (p *ProviderResources) GetResources() []*resource.Resources { } for _, v := range p.GatewayAPIResources.LoadAll() { - return *v + if v != nil && v.Resources != nil { + return *v.Resources + } } return nil @@ -127,9 +131,14 @@ func (e *ExtensionStatuses) Close() { e.BackendStatuses.Close() } +type XdsIRWithContext struct { + XdsIR *ir.Xds + Context context.Context +} + // XdsIR message type XdsIR struct { - watchable.Map[string, *ir.Xds] + watchable.Map[string, *XdsIRWithContext] } // InfraIR message diff --git a/internal/message/watchutil_test.go b/internal/message/watchutil_test.go index 21411b3f6f9..ed16c17c409 100644 --- a/internal/message/watchutil_test.go +++ b/internal/message/watchutil_test.go @@ -247,19 +247,28 @@ func TestControllerResourceUpdate(t *testing.T) { snapshotC := m.GatewayAPIResources.Subscribe(ctx) endCtx, end := context.WithCancel(ctx) - m.GatewayAPIResources.Store("start", &resource.ControllerResources{}) + m.GatewayAPIResources.Store("start", &resource.ControllerResourcesContext{ + Resources: &resource.ControllerResources{}, + Context: ctx, + }) go func() { <-endCtx.Done() for _, r := range tc.resources { r.Sort() - m.GatewayAPIResources.Store("test", r) + m.GatewayAPIResources.Store("test", &resource.ControllerResourcesContext{ + Resources: r, + Context: ctx, + }) } - m.GatewayAPIResources.Store("end", &resource.ControllerResources{}) + m.GatewayAPIResources.Store("end", &resource.ControllerResourcesContext{ + Resources: &resource.ControllerResources{}, + Context: ctx, + }) }() updates := 0 - message.HandleSubscription(message.Metadata{Runner: "demo", Message: "demo"}, snapshotC, func(u message.Update[string, *resource.ControllerResources], errChans chan error) { + message.HandleSubscription(message.Metadata{Runner: "demo", Message: "demo"}, snapshotC, func(u message.Update[string, *resource.ControllerResourcesContext], errChans chan error) { end() if u.Key == "test" { updates += 1 diff --git a/internal/provider/kubernetes/controller.go b/internal/provider/kubernetes/controller.go index e0318427462..a56d41154bb 100644 --- a/internal/provider/kubernetes/controller.go +++ b/internal/provider/kubernetes/controller.go @@ -13,6 +13,7 @@ import ( "time" "github.com/telepresenceio/watchable" + "go.opentelemetry.io/otel" appsv1 "k8s.io/api/apps/v1" certificatesv1b1 "k8s.io/api/certificates/v1beta1" corev1 "k8s.io/api/core/v1" @@ -59,6 +60,8 @@ var skipNameValidation = func() *bool { return ptr.To(false) } +var tracer = otel.Tracer("envoy-gateway/reconciliation") + type gatewayAPIReconciler struct { client client.Client log logging.Logger @@ -298,6 +301,8 @@ func isTransientError(err error) bool { // same reconcile.Request containing the gateway controller name. This allows multiple resource updates to // be handled by a single call to Reconcile. The reconcile.Request DOES NOT map to a specific resource. func (r *gatewayAPIReconciler) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) { + ctx, span := tracer.Start(ctx, "GatewayAPIReconciler.Reconcile") + defer span.End() var ( managedGCs []*gwapiv1.GatewayClass err error @@ -572,11 +577,15 @@ func (r *gatewayAPIReconciler) Reconcile(ctx context.Context, _ reconcile.Reques // which impacts translation output gwcResources.Sort() - // Store the Gateway Resources for the GatewayClass. + // Store the Gateway Resources for the GatewayClass with trace context. // The Store is triggered even when there are no Gateways associated to the // GatewayClass. This would happen in case the last Gateway is removed and the // Store will be required to trigger a cleanup of envoy infra resources. - r.resources.GatewayAPIResources.Store(string(r.classController), &gwcResources) + resourcesWithContext := &resource.ControllerResourcesContext{ + Resources: &gwcResources, + Context: ctx, + } + r.resources.GatewayAPIResources.Store(string(r.classController), resourcesWithContext) message.PublishMetric(message.Metadata{ Runner: string(egv1a1.LogComponentProviderRunner), Message: message.ProviderResourcesMessageName, diff --git a/internal/traces/register.go b/internal/traces/register.go new file mode 100644 index 00000000000..287244b6e92 --- /dev/null +++ b/internal/traces/register.go @@ -0,0 +1,112 @@ +package traces + +import ( + "context" + "fmt" + "time" + + egv1a1 "github.com/envoyproxy/gateway/api/v1alpha1" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" + "go.opentelemetry.io/otel/sdk/resource" + "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.4.0" + + "github.com/envoyproxy/gateway/internal/envoygateway/config" +) + +type Runner struct { + cfg *config.Server + tp *trace.TracerProvider +} + +func New(cfg *config.Server) *Runner { + return &Runner{ + cfg: cfg, + } +} + +func (r *Runner) Start(ctx context.Context) error { + config := r.cfg.EnvoyGateway.GetEnvoyGatewayTelemetry().Traces.Sink + configObj := config.OpenTelemetry + + endpoint := fmt.Sprintf("%s:%d", config.OpenTelemetry.Host, config.OpenTelemetry.Port) + if configObj.Protocol == egv1a1.GRPCProtocol { + exporter, err := otlptracegrpc.New(ctx, + otlptracegrpc.WithEndpoint(endpoint), + otlptracegrpc.WithInsecure(), + ) + if err != nil { + return err + } + + res, err := resource.New(ctx, + resource.WithAttributes( + semconv.ServiceNameKey.String("envoy-gateway"), + ), + ) + if err != nil { + return err + } + + tp := trace.NewTracerProvider( + trace.WithBatcher(exporter), + trace.WithResource(res), + trace.WithSampler(trace.AlwaysSample()), // TODO: configurable? + ) + + otel.SetTracerProvider(tp) + r.tp = tp + + return nil + } + + if configObj.Protocol == egv1a1.HTTPProtocol { + // Create OTLP HTTP exporter + exporter, err := otlptracehttp.New(ctx, + otlptracehttp.WithEndpoint(endpoint), + otlptracehttp.WithInsecure(), + // TODO: should we make path configurable? + // otlptracehttp.WithURLPath("/v1/traces"), // Optional: custom path + ) + if err != nil { + return err + } + + res, err := resource.New(ctx, + resource.WithAttributes( + semconv.ServiceNameKey.String("envoy-gateway"), + ), + ) + if err != nil { + return err + } + + tp := trace.NewTracerProvider( + trace.WithBatcher(exporter), + trace.WithResource(res), + trace.WithSampler(trace.AlwaysSample()), // TODO: configurable? + ) + + otel.SetTracerProvider(tp) + r.tp = tp + + return nil + } + + return nil +} + +func (r *Runner) Name() string { + return "traces" +} + +func (r *Runner) Close() error { + if r.tp != nil { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + return r.tp.Shutdown(ctx) + } + return nil +} diff --git a/internal/xds/runner/runner.go b/internal/xds/runner/runner.go index e010da2439a..af6eaa9ba10 100644 --- a/internal/xds/runner/runner.go +++ b/internal/xds/runner/runner.go @@ -29,13 +29,15 @@ import ( "google.golang.org/grpc/keepalive" ktypes "k8s.io/apimachinery/pkg/types" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + egv1a1 "github.com/envoyproxy/gateway/api/v1alpha1" "github.com/envoyproxy/gateway/internal/crypto" "github.com/envoyproxy/gateway/internal/envoygateway/config" extension "github.com/envoyproxy/gateway/internal/extension/types" "github.com/envoyproxy/gateway/internal/infrastructure/host" "github.com/envoyproxy/gateway/internal/infrastructure/kubernetes/ratelimit" - "github.com/envoyproxy/gateway/internal/ir" "github.com/envoyproxy/gateway/internal/message" "github.com/envoyproxy/gateway/internal/xds/bootstrap" "github.com/envoyproxy/gateway/internal/xds/cache" @@ -65,6 +67,8 @@ const ( defaultMaxConnectionAgeGrace = 2 * time.Minute ) +var tracer = otel.Tracer("envoy-gateway/gateway-api") + var maxConnectionAgeValues = []time.Duration{ 10 * time.Hour, 11 * time.Hour, @@ -220,14 +224,29 @@ func registerServer(srv serverv3.Server, g *grpc.Server) { runtimev3.RegisterRuntimeDiscoveryServiceServer(g, srv) } -func (r *Runner) translateFromSubscription(sub <-chan watchable.Snapshot[string, *ir.Xds]) { +func (r *Runner) translateFromSubscription(sub <-chan watchable.Snapshot[string, *message.XdsIRWithContext]) { // Subscribe to resources message.HandleSubscription(message.Metadata{Runner: r.Name(), Message: message.XDSIRMessageName}, sub, - func(update message.Update[string, *ir.Xds], errChan chan error) { + func(update message.Update[string, *message.XdsIRWithContext], errChan chan error) { r.Logger.Info("received an update") + + parentCtx := context.Background() + if update.Value != nil && update.Value.Context != nil { + parentCtx = update.Value.Context + } + + _, span := tracer.Start(parentCtx, "Runner.subscribeAndTranslate") + defer span.End() + key := update.Key val := update.Value + // Add span attributes for observability + span.SetAttributes( + attribute.String("controller.key", update.Key), + attribute.Bool("update.delete", update.Delete), + ) + if update.Delete { if err := r.cache.GenerateNewSnapshot(key, nil); err != nil { r.Logger.Error(err, "failed to delete the snapshot") @@ -237,7 +256,7 @@ func (r *Runner) translateFromSubscription(sub <-chan watchable.Snapshot[string, // Translate to xds resources t := &translator.Translator{ ControllerNamespace: r.ControllerNamespace, - FilterOrder: val.FilterOrder, + FilterOrder: val.XdsIR.FilterOrder, RuntimeFlags: r.EnvoyGateway.RuntimeFlags, Logger: r.Logger, } @@ -264,7 +283,7 @@ func (r *Runner) translateFromSubscription(sub <-chan watchable.Snapshot[string, } } - result, err := t.Translate(val) + result, err := t.Translate(val.XdsIR) if err != nil { r.Logger.Error(err, "failed to translate xds ir") errChan <- err diff --git a/internal/xds/runner/runner_test.go b/internal/xds/runner/runner_test.go index 2100919faed..8fc9cab32cb 100644 --- a/internal/xds/runner/runner_test.go +++ b/internal/xds/runner/runner_test.go @@ -308,7 +308,11 @@ func TestRunner(t *testing.T) { }, }, } - xdsIR.Store("test", &res) + m := message.XdsIRWithContext{ + XdsIR: &res, + Context: context.Background(), + } + xdsIR.Store("test", &m) require.Eventually(t, func() bool { // Check that the cache has the snapshot for our test key return r.cache.SnapshotHasIrKey("test") @@ -397,7 +401,11 @@ func TestRunner_withExtensionManager_FailOpen(t *testing.T) { }, }, } - xdsIR.Store("test", &res) + m := message.XdsIRWithContext{ + XdsIR: &res, + Context: context.Background(), + } + xdsIR.Store("test", &m) require.Eventually(t, func() bool { // Since the extension manager is configured to fail open, in an event of an error // from the extension manager hooks, xds update should be published. @@ -479,7 +487,11 @@ func TestRunner_withExtensionManager_FailClosed(t *testing.T) { }, }, } - xdsIR.Store("test", &res) + m := message.XdsIRWithContext{ + XdsIR: &res, + Context: context.Background(), + } + xdsIR.Store("test", &m) require.Never(t, func() bool { // Since the extension manager is configured to fail closed, in an event of an error // from the extension manager hooks, xds update should not be published. diff --git a/site/content/en/latest/api/extension_types.md b/site/content/en/latest/api/extension_types.md index 1592fe8414e..35df0d93f8b 100644 --- a/site/content/en/latest/api/extension_types.md +++ b/site/content/en/latest/api/extension_types.md @@ -1475,6 +1475,7 @@ _Appears in:_ _Appears in:_ - [EnvoyGatewayMetricSink](#envoygatewaymetricsink) +- [EnvoyGatewayTraceSink](#envoygatewaytracesink) | Field | Type | Required | Default | Description | | --- | --- | --- | --- | --- | @@ -1567,6 +1568,7 @@ _Appears in:_ | Field | Type | Required | Default | Description | | --- | --- | --- | --- | --- | | `metrics` | _[EnvoyGatewayMetrics](#envoygatewaymetrics)_ | true | | Metrics defines metrics configuration for envoy gateway. | +| `traces` | _[EnvoyGatewayTraces](#envoygatewaytraces)_ | true | | | #### EnvoyGatewayTopologyInjector @@ -1583,6 +1585,39 @@ _Appears in:_ | `disabled` | _boolean_ | false | | | +#### EnvoyGatewayTraceSink + + + +EnvoyGatewayTraceSink defines control plane +trace sinks where traces are sent to. + +_Appears in:_ +- [EnvoyGatewayTraces](#envoygatewaytraces) + +| Field | Type | Required | Default | Description | +| --- | --- | --- | --- | --- | +| `type` | _[TraceSinkType](#tracesinktype)_ | true | OpenTelemetry | Type defines the trace sink type.
EG control plane currently supports OpenTelemetry. | +| `openTelemetry` | _[EnvoyGatewayOpenTelemetrySink](#envoygatewayopentelemetrysink)_ | true | | OpenTelemetry defines the configuration for OpenTelemetry sink.
It's required if the sink type is OpenTelemetry. | + + +#### EnvoyGatewayTraces + + + +EnvoyGatewayMetrics defines control plane push/pull metrics configurations. + +_Appears in:_ +- [EnvoyGatewayTelemetry](#envoygatewaytelemetry) + +| Field | Type | Required | Default | Description | +| --- | --- | --- | --- | --- | +| `sink` | _[EnvoyGatewayTraceSink](#envoygatewaytracesink)_ | true | | Sink defines the metric sink where metrics are sent to. | +| `enable` | _boolean_ | true | | Disable disables the traces. | + + + + #### EnvoyJSONPatchConfig @@ -5144,6 +5179,20 @@ _Appears in:_ | `http` | _[HTTPTimeout](#httptimeout)_ | false | | Timeout settings for HTTP. | +#### TraceSinkType + +_Underlying type:_ _string_ + + + +_Appears in:_ +- [EnvoyGatewayTraceSink](#envoygatewaytracesink) + +| Value | Description | +| ----- | ----------- | +| `OpenTelemetry` | | + + #### Tracing From f66879ccdfaf314fd93f3e516240ecc6a35159eb Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Fri, 31 Oct 2025 02:11:19 +0000 Subject: [PATCH 02/35] license header Signed-off-by: Shreemaan Abhishek --- internal/traces/register.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/internal/traces/register.go b/internal/traces/register.go index 287244b6e92..6ecddd7cb83 100644 --- a/internal/traces/register.go +++ b/internal/traces/register.go @@ -1,3 +1,8 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + package traces import ( From 2c0d0a1ec029c42af00ac959353c1bfcdf35b7f4 Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Fri, 31 Oct 2025 09:18:46 +0545 Subject: [PATCH 03/35] lint Signed-off-by: Shreemaan Abhishek --- api/v1alpha1/envoygateway_metrics_types.go | 2 +- api/v1alpha1/envoyproxy_metric_types.go | 8 +++++--- internal/admin/console/api.go | 8 ++++---- internal/admin/console/api_test.go | 2 +- internal/gatewayapi/runner/runner.go | 1 - internal/traces/register.go | 2 +- internal/xds/runner/runner.go | 5 ++--- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/api/v1alpha1/envoygateway_metrics_types.go b/api/v1alpha1/envoygateway_metrics_types.go index f3b0438dbca..4f7cf2cd69a 100644 --- a/api/v1alpha1/envoygateway_metrics_types.go +++ b/api/v1alpha1/envoygateway_metrics_types.go @@ -74,7 +74,7 @@ type EnvoyGatewayTracingSink struct { // If ExportTimeout is less than or equal to zero, 30 seconds // is used as the default. ExportTimeout *gwapiv1.Duration `json:"exportTimeout,omitempty"` - //TODO sampling rate + // TODO sampling rate } type EnvoyGatewayOpenTelemetrySink struct { diff --git a/api/v1alpha1/envoyproxy_metric_types.go b/api/v1alpha1/envoyproxy_metric_types.go index e1c031c8fc7..c37a23f5ca3 100644 --- a/api/v1alpha1/envoyproxy_metric_types.go +++ b/api/v1alpha1/envoyproxy_metric_types.go @@ -5,12 +5,14 @@ package v1alpha1 -type MetricSinkType string -type TraceSinkType string +type ( + MetricSinkType string + TraceSinkType string +) const ( MetricSinkTypeOpenTelemetry MetricSinkType = "OpenTelemetry" - TraceSinkTypeOpenTelemetry TraceSinkType = "OpenTelemetry" + TraceSinkTypeOpenTelemetry TraceSinkType = "OpenTelemetry" ) type ProxyMetrics struct { diff --git a/internal/admin/console/api.go b/internal/admin/console/api.go index 22a101dad05..fa2d4edba19 100644 --- a/internal/admin/console/api.go +++ b/internal/admin/console/api.go @@ -260,14 +260,14 @@ func (h *Handler) loadConfigDump() ConfigDumpInfo { if h.providerResources != nil { // Load controller resources directly from the provider resources - controllerResources := h.providerResources.GatewayAPIResources.LoadAll() + controllerResourcesContext := h.providerResources.GatewayAPIResources.LoadAll() - for _, resources := range controllerResources { - if resources == nil { + for _, resourcesContext := range controllerResourcesContext { + if resourcesContext == nil { continue } - for _, res := range *resources { + for _, res := range *resourcesContext.Resources { if res == nil { continue } diff --git a/internal/admin/console/api_test.go b/internal/admin/console/api_test.go index 539dbb7195a..ee353c339dd 100644 --- a/internal/admin/console/api_test.go +++ b/internal/admin/console/api_test.go @@ -151,7 +151,7 @@ func TestLoadConfigDumpWithData(t *testing.T) { // This test focuses on the basic functionality providerRes := &message.ProviderResources{} // Initialize empty watchable map - providerRes.GatewayAPIResources = watchable.Map[string, *resource.ControllerResources]{} + providerRes.GatewayAPIResources = watchable.Map[string, *resource.ControllerResourcesContext]{} // Skip storing to avoid watchable copy issues // providerResources.Store("test", providerRes) diff --git a/internal/gatewayapi/runner/runner.go b/internal/gatewayapi/runner/runner.go index f71ae394929..04ae78f6de3 100644 --- a/internal/gatewayapi/runner/runner.go +++ b/internal/gatewayapi/runner/runner.go @@ -129,7 +129,6 @@ func (r *Runner) startWasmCache(ctx context.Context) { func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *resource.ControllerResourcesContext]) { message.HandleSubscription(message.Metadata{Runner: r.Name(), Message: message.ProviderResourcesMessageName}, sub, func(update message.Update[string, *resource.ControllerResourcesContext], errChan chan error) { - parentCtx := context.Background() if update.Value != nil && update.Value.Context != nil { parentCtx = update.Value.Context diff --git a/internal/traces/register.go b/internal/traces/register.go index 6ecddd7cb83..51205ad3798 100644 --- a/internal/traces/register.go +++ b/internal/traces/register.go @@ -10,7 +10,6 @@ import ( "fmt" "time" - egv1a1 "github.com/envoyproxy/gateway/api/v1alpha1" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" @@ -18,6 +17,7 @@ import ( "go.opentelemetry.io/otel/sdk/trace" semconv "go.opentelemetry.io/otel/semconv/v1.4.0" + egv1a1 "github.com/envoyproxy/gateway/api/v1alpha1" "github.com/envoyproxy/gateway/internal/envoygateway/config" ) diff --git a/internal/xds/runner/runner.go b/internal/xds/runner/runner.go index af6eaa9ba10..437a599330d 100644 --- a/internal/xds/runner/runner.go +++ b/internal/xds/runner/runner.go @@ -24,14 +24,13 @@ import ( secretv3 "github.com/envoyproxy/go-control-plane/envoy/service/secret/v3" serverv3 "github.com/envoyproxy/go-control-plane/pkg/server/v3" "github.com/telepresenceio/watchable" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" "google.golang.org/grpc" "google.golang.org/grpc/credentials" "google.golang.org/grpc/keepalive" ktypes "k8s.io/apimachinery/pkg/types" - "go.opentelemetry.io/otel" - "go.opentelemetry.io/otel/attribute" - egv1a1 "github.com/envoyproxy/gateway/api/v1alpha1" "github.com/envoyproxy/gateway/internal/crypto" "github.com/envoyproxy/gateway/internal/envoygateway/config" From 4777f57e7b2ac3729fe963745600b03844e4a31b Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Fri, 31 Oct 2025 11:41:08 +0545 Subject: [PATCH 04/35] default traces Signed-off-by: Shreemaan Abhishek --- api/v1alpha1/envoygateway_helpers.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/api/v1alpha1/envoygateway_helpers.go b/api/v1alpha1/envoygateway_helpers.go index f33875ce0c4..43601949c65 100644 --- a/api/v1alpha1/envoygateway_helpers.go +++ b/api/v1alpha1/envoygateway_helpers.go @@ -206,6 +206,7 @@ func (e *EnvoyGateway) DisablePrometheus() bool { func DefaultEnvoyGatewayTelemetry() *EnvoyGatewayTelemetry { return &EnvoyGatewayTelemetry{ Metrics: DefaultEnvoyGatewayMetrics(), + Traces: DefaultEnvoyGatewayTraces(), } } @@ -216,6 +217,10 @@ func DefaultEnvoyGatewayMetrics() *EnvoyGatewayMetrics { } } +func DefaultEnvoyGatewayTraces() *EnvoyGatewayTraces { + return &EnvoyGatewayTraces{} +} + // DefaultEnvoyGatewayPrometheus returns a new EnvoyGatewayMetrics with default configuration parameters. func DefaultEnvoyGatewayPrometheus() *EnvoyGatewayPrometheusProvider { return &EnvoyGatewayPrometheusProvider{ From 0af47d13f5695eb6ab6ce67413e1dbca2ed8e482 Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Fri, 31 Oct 2025 06:40:07 +0000 Subject: [PATCH 05/35] disability Signed-off-by: Shreemaan Abhishek --- api/v1alpha1/envoygateway_helpers.go | 9 ++++++++- internal/traces/register.go | 4 ++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/api/v1alpha1/envoygateway_helpers.go b/api/v1alpha1/envoygateway_helpers.go index 43601949c65..e3cd5ac8a48 100644 --- a/api/v1alpha1/envoygateway_helpers.go +++ b/api/v1alpha1/envoygateway_helpers.go @@ -202,6 +202,11 @@ func (e *EnvoyGateway) DisablePrometheus() bool { return e.GetEnvoyGatewayTelemetry().Metrics.Prometheus.Disable } +// DisableTraces returns if disable prometheus. +func (e *EnvoyGateway) DisableTraces() bool { + return e.GetEnvoyGatewayTelemetry().Traces.Disable +} + // DefaultEnvoyGatewayTelemetry returns a new EnvoyGatewayTelemetry with default configuration parameters. func DefaultEnvoyGatewayTelemetry() *EnvoyGatewayTelemetry { return &EnvoyGatewayTelemetry{ @@ -218,7 +223,9 @@ func DefaultEnvoyGatewayMetrics() *EnvoyGatewayMetrics { } func DefaultEnvoyGatewayTraces() *EnvoyGatewayTraces { - return &EnvoyGatewayTraces{} + return &EnvoyGatewayTraces{ + Disable: true, + } } // DefaultEnvoyGatewayPrometheus returns a new EnvoyGatewayMetrics with default configuration parameters. diff --git a/internal/traces/register.go b/internal/traces/register.go index 51205ad3798..6c12727c184 100644 --- a/internal/traces/register.go +++ b/internal/traces/register.go @@ -33,6 +33,10 @@ func New(cfg *config.Server) *Runner { } func (r *Runner) Start(ctx context.Context) error { + if r.cfg.EnvoyGateway.DisableTraces() { + return nil + } + config := r.cfg.EnvoyGateway.GetEnvoyGatewayTelemetry().Traces.Sink configObj := config.OpenTelemetry From c620c6e58c5353206fba1ad071336ddca34bd9cf Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Fri, 31 Oct 2025 09:41:23 +0000 Subject: [PATCH 06/35] deepcopy errors Signed-off-by: Shreemaan Abhishek --- internal/gatewayapi/resource/resource.go | 12 ++ internal/gatewayapi/resource/resource_test.go | 139 ++++++++++++++++++ internal/message/types.go | 12 ++ 3 files changed, 163 insertions(+) diff --git a/internal/gatewayapi/resource/resource.go b/internal/gatewayapi/resource/resource.go index 8c7f3ca839a..4201cb801ff 100644 --- a/internal/gatewayapi/resource/resource.go +++ b/internal/gatewayapi/resource/resource.go @@ -216,6 +216,18 @@ type ControllerResourcesContext struct { Context context.Context } +// DeepCopy creates a new ControllerResourcesContext. +// The Context field is preserved (not deep copied) since contexts are meant to be passed around. +func (c *ControllerResourcesContext) DeepCopy() *ControllerResourcesContext { + if c == nil { + return nil + } + return &ControllerResourcesContext{ + Resources: c.Resources.DeepCopy(), + Context: c.Context, + } +} + // DeepCopy creates a new ControllerResources. // It is handwritten since the tooling was unable to copy into a new slice func (c *ControllerResources) DeepCopy() *ControllerResources { diff --git a/internal/gatewayapi/resource/resource_test.go b/internal/gatewayapi/resource/resource_test.go index 1db2e0f6071..91142412774 100644 --- a/internal/gatewayapi/resource/resource_test.go +++ b/internal/gatewayapi/resource/resource_test.go @@ -6,6 +6,7 @@ package resource import ( + "context" "testing" "github.com/google/go-cmp/cmp" @@ -205,3 +206,141 @@ func TestGetEndpointSlicesForBackendDualStack(t *testing.T) { } }) } + +func TestControllerResourcesContextDeepCopy(t *testing.T) { + tests := []struct { + name string + ctx *ControllerResourcesContext + }{ + { + name: "nil context", + ctx: nil, + }, + { + name: "empty context", + ctx: &ControllerResourcesContext{ + Resources: &ControllerResources{}, + Context: context.Background(), + }, + }, + { + name: "context with resources", + ctx: &ControllerResourcesContext{ + Resources: &ControllerResources{ + { + GatewayClass: &gwapiv1.GatewayClass{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-gateway-class", + }, + }, + }, + }, + Context: context.Background(), + }, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + copied := tc.ctx.DeepCopy() + + if tc.ctx == nil { + assert.Nil(t, copied) + return + } + + // Verify the copy is not nil + require.NotNil(t, copied) + + // Verify the copy is a different object + assert.NotSame(t, tc.ctx, copied) + + // Verify Resources are deep copied + if tc.ctx.Resources != nil { + require.NotNil(t, copied.Resources) + assert.NotSame(t, tc.ctx.Resources, copied.Resources) + + // Verify the contents are equal + assert.Equal(t, len(*tc.ctx.Resources), len(*copied.Resources)) + } + + // Verify Context is preserved (not deep copied, same reference) + assert.Equal(t, tc.ctx.Context, copied.Context) + }) + } +} + +func TestControllerResourcesDeepCopy(t *testing.T) { + tests := []struct { + name string + resources *ControllerResources + }{ + { + name: "nil resources", + resources: nil, + }, + { + name: "empty resources", + resources: &ControllerResources{}, + }, + { + name: "resources with gateway class", + resources: &ControllerResources{ + { + GatewayClass: &gwapiv1.GatewayClass{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-gateway-class", + }, + }, + }, + }, + }, + { + name: "multiple resources", + resources: &ControllerResources{ + { + GatewayClass: &gwapiv1.GatewayClass{ + ObjectMeta: metav1.ObjectMeta{ + Name: "gateway-class-1", + }, + }, + }, + { + GatewayClass: &gwapiv1.GatewayClass{ + ObjectMeta: metav1.ObjectMeta{ + Name: "gateway-class-2", + }, + }, + }, + }, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + copied := tc.resources.DeepCopy() + + if tc.resources == nil { + assert.Nil(t, copied) + return + } + + // Verify the copy is not nil + require.NotNil(t, copied) + + // Verify the copy is a different object + assert.NotSame(t, tc.resources, copied) + + // Verify the length is the same + assert.Equal(t, len(*tc.resources), len(*copied)) + + // Verify each resource is deep copied + for i := range *tc.resources { + if (*tc.resources)[i] != nil { + require.NotNil(t, (*copied)[i]) + assert.NotSame(t, (*tc.resources)[i], (*copied)[i]) + } + } + }) + } +} diff --git a/internal/message/types.go b/internal/message/types.go index 48c459d1ef7..fd9786c7007 100644 --- a/internal/message/types.go +++ b/internal/message/types.go @@ -136,6 +136,18 @@ type XdsIRWithContext struct { Context context.Context } +// DeepCopy creates a new ControllerResourcesContext. +// The Context field is preserved (not deep copied) since contexts are meant to be passed around. +func (x *XdsIRWithContext) DeepCopy() *XdsIRWithContext { + if x == nil { + return nil + } + return &XdsIRWithContext{ + XdsIR: x.XdsIR.DeepCopy(), + Context: x.Context, + } +} + // XdsIR message type XdsIR struct { watchable.Map[string, *XdsIRWithContext] From aa6b91bb9d1aef4228f7441b46da3bb5406d8043 Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Fri, 31 Oct 2025 15:45:46 +0545 Subject: [PATCH 07/35] lint Signed-off-by: Shreemaan Abhishek --- internal/gatewayapi/resource/resource_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/gatewayapi/resource/resource_test.go b/internal/gatewayapi/resource/resource_test.go index 91142412774..9e6bb054e4b 100644 --- a/internal/gatewayapi/resource/resource_test.go +++ b/internal/gatewayapi/resource/resource_test.go @@ -261,7 +261,7 @@ func TestControllerResourcesContextDeepCopy(t *testing.T) { assert.NotSame(t, tc.ctx.Resources, copied.Resources) // Verify the contents are equal - assert.Equal(t, len(*tc.ctx.Resources), len(*copied.Resources)) + assert.Len(t, *copied.Resources, len(*tc.ctx.Resources)) } // Verify Context is preserved (not deep copied, same reference) @@ -332,7 +332,7 @@ func TestControllerResourcesDeepCopy(t *testing.T) { assert.NotSame(t, tc.resources, copied) // Verify the length is the same - assert.Equal(t, len(*tc.resources), len(*copied)) + assert.Len(t, *copied, len(*tc.resources)) // Verify each resource is deep copied for i := range *tc.resources { From 2ad73992c450f59518235916459b7cdc9b5d5dcd Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Sat, 1 Nov 2025 07:29:31 +0000 Subject: [PATCH 08/35] e2e Signed-off-by: Shreemaan Abhishek --- api/v1alpha1/envoygateway_metrics_types.go | 40 ++- api/v1alpha1/zz_generated.deepcopy.go | 40 +++ internal/gatewayapi/runner/runner.go | 2 +- internal/traces/register.go | 93 +++++-- internal/traces/register_test.go | 177 +++++++++++++ internal/xds/runner/runner.go | 2 +- site/content/en/latest/api/extension_types.md | 23 +- .../envoy-gateaway-config/traces-enabled.yaml | 35 +++ test/config/helm/traces-enabled.yaml | 10 + test/e2e/testdata/controlplane-tracing.yaml | 31 +++ test/e2e/tests/controlplane_tracing.go | 133 ++++++++++ test/helm/gateway-addons-helm/e2e.in.yaml | 7 + test/helm/gateway-addons-helm/e2e.out.yaml | 2 +- test/utils/controlplane_tracing/tracing.go | 232 ++++++++++++++++++ 14 files changed, 793 insertions(+), 34 deletions(-) create mode 100644 internal/traces/register_test.go create mode 100644 test/config/envoy-gateaway-config/traces-enabled.yaml create mode 100644 test/config/helm/traces-enabled.yaml create mode 100644 test/e2e/testdata/controlplane-tracing.yaml create mode 100644 test/e2e/tests/controlplane_tracing.go create mode 100644 test/utils/controlplane_tracing/tracing.go diff --git a/api/v1alpha1/envoygateway_metrics_types.go b/api/v1alpha1/envoygateway_metrics_types.go index 4f7cf2cd69a..3caef2eb077 100644 --- a/api/v1alpha1/envoygateway_metrics_types.go +++ b/api/v1alpha1/envoygateway_metrics_types.go @@ -15,13 +15,49 @@ type EnvoyGatewayMetrics struct { Prometheus *EnvoyGatewayPrometheusProvider `json:"prometheus,omitempty"` } -// EnvoyGatewayMetrics defines control plane push/pull metrics configurations. +// EnvoyGatewayTraces defines control plane tracing configurations. type EnvoyGatewayTraces struct { - // Sink defines the metric sink where metrics are sent to. + // Sink defines the trace sink where traces are sent to. Sink EnvoyGatewayTraceSink `json:"sink,omitempty"` // Disable disables the traces. // TODO: implement disability Disable bool `json:"enable,omitempty"` + // SamplingRate controls the rate at which traces are sampled. + // Defaults to 1.0 (100% sampling). Valid values are between 0.0 and 1.0. + // 0.0 means no sampling, 1.0 means all traces are sampled. + // + // +optional + // +kubebuilder:validation:Minimum=0.0 + // +kubebuilder:validation:Maximum=1.0 + SamplingRate *float64 `json:"samplingRate,omitempty"` + // BatchSpanProcessorConfig defines the configuration for the batch span processor. + // This processor batches spans before exporting them to the configured sink. + // + // +optional + BatchSpanProcessorConfig *BatchSpanProcessorConfig `json:"batchSpanProcessor,omitempty"` +} + +// BatchSpanProcessorConfig defines the configuration for the OpenTelemetry batch span processor. +// The batch span processor batches spans before sending them to the exporter. +type BatchSpanProcessorConfig struct { + // BatchTimeout is the maximum duration for constructing a batch. Spans are + // exported when either the batch is full or this timeout is reached. + // Default is 5s. For e2e testing, a lower value like 100ms is recommended. + // + // +optional + BatchTimeout *gwapiv1.Duration `json:"batchTimeout,omitempty"` + // MaxExportBatchSize is the maximum number of spans to export in a single batch. + // Default is 512. + // + // +optional + // +kubebuilder:validation:Minimum=1 + MaxExportBatchSize *int `json:"maxExportBatchSize,omitempty"` + // MaxQueueSize is the maximum queue size to buffer spans for delayed processing. + // If the queue gets full it drops the spans. Default is 2048. + // + // +optional + // +kubebuilder:validation:Minimum=1 + MaxQueueSize *int `json:"maxQueueSize,omitempty"` } // EnvoyGatewayMetricSink defines control plane diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index c64e03e0284..2ae170306dc 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -795,6 +795,36 @@ func (in *BasicAuth) DeepCopy() *BasicAuth { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *BatchSpanProcessorConfig) DeepCopyInto(out *BatchSpanProcessorConfig) { + *out = *in + if in.BatchTimeout != nil { + in, out := &in.BatchTimeout, &out.BatchTimeout + *out = new(v1.Duration) + **out = **in + } + if in.MaxExportBatchSize != nil { + in, out := &in.MaxExportBatchSize, &out.MaxExportBatchSize + *out = new(int) + **out = **in + } + if in.MaxQueueSize != nil { + in, out := &in.MaxQueueSize, &out.MaxQueueSize + *out = new(int) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BatchSpanProcessorConfig. +func (in *BatchSpanProcessorConfig) DeepCopy() *BatchSpanProcessorConfig { + if in == nil { + return nil + } + out := new(BatchSpanProcessorConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *BodyToExtAuth) DeepCopyInto(out *BodyToExtAuth) { *out = *in @@ -2357,6 +2387,16 @@ func (in *EnvoyGatewayTraceSink) DeepCopy() *EnvoyGatewayTraceSink { func (in *EnvoyGatewayTraces) DeepCopyInto(out *EnvoyGatewayTraces) { *out = *in in.Sink.DeepCopyInto(&out.Sink) + if in.SamplingRate != nil { + in, out := &in.SamplingRate, &out.SamplingRate + *out = new(float64) + **out = **in + } + if in.BatchSpanProcessorConfig != nil { + in, out := &in.BatchSpanProcessorConfig, &out.BatchSpanProcessorConfig + *out = new(BatchSpanProcessorConfig) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayTraces. diff --git a/internal/gatewayapi/runner/runner.go b/internal/gatewayapi/runner/runner.go index 04ae78f6de3..3be740d886c 100644 --- a/internal/gatewayapi/runner/runner.go +++ b/internal/gatewayapi/runner/runner.go @@ -134,7 +134,7 @@ func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *re parentCtx = update.Value.Context } - _, span := tracer.Start(parentCtx, "Runner.subscribeAndTranslate") + _, span := tracer.Start(parentCtx, "GatewayApiRunner.subscribeAndTranslate") defer span.End() r.Logger.Info("received an update") diff --git a/internal/traces/register.go b/internal/traces/register.go index 6c12727c184..66bac49a046 100644 --- a/internal/traces/register.go +++ b/internal/traces/register.go @@ -37,10 +37,28 @@ func (r *Runner) Start(ctx context.Context) error { return nil } - config := r.cfg.EnvoyGateway.GetEnvoyGatewayTelemetry().Traces.Sink - configObj := config.OpenTelemetry + tracesConfig := r.cfg.EnvoyGateway.GetEnvoyGatewayTelemetry().Traces + sinkConfig := tracesConfig.Sink + configObj := sinkConfig.OpenTelemetry + + endpoint := fmt.Sprintf("%s:%d", sinkConfig.OpenTelemetry.Host, sinkConfig.OpenTelemetry.Port) + + // Create resource + res, err := resource.New(ctx, + resource.WithAttributes( + semconv.ServiceNameKey.String("envoy-gateway"), + ), + ) + if err != nil { + return err + } + + // Get sampler configuration + sampler := r.getSampler(tracesConfig) + + // Get batch span processor options + batchOptions := r.getBatchSpanProcessorOptions(tracesConfig) - endpoint := fmt.Sprintf("%s:%d", config.OpenTelemetry.Host, config.OpenTelemetry.Port) if configObj.Protocol == egv1a1.GRPCProtocol { exporter, err := otlptracegrpc.New(ctx, otlptracegrpc.WithEndpoint(endpoint), @@ -50,19 +68,11 @@ func (r *Runner) Start(ctx context.Context) error { return err } - res, err := resource.New(ctx, - resource.WithAttributes( - semconv.ServiceNameKey.String("envoy-gateway"), - ), - ) - if err != nil { - return err - } - + bsp := trace.NewBatchSpanProcessor(exporter, batchOptions...) tp := trace.NewTracerProvider( - trace.WithBatcher(exporter), + trace.WithSpanProcessor(bsp), trace.WithResource(res), - trace.WithSampler(trace.AlwaysSample()), // TODO: configurable? + trace.WithSampler(sampler), ) otel.SetTracerProvider(tp) @@ -76,26 +86,16 @@ func (r *Runner) Start(ctx context.Context) error { exporter, err := otlptracehttp.New(ctx, otlptracehttp.WithEndpoint(endpoint), otlptracehttp.WithInsecure(), - // TODO: should we make path configurable? - // otlptracehttp.WithURLPath("/v1/traces"), // Optional: custom path - ) - if err != nil { - return err - } - - res, err := resource.New(ctx, - resource.WithAttributes( - semconv.ServiceNameKey.String("envoy-gateway"), - ), ) if err != nil { return err } + bsp := trace.NewBatchSpanProcessor(exporter, batchOptions...) tp := trace.NewTracerProvider( - trace.WithBatcher(exporter), + trace.WithSpanProcessor(bsp), trace.WithResource(res), - trace.WithSampler(trace.AlwaysSample()), // TODO: configurable? + trace.WithSampler(sampler), ) otel.SetTracerProvider(tp) @@ -107,6 +107,45 @@ func (r *Runner) Start(ctx context.Context) error { return nil } +// getSampler returns the configured sampler or a default sampler +func (r *Runner) getSampler(tracesConfig *egv1a1.EnvoyGatewayTraces) trace.Sampler { + if tracesConfig.SamplingRate != nil { + return trace.TraceIDRatioBased(*tracesConfig.SamplingRate) + } + // Default to always sample (100%) + return trace.AlwaysSample() +} + +// getBatchSpanProcessorOptions returns the configured batch span processor options +func (r *Runner) getBatchSpanProcessorOptions(tracesConfig *egv1a1.EnvoyGatewayTraces) []trace.BatchSpanProcessorOption { + var options []trace.BatchSpanProcessorOption + + if tracesConfig.BatchSpanProcessorConfig != nil { + cfg := tracesConfig.BatchSpanProcessorConfig + + if cfg.BatchTimeout != nil { + timeout, err := time.ParseDuration(string(*cfg.BatchTimeout)) + if err == nil && timeout > 0 { + options = append(options, trace.WithBatchTimeout(timeout)) + } + } + + if cfg.MaxExportBatchSize != nil && *cfg.MaxExportBatchSize > 0 { + options = append(options, trace.WithMaxExportBatchSize(*cfg.MaxExportBatchSize)) + } + + if cfg.MaxQueueSize != nil && *cfg.MaxQueueSize > 0 { + options = append(options, trace.WithMaxQueueSize(*cfg.MaxQueueSize)) + } + } + + // If no options were configured, use defaults + // Default BatchTimeout is 5s, MaxExportBatchSize is 512, MaxQueueSize is 2048 + // These are the OpenTelemetry SDK defaults + + return options +} + func (r *Runner) Name() string { return "traces" } diff --git a/internal/traces/register_test.go b/internal/traces/register_test.go new file mode 100644 index 00000000000..0052af4ae43 --- /dev/null +++ b/internal/traces/register_test.go @@ -0,0 +1,177 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +package traces + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/require" + + egv1a1 "github.com/envoyproxy/gateway/api/v1alpha1" + "github.com/envoyproxy/gateway/internal/envoygateway/config" +) + +func TestTracesRunner_New(t *testing.T) { + cfg := &config.Server{ + EnvoyGateway: &egv1a1.EnvoyGateway{ + EnvoyGatewaySpec: egv1a1.EnvoyGatewaySpec{ + Telemetry: &egv1a1.EnvoyGatewayTelemetry{ + Traces: &egv1a1.EnvoyGatewayTraces{ + Sink: egv1a1.EnvoyGatewayTraceSink{ + Type: egv1a1.TraceSinkTypeOpenTelemetry, + OpenTelemetry: &egv1a1.EnvoyGatewayOpenTelemetrySink{ + Host: "localhost", + Port: 4317, + Protocol: egv1a1.GRPCProtocol, + }, + }, + }, + }, + }, + }, + } + + runner := New(cfg) + require.NotNil(t, runner) + require.Equal(t, cfg, runner.cfg) + require.Nil(t, runner.tp) +} + +func TestTracesRunner_Close(t *testing.T) { + tests := []struct { + name string + runner *Runner + wantErr bool + }{ + { + name: "close with nil tracer provider", + runner: &Runner{ + cfg: &config.Server{}, + tp: nil, + }, + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.runner.Close() + if tt.wantErr { + require.Error(t, err) + } else { + require.NoError(t, err) + } + }) + } +} + +func TestTracesRunner_Start_ValidConfiguration(t *testing.T) { + tests := []struct { + name string + protocol string + host string + port int32 + }{ + { + name: "grpc protocol configuration", + protocol: egv1a1.GRPCProtocol, + host: "localhost", + port: 4317, + }, + { + name: "http protocol configuration", + protocol: egv1a1.HTTPProtocol, + host: "localhost", + port: 4318, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cfg := &config.Server{ + EnvoyGateway: &egv1a1.EnvoyGateway{ + EnvoyGatewaySpec: egv1a1.EnvoyGatewaySpec{ + Telemetry: &egv1a1.EnvoyGatewayTelemetry{ + Traces: &egv1a1.EnvoyGatewayTraces{ + Sink: egv1a1.EnvoyGatewayTraceSink{ + Type: egv1a1.TraceSinkTypeOpenTelemetry, + OpenTelemetry: &egv1a1.EnvoyGatewayOpenTelemetrySink{ + Host: tt.host, + Port: tt.port, + Protocol: tt.protocol, + }, + }, + }, + }, + }, + }, + } + + runner := New(cfg) + require.NotNil(t, runner) + + // Create a context with timeout + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + // Start will create the exporter + err := runner.Start(ctx) + // We don't expect an error during initialization + require.NoError(t, err) + + // Clean up + _ = runner.Close() + }) + } +} + +func TestTracesRunner_Start_Configuration(t *testing.T) { + tests := []struct { + name string + protocol string + }{ + { + name: "grpc protocol configuration", + protocol: egv1a1.GRPCProtocol, + }, + { + name: "http protocol configuration", + protocol: egv1a1.HTTPProtocol, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cfg := &config.Server{ + EnvoyGateway: &egv1a1.EnvoyGateway{ + EnvoyGatewaySpec: egv1a1.EnvoyGatewaySpec{ + Telemetry: &egv1a1.EnvoyGatewayTelemetry{ + Traces: &egv1a1.EnvoyGatewayTraces{ + Sink: egv1a1.EnvoyGatewayTraceSink{ + Type: egv1a1.TraceSinkTypeOpenTelemetry, + OpenTelemetry: &egv1a1.EnvoyGatewayOpenTelemetrySink{ + Host: "localhost", + Port: 4317, + Protocol: tt.protocol, + }, + }, + }, + }, + }, + }, + } + + runner := New(cfg) + require.NotNil(t, runner) + require.Equal(t, "traces", runner.Name()) + + // Note: We don't call Start() here because it requires a real OTLP endpoint + // This test just verifies the runner can be created with valid configuration + }) + } +} diff --git a/internal/xds/runner/runner.go b/internal/xds/runner/runner.go index 437a599330d..0f51e1b1bb6 100644 --- a/internal/xds/runner/runner.go +++ b/internal/xds/runner/runner.go @@ -234,7 +234,7 @@ func (r *Runner) translateFromSubscription(sub <-chan watchable.Snapshot[string, parentCtx = update.Value.Context } - _, span := tracer.Start(parentCtx, "Runner.subscribeAndTranslate") + _, span := tracer.Start(parentCtx, "XdsRunner.subscribeAndTranslate") defer span.End() key := update.Key diff --git a/site/content/en/latest/api/extension_types.md b/site/content/en/latest/api/extension_types.md index 35df0d93f8b..280318d7768 100644 --- a/site/content/en/latest/api/extension_types.md +++ b/site/content/en/latest/api/extension_types.md @@ -564,6 +564,23 @@ _Appears in:_ | `forwardUsernameHeader` | _string_ | false | | This field specifies the header name to forward a successfully authenticated user to
the backend. The header will be added to the request with the username as the value.
If it is not specified, the username will not be forwarded. | +#### BatchSpanProcessorConfig + + + +BatchSpanProcessorConfig defines the configuration for the OpenTelemetry batch span processor. +The batch span processor batches spans before sending them to the exporter. + +_Appears in:_ +- [EnvoyGatewayTraces](#envoygatewaytraces) + +| Field | Type | Required | Default | Description | +| --- | --- | --- | --- | --- | +| `batchTimeout` | _[Duration](https://gateway-api.sigs.k8s.io/reference/spec/#duration)_ | false | | BatchTimeout is the maximum duration for constructing a batch. Spans are
exported when either the batch is full or this timeout is reached.
Default is 5s. For e2e testing, a lower value like 100ms is recommended. | +| `maxExportBatchSize` | _integer_ | false | | MaxExportBatchSize is the maximum number of spans to export in a single batch.
Default is 512. | +| `maxQueueSize` | _integer_ | false | | MaxQueueSize is the maximum queue size to buffer spans for delayed processing.
If the queue gets full it drops the spans. Default is 2048. | + + #### BodyToExtAuth @@ -1605,15 +1622,17 @@ _Appears in:_ -EnvoyGatewayMetrics defines control plane push/pull metrics configurations. +EnvoyGatewayTraces defines control plane tracing configurations. _Appears in:_ - [EnvoyGatewayTelemetry](#envoygatewaytelemetry) | Field | Type | Required | Default | Description | | --- | --- | --- | --- | --- | -| `sink` | _[EnvoyGatewayTraceSink](#envoygatewaytracesink)_ | true | | Sink defines the metric sink where metrics are sent to. | +| `sink` | _[EnvoyGatewayTraceSink](#envoygatewaytracesink)_ | true | | Sink defines the trace sink where traces are sent to. | | `enable` | _boolean_ | true | | Disable disables the traces. | +| `samplingRate` | _float_ | false | | SamplingRate controls the rate at which traces are sampled.
Defaults to 1.0 (100% sampling). Valid values are between 0.0 and 1.0.
0.0 means no sampling, 1.0 means all traces are sampled. | +| `batchSpanProcessor` | _[BatchSpanProcessorConfig](#batchspanprocessorconfig)_ | false | | BatchSpanProcessorConfig defines the configuration for the batch span processor.
This processor batches spans before exporting them to the configured sink. | diff --git a/test/config/envoy-gateaway-config/traces-enabled.yaml b/test/config/envoy-gateaway-config/traces-enabled.yaml new file mode 100644 index 00000000000..15b0ab43b41 --- /dev/null +++ b/test/config/envoy-gateaway-config/traces-enabled.yaml @@ -0,0 +1,35 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: envoy-gateway-config + namespace: envoy-gateway-system +data: + envoy-gateway.yaml: | + apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: EnvoyGateway + provider: + type: Kubernetes + gateway: + controllerName: gateway.envoyproxy.io/gatewayclass-controller + extensionApis: + enableEnvoyPatchPolicy: true + enableBackend: true + rateLimit: + backend: + type: Redis + redis: + url: redis.redis-system.svc.cluster.local:6379 + telemetry: + traces: + sink: + type: OpenTelemetry + openTelemetry: + host: "otel-collector.monitoring.svc.cluster.local" + port: 4317 + protocol: grpc + # Fast export settings for e2e tests to ensure traces are available immediately + samplingRate: 1.0 # 100% sampling for tests + batchSpanProcessor: + batchTimeout: 100ms # Export every 100ms instead of default 5s + maxExportBatchSize: 512 + maxQueueSize: 2048 diff --git a/test/config/helm/traces-enabled.yaml b/test/config/helm/traces-enabled.yaml new file mode 100644 index 00000000000..9c611623c7a --- /dev/null +++ b/test/config/helm/traces-enabled.yaml @@ -0,0 +1,10 @@ +config: + envoyGateway: + telemetry: + traces: + sink: + type: OpenTelemetry + openTelemetry: + host: "otel-collector.monitoring.svc.cluster.local" + port: 4317 + protocol: grpc diff --git a/test/e2e/testdata/controlplane-tracing.yaml b/test/e2e/testdata/controlplane-tracing.yaml new file mode 100644 index 00000000000..20860756a08 --- /dev/null +++ b/test/e2e/testdata/controlplane-tracing.yaml @@ -0,0 +1,31 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: controlplane-tracing-test + namespace: gateway-conformance-infra +spec: + gatewayClassName: envoy-gateway + listeners: + - name: http + port: 80 + protocol: HTTP + allowedRoutes: + namespaces: + from: All +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: controlplane-tracing-test + namespace: gateway-conformance-infra +spec: + parentRefs: + - name: controlplane-tracing-test + rules: + - matches: + - path: + type: PathPrefix + value: /test + backendRefs: + - name: infra-backend-v1 + port: 8080 diff --git a/test/e2e/tests/controlplane_tracing.go b/test/e2e/tests/controlplane_tracing.go new file mode 100644 index 00000000000..98ecd421ebc --- /dev/null +++ b/test/e2e/tests/controlplane_tracing.go @@ -0,0 +1,133 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +//go:build e2e + +package tests + +import ( + "context" + "testing" + "time" + + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/wait" + gwapiv1 "sigs.k8s.io/gateway-api/apis/v1" + httputils "sigs.k8s.io/gateway-api/conformance/utils/http" + "sigs.k8s.io/gateway-api/conformance/utils/kubernetes" + "sigs.k8s.io/gateway-api/conformance/utils/suite" + "sigs.k8s.io/gateway-api/conformance/utils/tlog" + + controlplanetracing "github.com/envoyproxy/gateway/test/utils/controlplane_tracing" +) + +func init() { + ConformanceTests = append(ConformanceTests, ControlPlaneTracingTest) +} + +var ControlPlaneTracingTest = suite.ConformanceTest{ + ShortName: "ControlPlaneTracing", + Description: "Verify that control plane traces are being generated and exported to OpenTelemetry collector", + Manifests: []string{"testdata/controlplane-tracing.yaml"}, + Test: func(t *testing.T, suite *suite.ConformanceTestSuite) { + t.Run("OpenTelemetry", func(t *testing.T) { + ns := "gateway-conformance-infra" + routeNN := types.NamespacedName{Name: "controlplane-tracing-test", Namespace: ns} + gwNN := types.NamespacedName{Name: "controlplane-tracing-test", Namespace: ns} + + // Wait for gateway and route to be accepted + // This will trigger control plane operations that should generate traces + gwAddr := kubernetes.GatewayAndRoutesMustBeAccepted( + t, + suite.Client, + suite.TimeoutConfig, + suite.ControllerName, + kubernetes.NewGatewayRef(gwNN), + &gwapiv1.HTTPRoute{}, + false, + routeNN, + ) + + // Make a test request to ensure the gateway is fully operational + expectedResponse := httputils.ExpectedResponse{ + Request: httputils.Request{ + Path: "/test", + }, + Response: httputils.Response{ + StatusCodes: []int{200}, + }, + Namespace: ns, + } + httputils.MakeRequestAndExpectEventuallyConsistentResponse( + t, + suite.RoundTripper, + suite.TimeoutConfig, + gwAddr, + expectedResponse, + ) + + // Wait for traces to be exported and verify they exist + // Control plane traces should have service.name=envoy-gateway + tlog.Logf(t, "waiting for control plane traces to be exported...") + if err := wait.PollUntilContextTimeout( + context.TODO(), + 2*time.Second, + 2*time.Minute, + true, + func(ctx context.Context) (bool, error) { + // Query Tempo for control plane traces + traceCount, err := controlplanetracing.QueryControlPlaneTraces(t, suite.Client, "envoy-gateway") + if err != nil { + tlog.Logf(t, "failed to query traces from tempo: %v", err) + return false, nil + } + + tlog.Logf(t, "found %d control plane traces", traceCount) + + // We expect at least some traces from the gateway operations + if traceCount > 0 { + return true, nil + } + + return false, nil + }, + ); err != nil { + t.Errorf("failed to find control plane traces in tempo: %v", err) + } + + // Verify specific span names exist + // These span names are created by the instrumented code in the gateway + tlog.Logf(t, "verifying expected span names exist...") + expectedSpanNames := []string{ + "GatewayApiRunner.subscribeAndTranslate", + "XdsRunner.subscribeAndTranslate", + } + + if err := wait.PollUntilContextTimeout( + context.TODO(), + 2*time.Second, + 2*time.Minute, + true, + func(ctx context.Context) (bool, error) { + hasExpectedSpans, err := controlplanetracing.VerifyExpectedSpans( + t, + suite.Client, + "envoy-gateway", + expectedSpanNames, + ) + if err != nil { + tlog.Logf(t, "failed to verify expected spans: %v", err) + return false, nil + } + return hasExpectedSpans, nil + }, + ); err != nil { + t.Errorf("failed to find expected span names: %v", err) + } + + tlog.Logf(t, "control plane tracing test completed successfully") + }) + }, +} diff --git a/test/helm/gateway-addons-helm/e2e.in.yaml b/test/helm/gateway-addons-helm/e2e.in.yaml index bf913c259a9..bd554ba47dd 100644 --- a/test/helm/gateway-addons-helm/e2e.in.yaml +++ b/test/helm/gateway-addons-helm/e2e.in.yaml @@ -4,5 +4,12 @@ grafana: enabled: false opentelemetry-collector: enabled: true + mode: deployment + service: + type: LoadBalancer fluent-bit: enabled: false +tempo: + enabled: true + service: + type: LoadBalancer diff --git a/test/helm/gateway-addons-helm/e2e.out.yaml b/test/helm/gateway-addons-helm/e2e.out.yaml index 6978f3dcefc..4b4d2925337 100644 --- a/test/helm/gateway-addons-helm/e2e.out.yaml +++ b/test/helm/gateway-addons-helm/e2e.out.yaml @@ -10334,7 +10334,7 @@ metadata: app.kubernetes.io/component: standalone-collector component: standalone-collector spec: - type: ClusterIP + type: LoadBalancer ports: - name: datadog diff --git a/test/utils/controlplane_tracing/tracing.go b/test/utils/controlplane_tracing/tracing.go new file mode 100644 index 00000000000..0a4ca68071a --- /dev/null +++ b/test/utils/controlplane_tracing/tracing.go @@ -0,0 +1,232 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +package controlplanetracing + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net" + "net/http" + "net/url" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/gateway-api/conformance/utils/tlog" +) + +// getTempoHost returns the Tempo host address +// It tries to use the LoadBalancer IP first, then falls back to localhost (port-forward) +func getTempoHost(t *testing.T, c client.Client) (string, error) { + // Verify Tempo service exists + svc := corev1.Service{} + if err := c.Get(context.Background(), types.NamespacedName{ + Namespace: "monitoring", + Name: "tempo", + }, &svc); err != nil { + // Fall back to eg-addons prefix if that fails + if err := c.Get(context.Background(), types.NamespacedName{ + Namespace: "monitoring", + Name: "eg-addons-tempo", + }, &svc); err != nil { + return "", fmt.Errorf("failed to get tempo service: %w", err) + } + } + + // Try to use LoadBalancer IP if available (more reliable than port-forward) + if svc.Spec.Type == corev1.ServiceTypeLoadBalancer { + if len(svc.Status.LoadBalancer.Ingress) > 0 { + if svc.Status.LoadBalancer.Ingress[0].IP != "" { + host := svc.Status.LoadBalancer.Ingress[0].IP + tlog.Logf(t, "using Tempo at %s:3100 (via LoadBalancer)", host) + return host, nil + } + } + } + + return "", fmt.Errorf("tempo loadbalancer IP not found") +} + +// QueryControlPlaneTraces queries Tempo for control plane traces with the given service name +func QueryControlPlaneTraces(t *testing.T, c client.Client, serviceName string) (int, error) { + host, err := getTempoHost(t, c) + if err != nil { + return -1, err + } + + tempoURL := url.URL{ + Scheme: "http", + Host: net.JoinHostPort(host, "3100"), + Path: "/api/search", + } + query := tempoURL.Query() + query.Add("start", fmt.Sprintf("%d", time.Now().Add(-10*time.Minute).Unix())) + query.Add("end", fmt.Sprintf("%d", time.Now().Unix())) + query.Add("tags", fmt.Sprintf("service.name=%s", serviceName)) + tempoURL.RawQuery = query.Encode() + + req, err := http.NewRequest("GET", tempoURL.String(), nil) + if err != nil { + return -1, err + } + + tlog.Logf(t, "querying tempo: %s", tempoURL.String()) + res, err := http.DefaultClient.Do(req) + if err != nil { + return -1, err + } + defer res.Body.Close() + + if res.StatusCode != http.StatusOK { + return -1, fmt.Errorf("tempo returned status %s", res.Status) + } + + resp := &TempoResponse{} + data, err := io.ReadAll(res.Body) + if err != nil { + return -1, err + } + if err := json.Unmarshal(data, &resp); err != nil { + tlog.Logf(t, "failed to unmarshal response: %s", string(data)) + return -1, err + } + + total := len(resp.Traces) + tlog.Logf(t, "found %d traces from tempo for service %s", total, serviceName) + return total, nil +} + +// VerifyExpectedSpans checks that expected span names exist in the traces +// Note: Tempo's search API doesn't index child span names, so we need to fetch +// traces and inspect them directly +func VerifyExpectedSpans(t *testing.T, c client.Client, serviceName string, expectedSpanNames []string) (bool, error) { + host, err := getTempoHost(t, c) + if err != nil { + return false, err + } + + // First, get all traces for the service + searchURL := url.URL{ + Scheme: "http", + Host: net.JoinHostPort(host, "3100"), + Path: "/api/search", + } + query := searchURL.Query() + query.Add("start", fmt.Sprintf("%d", time.Now().Add(-10*time.Minute).Unix())) + query.Add("end", fmt.Sprintf("%d", time.Now().Unix())) + query.Add("tags", fmt.Sprintf("service.name=%s", serviceName)) + searchURL.RawQuery = query.Encode() + + req, err := http.NewRequest("GET", searchURL.String(), nil) + if err != nil { + return false, err + } + + res, err := http.DefaultClient.Do(req) + if err != nil { + return false, err + } + defer res.Body.Close() + + if res.StatusCode != http.StatusOK { + return false, fmt.Errorf("tempo search returned status %s", res.Status) + } + + searchResp := &TempoResponse{} + data, err := io.ReadAll(res.Body) + if err != nil { + return false, err + } + if err := json.Unmarshal(data, &searchResp); err != nil { + return false, err + } + + if len(searchResp.Traces) == 0 { + tlog.Logf(t, "no traces found for service %s", serviceName) + return false, nil + } + + // Now fetch each trace and check for the expected span names + foundSpans := make(map[string]bool) + for _, trace := range searchResp.Traces { + traceID := trace["traceID"].(string) + traceURL := url.URL{ + Scheme: "http", + Host: net.JoinHostPort(host, "3100"), + Path: fmt.Sprintf("/api/traces/%s", traceID), + } + + req, err := http.NewRequest("GET", traceURL.String(), nil) + if err != nil { + continue + } + + res, err := http.DefaultClient.Do(req) + if err != nil { + continue + } + + if res.StatusCode != http.StatusOK { + res.Body.Close() + continue + } + + traceData, err := io.ReadAll(res.Body) + res.Body.Close() + if err != nil { + continue + } + + // Parse the trace to find span names + var traceResp map[string]interface{} + if err := json.Unmarshal(traceData, &traceResp); err != nil { + continue + } + + // Extract span names from the trace + if batches, ok := traceResp["batches"].([]interface{}); ok { + for _, batch := range batches { + if batchMap, ok := batch.(map[string]interface{}); ok { + if scopeSpans, ok := batchMap["scopeSpans"].([]interface{}); ok { + for _, scopeSpan := range scopeSpans { + if scopeSpanMap, ok := scopeSpan.(map[string]interface{}); ok { + if spans, ok := scopeSpanMap["spans"].([]interface{}); ok { + for _, span := range spans { + if spanMap, ok := span.(map[string]interface{}); ok { + if name, ok := spanMap["name"].(string); ok { + foundSpans[name] = true + } + } + } + } + } + } + } + } + } + } + } + + // Check if all expected spans were found + for _, expectedSpan := range expectedSpanNames { + if !foundSpans[expectedSpan] { + tlog.Logf(t, "span '%s' not found yet", expectedSpan) + return false, nil + } + tlog.Logf(t, "found span '%s' in traces", expectedSpan) + } + + return true, nil +} + +// TempoResponse represents the response from Tempo's search API +type TempoResponse struct { + Traces []map[string]interface{} `json:"traces,omitempty"` +} From ff03fd52b621fe37a78c292d2849d8394153afe6 Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Sat, 1 Nov 2025 13:29:13 +0545 Subject: [PATCH 09/35] doc Signed-off-by: Shreemaan Abhishek --- site/content/en/latest/api/extension_types.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/content/en/latest/api/extension_types.md b/site/content/en/latest/api/extension_types.md index fd3752e37ae..561284edd5d 100644 --- a/site/content/en/latest/api/extension_types.md +++ b/site/content/en/latest/api/extension_types.md @@ -576,7 +576,7 @@ _Appears in:_ | Field | Type | Required | Default | Description | | --- | --- | --- | --- | --- | -| `batchTimeout` | _[Duration](https://gateway-api.sigs.k8s.io/reference/spec/#duration)_ | false | | BatchTimeout is the maximum duration for constructing a batch. Spans are
exported when either the batch is full or this timeout is reached.
Default is 5s. For e2e testing, a lower value like 100ms is recommended. | +| `batchTimeout` | _[Duration](https://gateway-api.sigs.k8s.io/reference/1.4/spec/#duration)_ | false | | BatchTimeout is the maximum duration for constructing a batch. Spans are
exported when either the batch is full or this timeout is reached.
Default is 5s. For e2e testing, a lower value like 100ms is recommended. | | `maxExportBatchSize` | _integer_ | false | | MaxExportBatchSize is the maximum number of spans to export in a single batch.
Default is 512. | | `maxQueueSize` | _integer_ | false | | MaxQueueSize is the maximum queue size to buffer spans for delayed processing.
If the queue gets full it drops the spans. Default is 2048. | From bf58e554ec5d9fde0472c1977ea312390dc6983f Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Sun, 2 Nov 2025 16:30:30 +0530 Subject: [PATCH 10/35] minor fixes Signed-off-by: Shreemaan Abhishek --- api/v1alpha1/envoygateway_helpers.go | 4 ++-- api/v1alpha1/envoygateway_metrics_types.go | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/api/v1alpha1/envoygateway_helpers.go b/api/v1alpha1/envoygateway_helpers.go index daa08581214..c77277f02a8 100644 --- a/api/v1alpha1/envoygateway_helpers.go +++ b/api/v1alpha1/envoygateway_helpers.go @@ -206,12 +206,12 @@ func (e *EnvoyGateway) GetEnvoyGatewayTelemetry() *EnvoyGatewayTelemetry { return e.Telemetry } -// DisablePrometheus returns if disable prometheus. +// DisablePrometheus returns true if prometheus is disabled. func (e *EnvoyGateway) DisablePrometheus() bool { return e.GetEnvoyGatewayTelemetry().Metrics.Prometheus.Disable } -// DisableTraces returns if disable prometheus. +// DisableTraces returns true if tracing is disabled. func (e *EnvoyGateway) DisableTraces() bool { return e.GetEnvoyGatewayTelemetry().Traces.Disable } diff --git a/api/v1alpha1/envoygateway_metrics_types.go b/api/v1alpha1/envoygateway_metrics_types.go index 3caef2eb077..7e68412113b 100644 --- a/api/v1alpha1/envoygateway_metrics_types.go +++ b/api/v1alpha1/envoygateway_metrics_types.go @@ -20,7 +20,6 @@ type EnvoyGatewayTraces struct { // Sink defines the trace sink where traces are sent to. Sink EnvoyGatewayTraceSink `json:"sink,omitempty"` // Disable disables the traces. - // TODO: implement disability Disable bool `json:"enable,omitempty"` // SamplingRate controls the rate at which traces are sampled. // Defaults to 1.0 (100% sampling). Valid values are between 0.0 and 1.0. @@ -42,7 +41,6 @@ type EnvoyGatewayTraces struct { type BatchSpanProcessorConfig struct { // BatchTimeout is the maximum duration for constructing a batch. Spans are // exported when either the batch is full or this timeout is reached. - // Default is 5s. For e2e testing, a lower value like 100ms is recommended. // // +optional BatchTimeout *gwapiv1.Duration `json:"batchTimeout,omitempty"` @@ -110,7 +108,6 @@ type EnvoyGatewayTracingSink struct { // If ExportTimeout is less than or equal to zero, 30 seconds // is used as the default. ExportTimeout *gwapiv1.Duration `json:"exportTimeout,omitempty"` - // TODO sampling rate } type EnvoyGatewayOpenTelemetrySink struct { From c0846ebb938d5a66a1cbbc02ccf97be7e83d2fff Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Sun, 2 Nov 2025 16:46:07 +0530 Subject: [PATCH 11/35] rc Signed-off-by: Shreemaan Abhishek --- api/v1alpha1/envoygateway_metrics_types.go | 84 +------------------- api/v1alpha1/envoygateway_traces_types.go | 92 ++++++++++++++++++++++ 2 files changed, 94 insertions(+), 82 deletions(-) create mode 100644 api/v1alpha1/envoygateway_traces_types.go diff --git a/api/v1alpha1/envoygateway_metrics_types.go b/api/v1alpha1/envoygateway_metrics_types.go index 7e68412113b..fe3be93a6e8 100644 --- a/api/v1alpha1/envoygateway_metrics_types.go +++ b/api/v1alpha1/envoygateway_metrics_types.go @@ -15,49 +15,6 @@ type EnvoyGatewayMetrics struct { Prometheus *EnvoyGatewayPrometheusProvider `json:"prometheus,omitempty"` } -// EnvoyGatewayTraces defines control plane tracing configurations. -type EnvoyGatewayTraces struct { - // Sink defines the trace sink where traces are sent to. - Sink EnvoyGatewayTraceSink `json:"sink,omitempty"` - // Disable disables the traces. - Disable bool `json:"enable,omitempty"` - // SamplingRate controls the rate at which traces are sampled. - // Defaults to 1.0 (100% sampling). Valid values are between 0.0 and 1.0. - // 0.0 means no sampling, 1.0 means all traces are sampled. - // - // +optional - // +kubebuilder:validation:Minimum=0.0 - // +kubebuilder:validation:Maximum=1.0 - SamplingRate *float64 `json:"samplingRate,omitempty"` - // BatchSpanProcessorConfig defines the configuration for the batch span processor. - // This processor batches spans before exporting them to the configured sink. - // - // +optional - BatchSpanProcessorConfig *BatchSpanProcessorConfig `json:"batchSpanProcessor,omitempty"` -} - -// BatchSpanProcessorConfig defines the configuration for the OpenTelemetry batch span processor. -// The batch span processor batches spans before sending them to the exporter. -type BatchSpanProcessorConfig struct { - // BatchTimeout is the maximum duration for constructing a batch. Spans are - // exported when either the batch is full or this timeout is reached. - // - // +optional - BatchTimeout *gwapiv1.Duration `json:"batchTimeout,omitempty"` - // MaxExportBatchSize is the maximum number of spans to export in a single batch. - // Default is 512. - // - // +optional - // +kubebuilder:validation:Minimum=1 - MaxExportBatchSize *int `json:"maxExportBatchSize,omitempty"` - // MaxQueueSize is the maximum queue size to buffer spans for delayed processing. - // If the queue gets full it drops the spans. Default is 2048. - // - // +optional - // +kubebuilder:validation:Minimum=1 - MaxQueueSize *int `json:"maxQueueSize,omitempty"` -} - // EnvoyGatewayMetricSink defines control plane // metric sinks where metrics are sent to. type EnvoyGatewayMetricSink struct { @@ -71,45 +28,8 @@ type EnvoyGatewayMetricSink struct { OpenTelemetry *EnvoyGatewayOpenTelemetrySink `json:"openTelemetry,omitempty"` } -// EnvoyGatewayTraceSink defines control plane -// trace sinks where traces are sent to. -type EnvoyGatewayTraceSink struct { - // Type defines the trace sink type. - // EG control plane currently supports OpenTelemetry. - // +kubebuilder:validation:Enum=OpenTelemetry - // +kubebuilder:default=OpenTelemetry - Type TraceSinkType `json:"type"` // TODO: is this even needed? - // OpenTelemetry defines the configuration for OpenTelemetry sink. - // It's required if the sink type is OpenTelemetry. - OpenTelemetry *EnvoyGatewayOpenTelemetrySink `json:"openTelemetry,omitempty"` -} - -type EnvoyGatewayTracingSink struct { - // Host define the sink service hostname. - Host string `json:"host"` - // Protocol define the sink service protocol. - // +kubebuilder:validation:Enum=grpc;http - Protocol string `json:"protocol"` - // Port defines the port the sink service is exposed on. - // - // +optional - // +kubebuilder:validation:Minimum=0 - // +kubebuilder:default=4319 - Port int32 `json:"port,omitempty"` - // ExportInterval configures the intervening time between exports for a - // Sink. This option overrides any value set for the - // OTEL_METRIC_EXPORT_INTERVAL environment variable. - // If ExportInterval is less than or equal to zero, 60 seconds - // is used as the default. - ExportInterval *gwapiv1.Duration `json:"exportInterval,omitempty"` - // ExportTimeout configures the time a Sink waits for an export to - // complete before canceling it. This option overrides any value set for the - // OTEL_METRIC_EXPORT_TIMEOUT environment variable. - // If ExportTimeout is less than or equal to zero, 30 seconds - // is used as the default. - ExportTimeout *gwapiv1.Duration `json:"exportTimeout,omitempty"` -} - +// EnvoyGatewayOpenTelemetrySink defines the configuration for OpenTelemetry sink. +// This is shared between metrics and traces. type EnvoyGatewayOpenTelemetrySink struct { // Host define the sink service hostname. Host string `json:"host"` diff --git a/api/v1alpha1/envoygateway_traces_types.go b/api/v1alpha1/envoygateway_traces_types.go new file mode 100644 index 00000000000..3e0c9d2e253 --- /dev/null +++ b/api/v1alpha1/envoygateway_traces_types.go @@ -0,0 +1,92 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +package v1alpha1 + +import gwapiv1 "sigs.k8s.io/gateway-api/apis/v1" + +// EnvoyGatewayTraces defines control plane tracing configurations. +type EnvoyGatewayTraces struct { + // Sink defines the trace sink where traces are sent to. + Sink EnvoyGatewayTraceSink `json:"sink,omitempty"` + // Disable disables the traces. + // + // +optional + Disable bool `json:"disable,omitempty"` + // SamplingRate controls the rate at which traces are sampled. + // Defaults to 1.0 (100% sampling). Valid values are between 0.0 and 1.0. + // 0.0 means no sampling, 1.0 means all traces are sampled. + // + // +optional + // +kubebuilder:validation:Minimum=0.0 + // +kubebuilder:validation:Maximum=1.0 + SamplingRate *float64 `json:"samplingRate,omitempty"` + // BatchSpanProcessorConfig defines the configuration for the batch span processor. + // This processor batches spans before exporting them to the configured sink. + // + // +optional + BatchSpanProcessorConfig *BatchSpanProcessorConfig `json:"batchSpanProcessor,omitempty"` +} + +// BatchSpanProcessorConfig defines the configuration for the OpenTelemetry batch span processor. +// The batch span processor batches spans before sending them to the exporter. +type BatchSpanProcessorConfig struct { + // BatchTimeout is the maximum duration for constructing a batch. Spans are + // exported when either the batch is full or this timeout is reached. + // + // +optional + BatchTimeout *gwapiv1.Duration `json:"batchTimeout,omitempty"` + // MaxExportBatchSize is the maximum number of spans to export in a single batch. + // Default is 512. + // + // +optional + // +kubebuilder:validation:Minimum=1 + MaxExportBatchSize *int `json:"maxExportBatchSize,omitempty"` + // MaxQueueSize is the maximum queue size to buffer spans for delayed processing. + // If the queue gets full it drops the spans. Default is 2048. + // + // +optional + // +kubebuilder:validation:Minimum=1 + MaxQueueSize *int `json:"maxQueueSize,omitempty"` +} + +// EnvoyGatewayTraceSink defines control plane +// trace sinks where traces are sent to. +type EnvoyGatewayTraceSink struct { + // Type defines the trace sink type. + // EG control plane currently supports OpenTelemetry. + // +kubebuilder:validation:Enum=OpenTelemetry + // +kubebuilder:default=OpenTelemetry + Type TraceSinkType `json:"type"` // TODO: is this even needed? + // OpenTelemetry defines the configuration for OpenTelemetry sink. + // It's required if the sink type is OpenTelemetry. + OpenTelemetry *EnvoyGatewayOpenTelemetrySink `json:"openTelemetry,omitempty"` +} + +type EnvoyGatewayTracingSink struct { + // Host define the sink service hostname. + Host string `json:"host"` + // Protocol define the sink service protocol. + // +kubebuilder:validation:Enum=grpc;http + Protocol string `json:"protocol"` + // Port defines the port the sink service is exposed on. + // + // +optional + // +kubebuilder:validation:Minimum=0 + // +kubebuilder:default=4319 + Port int32 `json:"port,omitempty"` + // ExportInterval configures the intervening time between exports for a + // Sink. This option overrides any value set for the + // OTEL_METRIC_EXPORT_INTERVAL environment variable. + // If ExportInterval is less than or equal to zero, 60 seconds + // is used as the default. + ExportInterval *gwapiv1.Duration `json:"exportInterval,omitempty"` + // ExportTimeout configures the time a Sink waits for an export to + // complete before canceling it. This option overrides any value set for the + // OTEL_METRIC_EXPORT_TIMEOUT environment variable. + // If ExportTimeout is less than or equal to zero, 30 seconds + // is used as the default. + ExportTimeout *gwapiv1.Duration `json:"exportTimeout,omitempty"` +} From 1c5e52e37b020a12de35c4f18032e6d9105a49c2 Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Sun, 2 Nov 2025 16:52:11 +0530 Subject: [PATCH 12/35] generate Signed-off-by: Shreemaan Abhishek --- site/content/en/latest/api/extension_types.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/site/content/en/latest/api/extension_types.md b/site/content/en/latest/api/extension_types.md index 561284edd5d..e1b8f8da9b5 100644 --- a/site/content/en/latest/api/extension_types.md +++ b/site/content/en/latest/api/extension_types.md @@ -576,7 +576,7 @@ _Appears in:_ | Field | Type | Required | Default | Description | | --- | --- | --- | --- | --- | -| `batchTimeout` | _[Duration](https://gateway-api.sigs.k8s.io/reference/1.4/spec/#duration)_ | false | | BatchTimeout is the maximum duration for constructing a batch. Spans are
exported when either the batch is full or this timeout is reached.
Default is 5s. For e2e testing, a lower value like 100ms is recommended. | +| `batchTimeout` | _[Duration](https://gateway-api.sigs.k8s.io/reference/1.4/spec/#duration)_ | false | | BatchTimeout is the maximum duration for constructing a batch. Spans are
exported when either the batch is full or this timeout is reached. | | `maxExportBatchSize` | _integer_ | false | | MaxExportBatchSize is the maximum number of spans to export in a single batch.
Default is 512. | | `maxQueueSize` | _integer_ | false | | MaxQueueSize is the maximum queue size to buffer spans for delayed processing.
If the queue gets full it drops the spans. Default is 2048. | @@ -1490,7 +1490,8 @@ _Appears in:_ - +EnvoyGatewayOpenTelemetrySink defines the configuration for OpenTelemetry sink. +This is shared between metrics and traces. _Appears in:_ - [EnvoyGatewayMetricSink](#envoygatewaymetricsink) @@ -1633,7 +1634,7 @@ _Appears in:_ | Field | Type | Required | Default | Description | | --- | --- | --- | --- | --- | | `sink` | _[EnvoyGatewayTraceSink](#envoygatewaytracesink)_ | true | | Sink defines the trace sink where traces are sent to. | -| `enable` | _boolean_ | true | | Disable disables the traces. | +| `disable` | _boolean_ | false | | Disable disables the traces. | | `samplingRate` | _float_ | false | | SamplingRate controls the rate at which traces are sampled.
Defaults to 1.0 (100% sampling). Valid values are between 0.0 and 1.0.
0.0 means no sampling, 1.0 means all traces are sampled. | | `batchSpanProcessor` | _[BatchSpanProcessorConfig](#batchspanprocessorconfig)_ | false | | BatchSpanProcessorConfig defines the configuration for the batch span processor.
This processor batches spans before exporting them to the configured sink. | From 623406e3f297388f2f944443eb4ef9fc2cdd96a0 Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Sun, 2 Nov 2025 17:10:27 +0530 Subject: [PATCH 13/35] fix coverage tests Signed-off-by: Shreemaan Abhishek --- internal/cmd/server.go | 3 +-- internal/xds/runner/runner_test.go | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/internal/cmd/server.go b/internal/cmd/server.go index 812cb7b6cf9..2cf0a4d5b2b 100644 --- a/internal/cmd/server.go +++ b/internal/cmd/server.go @@ -215,8 +215,7 @@ func startRunners(ctx context.Context, cfg *config.Server) (err error) { runner: metrics.New(cfg), }, { - // Start the Metrics Server - // It provides metrics endpoints for monitoring. + // Start the Traces Server runner: traces.New(cfg), }, } diff --git a/internal/xds/runner/runner_test.go b/internal/xds/runner/runner_test.go index 8fc9cab32cb..740738bf6dd 100644 --- a/internal/xds/runner/runner_test.go +++ b/internal/xds/runner/runner_test.go @@ -271,7 +271,7 @@ func TestRunner(t *testing.T) { }() // xDS is nil at start - require.Equal(t, map[string]*ir.Xds{}, xdsIR.LoadAll()) + require.Equal(t, map[string]*message.XdsIRWithContext{}, xdsIR.LoadAll()) // test translation path := "example" @@ -364,7 +364,7 @@ func TestRunner_withExtensionManager_FailOpen(t *testing.T) { }() // xDS is nil at start - require.Equal(t, map[string]*ir.Xds{}, xdsIR.LoadAll()) + require.Equal(t, map[string]*message.XdsIRWithContext{}, xdsIR.LoadAll()) // test translation path := "example" @@ -450,7 +450,7 @@ func TestRunner_withExtensionManager_FailClosed(t *testing.T) { }() // xDS is nil at start - require.Equal(t, map[string]*ir.Xds{}, xdsIR.LoadAll()) + require.Equal(t, map[string]*message.XdsIRWithContext{}, xdsIR.LoadAll()) // test translation path := "example" From fd41925abde7e479b1045924796f319599fa2bc9 Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Sun, 2 Nov 2025 11:50:39 +0000 Subject: [PATCH 14/35] fix: e2e config Signed-off-by: Shreemaan Abhishek --- test/config/helm/traces-enabled.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/config/helm/traces-enabled.yaml b/test/config/helm/traces-enabled.yaml index 9c611623c7a..e6242694405 100644 --- a/test/config/helm/traces-enabled.yaml +++ b/test/config/helm/traces-enabled.yaml @@ -8,3 +8,9 @@ config: host: "otel-collector.monitoring.svc.cluster.local" port: 4317 protocol: grpc + # Fast export settings for e2e tests to ensure traces are available immediately + samplingRate: 1.0 # 100% sampling for tests + batchSpanProcessor: + batchTimeout: 100ms # Export every 100ms instead of default 5s + maxExportBatchSize: 512 + maxQueueSize: 2048 From 8392b417cdf50143e635c1cc38ae05181e373d59 Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Sun, 2 Nov 2025 17:26:28 +0530 Subject: [PATCH 15/35] runner test fix Signed-off-by: Shreemaan Abhishek --- internal/gatewayapi/runner/runner_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/gatewayapi/runner/runner_test.go b/internal/gatewayapi/runner/runner_test.go index fc1093a431a..50d8f73664d 100644 --- a/internal/gatewayapi/runner/runner_test.go +++ b/internal/gatewayapi/runner/runner_test.go @@ -51,7 +51,7 @@ func TestRunner(t *testing.T) { require.NoError(t, err) // IR is nil at start - require.Equal(t, map[string]*ir.Xds{}, xdsIR.LoadAll()) + require.Equal(t, map[string]*message.XdsIRWithContext{}, xdsIR.LoadAll()) require.Equal(t, map[string]*ir.Infra{}, infraIR.LoadAll()) // TODO: pass valid provider resources @@ -64,7 +64,7 @@ func TestRunner(t *testing.T) { return false } // Ensure ir is empty - return (reflect.DeepEqual(xdsIR.LoadAll(), map[string]*ir.Xds{})) && (reflect.DeepEqual(infraIR.LoadAll(), map[string]*ir.Infra{})) + return (reflect.DeepEqual(xdsIR.LoadAll(), map[string]*message.XdsIRWithContext{})) && (reflect.DeepEqual(infraIR.LoadAll(), map[string]*ir.Infra{})) }, time.Second*1, time.Millisecond*20) } From fcadab22abf113091d173b307e5e847cbaf72d5b Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Tue, 4 Nov 2025 10:38:22 +0530 Subject: [PATCH 16/35] logger Signed-off-by: Shreemaan Abhishek --- internal/gatewayapi/runner/runner.go | 29 ++++---- internal/logging/log.go | 24 +++++++ internal/logging/log_test.go | 24 +++++++ internal/provider/kubernetes/controller.go | 79 +++++++++++----------- internal/xds/runner/runner.go | 21 +++--- internal/xds/runner/runner_test.go | 22 ++++-- 6 files changed, 130 insertions(+), 69 deletions(-) diff --git a/internal/gatewayapi/runner/runner.go b/internal/gatewayapi/runner/runner.go index fd39517279b..06011be66fe 100644 --- a/internal/gatewayapi/runner/runner.go +++ b/internal/gatewayapi/runner/runner.go @@ -129,16 +129,17 @@ func (r *Runner) startWasmCache(ctx context.Context) { func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *resource.ControllerResourcesContext]) { message.HandleSubscription(message.Metadata{Runner: r.Name(), Message: message.ProviderResourcesMessageName}, sub, func(update message.Update[string, *resource.ControllerResourcesContext], errChan chan error) { - r.Logger.Info("received an update", "key", update.Key) parentCtx := context.Background() if update.Value != nil && update.Value.Context != nil { parentCtx = update.Value.Context } + traceLogger := r.Logger.WithTrace(parentCtx) + + traceLogger.Info("received an update", "key", update.Key) + _, span := tracer.Start(parentCtx, "GatewayApiRunner.subscribeAndTranslate") defer span.End() - - r.Logger.Info("received an update") valWrapper := update.Value // There is only 1 key which is the controller name // so when a delete is triggered, delete all keys @@ -181,7 +182,7 @@ func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *re MergeGateways: gatewayapi.IsMergeGatewaysEnabled(resources), WasmCache: r.wasmCache, ListenerPortShiftDisabled: r.EnvoyGateway.Provider != nil && r.EnvoyGateway.Provider.IsRunningOnHost(), - Logger: r.Logger, + Logger: traceLogger, } // If an extension is loaded, pass its supported groups/kinds to the translator @@ -195,24 +196,24 @@ func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *re extGKs = append(extGKs, schema.GroupKind{Group: gvk.Group, Kind: gvk.Kind}) } t.ExtensionGroupKinds = extGKs - r.Logger.Info("extension resources", "GVKs count", len(extGKs)) + traceLogger.Info("extension resources", "GVKs count", len(extGKs)) } // Translate to IR result, err := t.Translate(resources) if err != nil { // Currently all errors that Translate returns should just be logged - r.Logger.Error(err, "errors detected during translation", "gateway-class", resources.GatewayClass.Name) + traceLogger.Error(err, "errors detected during translation", "gateway-class", resources.GatewayClass.Name) } // Publish the IRs. // Also validate the ir before sending it. for key, val := range result.InfraIR { - logger := r.Logger.V(1).WithValues(string(message.InfraIRMessageName), key) - if logger.Enabled() { - logger.Info(val.JSONString()) + logV := traceLogger.V(1).WithValues(string(message.InfraIRMessageName), key) + if logV.Enabled() { + logV.Info(val.JSONString()) } if err := val.Validate(); err != nil { - r.Logger.Error(err, "unable to validate infra ir, skipped sending it") + traceLogger.Error(err, "unable to validate infra ir, skipped sending it") errChan <- err } else { r.InfraIR.Store(key, val) @@ -224,12 +225,12 @@ func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *re } for key, val := range result.XdsIR { - logger := r.Logger.V(1).WithValues(string(message.XDSIRMessageName), key) - if logger.Enabled() { - logger.Info(val.JSONString()) + logV := traceLogger.V(1).WithValues(string(message.XDSIRMessageName), key) + if logV.Enabled() { + logV.Info(val.JSONString()) } if err := val.Validate(); err != nil { - r.Logger.Error(err, "unable to validate xds ir, skipped sending it") + traceLogger.Error(err, "unable to validate xds ir, skipped sending it") errChan <- err } else { m := message.XdsIRWithContext{ diff --git a/internal/logging/log.go b/internal/logging/log.go index bdf2f3515c6..2cb6a3eac95 100644 --- a/internal/logging/log.go +++ b/internal/logging/log.go @@ -6,11 +6,13 @@ package logging import ( + "context" "io" "os" "github.com/go-logr/logr" "github.com/go-logr/zapr" + "go.opentelemetry.io/otel/trace" "go.uber.org/zap" "go.uber.org/zap/zapcore" @@ -88,6 +90,28 @@ func (l Logger) WithValues(keysAndValues ...interface{}) Logger { return l } +// WithTrace returns a new Logger that includes basic OpenTelemetry metadata +// extracted from the provided context. If the context does not contain a valid +// span, the original Logger is returned unchanged. +func (l Logger) WithTrace(ctx context.Context) Logger { + sc := trace.SpanContextFromContext(ctx) + if !sc.IsValid() { + return l + } + + fields := []interface{}{ + "trace_id", sc.TraceID().String(), + "span_id", sc.SpanID().String(), + "trace_flags", sc.TraceFlags().String(), + } + + if ts := sc.TraceState(); ts.Len() > 0 { + fields = append(fields, "trace_state", ts.String()) + } + + return l.WithValues(fields...) +} + // A Sugar wraps the base Logger functionality in a slower, but less // verbose, API. Any Logger can be converted to a SugaredLogger with its Sugar // method. diff --git a/internal/logging/log_test.go b/internal/logging/log_test.go index a5f75e43816..cb133eb53d3 100644 --- a/internal/logging/log_test.go +++ b/internal/logging/log_test.go @@ -6,12 +6,15 @@ package logging import ( + "bytes" + "context" "errors" "os" "testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/trace" "go.uber.org/zap" "go.uber.org/zap/zapcore" @@ -104,3 +107,24 @@ func TestLoggerSugarName(t *testing.T) { capturedOutput := string(outputBytes) assert.Contains(t, capturedOutput, "debugging message", logName) } + +func TestLoggerWithTrace(t *testing.T) { + buffer := &bytes.Buffer{} + logger := NewLogger(buffer, egv1a1.DefaultEnvoyGatewayLogging()) + + traceID := trace.TraceID{0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xb0, 0x0c, 0x12, 0x34, 0x56, 0x78} + spanID := trace.SpanID{0xba, 0xad, 0xf0, 0x0d, 0xfe, 0xed, 0xfa, 0xce} + sc := trace.NewSpanContext(trace.SpanContextConfig{ + TraceID: traceID, + SpanID: spanID, + TraceFlags: trace.FlagsSampled, + }) + ctx := trace.ContextWithSpanContext(context.Background(), sc) + + logger.WithTrace(ctx).Info("hello tracing") + + output := buffer.String() + assert.Contains(t, output, traceID.String()) + assert.Contains(t, output, spanID.String()) + assert.Contains(t, output, trace.FlagsSampled.String()) +} diff --git a/internal/provider/kubernetes/controller.go b/internal/provider/kubernetes/controller.go index 8d2c6ae61a7..f65527d566a 100644 --- a/internal/provider/kubernetes/controller.go +++ b/internal/provider/kubernetes/controller.go @@ -301,11 +301,12 @@ func isTransientError(err error) bool { func (r *gatewayAPIReconciler) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) { ctx, span := tracer.Start(ctx, "GatewayAPIReconciler.Reconcile") defer span.End() + logger := r.log.WithTrace(ctx) var ( managedGCs []*gwapiv1.GatewayClass err error ) - r.log.Info("reconciling gateways") + logger.Info("reconciling gateways") // Get the GatewayClasses managed by the Envoy Gateway Controller. managedGCs, err = r.managedGatewayClasses(ctx) @@ -320,7 +321,7 @@ func (r *gatewayAPIReconciler) Reconcile(ctx context.Context, _ reconcile.Reques defer func() { for _, key := range gcStatusToDelete.UnsortedList() { - r.log.Info("delete from GatewayClass statuses", "key", key) + logger.Info("delete from GatewayClass statuses", "key", key) r.resources.GatewayClassStatuses.Delete(key) } }() @@ -328,7 +329,7 @@ func (r *gatewayAPIReconciler) Reconcile(ctx context.Context, _ reconcile.Reques // The gatewayclass was already deleted/finalized and there are stale queue entries. if managedGCs == nil { r.resources.GatewayAPIResources.Delete(string(r.classController)) - r.log.Info("no accepted gatewayclass") + logger.Info("no accepted gatewayclass") return reconcile.Result{}, nil } @@ -346,18 +347,18 @@ func (r *gatewayAPIReconciler) Reconcile(ctx context.Context, _ reconcile.Reques gwcResource.GatewayClass = managedGC gwcResourceMapping := newResourceMapping() - logger := r.log.WithValues("GatewayClass", managedGC.Name) + gcLogger := logger.WithValues("GatewayClass", managedGC.Name) // Process the parametersRef of the accepted GatewayClass. // This should run before processGateways and processBackendRefs failToProcessGCParamsRef := false if managedGC.Spec.ParametersRef != nil && managedGC.DeletionTimestamp == nil { if err := r.processGatewayClassParamsRef(ctx, managedGC, gwcResourceMapping, gwcResource); err != nil { if isTransientError(err) { - logger.Error(err, "transient error processing parametersRef for GatewayClass") + gcLogger.Error(err, "transient error processing parametersRef for GatewayClass") return reconcile.Result{}, err } - logger.Error(err, "failed to process ParametersRef for GatewayClass") + gcLogger.Error(err, "failed to process ParametersRef for GatewayClass") msg := fmt.Sprintf("%s: %v", status.MsgGatewayClassInvalidParams, err) status.SetGatewayClassAccepted( managedGC, @@ -376,11 +377,11 @@ func (r *gatewayAPIReconciler) Reconcile(ctx context.Context, _ reconcile.Reques // process envoy gateway secret refs if err := r.processEnvoyProxySecretRef(ctx, gwcResource); err != nil { if isTransientError(err) { - logger.Error(err, "transient error processing TLS SecretRef for EnvoyProxy") + gcLogger.Error(err, "transient error processing TLS SecretRef for EnvoyProxy") return reconcile.Result{}, err } - r.log.Error(err, "failed to process TLS SecretRef for EnvoyProxy for GatewayClass") + gcLogger.Error(err, "failed to process TLS SecretRef for EnvoyProxy for GatewayClass") status.SetGatewayClassAccepted( managedGC, false, @@ -396,7 +397,7 @@ func (r *gatewayAPIReconciler) Reconcile(ctx context.Context, _ reconcile.Reques if !failToProcessGCParamsRef { // GatewayClass is valid so far, mark it as accepted. - logger.V(6).Info("Set GatewayClass Accepted") + gcLogger.V(6).Info("Set GatewayClass Accepted") status.SetGatewayClassAccepted( managedGC, true, @@ -412,38 +413,38 @@ func (r *gatewayAPIReconciler) Reconcile(ctx context.Context, _ reconcile.Reques // add the OIDC HMAC Secret to the resourceTree if err = r.processOIDCHMACSecret(ctx, gwcResource, gwcResourceMapping); err != nil { if isTransientError(err) { - logger.Error(err, "transient error processing OIDC HMAC Secret") + gcLogger.Error(err, "transient error processing OIDC HMAC Secret") return reconcile.Result{}, err } - logger.Error(err, "failed to process OIDC HMAC Secret for GatewayClass") + gcLogger.Error(err, "failed to process OIDC HMAC Secret for GatewayClass") } // add the Envoy TLS Secret to the resourceTree if err = r.processEnvoyTLSSecret(ctx, gwcResource, gwcResourceMapping); err != nil { if isTransientError(err) { - logger.Error(err, "transient error processing Envoy TLS Secret") + gcLogger.Error(err, "transient error processing Envoy TLS Secret") return reconcile.Result{}, err } - logger.Error(err, "failed to process EnvoyTLSSecret") + gcLogger.Error(err, "failed to process EnvoyTLSSecret") } // Add all Gateways, their associated Routes, and referenced resources to the resourceTree if err = r.processGateways(ctx, managedGC, gwcResourceMapping, gwcResource); err != nil { if isTransientError(err) { - logger.Error(err, "transient error processing gateways") + gcLogger.Error(err, "transient error processing gateways") return reconcile.Result{}, err } - logger.Error(err, "failed process gateways for GatewayClass") + gcLogger.Error(err, "failed process gateways for GatewayClass") } if r.eppCRDExists { // Add all EnvoyPatchPolicies to the resourceTree if err = r.processEnvoyPatchPolicies(ctx, gwcResource, gwcResourceMapping); err != nil { if isTransientError(err) { - logger.Error(err, "transient error processing EnvoyPatchPolicies") + gcLogger.Error(err, "transient error processing EnvoyPatchPolicies") return reconcile.Result{}, err } - logger.Error(err, "failed to process EnvoyPatchPolicies for GatewayClass") + gcLogger.Error(err, "failed to process EnvoyPatchPolicies for GatewayClass") } } @@ -451,10 +452,10 @@ func (r *gatewayAPIReconciler) Reconcile(ctx context.Context, _ reconcile.Reques // Add all ClientTrafficPolicies and their referenced resources to the resourceTree if err = r.processClientTrafficPolicies(ctx, gwcResource, gwcResourceMapping); err != nil { if isTransientError(err) { - logger.Error(err, "transient error processing ClientTrafficPolicies") + gcLogger.Error(err, "transient error processing ClientTrafficPolicies") return reconcile.Result{}, err } - logger.Error(err, "failed process to ClientTrafficPolicies for GatewayClass") + gcLogger.Error(err, "failed process to ClientTrafficPolicies for GatewayClass") } } @@ -462,10 +463,10 @@ func (r *gatewayAPIReconciler) Reconcile(ctx context.Context, _ reconcile.Reques // Add all BackendTrafficPolicies to the resourceTree if err = r.processBackendTrafficPolicies(ctx, gwcResource, gwcResourceMapping); err != nil { if isTransientError(err) { - logger.Error(err, "transient error processing BackendTrafficPolicies") + gcLogger.Error(err, "transient error processing BackendTrafficPolicies") return reconcile.Result{}, err } - logger.Error(err, "failed to process BackendTrafficPolicies for GatewayClass") + gcLogger.Error(err, "failed to process BackendTrafficPolicies for GatewayClass") } } @@ -473,10 +474,10 @@ func (r *gatewayAPIReconciler) Reconcile(ctx context.Context, _ reconcile.Reques // Add all SecurityPolicies and their referenced resources to the resourceTree if err = r.processSecurityPolicies(ctx, gwcResource, gwcResourceMapping); err != nil { if isTransientError(err) { - logger.Error(err, "transient error processing SecurityPolicies") + gcLogger.Error(err, "transient error processing SecurityPolicies") return reconcile.Result{}, err } - logger.Error(err, "failed to process SecurityPolicies for GatewayClass") + gcLogger.Error(err, "failed to process SecurityPolicies for GatewayClass") } } @@ -484,10 +485,10 @@ func (r *gatewayAPIReconciler) Reconcile(ctx context.Context, _ reconcile.Reques // Add all BackendTLSPolies to the resourceTree if err = r.processBackendTLSPolicies(ctx, gwcResource, gwcResourceMapping); err != nil { if isTransientError(err) { - logger.Error(err, "transient error processing BackendTLSPolicies") + gcLogger.Error(err, "transient error processing BackendTLSPolicies") return reconcile.Result{}, err } - logger.Error(err, "failed to process BackendTLSPolicies for GatewayClass") + gcLogger.Error(err, "failed to process BackendTLSPolicies for GatewayClass") } } @@ -495,19 +496,19 @@ func (r *gatewayAPIReconciler) Reconcile(ctx context.Context, _ reconcile.Reques // Add all EnvoyExtensionPolicies and their referenced resources to the resourceTree if err = r.processEnvoyExtensionPolicies(ctx, gwcResource, gwcResourceMapping); err != nil { if isTransientError(err) { - logger.Error(err, "transient error processing EnvoyExtensionPolicies") + gcLogger.Error(err, "transient error processing EnvoyExtensionPolicies") return reconcile.Result{}, err } - logger.Error(err, "failed to process EnvoyExtensionPolicies for GatewayClass") + gcLogger.Error(err, "failed to process EnvoyExtensionPolicies for GatewayClass") } } if err = r.processExtensionServerPolicies(ctx, gwcResource); err != nil { if isTransientError(err) { - logger.Error(err, "transient error processing ExtensionServerPolicies") + gcLogger.Error(err, "transient error processing ExtensionServerPolicies") return reconcile.Result{}, err } - logger.Error(err, "failed to process ExtensionServerPolicies for GatewayClass") + gcLogger.Error(err, "failed to process ExtensionServerPolicies for GatewayClass") } // Add the referenced services, ServiceImports, and EndpointSlices in @@ -515,11 +516,11 @@ func (r *gatewayAPIReconciler) Reconcile(ctx context.Context, _ reconcile.Reques // BackendRefs are referred by various Route objects and the ExtAuth in SecurityPolicies. if err = r.processBackendRefs(ctx, gwcResource, gwcResourceMapping); err != nil { if isTransientError(err) { - logger.Error(err, "transient error processing BackendRefs") + gcLogger.Error(err, "transient error processing BackendRefs") return reconcile.Result{}, err } - logger.Error(err, "failed to process BackendRefs for GatewayClass") + gcLogger.Error(err, "failed to process BackendRefs for GatewayClass") } // For this particular Gateway, and all associated objects, check whether the @@ -528,10 +529,10 @@ func (r *gatewayAPIReconciler) Reconcile(ctx context.Context, _ reconcile.Reques namespace, err := r.getNamespace(ctx, ns) if err != nil { if isTransientError(err) { - logger.Error(err, "transient error getting namespace", "namespace", ns) + gcLogger.Error(err, "transient error getting namespace", "namespace", ns) return reconcile.Result{}, err } - logger.Error(err, "unable to find the namespace", "namespace", ns) + gcLogger.Error(err, "unable to find the namespace", "namespace", ns) if kerrors.IsNotFound(err) { continue } @@ -546,24 +547,24 @@ func (r *gatewayAPIReconciler) Reconcile(ctx context.Context, _ reconcile.Reques } if len(gwcResource.Gateways) == 0 { - logger.Info("No gateways found for accepted GatewayClass") + gcLogger.Info("No gateways found for accepted GatewayClass") // If needed, remove the finalizer from the accepted GatewayClass. if err := r.removeFinalizer(ctx, managedGC); err != nil { if isTransientError(err) { - logger.Error(err, "transient error removing finalizer from GatewayClass") + gcLogger.Error(err, "transient error removing finalizer from GatewayClass") return reconcile.Result{}, err } - logger.Error(err, "failed to remove finalizer from GatewayClass") + gcLogger.Error(err, "failed to remove finalizer from GatewayClass") } } else { // finalize the accepted GatewayClass. if err := r.addFinalizer(ctx, managedGC); err != nil { if isTransientError(err) { - logger.Error(err, "transient error adding finalizer to gatewayClass") + gcLogger.Error(err, "transient error adding finalizer to gatewayClass") return reconcile.Result{}, err } - logger.Error(err, "failed adding finalizer to gatewayClass") + gcLogger.Error(err, "failed adding finalizer to gatewayClass") } } } @@ -589,7 +590,7 @@ func (r *gatewayAPIReconciler) Reconcile(ctx context.Context, _ reconcile.Reques Message: message.ProviderResourcesMessageName, }, 1) - r.log.Info("reconciled gateways successfully") + logger.Info("reconciled gateways successfully") return reconcile.Result{}, nil } diff --git a/internal/xds/runner/runner.go b/internal/xds/runner/runner.go index 2ffd38ac26b..606d01e6f3f 100644 --- a/internal/xds/runner/runner.go +++ b/internal/xds/runner/runner.go @@ -260,13 +260,14 @@ func (r *Runner) translateFromSubscription(sub <-chan watchable.Snapshot[string, // Subscribe to resources message.HandleSubscription(message.Metadata{Runner: r.Name(), Message: message.XDSIRMessageName}, sub, func(update message.Update[string, *message.XdsIRWithContext], errChan chan error) { - r.Logger.Info("received an update") - parentCtx := context.Background() if update.Value != nil && update.Value.Context != nil { parentCtx = update.Value.Context } + traceLogger := r.Logger.WithTrace(parentCtx) + traceLogger.Info("received an update") + _, span := tracer.Start(parentCtx, "XdsRunner.subscribeAndTranslate") defer span.End() @@ -281,7 +282,7 @@ func (r *Runner) translateFromSubscription(sub <-chan watchable.Snapshot[string, if update.Delete { if err := r.cache.GenerateNewSnapshot(key, nil); err != nil { - r.Logger.Error(err, "failed to delete the snapshot") + traceLogger.Error(err, "failed to delete the snapshot") errChan <- err } } else { @@ -290,7 +291,7 @@ func (r *Runner) translateFromSubscription(sub <-chan watchable.Snapshot[string, ControllerNamespace: r.ControllerNamespace, FilterOrder: val.XdsIR.FilterOrder, RuntimeFlags: r.EnvoyGateway.RuntimeFlags, - Logger: r.Logger, + Logger: traceLogger, } // Set the extension manager if an extension is loaded @@ -307,7 +308,7 @@ func (r *Runner) translateFromSubscription(sub <-chan watchable.Snapshot[string, if r.EnvoyGateway.RateLimit.Timeout != nil { d, err := time.ParseDuration(string(*r.EnvoyGateway.RateLimit.Timeout)) if err != nil { - r.Logger.Error(err, "invalid rateLimit timeout") + traceLogger.Error(err, "invalid rateLimit timeout") errChan <- err } else { t.GlobalRateLimit.Timeout = d @@ -317,14 +318,14 @@ func (r *Runner) translateFromSubscription(sub <-chan watchable.Snapshot[string, result, err := t.Translate(val.XdsIR) if err != nil { - r.Logger.Error(err, "failed to translate xds ir") + traceLogger.Error(err, "failed to translate xds ir") errChan <- err } // xDS translation is done in a best-effort manner, so the result // may contain partial resources even if there are errors. if result == nil { - r.Logger.Info("no xds resources to publish") + traceLogger.Info("no xds resources to publish") return } @@ -357,17 +358,17 @@ func (r *Runner) translateFromSubscription(sub <-chan watchable.Snapshot[string, if err == nil { if result.XdsResources != nil { if r.cache == nil { - r.Logger.Error(err, "failed to init snapshot cache") + traceLogger.Error(err, "failed to init snapshot cache") errChan <- err } else { // Update snapshot cache if err := r.cache.GenerateNewSnapshot(key, result.XdsResources); err != nil { - r.Logger.Error(err, "failed to generate a snapshot") + traceLogger.Error(err, "failed to generate a snapshot") errChan <- err } } } else { - r.Logger.Error(err, "skipped publishing xds resources") + traceLogger.Error(err, "skipped publishing xds resources") } } diff --git a/internal/xds/runner/runner_test.go b/internal/xds/runner/runner_test.go index 740738bf6dd..4513968b3a0 100644 --- a/internal/xds/runner/runner_test.go +++ b/internal/xds/runner/runner_test.go @@ -34,8 +34,18 @@ import ( "github.com/envoyproxy/gateway/internal/ir" "github.com/envoyproxy/gateway/internal/message" "github.com/envoyproxy/gateway/internal/xds/bootstrap" + "go.opentelemetry.io/otel/trace" ) +func newTestTraceContext() context.Context { + sc := trace.NewSpanContext(trace.SpanContextConfig{ + TraceID: trace.TraceID{0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x0}, + SpanID: trace.SpanID{0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x0, 0x1}, + TraceFlags: trace.FlagsSampled, + }) + return trace.ContextWithSpanContext(context.Background(), sc) +} + func TestTLSConfig(t *testing.T) { // Create trusted CA, server and client certs. trustedCACert := certyaml.Certificate{ @@ -259,7 +269,7 @@ func TestRunner(t *testing.T) { TLSCaPath: caFile, }) - ctx, cancel := context.WithCancel(context.Background()) + ctx, cancel := context.WithCancel(newTestTraceContext()) defer cancel() // Start @@ -310,7 +320,7 @@ func TestRunner(t *testing.T) { } m := message.XdsIRWithContext{ XdsIR: &res, - Context: context.Background(), + Context: newTestTraceContext(), } xdsIR.Store("test", &m) require.Eventually(t, func() bool { @@ -352,7 +362,7 @@ func TestRunner_withExtensionManager_FailOpen(t *testing.T) { TLSCaPath: caFile, }) - ctx, cancel := context.WithCancel(context.Background()) + ctx, cancel := context.WithCancel(newTestTraceContext()) defer cancel() // Start @@ -403,7 +413,7 @@ func TestRunner_withExtensionManager_FailOpen(t *testing.T) { } m := message.XdsIRWithContext{ XdsIR: &res, - Context: context.Background(), + Context: newTestTraceContext(), } xdsIR.Store("test", &m) require.Eventually(t, func() bool { @@ -438,7 +448,7 @@ func TestRunner_withExtensionManager_FailClosed(t *testing.T) { TLSCaPath: caFile, }) - ctx, cancel := context.WithCancel(context.Background()) + ctx, cancel := context.WithCancel(newTestTraceContext()) defer cancel() // Start @@ -489,7 +499,7 @@ func TestRunner_withExtensionManager_FailClosed(t *testing.T) { } m := message.XdsIRWithContext{ XdsIR: &res, - Context: context.Background(), + Context: newTestTraceContext(), } xdsIR.Store("test", &m) require.Never(t, func() bool { From afc0812853ce534ed22667a07090114235aaa206 Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Tue, 4 Nov 2025 10:40:25 +0530 Subject: [PATCH 17/35] mod Signed-off-by: Shreemaan Abhishek --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 58fb5f9da42..b3764015cb9 100644 --- a/go.mod +++ b/go.mod @@ -61,6 +61,7 @@ require ( go.opentelemetry.io/otel/metric v1.38.0 go.opentelemetry.io/otel/sdk v1.38.0 go.opentelemetry.io/otel/sdk/metric v1.38.0 + go.opentelemetry.io/otel/trace v1.38.0 go.opentelemetry.io/proto/otlp v1.8.0 go.uber.org/zap v1.27.0 golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 @@ -291,7 +292,6 @@ require ( go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.61.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.37.0 // indirect - go.opentelemetry.io/otel/trace v1.38.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect From 0480ec5744012c63bb45aab325e143ca325fc2c0 Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Tue, 4 Nov 2025 10:55:30 +0530 Subject: [PATCH 18/35] mod Signed-off-by: Shreemaan Abhishek --- go.mod | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/go.mod b/go.mod index b3764015cb9..cbc9496d587 100644 --- a/go.mod +++ b/go.mod @@ -54,8 +54,6 @@ require ( go.opentelemetry.io/otel v1.38.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.38.0 - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.35.0 go.opentelemetry.io/otel/exporters/prometheus v0.60.0 go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.38.0 go.opentelemetry.io/otel/metric v1.38.0 @@ -292,6 +290,7 @@ require ( go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.61.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.37.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect From 8b834a9e1b067fea1aeba3dc44fe4c6edc5a536f Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Tue, 4 Nov 2025 10:56:15 +0530 Subject: [PATCH 19/35] trace config cleanup Signed-off-by: Shreemaan Abhishek --- api/v1alpha1/envoygateway_traces_types.go | 78 ----------- api/v1alpha1/envoyproxy_metric_types.go | 2 - api/v1alpha1/zz_generated.deepcopy.go | 88 +------------ internal/traces/register.go | 104 +-------------- internal/traces/register_test.go | 121 +----------------- site/content/en/latest/api/extension_types.md | 51 -------- 6 files changed, 8 insertions(+), 436 deletions(-) diff --git a/api/v1alpha1/envoygateway_traces_types.go b/api/v1alpha1/envoygateway_traces_types.go index 3e0c9d2e253..73d9b1ee68f 100644 --- a/api/v1alpha1/envoygateway_traces_types.go +++ b/api/v1alpha1/envoygateway_traces_types.go @@ -5,88 +5,10 @@ package v1alpha1 -import gwapiv1 "sigs.k8s.io/gateway-api/apis/v1" - // EnvoyGatewayTraces defines control plane tracing configurations. type EnvoyGatewayTraces struct { - // Sink defines the trace sink where traces are sent to. - Sink EnvoyGatewayTraceSink `json:"sink,omitempty"` // Disable disables the traces. // // +optional Disable bool `json:"disable,omitempty"` - // SamplingRate controls the rate at which traces are sampled. - // Defaults to 1.0 (100% sampling). Valid values are between 0.0 and 1.0. - // 0.0 means no sampling, 1.0 means all traces are sampled. - // - // +optional - // +kubebuilder:validation:Minimum=0.0 - // +kubebuilder:validation:Maximum=1.0 - SamplingRate *float64 `json:"samplingRate,omitempty"` - // BatchSpanProcessorConfig defines the configuration for the batch span processor. - // This processor batches spans before exporting them to the configured sink. - // - // +optional - BatchSpanProcessorConfig *BatchSpanProcessorConfig `json:"batchSpanProcessor,omitempty"` -} - -// BatchSpanProcessorConfig defines the configuration for the OpenTelemetry batch span processor. -// The batch span processor batches spans before sending them to the exporter. -type BatchSpanProcessorConfig struct { - // BatchTimeout is the maximum duration for constructing a batch. Spans are - // exported when either the batch is full or this timeout is reached. - // - // +optional - BatchTimeout *gwapiv1.Duration `json:"batchTimeout,omitempty"` - // MaxExportBatchSize is the maximum number of spans to export in a single batch. - // Default is 512. - // - // +optional - // +kubebuilder:validation:Minimum=1 - MaxExportBatchSize *int `json:"maxExportBatchSize,omitempty"` - // MaxQueueSize is the maximum queue size to buffer spans for delayed processing. - // If the queue gets full it drops the spans. Default is 2048. - // - // +optional - // +kubebuilder:validation:Minimum=1 - MaxQueueSize *int `json:"maxQueueSize,omitempty"` -} - -// EnvoyGatewayTraceSink defines control plane -// trace sinks where traces are sent to. -type EnvoyGatewayTraceSink struct { - // Type defines the trace sink type. - // EG control plane currently supports OpenTelemetry. - // +kubebuilder:validation:Enum=OpenTelemetry - // +kubebuilder:default=OpenTelemetry - Type TraceSinkType `json:"type"` // TODO: is this even needed? - // OpenTelemetry defines the configuration for OpenTelemetry sink. - // It's required if the sink type is OpenTelemetry. - OpenTelemetry *EnvoyGatewayOpenTelemetrySink `json:"openTelemetry,omitempty"` -} - -type EnvoyGatewayTracingSink struct { - // Host define the sink service hostname. - Host string `json:"host"` - // Protocol define the sink service protocol. - // +kubebuilder:validation:Enum=grpc;http - Protocol string `json:"protocol"` - // Port defines the port the sink service is exposed on. - // - // +optional - // +kubebuilder:validation:Minimum=0 - // +kubebuilder:default=4319 - Port int32 `json:"port,omitempty"` - // ExportInterval configures the intervening time between exports for a - // Sink. This option overrides any value set for the - // OTEL_METRIC_EXPORT_INTERVAL environment variable. - // If ExportInterval is less than or equal to zero, 60 seconds - // is used as the default. - ExportInterval *gwapiv1.Duration `json:"exportInterval,omitempty"` - // ExportTimeout configures the time a Sink waits for an export to - // complete before canceling it. This option overrides any value set for the - // OTEL_METRIC_EXPORT_TIMEOUT environment variable. - // If ExportTimeout is less than or equal to zero, 30 seconds - // is used as the default. - ExportTimeout *gwapiv1.Duration `json:"exportTimeout,omitempty"` } diff --git a/api/v1alpha1/envoyproxy_metric_types.go b/api/v1alpha1/envoyproxy_metric_types.go index c37a23f5ca3..1d32d4b3eb7 100644 --- a/api/v1alpha1/envoyproxy_metric_types.go +++ b/api/v1alpha1/envoyproxy_metric_types.go @@ -7,12 +7,10 @@ package v1alpha1 type ( MetricSinkType string - TraceSinkType string ) const ( MetricSinkTypeOpenTelemetry MetricSinkType = "OpenTelemetry" - TraceSinkTypeOpenTelemetry TraceSinkType = "OpenTelemetry" ) type ProxyMetrics struct { diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 1a3f214b13e..580d90e6d17 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -795,36 +795,6 @@ func (in *BasicAuth) DeepCopy() *BasicAuth { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *BatchSpanProcessorConfig) DeepCopyInto(out *BatchSpanProcessorConfig) { - *out = *in - if in.BatchTimeout != nil { - in, out := &in.BatchTimeout, &out.BatchTimeout - *out = new(v1.Duration) - **out = **in - } - if in.MaxExportBatchSize != nil { - in, out := &in.MaxExportBatchSize, &out.MaxExportBatchSize - *out = new(int) - **out = **in - } - if in.MaxQueueSize != nil { - in, out := &in.MaxQueueSize, &out.MaxQueueSize - *out = new(int) - **out = **in - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BatchSpanProcessorConfig. -func (in *BatchSpanProcessorConfig) DeepCopy() *BatchSpanProcessorConfig { - if in == nil { - return nil - } - out := new(BatchSpanProcessorConfig) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *BodyToExtAuth) DeepCopyInto(out *BodyToExtAuth) { *out = *in @@ -2334,7 +2304,7 @@ func (in *EnvoyGatewayTelemetry) DeepCopyInto(out *EnvoyGatewayTelemetry) { if in.Traces != nil { in, out := &in.Traces, &out.Traces *out = new(EnvoyGatewayTraces) - (*in).DeepCopyInto(*out) + **out = **in } } @@ -2368,40 +2338,9 @@ func (in *EnvoyGatewayTopologyInjector) DeepCopy() *EnvoyGatewayTopologyInjector return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *EnvoyGatewayTraceSink) DeepCopyInto(out *EnvoyGatewayTraceSink) { - *out = *in - if in.OpenTelemetry != nil { - in, out := &in.OpenTelemetry, &out.OpenTelemetry - *out = new(EnvoyGatewayOpenTelemetrySink) - (*in).DeepCopyInto(*out) - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayTraceSink. -func (in *EnvoyGatewayTraceSink) DeepCopy() *EnvoyGatewayTraceSink { - if in == nil { - return nil - } - out := new(EnvoyGatewayTraceSink) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *EnvoyGatewayTraces) DeepCopyInto(out *EnvoyGatewayTraces) { *out = *in - in.Sink.DeepCopyInto(&out.Sink) - if in.SamplingRate != nil { - in, out := &in.SamplingRate, &out.SamplingRate - *out = new(float64) - **out = **in - } - if in.BatchSpanProcessorConfig != nil { - in, out := &in.BatchSpanProcessorConfig, &out.BatchSpanProcessorConfig - *out = new(BatchSpanProcessorConfig) - (*in).DeepCopyInto(*out) - } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayTraces. @@ -2414,31 +2353,6 @@ func (in *EnvoyGatewayTraces) DeepCopy() *EnvoyGatewayTraces { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *EnvoyGatewayTracingSink) DeepCopyInto(out *EnvoyGatewayTracingSink) { - *out = *in - if in.ExportInterval != nil { - in, out := &in.ExportInterval, &out.ExportInterval - *out = new(v1.Duration) - **out = **in - } - if in.ExportTimeout != nil { - in, out := &in.ExportTimeout, &out.ExportTimeout - *out = new(v1.Duration) - **out = **in - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayTracingSink. -func (in *EnvoyGatewayTracingSink) DeepCopy() *EnvoyGatewayTracingSink { - if in == nil { - return nil - } - out := new(EnvoyGatewayTracingSink) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *EnvoyJSONPatchConfig) DeepCopyInto(out *EnvoyJSONPatchConfig) { *out = *in diff --git a/internal/traces/register.go b/internal/traces/register.go index 66bac49a046..e509221b14d 100644 --- a/internal/traces/register.go +++ b/internal/traces/register.go @@ -7,17 +7,13 @@ package traces import ( "context" - "fmt" "time" "go.opentelemetry.io/otel" - "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" - "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" "go.opentelemetry.io/otel/sdk/resource" "go.opentelemetry.io/otel/sdk/trace" semconv "go.opentelemetry.io/otel/semconv/v1.4.0" - egv1a1 "github.com/envoyproxy/gateway/api/v1alpha1" "github.com/envoyproxy/gateway/internal/envoygateway/config" ) @@ -37,115 +33,27 @@ func (r *Runner) Start(ctx context.Context) error { return nil } - tracesConfig := r.cfg.EnvoyGateway.GetEnvoyGatewayTelemetry().Traces - sinkConfig := tracesConfig.Sink - configObj := sinkConfig.OpenTelemetry - - endpoint := fmt.Sprintf("%s:%d", sinkConfig.OpenTelemetry.Host, sinkConfig.OpenTelemetry.Port) - // Create resource res, err := resource.New(ctx, resource.WithAttributes( semconv.ServiceNameKey.String("envoy-gateway"), ), ) + if err != nil { return err } - // Get sampler configuration - sampler := r.getSampler(tracesConfig) - - // Get batch span processor options - batchOptions := r.getBatchSpanProcessorOptions(tracesConfig) - - if configObj.Protocol == egv1a1.GRPCProtocol { - exporter, err := otlptracegrpc.New(ctx, - otlptracegrpc.WithEndpoint(endpoint), - otlptracegrpc.WithInsecure(), - ) - if err != nil { - return err - } - - bsp := trace.NewBatchSpanProcessor(exporter, batchOptions...) - tp := trace.NewTracerProvider( - trace.WithSpanProcessor(bsp), - trace.WithResource(res), - trace.WithSampler(sampler), - ) - - otel.SetTracerProvider(tp) - r.tp = tp - - return nil - } - - if configObj.Protocol == egv1a1.HTTPProtocol { - // Create OTLP HTTP exporter - exporter, err := otlptracehttp.New(ctx, - otlptracehttp.WithEndpoint(endpoint), - otlptracehttp.WithInsecure(), - ) - if err != nil { - return err - } - - bsp := trace.NewBatchSpanProcessor(exporter, batchOptions...) - tp := trace.NewTracerProvider( - trace.WithSpanProcessor(bsp), - trace.WithResource(res), - trace.WithSampler(sampler), - ) - - otel.SetTracerProvider(tp) - r.tp = tp + tp := trace.NewTracerProvider( + trace.WithResource(res), + ) - return nil - } + otel.SetTracerProvider(tp) + r.tp = tp return nil } -// getSampler returns the configured sampler or a default sampler -func (r *Runner) getSampler(tracesConfig *egv1a1.EnvoyGatewayTraces) trace.Sampler { - if tracesConfig.SamplingRate != nil { - return trace.TraceIDRatioBased(*tracesConfig.SamplingRate) - } - // Default to always sample (100%) - return trace.AlwaysSample() -} - -// getBatchSpanProcessorOptions returns the configured batch span processor options -func (r *Runner) getBatchSpanProcessorOptions(tracesConfig *egv1a1.EnvoyGatewayTraces) []trace.BatchSpanProcessorOption { - var options []trace.BatchSpanProcessorOption - - if tracesConfig.BatchSpanProcessorConfig != nil { - cfg := tracesConfig.BatchSpanProcessorConfig - - if cfg.BatchTimeout != nil { - timeout, err := time.ParseDuration(string(*cfg.BatchTimeout)) - if err == nil && timeout > 0 { - options = append(options, trace.WithBatchTimeout(timeout)) - } - } - - if cfg.MaxExportBatchSize != nil && *cfg.MaxExportBatchSize > 0 { - options = append(options, trace.WithMaxExportBatchSize(*cfg.MaxExportBatchSize)) - } - - if cfg.MaxQueueSize != nil && *cfg.MaxQueueSize > 0 { - options = append(options, trace.WithMaxQueueSize(*cfg.MaxQueueSize)) - } - } - - // If no options were configured, use defaults - // Default BatchTimeout is 5s, MaxExportBatchSize is 512, MaxQueueSize is 2048 - // These are the OpenTelemetry SDK defaults - - return options -} - func (r *Runner) Name() string { return "traces" } diff --git a/internal/traces/register_test.go b/internal/traces/register_test.go index 0052af4ae43..a1a8a409902 100644 --- a/internal/traces/register_test.go +++ b/internal/traces/register_test.go @@ -6,9 +6,7 @@ package traces import ( - "context" "testing" - "time" "github.com/stretchr/testify/require" @@ -20,18 +18,7 @@ func TestTracesRunner_New(t *testing.T) { cfg := &config.Server{ EnvoyGateway: &egv1a1.EnvoyGateway{ EnvoyGatewaySpec: egv1a1.EnvoyGatewaySpec{ - Telemetry: &egv1a1.EnvoyGatewayTelemetry{ - Traces: &egv1a1.EnvoyGatewayTraces{ - Sink: egv1a1.EnvoyGatewayTraceSink{ - Type: egv1a1.TraceSinkTypeOpenTelemetry, - OpenTelemetry: &egv1a1.EnvoyGatewayOpenTelemetrySink{ - Host: "localhost", - Port: 4317, - Protocol: egv1a1.GRPCProtocol, - }, - }, - }, - }, + Telemetry: &egv1a1.EnvoyGatewayTelemetry{}, }, }, } @@ -69,109 +56,3 @@ func TestTracesRunner_Close(t *testing.T) { }) } } - -func TestTracesRunner_Start_ValidConfiguration(t *testing.T) { - tests := []struct { - name string - protocol string - host string - port int32 - }{ - { - name: "grpc protocol configuration", - protocol: egv1a1.GRPCProtocol, - host: "localhost", - port: 4317, - }, - { - name: "http protocol configuration", - protocol: egv1a1.HTTPProtocol, - host: "localhost", - port: 4318, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - cfg := &config.Server{ - EnvoyGateway: &egv1a1.EnvoyGateway{ - EnvoyGatewaySpec: egv1a1.EnvoyGatewaySpec{ - Telemetry: &egv1a1.EnvoyGatewayTelemetry{ - Traces: &egv1a1.EnvoyGatewayTraces{ - Sink: egv1a1.EnvoyGatewayTraceSink{ - Type: egv1a1.TraceSinkTypeOpenTelemetry, - OpenTelemetry: &egv1a1.EnvoyGatewayOpenTelemetrySink{ - Host: tt.host, - Port: tt.port, - Protocol: tt.protocol, - }, - }, - }, - }, - }, - }, - } - - runner := New(cfg) - require.NotNil(t, runner) - - // Create a context with timeout - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - - // Start will create the exporter - err := runner.Start(ctx) - // We don't expect an error during initialization - require.NoError(t, err) - - // Clean up - _ = runner.Close() - }) - } -} - -func TestTracesRunner_Start_Configuration(t *testing.T) { - tests := []struct { - name string - protocol string - }{ - { - name: "grpc protocol configuration", - protocol: egv1a1.GRPCProtocol, - }, - { - name: "http protocol configuration", - protocol: egv1a1.HTTPProtocol, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - cfg := &config.Server{ - EnvoyGateway: &egv1a1.EnvoyGateway{ - EnvoyGatewaySpec: egv1a1.EnvoyGatewaySpec{ - Telemetry: &egv1a1.EnvoyGatewayTelemetry{ - Traces: &egv1a1.EnvoyGatewayTraces{ - Sink: egv1a1.EnvoyGatewayTraceSink{ - Type: egv1a1.TraceSinkTypeOpenTelemetry, - OpenTelemetry: &egv1a1.EnvoyGatewayOpenTelemetrySink{ - Host: "localhost", - Port: 4317, - Protocol: tt.protocol, - }, - }, - }, - }, - }, - }, - } - - runner := New(cfg) - require.NotNil(t, runner) - require.Equal(t, "traces", runner.Name()) - - // Note: We don't call Start() here because it requires a real OTLP endpoint - // This test just verifies the runner can be created with valid configuration - }) - } -} diff --git a/site/content/en/latest/api/extension_types.md b/site/content/en/latest/api/extension_types.md index e1b8f8da9b5..17f46b70967 100644 --- a/site/content/en/latest/api/extension_types.md +++ b/site/content/en/latest/api/extension_types.md @@ -564,23 +564,6 @@ _Appears in:_ | `forwardUsernameHeader` | _string_ | false | | This field specifies the header name to forward a successfully authenticated user to
the backend. The header will be added to the request with the username as the value.
If it is not specified, the username will not be forwarded. | -#### BatchSpanProcessorConfig - - - -BatchSpanProcessorConfig defines the configuration for the OpenTelemetry batch span processor. -The batch span processor batches spans before sending them to the exporter. - -_Appears in:_ -- [EnvoyGatewayTraces](#envoygatewaytraces) - -| Field | Type | Required | Default | Description | -| --- | --- | --- | --- | --- | -| `batchTimeout` | _[Duration](https://gateway-api.sigs.k8s.io/reference/1.4/spec/#duration)_ | false | | BatchTimeout is the maximum duration for constructing a batch. Spans are
exported when either the batch is full or this timeout is reached. | -| `maxExportBatchSize` | _integer_ | false | | MaxExportBatchSize is the maximum number of spans to export in a single batch.
Default is 512. | -| `maxQueueSize` | _integer_ | false | | MaxQueueSize is the maximum queue size to buffer spans for delayed processing.
If the queue gets full it drops the spans. Default is 2048. | - - #### BodyToExtAuth @@ -1495,7 +1478,6 @@ This is shared between metrics and traces. _Appears in:_ - [EnvoyGatewayMetricSink](#envoygatewaymetricsink) -- [EnvoyGatewayTraceSink](#envoygatewaytracesink) | Field | Type | Required | Default | Description | | --- | --- | --- | --- | --- | @@ -1606,22 +1588,6 @@ _Appears in:_ | `disabled` | _boolean_ | false | | | -#### EnvoyGatewayTraceSink - - - -EnvoyGatewayTraceSink defines control plane -trace sinks where traces are sent to. - -_Appears in:_ -- [EnvoyGatewayTraces](#envoygatewaytraces) - -| Field | Type | Required | Default | Description | -| --- | --- | --- | --- | --- | -| `type` | _[TraceSinkType](#tracesinktype)_ | true | OpenTelemetry | Type defines the trace sink type.
EG control plane currently supports OpenTelemetry. | -| `openTelemetry` | _[EnvoyGatewayOpenTelemetrySink](#envoygatewayopentelemetrysink)_ | true | | OpenTelemetry defines the configuration for OpenTelemetry sink.
It's required if the sink type is OpenTelemetry. | - - #### EnvoyGatewayTraces @@ -1633,12 +1599,7 @@ _Appears in:_ | Field | Type | Required | Default | Description | | --- | --- | --- | --- | --- | -| `sink` | _[EnvoyGatewayTraceSink](#envoygatewaytracesink)_ | true | | Sink defines the trace sink where traces are sent to. | | `disable` | _boolean_ | false | | Disable disables the traces. | -| `samplingRate` | _float_ | false | | SamplingRate controls the rate at which traces are sampled.
Defaults to 1.0 (100% sampling). Valid values are between 0.0 and 1.0.
0.0 means no sampling, 1.0 means all traces are sampled. | -| `batchSpanProcessor` | _[BatchSpanProcessorConfig](#batchspanprocessorconfig)_ | false | | BatchSpanProcessorConfig defines the configuration for the batch span processor.
This processor batches spans before exporting them to the configured sink. | - - #### EnvoyJSONPatchConfig @@ -5202,18 +5163,6 @@ _Appears in:_ | `http` | _[HTTPTimeout](#httptimeout)_ | false | | Timeout settings for HTTP. | -#### TraceSinkType - -_Underlying type:_ _string_ - - - -_Appears in:_ -- [EnvoyGatewayTraceSink](#envoygatewaytracesink) - -| Value | Description | -| ----- | ----------- | -| `OpenTelemetry` | | #### Tracing From 8ca0a2cca39431de3189d62f24fe08001ca15127 Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Tue, 4 Nov 2025 10:57:52 +0530 Subject: [PATCH 20/35] lint Signed-off-by: Shreemaan Abhishek --- internal/traces/register.go | 1 - internal/xds/runner/runner_test.go | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/internal/traces/register.go b/internal/traces/register.go index e509221b14d..e4ffa10b326 100644 --- a/internal/traces/register.go +++ b/internal/traces/register.go @@ -39,7 +39,6 @@ func (r *Runner) Start(ctx context.Context) error { semconv.ServiceNameKey.String("envoy-gateway"), ), ) - if err != nil { return err } diff --git a/internal/xds/runner/runner_test.go b/internal/xds/runner/runner_test.go index 4513968b3a0..a3afb438ee7 100644 --- a/internal/xds/runner/runner_test.go +++ b/internal/xds/runner/runner_test.go @@ -23,6 +23,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/tsaarni/certyaml" + "go.opentelemetry.io/otel/trace" "google.golang.org/grpc" "google.golang.org/grpc/credentials" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" @@ -34,7 +35,6 @@ import ( "github.com/envoyproxy/gateway/internal/ir" "github.com/envoyproxy/gateway/internal/message" "github.com/envoyproxy/gateway/internal/xds/bootstrap" - "go.opentelemetry.io/otel/trace" ) func newTestTraceContext() context.Context { From 1be27d38ab4922764e0d923271d3711ba39b625a Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Tue, 4 Nov 2025 11:11:11 +0530 Subject: [PATCH 21/35] testing cleanup Signed-off-by: Shreemaan Abhishek --- .../envoy-gateaway-config/traces-enabled.yaml | 35 --- test/config/helm/traces-enabled.yaml | 16 -- test/e2e/testdata/controlplane-tracing.yaml | 31 --- test/e2e/tests/controlplane_tracing.go | 133 ---------- test/utils/controlplane_tracing/tracing.go | 232 ------------------ 5 files changed, 447 deletions(-) delete mode 100644 test/config/envoy-gateaway-config/traces-enabled.yaml delete mode 100644 test/config/helm/traces-enabled.yaml delete mode 100644 test/e2e/testdata/controlplane-tracing.yaml delete mode 100644 test/e2e/tests/controlplane_tracing.go delete mode 100644 test/utils/controlplane_tracing/tracing.go diff --git a/test/config/envoy-gateaway-config/traces-enabled.yaml b/test/config/envoy-gateaway-config/traces-enabled.yaml deleted file mode 100644 index 15b0ab43b41..00000000000 --- a/test/config/envoy-gateaway-config/traces-enabled.yaml +++ /dev/null @@ -1,35 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: envoy-gateway-config - namespace: envoy-gateway-system -data: - envoy-gateway.yaml: | - apiVersion: gateway.envoyproxy.io/v1alpha1 - kind: EnvoyGateway - provider: - type: Kubernetes - gateway: - controllerName: gateway.envoyproxy.io/gatewayclass-controller - extensionApis: - enableEnvoyPatchPolicy: true - enableBackend: true - rateLimit: - backend: - type: Redis - redis: - url: redis.redis-system.svc.cluster.local:6379 - telemetry: - traces: - sink: - type: OpenTelemetry - openTelemetry: - host: "otel-collector.monitoring.svc.cluster.local" - port: 4317 - protocol: grpc - # Fast export settings for e2e tests to ensure traces are available immediately - samplingRate: 1.0 # 100% sampling for tests - batchSpanProcessor: - batchTimeout: 100ms # Export every 100ms instead of default 5s - maxExportBatchSize: 512 - maxQueueSize: 2048 diff --git a/test/config/helm/traces-enabled.yaml b/test/config/helm/traces-enabled.yaml deleted file mode 100644 index e6242694405..00000000000 --- a/test/config/helm/traces-enabled.yaml +++ /dev/null @@ -1,16 +0,0 @@ -config: - envoyGateway: - telemetry: - traces: - sink: - type: OpenTelemetry - openTelemetry: - host: "otel-collector.monitoring.svc.cluster.local" - port: 4317 - protocol: grpc - # Fast export settings for e2e tests to ensure traces are available immediately - samplingRate: 1.0 # 100% sampling for tests - batchSpanProcessor: - batchTimeout: 100ms # Export every 100ms instead of default 5s - maxExportBatchSize: 512 - maxQueueSize: 2048 diff --git a/test/e2e/testdata/controlplane-tracing.yaml b/test/e2e/testdata/controlplane-tracing.yaml deleted file mode 100644 index 20860756a08..00000000000 --- a/test/e2e/testdata/controlplane-tracing.yaml +++ /dev/null @@ -1,31 +0,0 @@ -apiVersion: gateway.networking.k8s.io/v1 -kind: Gateway -metadata: - name: controlplane-tracing-test - namespace: gateway-conformance-infra -spec: - gatewayClassName: envoy-gateway - listeners: - - name: http - port: 80 - protocol: HTTP - allowedRoutes: - namespaces: - from: All ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: controlplane-tracing-test - namespace: gateway-conformance-infra -spec: - parentRefs: - - name: controlplane-tracing-test - rules: - - matches: - - path: - type: PathPrefix - value: /test - backendRefs: - - name: infra-backend-v1 - port: 8080 diff --git a/test/e2e/tests/controlplane_tracing.go b/test/e2e/tests/controlplane_tracing.go deleted file mode 100644 index 98ecd421ebc..00000000000 --- a/test/e2e/tests/controlplane_tracing.go +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright Envoy Gateway Authors -// SPDX-License-Identifier: Apache-2.0 -// The full text of the Apache license is available in the LICENSE file at -// the root of the repo. - -//go:build e2e - -package tests - -import ( - "context" - "testing" - "time" - - "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/wait" - gwapiv1 "sigs.k8s.io/gateway-api/apis/v1" - httputils "sigs.k8s.io/gateway-api/conformance/utils/http" - "sigs.k8s.io/gateway-api/conformance/utils/kubernetes" - "sigs.k8s.io/gateway-api/conformance/utils/suite" - "sigs.k8s.io/gateway-api/conformance/utils/tlog" - - controlplanetracing "github.com/envoyproxy/gateway/test/utils/controlplane_tracing" -) - -func init() { - ConformanceTests = append(ConformanceTests, ControlPlaneTracingTest) -} - -var ControlPlaneTracingTest = suite.ConformanceTest{ - ShortName: "ControlPlaneTracing", - Description: "Verify that control plane traces are being generated and exported to OpenTelemetry collector", - Manifests: []string{"testdata/controlplane-tracing.yaml"}, - Test: func(t *testing.T, suite *suite.ConformanceTestSuite) { - t.Run("OpenTelemetry", func(t *testing.T) { - ns := "gateway-conformance-infra" - routeNN := types.NamespacedName{Name: "controlplane-tracing-test", Namespace: ns} - gwNN := types.NamespacedName{Name: "controlplane-tracing-test", Namespace: ns} - - // Wait for gateway and route to be accepted - // This will trigger control plane operations that should generate traces - gwAddr := kubernetes.GatewayAndRoutesMustBeAccepted( - t, - suite.Client, - suite.TimeoutConfig, - suite.ControllerName, - kubernetes.NewGatewayRef(gwNN), - &gwapiv1.HTTPRoute{}, - false, - routeNN, - ) - - // Make a test request to ensure the gateway is fully operational - expectedResponse := httputils.ExpectedResponse{ - Request: httputils.Request{ - Path: "/test", - }, - Response: httputils.Response{ - StatusCodes: []int{200}, - }, - Namespace: ns, - } - httputils.MakeRequestAndExpectEventuallyConsistentResponse( - t, - suite.RoundTripper, - suite.TimeoutConfig, - gwAddr, - expectedResponse, - ) - - // Wait for traces to be exported and verify they exist - // Control plane traces should have service.name=envoy-gateway - tlog.Logf(t, "waiting for control plane traces to be exported...") - if err := wait.PollUntilContextTimeout( - context.TODO(), - 2*time.Second, - 2*time.Minute, - true, - func(ctx context.Context) (bool, error) { - // Query Tempo for control plane traces - traceCount, err := controlplanetracing.QueryControlPlaneTraces(t, suite.Client, "envoy-gateway") - if err != nil { - tlog.Logf(t, "failed to query traces from tempo: %v", err) - return false, nil - } - - tlog.Logf(t, "found %d control plane traces", traceCount) - - // We expect at least some traces from the gateway operations - if traceCount > 0 { - return true, nil - } - - return false, nil - }, - ); err != nil { - t.Errorf("failed to find control plane traces in tempo: %v", err) - } - - // Verify specific span names exist - // These span names are created by the instrumented code in the gateway - tlog.Logf(t, "verifying expected span names exist...") - expectedSpanNames := []string{ - "GatewayApiRunner.subscribeAndTranslate", - "XdsRunner.subscribeAndTranslate", - } - - if err := wait.PollUntilContextTimeout( - context.TODO(), - 2*time.Second, - 2*time.Minute, - true, - func(ctx context.Context) (bool, error) { - hasExpectedSpans, err := controlplanetracing.VerifyExpectedSpans( - t, - suite.Client, - "envoy-gateway", - expectedSpanNames, - ) - if err != nil { - tlog.Logf(t, "failed to verify expected spans: %v", err) - return false, nil - } - return hasExpectedSpans, nil - }, - ); err != nil { - t.Errorf("failed to find expected span names: %v", err) - } - - tlog.Logf(t, "control plane tracing test completed successfully") - }) - }, -} diff --git a/test/utils/controlplane_tracing/tracing.go b/test/utils/controlplane_tracing/tracing.go deleted file mode 100644 index 0a4ca68071a..00000000000 --- a/test/utils/controlplane_tracing/tracing.go +++ /dev/null @@ -1,232 +0,0 @@ -// Copyright Envoy Gateway Authors -// SPDX-License-Identifier: Apache-2.0 -// The full text of the Apache license is available in the LICENSE file at -// the root of the repo. - -package controlplanetracing - -import ( - "context" - "encoding/json" - "fmt" - "io" - "net" - "net/http" - "net/url" - "testing" - "time" - - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/gateway-api/conformance/utils/tlog" -) - -// getTempoHost returns the Tempo host address -// It tries to use the LoadBalancer IP first, then falls back to localhost (port-forward) -func getTempoHost(t *testing.T, c client.Client) (string, error) { - // Verify Tempo service exists - svc := corev1.Service{} - if err := c.Get(context.Background(), types.NamespacedName{ - Namespace: "monitoring", - Name: "tempo", - }, &svc); err != nil { - // Fall back to eg-addons prefix if that fails - if err := c.Get(context.Background(), types.NamespacedName{ - Namespace: "monitoring", - Name: "eg-addons-tempo", - }, &svc); err != nil { - return "", fmt.Errorf("failed to get tempo service: %w", err) - } - } - - // Try to use LoadBalancer IP if available (more reliable than port-forward) - if svc.Spec.Type == corev1.ServiceTypeLoadBalancer { - if len(svc.Status.LoadBalancer.Ingress) > 0 { - if svc.Status.LoadBalancer.Ingress[0].IP != "" { - host := svc.Status.LoadBalancer.Ingress[0].IP - tlog.Logf(t, "using Tempo at %s:3100 (via LoadBalancer)", host) - return host, nil - } - } - } - - return "", fmt.Errorf("tempo loadbalancer IP not found") -} - -// QueryControlPlaneTraces queries Tempo for control plane traces with the given service name -func QueryControlPlaneTraces(t *testing.T, c client.Client, serviceName string) (int, error) { - host, err := getTempoHost(t, c) - if err != nil { - return -1, err - } - - tempoURL := url.URL{ - Scheme: "http", - Host: net.JoinHostPort(host, "3100"), - Path: "/api/search", - } - query := tempoURL.Query() - query.Add("start", fmt.Sprintf("%d", time.Now().Add(-10*time.Minute).Unix())) - query.Add("end", fmt.Sprintf("%d", time.Now().Unix())) - query.Add("tags", fmt.Sprintf("service.name=%s", serviceName)) - tempoURL.RawQuery = query.Encode() - - req, err := http.NewRequest("GET", tempoURL.String(), nil) - if err != nil { - return -1, err - } - - tlog.Logf(t, "querying tempo: %s", tempoURL.String()) - res, err := http.DefaultClient.Do(req) - if err != nil { - return -1, err - } - defer res.Body.Close() - - if res.StatusCode != http.StatusOK { - return -1, fmt.Errorf("tempo returned status %s", res.Status) - } - - resp := &TempoResponse{} - data, err := io.ReadAll(res.Body) - if err != nil { - return -1, err - } - if err := json.Unmarshal(data, &resp); err != nil { - tlog.Logf(t, "failed to unmarshal response: %s", string(data)) - return -1, err - } - - total := len(resp.Traces) - tlog.Logf(t, "found %d traces from tempo for service %s", total, serviceName) - return total, nil -} - -// VerifyExpectedSpans checks that expected span names exist in the traces -// Note: Tempo's search API doesn't index child span names, so we need to fetch -// traces and inspect them directly -func VerifyExpectedSpans(t *testing.T, c client.Client, serviceName string, expectedSpanNames []string) (bool, error) { - host, err := getTempoHost(t, c) - if err != nil { - return false, err - } - - // First, get all traces for the service - searchURL := url.URL{ - Scheme: "http", - Host: net.JoinHostPort(host, "3100"), - Path: "/api/search", - } - query := searchURL.Query() - query.Add("start", fmt.Sprintf("%d", time.Now().Add(-10*time.Minute).Unix())) - query.Add("end", fmt.Sprintf("%d", time.Now().Unix())) - query.Add("tags", fmt.Sprintf("service.name=%s", serviceName)) - searchURL.RawQuery = query.Encode() - - req, err := http.NewRequest("GET", searchURL.String(), nil) - if err != nil { - return false, err - } - - res, err := http.DefaultClient.Do(req) - if err != nil { - return false, err - } - defer res.Body.Close() - - if res.StatusCode != http.StatusOK { - return false, fmt.Errorf("tempo search returned status %s", res.Status) - } - - searchResp := &TempoResponse{} - data, err := io.ReadAll(res.Body) - if err != nil { - return false, err - } - if err := json.Unmarshal(data, &searchResp); err != nil { - return false, err - } - - if len(searchResp.Traces) == 0 { - tlog.Logf(t, "no traces found for service %s", serviceName) - return false, nil - } - - // Now fetch each trace and check for the expected span names - foundSpans := make(map[string]bool) - for _, trace := range searchResp.Traces { - traceID := trace["traceID"].(string) - traceURL := url.URL{ - Scheme: "http", - Host: net.JoinHostPort(host, "3100"), - Path: fmt.Sprintf("/api/traces/%s", traceID), - } - - req, err := http.NewRequest("GET", traceURL.String(), nil) - if err != nil { - continue - } - - res, err := http.DefaultClient.Do(req) - if err != nil { - continue - } - - if res.StatusCode != http.StatusOK { - res.Body.Close() - continue - } - - traceData, err := io.ReadAll(res.Body) - res.Body.Close() - if err != nil { - continue - } - - // Parse the trace to find span names - var traceResp map[string]interface{} - if err := json.Unmarshal(traceData, &traceResp); err != nil { - continue - } - - // Extract span names from the trace - if batches, ok := traceResp["batches"].([]interface{}); ok { - for _, batch := range batches { - if batchMap, ok := batch.(map[string]interface{}); ok { - if scopeSpans, ok := batchMap["scopeSpans"].([]interface{}); ok { - for _, scopeSpan := range scopeSpans { - if scopeSpanMap, ok := scopeSpan.(map[string]interface{}); ok { - if spans, ok := scopeSpanMap["spans"].([]interface{}); ok { - for _, span := range spans { - if spanMap, ok := span.(map[string]interface{}); ok { - if name, ok := spanMap["name"].(string); ok { - foundSpans[name] = true - } - } - } - } - } - } - } - } - } - } - } - - // Check if all expected spans were found - for _, expectedSpan := range expectedSpanNames { - if !foundSpans[expectedSpan] { - tlog.Logf(t, "span '%s' not found yet", expectedSpan) - return false, nil - } - tlog.Logf(t, "found span '%s' in traces", expectedSpan) - } - - return true, nil -} - -// TempoResponse represents the response from Tempo's search API -type TempoResponse struct { - Traces []map[string]interface{} `json:"traces,omitempty"` -} From 37107cd7a2f6a46cc1578d1556da0b7a3aa48e26 Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Tue, 4 Nov 2025 11:11:58 +0530 Subject: [PATCH 22/35] lint Signed-off-by: Shreemaan Abhishek --- site/content/en/latest/api/extension_types.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/site/content/en/latest/api/extension_types.md b/site/content/en/latest/api/extension_types.md index 17f46b70967..0c42720d675 100644 --- a/site/content/en/latest/api/extension_types.md +++ b/site/content/en/latest/api/extension_types.md @@ -5163,8 +5163,6 @@ _Appears in:_ | `http` | _[HTTPTimeout](#httptimeout)_ | false | | Timeout settings for HTTP. | - - #### Tracing From f0ab998ab92c9e0d53e1df4824ab118b00c779ba Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Tue, 4 Nov 2025 22:20:35 +0530 Subject: [PATCH 23/35] snapshot cache Signed-off-by: Shreemaan Abhishek --- internal/xds/cache/snapshotcache.go | 32 ++++++++++++----------------- internal/xds/runner/runner.go | 4 ++-- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/internal/xds/cache/snapshotcache.go b/internal/xds/cache/snapshotcache.go index 33c4cad18b6..afbfa8866fb 100644 --- a/internal/xds/cache/snapshotcache.go +++ b/internal/xds/cache/snapshotcache.go @@ -16,7 +16,6 @@ package cache import ( "context" "fmt" - "math" "strconv" "sync" "time" @@ -25,6 +24,8 @@ import ( discoveryv3 "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3" cachev3 "github.com/envoyproxy/go-control-plane/pkg/cache/v3" serverv3 "github.com/envoyproxy/go-control-plane/pkg/server/v3" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/trace" "go.uber.org/zap" "github.com/envoyproxy/gateway/internal/logging" @@ -32,7 +33,10 @@ import ( "github.com/envoyproxy/gateway/internal/xds/types" ) -var Hash = cachev3.IDHash{} +var ( + Hash = cachev3.IDHash{} + tracer = otel.Tracer("envoy-gateway/gateway-api") +) // SnapshotCacheWithCallbacks uses the go-control-plane SimpleCache to store snapshots of // Envoy resources, sliced by Node ID so that we can do incremental xDS properly. @@ -46,7 +50,7 @@ var Hash = cachev3.IDHash{} type SnapshotCacheWithCallbacks interface { cachev3.SnapshotCache serverv3.Callbacks - GenerateNewSnapshot(string, types.XdsResources) error + GenerateNewSnapshot(string, types.XdsResources, context.Context) error SnapshotHasIrKey(string) bool GetIrKeys() []string } @@ -65,7 +69,6 @@ type snapshotCache struct { nodeFrequency nodeFrequencyMap streamDuration streamDurationMap deltaStreamDuration streamDurationMap - snapshotVersion int64 lastSnapshot snapshotMap log *zap.SugaredLogger mu sync.Mutex @@ -73,11 +76,15 @@ type snapshotCache struct { // GenerateNewSnapshot takes a table of resources (the output from the IR->xDS // translator) and updates the snapshot version. -func (s *snapshotCache) GenerateNewSnapshot(irKey string, resources types.XdsResources) error { +func (s *snapshotCache) GenerateNewSnapshot(irKey string, resources types.XdsResources, ctx context.Context) error { s.mu.Lock() defer s.mu.Unlock() - version := s.newSnapshotVersion() + _, span := tracer.Start(ctx, "SnapshotCache.GenerateNewSnapshot") + defer span.End() + + sc := trace.SpanContextFromContext(ctx) + version := sc.TraceID().String() // Create a snapshot with all xDS resources. snapshot, err := cachev3.NewSnapshot( @@ -112,19 +119,6 @@ func (s *snapshotCache) GenerateNewSnapshot(irKey string, resources types.XdsRes return nil } -// newSnapshotVersion increments the current snapshotVersion -// and returns as a string. -func (s *snapshotCache) newSnapshotVersion() string { - // Reset the snapshotVersion if it ever hits max size. - if s.snapshotVersion == math.MaxInt64 { - s.snapshotVersion = 0 - } - - // Increment the snapshot version & return as string. - s.snapshotVersion++ - return strconv.FormatInt(s.snapshotVersion, 10) -} - // NewSnapshotCache gives you a fresh SnapshotCache. // It needs a logger that supports the go-control-plane // required interface (Debugf, Infof, Warnf, and Errorf). diff --git a/internal/xds/runner/runner.go b/internal/xds/runner/runner.go index 606d01e6f3f..40a73121a50 100644 --- a/internal/xds/runner/runner.go +++ b/internal/xds/runner/runner.go @@ -281,7 +281,7 @@ func (r *Runner) translateFromSubscription(sub <-chan watchable.Snapshot[string, ) if update.Delete { - if err := r.cache.GenerateNewSnapshot(key, nil); err != nil { + if err := r.cache.GenerateNewSnapshot(key, nil, parentCtx); err != nil { traceLogger.Error(err, "failed to delete the snapshot") errChan <- err } @@ -362,7 +362,7 @@ func (r *Runner) translateFromSubscription(sub <-chan watchable.Snapshot[string, errChan <- err } else { // Update snapshot cache - if err := r.cache.GenerateNewSnapshot(key, result.XdsResources); err != nil { + if err := r.cache.GenerateNewSnapshot(key, result.XdsResources, parentCtx); err != nil { traceLogger.Error(err, "failed to generate a snapshot") errChan <- err } From 8ee02257478a090fcecabb660aefc5eeb1113022 Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Tue, 4 Nov 2025 22:41:06 +0530 Subject: [PATCH 24/35] rename Signed-off-by: Shreemaan Abhishek --- api/v1alpha1/envoygateway_helpers.go | 6 +++--- api/v1alpha1/envoygateway_traces_types.go | 4 ++-- api/v1alpha1/envoygateway_types.go | 2 +- api/v1alpha1/zz_generated.deepcopy.go | 10 +++++----- site/content/en/latest/api/extension_types.md | 6 +++--- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/api/v1alpha1/envoygateway_helpers.go b/api/v1alpha1/envoygateway_helpers.go index c77277f02a8..3c9ac59dbb5 100644 --- a/api/v1alpha1/envoygateway_helpers.go +++ b/api/v1alpha1/envoygateway_helpers.go @@ -220,7 +220,7 @@ func (e *EnvoyGateway) DisableTraces() bool { func DefaultEnvoyGatewayTelemetry() *EnvoyGatewayTelemetry { return &EnvoyGatewayTelemetry{ Metrics: DefaultEnvoyGatewayMetrics(), - Traces: DefaultEnvoyGatewayTraces(), + Traces: DefaultEnvoyGatewayTracing(), } } @@ -231,8 +231,8 @@ func DefaultEnvoyGatewayMetrics() *EnvoyGatewayMetrics { } } -func DefaultEnvoyGatewayTraces() *EnvoyGatewayTraces { - return &EnvoyGatewayTraces{ +func DefaultEnvoyGatewayTracing() *EnvoyGatewayTracing { + return &EnvoyGatewayTracing{ Disable: true, } } diff --git a/api/v1alpha1/envoygateway_traces_types.go b/api/v1alpha1/envoygateway_traces_types.go index 73d9b1ee68f..8a4eb7805f6 100644 --- a/api/v1alpha1/envoygateway_traces_types.go +++ b/api/v1alpha1/envoygateway_traces_types.go @@ -5,8 +5,8 @@ package v1alpha1 -// EnvoyGatewayTraces defines control plane tracing configurations. -type EnvoyGatewayTraces struct { +// EnvoyGatewayTracing defines control plane tracing configurations. +type EnvoyGatewayTracing struct { // Disable disables the traces. // // +optional diff --git a/api/v1alpha1/envoygateway_types.go b/api/v1alpha1/envoygateway_types.go index 12272c645a4..51333f77f02 100644 --- a/api/v1alpha1/envoygateway_types.go +++ b/api/v1alpha1/envoygateway_types.go @@ -189,7 +189,7 @@ type LeaderElection struct { type EnvoyGatewayTelemetry struct { // Metrics defines metrics configuration for envoy gateway. Metrics *EnvoyGatewayMetrics `json:"metrics,omitempty"` - Traces *EnvoyGatewayTraces `json:"traces,omitempty"` + Traces *EnvoyGatewayTracing `json:"traces,omitempty"` } // EnvoyGatewayLogging defines logging for Envoy Gateway. diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 580d90e6d17..0fa73521f79 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -2303,7 +2303,7 @@ func (in *EnvoyGatewayTelemetry) DeepCopyInto(out *EnvoyGatewayTelemetry) { } if in.Traces != nil { in, out := &in.Traces, &out.Traces - *out = new(EnvoyGatewayTraces) + *out = new(EnvoyGatewayTracing) **out = **in } } @@ -2339,16 +2339,16 @@ func (in *EnvoyGatewayTopologyInjector) DeepCopy() *EnvoyGatewayTopologyInjector } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *EnvoyGatewayTraces) DeepCopyInto(out *EnvoyGatewayTraces) { +func (in *EnvoyGatewayTracing) DeepCopyInto(out *EnvoyGatewayTracing) { *out = *in } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayTraces. -func (in *EnvoyGatewayTraces) DeepCopy() *EnvoyGatewayTraces { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayTracing. +func (in *EnvoyGatewayTracing) DeepCopy() *EnvoyGatewayTracing { if in == nil { return nil } - out := new(EnvoyGatewayTraces) + out := new(EnvoyGatewayTracing) in.DeepCopyInto(out) return out } diff --git a/site/content/en/latest/api/extension_types.md b/site/content/en/latest/api/extension_types.md index 0c42720d675..9a6279cd15b 100644 --- a/site/content/en/latest/api/extension_types.md +++ b/site/content/en/latest/api/extension_types.md @@ -1571,7 +1571,7 @@ _Appears in:_ | Field | Type | Required | Default | Description | | --- | --- | --- | --- | --- | | `metrics` | _[EnvoyGatewayMetrics](#envoygatewaymetrics)_ | true | | Metrics defines metrics configuration for envoy gateway. | -| `traces` | _[EnvoyGatewayTraces](#envoygatewaytraces)_ | true | | | +| `traces` | _[EnvoyGatewayTracing](#envoygatewaytracing)_ | true | | | #### EnvoyGatewayTopologyInjector @@ -1588,11 +1588,11 @@ _Appears in:_ | `disabled` | _boolean_ | false | | | -#### EnvoyGatewayTraces +#### EnvoyGatewayTracing -EnvoyGatewayTraces defines control plane tracing configurations. +EnvoyGatewayTracing defines control plane tracing configurations. _Appears in:_ - [EnvoyGatewayTelemetry](#envoygatewaytelemetry) From 3728b823225b9ddb230ed4d8665e469a12cf4190 Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Fri, 7 Nov 2025 11:36:46 +0530 Subject: [PATCH 25/35] fix Signed-off-by: Shreemaan Abhishek --- internal/xds/runner/runner.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/xds/runner/runner.go b/internal/xds/runner/runner.go index f387294c7bc..bf60b0e4539 100644 --- a/internal/xds/runner/runner.go +++ b/internal/xds/runner/runner.go @@ -337,7 +337,7 @@ func (r *Runner) translateFromSubscription(sub <-chan watchable.Snapshot[string, errChan <- err } else { // Update snapshot cache - if err := r.cache.GenerateNewSnapshot(key, result.XdsResources); err != nil { + if err := r.cache.GenerateNewSnapshot(key, result.XdsResources, parentCtx); err != nil { r.Logger.Error(err, "failed to generate a snapshot") errChan <- err } From d4b7eb4d1339816437d4894dfbb028f5cb31a3fe Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Sat, 15 Nov 2025 22:55:33 +0530 Subject: [PATCH 26/35] revert api Signed-off-by: Shreemaan Abhishek --- api/v1alpha1/envoygateway_helpers.go | 14 +------- api/v1alpha1/envoygateway_metrics_types.go | 2 -- api/v1alpha1/envoygateway_types.go | 15 ++++++++- api/v1alpha1/envoyproxy_metric_types.go | 4 +-- api/v1alpha1/zz_generated.deepcopy.go | 10 +++--- internal/traces/register.go | 4 --- site/content/en/latest/api/extension_types.md | 32 ++++++++++--------- 7 files changed, 38 insertions(+), 43 deletions(-) diff --git a/api/v1alpha1/envoygateway_helpers.go b/api/v1alpha1/envoygateway_helpers.go index 3c9ac59dbb5..9722656b5cc 100644 --- a/api/v1alpha1/envoygateway_helpers.go +++ b/api/v1alpha1/envoygateway_helpers.go @@ -206,21 +206,15 @@ func (e *EnvoyGateway) GetEnvoyGatewayTelemetry() *EnvoyGatewayTelemetry { return e.Telemetry } -// DisablePrometheus returns true if prometheus is disabled. +// DisablePrometheus returns if disable prometheus. func (e *EnvoyGateway) DisablePrometheus() bool { return e.GetEnvoyGatewayTelemetry().Metrics.Prometheus.Disable } -// DisableTraces returns true if tracing is disabled. -func (e *EnvoyGateway) DisableTraces() bool { - return e.GetEnvoyGatewayTelemetry().Traces.Disable -} - // DefaultEnvoyGatewayTelemetry returns a new EnvoyGatewayTelemetry with default configuration parameters. func DefaultEnvoyGatewayTelemetry() *EnvoyGatewayTelemetry { return &EnvoyGatewayTelemetry{ Metrics: DefaultEnvoyGatewayMetrics(), - Traces: DefaultEnvoyGatewayTracing(), } } @@ -231,12 +225,6 @@ func DefaultEnvoyGatewayMetrics() *EnvoyGatewayMetrics { } } -func DefaultEnvoyGatewayTracing() *EnvoyGatewayTracing { - return &EnvoyGatewayTracing{ - Disable: true, - } -} - // DefaultEnvoyGatewayPrometheus returns a new EnvoyGatewayMetrics with default configuration parameters. func DefaultEnvoyGatewayPrometheus() *EnvoyGatewayPrometheusProvider { return &EnvoyGatewayPrometheusProvider{ diff --git a/api/v1alpha1/envoygateway_metrics_types.go b/api/v1alpha1/envoygateway_metrics_types.go index fe3be93a6e8..62aeec39519 100644 --- a/api/v1alpha1/envoygateway_metrics_types.go +++ b/api/v1alpha1/envoygateway_metrics_types.go @@ -28,8 +28,6 @@ type EnvoyGatewayMetricSink struct { OpenTelemetry *EnvoyGatewayOpenTelemetrySink `json:"openTelemetry,omitempty"` } -// EnvoyGatewayOpenTelemetrySink defines the configuration for OpenTelemetry sink. -// This is shared between metrics and traces. type EnvoyGatewayOpenTelemetrySink struct { // Host define the sink service hostname. Host string `json:"host"` diff --git a/api/v1alpha1/envoygateway_types.go b/api/v1alpha1/envoygateway_types.go index 51333f77f02..5c7bf7eacba 100644 --- a/api/v1alpha1/envoygateway_types.go +++ b/api/v1alpha1/envoygateway_types.go @@ -189,7 +189,6 @@ type LeaderElection struct { type EnvoyGatewayTelemetry struct { // Metrics defines metrics configuration for envoy gateway. Metrics *EnvoyGatewayMetrics `json:"metrics,omitempty"` - Traces *EnvoyGatewayTracing `json:"traces,omitempty"` } // EnvoyGatewayLogging defines logging for Envoy Gateway. @@ -200,8 +199,22 @@ type EnvoyGatewayLogging struct { // // +kubebuilder:default={default: info} Level map[EnvoyGatewayLogComponent]LogLevel `json:"level,omitempty"` + // Encoder defines the log encoder format. + // If unspecified, defaults to "Text". + // + // +optional + Encoder *EnvoyGatewayLogEncoder `json:"encoder,omitempty"` } +type EnvoyGatewayLogEncoder string + +const ( + // EnvoyGatewayLogEncoderText defines the "Text" log encoder. + EnvoyGatewayLogEncoderText EnvoyGatewayLogEncoder = "Text" + // EnvoyGatewayLogEncoderJSON defines the "JSON" log encoder. + EnvoyGatewayLogEncoderJSON EnvoyGatewayLogEncoder = "JSON" +) + // EnvoyGatewayLogComponent defines a component that supports a configured logging level. // +kubebuilder:validation:Enum=default;provider;gateway-api;xds-translator;xds-server;xds;infrastructure;global-ratelimit type EnvoyGatewayLogComponent string diff --git a/api/v1alpha1/envoyproxy_metric_types.go b/api/v1alpha1/envoyproxy_metric_types.go index 1d32d4b3eb7..320b7436caa 100644 --- a/api/v1alpha1/envoyproxy_metric_types.go +++ b/api/v1alpha1/envoyproxy_metric_types.go @@ -5,9 +5,7 @@ package v1alpha1 -type ( - MetricSinkType string -) +type MetricSinkType string const ( MetricSinkTypeOpenTelemetry MetricSinkType = "OpenTelemetry" diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 657d7392572..4d4db684186 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -2084,6 +2084,11 @@ func (in *EnvoyGatewayLogging) DeepCopyInto(out *EnvoyGatewayLogging) { (*out)[key] = val } } + if in.Encoder != nil { + in, out := &in.Encoder, &out.Encoder + *out = new(EnvoyGatewayLogEncoder) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayLogging. @@ -2301,11 +2306,6 @@ func (in *EnvoyGatewayTelemetry) DeepCopyInto(out *EnvoyGatewayTelemetry) { *out = new(EnvoyGatewayMetrics) (*in).DeepCopyInto(*out) } - if in.Traces != nil { - in, out := &in.Traces, &out.Traces - *out = new(EnvoyGatewayTracing) - **out = **in - } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayTelemetry. diff --git a/internal/traces/register.go b/internal/traces/register.go index e4ffa10b326..bbce812406d 100644 --- a/internal/traces/register.go +++ b/internal/traces/register.go @@ -29,10 +29,6 @@ func New(cfg *config.Server) *Runner { } func (r *Runner) Start(ctx context.Context) error { - if r.cfg.EnvoyGateway.DisableTraces() { - return nil - } - // Create resource res, err := resource.New(ctx, resource.WithAttributes( diff --git a/site/content/en/latest/api/extension_types.md b/site/content/en/latest/api/extension_types.md index 3993d25d335..20f1f1d02c9 100644 --- a/site/content/en/latest/api/extension_types.md +++ b/site/content/en/latest/api/extension_types.md @@ -1423,6 +1423,21 @@ _Appears in:_ | `global-ratelimit` | LogComponentGlobalRateLimitRunner defines the "global-ratelimit" runner component.
| +#### EnvoyGatewayLogEncoder + +_Underlying type:_ _string_ + + + +_Appears in:_ +- [EnvoyGatewayLogging](#envoygatewaylogging) + +| Value | Description | +| ----- | ----------- | +| `Text` | EnvoyGatewayLogEncoderText defines the "Text" log encoder.
| +| `JSON` | EnvoyGatewayLogEncoderJSON defines the "JSON" log encoder.
| + + #### EnvoyGatewayLogging @@ -1436,6 +1451,7 @@ _Appears in:_ | Field | Type | Required | Default | Description | | --- | --- | --- | --- | --- | | `level` | _object (keys:[EnvoyGatewayLogComponent](#envoygatewaylogcomponent), values:[LogLevel](#loglevel))_ | true | \{ default:info \} | Level is the logging level. If unspecified, defaults to "info".
EnvoyGatewayLogComponent options: default/provider/gateway-api/xds-translator/xds-server/infrastructure/global-ratelimit.
LogLevel options: debug/info/error/warn. | +| `encoder` | _[EnvoyGatewayLogEncoder](#envoygatewaylogencoder)_ | false | | Encoder defines the log encoder format.
If unspecified, defaults to "Text". | #### EnvoyGatewayMetricSink @@ -1473,8 +1489,7 @@ _Appears in:_ -EnvoyGatewayOpenTelemetrySink defines the configuration for OpenTelemetry sink. -This is shared between metrics and traces. + _Appears in:_ - [EnvoyGatewayMetricSink](#envoygatewaymetricsink) @@ -1571,7 +1586,6 @@ _Appears in:_ | Field | Type | Required | Default | Description | | --- | --- | --- | --- | --- | | `metrics` | _[EnvoyGatewayMetrics](#envoygatewaymetrics)_ | true | | Metrics defines metrics configuration for envoy gateway. | -| `traces` | _[EnvoyGatewayTracing](#envoygatewaytracing)_ | true | | | #### EnvoyGatewayTopologyInjector @@ -1588,18 +1602,6 @@ _Appears in:_ | `disabled` | _boolean_ | false | | | -#### EnvoyGatewayTracing - - - -EnvoyGatewayTracing defines control plane tracing configurations. - -_Appears in:_ -- [EnvoyGatewayTelemetry](#envoygatewaytelemetry) - -| Field | Type | Required | Default | Description | -| --- | --- | --- | --- | --- | -| `disable` | _boolean_ | false | | Disable disables the traces. | #### EnvoyJSONPatchConfig From c4b00c13436e0e443c0af943667a6b3dbd709bff Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Sat, 15 Nov 2025 23:00:14 +0530 Subject: [PATCH 27/35] rm type Signed-off-by: Shreemaan Abhishek --- api/v1alpha1/envoygateway_traces_types.go | 14 -------------- api/v1alpha1/zz_generated.deepcopy.go | 15 --------------- site/content/en/latest/api/extension_types.md | 2 -- 3 files changed, 31 deletions(-) delete mode 100644 api/v1alpha1/envoygateway_traces_types.go diff --git a/api/v1alpha1/envoygateway_traces_types.go b/api/v1alpha1/envoygateway_traces_types.go deleted file mode 100644 index 8a4eb7805f6..00000000000 --- a/api/v1alpha1/envoygateway_traces_types.go +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright Envoy Gateway Authors -// SPDX-License-Identifier: Apache-2.0 -// The full text of the Apache license is available in the LICENSE file at -// the root of the repo. - -package v1alpha1 - -// EnvoyGatewayTracing defines control plane tracing configurations. -type EnvoyGatewayTracing struct { - // Disable disables the traces. - // - // +optional - Disable bool `json:"disable,omitempty"` -} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 4d4db684186..2dd26de2064 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -2338,21 +2338,6 @@ func (in *EnvoyGatewayTopologyInjector) DeepCopy() *EnvoyGatewayTopologyInjector return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *EnvoyGatewayTracing) DeepCopyInto(out *EnvoyGatewayTracing) { - *out = *in -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayTracing. -func (in *EnvoyGatewayTracing) DeepCopy() *EnvoyGatewayTracing { - if in == nil { - return nil - } - out := new(EnvoyGatewayTracing) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *EnvoyJSONPatchConfig) DeepCopyInto(out *EnvoyJSONPatchConfig) { *out = *in diff --git a/site/content/en/latest/api/extension_types.md b/site/content/en/latest/api/extension_types.md index 20f1f1d02c9..1d1086fb7dc 100644 --- a/site/content/en/latest/api/extension_types.md +++ b/site/content/en/latest/api/extension_types.md @@ -1602,8 +1602,6 @@ _Appears in:_ | `disabled` | _boolean_ | false | | | - - #### EnvoyJSONPatchConfig From 93f2acee68bc895d819d10711746213761a7ade0 Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Fri, 21 Nov 2025 10:44:27 +0530 Subject: [PATCH 28/35] more spans and cleanups Signed-off-by: Shreemaan Abhishek --- internal/cmd/egctl/translate.go | 10 ++++---- internal/gatewayapi/runner/runner.go | 9 +++++-- internal/gatewayapi/translator.go | 28 +++++++++++++++++++--- internal/gatewayapi/translator_test.go | 4 ++-- internal/globalratelimit/runner/runner.go | 22 ++++++++++++++++- internal/xds/cache/snapshotcache.go | 2 +- internal/xds/runner/runner.go | 4 ++-- internal/xds/translator/translator.go | 9 ++++++- internal/xds/translator/translator_test.go | 7 +++--- test/helm/gateway-addons-helm/e2e.in.yaml | 7 ------ test/helm/gateway-addons-helm/e2e.out.yaml | 2 +- 11 files changed, 77 insertions(+), 27 deletions(-) diff --git a/internal/cmd/egctl/translate.go b/internal/cmd/egctl/translate.go index 9c06ce3c790..3765c13d214 100644 --- a/internal/cmd/egctl/translate.go +++ b/internal/cmd/egctl/translate.go @@ -7,6 +7,7 @@ package egctl import ( "bufio" + "context" "encoding/json" "fmt" "io" @@ -291,7 +292,8 @@ func translateGatewayAPIToIR(resources *resource.Resources) (*gatewayapi.Transla } } - result, _ := t.Translate(resources) + ctx := context.Background() + result, _ := t.Translate(resources, ctx) return result, nil } @@ -311,7 +313,7 @@ func translateGatewayAPIToGatewayAPI(resources *resource.Resources) (resource.Re BackendEnabled: true, Logger: logging.DefaultLogger(io.Discard, egv1a1.LogLevelInfo), } - gRes, _ := gTranslator.Translate(resources) + gRes, _ := gTranslator.Translate(resources, context.Background()) // Update the status of the GatewayClass based on EnvoyProxy validation epInvalid := false if resources.EnvoyProxyForGatewayClass != nil { @@ -351,7 +353,7 @@ func TranslateGatewayAPIToXds(namespace, dnsDomain, resourceType string, resourc BackendEnabled: true, Logger: logging.DefaultLogger(io.Discard, egv1a1.LogLevelInfo), } - gRes, _ := gTranslator.Translate(resources) + gRes, _ := gTranslator.Translate(resources, context.Background()) keys := []string{} for key := range gRes.XdsIR { @@ -374,7 +376,7 @@ func TranslateGatewayAPIToXds(namespace, dnsDomain, resourceType string, resourc if resources.EnvoyProxyForGatewayClass != nil { xTranslator.FilterOrder = resources.EnvoyProxyForGatewayClass.Spec.FilterOrder } - xRes, err := xTranslator.Translate(val) + xRes, err := xTranslator.Translate(val, context.Background()) if err != nil { return nil, fmt.Errorf("failed to translate xds ir for key %s value %+v, error:%w", key, val, err) } diff --git a/internal/gatewayapi/runner/runner.go b/internal/gatewayapi/runner/runner.go index 3cc79eee80e..65f0fad0e73 100644 --- a/internal/gatewayapi/runner/runner.go +++ b/internal/gatewayapi/runner/runner.go @@ -18,6 +18,7 @@ import ( "github.com/telepresenceio/watchable" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" kerrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" @@ -144,7 +145,6 @@ func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *re // There is only 1 key which is the controller name // so when a delete is triggered, delete all keys if update.Delete || valWrapper == nil || valWrapper.Resources == nil { - span.AddEvent("delete_all_keys") r.deleteAllKeys() return } @@ -169,6 +169,7 @@ func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *re var backendTLSPolicyStatusCount, clientTrafficPolicyStatusCount, backendTrafficPolicyStatusCount int var securityPolicyStatusCount, envoyExtensionPolicyStatusCount, backendStatusCount, extensionServerPolicyStatusCount int + span.AddEvent("gateway_resources_translation_cycle", trace.WithAttributes(attribute.Int("resources.count", len(*val)))) for _, resources := range *val { // Translate and publish IRs. t := &gatewayapi.Translator{ @@ -199,11 +200,13 @@ func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *re traceLogger.Info("extension resources", "GVKs count", len(extGKs)) } // Translate to IR - result, err := t.Translate(resources) + _, translateToIRSpan := tracer.Start(parentCtx, "GatewayApiRunner.ResoureTranslationCycle.TranslateToIR") + result, err := t.Translate(resources, parentCtx) if err != nil { // Currently all errors that Translate returns should just be logged traceLogger.Error(err, "errors detected during translation", "gateway-class", resources.GatewayClass.Name) } + translateToIRSpan.End() // Publish the IRs. // Also validate the ir before sending it. @@ -243,6 +246,7 @@ func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *re } // Update Status + _, statusUpdateSpan := tracer.Start(parentCtx, "GatewayApiRunner.ResoureTranslationCycle.UpdateStatus") if result.GatewayClass != nil { key := utils.NamespacedName(result.GatewayClass) r.ProviderResources.GatewayClassStatuses.Store(key, &result.GatewayClass.Status) @@ -365,6 +369,7 @@ func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *re delete(keysToDelete.ExtensionServerPolicyStatus, key) r.keyCache.ExtensionServerPolicyStatus[key] = true } + statusUpdateSpan.End() } // Publish aggregated metrics diff --git a/internal/gatewayapi/translator.go b/internal/gatewayapi/translator.go index cf943962ce3..dd9f25a35ac 100644 --- a/internal/gatewayapi/translator.go +++ b/internal/gatewayapi/translator.go @@ -6,9 +6,12 @@ package gatewayapi import ( + "context" "errors" "fmt" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" "golang.org/x/exp/maps" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" @@ -49,10 +52,13 @@ const ( wellKnownPortShift = 10000 ) -var _ TranslatorManager = (*Translator)(nil) +var ( + _ TranslatorManager = (*Translator)(nil) + tracer = otel.Tracer("envoy-gateway/gateway-api/translator") +) type TranslatorManager interface { - Translate(resources *resource.Resources) (*TranslateResult, error) + Translate(resources *resource.Resources, ctx context.Context) (*TranslateResult, error) GetRelevantGateways(resources *resource.Resources) (acceptedGateways, failedGateways []*GatewayContext) RoutesTranslator @@ -217,7 +223,10 @@ func newTranslateResult( return translateResult } -func (t *Translator) Translate(resources *resource.Resources) (*TranslateResult, error) { +func (t *Translator) Translate(resources *resource.Resources, ctx context.Context) (*TranslateResult, error) { + _, span := tracer.Start(ctx, "Translator.Translate") + defer span.End() + span.SetAttributes(getAttributes(resources)...) var errs error // Get Gateways belonging to our GatewayClass. @@ -529,3 +538,16 @@ func (t *Translator) IRKey(gatewayNN types.NamespacedName) string { } return irStringKey(gatewayNN.Namespace, gatewayNN.Name) } + +func getAttributes(resources *resource.Resources) []attribute.KeyValue { + attrs := []attribute.KeyValue{} + if resources.GatewayClass == nil { + return attrs + } + attrs = append(attrs, attribute.String("gateway-class", resources.GatewayClass.Name)) + attrs = append(attrs, attribute.String("gateway-class-namespace", resources.GatewayClass.Namespace)) + if resources.GatewayClass.Spec.ControllerName != "" { + attrs = append(attrs, attribute.String("gateway-class-controller-name", string(resources.GatewayClass.Spec.ControllerName))) + } + return attrs +} diff --git a/internal/gatewayapi/translator_test.go b/internal/gatewayapi/translator_test.go index 2a8432c278a..72d11ef49f5 100644 --- a/internal/gatewayapi/translator_test.go +++ b/internal/gatewayapi/translator_test.go @@ -421,7 +421,7 @@ func TestTranslate(t *testing.T) { }, }) - got, _ := translator.Translate(resources) + got, _ := translator.Translate(resources, context.Background()) require.NoError(t, field.SetValue(got, "LastTransitionTime", metav1.NewTime(time.Time{}))) outputFilePath := strings.ReplaceAll(inputFile, ".in.yaml", ".out.yaml") out, err := yaml.Marshal(got) @@ -696,7 +696,7 @@ func TestTranslateWithExtensionKinds(t *testing.T) { }, }) - got, _ := translator.Translate(resources) + got, _ := translator.Translate(resources, context.Background()) require.NoError(t, field.SetValue(got, "LastTransitionTime", metav1.NewTime(time.Time{}))) // Also fix lastTransitionTime in unstructured members for i := range got.ExtensionServerPolicies { diff --git a/internal/globalratelimit/runner/runner.go b/internal/globalratelimit/runner/runner.go index 90859f5604c..666580f85a3 100644 --- a/internal/globalratelimit/runner/runner.go +++ b/internal/globalratelimit/runner/runner.go @@ -20,6 +20,8 @@ import ( resourcev3 "github.com/envoyproxy/go-control-plane/pkg/resource/v3" serverv3 "github.com/envoyproxy/go-control-plane/pkg/server/v3" "github.com/telepresenceio/watchable" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" "google.golang.org/grpc" "google.golang.org/grpc/credentials" @@ -47,6 +49,8 @@ const ( rateLimitTLSCACertFilepath = "/certs/ca.crt" ) +var tracer = otel.Tracer("envoy-gateway/global-rate-limit/runner") + type Config struct { config.Server XdsIR *message.XdsIR @@ -138,7 +142,20 @@ func (r *Runner) translateFromSubscription(ctx context.Context, c <-chan watchab message.HandleSubscription(message.Metadata{Runner: r.Name(), Message: message.XDSIRMessageName}, c, func(update message.Update[string, *message.XdsIRWithContext], errChan chan error) { - r.Logger.Info("received a notification") + parentCtx := context.Background() + if update.Value != nil && update.Value.Context != nil { + parentCtx = update.Value.Context + } + traceLogger := r.Logger.WithTrace(parentCtx) + traceLogger.Info("received a notification") + + _, span := tracer.Start(parentCtx, "GlobalRateLimitRunner.translateFromSubscription") + defer span.End() + + span.SetAttributes( + attribute.String("controller.key", update.Key), + attribute.Bool("update.delete", update.Delete), + ) if update.Delete { delete(rateLimitConfigsCache, update.Key) @@ -183,6 +200,9 @@ func (r *Runner) translate(xdsIR *ir.Xds) (*types.ResourceVersionTable, error) { } func (r *Runner) updateSnapshot(ctx context.Context, resource types.XdsResources) { + _, span := tracer.Start(ctx, "GlobalRateLimitRunner.updateSnapshot") + defer span.End() + if r.cache == nil { r.Logger.Error(nil, "failed to init the snapshot cache") return diff --git a/internal/xds/cache/snapshotcache.go b/internal/xds/cache/snapshotcache.go index afbfa8866fb..dccbf4f2f48 100644 --- a/internal/xds/cache/snapshotcache.go +++ b/internal/xds/cache/snapshotcache.go @@ -35,7 +35,7 @@ import ( var ( Hash = cachev3.IDHash{} - tracer = otel.Tracer("envoy-gateway/gateway-api") + tracer = otel.Tracer("envoy-gateway/xds/snapshotcache") ) // SnapshotCacheWithCallbacks uses the go-control-plane SimpleCache to store snapshots of diff --git a/internal/xds/runner/runner.go b/internal/xds/runner/runner.go index bf60b0e4539..58caad0d7bc 100644 --- a/internal/xds/runner/runner.go +++ b/internal/xds/runner/runner.go @@ -66,7 +66,7 @@ const ( defaultMaxConnectionAgeGrace = 2 * time.Minute ) -var tracer = otel.Tracer("envoy-gateway/gateway-api") +var tracer = otel.Tracer("envoy-gateway/xds") var maxConnectionAgeValues = []time.Duration{ 10 * time.Hour, @@ -316,7 +316,7 @@ func (r *Runner) translateFromSubscription(sub <-chan watchable.Snapshot[string, } } - result, err := t.Translate(val.XdsIR) + result, err := t.Translate(val.XdsIR, parentCtx) if err != nil { traceLogger.Error(err, "failed to translate xds ir") errChan <- err diff --git a/internal/xds/translator/translator.go b/internal/xds/translator/translator.go index 5e8a541ad0e..385cf90feb8 100644 --- a/internal/xds/translator/translator.go +++ b/internal/xds/translator/translator.go @@ -6,6 +6,7 @@ package translator import ( + "context" "errors" "fmt" "runtime" @@ -22,6 +23,7 @@ import ( matcherv3 "github.com/envoyproxy/go-control-plane/envoy/type/matcher/v3" resourcev3 "github.com/envoyproxy/go-control-plane/pkg/resource/v3" "github.com/envoyproxy/go-control-plane/pkg/wellknown" + "go.opentelemetry.io/otel" protobuf "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/anypb" "google.golang.org/protobuf/types/known/wrapperspb" @@ -44,6 +46,8 @@ const ( emptyClusterName = "EmptyCluster" ) +var tracer = otel.Tracer("envoy-gateway/xds/translator") + // The dummy cluster for TCP/UDP listeners that have no routes var emptyRouteCluster = &clusterv3.Cluster{ Name: emptyClusterName, @@ -94,7 +98,10 @@ type GlobalRateLimitSettings struct { } // Translate translates the XDS IR into xDS resources -func (t *Translator) Translate(xdsIR *ir.Xds) (*types.ResourceVersionTable, error) { +func (t *Translator) Translate(xdsIR *ir.Xds, ctx context.Context) (*types.ResourceVersionTable, error) { + _, span := tracer.Start(ctx, "Translator.Translate") + defer span.End() + if xdsIR == nil { return nil, errors.New("ir is nil") } diff --git a/internal/xds/translator/translator_test.go b/internal/xds/translator/translator_test.go index 0f1849377f7..29e9786cf13 100644 --- a/internal/xds/translator/translator_test.go +++ b/internal/xds/translator/translator_test.go @@ -6,6 +6,7 @@ package translator import ( + "context" "embed" "encoding/json" "os" @@ -180,7 +181,7 @@ func TestTranslateXds(t *testing.T) { FilterOrder: x.FilterOrder, RuntimeFlags: cfg.runtimeFlags, } - tCtx, err := tr.Translate(x) + tCtx, err := tr.Translate(x, context.Background()) if !strings.HasSuffix(inputFileName, "partial-invalid") && len(cfg.errMsg) == 0 { t.Log(inputFileName) require.NoError(t, err) @@ -384,7 +385,7 @@ func TestTranslateXdsWithExtensionErrorsWhenFailOpen(t *testing.T) { defer closeFunc() tr.ExtensionManager = &extMgr - tCtx, err := tr.Translate(x) + tCtx, err := tr.Translate(x, context.Background()) if len(cfg.errMsg) > 0 { require.EqualError(t, err, cfg.errMsg) } else { @@ -525,7 +526,7 @@ func TestTranslateXdsWithExtensionErrorsWhenFailClosed(t *testing.T) { defer closeFunc() tr.ExtensionManager = &extMgr - _, err = tr.Translate(x) + _, err = tr.Translate(x, context.Background()) require.EqualError(t, err, cfg.errMsg) }) } diff --git a/test/helm/gateway-addons-helm/e2e.in.yaml b/test/helm/gateway-addons-helm/e2e.in.yaml index bd554ba47dd..bf913c259a9 100644 --- a/test/helm/gateway-addons-helm/e2e.in.yaml +++ b/test/helm/gateway-addons-helm/e2e.in.yaml @@ -4,12 +4,5 @@ grafana: enabled: false opentelemetry-collector: enabled: true - mode: deployment - service: - type: LoadBalancer fluent-bit: enabled: false -tempo: - enabled: true - service: - type: LoadBalancer diff --git a/test/helm/gateway-addons-helm/e2e.out.yaml b/test/helm/gateway-addons-helm/e2e.out.yaml index 4b4d2925337..6978f3dcefc 100644 --- a/test/helm/gateway-addons-helm/e2e.out.yaml +++ b/test/helm/gateway-addons-helm/e2e.out.yaml @@ -10334,7 +10334,7 @@ metadata: app.kubernetes.io/component: standalone-collector component: standalone-collector spec: - type: LoadBalancer + type: ClusterIP ports: - name: datadog From 3bf3309146cf3731ae6e1a4d90efb5a4ef4a1586 Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Fri, 21 Nov 2025 12:49:19 +0530 Subject: [PATCH 29/35] lint Signed-off-by: Shreemaan Abhishek --- internal/gatewayapi/translator.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/internal/gatewayapi/translator.go b/internal/gatewayapi/translator.go index dd9f25a35ac..57efcced186 100644 --- a/internal/gatewayapi/translator.go +++ b/internal/gatewayapi/translator.go @@ -544,8 +544,10 @@ func getAttributes(resources *resource.Resources) []attribute.KeyValue { if resources.GatewayClass == nil { return attrs } - attrs = append(attrs, attribute.String("gateway-class", resources.GatewayClass.Name)) - attrs = append(attrs, attribute.String("gateway-class-namespace", resources.GatewayClass.Namespace)) + attrs = append(attrs, + attribute.String("gateway-class", resources.GatewayClass.Name), + attribute.String("gateway-class-namespace", resources.GatewayClass.Namespace), + ) if resources.GatewayClass.Spec.ControllerName != "" { attrs = append(attrs, attribute.String("gateway-class-controller-name", string(resources.GatewayClass.Spec.ControllerName))) } From 1e1322b40fbdd881e61d2cc5e929b5c7084d6f01 Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Sun, 23 Nov 2025 11:11:27 +0545 Subject: [PATCH 30/35] equal and tests Signed-off-by: Shreemaan Abhishek --- internal/gatewayapi/resource/resource.go | 19 +++++++++++ internal/gatewayapi/resource/resource_test.go | 32 +++++++++++++++++++ internal/message/types.go | 18 +++++++++++ 3 files changed, 69 insertions(+) diff --git a/internal/gatewayapi/resource/resource.go b/internal/gatewayapi/resource/resource.go index 4201cb801ff..9b0d3600e97 100644 --- a/internal/gatewayapi/resource/resource.go +++ b/internal/gatewayapi/resource/resource.go @@ -7,6 +7,7 @@ package resource import ( "context" + "reflect" "sort" certificatesv1b1 "k8s.io/api/certificates/v1beta1" @@ -228,6 +229,24 @@ func (c *ControllerResourcesContext) DeepCopy() *ControllerResourcesContext { } } +// Equal compares two Resources objects for equality. +func (c *ControllerResourcesContext) Equal(other *ControllerResourcesContext) bool { + if c == nil && other == nil { + return true + } + if c == nil || other == nil { + return false + } + if c.Resources == nil && other.Resources == nil { + return true + } + if c.Resources == nil || other.Resources == nil { + return false + } + + return reflect.DeepEqual(c.Resources, other.Resources) +} + // DeepCopy creates a new ControllerResources. // It is handwritten since the tooling was unable to copy into a new slice func (c *ControllerResources) DeepCopy() *ControllerResources { diff --git a/internal/gatewayapi/resource/resource_test.go b/internal/gatewayapi/resource/resource_test.go index 9e6bb054e4b..823a08b4914 100644 --- a/internal/gatewayapi/resource/resource_test.go +++ b/internal/gatewayapi/resource/resource_test.go @@ -131,6 +131,38 @@ func TestEqualXds(t *testing.T) { } } +func TestEqualControllerResourcesContext(t *testing.T) { + c1 := context.Background() + c2 := context.TODO() + r1 := &ControllerResourcesContext{ + Resources: &ControllerResources{ + { + GatewayClass: &gwapiv1.GatewayClass{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo", + }, + }, + }, + }, + Context: c1, + } + r2 := &ControllerResourcesContext{ + Resources: &ControllerResources{ + { + GatewayClass: &gwapiv1.GatewayClass{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo", + }, + }, + }, + }, + Context: c2, + } + + assert.True(t, r1.Equal(r2)) + assert.True(t, r2.Equal(r1)) +} + func TestGetEndpointSlicesForBackendDualStack(t *testing.T) { // Test data setup dualStackService := &discoveryv1.EndpointSlice{ diff --git a/internal/message/types.go b/internal/message/types.go index 863bcdc31cd..df32350a663 100644 --- a/internal/message/types.go +++ b/internal/message/types.go @@ -7,6 +7,7 @@ package message import ( "context" + "reflect" "github.com/telepresenceio/watchable" "k8s.io/apimachinery/pkg/runtime/schema" @@ -150,6 +151,23 @@ func (x *XdsIRWithContext) DeepCopy() *XdsIRWithContext { } } +func (x *XdsIRWithContext) Equal(other *XdsIRWithContext) bool { + if x == nil && other == nil { + return true + } + if x == nil || other == nil { + return false + } + if x.XdsIR == nil && other.XdsIR == nil { + return true + } + if x.XdsIR == nil || other.XdsIR == nil { + return false + } + + return reflect.DeepEqual(x.XdsIR, other.XdsIR) +} + // XdsIR message type XdsIR struct { watchable.Map[string, *XdsIRWithContext] From cd7fdd67dbe6d2cd2a2d04d5f0716812f4fad3ce Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Sun, 23 Nov 2025 11:18:16 +0545 Subject: [PATCH 31/35] equal and tests Signed-off-by: Shreemaan Abhishek --- internal/message/types_test.go | 83 ++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 internal/message/types_test.go diff --git a/internal/message/types_test.go b/internal/message/types_test.go new file mode 100644 index 00000000000..5f05ee1f91b --- /dev/null +++ b/internal/message/types_test.go @@ -0,0 +1,83 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +package message_test + +import ( + "context" + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + + egv1a1 "github.com/envoyproxy/gateway/api/v1alpha1" + "github.com/envoyproxy/gateway/internal/ir" + "github.com/envoyproxy/gateway/internal/message" +) + +// XdsIRWithContext structs with differing context values should be Equal +func TestXdsWithContextEqual(t *testing.T) { + xdsIR := &ir.Xds{ + HTTP: []*ir.HTTPListener{ + { + CoreListenerDetails: ir.CoreListenerDetails{ + Name: fmt.Sprintf("default/%s/listener-0", "gwName"), + }, + Routes: []*ir.HTTPRoute{ + { + Name: "route-0", + Traffic: &ir.TrafficFeatures{ + RateLimit: &ir.RateLimit{ + Global: &ir.GlobalRateLimit{ + Rules: []*ir.RateLimitRule{ + { + HeaderMatches: []*ir.StringMatch{ + { + Name: "x-user-id", + Distinct: true, + }, + }, + Limit: ir.RateLimitValue{ + Requests: 100, + Unit: ir.RateLimitUnit(egv1a1.RateLimitUnitMinute), + }, + }, + { + HeaderMatches: []*ir.StringMatch{ + { + Name: "x-another-user-id", + Distinct: true, + }, + }, + Limit: ir.RateLimitValue{ + Requests: 10, + Unit: ir.RateLimitUnit(egv1a1.RateLimitUnitSecond), + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + + c1 := context.Background() + c2 := context.TODO() + + x1 := &message.XdsIRWithContext{ + XdsIR: xdsIR, + Context: c1, + } + x2 := &message.XdsIRWithContext{ + XdsIR: xdsIR, + Context: c2, + } + + assert.True(t, x1.Equal(x2)) + assert.True(t, x2.Equal(x1)) +} From 9836161c279b306af1eb0fa30c9c3575e6db857a Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Sun, 23 Nov 2025 12:14:18 +0545 Subject: [PATCH 32/35] no otel things in translator Signed-off-by: Shreemaan Abhishek --- internal/cmd/egctl/translate.go | 10 +++----- internal/gatewayapi/runner/runner.go | 4 +-- internal/gatewayapi/translator.go | 30 +++------------------- internal/gatewayapi/translator_test.go | 4 +-- internal/xds/runner/runner.go | 4 ++- internal/xds/translator/translator.go | 9 +------ internal/xds/translator/translator_test.go | 7 +++-- 7 files changed, 18 insertions(+), 50 deletions(-) diff --git a/internal/cmd/egctl/translate.go b/internal/cmd/egctl/translate.go index 3765c13d214..9c06ce3c790 100644 --- a/internal/cmd/egctl/translate.go +++ b/internal/cmd/egctl/translate.go @@ -7,7 +7,6 @@ package egctl import ( "bufio" - "context" "encoding/json" "fmt" "io" @@ -292,8 +291,7 @@ func translateGatewayAPIToIR(resources *resource.Resources) (*gatewayapi.Transla } } - ctx := context.Background() - result, _ := t.Translate(resources, ctx) + result, _ := t.Translate(resources) return result, nil } @@ -313,7 +311,7 @@ func translateGatewayAPIToGatewayAPI(resources *resource.Resources) (resource.Re BackendEnabled: true, Logger: logging.DefaultLogger(io.Discard, egv1a1.LogLevelInfo), } - gRes, _ := gTranslator.Translate(resources, context.Background()) + gRes, _ := gTranslator.Translate(resources) // Update the status of the GatewayClass based on EnvoyProxy validation epInvalid := false if resources.EnvoyProxyForGatewayClass != nil { @@ -353,7 +351,7 @@ func TranslateGatewayAPIToXds(namespace, dnsDomain, resourceType string, resourc BackendEnabled: true, Logger: logging.DefaultLogger(io.Discard, egv1a1.LogLevelInfo), } - gRes, _ := gTranslator.Translate(resources, context.Background()) + gRes, _ := gTranslator.Translate(resources) keys := []string{} for key := range gRes.XdsIR { @@ -376,7 +374,7 @@ func TranslateGatewayAPIToXds(namespace, dnsDomain, resourceType string, resourc if resources.EnvoyProxyForGatewayClass != nil { xTranslator.FilterOrder = resources.EnvoyProxyForGatewayClass.Spec.FilterOrder } - xRes, err := xTranslator.Translate(val, context.Background()) + xRes, err := xTranslator.Translate(val) if err != nil { return nil, fmt.Errorf("failed to translate xds ir for key %s value %+v, error:%w", key, val, err) } diff --git a/internal/gatewayapi/runner/runner.go b/internal/gatewayapi/runner/runner.go index 65f0fad0e73..8393893828d 100644 --- a/internal/gatewayapi/runner/runner.go +++ b/internal/gatewayapi/runner/runner.go @@ -201,12 +201,12 @@ func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *re } // Translate to IR _, translateToIRSpan := tracer.Start(parentCtx, "GatewayApiRunner.ResoureTranslationCycle.TranslateToIR") - result, err := t.Translate(resources, parentCtx) + result, err := t.Translate(resources) + translateToIRSpan.End() if err != nil { // Currently all errors that Translate returns should just be logged traceLogger.Error(err, "errors detected during translation", "gateway-class", resources.GatewayClass.Name) } - translateToIRSpan.End() // Publish the IRs. // Also validate the ir before sending it. diff --git a/internal/gatewayapi/translator.go b/internal/gatewayapi/translator.go index 57efcced186..cf943962ce3 100644 --- a/internal/gatewayapi/translator.go +++ b/internal/gatewayapi/translator.go @@ -6,12 +6,9 @@ package gatewayapi import ( - "context" "errors" "fmt" - "go.opentelemetry.io/otel" - "go.opentelemetry.io/otel/attribute" "golang.org/x/exp/maps" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" @@ -52,13 +49,10 @@ const ( wellKnownPortShift = 10000 ) -var ( - _ TranslatorManager = (*Translator)(nil) - tracer = otel.Tracer("envoy-gateway/gateway-api/translator") -) +var _ TranslatorManager = (*Translator)(nil) type TranslatorManager interface { - Translate(resources *resource.Resources, ctx context.Context) (*TranslateResult, error) + Translate(resources *resource.Resources) (*TranslateResult, error) GetRelevantGateways(resources *resource.Resources) (acceptedGateways, failedGateways []*GatewayContext) RoutesTranslator @@ -223,10 +217,7 @@ func newTranslateResult( return translateResult } -func (t *Translator) Translate(resources *resource.Resources, ctx context.Context) (*TranslateResult, error) { - _, span := tracer.Start(ctx, "Translator.Translate") - defer span.End() - span.SetAttributes(getAttributes(resources)...) +func (t *Translator) Translate(resources *resource.Resources) (*TranslateResult, error) { var errs error // Get Gateways belonging to our GatewayClass. @@ -538,18 +529,3 @@ func (t *Translator) IRKey(gatewayNN types.NamespacedName) string { } return irStringKey(gatewayNN.Namespace, gatewayNN.Name) } - -func getAttributes(resources *resource.Resources) []attribute.KeyValue { - attrs := []attribute.KeyValue{} - if resources.GatewayClass == nil { - return attrs - } - attrs = append(attrs, - attribute.String("gateway-class", resources.GatewayClass.Name), - attribute.String("gateway-class-namespace", resources.GatewayClass.Namespace), - ) - if resources.GatewayClass.Spec.ControllerName != "" { - attrs = append(attrs, attribute.String("gateway-class-controller-name", string(resources.GatewayClass.Spec.ControllerName))) - } - return attrs -} diff --git a/internal/gatewayapi/translator_test.go b/internal/gatewayapi/translator_test.go index 72d11ef49f5..2a8432c278a 100644 --- a/internal/gatewayapi/translator_test.go +++ b/internal/gatewayapi/translator_test.go @@ -421,7 +421,7 @@ func TestTranslate(t *testing.T) { }, }) - got, _ := translator.Translate(resources, context.Background()) + got, _ := translator.Translate(resources) require.NoError(t, field.SetValue(got, "LastTransitionTime", metav1.NewTime(time.Time{}))) outputFilePath := strings.ReplaceAll(inputFile, ".in.yaml", ".out.yaml") out, err := yaml.Marshal(got) @@ -696,7 +696,7 @@ func TestTranslateWithExtensionKinds(t *testing.T) { }, }) - got, _ := translator.Translate(resources, context.Background()) + got, _ := translator.Translate(resources) require.NoError(t, field.SetValue(got, "LastTransitionTime", metav1.NewTime(time.Time{}))) // Also fix lastTransitionTime in unstructured members for i := range got.ExtensionServerPolicies { diff --git a/internal/xds/runner/runner.go b/internal/xds/runner/runner.go index 58caad0d7bc..838813015a2 100644 --- a/internal/xds/runner/runner.go +++ b/internal/xds/runner/runner.go @@ -316,7 +316,9 @@ func (r *Runner) translateFromSubscription(sub <-chan watchable.Snapshot[string, } } - result, err := t.Translate(val.XdsIR, parentCtx) + _, translateSpan := tracer.Start(parentCtx, "Translator.Translate") + result, err := t.Translate(val.XdsIR) + translateSpan.End() if err != nil { traceLogger.Error(err, "failed to translate xds ir") errChan <- err diff --git a/internal/xds/translator/translator.go b/internal/xds/translator/translator.go index 385cf90feb8..5e8a541ad0e 100644 --- a/internal/xds/translator/translator.go +++ b/internal/xds/translator/translator.go @@ -6,7 +6,6 @@ package translator import ( - "context" "errors" "fmt" "runtime" @@ -23,7 +22,6 @@ import ( matcherv3 "github.com/envoyproxy/go-control-plane/envoy/type/matcher/v3" resourcev3 "github.com/envoyproxy/go-control-plane/pkg/resource/v3" "github.com/envoyproxy/go-control-plane/pkg/wellknown" - "go.opentelemetry.io/otel" protobuf "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/anypb" "google.golang.org/protobuf/types/known/wrapperspb" @@ -46,8 +44,6 @@ const ( emptyClusterName = "EmptyCluster" ) -var tracer = otel.Tracer("envoy-gateway/xds/translator") - // The dummy cluster for TCP/UDP listeners that have no routes var emptyRouteCluster = &clusterv3.Cluster{ Name: emptyClusterName, @@ -98,10 +94,7 @@ type GlobalRateLimitSettings struct { } // Translate translates the XDS IR into xDS resources -func (t *Translator) Translate(xdsIR *ir.Xds, ctx context.Context) (*types.ResourceVersionTable, error) { - _, span := tracer.Start(ctx, "Translator.Translate") - defer span.End() - +func (t *Translator) Translate(xdsIR *ir.Xds) (*types.ResourceVersionTable, error) { if xdsIR == nil { return nil, errors.New("ir is nil") } diff --git a/internal/xds/translator/translator_test.go b/internal/xds/translator/translator_test.go index 29e9786cf13..0f1849377f7 100644 --- a/internal/xds/translator/translator_test.go +++ b/internal/xds/translator/translator_test.go @@ -6,7 +6,6 @@ package translator import ( - "context" "embed" "encoding/json" "os" @@ -181,7 +180,7 @@ func TestTranslateXds(t *testing.T) { FilterOrder: x.FilterOrder, RuntimeFlags: cfg.runtimeFlags, } - tCtx, err := tr.Translate(x, context.Background()) + tCtx, err := tr.Translate(x) if !strings.HasSuffix(inputFileName, "partial-invalid") && len(cfg.errMsg) == 0 { t.Log(inputFileName) require.NoError(t, err) @@ -385,7 +384,7 @@ func TestTranslateXdsWithExtensionErrorsWhenFailOpen(t *testing.T) { defer closeFunc() tr.ExtensionManager = &extMgr - tCtx, err := tr.Translate(x, context.Background()) + tCtx, err := tr.Translate(x) if len(cfg.errMsg) > 0 { require.EqualError(t, err, cfg.errMsg) } else { @@ -526,7 +525,7 @@ func TestTranslateXdsWithExtensionErrorsWhenFailClosed(t *testing.T) { defer closeFunc() tr.ExtensionManager = &extMgr - _, err = tr.Translate(x, context.Background()) + _, err = tr.Translate(x) require.EqualError(t, err, cfg.errMsg) }) } From d96ac3a0373436e0838155582c54a23d8a111d9b Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Thu, 27 Nov 2025 10:44:23 +0545 Subject: [PATCH 33/35] review comments Signed-off-by: Shreemaan Abhishek --- internal/cmd/server.go | 8 ++++---- internal/gatewayapi/runner/runner.go | 15 +++++++-------- internal/globalratelimit/runner/runner.go | 13 +++++++------ internal/logging/log.go | 1 - internal/xds/cache/snapshotcache.go | 18 ++++++++++++++++++ internal/xds/runner/runner.go | 15 +++++++-------- 6 files changed, 43 insertions(+), 27 deletions(-) diff --git a/internal/cmd/server.go b/internal/cmd/server.go index 2cf0a4d5b2b..5582dab2666 100644 --- a/internal/cmd/server.go +++ b/internal/cmd/server.go @@ -157,6 +157,10 @@ func startRunners(ctx context.Context, cfg *config.Server) (err error) { runners := []struct { runner Runner }{ + { + // Start the Traces Server + runner: traces.New(cfg), + }, { // Start the Provider Service // It fetches the resources from the configured provider type @@ -214,10 +218,6 @@ func startRunners(ctx context.Context, cfg *config.Server) (err error) { // It provides metrics endpoints for monitoring. runner: metrics.New(cfg), }, - { - // Start the Traces Server - runner: traces.New(cfg), - }, } // Start all runners diff --git a/internal/gatewayapi/runner/runner.go b/internal/gatewayapi/runner/runner.go index 8393893828d..35a5c8fd33c 100644 --- a/internal/gatewayapi/runner/runner.go +++ b/internal/gatewayapi/runner/runner.go @@ -135,12 +135,11 @@ func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *re parentCtx = update.Value.Context } - traceLogger := r.Logger.WithTrace(parentCtx) - + traceCtx, span := tracer.Start(parentCtx, "GatewayApiRunner.subscribeAndTranslate") + defer span.End() + traceLogger := r.Logger.WithTrace(traceCtx) traceLogger.Info("received an update", "key", update.Key) - _, span := tracer.Start(parentCtx, "GatewayApiRunner.subscribeAndTranslate") - defer span.End() valWrapper := update.Value // There is only 1 key which is the controller name // so when a delete is triggered, delete all keys @@ -169,7 +168,7 @@ func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *re var backendTLSPolicyStatusCount, clientTrafficPolicyStatusCount, backendTrafficPolicyStatusCount int var securityPolicyStatusCount, envoyExtensionPolicyStatusCount, backendStatusCount, extensionServerPolicyStatusCount int - span.AddEvent("gateway_resources_translation_cycle", trace.WithAttributes(attribute.Int("resources.count", len(*val)))) + span.AddEvent("translate", trace.WithAttributes(attribute.Int("resources.count", len(*val)))) for _, resources := range *val { // Translate and publish IRs. t := &gatewayapi.Translator{ @@ -200,7 +199,7 @@ func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *re traceLogger.Info("extension resources", "GVKs count", len(extGKs)) } // Translate to IR - _, translateToIRSpan := tracer.Start(parentCtx, "GatewayApiRunner.ResoureTranslationCycle.TranslateToIR") + _, translateToIRSpan := tracer.Start(traceCtx, "GatewayApiRunner.ResoureTranslationCycle.TranslateToIR") result, err := t.Translate(resources) translateToIRSpan.End() if err != nil { @@ -238,7 +237,7 @@ func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *re } else { m := message.XdsIRWithContext{ XdsIR: val, - Context: parentCtx, + Context: traceCtx, } r.XdsIR.Store(key, &m) xdsIRCount++ @@ -246,7 +245,7 @@ func (r *Runner) subscribeAndTranslate(sub <-chan watchable.Snapshot[string, *re } // Update Status - _, statusUpdateSpan := tracer.Start(parentCtx, "GatewayApiRunner.ResoureTranslationCycle.UpdateStatus") + _, statusUpdateSpan := tracer.Start(traceCtx, "GatewayApiRunner.ResoureTranslationCycle.UpdateStatus") if result.GatewayClass != nil { key := utils.NamespacedName(result.GatewayClass) r.ProviderResources.GatewayClassStatuses.Store(key, &result.GatewayClass.Status) diff --git a/internal/globalratelimit/runner/runner.go b/internal/globalratelimit/runner/runner.go index 666580f85a3..8793bd13d15 100644 --- a/internal/globalratelimit/runner/runner.go +++ b/internal/globalratelimit/runner/runner.go @@ -146,20 +146,21 @@ func (r *Runner) translateFromSubscription(ctx context.Context, c <-chan watchab if update.Value != nil && update.Value.Context != nil { parentCtx = update.Value.Context } - traceLogger := r.Logger.WithTrace(parentCtx) - traceLogger.Info("received a notification") - _, span := tracer.Start(parentCtx, "GlobalRateLimitRunner.translateFromSubscription") + traceCtx, span := tracer.Start(parentCtx, "GlobalRateLimitRunner.translateFromSubscription") defer span.End() + traceLogger := r.Logger.WithTrace(traceCtx) + traceLogger.Info("received a notification") + span.SetAttributes( - attribute.String("controller.key", update.Key), + attribute.String("xds-ir.key", update.Key), attribute.Bool("update.delete", update.Delete), ) if update.Delete { delete(rateLimitConfigsCache, update.Key) - r.updateSnapshot(ctx, buildXDSResourceFromCache(rateLimitConfigsCache)) + r.updateSnapshot(traceCtx, buildXDSResourceFromCache(rateLimitConfigsCache)) } else { // Translate to ratelimit xDS Config. rvt, err := r.translate(update.Value.XdsIR) @@ -172,7 +173,7 @@ func (r *Runner) translateFromSubscription(ctx context.Context, c <-chan watchab if rvt != nil { // Build XdsResources to use for the snapshot update from the cache. rateLimitConfigsCache[update.Key] = rvt.XdsResources[resourcev3.RateLimitConfigType] - r.updateSnapshot(ctx, buildXDSResourceFromCache(rateLimitConfigsCache)) + r.updateSnapshot(traceCtx, buildXDSResourceFromCache(rateLimitConfigsCache)) } } }, diff --git a/internal/logging/log.go b/internal/logging/log.go index 641bc481d53..27b02a01571 100644 --- a/internal/logging/log.go +++ b/internal/logging/log.go @@ -102,7 +102,6 @@ func (l Logger) WithTrace(ctx context.Context) Logger { fields := []interface{}{ "trace_id", sc.TraceID().String(), "span_id", sc.SpanID().String(), - "trace_flags", sc.TraceFlags().String(), } if ts := sc.TraceState(); ts.Len() > 0 { diff --git a/internal/xds/cache/snapshotcache.go b/internal/xds/cache/snapshotcache.go index dccbf4f2f48..72aaa1ba142 100644 --- a/internal/xds/cache/snapshotcache.go +++ b/internal/xds/cache/snapshotcache.go @@ -16,6 +16,7 @@ package cache import ( "context" "fmt" + "math" "strconv" "sync" "time" @@ -69,6 +70,7 @@ type snapshotCache struct { nodeFrequency nodeFrequencyMap streamDuration streamDurationMap deltaStreamDuration streamDurationMap + snapshotVersion int64 lastSnapshot snapshotMap log *zap.SugaredLogger mu sync.Mutex @@ -85,6 +87,9 @@ func (s *snapshotCache) GenerateNewSnapshot(irKey string, resources types.XdsRes sc := trace.SpanContextFromContext(ctx) version := sc.TraceID().String() + if !sc.IsValid() { + version = s.newSnapshotVersion() + } // Create a snapshot with all xDS resources. snapshot, err := cachev3.NewSnapshot( @@ -119,6 +124,19 @@ func (s *snapshotCache) GenerateNewSnapshot(irKey string, resources types.XdsRes return nil } +// newSnapshotVersion increments the current snapshotVersion +// and returns as a string. +func (s *snapshotCache) newSnapshotVersion() string { + // Reset the snapshotVersion if it ever hits max size. + if s.snapshotVersion == math.MaxInt64 { + s.snapshotVersion = 0 + } + + // Increment the snapshot version & return as string. + s.snapshotVersion++ + return strconv.FormatInt(s.snapshotVersion, 10) +} + // NewSnapshotCache gives you a fresh SnapshotCache. // It needs a logger that supports the go-control-plane // required interface (Debugf, Infof, Warnf, and Errorf). diff --git a/internal/xds/runner/runner.go b/internal/xds/runner/runner.go index 838813015a2..6bbc841fb0b 100644 --- a/internal/xds/runner/runner.go +++ b/internal/xds/runner/runner.go @@ -265,23 +265,22 @@ func (r *Runner) translateFromSubscription(sub <-chan watchable.Snapshot[string, parentCtx = update.Value.Context } - traceLogger := r.Logger.WithTrace(parentCtx) - traceLogger.Info("received an update") - - _, span := tracer.Start(parentCtx, "XdsRunner.subscribeAndTranslate") + traceCtx, span := tracer.Start(parentCtx, "XdsRunner.subscribeAndTranslate") defer span.End() + traceLogger := r.Logger.WithTrace(traceCtx) + traceLogger.Info("received an update") key := update.Key val := update.Value // Add span attributes for observability span.SetAttributes( - attribute.String("controller.key", update.Key), + attribute.String("xds-ir.key", update.Key), attribute.Bool("update.delete", update.Delete), ) if update.Delete { - if err := r.cache.GenerateNewSnapshot(key, nil, parentCtx); err != nil { + if err := r.cache.GenerateNewSnapshot(key, nil, traceCtx); err != nil { traceLogger.Error(err, "failed to delete the snapshot") errChan <- err } @@ -316,7 +315,7 @@ func (r *Runner) translateFromSubscription(sub <-chan watchable.Snapshot[string, } } - _, translateSpan := tracer.Start(parentCtx, "Translator.Translate") + _, translateSpan := tracer.Start(traceCtx, "Translator.Translate") result, err := t.Translate(val.XdsIR) translateSpan.End() if err != nil { @@ -339,7 +338,7 @@ func (r *Runner) translateFromSubscription(sub <-chan watchable.Snapshot[string, errChan <- err } else { // Update snapshot cache - if err := r.cache.GenerateNewSnapshot(key, result.XdsResources, parentCtx); err != nil { + if err := r.cache.GenerateNewSnapshot(key, result.XdsResources, traceCtx); err != nil { r.Logger.Error(err, "failed to generate a snapshot") errChan <- err } From be8f839b62723de95165aefb5df1397720d8a9a4 Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Thu, 27 Nov 2025 11:13:11 +0545 Subject: [PATCH 34/35] nil check Signed-off-by: Shreemaan Abhishek --- internal/gatewayapi/resource/resource.go | 6 +++++- internal/globalratelimit/runner/runner.go | 2 +- internal/message/types.go | 6 +++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/internal/gatewayapi/resource/resource.go b/internal/gatewayapi/resource/resource.go index 9b0d3600e97..d66db3c5bfd 100644 --- a/internal/gatewayapi/resource/resource.go +++ b/internal/gatewayapi/resource/resource.go @@ -223,8 +223,12 @@ func (c *ControllerResourcesContext) DeepCopy() *ControllerResourcesContext { if c == nil { return nil } + var resourcesCopy *ControllerResources + if c.Resources != nil { + resourcesCopy = c.Resources.DeepCopy() + } return &ControllerResourcesContext{ - Resources: c.Resources.DeepCopy(), + Resources: resourcesCopy, Context: c.Context, } } diff --git a/internal/globalratelimit/runner/runner.go b/internal/globalratelimit/runner/runner.go index 8793bd13d15..27d8119cfe7 100644 --- a/internal/globalratelimit/runner/runner.go +++ b/internal/globalratelimit/runner/runner.go @@ -142,7 +142,7 @@ func (r *Runner) translateFromSubscription(ctx context.Context, c <-chan watchab message.HandleSubscription(message.Metadata{Runner: r.Name(), Message: message.XDSIRMessageName}, c, func(update message.Update[string, *message.XdsIRWithContext], errChan chan error) { - parentCtx := context.Background() + parentCtx := ctx if update.Value != nil && update.Value.Context != nil { parentCtx = update.Value.Context } diff --git a/internal/message/types.go b/internal/message/types.go index df32350a663..1053c09183a 100644 --- a/internal/message/types.go +++ b/internal/message/types.go @@ -145,8 +145,12 @@ func (x *XdsIRWithContext) DeepCopy() *XdsIRWithContext { if x == nil { return nil } + var xdsIRCopy *ir.Xds + if x.XdsIR != nil { + xdsIRCopy = x.XdsIR.DeepCopy() + } return &XdsIRWithContext{ - XdsIR: x.XdsIR.DeepCopy(), + XdsIR: xdsIRCopy, Context: x.Context, } } From 0c545cc81e7742c119e008bfd0fb90e4d62ea12a Mon Sep 17 00:00:00 2001 From: Shreemaan Abhishek Date: Wed, 3 Dec 2025 20:41:47 +0545 Subject: [PATCH 35/35] flags Signed-off-by: Shreemaan Abhishek --- internal/logging/log_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/internal/logging/log_test.go b/internal/logging/log_test.go index cb133eb53d3..0576d133aa3 100644 --- a/internal/logging/log_test.go +++ b/internal/logging/log_test.go @@ -126,5 +126,4 @@ func TestLoggerWithTrace(t *testing.T) { output := buffer.String() assert.Contains(t, output, traceID.String()) assert.Contains(t, output, spanID.String()) - assert.Contains(t, output, trace.FlagsSampled.String()) }