
Commit 39e9d91

Merge branch 'master' into multiple-streaming-subscriptions-2
2 parents 7c75ef7 + 91391fe commit 39e9d91

13 files changed: +144 -112 lines changed


docs/release_notes/v1.15.4.md

Lines changed: 104 additions & 2 deletions
@@ -3,7 +3,12 @@
 This update includes bug fixes:

 - [Fix degradation of Workflow runtime performance over time](#fix-degradation-of-workflow-runtime-performance-over-time)
+- [Fix remote Actor invocation 500 retry](#fix-remote-actor-invocation-500-retry)
+- [Fix Global Actors Enabled Configuration](#fix-global-actors-enabled-configuration)
+- [Prevent panic of reminder operations on slow Actor Startup](#prevent-panic-of-reminder-operations-on-slow-actor-startup)
+- [Remove client-side rate limiter from Sentry](#remove-client-side-rate-limiter-from-sentry)
 - [Allow Service Account for MetalBear mirrord operator in sidecar injector](#allow-service-account-for-metalbear-mirrord-operator-in-sidecar-injector)
+- [Fix Scheduler Client connection pruning](#fix-scheduler-client-connection-pruning)

 ## Fix degradation of Workflow runtime performance over time

@@ -25,6 +30,85 @@ This caused Jobs to fail, and enter failure policy retry loops.

 Refactor the Scheduler connection pool logic to properly prune stale connections to prevent job execution occurring on stale connections and causing failure policy loops.

+## Fix remote Actor invocation 500 retry
+
+### Problem
+
+An actor invocation across hosts that resulted in a 500 HTTP response code caused the request to be retried 5 times.
+
+### Impact
+
+Services returning a 500 HTTP response code would see requests return slowly under normal operation, and the service would be invoked multiple times for the same request.
+
+### Root cause
+
+The Actor engine considered a 500 HTTP response code to be a retriable error, rather than a successful request that returned a non-200 status code.
+
+### Solution
+
+Remove the 500 HTTP response code from the list of retriable errors.
+
+## Fix Global Actors Enabled Configuration
+
+### Problem
+
+When `global.actors.enabled` was set to `false` via Helm or the environment variable `ACTORS_ENABLED=false`, the Dapr sidecar would still attempt to connect to the placement service, causing readiness probe failures and repeatedly logged errors about failing to connect to placement.
+Fixes this [issue](https://github.com/dapr/dapr/issues/8551).
+
+### Impact
+
+Dapr sidecars would fail their readiness probes and log errors like:
+```
+Failed to connect to placement dns:///dapr-placement-server.dapr-system.svc.cluster.local:50005: failed to create placement client: rpc error: code = Unavailable desc = last resolver error: produced zero addresses
+```
+
+### Root cause
+
+The sidecar injector was not respecting the global actors enabled configuration when setting up the placement service connection.
+
+### Solution
+
+The sidecar injector now respects the `global.actors.enabled` Helm configuration and the `ACTORS_ENABLED` environment variable. When set to `false`, it will not attempt to connect to the placement service, allowing the sidecar to start successfully without actor functionality.
+
+
+## Prevent panic of reminder operations on slow Actor Startup
+
+### Problem
+
+The Dapr runtime HTTP server would panic if a reminder operation timed out while an Actor was starting up.
+
+### Impact
+
+The HTTP server would panic, causing degraded performance.
+
+### Root cause
+
+The Dapr runtime would attempt to use the reminder service before it was initialized.
+
+### Solution
+
+Correctly return an error indicating that the actor runtime was not ready in time for the reminder operation.
+
+## Remove client-side rate limiter from Sentry
+
+### Problem
+
+A cold start of many Dapr deployments would take a long time, and could even cause crash loops.
+
+### Impact
+
+A large Dapr deployment would take disproportionately (non-linearly) longer than a smaller one to completely roll out.
+
+### Root cause
+
+The Sentry Kubernetes client was configured with a rate limiter which would be exhausted when servicing many new Dapr deployments at once, causing many clients to wait significantly.
+
+### Solution
+
+Remove the client-side rate limiting from the Sentry Kubernetes client.
+
 ## Allow Service Account for MetalBear mirrord operator in sidecar injector

 ### Problem
@@ -33,12 +117,30 @@ Mirrord Operator is not on the allow list of Service Accounts for the dapr sidec

 ### Impact

-Running mirrord in `copy_target` mode would cause the pod to initalise with without the dapr container.
+Running mirrord in `copy_target` mode would cause the pod to initialise without the dapr container.

 ### Root cause

 Mirrord Operator is not on the allow list of Service Accounts for the dapr sidecar injector.

 ### Solution

-Add the Mirrord Operator into the allow list of Service Accounts for the dapr sidecar injector.
+Add the Mirrord Operator into the allow list of Service Accounts for the dapr sidecar injector.
+
+## Fix Scheduler Client connection pruning
+
+### Problem
+
+Daprd would attempt to connect to stale Scheduler addresses.
+
+### Impact
+
+Increased network resource usage and error reporting from service mesh sidecars.
+
+### Root cause
+
+Daprd would not close Scheduler gRPC connections to hosts which no longer exist.
+
+### Solution
+
+Daprd now closes connections to Scheduler hosts when they are no longer in the list of active hosts.
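
The "Fix remote Actor invocation 500 retry" note above amounts to treating a 500 as the app's answer rather than a transport failure. A minimal Go sketch of that idea, using a hypothetical `shouldRetry` helper; this is illustrative only and is not Dapr's actual invocation code.

```go
package main

import (
	"errors"
	"fmt"
	"net"
	"net/http"
)

// shouldRetry retries only transport-level failures such as timeouts.
// If the remote app produced any response, even a 500, that response is
// surfaced to the caller instead of being retried.
func shouldRetry(resp *http.Response, err error) bool {
	if resp != nil {
		return false // a 500 is a completed request with a non-200 status
	}
	var netErr net.Error
	return errors.As(err, &netErr) && netErr.Timeout()
}

func main() {
	fmt.Println(shouldRetry(&http.Response{StatusCode: http.StatusInternalServerError}, nil)) // false
	fmt.Println(shouldRetry(nil, &net.DNSError{IsTimeout: true}))                             // true
}
```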

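For the "Remove client-side rate limiter from Sentry" note, the general technique is to stop the Kubernetes client from queueing its own requests. A hedged client-go sketch of that idea, assuming a kubeconfig-based setup; Dapr's real Sentry wiring is not part of this commit.

```go
package main

import (
	"fmt"

	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Build a config from the local kubeconfig (illustrative; an in-cluster
	// controller would normally use rest.InClusterConfig()).
	cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}

	// A negative QPS disables client-go's default token-bucket rate limiter,
	// so a burst of requests is sent immediately instead of being queued on
	// the client side.
	cfg.QPS = -1

	client, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		panic(err)
	}
	fmt.Println("client ready:", client != nil)
}
```
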
go.mod

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 module github.com/dapr/dapr

-go 1.24.1
+go 1.24.2

 require (
 	contrib.go.opencensus.io/exporter/prometheus v0.4.2
@@ -252,7 +252,7 @@ require (
 	github.com/godbus/dbus v0.0.0-20190726142602-4481cbc300e2 // indirect
 	github.com/gofrs/uuid v4.4.0+incompatible // indirect
 	github.com/gogo/protobuf v1.3.2 // indirect
-	github.com/golang-jwt/jwt/v4 v4.5.1 // indirect
+	github.com/golang-jwt/jwt/v4 v4.5.2 // indirect
 	github.com/golang-jwt/jwt/v5 v5.2.2 // indirect
 	github.com/golang-sql/civil v0.0.0-20220223132316-b832511892a9 // indirect
 	github.com/golang-sql/sqlexp v0.1.0 // indirect

go.sum

Lines changed: 2 additions & 2 deletions
@@ -738,8 +738,8 @@ github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69
 github.com/goji/httpauth v0.0.0-20160601135302-2da839ab0f4d/go.mod h1:nnjvkQ9ptGaCkuDUx6wNykzzlUixGxvkme+H/lnzb+A=
 github.com/golang-jwt/jwt v3.2.1+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I=
 github.com/golang-jwt/jwt v3.2.2+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I=
-github.com/golang-jwt/jwt/v4 v4.5.1 h1:JdqV9zKUdtaa9gdPlywC3aeoEsR681PlKC+4F5gQgeo=
-github.com/golang-jwt/jwt/v4 v4.5.1/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0=
+github.com/golang-jwt/jwt/v4 v4.5.2 h1:YtQM7lnr8iZ+j5q71MGKkNw9Mn7AjHM68uc9g5fXeUI=
+github.com/golang-jwt/jwt/v4 v4.5.2/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0=
 github.com/golang-jwt/jwt/v5 v5.2.2 h1:Rl4B7itRWVtYIHFrSNd7vhTiz9UpLdi6gZhZ3wEeDy8=
 github.com/golang-jwt/jwt/v5 v5.2.2/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk=
 github.com/golang-sql/civil v0.0.0-20220223132316-b832511892a9 h1:au07oEsX2xN0ktxqI+Sida1w446QrXBRJ0nee3SNZlA=

pkg/actors/actors.go

Lines changed: 9 additions & 7 deletions
@@ -193,12 +193,10 @@ func (a *actors) Init(opts InitOptions) error {

 	storeEnabled := a.buildStateStore(opts, apiLevel)

-	if a.reminderStore != nil {
-		a.reminders = reminders.New(reminders.Options{
-			Storage: a.reminderStore,
-			Table:   a.table,
-		})
-	}
+	a.reminders = reminders.New(reminders.Options{
+		Storage: a.reminderStore,
+		Table:   a.table,
+	})

 	var err error
 	a.placement, err = placement.New(placement.Options{
@@ -357,6 +355,10 @@ func (a *actors) Reminders(ctx context.Context) (reminders.Interface, error) {
 		return nil, err
 	}

+	if a.reminders == nil {
+		return nil, messages.ErrActorRuntimeNotFound
+	}
+
 	return a.reminders, nil
 }

@@ -374,7 +376,7 @@ func (a *actors) waitForReady(ctx context.Context) error {
 		}
 		return nil
 	case <-ctx.Done():
-		return ctx.Err()
+		return messages.ErrActorRuntimeNotFound
 	}
 }

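The diff above returns `messages.ErrActorRuntimeNotFound` instead of `ctx.Err()` and nil-checks the reminders subsystem. A standalone sketch of that guard pattern with made-up names (`errNotReady`, `readyCh`); it is not the actual dapr runtime code.

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

var errNotReady = errors.New("actor runtime was not ready in time")

type runtime struct {
	readyCh   chan struct{}
	reminders any // nil until the reminder subsystem is initialised
}

// Reminders waits for readiness and then nil-checks the subsystem, so a slow
// startup yields a descriptive error instead of a nil-pointer panic deeper in
// the HTTP handler.
func (r *runtime) Reminders(ctx context.Context) (any, error) {
	select {
	case <-r.readyCh:
	case <-ctx.Done():
		return nil, errNotReady // not ctx.Err(): callers get an actionable message
	}
	if r.reminders == nil {
		return nil, errNotReady
	}
	return r.reminders, nil
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond)
	defer cancel()

	r := &runtime{readyCh: make(chan struct{})} // never becomes ready
	_, err := r.Reminders(ctx)
	fmt.Println(err)
}
```
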
pkg/apphealth/health.go

Lines changed: 0 additions & 31 deletions
@@ -155,11 +155,6 @@ func (h *AppHealth) ReportHealth(status *Status) {
 		return
 	}

-	// Limit health reports to 1 per second
-	if !h.ratelimitReports() {
-		return
-	}
-
 	// Channel is buffered, so make sure that this doesn't block
 	// Just in case another report is being worked on!
 	select {
@@ -205,32 +200,6 @@ func (h *AppHealth) doProbe(parentCtx context.Context) {
 	}
 }

-// Returns true if the health report can be saved. Only 1 report per second at most is allowed.
-func (h *AppHealth) ratelimitReports() bool {
-	var (
-		swapped  bool
-		attempts uint8
-	)
-
-	now := h.clock.Now().UnixMicro()
-
-	// Attempts at most 2 times before giving up, as the report may be stale at that point
-	for !swapped && attempts < 2 {
-		attempts++
-
-		// If the last report was less than `reportMinInterval` ago, nothing to do here
-		prev := h.lastReport.Load()
-		if prev > now-reportMinInterval.Microseconds() {
-			return false
-		}
-
-		swapped = h.lastReport.CompareAndSwap(prev, now)
-	}
-
-	// If we couldn't do the swap after 2 attempts, just return false
-	return swapped
-}
-
 func (h *AppHealth) setResult(ctx context.Context, status *Status) {
 	h.lastReport.Store(h.clock.Now().UnixMicro())

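With the per-second limiter removed, `ReportHealth` relies on the buffered-channel, non-blocking-send pattern visible in the remaining context lines above. A self-contained sketch of that pattern, with illustrative names only.

```go
package main

import "fmt"

func main() {
	reports := make(chan string, 1) // buffered so callers never block

	report := func(status string) {
		select {
		case reports <- status:
		default:
			// a previous report is still pending; drop this one rather than block
		}
	}

	report("healthy")
	report("unhealthy") // dropped: the buffer still holds the first report
	fmt.Println(<-reports, "| pending:", len(reports))
}
```
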
pkg/apphealth/health_test.go

Lines changed: 0 additions & 50 deletions
@@ -121,56 +121,6 @@ func TestAppHealth_setResult(t *testing.T) {
 	assert.Equal(t, threshold+3, h.failureCount.Load())
 }

-func TestAppHealth_ratelimitReports(t *testing.T) {
-	clock := clocktesting.NewFakeClock(time.Now())
-	h := New(config.AppHealthConfig{}, nil)
-	h.clock = clock
-
-	// First run should always succeed
-	require.True(t, h.ratelimitReports())
-
-	// Run again without waiting
-	require.False(t, h.ratelimitReports())
-	require.False(t, h.ratelimitReports())
-
-	// Step and test
-	clock.Step(reportMinInterval)
-	require.True(t, h.ratelimitReports())
-	require.False(t, h.ratelimitReports())
-
-	// Run tests for 1 second, constantly
-	// Should succeed only 10 times.
-	clock.Step(reportMinInterval)
-	firehose := func(start time.Time, step time.Duration) (passed int64) {
-		for clock.Now().Sub(start) < time.Second*10 {
-			if h.ratelimitReports() {
-				passed++
-			}
-			clock.Step(step)
-		}
-		return passed
-	}
-
-	passed := firehose(clock.Now(), 10*time.Millisecond)
-	assert.Equal(t, int64(10), passed)
-
-	// Repeat, but run with 3 parallel goroutines
-	wg := sync.WaitGroup{}
-	totalPassed := atomic.Int64{}
-	start := clock.Now()
-	wg.Add(3)
-	for range 3 {
-		go func() {
-			totalPassed.Add(firehose(start, 3*time.Millisecond))
-			wg.Done()
-		}()
-	}
-	wg.Wait()
-	passed = totalPassed.Load()
-	assert.GreaterOrEqual(t, passed, int64(8))
-	assert.LessOrEqual(t, passed, int64(12))
-}
-
 func Test_StartProbes(t *testing.T) {
 	t.Run("closing context should return", func(t *testing.T) {
 		ctx, cancel := context.WithCancel(t.Context())

pkg/apphealth/status.go

Lines changed: 0 additions & 5 deletions
@@ -16,11 +16,6 @@ package apphealth

 import "time"

-const (
-	// reportMinInterval is the minimum interval between health reports.
-	reportMinInterval = time.Second
-)
-
 type Status struct {
 	IsHealthy bool  `json:"ishealthy"`
 	TimeUnix  int64 `json:"timeUnix"`

pkg/injector/service/config_test.go

Lines changed: 17 additions & 6 deletions
@@ -26,12 +26,27 @@ import (
 )

 func TestGetInjectorConfig(t *testing.T) {
+	t.Setenv("NAMESPACE", "test-namespace")
+	t.Setenv("SIDECAR_IMAGE", "daprd-test-image")
+
+	t.Run("respect globally disabling placement", func(t *testing.T) {
+		t.Setenv("ACTORS_ENABLED", "false")
+		cfg, err := GetConfig()
+		require.NoError(t, err)
+		assert.False(t, cfg.parsedActorsEnabled)
+		assert.Equal(t, "false", cfg.ActorsEnabled)
+	})
+	t.Run("default placement is enabled", func(t *testing.T) {
+		cfg, err := GetConfig()
+		require.NoError(t, err)
+		assert.Empty(t, cfg.ActorsEnabled)
+		assert.True(t, cfg.parsedActorsEnabled)
+	})
+
 	t.Run("with kube cluster domain env", func(t *testing.T) {
 		t.Setenv("TLS_CERT_FILE", "test-cert-file")
 		t.Setenv("TLS_KEY_FILE", "test-key-file")
-		t.Setenv("SIDECAR_IMAGE", "daprd-test-image")
 		t.Setenv("SIDECAR_IMAGE_PULL_POLICY", "Always")
-		t.Setenv("NAMESPACE", "test-namespace")
 		t.Setenv("KUBE_CLUSTER_DOMAIN", "cluster.local")
 		t.Setenv("ALLOWED_SERVICE_ACCOUNTS", "test1:test-service-account1,test2:test-service-account2")
 		t.Setenv("ALLOWED_SERVICE_ACCOUNTS_PREFIX_NAMES", "namespace:test-service-account1,namespace2*:test-service-account2")
@@ -49,9 +64,7 @@ func TestGetInjectorConfig(t *testing.T) {
 	t.Run("not set kube cluster domain env", func(t *testing.T) {
 		t.Setenv("TLS_CERT_FILE", "test-cert-file")
 		t.Setenv("TLS_KEY_FILE", "test-key-file")
-		t.Setenv("SIDECAR_IMAGE", "daprd-test-image")
 		t.Setenv("SIDECAR_IMAGE_PULL_POLICY", "IfNotPresent")
-		t.Setenv("NAMESPACE", "test-namespace")
 		t.Setenv("KUBE_CLUSTER_DOMAIN", "")

 		cfg, err := GetConfig()
@@ -65,8 +78,6 @@ func TestGetInjectorConfig(t *testing.T) {
 	t.Run("sidecar run options not set", func(t *testing.T) {
 		t.Setenv("TLS_CERT_FILE", "test-cert-file")
 		t.Setenv("TLS_KEY_FILE", "test-key-file")
-		t.Setenv("SIDECAR_IMAGE", "daprd-test-image")
-		t.Setenv("NAMESPACE", "test-namespace")

 		// Default values are true
 		t.Setenv("SIDECAR_RUN_AS_NON_ROOT", "")
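
The new test cases expect actors to stay enabled when `ACTORS_ENABLED` is unset and to be disabled when it is `"false"`. A hypothetical sketch of boolean-environment parsing that would satisfy those expectations; the real `GetConfig` implementation is not shown in this commit.

```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

// parseActorsEnabled mirrors the behaviour the tests above expect: an unset
// variable keeps actors enabled, and any value strconv.ParseBool accepts is
// honoured.
func parseActorsEnabled() bool {
	raw := os.Getenv("ACTORS_ENABLED")
	if raw == "" {
		return true // default: actors stay enabled when the variable is unset
	}
	enabled, err := strconv.ParseBool(raw)
	if err != nil {
		return true // unparseable values fall back to the default
	}
	return enabled
}

func main() {
	fmt.Println(parseActorsEnabled()) // true (unset)
	os.Setenv("ACTORS_ENABLED", "false")
	fmt.Println(parseActorsEnabled()) // false
}
```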

pkg/injector/service/pod_patch.go

Lines changed: 5 additions & 2 deletions
@@ -80,10 +80,13 @@ func (i *injector) getPodPatchOperations(ctx context.Context, ar *admissionv1.Ad
 	sidecar.CurrentTrustAnchors = trustAnchors
 	sidecar.DisableTokenVolume = !token.HasKubernetesToken()

-	// Set addresses for actor services
+	// Set addresses for actor services only if it's not explicitly globally disabled
 	// Even if actors are disabled, however, the placement-host-address flag will still be included if explicitly set in the annotation dapr.io/placement-host-address
 	// So, if the annotation is already set, we accept that and also use placement for actors services
-	if sidecar.PlacementAddress == "" {
+	if !i.config.GetActorsEnabled() {
+		sidecar.ActorsService = ""
+		sidecar.PlacementAddress = ""
+	} else if sidecar.PlacementAddress == "" {
 		// Set configuration for the actors service
 		actorsSvcName, actorsSvc := i.config.GetActorsService()
 		actorsSvcAddr := actorsSvc.Address(i.config.Namespace, i.config.KubeClusterDomain)

tests/apps/resiliencyapp/go.mod

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 module github.com/dapr/dapr/tests/apps/resiliencyapp

-go 1.24.1
+go 1.24.2

 require (
 	github.com/dapr/dapr v0.0.0
