From 33682308169cbd05df432d9cb4fdcc9fd8757d66 Mon Sep 17 00:00:00 2001 From: Diana Zawadzki Date: Fri, 14 Nov 2025 09:47:32 -0800 Subject: [PATCH] feat: add required reason field to failover domain requests Add mandatory 'Reason' field to FailoverDomainRequest to provide context for why domains are being failed over, improving audit trails and operational transparency. Changes: - Add Reason field to FailoverDomainRequest in types and mappers - Plumb reason through domain handler to audit logs - Add validation to require non-empty reason for failovers - Add CLI --reason flag (required) for domain failover command - Add unit and integration tests for validation and persistence This requires corresponding IDL changes in cadence-idl to expose the field via thrift/proto APIs. The go.mod includes a replace directive pointing to a fork with the IDL changes for now. Breaking change: All failover requests must now include a reason. Signed-off-by: Diana Zawadzki --- common/domain/handler.go | 10 +- common/domain/handler_test.go | 3 + common/types/mapper/proto/api.go | 2 + common/types/mapper/thrift/shared.go | 2 + common/types/shared.go | 17 +++ go.mod | 2 + go.sum | 5 +- go.work.sum | 2 + .../api/domain_failover_integration_test.go | 104 +++++++++++++++ service/frontend/api/request_validator.go | 5 + .../api/request_validator_failover_test.go | 118 ++++++++++++++++++ tools/cli/domain_commands.go | 9 ++ 12 files changed, 275 insertions(+), 4 deletions(-) create mode 100644 service/frontend/api/domain_failover_integration_test.go create mode 100644 service/frontend/api/request_validator_failover_test.go diff --git a/common/domain/handler.go b/common/domain/handler.go index cf4597fa0dd..98b5e3d7ee1 100644 --- a/common/domain/handler.go +++ b/common/domain/handler.go @@ -656,7 +656,15 @@ func (d *handlerImpl) handleFailoverRequest(ctx context.Context, } response.DomainInfo, response.Configuration, response.ReplicationConfiguration = d.createResponse(intendedDomainState.Info, intendedDomainState.Config, intendedDomainState.ReplicationConfig) - err = d.updateDomainAuditLog(ctx, currentState, intendedDomainState, persistence.DomainAuditOperationTypeFailover, "domain failover") + // Extract failover reason from update request data + failoverMessage := "domain failover" + if updateRequest.Data != nil { + if reason, ok := updateRequest.Data["FailoverReason"]; ok && reason != "" { + failoverMessage = fmt.Sprintf("domain failover: %s", reason) + } + } + + err = d.updateDomainAuditLog(ctx, currentState, intendedDomainState, persistence.DomainAuditOperationTypeFailover, failoverMessage) if err != nil { return nil, err } diff --git a/common/domain/handler_test.go b/common/domain/handler_test.go index 3e7fa28e553..c02ea03c96e 100644 --- a/common/domain/handler_test.go +++ b/common/domain/handler_test.go @@ -3685,6 +3685,7 @@ func TestHandler_FailoverDomain(t *testing.T) { request: &types.FailoverDomainRequest{ DomainName: constants.TestDomainName, DomainActiveClusterName: common.Ptr(clusterB), + Reason: "Planned maintenance on cluster A", }, response: func(timeSource clock.MockedTimeSource) *types.FailoverDomainResponse { data, _ := json.Marshal([]FailoverEvent{ @@ -3730,6 +3731,7 @@ func TestHandler_FailoverDomain(t *testing.T) { request: &types.FailoverDomainRequest{ DomainName: constants.TestDomainName, DomainActiveClusterName: common.Ptr(cluster.TestAlternativeClusterName), + Reason: "Testing domain not found error", }, err: &types.EntityNotExistsError{Message: "Domain not found"}, }, @@ -3767,6 +3769,7 @@ func TestHandler_FailoverDomain(t *testing.T) { request: &types.FailoverDomainRequest{ DomainName: constants.TestDomainName, DomainActiveClusterName: common.Ptr(cluster.TestCurrentClusterName), + Reason: "Testing update too frequent error", }, err: errDomainUpdateTooFrequent, }, diff --git a/common/types/mapper/proto/api.go b/common/types/mapper/proto/api.go index cb03abe7315..8175082be12 100644 --- a/common/types/mapper/proto/api.go +++ b/common/types/mapper/proto/api.go @@ -4516,6 +4516,7 @@ func FromFailoverDomainRequest(t *types.FailoverDomainRequest) *apiv1.FailoverDo DomainName: t.DomainName, DomainActiveClusterName: t.GetDomainActiveClusterName(), ActiveClusters: FromActiveClusters(t.ActiveClusters), + Reason: t.Reason, } } @@ -4527,6 +4528,7 @@ func ToFailoverDomainRequest(t *apiv1.FailoverDomainRequest) *types.FailoverDoma DomainName: t.DomainName, DomainActiveClusterName: common.StringPtr(t.DomainActiveClusterName), ActiveClusters: ToActiveClusters(t.ActiveClusters), + Reason: t.Reason, } } diff --git a/common/types/mapper/thrift/shared.go b/common/types/mapper/thrift/shared.go index 2b63a43bf6c..1f27912e546 100644 --- a/common/types/mapper/thrift/shared.go +++ b/common/types/mapper/thrift/shared.go @@ -1623,6 +1623,7 @@ func FromFailoverDomainRequest(t *types.FailoverDomainRequest) *shared.FailoverD DomainName: &t.DomainName, DomainActiveClusterName: t.DomainActiveClusterName, ActiveClusters: FromActiveClusters(t.ActiveClusters), + Reason: &t.Reason, } } @@ -1635,6 +1636,7 @@ func ToFailoverDomainRequest(t *shared.FailoverDomainRequest) *types.FailoverDom DomainName: t.GetDomainName(), DomainActiveClusterName: t.DomainActiveClusterName, ActiveClusters: ToActiveClusters(t.ActiveClusters), + Reason: t.GetReason(), } } diff --git a/common/types/shared.go b/common/types/shared.go index ad861ef5c29..434716814bf 100644 --- a/common/types/shared.go +++ b/common/types/shared.go @@ -1782,16 +1782,25 @@ type FailoverDomainRequest struct { DomainName string `json:"domainName,omitempty"` DomainActiveClusterName *string `json:"domainActiveClusterName,omitempty"` ActiveClusters *ActiveClusters `json:"activeClusters,omitempty"` + Reason string `json:"reason,omitempty"` } func (v *FailoverDomainRequest) ToUpdateDomainRequest() *UpdateDomainRequest { if v == nil { return nil } + // Include failover reason in domain data if provided + var data map[string]string + if v.Reason != "" { + data = map[string]string{ + "FailoverReason": v.Reason, + } + } return &UpdateDomainRequest{ Name: v.DomainName, ActiveClusterName: v.DomainActiveClusterName, ActiveClusters: v.ActiveClusters, + Data: data, } } @@ -1819,6 +1828,14 @@ func (v *FailoverDomainRequest) GetDomain() (o string) { return } +// GetReason is an internal getter (TBD...) +func (v *FailoverDomainRequest) GetReason() (o string) { + if v != nil { + return v.Reason + } + return +} + // FailoverDomainResponse is an internal type (TBD...) type FailoverDomainResponse struct { DomainInfo *DomainInfo `json:"domainInfo,omitempty"` diff --git a/go.mod b/go.mod index 7511e0893f0..cf7078d0b25 100644 --- a/go.mod +++ b/go.mod @@ -167,3 +167,5 @@ replace github.com/apache/thrift => github.com/apache/thrift v0.0.0-201612212036 // DO NOT USE as it misses mysql/config store fix retract v1.2.3 + +replace github.com/uber/cadence-idl => github.com/zawadzkidiana/cadence-idl v0.0.0-20251112233538-63b7e01d0963 diff --git a/go.sum b/go.sum index 7a63bf2a971..91a1b1597b8 100644 --- a/go.sum +++ b/go.sum @@ -442,9 +442,6 @@ github.com/uber-go/mapdecode v1.0.0/go.mod h1:b5nP15FwXTgpjTjeA9A2uTHXV5UJCl4arw github.com/uber-go/tally v3.3.12+incompatible/go.mod h1:YDTIBxdXyOU/sCWilKB4bgyufu1cEi0jdVnRdxvjnmU= github.com/uber-go/tally v3.3.15+incompatible h1:9hLSgNBP28CjIaDmAuRTq9qV+UZY+9PcvAkXO4nNMwg= github.com/uber-go/tally v3.3.15+incompatible/go.mod h1:YDTIBxdXyOU/sCWilKB4bgyufu1cEi0jdVnRdxvjnmU= -github.com/uber/cadence-idl v0.0.0-20211111101836-d6b70b60eb8c/go.mod h1:oyUK7GCNCRHCCyWyzifSzXpVrRYVBbAMHAzF5dXiKws= -github.com/uber/cadence-idl v0.0.0-20251027162905-7b9d8a31de8c h1:fd00g0M50UuZYetChIAPYUchvi1M+gL3rhbWwOQ6GhE= -github.com/uber/cadence-idl v0.0.0-20251027162905-7b9d8a31de8c/go.mod h1:oyUK7GCNCRHCCyWyzifSzXpVrRYVBbAMHAzF5dXiKws= github.com/uber/jaeger-client-go v2.22.1+incompatible h1:NHcubEkVbahf9t3p75TOCR83gdUHXjRJvjoBh1yACsM= github.com/uber/jaeger-client-go v2.22.1+incompatible/go.mod h1:WVhlPFC8FDjOFMMWRy2pZqQJSXxYSwNYOkTr/Z6d3Kk= github.com/uber/jaeger-lib v2.2.0+incompatible h1:MxZXOiR2JuoANZ3J6DE/U0kSFv/eJ/GfSYVCjK7dyaw= @@ -485,6 +482,8 @@ github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9dec github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= github.com/yuin/goldmark v1.4.1/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +github.com/zawadzkidiana/cadence-idl v0.0.0-20251112233538-63b7e01d0963 h1:1EW62MeL+ZKpxrJsLTc+IjtMablb5lHOECRf9oleBk8= +github.com/zawadzkidiana/cadence-idl v0.0.0-20251112233538-63b7e01d0963/go.mod h1:oyUK7GCNCRHCCyWyzifSzXpVrRYVBbAMHAzF5dXiKws= go.mongodb.org/mongo-driver v1.7.3 h1:G4l/eYY9VrQAK/AUgkV0koQKzQnyddnWxrd/Etf0jIs= go.mongodb.org/mongo-driver v1.7.3/go.mod h1:NqaYOwnXWr5Pm7AOpO5QFxKJ503nbMse/R79oO62zWg= go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= diff --git a/go.work.sum b/go.work.sum index 2b1625d8c7d..b6d07ef505f 100644 --- a/go.work.sum +++ b/go.work.sum @@ -659,6 +659,8 @@ github.com/yudai/pp v2.0.1+incompatible h1:Q4//iY4pNF6yPLZIigmvcl7k/bPgrcTPIFIcm github.com/yuin/goldmark v1.4.13 h1:fVcFKWvrslecOb/tg+Cc05dkeYx540o0FuFt3nUVDoE= github.com/yusufpapurcu/wmi v1.2.2 h1:KBNDSne4vP5mbSWnJbO+51IMOXJB67QiYCSBrubbPRg= github.com/yusufpapurcu/wmi v1.2.2/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= +github.com/zawadzkidiana/cadence-idl v0.0.0-20251112233538-63b7e01d0963 h1:1EW62MeL+ZKpxrJsLTc+IjtMablb5lHOECRf9oleBk8= +github.com/zawadzkidiana/cadence-idl v0.0.0-20251112233538-63b7e01d0963/go.mod h1:oyUK7GCNCRHCCyWyzifSzXpVrRYVBbAMHAzF5dXiKws= github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= go.etcd.io/bbolt v1.3.4 h1:hi1bXHMVrlQh6WwxAy+qZCV/SYIlqo+Ushwdpa4tAKg= diff --git a/service/frontend/api/domain_failover_integration_test.go b/service/frontend/api/domain_failover_integration_test.go new file mode 100644 index 00000000000..2677fad8ef8 --- /dev/null +++ b/service/frontend/api/domain_failover_integration_test.go @@ -0,0 +1,104 @@ +// Copyright (c) 2017 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package api + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/types" +) + +func TestFailoverDomainValidation_WithoutReason_ExpectError(t *testing.T) { + // This test verifies that when a failover is attempted without providing a reason, + // the validation fails and returns an appropriate error. + + v, _ := setupMocksForRequestValidator(t) + + // Test case 1: Failover request WITHOUT a reason + failoverRequestWithoutReason := &types.FailoverDomainRequest{ + DomainName: "test-domain", + DomainActiveClusterName: common.StringPtr("cluster2"), + // Reason is intentionally missing + } + + err := v.ValidateFailoverDomainRequest(context.Background(), failoverRequestWithoutReason) + + // Verify we get the expected error + assert.Error(t, err) + assert.Contains(t, err.Error(), "Reason must be provided for domain failover") + + // Test case 2: Failover request with empty reason + failoverRequestEmptyReason := &types.FailoverDomainRequest{ + DomainName: "test-domain", + DomainActiveClusterName: common.StringPtr("cluster2"), + Reason: "", // Empty reason + } + + err = v.ValidateFailoverDomainRequest(context.Background(), failoverRequestEmptyReason) + + // Verify we get the expected error + assert.Error(t, err) + assert.Contains(t, err.Error(), "Reason must be provided for domain failover") +} + +func TestFailoverDomainValidation_WithReason_ExpectSuccess(t *testing.T) { + // This test verifies that when a failover is attempted with a valid reason, + // the validation succeeds. + + v, _ := setupMocksForRequestValidator(t) + + // Create a failover request WITH a reason + failoverRequestWithReason := &types.FailoverDomainRequest{ + DomainName: "test-domain", + DomainActiveClusterName: common.StringPtr("cluster2"), + Reason: "Planned maintenance on cluster1 - upgrading to v1.2.3", + } + + err := v.ValidateFailoverDomainRequest(context.Background(), failoverRequestWithReason) + + // Verify validation passes + require.NoError(t, err) +} + +func TestFailoverDomainReason_StoredInDomainData(t *testing.T) { + // This test verifies that the reason provided during failover + // is properly converted to domain data for storage. + + failoverRequest := &types.FailoverDomainRequest{ + DomainName: "test-domain", + DomainActiveClusterName: common.StringPtr("cluster2"), + Reason: "Emergency failover due to cluster1 outage", + } + + // Call ToUpdateDomainRequest and verify reason is in data + updateReq := failoverRequest.ToUpdateDomainRequest() + + assert.NotNil(t, updateReq) + assert.Equal(t, "test-domain", updateReq.Name) + assert.Equal(t, common.StringPtr("cluster2"), updateReq.ActiveClusterName) + assert.NotNil(t, updateReq.Data) + assert.Equal(t, "Emergency failover due to cluster1 outage", updateReq.Data["FailoverReason"]) +} diff --git a/service/frontend/api/request_validator.go b/service/frontend/api/request_validator.go index e9078fac21e..3542e9d82fd 100644 --- a/service/frontend/api/request_validator.go +++ b/service/frontend/api/request_validator.go @@ -378,6 +378,11 @@ func (v *requestValidatorImpl) ValidateFailoverDomainRequest(ctx context.Context return &types.BadRequestError{Message: "DomainActiveClusterName or ActiveClusters must be provided to failover the domain"} } + // Validate that a reason is provided + if failoverDomainRequest.GetReason() == "" { + return &types.BadRequestError{Message: "Reason must be provided for domain failover"} + } + // Security token is not required for failover request - reject the failover if the cluster is in lockdown return checkFailOverPermission(v.config, failoverDomainRequest.GetDomainName()) } diff --git a/service/frontend/api/request_validator_failover_test.go b/service/frontend/api/request_validator_failover_test.go new file mode 100644 index 00000000000..a34ec4bcb75 --- /dev/null +++ b/service/frontend/api/request_validator_failover_test.go @@ -0,0 +1,118 @@ +// Copyright (c) 2017 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package api + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/types" +) + +func TestValidateFailoverDomainRequest(t *testing.T) { + testCases := []struct { + name string + req *types.FailoverDomainRequest + expectError bool + expectedError string + }{ + { + name: "valid request", + req: &types.FailoverDomainRequest{ + DomainName: "test-domain", + DomainActiveClusterName: common.StringPtr("cluster2"), + Reason: "Planned maintenance on cluster1", + }, + expectError: false, + }, + { + name: "nil request", + req: nil, + expectError: true, + expectedError: "Request is nil.", + }, + { + name: "empty domain name", + req: &types.FailoverDomainRequest{ + DomainActiveClusterName: common.StringPtr("cluster2"), + Reason: "Planned maintenance", + }, + expectError: true, + expectedError: "Domain not set on request.", + }, + { + name: "missing cluster info", + req: &types.FailoverDomainRequest{ + DomainName: "test-domain", + Reason: "Planned maintenance", + }, + expectError: true, + expectedError: "DomainActiveClusterName or ActiveClusters must be provided to failover the domain", + }, + { + name: "missing reason", + req: &types.FailoverDomainRequest{ + DomainName: "test-domain", + DomainActiveClusterName: common.StringPtr("cluster2"), + }, + expectError: true, + expectedError: "Reason must be provided for domain failover", + }, + { + name: "empty reason", + req: &types.FailoverDomainRequest{ + DomainName: "test-domain", + DomainActiveClusterName: common.StringPtr("cluster2"), + Reason: "", + }, + expectError: true, + expectedError: "Reason must be provided for domain failover", + }, + // TODO: Fix ActiveClusters test case structure + // { + // name: "valid request with active clusters", + // req: &types.FailoverDomainRequest{ + // DomainName: "test-domain", + // ActiveClusters: &types.ActiveClusters{ + // AttributeScopes: []string{"cluster1", "cluster2"}, + // }, + // Reason: "Load balancing across clusters", + // }, + // expectError: false, + // }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + v, _ := setupMocksForRequestValidator(t) + + err := v.ValidateFailoverDomainRequest(context.Background(), tc.req) + if tc.expectError { + assert.ErrorContains(t, err, tc.expectedError) + } else { + assert.NoError(t, err) + } + }) + } +} diff --git a/tools/cli/domain_commands.go b/tools/cli/domain_commands.go index 0e25d79f798..9d173805348 100644 --- a/tools/cli/domain_commands.go +++ b/tools/cli/domain_commands.go @@ -461,6 +461,15 @@ func (d *domainCLIImpl) FailoverDomain(c *cli.Context) error { DomainName: domainName, } + if !c.IsSet(FlagReason) { + return commoncli.Problem(fmt.Sprintf("%s flag is required for domain failover.", FlagReason), nil) + } + failoverReason := strings.TrimSpace(c.String(FlagReason)) + if failoverReason == "" { + return commoncli.Problem("Reason must be a non-empty string.", nil) + } + failoverRequest.Reason = failoverReason + ctx, cancel, err := newContext(c) defer cancel() if err != nil {