Skip to content

Commit 01d916c

Browse files
feat: scaffolding domain audit manager (#7422)
<!-- Describe what has changed in this PR --> **What changed?** This adds a domain-audit functionality and implements the subset of events on it for failovers in the ListFailoverHistory endpoint for reading this data. The intent of this feature is that for any significant create, write, failover or other important domain operation, to put the events into an audit log for tracking. This supercedes the 5-most-recent-failovers domain-data blob that was previously created quickly. This PR only expose that read information for failovers, however, it persists all write operations to the audit log, with the expecation that a more general-purpose audit read endpoint may be added in the future. This is a scaffolding PR and it's probably missing a little in the way of features, among them: - [ ] TTL for data cleanup - [ ] Probably some test coverage These are blockers before landing and they'll be included in the PR Testing: - [X] Manual testing - [X] Unit testing Signed-off-by: David Porter <[email protected]>
1 parent 57f0d8d commit 01d916c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+4888
-209
lines changed

common/domain/audit/audit.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package audit

common/domain/handler.go

Lines changed: 127 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ import (
2929
"regexp"
3030
"time"
3131

32-
"github.com/pborman/uuid"
32+
guuid "github.com/google/uuid"
3333

3434
"github.com/uber/cadence/common"
3535
"github.com/uber/cadence/common/archiver"
@@ -94,6 +94,7 @@ type (
9494
// handlerImpl is the domain operation handler implementation
9595
handlerImpl struct {
9696
domainManager persistence.DomainManager
97+
domainAuditManager persistence.DomainAuditManager
9798
clusterMetadata cluster.Metadata
9899
domainReplicator Replicator
99100
domainAttrValidator *AttrValidatorImpl
@@ -106,12 +107,13 @@ type (
106107

107108
// Config is the domain config for domain handler
108109
Config struct {
109-
MinRetentionDays dynamicproperties.IntPropertyFn
110-
MaxRetentionDays dynamicproperties.IntPropertyFn
111-
RequiredDomainDataKeys dynamicproperties.MapPropertyFn
112-
MaxBadBinaryCount dynamicproperties.IntPropertyFnWithDomainFilter
113-
FailoverCoolDown dynamicproperties.DurationPropertyFnWithDomainFilter
114-
FailoverHistoryMaxSize dynamicproperties.IntPropertyFnWithDomainFilter
110+
MinRetentionDays dynamicproperties.IntPropertyFn
111+
MaxRetentionDays dynamicproperties.IntPropertyFn
112+
RequiredDomainDataKeys dynamicproperties.MapPropertyFn
113+
MaxBadBinaryCount dynamicproperties.IntPropertyFnWithDomainFilter
114+
FailoverCoolDown dynamicproperties.DurationPropertyFnWithDomainFilter
115+
FailoverHistoryMaxSize dynamicproperties.IntPropertyFnWithDomainFilter
116+
EnableDomainAuditLogging dynamicproperties.BoolPropertyFn
115117
}
116118

117119
// FailoverEvent is the failover information to be stored for each failover event in domain data
@@ -137,6 +139,7 @@ func NewHandler(
137139
config Config,
138140
logger log.Logger,
139141
domainManager persistence.DomainManager,
142+
domainAuditManager persistence.DomainAuditManager,
140143
clusterMetadata cluster.Metadata,
141144
domainReplicator Replicator,
142145
archivalMetadata archiver.ArchivalMetadata,
@@ -146,6 +149,7 @@ func NewHandler(
146149
return &handlerImpl{
147150
logger: logger,
148151
domainManager: domainManager,
152+
domainAuditManager: domainAuditManager,
149153
clusterMetadata: clusterMetadata,
150154
domainReplicator: domainReplicator,
151155
domainAttrValidator: newAttrValidator(clusterMetadata, int32(config.MinRetentionDays())),
@@ -241,8 +245,13 @@ func (d *handlerImpl) RegisterDomain(
241245
}
242246
}
243247

248+
eventID, err := guuid.NewV7()
249+
if err != nil {
250+
return err
251+
}
252+
244253
info := &persistence.DomainInfo{
245-
ID: uuid.New(),
254+
ID: eventID.String(),
246255
Name: registerRequest.GetName(),
247256
Status: persistence.DomainStatusRegistered,
248257
OwnerEmail: registerRequest.GetOwnerEmail(),
@@ -330,6 +339,22 @@ func (d *handlerImpl) RegisterDomain(
330339
tag.WorkflowDomainID(domainResponse.ID),
331340
)
332341

342+
// Construct GetDomainResponse for audit log
343+
domainStateAfterCreate := &persistence.GetDomainResponse{
344+
Info: domainRequest.Info,
345+
Config: domainRequest.Config,
346+
ReplicationConfig: domainRequest.ReplicationConfig,
347+
IsGlobalDomain: domainRequest.IsGlobalDomain,
348+
ConfigVersion: domainRequest.ConfigVersion,
349+
FailoverVersion: domainRequest.FailoverVersion,
350+
LastUpdatedTime: domainRequest.LastUpdatedTime,
351+
}
352+
353+
err = d.updateDomainAuditLog(ctx, nil, domainStateAfterCreate, persistence.DomainAuditOperationTypeCreate, "domain created")
354+
if err != nil {
355+
return err
356+
}
357+
333358
return nil
334359
}
335360

@@ -631,13 +656,64 @@ func (d *handlerImpl) handleFailoverRequest(ctx context.Context,
631656
}
632657
response.DomainInfo, response.Configuration, response.ReplicationConfiguration = d.createResponse(intendedDomainState.Info, intendedDomainState.Config, intendedDomainState.ReplicationConfig)
633658

659+
err = d.updateDomainAuditLog(ctx, currentState, intendedDomainState, persistence.DomainAuditOperationTypeFailover, "domain failover")
660+
if err != nil {
661+
return nil, err
662+
}
663+
634664
d.logger.Info("faiover request succeeded",
635665
tag.WorkflowDomainName(intendedDomainState.Info.Name),
636666
tag.WorkflowDomainID(intendedDomainState.Info.ID),
637667
)
638668
return response, nil
639669
}
640670

671+
func (d *handlerImpl) updateDomainAuditLog(ctx context.Context,
672+
currentState *persistence.GetDomainResponse,
673+
intendedDomainState *persistence.GetDomainResponse,
674+
operationType persistence.DomainAuditOperationType,
675+
comment string,
676+
) error {
677+
678+
if d.domainAuditManager == nil {
679+
return nil
680+
}
681+
682+
if !d.config.EnableDomainAuditLogging() {
683+
return nil
684+
}
685+
686+
// Must be a UUID v7, since we need a time value as well
687+
eventID, err := guuid.NewV7()
688+
if err != nil {
689+
return err
690+
}
691+
// the creation time is used in the database as a partition but passed around
692+
// embedded in the eventUUID for ergonomics. This means that users wishing
693+
// to get an audit entry by ID do not need to know the creation time in advance
694+
// since it's embedded in the sorting-values of the UUID.
695+
creationTime := time.Unix(eventID.Time().UnixTime())
696+
697+
_, err = d.domainAuditManager.CreateDomainAuditLog(ctx, &persistence.CreateDomainAuditLogRequest{
698+
DomainID: intendedDomainState.GetInfo().GetID(),
699+
EventID: eventID.String(),
700+
CreatedTime: creationTime,
701+
StateBefore: currentState,
702+
StateAfter: intendedDomainState,
703+
OperationType: operationType,
704+
Comment: comment,
705+
})
706+
if err != nil {
707+
d.logger.Error("Failed to create domain audit log",
708+
tag.WorkflowDomainID(intendedDomainState.GetInfo().GetID()),
709+
tag.Error(err),
710+
)
711+
// Log the error but don't fail the operation - audit logging is best effort
712+
// to avoid breaking critical domain operations
713+
}
714+
return nil
715+
}
716+
641717
// updateGlobalDomainConfiguration handles the update of a global domain configuration
642718
// this excludes failover/active_cluster/active_clusters updates. They are grouped under
643719
// forms of failover
@@ -773,6 +849,26 @@ func (d *handlerImpl) updateGlobalDomainConfiguration(ctx context.Context,
773849
}
774850
response.DomainInfo, response.Configuration, response.ReplicationConfiguration = d.createResponse(info, config, replicationConfig)
775851

852+
// Construct GetDomainResponse for audit log with the final updated values
853+
domainStateAfterUpdate := &persistence.GetDomainResponse{
854+
Info: info,
855+
Config: config,
856+
ReplicationConfig: replicationConfig,
857+
IsGlobalDomain: isGlobalDomain,
858+
ConfigVersion: configVersion,
859+
FailoverVersion: failoverVersion,
860+
FailoverNotificationVersion: intendedDomainState.FailoverNotificationVersion,
861+
PreviousFailoverVersion: intendedDomainState.PreviousFailoverVersion,
862+
FailoverEndTime: intendedDomainState.FailoverEndTime,
863+
LastUpdatedTime: now.UnixNano(),
864+
NotificationVersion: notificationVersion,
865+
}
866+
867+
err = d.updateDomainAuditLog(ctx, currentDomainState, domainStateAfterUpdate, persistence.DomainAuditOperationTypeUpdate, "domain updated")
868+
if err != nil {
869+
return nil, err
870+
}
871+
776872
d.logger.Info("Update domain succeeded",
777873
tag.WorkflowDomainName(info.Name),
778874
tag.WorkflowDomainID(info.ID),
@@ -876,6 +972,11 @@ func (d *handlerImpl) updateLocalDomain(ctx context.Context,
876972
if err != nil {
877973
return nil, err
878974
}
975+
976+
err = d.updateDomainAuditLog(ctx, currentState, intendedDomainState, persistence.DomainAuditOperationTypeUpdate, "domain updated")
977+
if err != nil {
978+
return nil, err
979+
}
879980
}
880981
response := &types.UpdateDomainResponse{
881982
IsGlobalDomain: false,
@@ -958,6 +1059,11 @@ func (d *handlerImpl) DeleteDomain(
9581059
}
9591060
}
9601061

1062+
err = d.updateDomainAuditLog(ctx, getResponse, nil, persistence.DomainAuditOperationTypeDelete, "domain deleted")
1063+
if err != nil {
1064+
return err
1065+
}
1066+
9611067
d.logger.Info("Delete domain succeeded",
9621068
tag.WorkflowDomainName(getResponse.Info.Name),
9631069
tag.WorkflowDomainID(getResponse.Info.ID),
@@ -1030,6 +1136,14 @@ func (d *handlerImpl) DeprecateDomain(
10301136
tag.WorkflowDomainName(getResponse.Info.Name),
10311137
tag.WorkflowDomainID(getResponse.Info.ID),
10321138
)
1139+
1140+
domainStateAfterDeprecate := &persistence.GetDomainResponse{
1141+
Info: getResponse.Info,
1142+
}
1143+
err = d.updateDomainAuditLog(ctx, getResponse, domainStateAfterDeprecate, persistence.DomainAuditOperationTypeDeprecate, "domain deprecated")
1144+
if err != nil {
1145+
return err
1146+
}
10331147
return nil
10341148
}
10351149

@@ -1204,16 +1318,6 @@ func (d *handlerImpl) UpdateAsyncWorkflowConfiguraton(
12041318
}
12051319

12061320
if currentDomainConfig.IsGlobalDomain {
1207-
// One might reasonably wonder what value there is in replication of isolation-group information - info which is
1208-
// regional and therefore of no value to the other region?
1209-
// Probably not a lot, in and of itself, however, the isolation-group information is stored
1210-
// in the domain configuration fields in the domain tables. Access and updates to those records is
1211-
// done through a replicated mechanism with explicit versioning and conflict resolution.
1212-
// Therefore, in order to avoid making an already complex mechanisim much more difficult to understand,
1213-
// the data is replicated in the same way so as to try and make things less confusing when both codepaths
1214-
// are updating the table:
1215-
// - versions like the confiugration version are updated in the same manner
1216-
// - the last-updated timestamps are updated in the same manner
12171321
if err := d.domainReplicator.HandleTransmissionTask(
12181322
ctx,
12191323
types.DomainOperationUpdate,
@@ -1229,6 +1333,11 @@ func (d *handlerImpl) UpdateAsyncWorkflowConfiguraton(
12291333
}
12301334
}
12311335

1336+
err = d.updateDomainAuditLog(ctx, currentDomainConfig, currentDomainConfig, persistence.DomainAuditOperationTypeUpdate, "async workflow queue config update")
1337+
if err != nil {
1338+
return err
1339+
}
1340+
12321341
d.logger.Info("async workflow queue config update succeeded",
12331342
tag.WorkflowDomainName(currentDomainConfig.Info.Name),
12341343
tag.WorkflowDomainID(currentDomainConfig.Info.ID),

common/domain/handler_MasterCluster_test.go

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ type (
6060
mockDomainReplicator Replicator
6161
archivalMetadata archiver.ArchivalMetadata
6262
mockArchiverProvider *provider.MockArchiverProvider
63+
mockDomainAuditManager persistence.DomainAuditManager
6364

6465
handler *handlerImpl
6566
}
@@ -97,6 +98,7 @@ func (s *domainHandlerGlobalDomainEnabledPrimaryClusterSuite) SetupTest() {
9798
s.domainManager = s.TestBase.DomainManager
9899
s.mockProducer = &mocks.KafkaProducer{}
99100
s.mockDomainReplicator = NewDomainReplicator(s.mockProducer, logger)
101+
s.mockDomainAuditManager = persistence.NewMockDomainAuditManager(s.Controller)
100102
s.archivalMetadata = archiver.NewArchivalMetadata(
101103
dcCollection,
102104
"",
@@ -107,15 +109,17 @@ func (s *domainHandlerGlobalDomainEnabledPrimaryClusterSuite) SetupTest() {
107109
)
108110
s.mockArchiverProvider = &provider.MockArchiverProvider{}
109111
domainConfig := Config{
110-
MinRetentionDays: dynamicproperties.GetIntPropertyFn(s.minRetentionDays),
111-
MaxBadBinaryCount: dynamicproperties.GetIntPropertyFilteredByDomain(s.maxBadBinaryCount),
112-
FailoverCoolDown: dynamicproperties.GetDurationPropertyFnFilteredByDomain(0 * time.Second),
113-
FailoverHistoryMaxSize: dynamicproperties.GetIntPropertyFilteredByDomain(s.failoverHistoryMaxSize),
112+
MinRetentionDays: dynamicproperties.GetIntPropertyFn(s.minRetentionDays),
113+
MaxBadBinaryCount: dynamicproperties.GetIntPropertyFilteredByDomain(s.maxBadBinaryCount),
114+
FailoverCoolDown: dynamicproperties.GetDurationPropertyFnFilteredByDomain(0 * time.Second),
115+
FailoverHistoryMaxSize: dynamicproperties.GetIntPropertyFilteredByDomain(s.failoverHistoryMaxSize),
116+
EnableDomainAuditLogging: dynamicproperties.GetBoolPropertyFn(false),
114117
}
115118
s.handler = NewHandler(
116119
domainConfig,
117120
logger,
118121
s.domainManager,
122+
s.mockDomainAuditManager,
119123
s.ClusterMetadata,
120124
s.mockDomainReplicator,
121125
s.archivalMetadata,
@@ -901,14 +905,16 @@ func (s *domainHandlerGlobalDomainEnabledPrimaryClusterSuite) TestUpdateGetDomai
901905

902906
func (s *domainHandlerGlobalDomainEnabledPrimaryClusterSuite) TestUpdateDomain_CoolDown() {
903907
domainConfig := Config{
904-
MinRetentionDays: dynamicproperties.GetIntPropertyFn(s.minRetentionDays),
905-
MaxBadBinaryCount: dynamicproperties.GetIntPropertyFilteredByDomain(s.maxBadBinaryCount),
906-
FailoverCoolDown: dynamicproperties.GetDurationPropertyFnFilteredByDomain(10000 * time.Second),
908+
MinRetentionDays: dynamicproperties.GetIntPropertyFn(s.minRetentionDays),
909+
MaxBadBinaryCount: dynamicproperties.GetIntPropertyFilteredByDomain(s.maxBadBinaryCount),
910+
FailoverCoolDown: dynamicproperties.GetDurationPropertyFnFilteredByDomain(10000 * time.Second),
911+
EnableDomainAuditLogging: dynamicproperties.GetBoolPropertyFn(false),
907912
}
908913
s.handler = NewHandler(
909914
domainConfig,
910915
s.Logger,
911916
s.domainManager,
917+
s.mockDomainAuditManager,
912918
s.ClusterMetadata,
913919
s.mockDomainReplicator,
914920
s.archivalMetadata,

common/domain/handler_NotMasterCluster_test.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ type (
6060
mockDomainReplicator Replicator
6161
archivalMetadata archiver.ArchivalMetadata
6262
mockArchiverProvider *provider.MockArchiverProvider
63+
mockDomainAuditManager persistence.DomainAuditManager
6364

6465
handler *handlerImpl
6566
}
@@ -97,6 +98,7 @@ func (s *domainHandlerGlobalDomainEnabledNotPrimaryClusterSuite) SetupTest() {
9798
s.domainManager = s.TestBase.DomainManager
9899
s.mockProducer = &mocks.KafkaProducer{}
99100
s.mockDomainReplicator = NewDomainReplicator(s.mockProducer, logger)
101+
s.mockDomainAuditManager = persistence.NewMockDomainAuditManager(s.Controller)
100102
s.archivalMetadata = archiver.NewArchivalMetadata(
101103
dcCollection,
102104
"",
@@ -107,15 +109,17 @@ func (s *domainHandlerGlobalDomainEnabledNotPrimaryClusterSuite) SetupTest() {
107109
)
108110
s.mockArchiverProvider = &provider.MockArchiverProvider{}
109111
domainConfig := Config{
110-
MinRetentionDays: dynamicproperties.GetIntPropertyFn(s.minRetentionDays),
111-
MaxBadBinaryCount: dynamicproperties.GetIntPropertyFilteredByDomain(s.maxBadBinaryCount),
112-
FailoverCoolDown: dynamicproperties.GetDurationPropertyFnFilteredByDomain(0 * time.Second),
113-
FailoverHistoryMaxSize: dynamicproperties.GetIntPropertyFilteredByDomain(s.failoverHistoryMaxSize),
112+
MinRetentionDays: dynamicproperties.GetIntPropertyFn(s.minRetentionDays),
113+
MaxBadBinaryCount: dynamicproperties.GetIntPropertyFilteredByDomain(s.maxBadBinaryCount),
114+
FailoverCoolDown: dynamicproperties.GetDurationPropertyFnFilteredByDomain(0 * time.Second),
115+
FailoverHistoryMaxSize: dynamicproperties.GetIntPropertyFilteredByDomain(s.failoverHistoryMaxSize),
116+
EnableDomainAuditLogging: dynamicproperties.GetBoolPropertyFn(false),
114117
}
115118
s.handler = NewHandler(
116119
domainConfig,
117120
logger,
118121
s.domainManager,
122+
s.mockDomainAuditManager,
119123
s.ClusterMetadata,
120124
s.mockDomainReplicator,
121125
s.archivalMetadata,

common/domain/handler_integration_test.go

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,12 @@ type (
5959
maxBadBinaryCount int
6060
failoverHistoryMaxSize int
6161

62-
domainManager persistence.DomainManager
63-
mockProducer *mocks.KafkaProducer
64-
mockDomainReplicator Replicator
65-
archivalMetadata archiver.ArchivalMetadata
66-
mockArchiverProvider *provider.MockArchiverProvider
62+
domainManager persistence.DomainManager
63+
mockProducer *mocks.KafkaProducer
64+
mockDomainReplicator Replicator
65+
archivalMetadata archiver.ArchivalMetadata
66+
mockArchiverProvider *provider.MockArchiverProvider
67+
mockDomainAuditManager persistence.DomainAuditManager
6768

6869
handler *handlerImpl
6970
}
@@ -103,6 +104,7 @@ func (s *domainHandlerCommonSuite) SetupTest() {
103104
s.domainManager = s.TestBase.DomainManager
104105
s.mockProducer = &mocks.KafkaProducer{}
105106
s.mockDomainReplicator = NewDomainReplicator(s.mockProducer, logger)
107+
s.mockDomainAuditManager = persistence.NewMockDomainAuditManager(s.Controller)
106108
s.archivalMetadata = archiver.NewArchivalMetadata(
107109
dcCollection,
108110
"",
@@ -113,15 +115,17 @@ func (s *domainHandlerCommonSuite) SetupTest() {
113115
)
114116
s.mockArchiverProvider = provider.NewMockArchiverProvider(s.Controller)
115117
domainConfig := Config{
116-
MinRetentionDays: dynamicproperties.GetIntPropertyFn(s.minRetentionDays),
117-
MaxBadBinaryCount: dynamicproperties.GetIntPropertyFilteredByDomain(s.maxBadBinaryCount),
118-
FailoverCoolDown: dynamicproperties.GetDurationPropertyFnFilteredByDomain(0 * time.Second),
119-
FailoverHistoryMaxSize: dynamicproperties.GetIntPropertyFilteredByDomain(s.failoverHistoryMaxSize),
118+
MinRetentionDays: dynamicproperties.GetIntPropertyFn(s.minRetentionDays),
119+
MaxBadBinaryCount: dynamicproperties.GetIntPropertyFilteredByDomain(s.maxBadBinaryCount),
120+
FailoverCoolDown: dynamicproperties.GetDurationPropertyFnFilteredByDomain(0 * time.Second),
121+
FailoverHistoryMaxSize: dynamicproperties.GetIntPropertyFilteredByDomain(s.failoverHistoryMaxSize),
122+
EnableDomainAuditLogging: dynamicproperties.GetBoolPropertyFn(false),
120123
}
121124
s.handler = NewHandler(
122125
domainConfig,
123126
logger,
124127
s.domainManager,
128+
s.mockDomainAuditManager,
125129
s.ClusterMetadata,
126130
s.mockDomainReplicator,
127131
s.archivalMetadata,

0 commit comments

Comments
 (0)