Skip to content

Commit 13392cb

Browse files
committed
audit events for tbwr operation states
1 parent acfe904 commit 13392cb

File tree

4 files changed

+193
-36
lines changed

4 files changed

+193
-36
lines changed

cmd/integration/make_backup/main.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ type RawEvent struct {
161161
Reason string `json:"reason,omitempty"`
162162
Timestamp string `json:"@timestamp"`
163163
IsBackground bool `json:"is_background"`
164+
Database string `json:"database,omitempty"`
164165
}
165166

166167
type RawEventEnvelope struct {
@@ -359,6 +360,22 @@ func main() {
359360
Status: "SUCCESS",
360361
},
361362
},
363+
{
364+
event: RawEvent{
365+
Action: "ActionUpdate",
366+
Component: "backup_service",
367+
Status: "NEW",
368+
Database: databaseName,
369+
},
370+
},
371+
{
372+
event: RawEvent{
373+
Action: "ActionUpdate",
374+
Component: "backup_service",
375+
Status: "DONE",
376+
Database: databaseName,
377+
},
378+
},
362379
},
363380
)
364381

internal/audit/audit_event.go

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"google.golang.org/protobuf/proto"
1010
"time"
1111
"ydbcp/internal/server/grpcinfo"
12+
"ydbcp/internal/types"
1213
"ydbcp/internal/util/xlog"
1314
)
1415

@@ -148,7 +149,68 @@ func ReportGRPCCallEnd(
148149
ReportAuditEvent(ctx, event)
149150
}
150151

151-
func ReportAuditEvent(ctx context.Context, event *GRPCCallEvent) {
152+
type BackupStateEvent struct {
153+
GenericAuditFields
154+
Database string `json:"database"`
155+
}
156+
157+
func ReportBackupStateAuditEvent(
158+
ctx context.Context, operation *types.TakeBackupWithRetryOperation,
159+
retry bool, new bool,
160+
) {
161+
status := operation.GetState().String()
162+
reason := ""
163+
switch operation.GetState() {
164+
case types.OperationStateRunning:
165+
{
166+
if retry {
167+
status = "RETRYING"
168+
reason = "New backup attempt scheduled"
169+
} else if new {
170+
status = "NEW"
171+
reason = "New retryable backup attempt scheduled"
172+
}
173+
}
174+
case types.OperationStateDone:
175+
{
176+
reason = "Backup complete"
177+
}
178+
case types.OperationStateError:
179+
{
180+
reason = "Backup and all its retry attempts failed"
181+
}
182+
case types.OperationStateCancelling:
183+
case types.OperationStateCancelled:
184+
case types.OperationStateStartCancelling:
185+
{
186+
reason = "Backup operation cancelled"
187+
}
188+
}
189+
190+
event := &BackupStateEvent{
191+
GenericAuditFields: GenericAuditFields{
192+
ID: uuid.New().String(),
193+
IdempotencyKey: operation.GetID(),
194+
Service: "ydbcp",
195+
SpecVersion: "1.0",
196+
Action: ActionUpdate,
197+
Resource: Backup,
198+
Component: "backup_service",
199+
FolderID: operation.GetContainerID(),
200+
Subject: types.OperationCreatorName,
201+
SanitizedToken: "<somehow unpack token from oauth2 auth process>",
202+
Status: status,
203+
Reason: reason,
204+
Timestamp: time.Now().Format(time.RFC3339Nano),
205+
IsBackground: true,
206+
},
207+
Database: operation.GetDatabaseName(),
208+
}
209+
210+
ReportAuditEvent(ctx, event)
211+
}
212+
213+
func ReportAuditEvent(ctx context.Context, event any) {
152214
env, err := makeEnvelope(event)
153215
if err != nil {
154216
xlog.Error(ctx, "error reporting audit event", zap.Error(err))

internal/handlers/schedule_backup.go

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"go.uber.org/zap"
88
"google.golang.org/protobuf/types/known/durationpb"
99
"google.golang.org/protobuf/types/known/timestamppb"
10+
"ydbcp/internal/audit"
1011
"ydbcp/internal/connectors/db"
1112
"ydbcp/internal/connectors/db/yql/queries"
1213
"ydbcp/internal/types"
@@ -28,6 +29,17 @@ func NewBackupScheduleHandler(
2829
}
2930
}
3031

32+
func withNewBackupAudit(
33+
ctx context.Context, tbwr *types.TakeBackupWithRetryOperation,
34+
upsertError error,
35+
) error {
36+
if upsertError != nil {
37+
return upsertError
38+
}
39+
audit.ReportBackupStateAuditEvent(ctx, tbwr, false, true)
40+
return nil
41+
}
42+
3143
func BackupScheduleHandler(
3244
ctx context.Context,
3345
driver db.DBConnector,
@@ -84,9 +96,11 @@ func BackupScheduleHandler(
8496
if err != nil {
8597
return err
8698
}
87-
return driver.ExecuteUpsert(
88-
ctx,
89-
queryBuilderFactory().WithCreateOperation(tbwr).WithUpdateBackupSchedule(*schedule),
99+
return withNewBackupAudit(
100+
ctx, tbwr, driver.ExecuteUpsert(
101+
ctx,
102+
queryBuilderFactory().WithCreateOperation(tbwr).WithUpdateBackupSchedule(*schedule),
103+
),
90104
)
91105
}
92106
return nil

internal/handlers/take_backup_retry.go

Lines changed: 96 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"math"
1212
"strings"
1313
"time"
14+
"ydbcp/internal/audit"
1415
"ydbcp/internal/backup_operations"
1516
"ydbcp/internal/config"
1617
"ydbcp/internal/connectors/client"
@@ -73,7 +74,9 @@ func exp(p int) time.Duration {
7374
return time.Duration(math.Pow(BACKOFF_EXP, float64(p)))
7475
}
7576

76-
func shouldRetry(config *pb.RetryConfig, count int, firstStart time.Time, lastEnd *time.Time, clock clockwork.Clock) *time.Time {
77+
func shouldRetry(
78+
config *pb.RetryConfig, count int, firstStart time.Time, lastEnd *time.Time, clock clockwork.Clock,
79+
) *time.Time {
7780
if config == nil {
7881
if count == 0 {
7982
t := clock.Now()
@@ -112,7 +115,21 @@ func shouldRetry(config *pb.RetryConfig, count int, firstStart time.Time, lastEn
112115
return &t
113116
}
114117

115-
func MakeRetryDecision(ctx context.Context, tbwr *types.TakeBackupWithRetryOperation, tbOp *types.TakeBackupOperation, clock clockwork.Clock) (RetryDecision, error) {
118+
func withBackupStateAudit(
119+
ctx context.Context, tbwr *types.TakeBackupWithRetryOperation, retry bool,
120+
upsertError error,
121+
) error {
122+
if upsertError != nil {
123+
return upsertError
124+
}
125+
audit.ReportBackupStateAuditEvent(ctx, tbwr, retry, false)
126+
return nil
127+
}
128+
129+
func MakeRetryDecision(
130+
ctx context.Context, tbwr *types.TakeBackupWithRetryOperation, tbOp *types.TakeBackupOperation,
131+
clock clockwork.Clock,
132+
) (RetryDecision, error) {
116133
//retrieve last tbOp run time
117134
//if there is a tbOp, check its status
118135
//if success: set success to itself
@@ -160,13 +177,15 @@ func setErrorToRetryOperation(
160177
ops []types.Operation,
161178
clock clockwork.Clock,
162179
) {
163-
operationIDs := strings.Join(func() []string {
164-
var ids []string
165-
for _, item := range ops {
166-
ids = append(ids, item.GetID())
167-
}
168-
return ids
169-
}(), ", ")
180+
operationIDs := strings.Join(
181+
func() []string {
182+
var ids []string
183+
for _, item := range ops {
184+
ids = append(ids, item.GetID())
185+
}
186+
return ids
187+
}(), ", ",
188+
)
170189
tbwr.State = types.OperationStateError
171190
now := clock.Now()
172191
tbwr.UpdatedAt = timestamppb.New(now)
@@ -220,17 +239,23 @@ func TBWROperationHandler(
220239
if tbwr.ScheduleID != nil {
221240
ctx = xlog.With(ctx, zap.String("ScheduleID", *tbwr.ScheduleID))
222241
}
223-
ops, err := db.SelectOperations(ctx, queries.NewReadTableQuery(
224-
queries.WithTableName("Operations"),
225-
queries.WithIndex("idx_p"),
226-
queries.WithQueryFilters(queries.QueryFilter{
227-
Field: "parent_operation_id",
228-
Values: []table_types.Value{table_types.StringValueFromString(tbwr.ID)},
229-
}),
230-
queries.WithOrderBy(queries.OrderSpec{
231-
Field: "created_at",
232-
}),
233-
))
242+
ops, err := db.SelectOperations(
243+
ctx, queries.NewReadTableQuery(
244+
queries.WithTableName("Operations"),
245+
queries.WithIndex("idx_p"),
246+
queries.WithQueryFilters(
247+
queries.QueryFilter{
248+
Field: "parent_operation_id",
249+
Values: []table_types.Value{table_types.StringValueFromString(tbwr.ID)},
250+
},
251+
),
252+
queries.WithOrderBy(
253+
queries.OrderSpec{
254+
Field: "created_at",
255+
},
256+
),
257+
),
258+
)
234259

235260
var lastTbOp *types.TakeBackupOperation
236261
if len(ops) > 0 {
@@ -245,6 +270,7 @@ func TBWROperationHandler(
245270
case types.OperationStateRunning:
246271
{
247272
do, err := MakeRetryDecision(ctx, tbwr, lastTbOp, clock)
273+
reportRetry := do == RunNewTb
248274
if err != nil {
249275
xlog.Error(ctx, "RetryDecision failed", zap.Error(err))
250276
tbwr.State = types.OperationStateError
@@ -253,9 +279,14 @@ func TBWROperationHandler(
253279
tbwr.UpdatedAt = timestamppb.New(now)
254280
tbwr.Audit.CompletedAt = timestamppb.New(now)
255281

256-
errup := db.ExecuteUpsert(ctx, queryBuilderFactory().WithUpdateOperation(tbwr))
257-
if errup != nil {
258-
return errup
282+
upsertError := withBackupStateAudit(
283+
ctx, tbwr, reportRetry, db.ExecuteUpsert(
284+
ctx,
285+
queryBuilderFactory().WithUpdateOperation(tbwr),
286+
),
287+
)
288+
if upsertError != nil {
289+
return upsertError
259290
}
260291
return err
261292
}
@@ -279,14 +310,18 @@ func TBWROperationHandler(
279310
now := clock.Now()
280311
tbwr.UpdatedAt = timestamppb.New(now)
281312
tbwr.Audit.CompletedAt = timestamppb.New(now)
282-
return db.ExecuteUpsert(ctx, queryBuilderFactory().WithUpdateOperation(tbwr))
313+
return withBackupStateAudit(
314+
ctx, tbwr, reportRetry, db.ExecuteUpsert(ctx, queryBuilderFactory().WithUpdateOperation(tbwr)),
315+
)
283316
}
284317
case Skip:
285318
return nil
286319
case Error:
287320
{
288321
setErrorToRetryOperation(ctx, tbwr, ops, clock)
289-
return db.ExecuteUpsert(ctx, queryBuilderFactory().WithUpdateOperation(tbwr))
322+
return withBackupStateAudit(
323+
ctx, tbwr, reportRetry, db.ExecuteUpsert(ctx, queryBuilderFactory().WithUpdateOperation(tbwr)),
324+
)
290325
}
291326
case RunNewTb:
292327
{
@@ -322,14 +357,29 @@ func TBWROperationHandler(
322357
now := clock.Now()
323358
tbwr.UpdatedAt = timestamppb.New(now)
324359
tbwr.Audit.CompletedAt = timestamppb.New(now)
325-
return db.ExecuteUpsert(ctx, queryBuilderFactory().WithCreateBackup(*backup).WithCreateOperation(tb).WithUpdateOperation(tbwr))
360+
return withBackupStateAudit(
361+
ctx, tbwr, reportRetry, db.ExecuteUpsert(
362+
ctx,
363+
queryBuilderFactory().WithCreateBackup(*backup).WithCreateOperation(tb).WithUpdateOperation(tbwr),
364+
),
365+
)
326366
} else {
327367
//increment retries
328-
return db.ExecuteUpsert(ctx, queryBuilderFactory().WithUpdateOperation(tbwr))
368+
return withBackupStateAudit(
369+
ctx, tbwr, reportRetry, db.ExecuteUpsert(
370+
ctx,
371+
queryBuilderFactory().WithUpdateOperation(tbwr),
372+
),
373+
)
329374
}
330375
} else {
331376
xlog.Debug(ctx, "running new TB", zap.String("TBOperationID", tb.ID))
332-
return db.ExecuteUpsert(ctx, queryBuilderFactory().WithCreateBackup(*backup).WithCreateOperation(tb).WithUpdateOperation(tbwr))
377+
return withBackupStateAudit(
378+
ctx, tbwr, reportRetry, db.ExecuteUpsert(
379+
ctx,
380+
queryBuilderFactory().WithCreateBackup(*backup).WithCreateOperation(tb).WithUpdateOperation(tbwr),
381+
),
382+
)
333383
}
334384
}
335385
default:
@@ -339,7 +389,12 @@ func TBWROperationHandler(
339389
tbwr.UpdatedAt = timestamppb.New(now)
340390
tbwr.Audit.CompletedAt = timestamppb.New(now)
341391

342-
_ = db.ExecuteUpsert(ctx, queryBuilderFactory().WithUpdateOperation(tbwr))
392+
upsertError := withBackupStateAudit(
393+
ctx, tbwr, reportRetry, db.ExecuteUpsert(ctx, queryBuilderFactory().WithUpdateOperation(tbwr)),
394+
)
395+
if upsertError != nil {
396+
return upsertError
397+
}
343398
return errors.New(tbwr.Message)
344399
}
345400
}
@@ -355,14 +410,18 @@ func TBWROperationHandler(
355410
now := clock.Now()
356411
tbwr.UpdatedAt = timestamppb.New(now)
357412
tbwr.Audit.CompletedAt = timestamppb.New(now)
358-
return db.ExecuteUpsert(ctx, queryBuilderFactory().WithUpdateOperation(tbwr))
413+
return withBackupStateAudit(
414+
ctx, tbwr, false, db.ExecuteUpsert(ctx, queryBuilderFactory().WithUpdateOperation(tbwr)),
415+
)
359416
} else {
360417
if lastTbOp.State == types.OperationStatePending || lastTbOp.State == types.OperationStateRunning {
361418
xlog.Info(ctx, "cancelling TB operation", zap.String("TBOperationID", lastTbOp.ID))
362419
lastTbOp.State = types.OperationStateStartCancelling
363420
lastTbOp.Message = "Cancelling by parent operation"
364421
lastTbOp.UpdatedAt = timestamppb.New(clock.Now())
365-
return db.ExecuteUpsert(ctx, queryBuilderFactory().WithUpdateOperation(lastTbOp))
422+
return withBackupStateAudit(
423+
ctx, tbwr, false, db.ExecuteUpsert(ctx, queryBuilderFactory().WithUpdateOperation(lastTbOp)),
424+
)
366425
}
367426
}
368427
}
@@ -373,7 +432,12 @@ func TBWROperationHandler(
373432
now := clock.Now()
374433
tbwr.UpdatedAt = timestamppb.New(now)
375434
tbwr.Audit.CompletedAt = timestamppb.New(now)
376-
_ = db.ExecuteUpsert(ctx, queryBuilderFactory().WithUpdateOperation(tbwr))
435+
upsertError := withBackupStateAudit(
436+
ctx, tbwr, false, db.ExecuteUpsert(ctx, queryBuilderFactory().WithUpdateOperation(tbwr)),
437+
)
438+
if upsertError != nil {
439+
return upsertError
440+
}
377441
return errors.New(tbwr.Message)
378442
}
379443
}

0 commit comments

Comments
 (0)