Skip to content

Commit 20fc077

Browse files
committed
fix(matching): we returned wrong error if shard is lost
During refactoring for #7711 When shard was lost and matcher is on shard-manager, we returned untyped error which could potentially lead to wrong retries from client. Also, returning right error ensures we have the same stats/logs as we would have without shard-manager. Signed-off-by: Jan Kisel <dkrot@uber.com>
1 parent a50e09a commit 20fc077

File tree

2 files changed

+144
-41
lines changed

2 files changed

+144
-41
lines changed

service/matching/handler/engine.go

Lines changed: 51 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ type (
112112
timeSource clock.TimeSource
113113
failoverNotificationVersion int64
114114
ShardDistributorMatchingConfig clientcommon.Config
115+
selfAddress membership.HostInfo
115116
}
116117
)
117118

@@ -142,6 +143,11 @@ func NewEngine(
142143
shardDistributorClient executorclient.Client,
143144
ShardDistributorMatchingConfig clientcommon.Config,
144145
) Engine {
146+
selfAddress, err := resolver.WhoAmI()
147+
if err != nil {
148+
logger.Fatal("failed to lookup self im membership", tag.Error(err))
149+
}
150+
145151
e := &matchingEngineImpl{
146152
shutdown: make(chan struct{}),
147153
shutdownCompletion: &sync.WaitGroup{},
@@ -162,6 +168,7 @@ func NewEngine(
162168
isolationState: isolationState,
163169
timeSource: timeSource,
164170
ShardDistributorMatchingConfig: ShardDistributorMatchingConfig,
171+
selfAddress: selfAddress,
165172
}
166173

167174
e.setupExecutor(shardDistributorClient)
@@ -1515,66 +1522,69 @@ func (e *matchingEngineImpl) emitInfoOrDebugLog(
15151522
}
15161523
}
15171524

1518-
func (e *matchingEngineImpl) errIfShardOwnershipLost(ctx context.Context, taskList *tasklist.Identifier) error {
1519-
if !e.config.EnableTasklistOwnershipGuard() {
1520-
return nil
1525+
func (e *matchingEngineImpl) stillOwnShard(ctx context.Context, taskList *tasklist.Identifier, reason, whoOwns *string) (bool, error) {
1526+
if e.isShuttingDown() {
1527+
*reason = "engine is shutting down"
1528+
return false, nil
15211529
}
15221530

1523-
// We have a shard-processor shared by all the task lists with the same name.
1524-
// For now there is no 1:1 mapping between shards and tasklists. (#tasklists >= #shards)
1525-
sp, err := e.executor.GetShardProcess(ctx, taskList.GetName())
15261531
if e.executor.IsOnboardedToSD() {
1532+
// We have a shard-processor shared by all the task lists with the same name.
1533+
// For now there is no 1:1 mapping between shards and tasklists. (#tasklists >= #shards)
1534+
sp, err := e.executor.GetShardProcess(ctx, taskList.GetName())
15271535
if err != nil {
1528-
return fmt.Errorf("failed to lookup ownership in SD: %w", err)
1536+
return false, fmt.Errorf("failed to lookup ownership in SD: %w", err)
15291537
}
1538+
15301539
if sp == nil {
1531-
return fmt.Errorf("failed to lookup ownership in SD: shard process is nil")
1540+
*reason = "shard process not found"
1541+
return false, nil
15321542
}
1533-
return nil
1543+
return true, nil
15341544
}
15351545

1536-
self, err := e.membershipResolver.WhoAmI()
1546+
taskListOwner, err := e.membershipResolver.Lookup(service.Matching, taskList.GetName())
15371547
if err != nil {
1538-
return fmt.Errorf("failed to lookup self im membership: %w", err)
1548+
return false, fmt.Errorf("failed to lookup task list owner: %w", err)
1549+
}
1550+
if taskListOwner.Identity() != e.selfAddress.Identity() {
1551+
*reason = "engine does not own this shard"
1552+
*whoOwns = taskListOwner.Identity()
15391553
}
15401554

1541-
if e.isShuttingDown() {
1542-
e.logger.Warn("request to get tasklist is being rejected because engine is shutting down",
1543-
tag.WorkflowDomainID(taskList.GetDomainID()),
1544-
tag.WorkflowTaskListType(taskList.GetType()),
1545-
tag.WorkflowTaskListName(taskList.GetName()),
1546-
)
1555+
return true, nil
1556+
}
15471557

1548-
return cadence_errors.NewTaskListNotOwnedByHostError(
1549-
"not known",
1550-
self.Identity(),
1551-
taskList.GetName(),
1552-
)
1558+
// Defensive check to make sure we actually own the task list
1559+
//
1560+
// If we try to create a task list manager for a task list that is not owned by us, return an error
1561+
// The new task list manager will steal the task list from the current owner, which should only happen if
1562+
// the task list is owned by the current host.
1563+
func (e *matchingEngineImpl) errIfShardOwnershipLost(ctx context.Context, taskList *tasklist.Identifier) error {
1564+
if !e.config.EnableTasklistOwnershipGuard() {
1565+
return nil
15531566
}
15541567

1555-
// Defensive check to make sure we actually own the task list
1556-
// If we try to create a task list manager for a task list that is not owned by us, return an error
1557-
// The new task list manager will steal the task list from the current owner, which should only happen if
1558-
// the task list is owned by the current host.
1559-
taskListOwner, err := e.membershipResolver.Lookup(service.Matching, taskList.GetName())
1568+
reason := "unknown reason"
1569+
whoOwns := "not known"
1570+
ok, err := e.stillOwnShard(ctx, taskList, &reason, &whoOwns)
15601571
if err != nil {
1561-
return fmt.Errorf("failed to lookup task list owner: %w", err)
1572+
return err
15621573
}
1563-
1564-
if taskListOwner.Identity() != self.Identity() {
1565-
e.logger.Warn("Request to get tasklist is being rejected because engine does not own this shard",
1566-
tag.WorkflowDomainID(taskList.GetDomainID()),
1567-
tag.WorkflowTaskListType(taskList.GetType()),
1568-
tag.WorkflowTaskListName(taskList.GetName()),
1569-
)
1570-
return cadence_errors.NewTaskListNotOwnedByHostError(
1571-
taskListOwner.Identity(),
1572-
self.Identity(),
1573-
taskList.GetName(),
1574-
)
1574+
if ok {
1575+
return nil
15751576
}
15761577

1577-
return nil
1578+
e.logger.Warn(fmt.Sprintf("request to get tasklist is being rejected because %s", reason),
1579+
tag.WorkflowDomainID(taskList.GetDomainID()),
1580+
tag.WorkflowTaskListType(taskList.GetType()),
1581+
tag.WorkflowTaskListName(taskList.GetName()),
1582+
)
1583+
return cadence_errors.NewTaskListNotOwnedByHostError(
1584+
whoOwns,
1585+
e.selfAddress.Identity(),
1586+
taskList.GetName(),
1587+
)
15781588
}
15791589

15801590
func (e *matchingEngineImpl) isShuttingDown() bool {

service/matching/handler/engine_test.go

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ import (
3838
"github.com/uber/cadence/common/client"
3939
"github.com/uber/cadence/common/clock"
4040
"github.com/uber/cadence/common/dynamicconfig/dynamicproperties"
41+
cadence_errors "github.com/uber/cadence/common/errors"
4142
"github.com/uber/cadence/common/log"
4243
"github.com/uber/cadence/common/membership"
4344
"github.com/uber/cadence/common/metrics"
@@ -743,6 +744,98 @@ func TestIsShuttingDown(t *testing.T) {
743744
assert.True(t, e.isShuttingDown())
744745
}
745746

747+
func TestErrIfShardOwnershipLost(t *testing.T) {
748+
taskListID, err := tasklist.NewIdentifier("test-domain-id", "test-tasklist", persistence.TaskListTypeActivity)
749+
require.NoError(t, err)
750+
751+
testCases := []struct {
752+
name string
753+
enableOwnershipGuard bool
754+
setupMocks func(ctrl *gomock.Controller, executor *executorclient.MockExecutor[tasklist.ShardProcessor], engine *matchingEngineImpl)
755+
expectErrorContains string
756+
expectNotOwnedError bool
757+
expectedOwnedBy string
758+
}{
759+
{
760+
name: "ownership guard disabled",
761+
enableOwnershipGuard: false,
762+
},
763+
{
764+
name: "engine is shutting down",
765+
setupMocks: func(ctrl *gomock.Controller, executor *executorclient.MockExecutor[tasklist.ShardProcessor], engine *matchingEngineImpl) {
766+
close(engine.shutdown)
767+
},
768+
enableOwnershipGuard: true,
769+
expectNotOwnedError: true,
770+
expectedOwnedBy: "not known",
771+
},
772+
{
773+
name: "ownership lookup in SD fails",
774+
setupMocks: func(ctrl *gomock.Controller, executor *executorclient.MockExecutor[tasklist.ShardProcessor], engine *matchingEngineImpl) {
775+
executor.EXPECT().GetShardProcess(gomock.Any(), taskListID.GetName()).Return(nil, errors.New("sd down"))
776+
},
777+
enableOwnershipGuard: true,
778+
expectErrorContains: "failed to lookup ownership in SD",
779+
},
780+
{
781+
name: "shard process not found in SD",
782+
setupMocks: func(ctrl *gomock.Controller, executor *executorclient.MockExecutor[tasklist.ShardProcessor], engine *matchingEngineImpl) {
783+
executor.EXPECT().GetShardProcess(gomock.Any(), taskListID.GetName()).Return(nil, nil)
784+
},
785+
enableOwnershipGuard: true,
786+
expectNotOwnedError: true,
787+
expectedOwnedBy: "not known", // SD does not provide the owner identity
788+
},
789+
{
790+
name: "owned when shard process exists in SD",
791+
setupMocks: func(ctrl *gomock.Controller, executor *executorclient.MockExecutor[tasklist.ShardProcessor], engine *matchingEngineImpl) {
792+
executor.EXPECT().GetShardProcess(gomock.Any(), taskListID.GetName()).Return(tasklist.NewMockShardProcessor(ctrl), nil)
793+
},
794+
enableOwnershipGuard: true,
795+
},
796+
}
797+
798+
for _, tc := range testCases {
799+
t.Run(tc.name, func(t *testing.T) {
800+
ctrl := gomock.NewController(t)
801+
executor := executorclient.NewMockExecutor[tasklist.ShardProcessor](ctrl)
802+
engine := &matchingEngineImpl{
803+
shutdown: make(chan struct{}),
804+
executor: executor,
805+
selfAddress: membership.NewDetailedHostInfo("", "host-123", nil),
806+
config: &config.Config{
807+
EnableTasklistOwnershipGuard: func(opts ...dynamicproperties.FilterOption) bool {
808+
return tc.enableOwnershipGuard
809+
},
810+
},
811+
logger: log.NewNoop(),
812+
}
813+
executor.EXPECT().IsOnboardedToSD().Return(true).AnyTimes()
814+
815+
if tc.setupMocks != nil {
816+
tc.setupMocks(ctrl, executor, engine)
817+
}
818+
819+
err := engine.errIfShardOwnershipLost(context.Background(), taskListID)
820+
821+
if tc.expectErrorContains != "" {
822+
require.Error(t, err)
823+
require.ErrorContains(t, err, tc.expectErrorContains)
824+
} else if tc.expectNotOwnedError {
825+
require.Error(t, err)
826+
var notOwnedErr *cadence_errors.TaskListNotOwnedByHostError
827+
require.ErrorAs(t, err, &notOwnedErr)
828+
829+
assert.Equal(t, tc.expectedOwnedBy, notOwnedErr.OwnedByIdentity)
830+
assert.Equal(t, engine.selfAddress.Identity(), notOwnedErr.MyIdentity)
831+
assert.Equal(t, taskListID.GetName(), notOwnedErr.TasklistName)
832+
} else {
833+
require.NoError(t, err)
834+
}
835+
})
836+
}
837+
}
838+
746839
func TestGetTasklistsNotOwned(t *testing.T) {
747840

748841
ctrl := gomock.NewController(t)

0 commit comments

Comments
 (0)