Skip to content
This repository was archived by the owner on Nov 17, 2025. It is now read-only.

Commit d04ce7b

Browse files
authored
Merge pull request #369 from matrix-org/kegan/migrate-stuck-invites
Migrate stuck invites
2 parents bb003c1 + 03f6287 commit d04ce7b

File tree

2 files changed

+315
-0
lines changed

2 files changed

+315
-0
lines changed
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
package migrations
2+
3+
import (
4+
"context"
5+
"database/sql"
6+
"fmt"
7+
8+
"github.com/lib/pq"
9+
"github.com/pressly/goose/v3"
10+
)
11+
12+
func init() {
13+
goose.AddMigrationContext(upClearStuckInvites, downClearStuckInvites)
14+
}
15+
16+
// The purpose of this migration is to find users who have rooms which have
17+
// not been properly processed by the proxy and invalidate their since token
18+
// so they will do an initial sync on the next poller startup. This is specifically
19+
// targeting stuck invites, where there is an invite in the invites table but
20+
// the room is already joined. This is usually (always?) due to missing a create
21+
// event when the room was joined, caused by a synapse bug outlined in
22+
// https://github.com/matrix-org/sliding-sync/issues/367
23+
// This isn't exclusively a problem with invites, though it manifests more clearly there.
24+
func upClearStuckInvites(ctx context.Context, tx *sql.Tx) error {
25+
// The syncv3_unread table is updated any time A) a room is in rooms.join and B) the unread count has changed,
26+
// where nil != 0. Therefore, we can use this table as a proxy for "have we seen a v2 response which has put this
27+
// room into rooms.join"? For every room in rooms.join, we should have seen a create event for it, and hence have
28+
// an entry in syncv3_rooms. If we do not have an entry in syncv3_rooms but do have an entry in syncv3_unread, this
29+
// implies we failed to properly store this joined room and therefore the user who the unread marker is for should be
30+
// reset to force an initial sync. On matrix.org, of the users using sliding sync, this will catch around ~1.82% of users
31+
rows, err := tx.QueryContext(ctx, `
32+
SELECT distinct(user_id) FROM syncv3_unread
33+
WHERE room_id NOT IN (
34+
SELECT room_id
35+
FROM syncv3_rooms
36+
)
37+
`)
38+
defer rows.Close()
39+
if err != nil {
40+
return fmt.Errorf("failed to select bad users: %w", err)
41+
}
42+
43+
var usersToInvalidate []string
44+
for rows.Next() {
45+
var userID string
46+
err = rows.Scan(&userID)
47+
if err != nil {
48+
return fmt.Errorf("failed to scan user: %w", err)
49+
}
50+
usersToInvalidate = append(usersToInvalidate, userID)
51+
}
52+
logger.Info().Int("len_invalidate_users", len(usersToInvalidate)).Msg("invalidating users")
53+
if len(usersToInvalidate) < 50 {
54+
logger.Info().Strs("invalidate_users", usersToInvalidate).Msg("invalidating users")
55+
}
56+
57+
// for each user:
58+
// - reset their since token for all devices
59+
// - remove any outstanding invites (we'll be told about them again when they initial sync)
60+
res, err := tx.ExecContext(ctx, `
61+
UPDATE syncv3_sync2_devices SET since='' WHERE user_id=ANY($1)
62+
`, pq.StringArray(usersToInvalidate))
63+
if err != nil {
64+
return fmt.Errorf("failed to invalidate since tokens: %w", err)
65+
}
66+
ra, _ := res.RowsAffected()
67+
logger.Info().Int64("num_devices", ra).Msg("reset since tokens")
68+
69+
res, err = tx.ExecContext(ctx, `
70+
DELETE FROM syncv3_invites WHERE user_id=ANY($1)
71+
`, pq.StringArray(usersToInvalidate))
72+
if err != nil {
73+
return fmt.Errorf("failed to remove outstanding invites: %w", err)
74+
}
75+
ra, _ = res.RowsAffected()
76+
logger.Info().Int64("num_invites", ra).Msg("reset invites")
77+
return nil
78+
}
79+
80+
// downClearStuckInvites is the down half of the stuck-invites migration.
// It performs no work and always returns nil, because this migration cannot
// be rolled back.
func downClearStuckInvites(ctx context.Context, tx *sql.Tx) error {
	// Intentionally a no-op: we can't roll this back.
	var err error
	return err
}
Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
package migrations
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
"testing"
8+
"time"
9+
10+
"github.com/jmoiron/sqlx"
11+
"github.com/matrix-org/sliding-sync/sqlutil"
12+
"github.com/matrix-org/sliding-sync/state"
13+
"github.com/matrix-org/sliding-sync/sync2"
14+
)
15+
16+
func TestClearStuckInvites(t *testing.T) {
17+
db, close := connectToDB(t)
18+
defer close()
19+
roomsTable := state.NewRoomsTable(db)
20+
inviteTable := state.NewInvitesTable(db)
21+
unreadTable := state.NewUnreadTable(db)
22+
deviceTable := sync2.NewDevicesTable(db)
23+
tokensTable := sync2.NewTokensTable(db, "secret")
24+
25+
zero := 0
26+
device1 := "TEST_1"
27+
device2 := "TEST_2"
28+
roomA := "!TestClearStuckInvites_a:localhost"
29+
roomB := "!TestClearStuckInvites_b:localhost"
30+
roomC := "!TestClearStuckInvites_c:localhost"
31+
roomD := "!TestClearStuckInvites_d:localhost"
32+
roomE := "!TestClearStuckInvites_e:localhost"
33+
roomF := "!TestClearStuckInvites_f:localhost"
34+
roomG := "!TestClearStuckInvites_g:localhost"
35+
alice := "@TestClearStuckInvites_alice:localhost"
36+
bob := "@TestClearStuckInvites_bob:localhost"
37+
charlie := "@TestClearStuckInvites_charlie:localhost"
38+
doris := "@TestClearStuckInvites_doris:localhost"
39+
users := []string{
40+
alice, bob, charlie, doris,
41+
}
42+
43+
// Test cases:
44+
// Room | In Invite Table? | In Unread Table? | In Room Table? | Comment
45+
// A Y Y Y OK, Genuine invite, proxy in room
46+
// B Y Y N BAD, Stuck invite, proxy never saw join
47+
// C Y N Y OK, Genuine invite, proxy in room, no unread counts (unusual but valid)
48+
// D Y N N OK, Genuine invite, proxy not in room
49+
// E N Y Y OK, Genuine joined room
50+
// F N Y N BAD, Stuck joined room, proxy never saw join
51+
// G N N Y OK, Genuine joined room, no unread counts (unusual but valid)
52+
// - N N N Impossible, room id isn't in any table!
53+
roomToInfo := map[string]struct {
54+
invitedUser string
55+
unreadUser string
56+
inRoomTable bool
57+
}{
58+
roomA: {
59+
invitedUser: alice,
60+
unreadUser: bob,
61+
inRoomTable: true,
62+
},
63+
roomB: {
64+
invitedUser: bob,
65+
unreadUser: bob,
66+
inRoomTable: false,
67+
},
68+
roomC: {
69+
invitedUser: charlie,
70+
inRoomTable: true,
71+
},
72+
roomD: {
73+
invitedUser: doris,
74+
inRoomTable: false,
75+
},
76+
roomE: {
77+
unreadUser: alice,
78+
inRoomTable: true,
79+
},
80+
roomF: {
81+
unreadUser: doris,
82+
},
83+
roomG: {
84+
inRoomTable: true,
85+
},
86+
}
87+
88+
err := sqlutil.WithTransaction(db, func(txn *sqlx.Tx) error {
89+
for roomID, info := range roomToInfo {
90+
if info.inRoomTable {
91+
err := roomsTable.Upsert(txn, state.RoomInfo{
92+
ID: roomID,
93+
}, 0, 0)
94+
if err != nil {
95+
return fmt.Errorf("Upsert room: %s", err)
96+
}
97+
}
98+
if info.invitedUser != "" {
99+
err := inviteTable.InsertInvite(info.invitedUser, roomID, []json.RawMessage{json.RawMessage(`{}`)})
100+
if err != nil {
101+
return fmt.Errorf("InsertInvite: %s", err)
102+
}
103+
}
104+
if info.unreadUser != "" {
105+
err := unreadTable.UpdateUnreadCounters(info.unreadUser, roomID, &zero, &zero)
106+
if err != nil {
107+
return fmt.Errorf("UpdateUnreadCounters: %s", err)
108+
}
109+
}
110+
}
111+
for _, userID := range users {
112+
for _, deviceID := range []string{device1, device2} {
113+
// each user has 2 devices
114+
if err := deviceTable.InsertDevice(txn, userID, deviceID); err != nil {
115+
return fmt.Errorf("InsertDevice: %s", err)
116+
}
117+
_, err := tokensTable.Insert(txn, userID+deviceID, userID, deviceID, time.Now())
118+
if err != nil {
119+
return fmt.Errorf("TokensTable.Insert: %s", err)
120+
}
121+
}
122+
}
123+
return nil
124+
})
125+
if err != nil {
126+
t.Fatalf("failed to set up test configuration: %s", err)
127+
}
128+
// set since tokens (this is done without a txn hence cannot be bundled in as the UPDATE would fail)
129+
for _, userID := range users {
130+
for i, deviceID := range []string{device1, device2} {
131+
// each user has 2 devices
132+
since := fmt.Sprintf("since_%d", i)
133+
if err := deviceTable.UpdateDeviceSince(userID, deviceID, since); err != nil {
134+
t.Fatalf("UpdateDeviceSince: %s", err)
135+
}
136+
}
137+
}
138+
139+
t.Log("Run the migration.")
140+
tx, err := db.Beginx()
141+
if err != nil {
142+
t.Fatal(err)
143+
}
144+
if err := upClearStuckInvites(context.Background(), tx.Tx); err != nil {
145+
t.Fatalf("upClearStuckInvites: %s", err)
146+
}
147+
tx.Commit()
148+
149+
// make a new txn for assertions
150+
tx, err = db.Beginx()
151+
if err != nil {
152+
t.Fatal(err)
153+
}
154+
155+
// users in room B (bob) and F (doris) should be reset.
156+
tokens, err := tokensTable.TokenForEachDevice(tx)
157+
if err != nil {
158+
t.Fatalf("TokenForEachDevice: %s", err)
159+
}
160+
wantResults := map[[2]string]struct {
161+
wantSinceReset bool
162+
}{
163+
{bob, device1}: {
164+
wantSinceReset: true,
165+
},
166+
{bob, device2}: {
167+
wantSinceReset: true,
168+
},
169+
{doris, device1}: {
170+
wantSinceReset: true,
171+
},
172+
{doris, device2}: {
173+
wantSinceReset: true,
174+
},
175+
// everyone else should NOT have since reset
176+
{alice, device1}: {
177+
wantSinceReset: false,
178+
},
179+
{alice, device2}: {
180+
wantSinceReset: false,
181+
},
182+
{charlie, device1}: {
183+
wantSinceReset: false,
184+
},
185+
{charlie, device2}: {
186+
wantSinceReset: false,
187+
},
188+
}
189+
for _, tok := range tokens {
190+
key := [2]string{tok.UserID, tok.DeviceID}
191+
want, ok := wantResults[key]
192+
if !ok {
193+
continue // different user in another test?
194+
}
195+
if want.wantSinceReset && tok.Since != "" {
196+
t.Errorf("%s want since reset, got %+v", key, tok)
197+
}
198+
if !want.wantSinceReset && tok.Since == "" {
199+
t.Errorf("%s did not want since reset, got %+v", key, tok)
200+
}
201+
}
202+
// invites for Bob and Doris are gone
203+
for _, userID := range []string{bob, doris} {
204+
got, err := inviteTable.SelectAllInvitesForUser(userID)
205+
if err != nil {
206+
t.Fatalf("SelectAllInvitesForUser: %s", err)
207+
}
208+
if len(got) > 0 {
209+
t.Fatalf("SelectAllInvitesForUser got invites for %s, wanted none: %+v", userID, got)
210+
}
211+
}
212+
// ensure other invites remain
213+
wantInvites := map[string][]string{
214+
alice: {roomA},
215+
charlie: {roomC},
216+
}
217+
for userID, wantInvitesRooms := range wantInvites {
218+
got, err := inviteTable.SelectAllInvitesForUser(userID)
219+
if err != nil {
220+
t.Fatalf("SelectAllInvitesForUser: %s", err)
221+
}
222+
if len(got) != len(wantInvitesRooms) {
223+
t.Fatalf("SelectAllInvitesForUser got %d invites for %s, wanted %d", len(got), userID, len(wantInvitesRooms))
224+
}
225+
for _, wantRoom := range wantInvitesRooms {
226+
_, exists := got[wantRoom]
227+
if !exists {
228+
t.Fatalf("SelectAllInvitesForUser wanted invite for %s in %s, but it's missing", userID, wantRoom)
229+
}
230+
}
231+
}
232+
}

0 commit comments

Comments
 (0)