Skip to content

Commit 4ebc3bf

Browse files
authored
Replication issue diagnostics (#304)
* Add diagnostic message if no commit is performed on the active replication stream. * For postgres, ping every minute. * Add changeset. * Disable check for mysql for now.
1 parent 1aafdaf commit 4ebc3bf

File tree

3 files changed

+40
-2
lines changed

3 files changed

+40
-2
lines changed
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
'@powersync/service-core': minor
3+
'@powersync/service-image': minor
4+
---
5+
6+
Report lack of commits or keepalives as issues in the diagnostics api.

packages/service-core/src/api/diagnostics.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,38 @@ export async function getSyncRulesStatus(
134134
})
135135
);
136136

137+
if (live_status && status?.active && sourceConfig.type != 'mysql') {
138+
// Check replication lag for active sync rules.
139+
// Right now we exclude mysql, since we don't have consistent keepalives for it.
140+
if (sync_rules.last_checkpoint_ts == null && sync_rules.last_keepalive_ts == null) {
141+
errors.push({
142+
level: 'warning',
143+
message: 'No checkpoint found, cannot calculate replication lag'
144+
});
145+
} else {
146+
const lastTime = Math.max(
147+
sync_rules.last_checkpoint_ts?.getTime() ?? 0,
148+
sync_rules.last_keepalive_ts?.getTime() ?? 0
149+
);
150+
const lagSeconds = Math.round((Date.now() - lastTime) / 1000);
151+
// On idle instances, keepalive messages are only persisted every 60 seconds.
152+
// So we use 2 minutes as a threshold for warnings, and 15 minutes for fatal errors.
153+
// The replication lag metric should give a more granular value, but that is not available directly
154+
// in the API containers used for diagnostics, and this should give a good enough indication.
155+
if (lagSeconds > 15 * 60) {
156+
errors.push({
157+
level: 'fatal',
158+
message: `No replicated commit in more than ${lagSeconds}s`
159+
});
160+
} else if (lagSeconds > 120) {
161+
errors.push({
162+
level: 'warning',
163+
message: `No replicated commit in more than ${lagSeconds}s`
164+
});
165+
}
166+
}
167+
}
168+
137169
return {
138170
content: include_content ? sync_rules.sync_rules_content : undefined,
139171
connections: [

packages/service-core/src/replication/AbstractReplicator.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ import { AbstractReplicationJob } from './AbstractReplicationJob.js';
1010
import { ErrorRateLimiter } from './ErrorRateLimiter.js';
1111
import { ConnectionTestResult } from './ReplicationModule.js';
1212

13-
// 5 minutes
14-
const PING_INTERVAL = 1_000_000_000n * 300n;
13+
// 1 minute
14+
const PING_INTERVAL = 1_000_000_000n * 60n;
1515

1616
export interface CreateJobOptions {
1717
lock: storage.ReplicationLock;

0 commit comments

Comments
 (0)