Skip to content

Commit 51a47b0

Browse files
authored
Stuck watch fix (#1440)
* Refactor code structure for improved readability and maintainability, update @kubernetes/client-node to 1.2.0 * Enhance K8sWatcher to include cluster ARM ID and region in CR watching methods; update LoggerWrapper events for better tracking of watch completion scenarios. * Improve K8sWatcher timeout handling by adding a manual timeout for the cases when the watch hangs * Update K8sWatcher to handle manual timeout for watch hangs; upgrade @kubernetes/client-node to 1.2.0 * Fix comment formatting in Events enum for clarity
1 parent 62b0e7e commit 51a47b0

File tree

6 files changed

+277
-434
lines changed

6 files changed

+277
-434
lines changed

appmonitoring/ts/src/.trivyignore

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,3 @@
1-
# tough-cookie -> request -> kubernetes/client-node, blocked on kubernetes/client-node's migration from request to fetch
2-
CVE-2023-26136
3-
CVE-2024-28863
4-
5-
# ws -> kubernetes/client-node
6-
CVE-2024-37890
7-
8-
9-
# @kubernetes/client-node@1.0.0
10-
# │ └─┬ tar@7.4.3
11-
# │ └─┬ minizlib@3.0.1
12-
# │ └─┬ rimraf@5.0.10
13-
# │ └─┬ glob@10.4.5
14-
# │ └─┬ foreground-child@3.3.0
15-
# │ └── cross-spawn@7.0.3 deduped
16-
CVE-2024-21538
17-
181
# ┬ @kubernetes/client-node@1.0.0
192
# └── jsonpath-plus@10.2.0
20-
CVE-2025-1302
3+
#CVE-2025-1302

appmonitoring/ts/src/K8sWatcher.ts

Lines changed: 48 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { HeartbeatLogs, HeartbeatMetrics, RequestMetadata, Watchdogs, logger } from "./LoggerWrapper.js";
1+
import { Events, HeartbeatLogs, HeartbeatMetrics, RequestMetadata, Watchdogs, logger } from "./LoggerWrapper.js";
22
import * as k8s from "@kubernetes/client-node";
33
import { InstrumentationCR, ListResponse } from "./RequestDefinition.js"
44
import { InstrumentationCRsCollection } from "./InstrumentationCRsCollection.js";
@@ -11,7 +11,7 @@ export class K8sWatcher {
1111

1212
private static lastSuccessfulListTimestamp: Date = new Date();
1313

14-
public static async StartWatchingCRs(crs: InstrumentationCRsCollection, onNewCR: (cr: InstrumentationCR, isRemoved: boolean) => void, onResetCRs: (crs: InstrumentationCR[]) => void, operationId: string): Promise<void> {
14+
public static async StartWatchingCRs(crs: InstrumentationCRsCollection, onNewCR: (cr: InstrumentationCR, isRemoved: boolean) => void, onResetCRs: (crs: InstrumentationCR[]) => void, operationId: string, clusterArmId: string, clusterArmRegion: string): Promise<void> {
1515
const kc = new k8s.KubeConfig();
1616
kc.loadFromDefault();
1717

@@ -23,7 +23,7 @@ export class K8sWatcher {
2323
let latestResourceVersion: string = null;
2424
while (true) { // eslint-disable-line
2525
try {
26-
latestResourceVersion = await K8sWatcher.WatchCRs(k8sApi, watch, latestResourceVersion, crs, operationId, onNewCR, onResetCRs);
26+
latestResourceVersion = await K8sWatcher.WatchCRs(k8sApi, watch, latestResourceVersion, crs, operationId, onNewCR, onResetCRs, clusterArmId, clusterArmRegion);
2727
} catch (e) {
2828
// either the list call or the watch call failed
2929
const ex = logger.sanitizeException(e);
@@ -40,12 +40,14 @@ export class K8sWatcher {
4040
}
4141

4242
// pause for a bit to avoid generating too much load in case of cascading failures
43-
await new Promise(r => setTimeout(r, 5000));
43+
// randomize the wait time to avoid all pods of the deployment hitting the API server at the same time
44+
// if this is caused by networking or node issues and happens for all pods at the same time
45+
await new Promise(r => setTimeout(r, Math.random() * 5000.0));
4446
}
4547
}
4648
}
4749

48-
private static async WatchCRs(k8sApi: k8s.CustomObjectsApi, watch: k8s.Watch, latestResourceVersion: string, crs: InstrumentationCRsCollection, operationId: string, onNewCR: (cr: InstrumentationCR, isRemoved: boolean) => void, onResetCRs: (crs: InstrumentationCR[]) => void): Promise<string> {
50+
private static async WatchCRs(k8sApi: k8s.CustomObjectsApi, watch: k8s.Watch, latestResourceVersion: string, crs: InstrumentationCRsCollection, operationId: string, onNewCR: (cr: InstrumentationCR, isRemoved: boolean) => void, onResetCRs: (crs: InstrumentationCR[]) => void, clusterArmId: string, clusterArmRegion: string): Promise<string> {
4951
let requestMetadata = new RequestMetadata("CR watcher", crs);
5052

5153
logger.info(`Listing CRs, resourceVersion=${latestResourceVersion}...`, operationId, requestMetadata);
@@ -56,7 +58,8 @@ export class K8sWatcher {
5658
group: K8sWatcher.crdApiGroup,
5759
version: K8sWatcher.crdApiVersion,
5860
plural: K8sWatcher.crdNamePlural,
59-
resourceVersion: latestResourceVersion ?? undefined
61+
resourceVersion: latestResourceVersion ?? undefined,
62+
timeoutSeconds: 30
6063
});
6164

6265
logger.addHeartbeatMetric(HeartbeatMetrics.CRsListCallSucceededCount, 1, "200");
@@ -79,6 +82,9 @@ export class K8sWatcher {
7982

8083
// watch() doesn't block (it starts the loop and returns immediately), so we can't just return the promise it returns to our caller
8184
// we must instead create our own promise and resolve it manually when the watch informs us that it stopped via a callback
85+
const watchTimeoutSeconds = 240;
86+
let doneCallbackInvoked = false;
87+
let watchAbortController: AbortController = null;
8288
const watchIsDonePromise: Promise<string> = new Promise((resolve, reject) => {
8389
try {
8490
// /api/v1/namespaces
@@ -87,7 +93,7 @@ export class K8sWatcher {
8793
{
8894
allowWatchBookmarks: true,
8995
resourceVersion: latestResourceVersion,
90-
timeoutSeconds: 240 // watch will be stopped after at most this many seconds
96+
timeoutSeconds: watchTimeoutSeconds // watch will be stopped after at most this many seconds
9197
//fieldSelector: fieldSelector
9298
},
9399
(type, apiObj) => {
@@ -115,7 +121,9 @@ export class K8sWatcher {
115121
}
116122
},
117123
err => { // watch is done callback
118-
logger.info("Watch has completed", operationId, requestMetadata);
124+
doneCallbackInvoked = true;
125+
126+
logger.info("Watch has completed via the done callback", operationId, requestMetadata);
119127
if (err != null) {
120128
// this indicates an issue with the watch encountered once the stream is opened
121129
// we want to handle it in the same way as an exception (which is triggered during opening of the stream)
@@ -131,7 +139,11 @@ export class K8sWatcher {
131139
logger.addHeartbeatMetric(HeartbeatMetrics.CRsWatchCallSucceededCount, 1, "200");
132140

133141
resolve(latestResourceVersion);
142+
}).then(abortController => {
143+
watchAbortController = abortController;
134144
});
145+
146+
logger.info("Watch call returned", operationId, requestMetadata);
135147
} catch (e) {
136148
logger.error(`Watch.watch() threw: ${JSON.stringify(e)}`, operationId, requestMetadata);
137149
logger.addHeartbeatMetric(HeartbeatMetrics.CRsWatchCallFailedCount, 1, e?.statusCode ?? 0);
@@ -141,7 +153,34 @@ export class K8sWatcher {
141153
}
142154
});
143155

144-
return watchIsDonePromise;
156+
// the watch has a bug where it may not invoke the callback upon connection being lost, and it will ignore the timeout in that case
157+
// we have to handle that with a manual timeout, and log if the manual timeout occurs before the watch reports completion via the callback
158+
const timeoutPromise = new Promise<string>((resolve, reject) => {
159+
setTimeout(() =>
160+
{
161+
if(doneCallbackInvoked) {
162+
// the watch completed normally, we don't need to do anything
163+
} else {
164+
// the watch hasn't invoked the callback within the timeout, so we need to step in
165+
logger.error(`Watch hung, manual timeout is used`, operationId, requestMetadata);
166+
// no await, no hurry
167+
logger.SendEvent(Events[Events.WatchHung], operationId, null, clusterArmId, clusterArmRegion, true);
168+
169+
// try to abort the watch, not sure if it's possible, best effort
170+
try {
171+
watchAbortController?.abort("Watch hung, manual timeout is used");
172+
} catch(e) {
173+
// swallow, the watch is already in a bad state, we don't care what happens here
174+
logger.error(`Failed to abort the watch: ${e}`, operationId, requestMetadata);
175+
}
176+
177+
reject(new Error(`Watch hung, manual timeout is used`));
178+
}
179+
},
180+
3 * watchTimeoutSeconds * 1000);
181+
});
182+
183+
return Promise.race([watchIsDonePromise, timeoutPromise]);
145184
}
146185

147186
private static IsExpectedIntermittentException(e: any) {

appmonitoring/ts/src/LoggerWrapper.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ export enum Events {
7373

7474
ArmIdIncorrect, // ARM ID of the cluster we have received is incorrect
7575
CertificateLoadFailure, // we have failed to load certificates
76+
WatchHung // watch.watch() has hung, so we had to use our own timeout
7677
}
7778

7879
export enum HeartbeatMetrics {

0 commit comments

Comments
 (0)