Skip to content

Commit c5bd1c3

Browse files
committed
add tcp keepalive and option to disable websocket keepalive
1 parent a005f90 commit c5bd1c3

File tree

2 files changed

+43
-1
lines changed

2 files changed

+43
-1
lines changed

src/robusta/core/model/env_vars.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,15 @@ def load_bool(env_var, default: bool):
8282
# Timeout for the ping response, before killing the connection. Must be smaller than the interval
8383
WEBSOCKET_PING_TIMEOUT = int(os.environ.get("WEBSOCKET_PING_TIMEOUT", 30))
8484

85+
# TCP keepalive settings. Keepalive stays off unless WEBSOCKET_TCP_KEEPALIVE_ENABLED
# is set to "true" (compared case-insensitively); any other value leaves it disabled.
WEBSOCKET_TCP_KEEPALIVE_ENABLED = os.getenv("WEBSOCKET_TCP_KEEPALIVE_ENABLED", "false").lower() == "true"
# Seconds to wait before the first keepalive probe is sent (Linux: TCP_KEEPIDLE, macOS: TCP_KEEPALIVE)
WEBSOCKET_TCP_KEEPALIVE_IDLE = int(os.getenv("WEBSOCKET_TCP_KEEPALIVE_IDLE", 2))
# Seconds between consecutive keepalive probes
WEBSOCKET_TCP_KEEPALIVE_INTERVAL = int(os.getenv("WEBSOCKET_TCP_KEEPALIVE_INTERVAL", 2))
# How many probes may fail before the connection is considered dead
WEBSOCKET_TCP_KEEPALIVE_COUNT = int(os.getenv("WEBSOCKET_TCP_KEEPALIVE_COUNT", 5))
93+
8594
TRACE_INCOMING_REQUESTS = load_bool("TRACE_INCOMING_REQUESTS", False)
8695
TRACE_INCOMING_ALERTS = load_bool("TRACE_INCOMING_ALERTS", False)
8796

src/robusta/integrations/receiver.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import json
55
import logging
66
import os
7+
import socket
8+
import sys
79
import time
810
from concurrent.futures import ThreadPoolExecutor
911
from contextlib import nullcontext
@@ -24,6 +26,10 @@
2426
SENTRY_ENABLED,
2527
WEBSOCKET_PING_INTERVAL,
2628
WEBSOCKET_PING_TIMEOUT,
29+
WEBSOCKET_TCP_KEEPALIVE_COUNT,
30+
WEBSOCKET_TCP_KEEPALIVE_ENABLED,
31+
WEBSOCKET_TCP_KEEPALIVE_IDLE,
32+
WEBSOCKET_TCP_KEEPALIVE_INTERVAL, WEBSOCKET_APP_KEEPALIVE_ENABLED,
2733
)
2834
from robusta.core.playbooks.playbook_utils import to_safe_str
2935
from robusta.core.playbooks.playbooks_event_handler import PlaybooksEventHandler
@@ -42,6 +48,22 @@
4248
WEBSOCKET_THREADPOOL_SIZE = int(os.environ.get("WEBSOCKET_THREADPOOL_SIZE", 10))
4349

4450

51+
def _get_tcp_keepalive_options() -> tuple:
    """Return the socket options that turn on TCP keepalive, for run_forever(sockopt=...).

    Each entry is a ``(level, option, value)`` triple. The idle, interval and count
    values are taken from the ``WEBSOCKET_TCP_KEEPALIVE_*`` settings.
    """
    # macOS has no TCP_KEEPIDLE; its equivalent option is TCP_KEEPALIVE (0x10).
    idle_option = 0x10 if sys.platform == "darwin" else socket.TCP_KEEPIDLE

    enable_keepalive = (socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1)
    first_probe_delay = (socket.IPPROTO_TCP, idle_option, WEBSOCKET_TCP_KEEPALIVE_IDLE)
    probe_interval = (socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, WEBSOCKET_TCP_KEEPALIVE_INTERVAL)
    probe_count = (socket.IPPROTO_TCP, socket.TCP_KEEPCNT, WEBSOCKET_TCP_KEEPALIVE_COUNT)
    return (enable_keepalive, first_probe_delay, probe_interval, probe_count)
65+
66+
4567
class ValidationResponse(BaseModel):
4668
http_code: int = 200
4769
error_code: Optional[int] = None
@@ -114,11 +136,22 @@ def start_receiver(self):
114136

115137
def run_forever(self):
    """Run the relay websocket, reconnecting for as long as ``self.active`` is truthy.

    Every pass of the loop blocks inside ``self.ws.run_forever``. Once that call
    returns, the method sleeps ``INCOMING_WEBSOCKET_RECONNECT_DELAY_SEC`` seconds and
    re-checks ``self.active``. TCP keepalive socket options are passed only when
    ``WEBSOCKET_TCP_KEEPALIVE_ENABLED`` is set.
    """
    logging.info("starting relay receiver")

    if not WEBSOCKET_TCP_KEEPALIVE_ENABLED:
        tcp_sockopt = None
    else:
        tcp_sockopt = _get_tcp_keepalive_options()
        logging.info(
            f"TCP keepalive enabled: idle={WEBSOCKET_TCP_KEEPALIVE_IDLE}s, "
            f"interval={WEBSOCKET_TCP_KEEPALIVE_INTERVAL}s, count={WEBSOCKET_TCP_KEEPALIVE_COUNT}"
        )

    # Handles WEBSOCKET_PING_INTERVAL == 0: in that case no ping timeout is passed either.
    effective_ping_timeout = None
    if WEBSOCKET_PING_INTERVAL:
        effective_ping_timeout = WEBSOCKET_PING_TIMEOUT

    # These arguments are the same on every reconnect, so build them once.
    ws_options = dict(
        ping_interval=WEBSOCKET_PING_INTERVAL,
        ping_payload="p",
        ping_timeout=effective_ping_timeout,
        sockopt=tcp_sockopt,
    )

    while self.active:
        logging.info("relay websocket starting")
        self.ws.run_forever(**ws_options)
        logging.info("relay websocket closed")
        time.sleep(INCOMING_WEBSOCKET_RECONNECT_DELAY_SEC)

0 commit comments

Comments
 (0)