Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions conf/vendor/agent.yml
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,26 @@ cache:
Password to connect to protected Redis server. When this parameter is
not defined, Redis server is accessed without password.
ex: SECR3T
cluster_mode:
type: bool
default: false
doc: |
Enable Redis cluster mode for high-availability caching.
When enabled, the agent connects to a Redis cluster instead of
a standalone instance, providing automatic failover and load distribution.
Requires cluster_nodes to be specified.
ex: yes
cluster_nodes:
type: list
content: str
doc: |
List of Redis cluster node addresses in format host:port.
Only used when cluster_mode is enabled.
Minimum 3 nodes recommended for production HA clusters.
ex:
- "10.0.0.1:6379"
- "10.0.0.2:6379"
- "10.0.0.3:6379"
version:
type: int
default: 1800
Expand Down
2 changes: 2 additions & 0 deletions slurmweb/apps/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ def __init__(self, seed):
host=self.settings.cache.host,
port=self.settings.cache.port,
password=self.settings.cache.password,
cluster_mode=getattr(self.settings.cache, 'cluster_mode', False),
cluster_nodes=getattr(self.settings.cache, 'cluster_nodes', None),
)
else:
logger.warning("Caching is disabled")
Expand Down
59 changes: 57 additions & 2 deletions slurmweb/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import logging

import redis
from redis.cluster import RedisCluster, ClusterNode
import pickle

from .errors import SlurmwebCacheError
Expand All @@ -31,10 +32,64 @@ class CachingService:
KEY_PREFIX_MISS = "cache-miss-"
KEY_PREFIX_HIT = "cache-hit-"

def __init__(
    self,
    host: str,
    port: int,
    password: t.Union[str, None],
    cluster_mode: bool = False,
    cluster_nodes: t.Optional[t.List[str]] = None,
):
    """Initialize the Redis connection (standalone or cluster mode).

    The connection is validated with a PING before the constructor
    returns, so that an unreachable server is reported at start-up
    rather than on first cache access.

    Args:
        host: Redis server hostname (used in standalone mode).
        port: Redis server port (used in standalone mode).
        password: Redis password, or None to connect without password
            (used in both modes).
        cluster_mode: Connect to a Redis cluster instead of a standalone
            server (default: False).
        cluster_nodes: Cluster start-up nodes in "host:port" format,
            e.g. ["10.0.0.1:6379", "10.0.0.2:6379"]. IPv6 literals are
            accepted, optionally bracketed ("[::1]:6379"). Required when
            cluster_mode is True, ignored otherwise.

    Raises:
        ValueError: cluster_mode is True and cluster_nodes is empty or
            contains an entry that is not in "host:port" format.
        redis.exceptions.RedisError: the server cannot be reached or
            refuses the connection (logged, then re-raised unchanged).
        redis.exceptions.RedisClusterException: the cluster client cannot
            be initialized (logged, then re-raised unchanged).
    """
    self.host = host
    self.port = port
    self.cluster_mode = cluster_mode

    startup_nodes = []
    if cluster_mode:
        if not cluster_nodes:
            raise ValueError(
                "cluster_nodes must be provided when cluster_mode=True"
            )

        for node in cluster_nodes:
            # Split on the LAST colon so that IPv6 literals, which contain
            # colons themselves, keep their full address as the host part.
            node_host, separator, node_port = node.rpartition(":")
            if node_host.startswith("[") and node_host.endswith("]"):
                # Bracketed IPv6 notation: drop the brackets for the client.
                node_host = node_host[1:-1]
            try:
                if not separator or not node_host:
                    raise ValueError("missing host or port")
                startup_nodes.append(ClusterNode(node_host, int(node_port)))
            except ValueError as error:
                raise ValueError(
                    "Invalid cluster node {!r}, expected format "
                    "host:port".format(node)
                ) from error

    # Client creation is inside the try block because the cluster client
    # contacts its start-up nodes as soon as it is constructed; failures at
    # that stage must be logged the same way as a failed PING.
    try:
        if cluster_mode:
            logger.info(
                "Initializing Redis cluster connection with %d nodes",
                len(startup_nodes),
            )
            # NOTE(review): confirm skip_full_coverage_check is supported by
            # the redis-py version in use; recent releases expose
            # require_full_coverage instead.
            self.connection = RedisCluster(
                startup_nodes=startup_nodes,
                password=password,
                decode_responses=False,  # Binary mode for pickle
                skip_full_coverage_check=True,  # Allow partial clusters
            )
        else:
            logger.info(
                "Initializing Redis standalone connection to %s:%d", host, port
            )
            self.connection = redis.Redis(
                host=host, port=port, password=password
            )

        # Validate connection at initialization (fail-fast)
        self.connection.ping()
    except (
        redis.exceptions.RedisError,
        redis.exceptions.RedisClusterException,
    ) as error:
        logger.error("Failed to connect to Redis: %s", error)
        raise
    logger.info("Redis connection established successfully")

def put(self, key: CacheKey, value: t.Any, expiration: int):
try:
Expand Down
Loading