Skip to content

Commit bf9c242

Browse files
feat: restart lock handling (#3)
* feat(lock): add InMemory lock handling * feat: add restart lock handling * feat(lock): add lock for previously locked resources * chore(k8s): improve error handling and parsing KindNamespaceName from string * feat(k8s): add pod status check * feat(lock): add IsLock func to Lock * refactor(api): api schema for status and lock handling * fix(lock): in mem lock tests * chore(lock): removed unused function GetLocks
1 parent 9370928 commit bf9c242

File tree

13 files changed

+979
-102
lines changed

13 files changed

+979
-102
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ The configuration of the application is mostly done through environment variable
2020
| `CONFIG_FILE_PATH` | string | `config.yaml` | The path to the configuration file. |
2121
| `KUBE_CONFIG_PATH` | string | `` | The path to the kubeconfig file. If not specified, the application tries to use the in-cluster config. |
2222
| `WATCH_INTERVAL` | int | `10` | The interval in seconds the application watches for pod, deployment or statefulset changes |
23+
| `FORCE_UNLOCK_SEC` | int | `300` | The time in seconds a restart can take before the lock is force released. |
2324

2425
In order to provide a list of services that should be allowed to be restarted, a configuration file must be provided. In that file, the services are defined as follows:
2526

@@ -73,8 +74,8 @@ The application provides a simple API to restart services. The following endpoin
7374
| `/` | GET | Returns the HTML control page. |
7475
| `/metrics` | GET | Returns the Prometheus metrics. |
7576
| `/api/v1/service` | GET | Returns a list of services that can be restarted. |
77+
| `/api/v1/service/status` | GET | Returns the status of the service with the given kind, namespace and name. As websocket stream. |
7678
| `/api/v1/service/{kind}/{namespace}/{name}/restart` | POST | Restarts the service with the given kind, namespace and name. |
77-
| `/api/v1/service/{kind}/{namespace}/{name}/status` | GET | Returns the status of the service with the given kind, namespace and name. As websocket stream. |
7879

7980
## Metrics
8081

cmd/main.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"github.com/k8scope/k8s-restart-app/internal/api"
1010
"github.com/k8scope/k8s-restart-app/internal/config"
1111
"github.com/k8scope/k8s-restart-app/internal/ledger"
12+
"github.com/k8scope/k8s-restart-app/internal/lock"
1213
"github.com/k8scope/k8s-restart-app/internal/utils"
1314
"github.com/prometheus/client_golang/prometheus/promhttp"
1415
"k8s.io/client-go/kubernetes"
@@ -21,9 +22,12 @@ var (
2122
envConfigFilePath = utils.StringEnvOrDefault("CONFIG_FILE_PATH", "config.yaml")
2223
envKubeConfigPath = utils.StringEnvOrDefault("KUBE_CONFIG_PATH", "")
2324
envWatchInterval = utils.IntEnvOrDefault("WATCH_INTERVAL", 10)
25+
envForceUnlockSec = utils.IntEnvOrDefault("FORCE_UNLOCK_SEC", 300)
2426

2527
// non env variables
2628
k8sClient *kubernetes.Clientset
29+
// lock handling
30+
lockH *lock.Lock = lock.NewLock(lock.NewInMem(), envForceUnlockSec)
2731

2832
appConfig *config.Config
2933

@@ -63,8 +67,12 @@ func init() {
6367
}
6468
appConfig = cfg
6569

66-
// setup ledger
67-
ldgr = ledger.New(k8sClient, envWatchInterval)
70+
// setup ledger and watch apps
71+
ldgr = ledger.New(k8sClient, lockH, envWatchInterval)
72+
for _, app := range appConfig.Services {
73+
ldgr.Watch(app)
74+
}
75+
6876
}
6977

7078
func main() {
@@ -77,10 +85,10 @@ func main() {
7785
rt.Route("/api/v1", func(r chi.Router) {
7886
r.Route("/service", func(r chi.Router) {
7987
r.Get("/", api.ListApplications(*appConfig))
88+
r.Get("/status", api.Status(ldgr))
8089
r.Route("/{kind}/{namespace}/{name}", func(r chi.Router) {
8190
r.Use(api.MiddlewareValidation(*appConfig))
82-
r.Post("/restart", api.Restart(k8sClient))
83-
r.Get("/status", api.Status(ldgr))
91+
r.Post("/restart", api.Restart(k8sClient, lockH))
8492
})
8593
})
8694
})

internal/api/index.go.html

Lines changed: 75 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -70,68 +70,93 @@ <h1>Service Dashboard</h1>
7070

7171
data.services.forEach(service => {
7272
const row = document.createElement('tr');
73-
const statusCellId = `status-${service.kind}-${service.name}-${service.namespace}`;
73+
const statusCellId = getStatusCellId(service.kind, service.namespace, service.name);
74+
const actionBtnID = getActionButtonID(service.kind, service.namespace, service.name);
7475

7576
row.innerHTML = `
7677
<td>${service.kind}</td>
7778
<td>${service.name}</td>
7879
<td>${service.namespace}</td>
7980
<td id="${statusCellId}">Loading...</td>
80-
<td><button onclick="restartService('${service.kind}', '${service.name}', '${service.namespace}')">Restart</button></td>
81+
<td><button id="${actionBtnID}" disabled="true" onclick="restartService('${service.kind}', '${service.name}', '${service.namespace}')">Restart</button></td>
8182
`;
8283
tableBody.appendChild(row);
83-
84-
// check if HTTP or HTTPS
85-
const wsProtocol = location.protocol === 'https:' ? 'wss' : 'ws';
86-
// Open a WebSocket connection for each service to update status
87-
const statusWebSocket = new WebSocket(`${wsProtocol}://${location.host}/api/v1/service/${service.kind}/${service.namespace}/${service.name}/status`);
88-
89-
// Handle WebSocket connection open event
90-
statusWebSocket.onopen = () => {
91-
console.log(`Connected to status WebSocket for ${service.name}`);
92-
};
93-
94-
// Update status field with WebSocket message
95-
statusWebSocket.onmessage = (event) => {
96-
try {
97-
const statusData = JSON.parse(event.data);
98-
const statusElement = document.getElementById(statusCellId);
99-
100-
// Construct a human-readable status message from pod_status map
101-
let statusMessage = '';
102-
for (const [status, count] of Object.entries(statusData.pod_status)) {
103-
statusMessage += `${status}: ${count} pods, `;
104-
}
105-
statusMessage = statusMessage.slice(0, -2); // Remove trailing comma and space
106-
107-
// Update the status element with the new message
108-
if (statusElement) {
109-
statusElement.textContent = statusMessage || 'No status available';
110-
console.log(`Status for ${service.name} updated to: ${statusMessage}`);
111-
} else {
112-
console.warn(`Status element missing for service ${service.name}.`);
113-
}
114-
} catch (error) {
115-
console.error(`Failed to parse WebSocket message for ${service.name}: `, error);
116-
}
117-
};
118-
119-
// Handle WebSocket connection error
120-
statusWebSocket.onerror = (error) => {
121-
console.error(`WebSocket error for ${service.name}: `, error);
122-
document.getElementById(statusCellId).textContent = 'Error';
123-
};
124-
125-
// Handle WebSocket connection close
126-
statusWebSocket.onclose = () => {
127-
console.log(`WebSocket closed for ${service.name}`);
128-
};
12984
});
13085
} catch (error) {
13186
console.error('Failed to load services:', error);
13287
}
13388
}
13489

90+
function getStatusCellId(kind, name, namespace) {
91+
return `status-${kind}-${name}-${namespace}`;
92+
}
93+
94+
function getActionButtonID(kind, name, namespace) {
95+
return `btn-action-${kind}-${name}-${namespace}`;
96+
}
97+
98+
async function getServiceStatus() {
99+
try {
100+
// check if HTTP or HTTPS
101+
const wsProtocol = location.protocol === 'https:' ? 'wss' : 'ws';
102+
// Open a WebSocket connection for each service to update status
103+
const statusWebSocket = new WebSocket(`${wsProtocol}://${location.host}/api/v1/service/status`);
104+
105+
// Handle WebSocket connection open event
106+
statusWebSocket.onopen = () => {
107+
console.log(`Connected to status WebSocket`);
108+
};
109+
110+
// Update status field with WebSocket message
111+
statusWebSocket.onmessage = (event) => {
112+
try {
113+
const statusData = JSON.parse(event.data);
114+
kind = statusData.kind_namespace_name.kind;
115+
namespace = statusData.kind_namespace_name.namespace;
116+
name = statusData.kind_namespace_name.name;
117+
const statusElement = document.getElementById(getStatusCellId(kind, namespace, name));
118+
119+
// Construct a human-readable status message from pod_status map
120+
let statusMessage = '';
121+
for (const [status, count] of Object.entries(statusData.status.pod_status)) {
122+
statusMessage += `${status}: ${count} pods, `;
123+
}
124+
statusMessage = statusMessage.slice(0, -2); // Remove trailing comma and space
125+
126+
// Update the status element with the new message
127+
if (statusElement) {
128+
statusElement.textContent = statusMessage || 'No status available';
129+
} else {
130+
console.warn(`Status element missing for service ${kind}/${namespace}/${name}.`);
131+
}
132+
133+
// lock or unlock the restart button based on the is_locked field
134+
const actionButton = document.getElementById(getActionButtonID(kind, namespace, name));
135+
if (actionButton) {
136+
actionButton.disabled = statusData.is_locked;
137+
} else {
138+
console.warn(`Action button missing for service ${kind}/${namespace}/${name}.`);
139+
}
140+
141+
} catch (error) {
142+
console.error(`Failed to parse WebSocket message for ${kind}/${namespace}/${name}: `, error);
143+
}
144+
};
145+
146+
// Handle WebSocket connection error
147+
statusWebSocket.onerror = (error) => {
148+
console.error(`WebSocket error: `, error);
149+
};
150+
151+
// Handle WebSocket connection close
152+
statusWebSocket.onclose = () => {
153+
console.log(`WebSocket connection closed`);
154+
};
155+
} catch (error) {
156+
console.error(`Failed to connect to status WebSocket: `, error);
157+
}
158+
}
159+
135160
// Function to restart a specific service
136161
async function restartService(kind, name, namespace) {
137162
try {
@@ -147,6 +172,8 @@ <h1>Service Dashboard</h1>
147172
}
148173
}
149174

175+
// Fetch status for each service
176+
getServiceStatus();
150177
// Load services on page load
151178
window.onload = loadServices;
152179
</script>

internal/api/restart.go

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package api
22

33
import (
44
"encoding/json"
5+
"errors"
56
"log/slog"
67
"net/http"
78

@@ -10,16 +11,17 @@ import (
1011
"github.com/k8scope/k8s-restart-app/internal/config"
1112
"github.com/k8scope/k8s-restart-app/internal/k8s"
1213
"github.com/k8scope/k8s-restart-app/internal/ledger"
14+
"github.com/k8scope/k8s-restart-app/internal/lock"
1315
"github.com/prometheus/client_golang/prometheus"
1416
"github.com/prometheus/client_golang/prometheus/promauto"
1517
"k8s.io/client-go/kubernetes"
1618
)
1719

1820
var (
19-
metricGaugeConnectedWatchers = promauto.NewGaugeVec(prometheus.GaugeOpts{
21+
metricGaugeConnectedWatchers = promauto.NewGauge(prometheus.GaugeOpts{
2022
Name: "restart_app_connected_status_watchers",
2123
Help: "The number of connected status watchers",
22-
}, []string{"kind", "namespace", "name"})
24+
})
2325
metricCountRestarts = promauto.NewCounterVec(prometheus.CounterOpts{
2426
Name: "restart_app_restarts_total",
2527
Help: "The total number of restarts",
@@ -72,11 +74,15 @@ func MiddlewareValidation(config config.Config) func(http.Handler) http.Handler
7274
}
7375
}
7476

75-
func Restart(client *kubernetes.Clientset) func(w http.ResponseWriter, r *http.Request) {
77+
func Restart(client *kubernetes.Clientset, lck *lock.Lock) func(w http.ResponseWriter, r *http.Request) {
7678
return func(w http.ResponseWriter, r *http.Request) {
7779
kindNamespaceName := getKindNamespaceNameFromRequest(r)
7880
metricCountRestarts.WithLabelValues(kindNamespaceName.Kind, kindNamespaceName.Namespace, kindNamespaceName.Name).Inc()
79-
err := k8s.RestartService(r.Context(), client, kindNamespaceName)
81+
err := k8s.RestartService(r.Context(), client, lck, kindNamespaceName)
82+
if errors.Is(err, lock.ErrResourceLocked) {
83+
http.Error(w, err.Error(), http.StatusLocked)
84+
return
85+
}
8086
if err != nil {
8187
metricCountRestartsFailed.WithLabelValues(kindNamespaceName.Kind, kindNamespaceName.Namespace, kindNamespaceName.Name).Inc()
8288
http.Error(w, err.Error(), http.StatusInternalServerError)
@@ -103,13 +109,12 @@ func Status(ledger *ledger.Ledger) func(w http.ResponseWriter, r *http.Request)
103109
}
104110
return func(w http.ResponseWriter, r *http.Request) {
105111
ctx := r.Context()
106-
kindNamespaceName := getKindNamespaceNameFromRequest(r)
107112

108-
metricGaugeConnectedWatchers.WithLabelValues(kindNamespaceName.Kind, kindNamespaceName.Namespace, kindNamespaceName.Name).Inc()
109-
defer metricGaugeConnectedWatchers.WithLabelValues(kindNamespaceName.Kind, kindNamespaceName.Namespace, kindNamespaceName.Name).Dec()
113+
metricGaugeConnectedWatchers.Inc()
114+
defer metricGaugeConnectedWatchers.Dec()
110115

111116
// start listening for updates
112-
statusCh, unregister := ledger.Register(kindNamespaceName)
117+
statusCh, unregister := ledger.Register()
113118
// when the client disconnects, we stop listening for updates and unregister the client
114119
defer unregister() //nolint:errcheck
115120

internal/k8s/restart.go

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,31 +3,71 @@ package k8s
33
import (
44
"context"
55
"fmt"
6+
"strings"
67
"time"
78

9+
"github.com/k8scope/k8s-restart-app/internal/lock"
810
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
911
"k8s.io/apimachinery/pkg/types"
1012
"k8s.io/client-go/kubernetes"
1113
)
1214

15+
var (
16+
ErrInvalidKindNamespaceNameFormat = fmt.Errorf("invalid format")
17+
ErrInvalidKind = fmt.Errorf("invalid kind")
18+
)
19+
1320
type KindNamespaceName struct {
1421
Kind string `json:"kind"`
1522
Name string `json:"name"`
1623
Namespace string `json:"namespace"`
1724
}
1825

26+
// KindNamespaceNameFromString parses a string into a KindNamespaceName
27+
// The string should be in the format of "Kind/Namespace/Name"
28+
//
29+
// Example:
30+
//
31+
// KindNamespaceNameFromString("Deployment/my-namespace/my-deployment")
32+
//
33+
// This will return a KindNamespaceName with Kind: Deployment, Namespace: my-namespace, Name: my-deployment
34+
func KindNamespaceNameFromString(s string) (*KindNamespaceName, error) {
35+
segment := strings.Split(s, "/")
36+
if len(segment) != 3 {
37+
return nil, fmt.Errorf("%w: %s", ErrInvalidKindNamespaceNameFormat, s)
38+
}
39+
if segment[0] != "Deployment" && segment[0] != "StatefulSet" {
40+
return nil, fmt.Errorf("%w: %s", ErrInvalidKind, segment[0])
41+
}
42+
return &KindNamespaceName{
43+
Kind: segment[0],
44+
Namespace: segment[1],
45+
Name: segment[2],
46+
}, nil
47+
}
48+
1949
func (s KindNamespaceName) String() string {
2050
return s.Kind + "/" + s.Namespace + "/" + s.Name
2151
}
2252

23-
func RestartService(ctx context.Context, clientset *kubernetes.Clientset, service KindNamespaceName) error {
53+
func RestartService(ctx context.Context, clientset *kubernetes.Clientset, lock *lock.Lock, service KindNamespaceName) error {
2454
switch service.Kind {
2555
case "Deployment":
56+
err := lock.Lock(service.String())
57+
if err != nil {
58+
// we don't want to unlock the lock here, because we want to keep the lock until the service is restarted
59+
return err
60+
}
2661
return restartDeployment(ctx, clientset, service)
2762
case "StatefulSet":
63+
err := lock.Lock(service.String())
64+
if err != nil {
65+
// we don't want to unlock the lock here, because we want to keep the lock until the service is restarted
66+
return err
67+
}
2868
return restartStatefulSet(ctx, clientset, service)
2969
default:
30-
return fmt.Errorf("invalid service kind: %s", service.Kind)
70+
return fmt.Errorf("%w: %s", ErrInvalidKind, service.Kind)
3171
}
3272
}
3373

0 commit comments

Comments
 (0)