Skip to content

Commit a3f59ad

Browse files
committed
Merge main into dev-1.34 to keep in sync
2 parents b79bbd3 + 7d58002 commit a3f59ad

File tree

32 files changed

+1666
-508
lines changed

32 files changed

+1666
-508
lines changed

assets/js/banner-dismiss.js

Lines changed: 38 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,53 @@
1-
$(document).ready(function() {
2-
function setCookie(name, value, days) {
3-
let expires = "";
4-
let date = new Date(); // Create a new Date object
5-
let dateToSecond = 24 * 60 * 60 * 1000;
6-
7-
if (days) {
8-
date.setTime(date.getTime() + days * dateToSecond); // Modify the existing Date object
9-
expires = "; expires=" + date.toUTCString();
10-
}
11-
12-
document.cookie = name + "=" + value + expires + "; path=/";
13-
}
14-
15-
function getCookie(name) {
16-
let matches = document.cookie.match(new RegExp(
17-
"(?:^|; )" + name.replace(/([\.$?*|{}\(\)\[\]\\\/\+^])/g, '\\$1') + "=([^;]*)"
18-
));
19-
return matches ? "true" : undefined;
20-
}
21-
22-
/* Check the presence of a cookie */
23-
let announcement = document.querySelector("#announcement");
1+
$(document).ready(function() {
2+
function setCookie(name, value, days) {
3+
let expires = "";
4+
let date = new Date(); // Create a new Date object
5+
let dateToSecond = 24 * 60 * 60 * 1000;
6+
7+
if (days) {
8+
date.setTime(date.getTime() + days * dateToSecond); // Modify the existing Date object
9+
expires = "; expires=" + date.toUTCString();
10+
}
11+
12+
document.cookie = name + "=" + value + expires + "; path=/";
13+
}
14+
15+
function getCookie(name) {
16+
let matches = document.cookie.match(new RegExp(
17+
"(?:^|; )" + name.replace(/([\.$?*|{}\(\)\[\]\\\/\+^])/g, '\\$1') + "=([^;]*)"
18+
));
19+
return matches ? "true" : undefined;
20+
}
21+
22+
function getTokenName() {
23+
let announcement_name_rewritten = announcement.getAttribute('data-announcement-name').replace(/\s/g, '_');
24+
let token = 'announcement_ack_'+announcement_name_rewritten; // Generate the unique token for this announcement
25+
return token;
26+
}
27+
28+
/* Check the presence of a cookie */
29+
let announcement = document.querySelector("#announcement");
2430
if (announcement) {
25-
let token = `announcement_ack_${announcement.getAttribute('data-announcement-name').replace(/\s/g, '_')}`; // Generate the unique token for announcement
26-
let acknowledged = getCookie(token);
31+
let announcement_name_rewritten = announcement.getAttribute('data-announcement-name').replace(/\s/g, '_');
32+
let tokenName = getTokenName();
33+
let acknowledged = getCookie(tokenName);
2734
if (acknowledged === "true") {
2835
announcement.remove(); // Remove the announcement if the cookie is set
2936
}
3037
else {
3138
announcement.classList.add('display-announcement') // Display the announcement if the cookie is not set
3239
}
3340
}
34-
35-
/* Driver code to set the cookie */
36-
let button = document.querySelector('#banner-dismiss');
41+
42+
/* Driver code to set the cookie */
43+
let button = document.querySelector('#banner-dismiss');
3744
if (button) {
45+
let tokenName = getTokenName();
3846
button.removeAttribute('style');
3947
button.addEventListener('click', function() {
40-
setCookie(token, "true",
48+
setCookie(tokenName, "true",
4149
button.getAttribute('data-ttl')); // Set a cookie with time to live parameter
4250
announcement.remove();
4351
});
4452
}
45-
});
53+
});
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
---
2+
layout: blog
3+
title: "Introducing Headlamp AI Assistant"
4+
date: 2025-08-07T20:00:00+01:00
5+
slug: introducing-headlamp-ai-assistant
6+
author: >
7+
Joaquim Rocha (Microsoft)
8+
canonicalUrl: "https://headlamp.dev/blog/2025/08/07/introducing-the-headlamp-ai-assistant"
9+
---
10+
11+
_This announcement originally [appeared](https://headlamp.dev/blog/2025/08/07/introducing-the-headlamp-ai-assistant) on the Headlamp blog._
12+
13+
To simplify Kubernetes management and troubleshooting, we're thrilled to
14+
introduce [Headlamp AI Assistant](https://github.com/headlamp-k8s/plugins/tree/main/ai-assistant#readme): a powerful new plugin for Headlamp that helps
15+
you understand and operate your Kubernetes clusters and applications with
16+
greater clarity and ease.
17+
18+
Whether you're a seasoned engineer or just getting started, the AI Assistant offers:
19+
* **Fast time to value:** Ask questions like _"Is my application healthy?"_ or
20+
_"How can I fix this?"_ without needing deep Kubernetes knowledge.
21+
* **Deep insights:** Start with high-level queries and dig deeper with prompts
22+
like _"List all the problematic pods"_ or _"How can I fix this pod?"_
23+
* **Focused & relevant:** Ask questions in the context of what you're viewing
24+
in the UI, such as _"What's wrong here?"_
25+
* **Action-oriented:** Let the AI take action for you, like _"Restart that
26+
deployment"_, with your permission.
27+
28+
Here is a demo of the AI Assistant in action as it helps troubleshoot an
29+
application running with issues in a Kubernetes cluster:
30+
31+
{{< youtube id="GzXkUuCTcd4" title="Headlamp AI Assistant" class="youtube-quote-sm" >}}
32+
33+
## Hopping on the AI train
34+
35+
Large Language Models (LLMs) have transformed not just how we access data but
36+
also how we interact with it. The rise of tools like ChatGPT opened a world of
37+
possibilities, inspiring a wave of new applications. Asking questions or giving
38+
commands in natural language is intuitive, especially for users who aren't deeply
39+
technical. Now everyone can quickly ask how to do X or Y, without feeling awkward
40+
or having to traverse pages and pages of documentation like before.
41+
42+
Therefore, Headlamp AI Assistant brings a conversational UI to [Headlamp](https://headlamp.dev),
43+
powered by LLMs that Headlamp users can configure with their own API keys.
44+
It is available as a Headlamp plugin, making it easy to integrate into your
45+
existing setup. Users can enable it by installing the plugin and configuring
46+
it with their own LLM API keys, giving them control over which model powers
47+
the assistant. Once enabled, the assistant becomes part of the Headlamp UI,
48+
ready to respond to contextual queries and perform actions directly from the
49+
interface.
50+
51+
## Context is everything
52+
53+
As expected, the AI Assistant is focused on helping users with Kubernetes
54+
concepts. Yet, while there is a lot of value in responding to Kubernetes
55+
related questions from Headlamp's UI, we believe that the great benefit of such
56+
an integration is when it can use the context of what the user is experiencing
57+
in an application. So, the Headlamp AI Assistant knows what you're currently
58+
viewing in Headlamp, and this makes the interaction feel more like working
59+
with a human assistant.
60+
61+
For example, if a pod is failing, users can simply ask _"What's wrong here?"_
62+
and the AI Assistant will respond with the root cause, like a missing
63+
environment variable or a typo in the image name. Follow-up prompts like
64+
_"How can I fix this?"_ allow the AI Assistant to suggest a fix, streamlining
65+
what used to take multiple steps into a quick, conversational flow.
66+
67+
Sharing the context from Headlamp is not a trivial task though, so it's
68+
something we will keep working on perfecting.
69+
70+
## Tools
71+
72+
Context from the UI is helpful, but sometimes additional capabilities are
73+
needed. If the user is viewing the pod list and wants to identify problematic
74+
deployments, switching views should not be necessary. To address this, the AI
75+
Assistant includes support for a Kubernetes tool. This allows asking questions
76+
like "Get me all deployments with problems", prompting the assistant to fetch
77+
and display relevant data from the current cluster. Likewise, if the user
78+
requests an action like "Restart that deployment" after the AI points out what
79+
deployment needs restarting, it can also do that. In case of "write"
80+
operations, the AI Assistant does check with the user for permission to run them.
81+
82+
## AI Plugins
83+
84+
Although the initial version of the AI Assistant is already useful for
85+
Kubernetes users, future iterations will expand its capabilities. Currently,
86+
the assistant supports only the Kubernetes tool, but further integration with
87+
Headlamp plugins is underway. Similarly, we could get richer insights for
88+
GitOps via the Flux plugin, monitoring through Prometheus, package management
89+
with Helm, and more.
90+
91+
And of course, as the popularity of MCP grows, we are looking into how to
92+
integrate it as well, in a more plug-and-play fashion.
93+
94+
## Try it out!
95+
96+
We hope this first version of the AI Assistant helps users manage Kubernetes
97+
clusters more effectively and assists newcomers in navigating the learning
98+
curve. We invite you to try out this early version and give us your feedback.
99+
The AI Assistant plugin can be installed from Headlamp's Plugin Catalog in the
100+
desktop version, or by using the container image when deploying Headlamp.
101+
Stay tuned for the future versions of the Headlamp AI Assistant!
Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
---
2+
layout: blog
3+
title: "Kubernetes v1.34: Finer-Grained Control Over Container Restarts"
4+
date: 2025-0X-XX
5+
draft: true
6+
slug: kubernetes-v1-34-per-container-restart-policy
7+
author: >
8+
[Yuan Wang](https://github.com/yuanwang04)
9+
---
10+
11+
With the release of Kubernetes 1.34, a new alpha feature is introduced
12+
that gives you more granular control over container restarts within a Pod. This
13+
feature, named **Container Restart Policy and Rules**, allows you to specify a
14+
restart policy for each container individually, overriding the Pod's global
15+
restart policy. In addition, it also allows you to conditionally restart
16+
individual containers based on their exit codes. This feature is available
17+
behind the alpha feature gate `ContainerRestartRules`.
18+
19+
This has been a long-requested feature. Let's dive into how it works and how you
20+
can use it.
21+
22+
## The problem with a single restart policy
23+
24+
Before this feature, the `restartPolicy` was set at the Pod level. This meant
25+
that all containers in a Pod shared the same restart policy (`Always`,
26+
`OnFailure`, or `Never`). While this works for many use cases, it can be
27+
limiting in others.
28+
29+
For example, consider a Pod with a main application container and an init
30+
container that performs some initial setup. You might want the main container
31+
to always restart on failure, but the init container should only run once and
32+
never restart. With a single Pod-level restart policy, this wasn't possible.
33+
34+
## Introducing per-container restart policies
35+
36+
With the new `ContainerRestartRules` feature gate, you can now specify a
37+
`restartPolicy` for each container in your Pod's spec. You can also define
38+
`restartPolicyRules` to control restarts based on exit codes. This gives you
39+
the fine-grained control you need to handle complex scenarios.
40+
41+
## Use cases
42+
43+
Let's look at some real-life use cases where per-container restart policies can
44+
be beneficial.
45+
46+
### In-place restarts for training jobs
47+
48+
In ML research, it's common to orchestrate a large number of long-running AI/ML
49+
training workloads. In these scenarios, workload failures are unavoidable. When
50+
a workload fails with a retriable exit code, you want the container to restart
51+
quickly without rescheduling the entire Pod, which consumes a significant amount
52+
of time and resources. Restarting the failed container "in-place" is critical
53+
for better utilization of compute resources. The container should only restart
54+
"in-place" if it failed due to a retriable error; otherwise, the container and
55+
Pod should terminate and possibly be rescheduled.
56+
57+
This can now be achieved with container-level `restartPolicyRules`. The workload
58+
can exit with different codes to represent retriable and non-retriable errors.
59+
With `restartPolicyRules`, the workload can be restarted in-place quickly, but
60+
only when the error is retriable.
61+
62+
### Try-once init containers
63+
64+
Init containers are often used to perform initialization work for the main
65+
container, such as setting up environments and credentials. Sometimes, you want
66+
the main container to always be restarted, but you don't want to retry
67+
initialization if it fails.
68+
69+
With a container-level `restartPolicy`, this is now possible. The init container
70+
can be executed only once, and its failure would be considered a Pod failure. If
71+
the initialization succeeds, the main container can always be restarted.
72+
73+
### Pods with multiple containers
74+
75+
For Pods that run multiple containers, you might have different restart
76+
requirements for each container. Some containers might have a clear definition
77+
of success and should only be restarted on failure. Others might need to be
78+
always restarted.
79+
80+
This is now possible with a container-level `restartPolicy`, allowing individual
81+
containers to have different restart policies.
82+
83+
## How to use it
84+
85+
To use this new feature, you need to enable the `ContainerRestartRules` feature
86+
gate on your Kubernetes cluster control-plane and worker nodes running
87+
Kubernetes 1.34+. Once enabled, you can specify the `restartPolicy` and
88+
`restartPolicyRules` fields in your container definitions.
89+
90+
Here are some examples:
91+
92+
### Example 1: Restarting on specific exit codes
93+
94+
In this example, the container should restart if and only if it fails with a
95+
retriable error, represented by exit code 42.
96+
97+
To achieve this, the container has `restartPolicy: Never`, and a restart
98+
policy rule that tells Kubernetes to restart the container in-place if it exits
99+
with code 42.
100+
101+
```yaml
102+
apiVersion: v1
103+
kind: Pod
104+
metadata:
105+
name: restart-on-exit-codes
106+
annotations:
107+
kubernetes.io/description: "This Pod restarts the container only when it exits with code 42."
108+
spec:
109+
restartPolicy: Never
110+
containers:
111+
- name: restart-on-exit-codes
112+
image: docker.io/library/busybox:1.28
113+
command: ['sh', '-c', 'sleep 60 && exit 0']
114+
restartPolicy: Never # Container restart policy must be specified if rules are specified
115+
restartPolicyRules: # Only restart the container if it exits with code 42
116+
- action: Restart
117+
exitCodes:
118+
operator: In
119+
values: [42]
120+
```
121+
122+
### Example 2: A try-once init container
123+
124+
In this example, a Pod should always be restarted once the initialization succeeds.
125+
However, the initialization should only be tried once.
126+
127+
To achieve this, the Pod has an `Always` restart policy. The `init-once`
128+
init container will only try once. If it fails, the Pod will fail. This allows
129+
the Pod to fail if the initialization failed, but also keep running once the
130+
initialization succeeds.
131+
132+
```yaml
133+
apiVersion: v1
134+
kind: Pod
135+
metadata:
136+
name: fail-pod-if-init-fails
137+
annotations:
138+
kubernetes.io/description: "This Pod has an init container that runs only once. After initialization succeeds, the main container will always be restarted."
139+
spec:
140+
restartPolicy: Always
141+
initContainers:
142+
- name: init-once # This init container will only try once. If it fails, the Pod will fail.
143+
image: docker.io/library/busybox:1.28
144+
command: ['sh', '-c', 'echo "Failing initialization" && sleep 10 && exit 1']
145+
restartPolicy: Never
146+
containers:
147+
- name: main-container # This container will always be restarted once initialization succeeds.
148+
image: docker.io/library/busybox:1.28
149+
command: ['sh', '-c', 'sleep 1800 && exit 0']
150+
```
151+
152+
### Example 3: Containers with different restart policies
153+
154+
In this example, there are two containers with different restart requirements. One
155+
should always be restarted, while the other should only be restarted on failure.
156+
157+
This is achieved by using a different container-level `restartPolicy` on each of
158+
the two containers.
159+
```yaml
160+
apiVersion: v1
161+
kind: Pod
162+
metadata:
163+
name: on-failure-pod
164+
annotations:
165+
kubernetes.io/description: "This Pod has two containers with different restart policies."
166+
spec:
167+
containers:
168+
- name: restart-on-failure
169+
image: docker.io/library/busybox:1.28
170+
command: ['sh', '-c', 'echo "Not restarting after success" && sleep 10 && exit 0']
171+
restartPolicy: OnFailure
172+
- name: restart-always
173+
image: docker.io/library/busybox:1.28
174+
command: ['sh', '-c', 'echo "Always restarting" && sleep 1800 && exit 0']
175+
restartPolicy: Always
176+
```
177+
178+
## Learn more
179+
180+
- Read the documentation for
181+
[container restart policy](/docs/concepts/workloads/pod-lifecycle/#container-restart-rules).
182+
- Read the KEP for the
183+
[Container Restart Rules](https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/5307-container-restart-policy)
184+
185+
## Roadmap
186+
187+
More actions and signals to restart Pods and containers are coming! Notably,
188+
there are plans to add support for restarting the entire Pod. Planning and
189+
discussions on these features are in progress. Feel free to share feedback or
190+
requests with the SIG Node community!
191+
192+
## Your feedback is welcome!
193+
194+
This is an alpha feature, and the Kubernetes project would love to hear your feedback.
195+
Please try it out. This feature is driven by the
196+
[SIG Node](https://github.com/kubernetes/community/blob/master/sig-node/README.md).
197+
If you are interested in helping develop this feature, sharing feedback, or
198+
participating in any other ongoing SIG Node projects, please reach out to the
199+
SIG Node community!
200+
201+
You can reach SIG Node by several means:
202+
- Slack: [#sig-node](https://kubernetes.slack.com/messages/sig-node)
203+
- [Mailing list](https://groups.google.com/forum/#!forum/kubernetes-sig-node)
204+
- [Open Community Issues/PRs](https://github.com/kubernetes/community/labels/sig%2Fnode)

0 commit comments

Comments
 (0)