-
Notifications
You must be signed in to change notification settings - Fork 3
Open
Description
Thanks for shipping something generic for OOMs — really nice work.
I’ve built a small OOM tracer too and a couple of bits might be useful for K8s workflows:
I map the OOM to the right cgroup (and thus to the container/pod). I use oc->memcg when it is available and otherwise fall back to the victim’s cgroup:
// Kprobe on oom_kill_process(): copies the name of the cgroup associated with
// the OOM kill into e->cgroup_name. The snippet is abridged; `victim` and `e`
// are set up in the elided parts.
SEC("kprobe/oom_kill_process")
int BPF_KPROBE(kprobe__oom_kill_process, struct oom_control *oc) {
// ...
// Prefer the memcg recorded on the oom_control; when it is NULL, fall back to
// the victim task's own cgroup.
// NOTE(review): presumably oc->memcg is NULL for a global (non-memcg) OOM — confirm.
struct mem_cgroup *memcg = BPF_CORE_READ(oc, memcg);
// NOTE(review): the fallback reads subsys[0]; on cgroup v1 that slot may not
// be the memory controller's hierarchy — confirm this is the intended index.
const char *name = memcg
? BPF_CORE_READ(memcg, css.cgroup, kn, name)
: BPF_CORE_READ(victim, cgroups, subsys[0], cgroup, kn, name);
// Copy the kernfs node name (a string) into the event buffer, bounded by the
// size of e->cgroup_name.
bpf_core_read_str(&e->cgroup_name, sizeof(e->cgroup_name), name);
// ...
}To resolve the pod from the cgroup/container ID via the K8s API:
// ID carries the identifying metadata of the Kubernetes Pod that a cgroup
// name or container ID was resolved to.
type ID struct {
	// Namespace is the namespace of the resolved Pod.
	Namespace string
	// PodName is the name of the resolved Pod.
	PodName string
	// PodUID is the UID of the resolved Pod.
	PodUID types.UID
	// PodLabels holds the labels of the resolved Pod.
	PodLabels map[string]string
}
// Patterns used to pull identifiers out of a cgroup path or name.
var (
	// podPattern captures the UID portion of a "pod<uid>.slice" segment. The
	// captured UID may contain underscores, which callers convert to dashes
	// before comparing against a Pod's UID.
	podPattern = regexp.MustCompile(`pod([a-f0-9_]+)\.slice`)

	// cidPattern matches a run of 64 lowercase hexadecimal characters, the
	// form in which a container ID appears.
	cidPattern = regexp.MustCompile(`[a-f0-9]{64}`)
)
// LookupPod finds the pod by UID or container ID on the local node.
func (o *OOMTracer) LookupPod(pcid string) (*ID, error) {
pageFn := pager.SimplePageFunc(func(opts metav1.ListOptions) (runtime.Object, error) {
opts.FieldSelector = "spec.nodeName=" + o.nodeName
return o.CoreV1().Pods("").List(context.TODO(), opts)
})
p := pager.New(pageFn)
p.PageSize, p.PageBufferSize = o.pageSize, o.pageBufferSize
podUIDMatch := podPattern.FindStringSubmatch(pcid)
containerIDMatch := cidPattern.FindStringSubmatch(pcid)
if podUIDMatch == nil && containerIDMatch == nil {
return nil, fmt.Errorf("no matching container id or pod uid")
}
var id *ID
ctx := context.Background()
err := p.EachListItem(ctx, metav1.ListOptions{}, func(obj runtime.Object) error {
pod := obj.(*corev1.Pod)
if podUIDMatch != nil {
podUID := strings.ReplaceAll(podUIDMatch[1], "_", "-")
if string(pod.UID) == podUID {
id = &ID{Namespace: pod.Namespace, PodName: pod.Name, PodUID: pod.UID, PodLabels: pod.Labels}
}
return nil
}
for _, s := range pod.Status.ContainerStatuses {
live := cidPattern.FindStringSubmatch(s.ContainerID)
if len(live) > 0 && len(containerIDMatch) > 0 && live[0] == containerIDMatch[0] {
id = &ID{Namespace: pod.Namespace, PodName: pod.Name, PodUID: pod.UID, PodLabels: pod.Labels}
break
}
}
return nil
})
return id, err
}I also have an idea that before the kernel kills the victim, try pre-emptive deschedule/evict of the pod to another node (taint+evict or a small controller reacting to the OOM signal). This can stop “flapping” pods from repeatedly dying on a hot node.
Hope this is useful, and happy to contribute more if I get some time. 🙌
Metadata
Metadata
Assignees
Labels
No labels