Skip to content

Commit 5af1710

Browse files
authored
Merge pull request kubernetes#126243 from SergeyKanzhelev/devicePluginFailures
Implement resource health in pod status (KEP 4680)
2 parents 49ff255 + 62f96d2 commit 5af1710

40 files changed

+3273
-1090
lines changed

api/openapi-spec/swagger.json

Lines changed: 53 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

api/openapi-spec/v3/api__v1_openapi.json

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1525,6 +1525,24 @@
15251525
"description": "AllocatedResources represents the compute resources allocated for this container by the node. Kubelet sets this value to Container.Resources.Requests upon successful pod admission and after successfully admitting desired pod resize.",
15261526
"type": "object"
15271527
},
1528+
"allocatedResourcesStatus": {
1529+
"description": "AllocatedResourcesStatus represents the status of various resources allocated for this Pod.",
1530+
"items": {
1531+
"allOf": [
1532+
{
1533+
"$ref": "#/components/schemas/io.k8s.api.core.v1.ResourceStatus"
1534+
}
1535+
],
1536+
"default": {}
1537+
},
1538+
"type": "array",
1539+
"x-kubernetes-list-map-keys": [
1540+
"name"
1541+
],
1542+
"x-kubernetes-list-type": "map",
1543+
"x-kubernetes-patch-merge-key": "name",
1544+
"x-kubernetes-patch-strategy": "merge"
1545+
},
15281546
"containerID": {
15291547
"description": "ContainerID is the ID of the container in the format '<type>://<container_id>'. Where type is a container runtime identifier, returned from Version call of CRI API (for example \"containerd\").",
15301548
"type": "string"
@@ -6598,6 +6616,24 @@
65986616
"type": "object",
65996617
"x-kubernetes-map-type": "atomic"
66006618
},
6619+
"io.k8s.api.core.v1.ResourceHealth": {
6620+
"description": "ResourceHealth represents the health of a resource. It has the latest device health information. This is a part of KEP https://kep.k8s.io/4680 and historical health changes are planned to be added in future iterations of a KEP.",
6621+
"properties": {
6622+
"health": {
6623+
"description": "Health of the resource. can be one of:\n - Healthy: operates as normal\n - Unhealthy: reported unhealthy. We consider this a temporary health issue\n since we do not have a mechanism today to distinguish\n temporary and permanent issues.\n - Unknown: The status cannot be determined.\n For example, Device Plugin got unregistered and hasn't been re-registered since.\n\nIn future we may want to introduce the PermanentlyUnhealthy Status.",
6624+
"type": "string"
6625+
},
6626+
"resourceID": {
6627+
"default": "",
6628+
"description": "ResourceID is the unique identifier of the resource. See the ResourceID type for more information.",
6629+
"type": "string"
6630+
}
6631+
},
6632+
"required": [
6633+
"resourceID"
6634+
],
6635+
"type": "object"
6636+
},
66016637
"io.k8s.api.core.v1.ResourceQuota": {
66026638
"description": "ResourceQuota sets aggregate quota restrictions enforced per namespace",
66036639
"properties": {
@@ -6777,6 +6813,35 @@
67776813
},
67786814
"type": "object"
67796815
},
6816+
"io.k8s.api.core.v1.ResourceStatus": {
6817+
"properties": {
6818+
"name": {
6819+
"default": "",
6820+
"description": "Name of the resource. Must be unique within the pod and match one of the resources from the pod spec.",
6821+
"type": "string"
6822+
},
6823+
"resources": {
6824+
"description": "List of unique Resources health. Each element in the list contains a unique resource ID and resource health. At a minimum, ResourceID must uniquely identify the Resource allocated to the Pod on the Node for the lifetime of a Pod. See ResourceID type for its definition.",
6825+
"items": {
6826+
"allOf": [
6827+
{
6828+
"$ref": "#/components/schemas/io.k8s.api.core.v1.ResourceHealth"
6829+
}
6830+
],
6831+
"default": {}
6832+
},
6833+
"type": "array",
6834+
"x-kubernetes-list-map-keys": [
6835+
"resourceID"
6836+
],
6837+
"x-kubernetes-list-type": "map"
6838+
}
6839+
},
6840+
"required": [
6841+
"name"
6842+
],
6843+
"type": "object"
6844+
},
67806845
"io.k8s.api.core.v1.SELinuxOptions": {
67816846
"description": "SELinuxOptions are the labels to be applied to the container",
67826847
"properties": {

pkg/api/pod/util.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -813,6 +813,17 @@ func dropDisabledPodStatusFields(podStatus, oldPodStatus *api.PodStatus, podSpec
813813
}
814814
}
815815

816+
if !utilfeature.DefaultFeatureGate.Enabled(features.ResourceHealthStatus) {
817+
setAllocatedResourcesStatusToNil := func(csl []api.ContainerStatus) {
818+
for i := range csl {
819+
csl[i].AllocatedResourcesStatus = nil
820+
}
821+
}
822+
setAllocatedResourcesStatusToNil(podStatus.ContainerStatuses)
823+
setAllocatedResourcesStatusToNil(podStatus.InitContainerStatuses)
824+
setAllocatedResourcesStatusToNil(podStatus.EphemeralContainerStatuses)
825+
}
826+
816827
// drop ContainerStatus.User field to empty (disable SupplementalGroupsPolicy)
817828
if !utilfeature.DefaultFeatureGate.Enabled(features.SupplementalGroupsPolicy) && !supplementalGroupsPolicyInUse(oldPodSpec) {
818829
dropUserField := func(csl []api.ContainerStatus) {

pkg/apis/core/types.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2780,6 +2780,59 @@ type ContainerStatus struct {
27802780
// +featureGate=SupplementalGroupsPolicy
27812781
// +optional
27822782
User *ContainerUser
2783+
// AllocatedResourcesStatus represents the status of various resources
2784+
// allocated for this Pod.
2785+
// +featureGate=ResourceHealthStatus
2786+
// +optional
2787+
AllocatedResourcesStatus []ResourceStatus
2788+
}
2789+
2790+
type ResourceStatus struct {
2791+
Name ResourceName
2792+
// List of unique Resources health. Each element in the list contains a unique resource ID and resource health.
2793+
// At a minimum, ResourceID must uniquely identify the Resource
2794+
// allocated to the Pod on the Node for the lifetime of a Pod.
2795+
// See ResourceID type for its definition.
2796+
Resources []ResourceHealth
2797+
2798+
// This struct may be extended in the future with overall health fields or details such as the Device Plugin version.
2799+
}
2800+
2801+
// ResourceID is calculated based on the source of this resource health information.
2802+
// For DevicePlugin:
2803+
//
2804+
// deviceplugin:DeviceID, where DeviceID is from the Device structure of DevicePlugin's ListAndWatchResponse type: https://github.com/kubernetes/kubernetes/blob/eda1c780543a27c078450e2f17d674471e00f494/staging/src/k8s.io/kubelet/pkg/apis/deviceplugin/v1alpha/api.proto#L61-L73
2805+
//
2806+
// DevicePlugin ID is usually a constant for the lifetime of a Node and typically can be used to uniquely identify the device on the node.
2807+
// For DRA:
2808+
//
2809+
// dra:<driver name>/<pool name>/<device name>: such a device can be looked up in the information published by that DRA driver to learn more about it. It is designed to be globally unique in a cluster.
2810+
type ResourceID string
2811+
2812+
type ResourceHealthStatus string
2813+
2814+
const (
2815+
ResourceHealthStatusHealthy ResourceHealthStatus = "Healthy"
2816+
ResourceHealthStatusUnhealthy ResourceHealthStatus = "Unhealthy"
2817+
ResourceHealthStatusUnknown ResourceHealthStatus = "Unknown"
2818+
)
2819+
2820+
// ResourceHealth represents the health of a resource. It has the latest device health information.
2821+
// This is a part of KEP https://kep.k8s.io/4680 and historical health changes are planned to be added in future iterations of the KEP.
2822+
type ResourceHealth struct {
2823+
// ResourceID is the unique identifier of the resource. See the ResourceID type for more information.
2824+
ResourceID ResourceID
2825+
// Health of the resource.
2826+
// Can be one of:
2827+
// - Healthy: operates as normal
2828+
// - Unhealthy: reported unhealthy. We consider this a temporary health issue
2829+
// since we do not have a mechanism today to distinguish
2830+
// temporary and permanent issues.
2831+
// - Unknown: The status cannot be determined.
2832+
// For example, Device Plugin got unregistered and hasn't been re-registered since.
2833+
//
2834+
// In the future, we may want to introduce the PermanentlyUnhealthy status.
2835+
Health ResourceHealthStatus
27832836
}
27842837

27852838
// ContainerUser represents user identity information

pkg/apis/core/v1/zz_generated.conversion.go

Lines changed: 66 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)