Skip to content

Commit d53cb79

Browse files
committed
DRA cel: enforce runtime limit by default again
As pointed out during code review, the CEL cost estimates are not considered perfectly reliable. Therefore it is better to also do runtime checks. Some downstream users might decide to allow CEL expressions to run longer. Therefore the cost limit is now part of an Options struct. kube-scheduler uses the default cost limit defined in the resource.k8s.io API, which is the same cost limit that also the apiserver uses during validation.
1 parent 021c9fb commit d53cb79

File tree

6 files changed

+40
-35
lines changed

6 files changed

+40
-35
lines changed

pkg/apis/resource/types.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -545,8 +545,13 @@ type CELDeviceSelector struct {
545545
// Validation against this limit and [CELSelectorExpressionMaxLength] happens
546546
// only when setting an expression for the first time or when changing it. If
547547
// the limits are changed in a future Kubernetes release, existing users are
548-
// guaranteed that existing expressions will continue to be valid and won't be
549-
// interrupted at runtime after an up- or downgrade.
548+
// guaranteed that existing expressions will continue to be valid.
549+
//
550+
// However, the kube-scheduler also applies this cost limit at runtime, so it
551+
// could happen that a valid expression fails at runtime after an up- or
552+
// downgrade. This can also happen without version skew when the cost estimate
553+
// underestimated the actual cost. That this might happen is the reason why
554+
// kube-scheduler enforces the runtime limit instead of relying on validation.
550555
//
551556
// According to
552557
// https://github.com/kubernetes/kubernetes/blob/4aeaf1e99e82da8334c0d6dddd848a194cd44b4f/staging/src/k8s.io/apiserver/pkg/apis/cel/config.go#L20-L22,

pkg/apis/resource/validation/validation.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ func validateCELSelector(celSelector resource.CELDeviceSelector, fldPath *field.
176176
return allErrs
177177
}
178178

179-
result := dracel.GetCompiler().CompileCELExpression(celSelector.Expression, envType)
179+
result := dracel.GetCompiler().CompileCELExpression(celSelector.Expression, dracel.Options{EnvType: &envType})
180180
if result.Error != nil {
181181
allErrs = append(allErrs, convertCELErrorToValidationError(fldPath.Child("expression"), celSelector.Expression, result.Error))
182182
} else if result.MaxCost > resource.CELSelectorExpressionMaxCost {

staging/src/k8s.io/api/resource/v1alpha3/types.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -551,8 +551,13 @@ type CELDeviceSelector struct {
551551
// Validation against this limit and [CELSelectorExpressionMaxLength] happens
552552
// only when setting an expression for the first time or when changing it. If
553553
// the limits are changed in a future Kubernetes release, existing users are
554-
// guaranteed that existing expressions will continue to be valid and won't be
555-
// interrupted at runtime after an up- or downgrade.
554+
// guaranteed that existing expressions will continue to be valid.
555+
//
556+
// However, the kube-scheduler also applies this cost limit at runtime, so it
557+
// could happen that a valid expression fails at runtime after an up- or
558+
// downgrade. This can also happen without version skew when the cost estimate
559+
// underestimated the actual cost. That this might happen is the reason why
560+
// kube-scheduler enforces the runtime limit instead of relying on validation.
556561
//
557562
// According to
558563
// https://github.com/kubernetes/kubernetes/blob/4aeaf1e99e82da8334c0d6dddd848a194cd44b4f/staging/src/k8s.io/apiserver/pkg/apis/cel/config.go#L20-L22,

staging/src/k8s.io/dynamic-resource-allocation/cel/compile.go

Lines changed: 18 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ import (
2020
"context"
2121
"errors"
2222
"fmt"
23-
"math"
2423
"reflect"
2524
"strings"
2625
"sync"
@@ -39,6 +38,7 @@ import (
3938
apiservercel "k8s.io/apiserver/pkg/cel"
4039
"k8s.io/apiserver/pkg/cel/environment"
4140
"k8s.io/apiserver/pkg/cel/library"
41+
"k8s.io/utils/ptr"
4242
)
4343

4444
const (
@@ -93,10 +93,21 @@ func newCompiler() *compiler {
9393
return &compiler{envset: mustBuildEnv()}
9494
}
9595

96+
// Options contains several additional parameters
97+
// for [CompileCELExpression]. All of them have reasonable
98+
// defaults.
99+
type Options struct {
100+
// EnvType allows to override the default environment type [environment.StoredExpressions].
101+
EnvType *environment.Type
102+
103+
// CostLimit allows overriding the default runtime cost limit [resourceapi.CELSelectorExpressionMaxCost].
104+
CostLimit *uint64
105+
}
106+
96107
// CompileCELExpression returns a compiled CEL expression. It evaluates to bool.
97108
//
98109
// TODO (https://github.com/kubernetes/kubernetes/issues/125826): validate AST to detect invalid attribute names.
99-
func (c compiler) CompileCELExpression(expression string, envType environment.Type) CompilationResult {
110+
func (c compiler) CompileCELExpression(expression string, options Options) CompilationResult {
100111
resultError := func(errorString string, errType apiservercel.ErrorType) CompilationResult {
101112
return CompilationResult{
102113
Error: &apiservercel.Error{
@@ -107,7 +118,7 @@ func (c compiler) CompileCELExpression(expression string, envType environment.Ty
107118
}
108119
}
109120

110-
env, err := c.envset.Env(envType)
121+
env, err := c.envset.Env(ptr.Deref(options.EnvType, environment.StoredExpressions))
111122
if err != nil {
112123
return resultError(fmt.Sprintf("unexpected error loading CEL environment: %v", err), apiservercel.ErrorTypeInternal)
113124
}
@@ -131,25 +142,10 @@ func (c compiler) CompileCELExpression(expression string, envType environment.Ty
131142
return resultError("unexpected compilation error: "+err.Error(), apiservercel.ErrorTypeInternal)
132143
}
133144
prog, err := env.Program(ast,
134-
// The Kubernetes CEL base environment sets the VAP cost limit.
135-
//
136-
// In DRA, the cost check is done only at validation time. At
137-
// runtime, any expression that passed validation gets executed
138-
// without interrupting it. The advantage is that it becomes
139-
// easier to change the limit because stored expression do not
140-
// suddenly fail after an up- or downgrade. The limit could
141-
// even become a configuration parameter of the apiserver
142-
// because that is the only place where the limit gets checked
143-
//
144-
// We cannot unset the cost limit
145-
// (https://github.com/kubernetes/kubernetes/blob/9568a2ac145cf9be930e3da835f86c1e61f7f7c1/vendor/github.com/google/cel-go/cel/options.go#L512-L518). But
146-
// we can set a high value that then never triggers
147-
// (https://github.com/kubernetes/kubernetes/blob/9568a2ac145cf9be930e3da835f86c1e61f7f7c1/vendor/github.com/google/cel-go/interpreter/runtimecost.go#L104-L106).
148-
//
149-
// Even better would be to enable cost tracking only during
150-
// validation, but for that k8s.io/apiserver/pkg/cel/environment would
151-
// have to be changed to not set the limit.
152-
cel.CostLimit(math.MaxInt64),
145+
// The Kubernetes CEL base environment sets the VAP limit as runtime cost limit.
146+
// DRA has its own default cost limit and also allows the caller to change that
147+
// limit.
148+
cel.CostLimit(ptr.Deref(options.CostLimit, resourceapi.CELSelectorExpressionMaxCost)),
153149
cel.InterruptCheckFrequency(celconfig.CheckFrequency),
154150
)
155151
if err != nil {

staging/src/k8s.io/dynamic-resource-allocation/cel/compile_test.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ import (
2323

2424
resourceapi "k8s.io/api/resource/v1alpha3"
2525
"k8s.io/apimachinery/pkg/api/resource"
26-
"k8s.io/apiserver/pkg/cel/environment"
2726
"k8s.io/klog/v2/ktesting"
2827
"k8s.io/utils/ptr"
2928
)
@@ -226,17 +225,18 @@ device.attributes["dra.example.com"]["version"].isGreaterThan(semver("0.0.1"))
226225
}
227226
return attributes
228227
}(),
229-
expression: `device.attributes["dra.example.com"].map(s, s.lowerAscii()).map(s, s.size()).sum() == 0`,
230-
driver: "dra.example.com",
231-
expectCost: 18446744073709551615, // Exceeds limit!
228+
expression: `device.attributes["dra.example.com"].map(s, s.lowerAscii()).map(s, s.size()).sum() == 0`,
229+
driver: "dra.example.com",
230+
expectMatchError: "actual cost limit exceeded",
231+
expectCost: 18446744073709551615, // Exceeds limit!
232232
},
233233
}
234234

235235
func TestCEL(t *testing.T) {
236236
for name, scenario := range testcases {
237237
t.Run(name, func(t *testing.T) {
238238
_, ctx := ktesting.NewTestContext(t)
239-
result := GetCompiler().CompileCELExpression(scenario.expression, environment.StoredExpressions)
239+
result := GetCompiler().CompileCELExpression(scenario.expression, Options{})
240240
if scenario.expectCompileError != "" && result.Error == nil {
241241
t.Fatalf("expected compile error %q, got none", scenario.expectCompileError)
242242
}
@@ -294,7 +294,7 @@ func BenchmarkDeviceMatches(b *testing.B) {
294294
}
295295
b.Run(name, func(b *testing.B) {
296296
_, ctx := ktesting.NewTestContext(b)
297-
result := GetCompiler().CompileCELExpression(scenario.expression, environment.StoredExpressions)
297+
result := GetCompiler().CompileCELExpression(scenario.expression, Options{})
298298
if result.Error != nil {
299299
b.Fatalf("unexpected compile error: %s", result.Error.Error())
300300
}

staging/src/k8s.io/dynamic-resource-allocation/structured/allocator.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ import (
2626
v1 "k8s.io/api/core/v1"
2727
resourceapi "k8s.io/api/resource/v1alpha3"
2828
"k8s.io/apimachinery/pkg/util/sets"
29-
"k8s.io/apiserver/pkg/cel/environment"
3029
resourcelisters "k8s.io/client-go/listers/resource/v1alpha3"
3130
"k8s.io/dynamic-resource-allocation/cel"
3231
"k8s.io/klog/v2"
@@ -653,7 +652,7 @@ func (alloc *allocator) isSelectable(r requestIndices, slice *resourceapi.Resour
653652

654653
func (alloc *allocator) selectorsMatch(r requestIndices, device *resourceapi.BasicDevice, deviceID DeviceID, class *resourceapi.DeviceClass, selectors []resourceapi.DeviceSelector) (bool, error) {
655654
for i, selector := range selectors {
656-
expr := cel.GetCompiler().CompileCELExpression(selector.CEL.Expression, environment.StoredExpressions)
655+
expr := cel.GetCompiler().CompileCELExpression(selector.CEL.Expression, cel.Options{})
657656
if expr.Error != nil {
658657
// Could happen if some future apiserver accepted some
659658
// future expression and then got downgraded. Normally

0 commit comments

Comments
 (0)