DRA cel: enforce runtime limit by default again

pohly · pohly · commit d53cb79cec11 · 2024-10-23T21:24:45.000+02:00
As pointed out during code review, the CEL cost estimates are not considered
perfectly reliable. Therefore it is better to also do runtime checks.

Some downstream users might decide to allow CEL expressions to run
longer. Therefore the cost limit is now part of an Options struct.
kube-scheduler uses the default cost limit defined in the resource.k8s.io API,
which is the same cost limit that also the apiserver uses during validation.
diff --git a/pkg/apis/resource/types.go b/pkg/apis/resource/types.go
@@ -545,8 +545,13 @@ type CELDeviceSelector struct {
 // Validation against this limit and [CELSelectorExpressionMaxLength] happens
 // only when setting an expression for the first time or when changing it. If
 // the limits are changed in a future Kubernetes release, existing users are
-// guaranteed that existing expressions will continue to be valid and won't be
-// interrupted at runtime after an up- or downgrade.
+// guaranteed that existing expressions will continue to be valid.
+//
+// However, the kube-scheduler also applies this cost limit at runtime, so it
+// could happen that a valid expression fails at runtime after an up- or
+// downgrade. This can also happen without version skew when the cost estimate
+// underestimated the actual cost. That this might happen is the reason why
+// kube-scheduler enforces the runtime limit instead of relying on validation.
 //
 // According to
 // https://github.com/kubernetes/kubernetes/blob/4aeaf1e99e82da8334c0d6dddd848a194cd44b4f/staging/src/k8s.io/apiserver/pkg/apis/cel/config.go#L20-L22,
diff --git a/pkg/apis/resource/validation/validation.go b/pkg/apis/resource/validation/validation.go
@@ -176,7 +176,7 @@ func validateCELSelector(celSelector resource.CELDeviceSelector, fldPath *field.
 		return allErrs
 	}
 
-	result := dracel.GetCompiler().CompileCELExpression(celSelector.Expression, envType)
+	result := dracel.GetCompiler().CompileCELExpression(celSelector.Expression, dracel.Options{EnvType: &envType})
 	if result.Error != nil {
 		allErrs = append(allErrs, convertCELErrorToValidationError(fldPath.Child("expression"), celSelector.Expression, result.Error))
 	} else if result.MaxCost > resource.CELSelectorExpressionMaxCost {
diff --git a/staging/src/k8s.io/api/resource/v1alpha3/types.go b/staging/src/k8s.io/api/resource/v1alpha3/types.go
@@ -551,8 +551,13 @@ type CELDeviceSelector struct {
 // Validation against this limit and [CELSelectorExpressionMaxLength] happens
 // only when setting an expression for the first time or when changing it. If
 // the limits are changed in a future Kubernetes release, existing users are
-// guaranteed that existing expressions will continue to be valid and won't be
-// interrupted at runtime after an up- or downgrade.
+// guaranteed that existing expressions will continue to be valid.
+//
+// However, the kube-scheduler also applies this cost limit at runtime, so it
+// could happen that a valid expression fails at runtime after an up- or
+// downgrade. This can also happen without version skew when the cost estimate
+// underestimated the actual cost. That this might happen is the reason why
+// kube-scheduler enforces the runtime limit instead of relying on validation.
 //
 // According to
 // https://github.com/kubernetes/kubernetes/blob/4aeaf1e99e82da8334c0d6dddd848a194cd44b4f/staging/src/k8s.io/apiserver/pkg/apis/cel/config.go#L20-L22,
diff --git a/staging/src/k8s.io/dynamic-resource-allocation/cel/compile.go b/staging/src/k8s.io/dynamic-resource-allocation/cel/compile.go
@@ -20,7 +20,6 @@ import (
 	"context"
 	"errors"
 	"fmt"
-	"math"
 	"reflect"
 	"strings"
 	"sync"
@@ -39,6 +38,7 @@ import (
 	apiservercel "k8s.io/apiserver/pkg/cel"
 	"k8s.io/apiserver/pkg/cel/environment"
 	"k8s.io/apiserver/pkg/cel/library"
+	"k8s.io/utils/ptr"
 )
 
 const (
@@ -93,10 +93,21 @@ func newCompiler() *compiler {
 	return &compiler{envset: mustBuildEnv()}
 }
 
+// Options contains several additional parameters
+// for [CompileCELExpression]. All of them have reasonable
+// defaults.
+type Options struct {
+	// EnvType allows to override the default environment type [environment.StoredExpressions].
+	EnvType *environment.Type
+
+	// CostLimit allows overriding the default runtime cost limit [resourceapi.CELSelectorExpressionMaxCost].
+	CostLimit *uint64
+}
+
 // CompileCELExpression returns a compiled CEL expression. It evaluates to bool.
 //
 // TODO (https://github.com/kubernetes/kubernetes/issues/125826): validate AST to detect invalid attribute names.
-func (c compiler) CompileCELExpression(expression string, envType environment.Type) CompilationResult {
+func (c compiler) CompileCELExpression(expression string, options Options) CompilationResult {
 	resultError := func(errorString string, errType apiservercel.ErrorType) CompilationResult {
 		return CompilationResult{
 			Error: &apiservercel.Error{
@@ -107,7 +118,7 @@ func (c compiler) CompileCELExpression(expression string, envType environment.Ty
 		}
 	}
 
-	env, err := c.envset.Env(envType)
+	env, err := c.envset.Env(ptr.Deref(options.EnvType, environment.StoredExpressions))
 	if err != nil {
 		return resultError(fmt.Sprintf("unexpected error loading CEL environment: %v", err), apiservercel.ErrorTypeInternal)
 	}
@@ -131,25 +142,10 @@ func (c compiler) CompileCELExpression(expression string, envType environment.Ty
 		return resultError("unexpected compilation error: "+err.Error(), apiservercel.ErrorTypeInternal)
 	}
 	prog, err := env.Program(ast,
-		// The Kubernetes CEL base environment sets the VAP cost limit.
-		//
-		// In DRA, the cost check is done only at validation time.  At
-		// runtime, any expression that passed validation gets executed
-		// without interrupting it. The advantage is that it becomes
-		// easier to change the limit because stored expression do not
-		// suddenly fail after an up- or downgrade.  The limit could
-		// even become a configuration parameter of the apiserver
-		// because that is the only place where the limit gets checked
-		//
-		// We cannot unset the cost limit
-		// (https://github.com/kubernetes/kubernetes/blob/9568a2ac145cf9be930e3da835f86c1e61f7f7c1/vendor/github.com/google/cel-go/cel/options.go#L512-L518). But
-		// we can set a high value that then never triggers
-		// (https://github.com/kubernetes/kubernetes/blob/9568a2ac145cf9be930e3da835f86c1e61f7f7c1/vendor/github.com/google/cel-go/interpreter/runtimecost.go#L104-L106).
-		//
-		// Even better would be to enable cost tracking only during
-		// validation, but for that k8s.io/apiserver/pkg/cel/environment would
-		// have to be changed to not set the limit.
-		cel.CostLimit(math.MaxInt64),
+		// The Kubernetes CEL base environment sets the VAP limit as runtime cost limit.
+		// DRA has its own default cost limit and also allows the caller to change that
+		// limit.
+		cel.CostLimit(ptr.Deref(options.CostLimit, resourceapi.CELSelectorExpressionMaxCost)),
 		cel.InterruptCheckFrequency(celconfig.CheckFrequency),
 	)
 	if err != nil {
diff --git a/staging/src/k8s.io/dynamic-resource-allocation/cel/compile_test.go b/staging/src/k8s.io/dynamic-resource-allocation/cel/compile_test.go
@@ -23,7 +23,6 @@ import (
 
 	resourceapi "k8s.io/api/resource/v1alpha3"
 	"k8s.io/apimachinery/pkg/api/resource"
-	"k8s.io/apiserver/pkg/cel/environment"
 	"k8s.io/klog/v2/ktesting"
 	"k8s.io/utils/ptr"
 )
@@ -226,17 +225,18 @@ device.attributes["dra.example.com"]["version"].isGreaterThan(semver("0.0.1"))
 			}
 			return attributes
 		}(),
-		expression: `device.attributes["dra.example.com"].map(s, s.lowerAscii()).map(s, s.size()).sum() == 0`,
-		driver:     "dra.example.com",
-		expectCost: 18446744073709551615, // Exceeds limit!
+		expression:       `device.attributes["dra.example.com"].map(s, s.lowerAscii()).map(s, s.size()).sum() == 0`,
+		driver:           "dra.example.com",
+		expectMatchError: "actual cost limit exceeded",
+		expectCost:       18446744073709551615, // Exceeds limit!
 	},
 }
 
 func TestCEL(t *testing.T) {
 	for name, scenario := range testcases {
 		t.Run(name, func(t *testing.T) {
 			_, ctx := ktesting.NewTestContext(t)
-			result := GetCompiler().CompileCELExpression(scenario.expression, environment.StoredExpressions)
+			result := GetCompiler().CompileCELExpression(scenario.expression, Options{})
 			if scenario.expectCompileError != "" && result.Error == nil {
 				t.Fatalf("expected compile error %q, got none", scenario.expectCompileError)
 			}
@@ -294,7 +294,7 @@ func BenchmarkDeviceMatches(b *testing.B) {
 		}
 		b.Run(name, func(b *testing.B) {
 			_, ctx := ktesting.NewTestContext(b)
-			result := GetCompiler().CompileCELExpression(scenario.expression, environment.StoredExpressions)
+			result := GetCompiler().CompileCELExpression(scenario.expression, Options{})
 			if result.Error != nil {
 				b.Fatalf("unexpected compile error: %s", result.Error.Error())
 			}
diff --git a/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator.go b/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator.go
@@ -26,7 +26,6 @@ import (
 	v1 "k8s.io/api/core/v1"
 	resourceapi "k8s.io/api/resource/v1alpha3"
 	"k8s.io/apimachinery/pkg/util/sets"
-	"k8s.io/apiserver/pkg/cel/environment"
 	resourcelisters "k8s.io/client-go/listers/resource/v1alpha3"
 	"k8s.io/dynamic-resource-allocation/cel"
 	"k8s.io/klog/v2"
@@ -653,7 +652,7 @@ func (alloc *allocator) isSelectable(r requestIndices, slice *resourceapi.Resour
 
 func (alloc *allocator) selectorsMatch(r requestIndices, device *resourceapi.BasicDevice, deviceID DeviceID, class *resourceapi.DeviceClass, selectors []resourceapi.DeviceSelector) (bool, error) {
 	for i, selector := range selectors {
-		expr := cel.GetCompiler().CompileCELExpression(selector.CEL.Expression, environment.StoredExpressions)
+		expr := cel.GetCompiler().CompileCELExpression(selector.CEL.Expression, cel.Options{})
 		if expr.Error != nil {
 			// Could happen if some future apiserver accepted some
 			// future expression and then got downgraded. Normally

Original file line number	Diff line number	Diff line change
`@@ -176,7 +176,7 @@ func validateCELSelector(celSelector resource.CELDeviceSelector, fldPath *field.`
`176`	`176`	`return allErrs`
`177`	`177`	`}`
`178`	`178`
`179`		`- result := dracel.GetCompiler().CompileCELExpression(celSelector.Expression, envType)`
	`179`	`+ result := dracel.GetCompiler().CompileCELExpression(celSelector.Expression, dracel.Options{EnvType: &envType})`
`180`	`180`	`if result.Error != nil {`
`181`	`181`	`allErrs = append(allErrs, convertCELErrorToValidationError(fldPath.Child("expression"), celSelector.Expression, result.Error))`
`182`	`182`	`} else if result.MaxCost > resource.CELSelectorExpressionMaxCost {`