Skip to content

Commit e628187

Browse files
authored
fix: upgrade k8s 1.34, fix shm path, helm chart issues (#355)
* chore: lint issue * fix: kubernetes upgrade, fix scheduler deps issue * fix: upgrade k8s version to 1.34, use fixed operator version in helm chart * fix: update shm path * chore: comment & wording * fix: connection naming * fix: upgrade github action * fix: add test for dedicated gpu allocation mode
1 parent f25c65d commit e628187

File tree

27 files changed

+717
-509
lines changed

27 files changed

+717
-509
lines changed

.github/workflows/lint.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ jobs:
2929
uses: actions/checkout@v5
3030

3131
- name: Setup Go
32-
uses: actions/setup-go@v5
32+
uses: actions/setup-go@v6
3333
with:
3434
go-version: '~1.24'
3535

.github/workflows/test.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,13 @@ jobs:
2828
strategy:
2929
matrix:
3030
# from https://github.com/kubernetes-sigs/controller-tools/blob/main/envtest-releases.yaml
31-
envtest_k8s_version: [1.23.5, 1.33.0]
31+
envtest_k8s_version: [1.23.5, 1.34.0]
3232
steps:
3333
- name: Clone the code
3434
uses: actions/checkout@v5
3535

3636
- name: Setup Go
37-
uses: actions/setup-go@v5
37+
uses: actions/setup-go@v6
3838
with:
3939
go-version: '~1.24'
4040

.vscode/settings.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
"clientcmdapi",
2626
"clientgoscheme",
2727
"clientset",
28+
"clientsetfake",
2829
"cloudnative",
2930
"cloudprovider",
3031
"clusterissuers",
@@ -46,6 +47,7 @@
4647
"envtest",
4748
"essd",
4849
"Eventf",
50+
"featuregate",
4951
"finalizer",
5052
"Finalizers",
5153
"frameworkruntime",
@@ -78,6 +80,8 @@
7880
"iface",
7981
"imageutils",
8082
"influxdata",
83+
"internalcache",
84+
"internalqueue",
8185
"jsonpatch",
8286
"karpenter",
8387
"karpv",

api/v1/gpuresourcequota_types.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ package v1
1919
import (
2020
v1 "k8s.io/api/core/v1"
2121
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
22-
"k8s.io/kubernetes/pkg/scheduler/framework"
22+
fwk "k8s.io/kube-scheduler/framework"
2323
)
2424

2525
// GPUResourceQuotaSpec defines the desired state of GPUResourceQuota
@@ -188,6 +188,10 @@ type AllocRequest struct {
188188
PodMeta metav1.ObjectMeta
189189
}
190190

191+
func (p *AllocRequest) Clone() fwk.StateData {
192+
return p
193+
}
194+
191195
type GPUAllocationInfo struct {
192196
Request Resource `json:"request,omitempty"`
193197
Limit Resource `json:"limit,omitempty"`
@@ -203,7 +207,7 @@ type AdjustRequest struct {
203207
NewLimit Resource
204208
}
205209

206-
func (ar *AllocRequest) Clone() framework.StateData {
210+
func (ar *AdjustRequest) Clone() fwk.StateData {
207211
return ar
208212
}
209213

charts/tensor-fusion/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.5.6
18+
version: 1.5.7
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to

charts/tensor-fusion/values.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ controller:
3131
image:
3232
repository: tensorfusion/tensor-fusion-operator
3333
# Overrides the image tag whose default is the chart appVersion.
34-
tag: "latest"
34+
tag: "1.43.4"
3535
# This is for setting Kubernetes Annotations to a Pod.
3636
# For more information, check out: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/
3737

@@ -120,7 +120,7 @@ agent:
120120

121121
image:
122122
repository: tensorfusion/tensor-fusion-agent
123-
tag: "latest"
123+
tag: "1.0.0"
124124

125125
resources:
126126
requests:

cmd/main.go

Lines changed: 32 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -27,27 +27,6 @@ import (
2727

2828
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
2929
// to ensure that exec-entrypoint and run can make use of them.
30-
31-
"k8s.io/client-go/kubernetes"
32-
_ "k8s.io/client-go/plugin/pkg/client/auth"
33-
"k8s.io/client-go/rest"
34-
"k8s.io/klog/v2"
35-
36-
"k8s.io/apimachinery/pkg/runtime"
37-
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
38-
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
39-
"k8s.io/kubernetes/cmd/kube-scheduler/app"
40-
"k8s.io/kubernetes/pkg/scheduler"
41-
ctrl "sigs.k8s.io/controller-runtime"
42-
"sigs.k8s.io/controller-runtime/pkg/client"
43-
"sigs.k8s.io/controller-runtime/pkg/healthz"
44-
"sigs.k8s.io/controller-runtime/pkg/manager"
45-
"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
46-
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
47-
"sigs.k8s.io/controller-runtime/pkg/webhook"
48-
49-
"sigs.k8s.io/yaml"
50-
5130
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
5231
"github.com/NexusGPU/tensor-fusion/cmd/sched"
5332
"github.com/NexusGPU/tensor-fusion/internal/alert"
@@ -65,6 +44,25 @@ import (
6544
"github.com/NexusGPU/tensor-fusion/internal/utils"
6645
"github.com/NexusGPU/tensor-fusion/internal/version"
6746
webhookcorev1 "github.com/NexusGPU/tensor-fusion/internal/webhook/v1"
47+
"k8s.io/apimachinery/pkg/runtime"
48+
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
49+
k8sVer "k8s.io/apimachinery/pkg/util/version"
50+
"k8s.io/apiserver/pkg/util/feature"
51+
"k8s.io/client-go/kubernetes"
52+
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
53+
_ "k8s.io/client-go/plugin/pkg/client/auth"
54+
"k8s.io/client-go/rest"
55+
"k8s.io/klog/v2"
56+
"k8s.io/kubernetes/cmd/kube-scheduler/app"
57+
"k8s.io/kubernetes/pkg/scheduler"
58+
ctrl "sigs.k8s.io/controller-runtime"
59+
"sigs.k8s.io/controller-runtime/pkg/client"
60+
"sigs.k8s.io/controller-runtime/pkg/healthz"
61+
"sigs.k8s.io/controller-runtime/pkg/manager"
62+
"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
63+
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
64+
"sigs.k8s.io/controller-runtime/pkg/webhook"
65+
"sigs.k8s.io/yaml"
6866
// +kubebuilder:scaffold:imports
6967
)
7068

@@ -204,6 +202,14 @@ func main() {
204202
_ = os.Setenv(constants.KubeApiVersionMajorEnv, version.Major)
205203
_ = os.Setenv(constants.KubeApiVersionMinorEnv, version.Minor)
206204

205+
// TODO: a risk remains once a FeatureGate is removed after its feature has been stable for a long time.
206+
// Staying compatible with long-term k8s versions will then require patching the Kubernetes source code.
207+
k8sVersion := k8sVer.MustParseSemantic(version.String())
208+
err = feature.DefaultMutableFeatureGate.SetEmulationVersion(k8sVersion)
209+
if err != nil {
210+
setupLog.Error(err, "unable to set k8s version for feature gating")
211+
}
212+
207213
alertEvaluatorReady = make(chan struct{})
208214
setupTimeSeriesAndWatchGlobalConfigChanges(ctx, mgr)
209215

@@ -221,7 +227,7 @@ func main() {
221227
pricingProvider := pricing.NewStaticPricingProvider()
222228
startWebhook(mgr, portAllocator, pricingProvider)
223229

224-
scheduler := startScheduler(ctx, allocator, mgr)
230+
scheduler := startScheduler(ctx, allocator, mgr, k8sVersion)
225231

226232
startCustomResourceController(ctx, mgr, metricsRecorder, allocator, portAllocator)
227233

@@ -461,6 +467,7 @@ func startScheduler(
461467
ctx context.Context,
462468
allocator *gpuallocator.GpuAllocator,
463469
mgr manager.Manager,
470+
k8sVersion *k8sVer.Version,
464471
) *scheduler.Scheduler {
465472
if os.Getenv(constants.EnableSchedulerEnv) == constants.FalseStringValue {
466473
return nil
@@ -479,7 +486,9 @@ func startScheduler(
479486
gpuTopoPlugin.NewWithDeps(allocator, mgr.GetClient()),
480487
)
481488

482-
cc, scheduler, err := sched.SetupScheduler(ctx, mgr, schedulerConfigPath, false, gpuResourceFitOpt, gpuTopoOpt)
489+
cc, scheduler, err := sched.SetupScheduler(
490+
ctx, mgr, schedulerConfigPath, false, k8sVersion, gpuResourceFitOpt, gpuTopoOpt,
491+
)
483492
if err != nil {
484493
setupLog.Error(err, "unable to create tensor fusion scheduler")
485494
os.Exit(1)

cmd/sched/setup.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ import (
2222
"strings"
2323

2424
utilerrors "k8s.io/apimachinery/pkg/util/errors"
25+
k8sVer "k8s.io/apimachinery/pkg/util/version"
26+
"k8s.io/apiserver/pkg/util/feature"
2527
"k8s.io/client-go/tools/events"
2628
"k8s.io/component-base/configz"
2729
"k8s.io/klog/v2"
@@ -50,6 +52,7 @@ func SetupScheduler(
5052
mgr manager.Manager,
5153
schedulerConfigPath string,
5254
disableHttpEndpoint bool,
55+
k8sVersion *k8sVer.Version,
5356
outOfTreeRegistryOptions ...app.Option,
5457
) (*schedulerserverconfig.CompletedConfig, *scheduler.Scheduler, error) {
5558
opts := options.NewOptions()
@@ -73,6 +76,12 @@ func SetupScheduler(
7376
return nil, nil, err
7477
}
7578

79+
// Set the emulation version again since it's overridden by the config
80+
err = feature.DefaultMutableFeatureGate.SetEmulationVersion(k8sVersion)
81+
if err != nil {
82+
return nil, nil, err
83+
}
84+
7685
if cfg, err := latest.Default(); err != nil {
7786
return nil, nil, err
7887
} else {

0 commit comments

Comments
 (0)