Skip to content

Commit 3d2049b

Browse files
KVCache and Intelligent Routing for HyperPod Inference (#290)
* feat: KVCache and Intelligent Routing for HyperPod Inference
* Initial commit for kv-cache and intelligent routing support in inference SDK/CLI (#252)
* Initial commit for kv-cache and intelligent routing support in inference SDK
* Initial commit for kv-cache & intelligent routing CLI changes
* [Fix] clean up test code
* Separate out kv-cache and intelligent routing spec update into v1_1 inference cli
* Fix unit tests and black format
* Fix template.py extra white lines
* fix: update inference template.py to reflect init experience template submission changes

---------

Co-authored-by: jiapinw <[email protected]>
1 parent 0ae955c commit 3d2049b

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

41 files changed

+4388
-292
lines changed

helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.lock

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -13,9 +13,9 @@ dependencies:
1313
version: 1.13.2
1414
- name: cert-manager
1515
repository: https://charts.jetstack.io
16-
version: v1.18.0
16+
version: v1.18.2
1717
- name: keda
1818
repository: https://kedacore.github.io/charts
1919
version: 2.17.1
20-
digest: sha256:5f877809dfd7c4d13b13f3de92e0824c28f80ed3abcf7c54f11764d9aeabbeba
21-
generated: "2025-06-19T22:21:36.075156362Z"
20+
digest: sha256:f54ece80a00cb4da98440551765d9c660a0704d6b59f4f9030a5a9e86eab4eea
21+
generated: "2025-10-27T17:20:29.746399171Z"

helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -15,13 +15,13 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 0.1.0
18+
version: 1.0.0
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to
2222
# follow Semantic Versioning. They should reflect the version the application is using.
2323
# It is recommended to use it with quotes.
24-
appVersion: "1.16.0"
24+
appVersion: "2.0"
2525

2626
dependencies:
2727
- name: aws-mountpoint-s3-csi-driver
@@ -45,7 +45,7 @@ dependencies:
4545
condition: alb.enabled
4646
- name: cert-manager
4747
alias: cert-manager
48-
version: v1.18.0
48+
version: v1.18.2
4949
repository: "https://charts.jetstack.io"
5050
condition: cert-manager.enabled
5151
- name: keda

helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml

Lines changed: 1397 additions & 3 deletions
Large diffs are not rendered by default.

helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_jumpstartmodels.yaml

Lines changed: 807 additions & 4 deletions
Large diffs are not rendered by default.

helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_sagemakerendpointregistrations.yaml

Lines changed: 254 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,21 @@
1-
---
21
apiVersion: apiextensions.k8s.io/v1
32
kind: CustomResourceDefinition
43
metadata:
54
annotations:
5+
cert-manager.io/inject-ca-from: '{{ .Values.shortPrefix }}-system/serving-cert'
66
controller-gen.kubebuilder.io/version: v0.16.4
77
name: sagemakerendpointregistrations.inference.sagemaker.aws.amazon.com
88
spec:
9+
conversion:
10+
strategy: Webhook
11+
webhook:
12+
clientConfig:
13+
service:
14+
name: '{{ .Values.namePrefix }}-conversion-webhook'
15+
namespace: '{{ .Values.shortPrefix }}-system'
16+
path: /convert
17+
conversionReviewVersions:
18+
- v1
919
group: inference.sagemaker.aws.amazon.com
1020
names:
1121
kind: SageMakerEndpointRegistration
@@ -14,7 +24,7 @@ spec:
1424
singular: sagemakerendpointregistration
1525
scope: Namespaced
1626
versions:
17-
- name: v1alpha1
27+
- name: v1
1828
schema:
1929
openAPIV3Schema:
2030
description: SageMakerEndpointRegistration is the Schema for the sagemakerendpointregistrations
@@ -88,6 +98,10 @@ spec:
8898
description: InstanceType is the ML compute instance type used for
8999
EndpointConfig creation
90100
type: string
101+
invocationEndpoint:
102+
default: invocations
103+
description: The invocation endpoint path used by the model server
104+
type: string
91105
loadBalancerHostName:
92106
description: Needed to embed the LB Host Name
93107
type: string
@@ -248,3 +262,241 @@ spec:
248262
storage: true
249263
subresources:
250264
status: {}
265+
- name: v1alpha1
266+
schema:
267+
openAPIV3Schema:
268+
description: SageMakerEndpointRegistration is the Schema for the sagemakerendpointregistrations
269+
API
270+
properties:
271+
apiVersion:
272+
description: |-
273+
APIVersion defines the versioned schema of this representation of an object.
274+
Servers should convert recognized schemas to the latest internal value, and
275+
may reject unrecognized values.
276+
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
277+
type: string
278+
kind:
279+
description: |-
280+
Kind is a string value representing the REST resource this object represents.
281+
Servers may infer this from the endpoint the client submits requests to.
282+
Cannot be updated.
283+
In CamelCase.
284+
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
285+
type: string
286+
metadata:
287+
type: object
288+
spec:
289+
description: SageMakerEndpointRegistrationSpec defines the desired state
290+
of SageMakerEndpointRegistration
291+
properties:
292+
eksClusterDetails:
293+
properties:
294+
arn:
295+
description: Stores cluster ARN
296+
type: string
297+
clusterSecurityGroupId:
298+
description: Stores ClusterSecurityGroup of the EKS Cluster
299+
type: string
300+
name:
301+
description: Stores cluster name
302+
type: string
303+
securityGroupIds:
304+
description: Stores AdditionalSecurityGroupIds of the EKS Cluster
305+
items:
306+
type: string
307+
type: array
308+
subnetIds:
309+
description: Stores SubnetIDs of the EKS Cluster
310+
items:
311+
type: string
312+
type: array
313+
vpcId:
314+
description: Stores VPC Id of the EKS Cluster
315+
type: string
316+
required:
317+
- arn
318+
- clusterSecurityGroupId
319+
- name
320+
- securityGroupIds
321+
- subnetIds
322+
- vpcId
323+
type: object
324+
executionRole:
325+
description: The Amazon Resource Name (ARN) of an IAM role that will
326+
be used to create model, endpoint config, and the endpoint
327+
maxLength: 2048
328+
minLength: 20
329+
pattern: ^arn:aws[a-z\-]*:iam::\d{12}:role/?[a-zA-Z_0-9+=,.@\-_/]+$
330+
type: string
331+
imageUri:
332+
description: The ImageUri where inference code is stored
333+
maxLength: 255
334+
type: string
335+
instanceType:
336+
description: InstanceType is the ML compute instance type used for
337+
EndpointConfig creation
338+
type: string
339+
invocationEndpoint:
340+
default: invocations
341+
description: The invocation endpoint path used by the model server
342+
type: string
343+
loadBalancerHostName:
344+
description: Needed to embed the LB Host Name
345+
type: string
346+
name:
347+
description: Name used for AWS resource creation
348+
maxLength: 63
349+
pattern: ^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}
350+
type: string
351+
restApiId:
352+
description: REST API Gateway identifier that proxies requests to
353+
the HyperPod endpoint (via NLB/ALB)
354+
type: string
355+
tlsConfig:
356+
properties:
357+
tlsCertificateOutputS3Bucket:
358+
description: S3 bucket that stores the certificate that needs
359+
to be trusted
360+
type: string
361+
tlsCertificateS3Keys:
362+
description: The output tls certificate S3 key that points to
363+
the .pem file
364+
items:
365+
type: string
366+
type: array
367+
tlsServerNameOverride:
368+
description: The server name override for tls certificate selection
369+
type: string
370+
required:
371+
- tlsCertificateOutputS3Bucket
372+
- tlsCertificateS3Keys
373+
type: object
374+
required:
375+
- eksClusterDetails
376+
- executionRole
377+
- imageUri
378+
- instanceType
379+
- loadBalancerHostName
380+
- name
381+
- restApiId
382+
- tlsConfig
383+
type: object
384+
status:
385+
description: SageMakerEndpointRegistrationStatus defines the observed
386+
state of SageMakerEndpointRegistration
387+
properties:
388+
conditions:
389+
description: Detailed conditions representing the state of the deployment
390+
items:
391+
description: Condition contains details for one aspect of the current
392+
state of this API Resource.
393+
properties:
394+
lastTransitionTime:
395+
description: |-
396+
lastTransitionTime is the last time the condition transitioned from one status to another.
397+
This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable.
398+
format: date-time
399+
type: string
400+
message:
401+
description: |-
402+
message is a human readable message indicating details about the transition.
403+
This may be an empty string.
404+
maxLength: 32768
405+
type: string
406+
observedGeneration:
407+
description: |-
408+
observedGeneration represents the .metadata.generation that the condition was set based upon.
409+
For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
410+
with respect to the current state of the instance.
411+
format: int64
412+
minimum: 0
413+
type: integer
414+
reason:
415+
description: |-
416+
reason contains a programmatic identifier indicating the reason for the condition's last transition.
417+
Producers of specific condition types may define expected values and meanings for this field,
418+
and whether the values are considered a guaranteed API.
419+
The value should be a CamelCase string.
420+
This field may not be empty.
421+
maxLength: 1024
422+
minLength: 1
423+
pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
424+
type: string
425+
status:
426+
description: status of the condition, one of True, False, Unknown.
427+
enum:
428+
- "True"
429+
- "False"
430+
- Unknown
431+
type: string
432+
type:
433+
description: type of condition in CamelCase or in foo.example.com/CamelCase.
434+
maxLength: 316
435+
pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
436+
type: string
437+
required:
438+
- lastTransitionTime
439+
- message
440+
- reason
441+
- status
442+
- type
443+
type: object
444+
type: array
445+
endpoint:
446+
description: Endpoint Metadata
447+
properties:
448+
arn:
449+
description: The Amazon Resource Name (ARN) of the SageMaker endpoint
450+
pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:endpoint/.*|^$)
451+
type: string
452+
configArn:
453+
description: The Amazon Resource Name (ARN) of the endpoint configuration.
454+
pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:endpoint-config/.*|^$)
455+
type: string
456+
lastModifiedTime:
457+
description: The last modified time of SageMaker endpoint.
458+
format: date-time
459+
type: string
460+
modelArn:
461+
description: The ARN of the model created in SageMaker.
462+
pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:model/.*|^$)
463+
type: string
464+
required:
465+
- arn
466+
- configArn
467+
- modelArn
468+
type: object
469+
loadBalancer:
470+
description: LoadBalancer Metadata
471+
properties:
472+
hostName:
473+
description: Hostname of LoadBalancer
474+
type: string
475+
required:
476+
- hostName
477+
type: object
478+
observedGeneration:
479+
description: Latest generation reconciled by controller
480+
format: int64
481+
type: integer
482+
state:
483+
description: Current phase of the Endpoint creation Step
484+
enum:
485+
- CreationInProgress
486+
- CreationFailed
487+
- CreationCompleted
488+
- DeletionInProgress
489+
- DeletionFailed
490+
- DeletionCompleted
491+
- UpdateInProgress
492+
- UpdateFailed
493+
- UpdateCompleted
494+
type: string
495+
required:
496+
- state
497+
type: object
498+
type: object
499+
served: true
500+
storage: false
501+
subresources:
502+
status: {}

helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -55,9 +55,14 @@ spec:
5555
- --metrics-bind-address=:8443
5656
- --leader-elect
5757
- --health-probe-bind-address=:8081
58+
- --webhook-cert-path=/tmp/k8s-webhook-server/serving-certs
5859
image: "{{ .Values.image.repository }}/hyperpod-inference-operator:{{ .Values.image.tag }}"
5960
imagePullPolicy: {{ .Values.image.pullPolicy }}
6061
name: manager
62+
ports:
63+
- containerPort: 9443
64+
name: webhook-server
65+
protocol: TCP
6166
securityContext:
6267
allowPrivilegeEscalation: false
6368
capabilities:
@@ -73,8 +78,10 @@ spec:
7378
httpGet:
7479
path: /healthz
7580
port: 8081
81+
initialDelaySeconds: 180
7682
failureThreshold: 120
7783
periodSeconds: 60
84+
timeoutSeconds: 5
7885
readinessProbe:
7986
httpGet:
8087
path: /readyz
@@ -90,6 +97,10 @@ spec:
9097
requests:
9198
cpu: 10m
9299
memory: 64Mi
100+
volumeMounts:
101+
- mountPath: /tmp/k8s-webhook-server/serving-certs
102+
name: webhook-certs
103+
readOnly: true
93104
env:
94105
- name: HYPERPOD_CLUSTER_ARN
95106
value: {{ .Values.hyperpodClusterArn }}
@@ -103,5 +114,15 @@ spec:
103114
value: {{ .Values.eksClusterName }}
104115
- name: TLS_CERTIFICATE_OUTPUT_S3URI
105116
value: {{ .Values.tlsCertificateS3Bucket }}
117+
- name: ENABLE_WEBHOOKS
118+
value: "{{ .Values.enableWebhooks }}"
119+
- name: CHART_VERSION
120+
value: {{ .Chart.Version | quote }}
121+
- name: APP_VERSION
122+
value: {{ .Chart.AppVersion | quote }}
106123
serviceAccountName: {{ .Values.namePrefix }}-controller-manager
107124
terminationGracePeriodSeconds: 10
125+
volumes:
126+
- name: webhook-certs
127+
secret:
128+
secretName: webhook-server-cert

0 commit comments

Comments
 (0)