Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions examples/gke/tgi-multi-gpu-deployment/config/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Deployment that runs one text-generation-inference (TGI) pod serving
# google/gemma-7b-it, sharded across 4 GPUs on a node labelled nvidia-l4.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tgi-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      # Must match the pod template label below; tgi-service selects on it too.
      app: tgi-server
  template:
    metadata:
      labels:
        app: tgi-server
        hf.co/model: google--gemma-7b-it
        hf.co/task: text-generation
    spec:
      containers:
        - name: tgi-container
          image: ghcr.io/huggingface/text-generation-inference:sha-f852190
          # image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310:latest
          resources:
            requests:
              nvidia.com/gpu: 4
              ephemeral-storage: "64Gi"
            # Extended resources such as nvidia.com/gpu cannot be requested
            # without a limit, and the limit must equal the request.
            limits:
              nvidia.com/gpu: 4
          env:
            - name: MODEL_ID
              value: "google/gemma-7b-it"
            # Shard count; set to the same number as the GPUs requested above.
            - name: NUM_SHARD
              value: "4"
            - name: MAX_BATCH_PREFILL_TOKENS
              value: "4096"
            - name: MAX_INPUT_TOKENS
              value: "4000"
            - name: MAX_TOTAL_TOKENS
              value: "4096"
            # Same port number that tgi-service targets (targetPort 8080).
            - name: PORT
              value: "8080"
            # Read from key `hf_token` of Secret `hf-secret`; that Secret must
            # exist in the same namespace before the pod can start.
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-secret
                  key: hf_token
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
      volumes:
        # Memory-backed emptyDir mounted at /dev/shm, capped at 1Gi.
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: 1Gi
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
17 changes: 17 additions & 0 deletions examples/gke/tgi-multi-gpu-deployment/config/ingress.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Ingress that forwards every HTTP path (prefix "/") to tgi-service on
# port 8080, handled by the "gce" ingress class.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: tgi-ingress
  annotations:
    kubernetes.io/ingress.class: 'gce'
spec:
  rules:
    # No host is set, so this rule is not restricted to a specific hostname.
    - http:
        paths:
          - pathType: Prefix
            path: /
            backend:
              service:
                name: tgi-service
                port:
                  number: 8080
12 changes: 12 additions & 0 deletions examples/gke/tgi-multi-gpu-deployment/config/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Cluster-internal Service that exposes pods labelled app=tgi-server on
# TCP port 8080 and forwards to port 8080 on the selected pods.
apiVersion: v1
kind: Service
metadata:
  name: tgi-service
spec:
  type: ClusterIP
  selector:
    app: tgi-server
  ports:
    - port: 8080
      targetPort: 8080
      protocol: TCP