Target stack: MicroK8s + Flux GitOps, NGINX Ingress, Prometheus/Grafana/Tempo/Loki/Mimir (or Grafana Alloy as the OTLP collector), NVIDIA GPU node(s) optional, LEANN workers for edge-aware RAG.
📌 Conventions
- Namespace: `ai`
- Your local registry: `registry.example.internal:32000`
- Alloy OTLP endpoint: `alloy.monitoring.svc.cluster.local:4317`
- Prometheus: `kube-prometheus-stack-prometheus.monitoring:9090` (adjust if different)
- Ingress class: `nginx`
- Apps: `gateway` (FastAPI/HTTP), `retriever` (RAG service), `leann-worker` (GPU optional). Replace image names and env with your own.
ai-system/
README.md
k8s/
base/
namespace.yaml
networkpolicy.yaml
config-routing.yaml # routing.yaml
config-flags.yaml # flags.yaml
config-otel-resource.yaml # common OTel resource attrs
config-schemas.yaml # JSON Schemas (tools & responses)
gateway/
deploy.yaml
service.yaml
hpa.yaml
ingress.yaml
retriever/
deploy.yaml
service.yaml
hpa.yaml
leann-worker/
deploy.yaml
flagger/
canary-gateway.yaml
loadtester.yaml
overlays/
prod/
kustomization.yaml
clusters/
prod/
ai-system-kustomization.yaml # Flux Kustomization
- Create namespace and baseline policies (`k8s/base/namespace.yaml`, `networkpolicy.yaml`).
- Add runtime configs (`config-routing.yaml`, `config-flags.yaml`, `config-otel-resource.yaml`).
- Add JSON Schemas into `config-schemas.yaml` and mount into apps.
- Deploy gateway (Deployment/Service/Ingress/HPA).
- Deploy retriever (Deployment/Service/HPA).
- Deploy LEANN worker (Deployment, GPU optional).
- Install Flagger + Loadtester (if not installed) and apply the `flagger/` canary for `gateway`.
- Verify Prometheus, Tempo, Loki are scraping/receiving (check Grafana dashboards).
- Wire Flux: create `clusters/prod/ai-system-kustomization.yaml` and commit.
- Canary a new version by changing the `gateway` image tag → watch Flagger promote.
# k8s/base/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: ai
labels:
istio-injection: "disabled" # adjust if using a mesh
---
# k8s/base/networkpolicy.yaml
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: default-deny
  namespace: ai
spec:
  podSelector: {}
  policyTypes: [Ingress, Egress]
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-from-ingress-and-monitoring
  namespace: ai
spec:
  podSelector: {}
  ingress:
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: ingress-nginx
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: monitoring
      ports:
        - protocol: TCP
          port: 8080
        - protocol: TCP
          port: 4317  # otlp/grpc if needed intra-ns
  egress:
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: monitoring
      ports:
        - protocol: TCP
          port: 4317  # Alloy OTLP gRPC
        - protocol: TCP
          port: 4318  # OTLP HTTP (optional)
    # DNS: kube-dns serves queries over UDP by default (TCP is only the
    # fallback for large responses). Allowing TCP 53 alone breaks name
    # resolution under the default-deny policy — allow both protocols.
    - to:
        - namespaceSelector: {}
      ports:
        - protocol: UDP
          port: 53
        - protocol: TCP
          port: 53
  # add your DB/vector store egress here
# k8s/base/config-routing.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: routing
namespace: ai
labels:
app.kubernetes.io/name: routing
annotations:
checksum/version: "v1"
data:
routing.yaml: |
default:
# task → model weights (example)
classify:
- model: local:small
weight: 80
- model: api:gpt-4o-mini
weight: 20
chat:
- model: api:gpt-4.1-mini
weight: 100
thresholds:
escalate_on_confidence_below: 0.6
      max_output_tokens: 1024
# k8s/base/config-flags.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: flags
namespace: ai
labels:
app.kubernetes.io/name: flags
annotations:
checksum/version: "v1"
data:
flags.yaml: |
experiments:
enable_rerank: true
enable_semantic_cache: true
guardrails:
strict_json_mode: true
      pii_redaction: true
# k8s/base/config-otel-resource.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: otel-resource
namespace: ai
labels:
app.kubernetes.io/name: otel-resource
# Shared OTel env to project consistent resource attributes
# Mount as envFrom in workloads
# Assumes Grafana Alloy as OTLP collector in monitoring ns
# Change endpoint if you send direct to Tempo/Prom
data:
OTEL_EXPORTER_OTLP_ENDPOINT: "http://alloy.monitoring.svc.cluster.local:4317"
OTEL_EXPORTER_OTLP_PROTOCOL: "grpc"
OTEL_RESOURCE_ATTRIBUTES: "service.namespace=ai,telemetry.distro=opentelemetry,k8s.cluster.name=microk8s,deployment.environment=prod"
OTEL_METRICS_EXPORTER: "otlp"
OTEL_TRACES_EXPORTER: "otlp"
  OTEL_LOGS_EXPORTER: "otlp"
# k8s/base/config-schemas.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: schemas
namespace: ai
labels:
app.kubernetes.io/name: schemas
data:
tool.search.jsonschema: |
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://example.ai/schemas/tool.search.json",
"title": "SearchToolResult",
"type": "object",
"required": ["results"],
"properties": {
"results": {
"type": "array",
"items": {
"type": "object",
"required": ["doc_id", "score", "title", "url"],
"properties": {
"doc_id": {"type": "string"},
"score": {"type": "number", "minimum": 0},
"title": {"type": "string"},
"url": {"type": "string", "format": "uri"}
}
},
"minItems": 1,
"maxItems": 10
}
}
}
response.answer.jsonschema: |
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://example.ai/schemas/response.answer.json",
"title": "Answer",
"type": "object",
"required": ["final", "citations", "trace_id", "schema_valid"],
"properties": {
"final": {"type": "string"},
"citations": {
"type": "array",
"items": {"type": "string"}
},
"trace_id": {"type": "string"},
"schema_valid": {"type": "boolean"},
"safety_flags": {
"type": "array",
"items": {"type": "string"}
}
}
    }

Mount these under `/app/schemas` (see Deployments) and validate model outputs at runtime.
# k8s/base/gateway/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: gateway
namespace: ai
labels:
app.kubernetes.io/name: gateway
spec:
replicas: 2
selector:
matchLabels:
app.kubernetes.io/name: gateway
template:
metadata:
labels:
app.kubernetes.io/name: gateway
annotations:
checksum/routing: "{{sha256 of routing.yaml if templated}}"
checksum/flags: "{{sha256 of flags.yaml if templated}}"
spec:
serviceAccountName: default
containers:
- name: gateway
image: registry.example.internal:32000/ai/gateway:1.0.0
imagePullPolicy: IfNotPresent
ports:
- name: http
containerPort: 8080
env:
- name: ROUTING_PATH
value: /config/routing.yaml
- name: FLAGS_PATH
value: /config/flags.yaml
- name: SCHEMAS_DIR
value: /schemas
- name: OTEL_SERVICE_NAME
value: gateway
- name: OTEL_TRACES_SAMPLER
value: parentbased_traceidratio
- name: OTEL_TRACES_SAMPLER_ARG
value: "0.1" # 10% sampling; tune as needed
envFrom:
- configMapRef:
name: otel-resource
volumeMounts:
- name: routing
mountPath: /config/routing.yaml
subPath: routing.yaml
readOnly: true
- name: flags
mountPath: /config/flags.yaml
subPath: flags.yaml
readOnly: true
- name: schemas
mountPath: /schemas
readOnly: true
resources:
requests:
cpu: "250m"
memory: "256Mi"
limits:
cpu: "1"
memory: "512Mi"
readinessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 5
periodSeconds: 10
livenessProbe:
httpGet:
path: /livez
port: 8080
initialDelaySeconds: 10
periodSeconds: 10
volumes:
- name: routing
configMap:
name: routing
items:
- key: routing.yaml
path: routing.yaml
- name: flags
configMap:
name: flags
items:
- key: flags.yaml
path: flags.yaml
- name: schemas
configMap:
          name: schemas
# k8s/base/gateway/service.yaml
apiVersion: v1
kind: Service
metadata:
name: gateway
namespace: ai
labels:
app.kubernetes.io/name: gateway
spec:
type: ClusterIP
selector:
app.kubernetes.io/name: gateway
ports:
- name: http
port: 8080
      targetPort: 8080
# k8s/base/gateway/ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: gateway
  namespace: ai
  annotations:
    # NOTE(review): the deprecated kubernetes.io/ingress.class annotation is
    # replaced by spec.ingressClassName below.
    # rewrite-target removed: with path "/" it rewrites every request path
    # (e.g. /healthz) to "/", breaking all sub-paths; it is only useful with
    # a regex path and a capture group (e.g. rewrite-target: /$1).
    nginx.ingress.kubernetes.io/proxy-body-size: "4m"
spec:
  ingressClassName: nginx
  rules:
    - host: gateway.example.internal  # change to your domain
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: gateway
                port:
                  number: 8080
# k8s/base/gateway/hpa.yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: gateway
namespace: ai
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: gateway
minReplicas: 2
maxReplicas: 10
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
        averageUtilization: 70
# k8s/base/retriever/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: retriever
namespace: ai
labels:
app.kubernetes.io/name: retriever
spec:
replicas: 2
selector:
matchLabels:
app.kubernetes.io/name: retriever
template:
metadata:
labels:
app.kubernetes.io/name: retriever
spec:
containers:
- name: retriever
image: registry.example.internal:32000/ai/retriever:1.0.0
imagePullPolicy: IfNotPresent
ports:
- name: http
containerPort: 8080
env:
- name: OTEL_SERVICE_NAME
value: retriever
envFrom:
- configMapRef:
name: otel-resource
resources:
requests:
cpu: "250m"
memory: "256Mi"
limits:
cpu: "1"
memory: "512Mi"
readinessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 5
periodSeconds: 10
livenessProbe:
httpGet:
path: /livez
port: 8080
initialDelaySeconds: 10
          periodSeconds: 10
# k8s/base/retriever/service.yaml
apiVersion: v1
kind: Service
metadata:
name: retriever
namespace: ai
labels:
app.kubernetes.io/name: retriever
spec:
type: ClusterIP
selector:
app.kubernetes.io/name: retriever
ports:
- name: http
port: 8080
      targetPort: 8080
# k8s/base/retriever/hpa.yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: retriever
namespace: ai
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: retriever
minReplicas: 1
maxReplicas: 6
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
        averageUtilization: 70

Label your GPU node:
kubectl label nodes <gpu-node> accelerator=nvidia
# k8s/base/leann-worker/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: leann-worker
namespace: ai
labels:
app.kubernetes.io/name: leann-worker
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: leann-worker
template:
metadata:
labels:
app.kubernetes.io/name: leann-worker
spec:
nodeSelector:
accelerator: nvidia # ensure scheduled on GPU node if available
containers:
- name: leann
image: registry.example.internal:32000/ai/leann-worker:1.0.0
imagePullPolicy: IfNotPresent
env:
- name: OTEL_SERVICE_NAME
value: leann-worker
envFrom:
- configMapRef:
name: otel-resource
resources:
requests:
cpu: "500m"
memory: "1Gi"
nvidia.com/gpu: 1
limits:
cpu: "2"
memory: "4Gi"
nvidia.com/gpu: 1
# If you use CUDA MPS (time-slicing), configure at the device-plugin/host level.
        # Optionally add CUDA envs here if your worker supports them.

Assumes Flagger and Prometheus are installed. If not, install Flagger (Helm) and apply `loadtester.yaml` below.
# k8s/base/flagger/canary-gateway.yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
  name: gateway
  namespace: ai
spec:
  provider: nginx
  # The nginx provider requires a reference to the Ingress fronting the
  # target Deployment; Flagger clones it to shift canary traffic.
  ingressRef:
    apiVersion: networking.k8s.io/v1
    kind: Ingress
    name: gateway
  # Prometheus address used by Flagger for built-in SLOs
  metricsServer: http://kube-prometheus-stack-prometheus.monitoring:9090
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: gateway
  service:
    port: 8080
    # Optional: enable session stickiness, headers, etc.
  analysis:
    interval: 1m
    threshold: 10   # max number of failed checks before rollback
    iterations: 10  # total checks before promotion
    maxWeight: 50   # canary traffic percentage
    stepWeight: 10  # increase every iteration
    metrics:
      - name: request-success-rate
        thresholdRange:
          min: 99
        interval: 1m
      - name: request-duration
        thresholdRange:
          max: 1000
        interval: 30s
    webhooks:
      - name: load-test
        type: pre-rollout
        url: http://flagger-loadtester.ai/
        # Must exceed the 45s "hey" run below; the previous 10s timeout
        # guaranteed the pre-rollout webhook failed and blocked every canary.
        timeout: 60s
        metadata:
          cmd: "hey -z 45s -q 10 -c 2 http://gateway.ai:8080/healthz"
# k8s/base/flagger/loadtester.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: flagger-loadtester
namespace: ai
labels:
app: flagger-loadtester
spec:
replicas: 1
selector:
matchLabels:
app: flagger-loadtester
template:
metadata:
labels:
app: flagger-loadtester
spec:
containers:
- name: loadtester
image: ghcr.io/fluxcd/flagger-loadtester:0.26.1
ports:
- name: http
containerPort: 8080
---
apiVersion: v1
kind: Service
metadata:
name: flagger-loadtester
namespace: ai
spec:
selector:
app: flagger-loadtester
ports:
- name: http
port: 80
      targetPort: 8080
# k8s/overlays/prod/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: ai
resources:
- ../../base/namespace.yaml
- ../../base/networkpolicy.yaml
- ../../base/config-routing.yaml
- ../../base/config-flags.yaml
- ../../base/config-otel-resource.yaml
- ../../base/config-schemas.yaml
- ../../base/gateway/deploy.yaml
- ../../base/gateway/service.yaml
- ../../base/gateway/ingress.yaml
- ../../base/gateway/hpa.yaml
- ../../base/retriever/deploy.yaml
- ../../base/retriever/service.yaml
- ../../base/retriever/hpa.yaml
- ../../base/leann-worker/deploy.yaml
- ../../base/flagger/canary-gateway.yaml
- ../../base/flagger/loadtester.yaml
# Optional common labels/annotations
commonLabels:
  app.kubernetes.io/part-of: ai-system
# clusters/prod/ai-system-kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: ai-system
namespace: flux-system
spec:
interval: 1m
path: ./k8s/overlays/prod
prune: true
sourceRef:
kind: GitRepository
name: flux-system
wait: true
timeout: 5m
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: gateway
namespace: ai
- apiVersion: apps/v1
kind: Deployment
name: retriever
namespace: ai
- apiVersion: apps/v1
kind: Deployment
name: leann-worker
      namespace: ai

- This template sets `OTEL_EXPORTER_OTLP_ENDPOINT` via `config-otel-resource.yaml` to the Alloy service in the `monitoring` namespace (`alloy.monitoring.svc.cluster.local:4317`).
- Ensure Alloy is configured to receive OTLP (traces/metrics/logs) and export traces to Tempo, metrics to Prometheus Remote Write (or scrape), and logs to Loki.
- If you don't run Alloy, point `OTEL_EXPORTER_OTLP_ENDPOINT` at Tempo (traces) and at your metrics/logs pipeline accordingly.
- Update the container tag for `gateway` in `k8s/base/gateway/deploy.yaml` (e.g., `1.0.1`).
- Commit & push to your Flux repo branch tracked by `flux-system`.
- Watch Flagger analysis: `kubectl -n ai describe canary gateway` and `kubectl -n ai get ev --watch`.
- If metrics hold, Flagger promotes; on failures, it rolls back.
# example only — expose /metrics from gateway first
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: gateway
namespace: monitoring
spec:
selector:
matchLabels:
app.kubernetes.io/name: gateway
namespaceSelector:
matchNames: ["ai"]
endpoints:
- port: http
path: /metrics
    interval: 30s

- Use SOPS or SealedSecrets for API keys (model providers, DBs). Reference via `envFrom.secretRef` in Deployments.
- Add `NetworkPolicy` egress destinations for your DB/vector store.
- For GPU: ensure the MicroK8s NVIDIA add-on/device-plugin is enabled and your images include CUDA when required.
- `kubectl -n ai get deploy,svc,hpa,ingress` → all Ready.
- Ingress DNS resolves; `curl -I http://gateway.example.internal/healthz` → 200.
- Tempo shows new traces for `gateway`/`retriever`.
- Flagger Loadtester reachable: `kubectl -n ai port-forward svc/flagger-loadtester 8080:80` → GET `/` works.

`gateway` serves HTTP on `:8080`, has `/healthz` and `/livez` endpoints, reads routing/flags, validates schemas in `/schemas`. `retriever` serves HTTP on `:8080`, traces outbound calls to the vector DB, etc. `leann-worker` runs a long-lived worker or server; remove GPU requests if CPU-only.
Target:

- New config repo: `ai-system-config` (RAG/LLM app manifests live here)
- Existing platform repo: your monorepo (contains Flux bootstrap + infra)
- Git SSH URL (cluster-internal): `ssh://git@gitea-ssh.git-local.svc.cluster.local:2222/m4xx3d0ut/ai-system-config.git`
This guide is copy‑pasteable. Follow the steps in order.
- Flux controllers are running in `flux-system` (source/kustomize/helm controllers).
- Ingress + monitoring stack (Prometheus/Tempo/Loki/Alloy) are deployed by your existing Kustomizations.
- (Optional) Flagger installed (CRDs + controller) if you plan to use the provided canary.
- Your workstation has a `kubectl` context pointing at the MicroK8s cluster.
You can create the repo in Gitea UI first (empty), or push and let Gitea create it if allowed.
# local working copy
mkdir -p ~/git/ai-system-config && cd ~/git/ai-system-config
git init
cat > README.md <<'EOF'
# ai-system-config
Flux-managed configs for the AI gateway, retriever, and LEANN worker.
EOF
# K8s layout (copy manifests from the LLMOps templates doc into these files)
mkdir -p k8s/base/{gateway,retriever,leann-worker,flagger} k8s/overlays/prod
ai-system-config/ README.md k8s/ base/ namespace.yaml networkpolicy.yaml config-routing.yaml # routing.yaml config-flags.yaml # flags.yaml config-otel-resource.yaml # common OTel resource attrs config-schemas.yaml # JSON Schemas (tools & responses) gateway/ deploy.yaml service.yaml hpa.yaml ingress.yaml retriever/ deploy.yaml service.yaml hpa.yaml leann-worker/ deploy.yaml flagger/ canary-gateway.yaml loadtester.yaml overlays/ prod/ kustomization.yaml
## 2) Base K8s Manifests
### 2.1 Namespace & NetworkPolicy
```yaml
# k8s/base/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: ai
labels:
istio-injection: "disabled" # adjust if using a mesh
---
# k8s/base/networkpolicy.yaml
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: default-deny
  namespace: ai
spec:
  podSelector: {}
  policyTypes: [Ingress, Egress]
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-from-ingress-and-monitoring
  namespace: ai
spec:
  podSelector: {}
  ingress:
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: ingress-nginx
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: monitoring
      ports:
        - protocol: TCP
          port: 8080
        - protocol: TCP
          port: 4317  # otlp/grpc if needed intra-ns
  egress:
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: monitoring
      ports:
        - protocol: TCP
          port: 4317  # Alloy OTLP gRPC
        - protocol: TCP
          port: 4318  # OTLP HTTP (optional)
    # DNS: kube-dns serves queries over UDP by default (TCP is only the
    # fallback for large responses). Allowing TCP 53 alone breaks name
    # resolution under the default-deny policy — allow both protocols.
    - to:
        - namespaceSelector: {}
      ports:
        - protocol: UDP
          port: 53
        - protocol: TCP
          port: 53
  # add your DB/vector store egress here
# k8s/base/config-routing.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: routing
namespace: ai
labels:
app.kubernetes.io/name: routing
annotations:
checksum/version: "v1"
data:
routing.yaml: |
default:
# task → model weights (example)
classify:
- model: local:small
weight: 80
- model: api:gpt-4o-mini
weight: 20
chat:
- model: api:gpt-4.1-mini
weight: 100
thresholds:
escalate_on_confidence_below: 0.6
      max_output_tokens: 1024
# k8s/base/config-flags.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: flags
namespace: ai
labels:
app.kubernetes.io/name: flags
annotations:
checksum/version: "v1"
data:
flags.yaml: |
experiments:
enable_rerank: true
enable_semantic_cache: true
guardrails:
strict_json_mode: true
      pii_redaction: true
# k8s/base/config-otel-resource.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: otel-resource
namespace: ai
labels:
app.kubernetes.io/name: otel-resource
# Shared OTel env to project consistent resource attributes
# Mount as envFrom in workloads
# Assumes Grafana Alloy as OTLP collector in monitoring ns
# Change endpoint if you send direct to Tempo/Prom
data:
OTEL_EXPORTER_OTLP_ENDPOINT: "http://alloy.monitoring.svc.cluster.local:4317"
OTEL_EXPORTER_OTLP_PROTOCOL: "grpc"
OTEL_RESOURCE_ATTRIBUTES: "service.namespace=ai,telemetry.distro=opentelemetry,k8s.cluster.name=microk8s,deployment.environment=prod"
OTEL_METRICS_EXPORTER: "otlp"
OTEL_TRACES_EXPORTER: "otlp"
  OTEL_LOGS_EXPORTER: "otlp"
# k8s/base/config-schemas.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: schemas
namespace: ai
labels:
app.kubernetes.io/name: schemas
data:
tool.search.jsonschema: |
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://example.ai/schemas/tool.search.json",
"title": "SearchToolResult",
"type": "object",
"required": ["results"],
"properties": {
"results": {
"type": "array",
"items": {
"type": "object",
"required": ["doc_id", "score", "title", "url"],
"properties": {
"doc_id": {"type": "string"},
"score": {"type": "number", "minimum": 0},
"title": {"type": "string"},
"url": {"type": "string", "format": "uri"}
}
},
"minItems": 1,
"maxItems": 10
}
}
}
response.answer.jsonschema: |
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://example.ai/schemas/response.answer.json",
"title": "Answer",
"type": "object",
"required": ["final", "citations", "trace_id", "schema_valid"],
"properties": {
"final": {"type": "string"},
"citations": {
"type": "array",
"items": {"type": "string"}
},
"trace_id": {"type": "string"},
"schema_valid": {"type": "boolean"},
"safety_flags": {
"type": "array",
"items": {"type": "string"}
}
}
    }

Mount these under `/app/schemas` (see Deployments) and validate model outputs at runtime.
# k8s/base/gateway/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: gateway
namespace: ai
labels:
app.kubernetes.io/name: gateway
spec:
replicas: 2
selector:
matchLabels:
app.kubernetes.io/name: gateway
template:
metadata:
labels:
app.kubernetes.io/name: gateway
annotations:
checksum/routing: "{{sha256 of routing.yaml if templated}}"
checksum/flags: "{{sha256 of flags.yaml if templated}}"
spec:
serviceAccountName: default
containers:
- name: gateway
image: registry.example.internal:32000/ai/gateway:1.0.0
imagePullPolicy: IfNotPresent
ports:
- name: http
containerPort: 8080
env:
- name: ROUTING_PATH
value: /config/routing.yaml
- name: FLAGS_PATH
value: /config/flags.yaml
- name: SCHEMAS_DIR
value: /schemas
- name: OTEL_SERVICE_NAME
value: gateway
- name: OTEL_TRACES_SAMPLER
value: parentbased_traceidratio
- name: OTEL_TRACES_SAMPLER_ARG
value: "0.1" # 10% sampling; tune as needed
envFrom:
- configMapRef:
name: otel-resource
volumeMounts:
- name: routing
mountPath: /config/routing.yaml
subPath: routing.yaml
readOnly: true
- name: flags
mountPath: /config/flags.yaml
subPath: flags.yaml
readOnly: true
- name: schemas
mountPath: /schemas
readOnly: true
resources:
requests:
cpu: "250m"
memory: "256Mi"
limits:
cpu: "1"
memory: "512Mi"
readinessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 5
periodSeconds: 10
livenessProbe:
httpGet:
path: /livez
port: 8080
initialDelaySeconds: 10
periodSeconds: 10
volumes:
- name: routing
configMap:
name: routing
items:
- key: routing.yaml
path: routing.yaml
- name: flags
configMap:
name: flags
items:
- key: flags.yaml
path: flags.yaml
- name: schemas
configMap:
          name: schemas
# k8s/base/gateway/service.yaml
apiVersion: v1
kind: Service
metadata:
name: gateway
namespace: ai
labels:
app.kubernetes.io/name: gateway
spec:
type: ClusterIP
selector:
app.kubernetes.io/name: gateway
ports:
- name: http
port: 8080
      targetPort: 8080
# k8s/base/gateway/ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: gateway
  namespace: ai
  annotations:
    # NOTE(review): the deprecated kubernetes.io/ingress.class annotation is
    # replaced by spec.ingressClassName below.
    # rewrite-target removed: with path "/" it rewrites every request path
    # (e.g. /healthz) to "/", breaking all sub-paths; it is only useful with
    # a regex path and a capture group (e.g. rewrite-target: /$1).
    nginx.ingress.kubernetes.io/proxy-body-size: "4m"
spec:
  ingressClassName: nginx
  rules:
    - host: gateway.example.internal  # change to your domain
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: gateway
                port:
                  number: 8080
# k8s/base/gateway/hpa.yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: gateway
namespace: ai
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: gateway
minReplicas: 2
maxReplicas: 10
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
        averageUtilization: 70
# k8s/base/retriever/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: retriever
namespace: ai
labels:
app.kubernetes.io/name: retriever
spec:
replicas: 2
selector:
matchLabels:
app.kubernetes.io/name: retriever
template:
metadata:
labels:
app.kubernetes.io/name: retriever
spec:
containers:
- name: retriever
image: registry.example.internal:32000/ai/retriever:1.0.0
imagePullPolicy: IfNotPresent
ports:
- name: http
containerPort: 8080
env:
- name: OTEL_SERVICE_NAME
value: retriever
envFrom:
- configMapRef:
name: otel-resource
resources:
requests:
cpu: "250m"
memory: "256Mi"
limits:
cpu: "1"
memory: "512Mi"
readinessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 5
periodSeconds: 10
livenessProbe:
httpGet:
path: /livez
port: 8080
initialDelaySeconds: 10
          periodSeconds: 10
# k8s/base/retriever/service.yaml
apiVersion: v1
kind: Service
metadata:
name: retriever
namespace: ai
labels:
app.kubernetes.io/name: retriever
spec:
type: ClusterIP
selector:
app.kubernetes.io/name: retriever
ports:
- name: http
port: 8080
      targetPort: 8080
# k8s/base/retriever/hpa.yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: retriever
namespace: ai
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: retriever
minReplicas: 1
maxReplicas: 6
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
        averageUtilization: 70

Label your GPU node:
kubectl label nodes <gpu-node> accelerator=nvidia
# k8s/base/leann-worker/deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: leann-worker
namespace: ai
labels:
app.kubernetes.io/name: leann-worker
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: leann-worker
template:
metadata:
labels:
app.kubernetes.io/name: leann-worker
spec:
nodeSelector:
accelerator: nvidia # ensure scheduled on GPU node if available
containers:
- name: leann
image: registry.example.internal:32000/ai/leann-worker:1.0.0
imagePullPolicy: IfNotPresent
env:
- name: OTEL_SERVICE_NAME
value: leann-worker
envFrom:
- configMapRef:
name: otel-resource
resources:
requests:
cpu: "500m"
memory: "1Gi"
nvidia.com/gpu: 1
limits:
cpu: "2"
memory: "4Gi"
nvidia.com/gpu: 1
# If you use CUDA MPS (time-slicing), configure at the device-plugin/host level.
        # Optionally add CUDA envs here if your worker supports them.

Assumes Flagger and Prometheus are installed. If not, install Flagger (Helm) and apply `loadtester.yaml` below.
# k8s/base/flagger/canary-gateway.yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
  name: gateway
  namespace: ai
spec:
  provider: nginx
  # The nginx provider requires a reference to the Ingress fronting the
  # target Deployment; Flagger clones it to shift canary traffic.
  ingressRef:
    apiVersion: networking.k8s.io/v1
    kind: Ingress
    name: gateway
  # Prometheus address used by Flagger for built-in SLOs
  metricsServer: http://kube-prometheus-stack-prometheus.monitoring:9090
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: gateway
  service:
    port: 8080
    # Optional: enable session stickiness, headers, etc.
  analysis:
    interval: 1m
    threshold: 10   # max number of failed checks before rollback
    iterations: 10  # total checks before promotion
    maxWeight: 50   # canary traffic percentage
    stepWeight: 10  # increase every iteration
    metrics:
      - name: request-success-rate
        thresholdRange:
          min: 99
        interval: 1m
      - name: request-duration
        thresholdRange:
          max: 1000
        interval: 30s
    webhooks:
      - name: load-test
        type: pre-rollout
        url: http://flagger-loadtester.ai/
        # Must exceed the 45s "hey" run below; the previous 10s timeout
        # guaranteed the pre-rollout webhook failed and blocked every canary.
        timeout: 60s
        metadata:
          cmd: "hey -z 45s -q 10 -c 2 http://gateway.ai:8080/healthz"
# k8s/base/flagger/loadtester.yaml
# Flagger's load-test helper: receives webhook calls from the Canary analysis
# and runs `hey`/`bash` commands against the target service.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: flagger-loadtester
  namespace: ai
  labels:
    app: flagger-loadtester
spec:
  replicas: 1
  selector:
    matchLabels:
      app: flagger-loadtester
  template:
    metadata:
      labels:
        app: flagger-loadtester
    spec:
      containers:
        - name: loadtester
          image: ghcr.io/fluxcd/flagger-loadtester:0.26.1
          ports:
            - name: http
              containerPort: 8080
---
# ClusterIP Service so the Canary webhook URL http://flagger-loadtester.ai/
# resolves inside the cluster.
apiVersion: v1
kind: Service
metadata:
  name: flagger-loadtester
  namespace: ai
spec:
  selector:
    app: flagger-loadtester
  ports:
    - name: http
      port: 80
      targetPort: 8080

# k8s/overlays/prod/kustomization.yaml
# Prod overlay: aggregates every base manifest and pins the `ai` namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: ai
resources:
  - ../../base/namespace.yaml
  - ../../base/networkpolicy.yaml
  - ../../base/config-routing.yaml
  - ../../base/config-flags.yaml
  - ../../base/config-otel-resource.yaml
  - ../../base/config-schemas.yaml
  - ../../base/gateway/deploy.yaml
  - ../../base/gateway/service.yaml
  - ../../base/gateway/ingress.yaml
  - ../../base/gateway/hpa.yaml
  - ../../base/retriever/deploy.yaml
  - ../../base/retriever/service.yaml
  - ../../base/retriever/hpa.yaml
  - ../../base/leann-worker/deploy.yaml
  - ../../base/flagger/canary-gateway.yaml
  - ../../base/flagger/loadtester.yaml
# Optional common labels/annotations.
# NOTE(review): `commonLabels` is deprecated in Kustomize v5+; prefer the
# `labels` field when your kustomize version supports it.
commonLabels:
  app.kubernetes.io/part-of: ai-system

git add .
git commit -m "feat: bootstrap ai-system-config (base + prod overlay)"
# Assign the variable in the current shell first — a one-line `VAR=… git remote
# add origin "$VAR"` would expand "$VAR" to empty, because the env-prefix only
# applies to the child process, not to the shell doing the expansion.
REPO_SSH="ssh://git@gitea-ssh.git-local.svc.cluster.local:2222/m4xx3d0ut/ai-system-config.git"
git remote add origin "$REPO_SSH"
git branch -M main
git push -u origin main
> If push fails because the repo doesn’t yet exist, create it in Gitea UI and try again.
---
## 2) Generate SSH deploy key & **known\_hosts** (cluster‑internal)
Flux needs an SSH key **and** the host key of your in‑cluster Gitea SSH service.
### 2.1 Create a dedicated keypair (workstation)
```bash
cd ~/git/ai-system-config
ssh-keygen -t ed25519 -C "flux-ai-system" -f ./flux-ai-ai-system -N ''
Add the public key flux-ai-ai-system.pub as a Deploy Key (read‑only) in the ai-system-config repo settings in Gitea.
Your workstation probably can’t resolve *.svc.cluster.local. Generate the host key from a pod that can:
# Use the source-controller pod (Alpine-based) to scan the SSH service
kubectl -n flux-system exec deploy/source-controller -- sh -c \
"apk add -q openssh-client || true; ssh-keyscan -p 2222 gitea-ssh.git-local.svc.cluster.local" \
  | tee known_hosts

> Alternative: run a short-lived toolbox pod (`kubectl run -it toolbox --rm --image=alpine -- sh`) and run `ssh-keyscan -p 32222 gitea.example.internal` inside it, then copy the output.
kubectl -n flux-system create secret generic ai-system-ssh \
--from-file=identity=./flux-ai-ai-system \
--from-file=identity.pub=./flux-ai-ai-system.pub \
  --from-file=known_hosts=./known_hosts

Create two files in your platform-config repo so Flux can reconcile the new repo.
clusters/microk8s-prod/flux-system/gitrepository-ai-system.yaml
# Flux source pointing at the in-cluster Gitea repo over SSH; auth comes from
# the ai-system-ssh secret (deploy key + pinned known_hosts).
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
  name: ai-system
  namespace: flux-system
spec:
  interval: 1m
  url: ssh://git@gitea-ssh.git-local.svc.cluster.local:2222/m4xx3d0ut/ai-system-config.git
  ref:
    branch: main
  secretRef:
    name: ai-system-ssh

`clusters/microk8s-prod/kustomizations/ai-system.yaml`
# Flux Kustomization that reconciles the prod overlay from the ai-system repo.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: ai-system
  namespace: flux-system
spec:
  interval: 1m
  sourceRef:
    kind: GitRepository
    name: ai-system
  path: ./k8s/overlays/prod
  prune: true
  wait: true
  timeout: 5m
  # Bring up CRDs (Flagger, Prometheus) and core infra before this stack.
  dependsOn:
    - name: infra-core
    - name: infra-crds
    - name: infra-policies
  # Reconciliation is only Ready once these Deployments are healthy.
  healthChecks:
    - apiVersion: apps/v1
      kind: Deployment
      name: gateway
      namespace: ai
    - apiVersion: apps/v1
      kind: Deployment
      name: retriever
      namespace: ai
    - apiVersion: apps/v1
      kind: Deployment
      name: leann-worker
      namespace: ai

Add the new files to your existing aggregator kustomization so Flux applies them:
clusters/microk8s-prod/kustomizations/kustomization.yaml (example)
# Aggregator kustomization: every Flux Kustomization file listed here is
# applied by the cluster's root reconciliation.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - infra-core.yaml
  - infra-crds.yaml
  - infra-policies.yaml
  - apps.yaml
  - ai-system.yaml # <— add this line

Commit and push the monorepo changes.
cd ~/git/platform-config
git add clusters/microk8s-prod/flux-system/gitrepository-ai-system.yaml \
clusters/microk8s-prod/kustomizations/ai-system.yaml \
clusters/microk8s-prod/kustomizations/kustomization.yaml
git commit -m "flux: add ai-system GitRepository + Kustomization (split repo)"
git push

# Check the source and kustomization
kubectl -n flux-system get gitrepositories,kustomizations
kubectl -n flux-system describe gitrepository ai-system
kubectl -n flux-system describe kustomization ai-system
# Tail events for fast feedback
kubectl -n flux-system get events --sort-by=.lastTimestamp | tail -n 50
# Verify workloads
kubectl -n ai get deploy,svc,hpa,ingress
# (Optional) check Flagger canary status if installed
kubectl -n ai describe canary gateway || true

If the GitRepository is failing:
- host key mismatch → regenerate
known_hostsfrom inside cluster and recreate the secret. - permission denied → ensure deploy key is added to the repo and has Read access.
- network → ensure
flux-systempods can resolve and reachgitea-ssh.git-local.svc.cluster.local:2222.
If you encrypt secrets in ai-system-config:
- Reuse the same age keypair that your monorepo uses.
- Commit
.sops.yamlinai-system-configwith the public key. - Ensure the private key is present in
flux-systemas a secret (already in your monorepo). - Keep
GitRepositoryas-is; Flux will decrypt during apply.
To pin releases, change the GitRepository to use a tag instead of a branch.
spec:
ref:
    tag: v1.0.0

Release flow:
- Prepare changes on
maininai-system-config→ create tagv1.0.0→ update or let an automated policy bump the tag in the monorepogitrepository-ai-system.yaml→ Flux promotes.
- Edit
k8s/base/gateway/deploy.yamlimage tag inai-system-config(e.g.,1.0.1). - Commit & push to
main. - Watch Flagger analysis:
kubectl -n ai get canary
kubectl -n ai describe canary gateway
kubectl -n ai get events --watch

On success Flagger shifts traffic and promotes; otherwise it rolls back.
-
GitRepository stuck
auth/ssh: handshake failed: wrong key in secret or SSH service not reachable; verify with:kubectl -n flux-system exec deploy/source-controller -- sh -c \ "apk add -q openssh-client || true; ssh -p 2222 -o StrictHostKeyChecking=yes git@gitea-ssh.git-local.svc.cluster.local -T" || true
-
Kustomization
not ready: rundescribeto see the failing object; often missing CRDs (Flagger, Prometheus). Ensure yourdependsOnbrings them up first. -
Ingress 404: confirm DNS for
gateway.example.internal(or your host), check NGINX Ingress logs, and that the Servicegatewayis Healthy. -
No traces/metrics: verify Alloy OTLP endpoint in
config-otel-resource.yamland NetworkPolicy egress paths tomonitoringns.
- Add
k8s/overlays/stagingand a second FluxKustomizationpointing to it for multi‑env. - Add a ServiceMonitor for
/metricsonce the app exposes them. - Wire OPA/Cedar policies and quotas in a dedicated
policies/dir withinai-system-configif needed.
You now have a clean split‑repo setup: platform Flux config stays in the monorepo, while ai-system-config owns the AI stack. Flux pulls via in‑cluster SSH at gitea-ssh.git-local.svc.cluster.local:2222, with a scoped deploy key and pinned known_hosts. Promotion and rollback are controlled from the monorepo by tweaking the GitRepository ref, while day‑to‑day app evolution happens in the dedicated repo.
| Symptom (log/error) | Why (on SM 7.5) | Fix |
|---|---|---|
| `TopPSamplingFromProbs … too many resources requested for launch` | FlashInfer sampling kernels can over-allocate on Turing | Disable FlashInfer: `VLLM_USE_FLASHINFER=0` (or `VLLM_USE_FLASHINFER_SAMPLER=0`) |
| `Cannot use FA version 2 … compute capability >= 8` | FlashAttention v2 requires Ampere+ | Expected on Turing; vLLM falls back. No action needed. |
| `Your device … doesn't support torch.bfloat16` | Turing has no BF16 | Force FP16: add `--dtype float16` |
| `ValueError: invalid literal for int() with base 10: 'false'` | vLLM parses some envs as ints, not booleans | Use `"0"`/`"1"`, not `"false"`/`"true"` |
| `VLLM_PORT 'tcp://…' appears to be a URI` | K8s Service env injection collides with vLLM's `VLLM_PORT` | `enableServiceLinks: false` and/or set env `VLLM_PORT="39000"` |
| Name resolution / HF download errors | NetPol default-deny or DNS not allowed | Allow UDP/TCP 53 egress to DNS + 80/443 egress for vllm/gateway |
| Pod restarts during warmup/compile | Probes too aggressive while vLLM JITs | Add startupProbe on /health, give it ~5 min |
| vLLM Pending | GPU scheduling | Label node accelerator=nvidia, have NVIDIA device plugin, requests/limits include nvidia.com/gpu: "1" |
Drop this into your Deployment container: vllm (it matches what worked for you):
# container: vllm
image: vllm/vllm-openai:latest-x86_64
args:
  # Each flag and its value must be its own list item. The original
  # `- --port; "8000"` form passes the single string `--port; "8000"` as one
  # bogus argument, which vLLM's CLI parser rejects.
  - "--port"
  - "8000"
  - "--model"
  - "zai-org/glm-edge-1.5b-chat"
  - "--download-dir"
  - "/models"
  - "--tensor-parallel-size"
  - "1"
  - "--dtype"
  - "float16" # no BF16 on SM 7.5
env:
  - name: HUGGING_FACE_HUB_TOKEN
    valueFrom:
      secretKeyRef:
        name: hf-token
        key: HUGGING_FACE_HUB_TOKEN
        optional: true
  - { name: VLLM_PORT, value: "39000" }        # avoid K8s URI collision
  - { name: VLLM_USE_FLASHINFER, value: "0" }  # disable FlashInfer sampler on Turing
ports:
  - { name: openai, containerPort: 8000 }
resources:
  requests: { cpu: "4", memory: "20Gi", nvidia.com/gpu: "1" }
  limits: { cpu: "8", memory: "40Gi", nvidia.com/gpu: "1" }
volumeMounts:
  - { name: hf-cache, mountPath: /models }
# be patient during compile/warmup
startupProbe:
  httpGet: { path: /health, port: openai }
  periodSeconds: 5
  timeoutSeconds: 2
  failureThreshold: 60 # ~5 min
readinessProbe:
  httpGet: { path: /health, port: openai }
  initialDelaySeconds: 10
  periodSeconds: 5
  timeoutSeconds: 1
livenessProbe:
  httpGet: { path: /health, port: openai }
  initialDelaySeconds: 60
  periodSeconds: 10
  timeoutSeconds: 2

And at the pod level:
# Pod-level settings for the vllm Deployment.
spec:
  enableServiceLinks: false # stop injecting VLLM_* from Service
  # Requires the "nvidia" RuntimeClass (NVIDIA container runtime) on the node.
  runtimeClassName: nvidia
  nodeSelector: { accelerator: nvidia }

> Tip: pin an image tag you trust instead of `:latest` once you're happy (e.g., `v0.4x.y`).
Allow the basics so vLLM can resolve and fetch models, and so gateway ↔ vLLM works:
# Default deny both Ingress and Egress for every pod in the namespace —
# the safety baseline that the allow-policies below punch holes through.
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: default-deny
  namespace: ai
spec:
  podSelector: {}
  policyTypes:
    - Ingress
    - Egress
# DNS egress for all pods (fast unbreak); tighten to CoreDNS later if desired.
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-dns-any
  namespace: ai
spec:
  podSelector: {}
  policyTypes:
    - Egress
  egress:
    - to:
        - ipBlock:
            cidr: 0.0.0.0/0
      ports:
        - protocol: UDP
          port: 53
        - protocol: TCP
          port: 53
# Internet egress (80/443) for vllm/gateway so they can fetch models etc.
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-egress-internet-for-ai
  namespace: ai
spec:
  podSelector:
    matchExpressions:
      - key: app.kubernetes.io/name
        operator: In
        values: ["vllm", "gateway"]
  policyTypes:
    - Egress
  egress:
    - to:
        - ipBlock:
            cidr: 0.0.0.0/0
      ports:
        - protocol: TCP
          port: 443
        - protocol: TCP
          port: 80
# east-west inside ai ns
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata: { name: allow-ai-eastwest, namespace: ai }
spec:
  # Empty podSelector = every pod in the `ai` namespace.
  podSelector: {}
  policyTypes: [Ingress, Egress]
  # Bare podSelector rules are namespace-scoped, so this allows all
  # pod-to-pod traffic within `ai` only.
  ingress: [ { from: [ { podSelector: {} } ] } ]
  egress: [ { to: [ { podSelector: {} } ] } ]

(You can later tighten DNS to your CoreDNS labels/IPs.)
vLLM serves GET /health. Use:
- startupProbe:
/health,failureThreshold: 60,periodSeconds: 5 - readinessProbe:
/health - livenessProbe:
/health
This avoids “probe-kills” while vLLM JIT-compiles graphs (~20–30 s on your logs).
- FlashInfer disabled on Turing:
VLLM_USE_FLASHINFER=0 - BF16 disabled:
--dtype float16 - Service env collision avoided:
enableServiceLinks: false(and/or setVLLM_PORT="39000") - GPU scheduling: node labeled
accelerator=nvidia, device plugin present,nvidia.com/gpu: "1"requests/limits - NetPol: DNS (53), HTTPS (443) egress allowed for
vllm(and gateway) - HF token (if the repo requires it) in
Secret hf-token - Probes: added startup/readiness/liveness on
/health - (Optional) Compile quirks: if you still see instability, add
--enforce-eagerand/orVLLM_COMPILE_LEVEL=0
Using FlexAttention backend…+FA version 2 is not supported…→ Normal on SM 7.5; it will fall back.FlashInfer is available, but it is not enabled. Falling back…→ Good (what we want).Available KV cache memory …+ a suggested--kv-cache-memory→ Perf tuning only; not a failure.pynvml deprecated→ Noise; harmless.
# watch rollout
kubectl -n ai rollout status deploy/vllm
kubectl -n ai get pods -l app.kubernetes.io/name=vllm -w
# tail logs
kubectl -n ai logs deploy/vllm -c vllm | egrep -i 'flashinfer|flex|fa |dtype|health|error|compile'
# verify env collision is gone
kubectl -n ai exec deploy/vllm -c vllm -- sh -lc 'printenv | grep ^VLLM_PORT || echo "no VLLM_PORT (ok)"'
# smoke the OpenAI shim
kubectl -n ai exec deploy/gateway -- curl -s http://vllm.ai.svc.cluster.local:8000/v1/models

apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm
  namespace: ai
spec:
  replicas: 1
  # On a single-GPU node the default RollingUpdate strategy deadlocks: the
  # replacement pod stays Pending waiting for nvidia.com/gpu while the old
  # pod still holds it. Recreate tears down the old pod first.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: vllm
  template:
    metadata:
      labels:
        app.kubernetes.io/name: vllm
    spec:
      enableServiceLinks: false # stop injecting VLLM_* env from Services
      runtimeClassName: nvidia
      nodeSelector:
        accelerator: nvidia
      containers:
        - name: vllm
          image: vllm/vllm-openai:latest-x86_64
          args: ["--port", "8000", "--model", "zai-org/glm-edge-1.5b-chat", "--download-dir", "/models", "--tensor-parallel-size", "1", "--dtype", "float16"]
          env:
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: HUGGING_FACE_HUB_TOKEN
                  optional: true
            - name: VLLM_PORT
              value: "39000" # avoid K8s Service env URI collision
            - name: VLLM_USE_FLASHINFER
              value: "0" # disable FlashInfer sampler on Turing
          ports:
            - name: openai
              containerPort: 8000
          resources:
            requests:
              cpu: "4"
              memory: "20Gi"
              nvidia.com/gpu: "1"
            limits:
              cpu: "8"
              memory: "40Gi"
              nvidia.com/gpu: "1"
          volumeMounts:
            - name: hf-cache
              mountPath: /models
          # Generous startup window while vLLM JIT-compiles / warms up.
          startupProbe:
            httpGet: { path: /health, port: openai }
            periodSeconds: 5
            timeoutSeconds: 2
            failureThreshold: 60 # ~5 min
          readinessProbe:
            httpGet: { path: /health, port: openai }
            initialDelaySeconds: 10
            periodSeconds: 5
            timeoutSeconds: 1
          livenessProbe:
            httpGet: { path: /health, port: openai }
            initialDelaySeconds: 60
            periodSeconds: 10
            timeoutSeconds: 2
      volumes:
        - name: hf-cache
          persistentVolumeClaim:
            claimName: hf-cache