kubernetes · volatilemolotov · Jun 26, 2025 · Sep 11, 2025 · Sep 29, 2025 · Oct 6, 2025
diff --git a/.gitignore b/.gitignore
@@ -43,3 +43,10 @@ cscope.*
 
 /bazel-*
 *.pyc
+
+# Helm chart dependencies cache
+**/Chart.lock
+**/charts/*.tgz
+
+# Helm chart output directory
+ai/ai-starter-kit/out
diff --git a/ai/ai-starter-kit/Makefile b/ai/ai-starter-kit/Makefile
@@ -0,0 +1,70 @@
+# Every target in this Makefile is a command rather than a file, so all of
+# them are declared phony. Without this, a stray file named after a target
+# (e.g. "destroy" or "start_gpu") would make that target look up to date and
+# its recipe would be skipped.
+.PHONY: check_hf_token check_OCI_target package_helm push_helm lint dep_update \
+        install install_gke install_gke_gpu start start_gpu uninstall destroy \
+        validate_jupyterhub validate_ray
+
+# Guard target: any goal that lists it as a prerequisite is aborted with an
+# error unless HF_TOKEN has a non-empty value. The $(error) only fires when
+# this target's recipe is actually expanded, so unrelated goals are unaffected.
+check_hf_token:
+ifeq ($(value HF_TOKEN),)
+	$(error HF_TOKEN is not set)
+endif
+
+# Guard target: any goal that lists it as a prerequisite is aborted with an
+# error unless OCI_HELM_TARGET has a non-empty value.
+check_OCI_target:
+ifeq ($(value OCI_HELM_TARGET),)
+	$(error OCI_HELM_TARGET is not set)
+endif
+
+# Chart sources, relative to the directory make is run from.
+chart_dir := helm-chart/ai-starter-kit
+
+# Package the chart into out/.
+package_helm:
+	helm package $(chart_dir)/ --destination out/
+
+# Push the packaged chart from out/ to the OCI location named by
+# OCI_HELM_TARGET (its presence is enforced by check_OCI_target).
+push_helm: check_OCI_target
+	helm push out/ai-starter-kit* oci://$$OCI_HELM_TARGET
+
+# Run helm's lint checks against the chart.
+lint:
+	helm lint $(chart_dir)
+
+# Refresh the chart's dependencies.
+dep_update:
+	helm dependency update $(chart_dir)
+
+# Shared install/upgrade command line for the three install targets.
+# $(1) is the values file name inside the chart directory. "$$HF_TOKEN" is
+# written with a doubled dollar so the shell, not make, expands it.
+helm_install = helm upgrade --install ai-starter-kit helm-chart/ai-starter-kit --set huggingface.token="$$HF_TOKEN" --timeout 10m -f helm-chart/ai-starter-kit/$(1)
+
+# Install or upgrade the release using values.yaml.
+install: check_hf_token
+	$(call helm_install,values.yaml)
+
+# Install or upgrade the release using values-gke.yaml.
+install_gke: check_hf_token
+	$(call helm_install,values-gke.yaml)
+
+# Install or upgrade the release using values-gke-gpu.yaml.
+install_gke_gpu: check_hf_token
+	$(call helm_install,values-gke-gpu.yaml)
+
+# Start minikube with the host directory /tmp/models-cache passed as a mount
+# (same path on both sides of the mount string). The directory is created
+# first so the mount source exists.
+start: models_cache := /tmp/models-cache
+start:
+	mkdir -p $(models_cache)
+	minikube start --cpus 4 --memory 15000 --mount --mount-string="$(models_cache):$(models_cache)"
+
+# Start minikube with the krunkit driver, mounting $HOME/models-cache from the
+# host at the same path.
+#
+# The shell variable must be written as $$HOME: make treats a single "$H" as
+# a reference to its own (undefined) variable H, which would turn "$HOME" into
+# the literal text "OME" and create/mount a relative "OME/models-cache".
+start_gpu:
+	mkdir -p "$$HOME/models-cache"
+	minikube start --driver krunkit --cpus 4 --memory 15000 --mount --mount-string="$$HOME/models-cache:$$HOME/models-cache"
+
+# Remove the helm release, then clean up the jupyter-user pod and the hub
+# database PVC.
+#
+# --ignore-not-found keeps the cleanup going when one of these objects is
+# already gone or was never created; without it a missing pod makes kubectl
+# exit non-zero and make stops before the PVC is deleted.
+uninstall:
+	helm uninstall ai-starter-kit
+	kubectl delete pod jupyter-user --ignore-not-found
+	kubectl delete pvc ai-starter-kit-jupyterhub-hub-db-dir --ignore-not-found
+
+# Delete the local minikube cluster (counterpart of the start/start_gpu targets).
+destroy:
+	minikube delete
+
+# Smoke-test JupyterHub through a temporary local port-forward on 8081.
+#
+# The recipe is a single shell invocation (lines joined with "\") so that PID
+# and STATUS stay in scope. Steps: list pods, wait for pods other than the
+# continuous-image-puller to become Ready, list pods and services, start the
+# port-forward in the background, run ci/test_hub.py against it, stop the
+# port-forward.
+#
+# The exit status of test_hub.py is saved before the port-forward is killed
+# and is used as the recipe's exit status; otherwise the status of "kill"
+# would be reported and a failing test would look like success.
+validate_jupyterhub:
+	kubectl get pods; \
+	kubectl wait --for=condition=Ready pods -l 'component!=continuous-image-puller' --timeout=1800s; \
+	kubectl get pods; \
+	kubectl get services; \
+	kubectl port-forward service/ai-starter-kit-jupyterhub-proxy-public 8081:80 & \
+	PID=$$!; \
+	echo "Port-forward PID=$${PID}"; \
+	sleep 5s; \
+	python3 ./ci/test_hub.py "127.0.0.1:8081"; \
+	STATUS=$$?; \
+	kill $$PID; \
+	exit $$STATUS
+
+# Smoke-test the Ray cluster through a temporary local port-forward on 8265.
+#
+# The recipe is a single shell invocation (lines joined with "\") so that PID
+# and STATUS stay in scope. Steps: wait for the pods created by the kuberay
+# operator to become Ready, list pods and services, start the port-forward in
+# the background, submit a job that prints the cluster resources, stop the
+# port-forward.
+#
+# The exit status of "ray job submit" is saved before the port-forward is
+# killed and is used as the recipe's exit status; otherwise the status of
+# "kill" would be reported and a failed job submission would look like success.
+validate_ray:
+	kubectl wait --for=condition=Ready pods -l 'app.kubernetes.io/created-by=kuberay-operator' --timeout=1800s; \
+	kubectl get pods; \
+	kubectl get services; \
+	kubectl port-forward service/ai-starter-kit-kuberay-head-svc 8265:8265 & \
+	PID=$$!; \
+	sleep 10s; \
+	ray job submit --address=http://127.0.0.1:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"; \
+	STATUS=$$?; \
+	kill $$PID; \
+	exit $$STATUS
diff --git a/ai/ai-starter-kit/ci/terraform/default_env.tfvars b/ai/ai-starter-kit/ci/terraform/default_env.tfvars
@@ -0,0 +1,9 @@
+# Input values for the Terraform configuration in this directory.
+# project_id and default_resource_name are left blank here and have to be
+# supplied; default_resource_name is the fallback for the names below.
+project_id            = ""
+default_resource_name = ""
+
+cluster_name      = "" # Leave empty to fall back to default_resource_name
+cluster_location  = "us-central1"
+private_cluster   = false
+autopilot_cluster = true
+
+service_account_name  = "" # Leave empty to fall back to default_resource_name
diff --git a/ai/ai-starter-kit/ci/terraform/main.tf b/ai/ai-starter-kit/ci/terraform/main.tf
@@ -0,0 +1,108 @@
+# Provider requirements of this root module.
+#
+# The google provider (data sources) and the kubernetes provider (aliased
+# provider block passed to the workload-identity module) are used in this
+# file, so they are declared explicitly rather than left to Terraform's
+# implicit hashicorp/* lookup. No version constraint is imposed on them here.
+terraform {
+  required_providers {
+    google = {
+      source = "hashicorp/google"
+    }
+    kubernetes = {
+      source = "hashicorp/kubernetes"
+    }
+    kubectl = {
+      source  = "gavinbunney/kubectl"
+      version = ">= 1.19.0"
+    }
+  }
+}
+# Client configuration of the google provider; its access_token is used to
+# authenticate the kubernetes and kubectl providers configured in this file.
+data "google_client_config" "default" {}
+
+
+# Project lookup; the project number is needed to build the Connect Gateway
+# URL used for private clusters.
+data "google_project" "project" {
+  project_id = var.project_id
+}
+
+
+locals {
+  # An empty cluster_name falls back to the shared default resource name.
+  cluster_name = var.cluster_name == "" ? var.default_resource_name : var.cluster_name
+}
+
+# GKE cluster on the existing "default" network/subnetwork with one
+# autoscaling L4 GPU node pool (1-3 nodes, 2 GPUs per node).
+# NOTE(review): the module source tracks the moving "main" branch; consider
+# pinning ?ref= to a tag or commit so runs are reproducible.
+module "gke_cluster" {
+  source = "github.com/ai-on-gke/common-infra/common/infrastructure?ref=main"
+
+  project_id        = var.project_id
+  cluster_name      = local.cluster_name
+  cluster_location  = var.cluster_location
+  autopilot_cluster = var.autopilot_cluster
+  private_cluster   = var.private_cluster
+  create_network    = false
+  network_name      = "default"
+  subnetwork_name   = "default"
+  enable_gpu        = true
+  gpu_pools = [
+    {
+      name               = "gpu-pool-l4"
+      machine_type       = "g2-standard-24"
+      # NOTE(review): this zone is hard-coded and only matches the
+      # us-central1 cluster_location from default_env.tfvars.
+      node_locations     = "us-central1-a" ## comment this line out to autofill node_locations based on cluster_location
+      autoscaling        = true
+      min_count          = 1
+      max_count          = 3
+      disk_size_gb       = 100
+      disk_type          = "pd-balanced"
+      enable_gcfs        = true
+      logging_variant    = "DEFAULT"
+      accelerator_count  = 2
+      accelerator_type   = "nvidia-l4"
+      gpu_driver_version = "DEFAULT"
+    }
+  ]
+  ray_addon_enabled = false
+}
+
+locals {
+  # An empty cluster_membership_id falls back to the cluster name.
+  cluster_membership_id = var.cluster_membership_id != "" ? var.cluster_membership_id : local.cluster_name
+
+  # API endpoint when going through the Connect Gateway membership.
+  connect_gateway_host = "https://connectgateway.googleapis.com/v1/projects/${data.google_project.project.number}/locations/${var.cluster_location}/gkeMemberships/${local.cluster_membership_id}"
+
+  # API endpoint when talking to the cluster endpoint directly.
+  direct_host = "https://${module.gke_cluster.endpoint}"
+
+  # Private clusters use the Connect Gateway URL, all others the direct one.
+  host = var.private_cluster ? local.connect_gateway_host : local.direct_host
+}
+
+# Kubernetes provider for the starter-kit cluster, authenticated with the
+# google client access token.
+provider "kubernetes" {
+  alias = "ai_starter_kit"
+
+  host  = local.host
+  token = data.google_client_config.default.access_token
+
+  # The cluster CA is only supplied for the direct (non-private) endpoint.
+  cluster_ca_certificate = !var.private_cluster ? base64decode(module.gke_cluster.ca_certificate) : ""
+
+  # For private clusters an exec block with the gcloud auth plugin is added.
+  dynamic "exec" {
+    for_each = var.private_cluster ? ["gke-gcloud-auth-plugin"] : []
+    content {
+      api_version = "client.authentication.k8s.io/v1beta1"
+      command     = exec.value
+    }
+  }
+}
+
+locals {
+  # An empty service_account_name falls back to the shared default resource name.
+  service_account_name = var.service_account_name == "" ? var.default_resource_name : var.service_account_name
+}
+
+
+# Workload-identity setup named local.service_account_name in the "default"
+# namespace, granted roles/storage.objectUser on the project. It uses the
+# aliased kubernetes provider and is created only after the cluster module.
+module "ai_starter_kit_workload_identity" {
+  source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity"
+
+  providers = {
+    kubernetes = kubernetes.ai_starter_kit
+  }
+
+  project_id = var.project_id
+  name       = local.service_account_name
+  namespace  = "default"
+  roles      = ["roles/storage.objectUser"]
+
+  depends_on = [module.gke_cluster]
+}
+
+# kubectl provider for the starter-kit cluster, configured the same way as
+# the aliased kubernetes provider: explicit host and access token, the cluster
+# CA only for the direct (non-private) endpoint, and the gcloud auth plugin
+# exec block for private clusters.
+provider "kubectl" {
+  alias                  = "ai_starter_kit"
+  apply_retry_count      = 15
+  host                   = local.host
+  token                  = data.google_client_config.default.access_token
+  cluster_ca_certificate = var.private_cluster ? "" : base64decode(module.gke_cluster.ca_certificate)
+
+  # Connection details are fully specified above, so the local kubeconfig is
+  # not loaded; otherwise settings from whatever context happens to be active
+  # on the machine running Terraform could be merged into this configuration.
+  load_config_file = false
+
+  dynamic "exec" {
+    for_each = var.private_cluster ? [1] : []
+    content {
+      api_version = "client.authentication.k8s.io/v1beta1"
+      command     = "gke-gcloud-auth-plugin"
+    }
+  }
+}
diff --git a/ai/ai-starter-kit/ci/terraform/outputs.tf b/ai/ai-starter-kit/ci/terraform/outputs.tf
@@ -0,0 +1,15 @@
+
+# Resolved cluster name (explicit cluster_name or the default resource name).
+output "gke_cluster_name" {
+  description = "GKE cluster name"
+  value       = local.cluster_name
+}
+
+# Location the cluster was requested in, echoed back from the input variable.
+output "gke_cluster_location" {
+  description = "GKE cluster location"
+  value       = var.cluster_location
+}
+
+# Project ID echoed back from the input variable. The description previously
+# repeated the cluster-location text from the output above.
+output "project_id" {
+  value       = var.project_id
+  description = "Google Cloud project ID"
+}
diff --git a/ai/ai-starter-kit/ci/terraform/variables.tf b/ai/ai-starter-kit/ci/terraform/variables.tf
@@ -0,0 +1,26 @@
+variable "project_id" {
+  type        = string
+  description = "ID of the Google Cloud project the cluster is created in."
+}
+variable "default_resource_name" {
+  type        = string
+  description = "Fallback name used for the cluster and the service account when their own names are empty."
+}
+variable "cluster_name" {
+  type        = string
+  description = "GKE cluster name. Leave empty to use default_resource_name."
+  # Empty default so the documented fallback also works when the variable is omitted.
+  default     = ""
+}
+variable "cluster_location" {
+  type        = string
+  description = "Location of the GKE cluster, e.g. us-central1."
+}
+variable "autopilot_cluster" {
+  type        = bool
+  description = "Passed to the cluster module as autopilot_cluster."
+}
+variable "private_cluster" {
+  type        = bool
+  description = "Passed to the cluster module as private_cluster; when true the providers connect through the Connect Gateway."
+}
+variable "cluster_membership_id" {
+  type        = string
+  description = "require to use connectgateway for private clusters, default: cluster_name"
+  default     = ""
+}
+variable "service_account_name" {
+  type        = string
+  description = "Name for the workload-identity service account. Leave empty to use default_resource_name."
+  # Empty default so the documented fallback also works when the variable is omitted.
+  default     = ""
+}
diff --git a/ai/ai-starter-kit/ci/test_hub.py b/ai/ai-starter-kit/ci/test_hub.py
@@ -0,0 +1,59 @@
+import sys
+import requests
+from packaging.version import Version as V
+
+
+def test_hub_up(hub_url):
+    """
+    Checks that a plain GET on the hub's base URL returns a success status.
+
+    Args:
+        hub_url: Base URL of the hub including scheme, e.g. "http://127.0.0.1:8081".
+
+    Raises:
+        requests.HTTPError: if the response status is 4xx/5xx.
+        requests.Timeout: if the hub does not answer within 30 seconds.
+    """
+    # Without a timeout a stalled connection would block the run indefinitely.
+    r = requests.get(hub_url, timeout=30)
+    r.raise_for_status()
+    print("JupyterHub up.")
+
+
+def test_api_root(hub_url):
+    """
+    Tests the hub api's root endpoint (/hub/api). The hub's version should be
+    returned and must lie between 4 and 5.5 (inclusive).
+
+    A typical jupyterhub logging response to this test:
+
+        [I 2019-09-25 12:03:12.051 JupyterHub log:174] 200 GET /hub/api (<user>@<client-ip>) 9.57ms
+
+    Args:
+        hub_url: Base URL of the hub including scheme, e.g. "http://127.0.0.1:8081".
+
+    Raises:
+        requests.HTTPError: if the response status is 4xx/5xx.
+        requests.Timeout: if the hub does not answer within 30 seconds.
+        AssertionError: if the reported version is outside the accepted range.
+    """
+    # Without a timeout a stalled connection would block the run indefinitely.
+    r = requests.get(hub_url + "/hub/api", timeout=30)
+    r.raise_for_status()
+    info = r.json()
+    version = info["version"]
+    assert V("4") <= V(version) <= V("5.5"), f"version {version} must be between 4 and 5.5"
+    print("JupyterHub Rest API is working.")
+
+
+def test_hub_login(hub_url):
+    """
+    Tests logging in to the hub with the hard-coded test credentials.
+
+    The login page is fetched first so the session picks up the "_xsrf"
+    cookie (when the hub sets one); that value is then sent along with the
+    login POST. After a successful login the final URL, following redirects,
+    is expected to contain /hub/spawn-pending/user.
+
+    NOTE(review): the credentials presumably have to match the hub's dummy
+    authenticator settings (the original note points at
+    /jupyter_config/config.yaml) - confirm against the chart values.
+
+    Args:
+        hub_url: Base URL of the hub including scheme, e.g. "http://127.0.0.1:8081".
+
+    Raises:
+        requests.HTTPError: if either response status is 4xx/5xx.
+        requests.Timeout: if the hub does not answer a request within 30 seconds.
+        AssertionError: if the login does not end on the spawn-pending page.
+    """
+    username, password = "user", "sneakypass"
+    session = requests.Session()
+
+    # Without timeouts a stalled connection would block the run indefinitely.
+    response = session.get(hub_url + "/hub/login", timeout=30)
+    response.raise_for_status()
+
+    auth_params = {}
+    if "_xsrf" in session.cookies:
+        auth_params = {"_xsrf": session.cookies["_xsrf"]}
+
+    response = session.post(
+        hub_url + "/hub/login",
+        params=auth_params,
+        data={"username": username, "password": password},
+        allow_redirects=True,
+        timeout=30,
+    )
+    response.raise_for_status()
+    assert (hub_url + "/hub/spawn-pending/user") in response.url, f"unexpected response url: got {response.url}, expected {hub_url}/hub/spawn-pending/user"
+    print("JupyterHub login success.")
+
+
+# Script entry point: python3 test_hub.py <host:port>
+# Guarded so that merely importing this module neither reads sys.argv nor
+# contacts the hub; running it as a script behaves as before.
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        # Exit with a usage message instead of a bare IndexError traceback.
+        sys.exit(f"usage: {sys.argv[0]} <host:port>")
+
+    hub_url = "http://" + sys.argv[1]
+
+    test_hub_up(hub_url)
+    test_api_root(hub_url)
+    test_hub_login(hub_url)