Skip to content
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,10 @@ cscope.*

/bazel-*
*.pyc

# Helm chart dependencies cache
**/Chart.lock
**/charts/*.tgz

# Helm chart output directory
ai/ai-starter-kit/out
70 changes: 70 additions & 0 deletions ai/ai-starter-kit/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Every target in this Makefile is a command rather than a file, so all of
# them are declared phony. Without this, a file or directory that happens to
# share a target's name would make that target silently do nothing.
.PHONY: check_hf_token check_OCI_target package_helm push_helm lint dep_update \
        install install_gke install_gke_gpu start start_gpu uninstall destroy \
        validate_jupyterhub validate_ray

# Guard target: stops make with an error when HF_TOKEN has no value.
# $(value ...) tests the raw, unexpanded value, which is what `ifndef` tests.
check_hf_token:
	$(if $(value HF_TOKEN),,$(error HF_TOKEN is not set))

# Guard target: stops make with an error when OCI_HELM_TARGET has no value.
# $(value ...) tests the raw, unexpanded value, which is what `ifndef` tests.
check_OCI_target:
	$(if $(value OCI_HELM_TARGET),,$(error OCI_HELM_TARGET is not set))

# Package the chart directory helm-chart/ai-starter-kit/ and write the
# resulting archive into out/.
package_helm:
	helm package --destination out/ helm-chart/ai-starter-kit/

# Push the packaged chart(s) matching out/ai-starter-kit* to the OCI registry
# named by OCI_HELM_TARGET (validated first by check_OCI_target). The value is
# read by the shell from the environment, hence the doubled `$`.
push_helm: check_OCI_target
	helm push out/ai-starter-kit* oci://$${OCI_HELM_TARGET}

# Run `helm lint` against the ai-starter-kit chart.
lint:
	helm lint helm-chart/ai-starter-kit

# Run `helm dependency update` for the ai-starter-kit chart.
dep_update:
	helm dependency update helm-chart/ai-starter-kit

# Install (or upgrade in place) the ai-starter-kit release with the chart's
# values.yaml. HF_TOKEN is validated by check_hf_token and read by the shell
# from the environment, then passed to the chart as huggingface.token.
install: check_hf_token
	helm upgrade --install ai-starter-kit helm-chart/ai-starter-kit \
		--set huggingface.token="$$HF_TOKEN" \
		--timeout 10m \
		-f helm-chart/ai-starter-kit/values.yaml

# Install (or upgrade in place) the ai-starter-kit release with the
# values-gke.yaml overrides. HF_TOKEN is validated by check_hf_token and
# passed to the chart as huggingface.token.
install_gke: check_hf_token
	helm upgrade --install ai-starter-kit helm-chart/ai-starter-kit \
		--set huggingface.token="$$HF_TOKEN" \
		--timeout 10m \
		-f helm-chart/ai-starter-kit/values-gke.yaml

# Install (or upgrade in place) the ai-starter-kit release with the
# values-gke-gpu.yaml overrides. HF_TOKEN is validated by check_hf_token and
# passed to the chart as huggingface.token.
install_gke_gpu: check_hf_token
	helm upgrade --install ai-starter-kit helm-chart/ai-starter-kit \
		--set huggingface.token="$$HF_TOKEN" \
		--timeout 10m \
		-f helm-chart/ai-starter-kit/values-gke-gpu.yaml

# Create the host directory /tmp/models-cache and start minikube with it
# mounted at the same path inside the cluster node.
start:
	mkdir -p /tmp/models-cache
	minikube start --cpus 4 --memory 15000 \
		--mount --mount-string="/tmp/models-cache:/tmp/models-cache"

# Create $HOME/models-cache on the host and start minikube with the krunkit
# driver, mounting that directory at the same path inside the cluster node.
#
# `$$HOME` is required: a single `$HOME` is expanded by make as `$(H)OME`,
# i.e. the literal text "OME", so the commands would act on OME/models-cache.
# The paths are quoted in case the home directory contains spaces.
start_gpu:
	mkdir -p "$$HOME/models-cache"
	minikube start --driver krunkit --cpus 4 --memory 15000 \
		--mount --mount-string="$$HOME/models-cache:$$HOME/models-cache"

# Remove the ai-starter-kit release and the resources it leaves behind: the
# jupyter-user pod and the JupyterHub hub database PVC.
#
# --ignore-not-found keeps the cleanup going when a leftover is already gone
# (for example, nobody logged in so the jupyter-user pod was never created);
# without it the recipe would stop there and leave the PVC behind.
uninstall:
	helm uninstall ai-starter-kit
	kubectl delete pod jupyter-user --ignore-not-found
	kubectl delete pvc ai-starter-kit-jupyterhub-hub-db-dir --ignore-not-found

# Delete the local minikube cluster.
destroy:
	minikube delete

# Smoke-test the JupyterHub deployment:
#   1. wait for every pod except the continuous-image-puller to become Ready,
#      failing the target (after printing the pod list) if the wait times out;
#   2. port-forward the public proxy service to 127.0.0.1:8081 in the
#      background and run ci/test_hub.py against it;
#   3. always stop the port-forward, then exit with the test script's status.
#
# The port-forward section must run in a single shell (hence the `\`
# continuations) so that $$! / $$PID refer to the background process. The test
# status is captured explicitly because otherwise the recipe's result would be
# that of the final `kill`, hiding test failures.
validate_jupyterhub:
	kubectl get pods
	kubectl wait --for=condition=Ready pods -l 'component!=continuous-image-puller' --timeout=1800s || { kubectl get pods; exit 1; }
	kubectl get pods
	kubectl get services
	kubectl port-forward service/ai-starter-kit-jupyterhub-proxy-public 8081:80 & \
	PID=$$!; \
	echo "Port-forward PID=$${PID}"; \
	sleep 5s; \
	python3 ./ci/test_hub.py "127.0.0.1:8081"; \
	STATUS=$$?; \
	kill $$PID; \
	exit $$STATUS

# Smoke-test the Ray cluster:
#   1. wait for the pods created by the kuberay operator to become Ready,
#      failing the target (after printing the pod list) if the wait times out;
#   2. port-forward the Ray head service to 127.0.0.1:8265 in the background
#      and submit a job that prints the cluster resources;
#   3. always stop the port-forward, then exit with the job submission status.
#
# The port-forward section must run in a single shell (hence the `\`
# continuations) so that $$! / $$PID refer to the background process. The job
# status is captured explicitly because otherwise the recipe's result would be
# that of the final `kill`, hiding a failed submission.
validate_ray:
	kubectl wait --for=condition=Ready pods -l 'app.kubernetes.io/created-by=kuberay-operator' --timeout=1800s || { kubectl get pods; exit 1; }
	kubectl get pods
	kubectl get services
	kubectl port-forward service/ai-starter-kit-kuberay-head-svc 8265:8265 & \
	PID=$$!; \
	sleep 10s; \
	ray job submit --address=http://127.0.0.1:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"; \
	STATUS=$$?; \
	kill $$PID; \
	exit $$STATUS
9 changes: 9 additions & 0 deletions ai/ai-starter-kit/ci/terraform/default_env.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Input values for the CI Terraform configuration.
# project_id and default_resource_name are left empty in this file.
project_id = ""
default_resource_name = ""

# Cluster settings.
cluster_name = "" # Leave empty to use the default name (default_resource_name)
cluster_location = "us-central1"
private_cluster = false
autopilot_cluster = true

# Service account used for workload identity.
service_account_name = "" # Leave empty to use the default name
108 changes: 108 additions & 0 deletions ai/ai-starter-kit/ci/terraform/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Provider requirements. Only the third-party kubectl provider is declared
# explicitly here.
terraform {
  required_providers {
    kubectl = {
      version = ">= 1.19.0"
      source  = "gavinbunney/kubectl"
    }
  }
}
# Project metadata; its `number` is used to build the Connect Gateway URL.
data "google_project" "project" {
  project_id = var.project_id
}

# Current Google client configuration; supplies the access token used by the
# Kubernetes-facing providers.
data "google_client_config" "default" {}


locals {
  # An empty cluster_name means "fall back to default_resource_name".
  cluster_name = var.cluster_name == "" ? var.default_resource_name : var.cluster_name
}

# GKE cluster for CI, created through the shared ai-on-gke infrastructure
# module with a single L4 GPU node pool.
# NOTE(review): the module source follows the `main` branch; consider pinning a
# tag or commit so CI runs are reproducible.
module "gke_cluster" {
  source = "github.com/ai-on-gke/common-infra/common/infrastructure?ref=main"

  # Cluster identity and mode
  project_id        = var.project_id
  cluster_name      = local.cluster_name
  cluster_location  = var.cluster_location
  autopilot_cluster = var.autopilot_cluster
  private_cluster   = var.private_cluster

  # Networking: no new network is created; the "default" network/subnetwork
  # names are passed instead.
  create_network  = false
  network_name    = "default"
  subnetwork_name = "default"

  ray_addon_enabled = false

  # GPU node pool
  enable_gpu = true
  gpu_pools = [
    {
      name           = "gpu-pool-l4"
      machine_type   = "g2-standard-24"
      node_locations = "us-central1-a" # comment out to autofill node_locations based on cluster_location

      accelerator_type   = "nvidia-l4"
      accelerator_count  = 2
      gpu_driver_version = "DEFAULT"

      autoscaling = true
      min_count   = 1
      max_count   = 3

      disk_type    = "pd-balanced"
      disk_size_gb = 100

      enable_gcfs     = true
      logging_variant = "DEFAULT"
    }
  ]
}

locals {
  # Membership ID used in the Connect Gateway URL; an empty variable falls
  # back to the cluster name.
  cluster_membership_id = var.cluster_membership_id != "" ? var.cluster_membership_id : local.cluster_name

  # API server address: the Connect Gateway URL for private clusters, the
  # cluster's own endpoint otherwise.
  host = var.private_cluster ? "https://connectgateway.googleapis.com/v1/projects/${data.google_project.project.number}/locations/${var.cluster_location}/gkeMemberships/${local.cluster_membership_id}" : "https://${module.gke_cluster.endpoint}"
}

# Kubernetes provider bound to the cluster created above.
# For private clusters no CA certificate is supplied and an exec block running
# gke-gcloud-auth-plugin is added; otherwise the cluster's CA certificate is
# decoded and used.
provider "kubernetes" {
  alias = "ai_starter_kit"

  host                   = local.host
  cluster_ca_certificate = var.private_cluster ? "" : base64decode(module.gke_cluster.ca_certificate)
  token                  = data.google_client_config.default.access_token

  dynamic "exec" {
    for_each = var.private_cluster ? [1] : []
    content {
      command     = "gke-gcloud-auth-plugin"
      api_version = "client.authentication.k8s.io/v1beta1"
    }
  }
}

locals {
  # An empty service_account_name means "fall back to default_resource_name".
  service_account_name = var.service_account_name == "" ? var.default_resource_name : var.service_account_name
}


# Workload identity for the starter kit in the "default" namespace, granted
# roles/storage.objectUser on the project. Applied through the aliased
# kubernetes provider, and only after the cluster module has finished.
module "ai_starter_kit_workload_identity" {
  source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity"

  providers = {
    kubernetes = kubernetes.ai_starter_kit
  }

  project_id = var.project_id
  name       = local.service_account_name
  namespace  = "default"
  roles      = ["roles/storage.objectUser"]

  depends_on = [module.gke_cluster]
}

# kubectl provider bound to the same cluster, with the same host, token, CA
# certificate and private-cluster exec handling as the kubernetes provider.
provider "kubectl" {
  alias = "ai_starter_kit"

  host                   = local.host
  cluster_ca_certificate = var.private_cluster ? "" : base64decode(module.gke_cluster.ca_certificate)
  token                  = data.google_client_config.default.access_token

  load_config_file  = true
  apply_retry_count = 15

  dynamic "exec" {
    for_each = var.private_cluster ? [1] : []
    content {
      command     = "gke-gcloud-auth-plugin"
      api_version = "client.authentication.k8s.io/v1beta1"
    }
  }
}
15 changes: 15 additions & 0 deletions ai/ai-starter-kit/ci/terraform/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@

# Resolved cluster name (explicit cluster_name or the default resource name).
output "gke_cluster_name" {
  description = "GKE cluster name"
  value       = local.cluster_name
}

# Location the cluster was created in, echoed from the input variable.
output "gke_cluster_location" {
  description = "GKE cluster location"
  value       = var.cluster_location
}

# Project the cluster lives in, echoed from the input variable.
# The description previously read "GKE cluster location", a copy-paste slip
# from the output above.
output "project_id" {
  description = "GCP project ID"
  value       = var.project_id
}
26 changes: 26 additions & 0 deletions ai/ai-starter-kit/ci/terraform/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Input variables for the CI Terraform configuration.
# cluster_name and service_account_name default to "" because the
# configuration treats an empty string as "use default_resource_name"; callers
# that already pass a value are unaffected.

variable "project_id" {
  type        = string
  description = "GCP project ID in which the cluster is created."
}

variable "default_resource_name" {
  type        = string
  description = "Fallback name used for the cluster and the service account when their own names are left empty."
}

variable "cluster_name" {
  type        = string
  description = "GKE cluster name. Leave empty to use default_resource_name."
  default     = ""
}

variable "cluster_location" {
  type        = string
  description = "GKE cluster location."
}

variable "autopilot_cluster" {
  type        = bool
  description = "Passed to the cluster module as autopilot_cluster."
}

variable "private_cluster" {
  type        = bool
  description = "Passed to the cluster module as private_cluster. When true, the providers reach the cluster through Connect Gateway."
}

variable "cluster_membership_id" {
  type        = string
  description = "Membership ID required to use Connect Gateway for private clusters. Leave empty to use the cluster name."
  default     = ""
}

variable "service_account_name" {
  type        = string
  description = "Service account name for workload identity. Leave empty to use default_resource_name."
  default     = ""
}
59 changes: 59 additions & 0 deletions ai/ai-starter-kit/ci/test_hub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import sys
import requests
from packaging.version import Version as V


def test_hub_up(hub_url, timeout=30):
    """
    Check that the hub answers an HTTP GET on its base URL.

    Args:
        hub_url: Base URL of the hub, e.g. "http://127.0.0.1:8081".
        timeout: Seconds to wait for the server before giving up, so a dead
            port-forward fails the check instead of hanging it forever.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx.
        requests.RequestException: On connection errors or timeout.
    """
    r = requests.get(hub_url, timeout=timeout)
    r.raise_for_status()
    print("JupyterHub up.")


def test_api_root(hub_url, timeout=30):
    """
    Test the hub REST API root endpoint (/hub/api), which returns the hub's
    version, and check that the version is within the supported range.

    Args:
        hub_url: Base URL of the hub, e.g. "http://127.0.0.1:8081".
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx.
        AssertionError: If the reported version is outside [4, 5.5].
    """
    r = requests.get(hub_url + "/hub/api", timeout=timeout)
    r.raise_for_status()
    info = r.json()
    version = info["version"]
    assert V("4") <= V(version) <= V("5.5"), f"version {version} must be between 4 and 5.5"
    print("JupyterHub Rest API is working.")


def test_hub_login(hub_url, timeout=30):
    """
    Test logging in to the hub with the dummy authenticator credentials.

    Fetches /hub/login first so the session receives the `_xsrf` cookie (when
    the hub sets one), then posts the credentials. After a successful login
    the user is expected to be redirected to /hub/spawn-pending/<username>.

    The credentials are hard-coded here; presumably they mirror the chart's
    JupyterHub configuration. Verify against the chart if login fails.

    Args:
        hub_url: Base URL of the hub, e.g. "http://127.0.0.1:8081".
        timeout: Seconds to wait for each request before giving up.

    Raises:
        requests.HTTPError: If either request returns a 4xx/5xx status.
        AssertionError: If the final URL is not the spawn-pending page.
    """
    username, password = "user", "sneakypass"
    session = requests.Session()

    response = session.get(hub_url + "/hub/login", timeout=timeout)
    response.raise_for_status()

    # Echo the XSRF cookie back as a query parameter when the hub issued one.
    auth_params = {}
    if "_xsrf" in session.cookies:
        auth_params = {"_xsrf": session.cookies["_xsrf"]}

    response = session.post(
        hub_url + "/hub/login",
        params=auth_params,
        data={"username": username, "password": password},
        allow_redirects=True,
        timeout=timeout,
    )
    response.raise_for_status()

    # Derive the expected URL from `username` so the two cannot drift apart.
    expected_url = f"{hub_url}/hub/spawn-pending/{username}"
    assert expected_url in response.url, (
        f"unexpected response url: got {response.url}, expected {expected_url}"
    )
    print("JupyterHub login success.")


# Script entry point: run every check against the hub given on the command
# line as HOST:PORT. Guarded so that importing this module (e.g. by a test
# collector) does not read sys.argv or open network connections.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Give a readable usage error instead of a bare IndexError.
        sys.exit(f"usage: {sys.argv[0]} HOST:PORT")

    hub_url = "http://" + sys.argv[1]

    test_hub_up(hub_url)
    test_api_root(hub_url)
    test_hub_login(hub_url)
Loading