diff --git a/applications/osmo/.gitignore b/applications/osmo/.gitignore new file mode 100755 index 000000000..47334b4a7 --- /dev/null +++ b/applications/osmo/.gitignore @@ -0,0 +1,50 @@ +# Terraform +*.tfstate +*.tfstate.* +*.tfvars +!*.tfvars.example +!*.tfvars.*.example +.terraform/ +.terraform.lock.hcl +*.out +crash.log +crash.*.log +override.tf +override.tf.json +*_override.tf +*_override.tf.json + +# SSH keys +*.pem +id_rsa* +*.key + +# Secrets +*.secret +.env +.env.* +!.env.example + +# IDE +.idea/ +.vscode/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log + +# Kubernetes +kubeconfig +kubeconfig.* +!kubeconfig.example + +# WireGuard +*.conf +!*.conf.example +wg-client-*.conf diff --git a/applications/osmo/LICENSE b/applications/osmo/LICENSE new file mode 100755 index 000000000..2b8f06340 --- /dev/null +++ b/applications/osmo/LICENSE @@ -0,0 +1,176 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, +and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by +the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all +other entities that control, are controlled by, or are under common +control with that entity. For the purposes of this definition, +"control" means (i) the power, direct or indirect, to cause the +direction or management of such entity, whether by contract or +otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity +exercising permissions granted by this License. 
+ +"Source" form shall mean the preferred form for making modifications, +including but not limited to software source code, documentation +source, and configuration files. + +"Object" form shall mean any form resulting from mechanical +transformation or translation of a Source form, including but +not limited to compiled object code, generated documentation, +and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or +Object form, made available under the License, as indicated by a +copyright notice that is included in or attached to the work +(an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object +form, that is based on (or derived from) the Work and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. For the purposes +of this License, Derivative Works shall not include works that remain +separable from, or merely link (or bind by name) to the interfaces of, +the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including +the original version of the Work and any modifications or additions +to that Work or Derivative Works thereof, that is intentionally +submitted to the Licensor for inclusion in the Work by the copyright owner +or by an individual or Legal Entity authorized to submit on behalf of +the copyright owner. 
For the purposes of this definition, "submitted" +means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, +and issue tracking systems that are managed by, or on behalf of, the +Licensor for the purpose of discussing and improving the Work, but +excluding communication that is conspicuously marked or otherwise +designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity +on behalf of whom a Contribution has been received by Licensor and +subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the +Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +(except as stated in this section) patent license to make, have made, +use, offer to sell, sell, import, and otherwise transfer the Work, +where such license applies only to those patent claims licensable +by such Contributor that are necessarily infringed by their +Contribution(s) alone or by combination of their Contribution(s) +with the Work to which such Contribution(s) was submitted. 
If You +institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work +or a Contribution incorporated within the Work constitutes direct +or contributory patent infringement, then any patent licenses +granted to You under this License for that Work shall terminate +as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the +Work or Derivative Works thereof in any medium, with or without +modifications, and in Source or Object form, provided that You +meet the following conditions: + +(a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + +(b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + +(c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + +(d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + +You may add Your own copyright statement to Your modifications and +may provide additional or different license terms and conditions +for use, reproduction, or distribution of Your modifications, or +for any such Derivative Works as a whole, provided Your use, +reproduction, and distribution of the Work otherwise complies with +the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, +any Contribution intentionally submitted for inclusion in the Work +by You to the Licensor shall be under the terms and conditions of +this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify +the terms of any separate license agreement you may have executed +with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade +names, trademarks, service marks, or product names of the Licensor, +except as required for reasonable and customary use in describing the +origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or +agreed to in writing, Licensor provides the Work (and each +Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied, including, without limitation, any warranties or conditions +of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. You are solely responsible for determining the +appropriateness of using or redistributing the Work and assume any +risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, +whether in tort (including negligence), contract, or otherwise, +unless required by applicable law (such as deliberate and grossly +negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, +incidental, or consequential damages of any character arising as a +result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, +work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses), even if such Contributor +has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing +the Work or Derivative Works thereof, You may choose to offer, +and charge a fee for, acceptance of support, warranty, indemnity, +or other liability obligations and/or rights consistent with this +License. However, in accepting such obligations, You may act only +on Your own behalf and on Your sole responsibility, not on behalf +of any other Contributor, and only if You agree to indemnify, +defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason +of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS diff --git a/applications/osmo/README.md b/applications/osmo/README.md new file mode 100755 index 000000000..01f1b11fb --- /dev/null +++ b/applications/osmo/README.md @@ -0,0 +1,481 @@ +# Physical AI Workflow Orchestration on Nebius Cloud + +Deploy [NVIDIA OSMO](https://nvidia.github.io/OSMO/main/user_guide/index.html) on [Nebius AI Cloud](https://nebius.com/ai-cloud) in minutes. Run simulation, training, and edge workflows on the wide variety of Nebius GPU instances—write once in YAML, run anywhere. 
+ +## Supported Regions + +| Region | Available GPU Platforms | +|--------|----------------------| +| `eu-north1` | gpu-h100-sxm, gpu-h200-sxm, gpu-l40s-a, gpu-l40s-d | +| `eu-north2` | gpu-h200-sxm | +| `eu-west1` | gpu-h200-sxm | +| `me-west1` | gpu-b200-sxm-a (NVIDIA B200) | +| `uk-south1` | gpu-b300-sxm (NVIDIA B300) | +| `us-central1` | gpu-h200-sxm, gpu-b200-sxm (NVIDIA B200) | + +## Known Gaps and TODOs + +| Gap | Current Workaround | Status | +|-----|-------------------|--------| +| No managed Redis service | Deploy Redis in-cluster via Helm | Workaround in place | +| MysteryBox lacks K8s CSI integration | Scripts retrieve secrets and create K8s secrets manually | Workaround in place | +| No External DNS service | Manual DNS configuration required | Not addressed | +| No managed SSL/TLS service | Manual certificate management | Not addressed | +| No public Load Balancer (ALB/NLB) | Use port-forwarding or WireGuard VPN for access | Workaround in place | +| IDP integration for Nebius | Using OSMO dev auth mode; Keycloak available but not integrated | TBD | +| Nebius Observability Stack integration | Using self-deployed Prometheus/Grafana/Loki | TODO | +| Single cluster for Control Plane + Backend | Using 1 MK8s cluster for both; production separation TBD | Discuss with Nebius | + +## What You Get + +Production-ready infrastructure-as-code (Terraform) and setup scripts for: +- **Managed Kubernetes (MK8s)** cluster with GPU and CPU node groups +- **GPU Infrastructure** including GPU Operator, Network Operator, and KAI Scheduler +- **Observability Stack** with Prometheus, Grafana, and Loki +- **OSMO Control Plane and Backend** for workflow orchestration +- **Supporting Services** including PostgreSQL, Object Storage, Filestore, and Container Registry +- **Secure Access** via WireGuard VPN (optional) + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Nebius AI Cloud │ 
+├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Nebius VPC Network │ │ +│ │ │ │ +│ │ ┌─────────────┐ ┌────────────────────────────────────────────────┐ │ │ +│ │ │ WireGuard │ │ Nebius Managed Kubernetes (MK8s) │ │ │ +│ │ │ VPN │ │ │ │ │ +│ │ │ (Optional) │ │ ┌────────────────────────────────────────┐ │ │ │ +│ │ └──────┬──────┘ │ │ OSMO Namespace │ │ │ │ +│ │ │ │ │ ┌──────────┐ ┌────────┐ ┌──────────┐ │ │ │ │ +│ │ │ │ │ │ osmo- │ │ osmo- │ │ osmo- │ │ │ │ │ +│ │ │ │ │ │ service │ │ logger │ │ agent │ │ │ │ │ +│ │ │ │ │ └────┬─────┘ └───┬────┘ └────┬─────┘ │ │ │ │ +│ │ │ │ │ └────────────┼──────────┘ │ │ │ │ +│ │ │ │ │ ┌─────┴─────┐ │ │ │ │ +│ │ │ │ │ │osmo-proxy │ │ │ │ │ +│ │ │ │ │ │ (nginx) │ │ │ │ │ +│ │ │ │ │ └─────┬─────┘ │ │ │ │ +│ │ │ │ │ │ │ │ │ │ +│ │ │ │ │ ┌─────────┐ ┌────┴────┐ ┌─────────┐ │ │ │ │ +│ │ │ │ │ │ osmo-ui │ │osmo-ctrl│ │osmo- │ │ │ │ │ +│ │ │ │ │ │ (Web UI)│ │(sidecar)│ │backend │ │ │ │ │ +│ │ │ │ │ └─────────┘ └─────────┘ └─────────┘ │ │ │ │ +│ │ │ │ └────────────────────────────────────────┘ │ │ │ +│ │ │ │ │ │ │ +│ │ └─────────┼───► ┌──────────────┐ ┌───────────────────┐ │ │ │ +│ │ │ │ CPU Nodes │ │ GPU Nodes │ │ │ │ +│ │ │ │ (cpu-d3) │ │ (L40S/H100/H200/ │ │ │ │ +│ │ │ │ │ │ B200/B300) │ │ │ │ +│ │ │ │ System pods │ │ Workflow pods │ │ │ │ +│ │ │ └──────────────┘ └───────────────────┘ │ │ │ +│ │ │ │ │ │ +│ │ │ ┌────────────────────────────────────────┐ │ │ │ +│ │ │ │ Infrastructure Stack │ │ │ │ +│ │ │ │ GPU Operator, Network Operator, Cilium│ │ │ │ +│ │ │ │ Prometheus, Grafana, Loki │ │ │ │ +│ │ │ └────────────────────────────────────────┘ │ │ │ +│ │ └────────────────────────────────────────────────┘ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ +│ │ Managed │ │ Object │ │ Shared │ │ Container 
│ │ +│ │ PostgreSQL │ │ Storage │ │ Filesystems │ │ Registry │ │ +│ │ (OSMO DB) │ │ (Workflow │ │ (Datasets) │ │ (Images) │ │ +│ │ │ │ logs/data) │ │ │ │ │ │ +│ └───────────────┘ └───────────────┘ └───────────────┘ └───────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +**Nebius Services Used:** + +| Service | Purpose | +|---------|---------| +| MK8s | Managed Kubernetes with CPU and GPU node groups | +| Managed PostgreSQL | Database for OSMO state and metadata | +| Object Storage | S3-compatible storage for workflow logs and artifacts | +| Shared Filesystems | NFS storage for datasets across nodes | +| Compute | VMs for WireGuard VPN (optional) | +| VPC | Private networking with subnet isolation | +| IAM | Service accounts and access keys | +| MysteryBox | Secrets management for credentials | +| Container Registry | Docker image storage for custom workflow images | + +## Prerequisites + +Before deploying, ensure you have access to Nebius AI Cloud and the required command-line tools installed. The deployment uses Terraform for infrastructure provisioning and Helm/kubectl for Kubernetes configuration. + +- [**Nebius Account**](https://console.eu.nebius.com/) with appropriate permissions (see [Required Permissions](#required-permissions)) +- [**Nebius CLI**](https://docs.nebius.com/cli/install) installed and authenticated +- [**Terraform**](https://developer.hashicorp.com/terraform/install) >= 1.5.0 for infrastructure-as-code +- [**kubectl**](https://kubernetes.io/docs/tasks/tools/) >= 1.28 for Kubernetes cluster management (should match cluster version ±1 minor) +- [**Helm**](https://helm.sh/docs/intro/install/) >= 3.0 for deploying OSMO charts +- **SSH key pair** for node access (generate with `ssh-keygen` if needed) + +## Quick Start + +> **Important:** Complete all steps in the **same terminal session**. The setup scripts export environment variables that must persist across steps. 
+ +Please run this from a Linux Shell/Ubuntu/WSL. + +### 1. Install Required Tools + +```bash +cd deploy/000-prerequisites +./install-tools.sh # Installs: Terraform, kubectl, Helm, Nebius CLI, OSMO CLI +./install-tools.sh --check # Verify without installing +``` + +Supports Linux, WSL, and macOS. Requires Python/pip for OSMO CLI installation. See [prerequisites README](deploy/000-prerequisites/README.md) for manual installation. + +### 2. Configure Nebius Environment + +> **Note:** If not authenticated, run `nebius profile create` first and follow the authentication flow. + +```bash +source ./nebius-env-init.sh +``` + +This interactive script: +1. **Checks Nebius CLI** - Verifies installation and adds to PATH if needed +2. **Checks authentication** - If not authenticated, provides instructions to run `nebius profile create` +3. **Lists tenants** - Auto-detects if you have only one tenant +4. **Configures project** - Select existing project, create new one, or list available projects +5. **Sets region** - Choose from `eu-north1`, `eu-north2`, `eu-west1`, `me-west1`, `uk-south1`, `us-central1` +6. **Exports environment variables** - Sets `NEBIUS_*` and `TF_VAR_*` variables for Terraform + +### 3. Initialize Secrets (REQUIRED) + +```bash +source ./secrets-init.sh +``` + +> **Important:** This step is **REQUIRED** before running Terraform. If you skip it, `terraform apply` will fail with a clear error message. + +This generates secure credentials and stores them in [Nebius MysteryBox](https://docs.nebius.com/mysterybox): +- **PostgreSQL password** - Used by Managed PostgreSQL and OSMO +- **MEK (Master Encryption Key)** - Used by OSMO for data encryption + +The script exports `TF_VAR_*` environment variables that Terraform and setup scripts use to retrieve these secrets securely, keeping them out of Terraform state. + +### 4. 
Deploy Infrastructure + +Provision all Nebius cloud resources using Terraform: VPC network, Managed Kubernetes cluster, GPU/CPU node groups, PostgreSQL database, Object Storage, and optionally WireGuard VPN. + +```bash +cd ../001-iac + +# Recommended: Cost-optimized for development (see Appendix A) +cp terraform.tfvars.cost-optimized.example terraform.tfvars + +# Edit terraform.tfvars if needed +terraform init +terraform plan -out plan.out +terraform apply plan.out +``` + +> **Note:** If you get an error about missing `postgresql_mysterybox_secret_id`, go back to step 3 and run `source ./secrets-init.sh`. + +See [Terraform README](deploy/001-iac/README.md) for configuration options, and [Appendix A](#appendix-a-terraform-configuration-presets) for preset comparisons. + +### 5. Configure Kubernetes + +1. Get Kubernetes credentials: + ```bash + nebius mk8s cluster get-credentials --id <cluster-id> --external + ``` + +2. Verify cluster access: + ```bash + kubectl get nodes + ``` + +3. Deploy GPU infrastructure and observability: + ```bash + cd ../002-setup + ./01-deploy-gpu-infrastructure.sh + ./02-deploy-observability.sh + ``` + + This installs: + - NVIDIA GPU Operator and Network Operator + - KAI Scheduler for GPU workload scheduling + - Prometheus, Grafana, and Loki for monitoring + +4. Deploy NGINX Ingress Controller: + ```bash + ./03-deploy-nginx-ingress.sh + ``` + + This deploys the community NGINX Ingress Controller with a LoadBalancer IP. It provides path-based routing to all OSMO services (API, router, Web UI). The LoadBalancer IP is auto-detected by later scripts. + +5. 
Deploy OSMO control plane: + ```bash + ./04-deploy-osmo-control-plane.sh + ``` + + This deploys the core OSMO services: + - Creates `osmo` namespace and PostgreSQL/MEK secrets + - Initializes databases on Nebius Managed PostgreSQL + - Deploys Redis and OSMO services (API, agent, worker, logger) + - Creates Kubernetes Ingress resources for path-based routing via the NGINX Ingress Controller + + > **Note:** The script automatically retrieves PostgreSQL password and MEK from MysteryBox if you ran `secrets-init.sh` earlier. + +7. Deploy OSMO backend operator: + ```bash + ./05-deploy-osmo-backend.sh + ``` + + The script automatically: + - Starts a port-forward to OSMO service + - Logs in using dev method (since Keycloak auth is disabled) + - Creates a service token for the backend operator + - Deploys the backend operator + - Cleans up the port-forward + + This deploys the backend operator that manages GPU workloads: + - Connects to OSMO control plane via `osmo-agent` + - Configures resource pools for GPU nodes + - Enables workflow execution on the Kubernetes cluster + + > **Manual alternative:** If you prefer to create the token manually, set `OSMO_SERVICE_TOKEN` environment variable before running the script. + +8. Verify backend deployment: + + Verify the backend is registered with OSMO using the NGINX Ingress LoadBalancer IP: + ```bash + # Check backend registration + curl http://<INGRESS_IP>/api/configs/backend + + # Or via OSMO CLI + osmo config show BACKEND default + ``` + + The Ingress LoadBalancer IP is shown in the output of `04-deploy-osmo-control-plane.sh`. + You should see the backend configuration with status `ONLINE`. + +9. 
Configure OSMO storage: + ```bash + ./06-configure-storage.sh + ``` + + The script automatically: + - Retrieves storage bucket details from Terraform + - Starts port-forward and logs in to OSMO + - Configures OSMO to use Nebius Object Storage for workflow artifacts + - Verifies the configuration + + > **Note:** The `osmo-storage` secret (with S3 credentials) was created automatically by `04-deploy-osmo-control-plane.sh`. + +10. Access OSMO (via NGINX Ingress LoadBalancer): + + The NGINX Ingress Controller exposes OSMO via a LoadBalancer IP. The IP is shown in the output of `04-deploy-osmo-control-plane.sh`, or retrieve it with: + ```bash + kubectl get svc -n ingress-nginx ingress-nginx-controller -o jsonpath='{.status.loadBalancer.ingress[0].ip}' + ``` + + Access points (replace `<INGRESS_IP>` with your LoadBalancer IP): + - **OSMO API**: `http://<INGRESS_IP>/api/version` + - **OSMO Web UI**: `http://<INGRESS_IP>` + + Login to OSMO CLI: + ```bash + osmo login http://<INGRESS_IP> --method dev --username admin + ``` + + > **Fallback:** If the LoadBalancer IP is not reachable, you can use port-forwarding: + > ```bash + > kubectl port-forward -n osmo svc/osmo-service 8080:80 + > osmo login http://localhost:8080 --method dev --username admin + > ``` + + > **Note:** The `service_base_url` (required for workflow execution) is automatically configured + > by `04-deploy-osmo-control-plane.sh` using the NGINX Ingress LoadBalancer IP. If you need to + > reconfigure it manually, run `./07-configure-service-url.sh`. + +11. Configure pool for GPU workloads: + + The default pool needs GPU platform configuration to run GPU workflows. 
This creates a pod template with the correct node selector and tolerations for GPU nodes: + + ```bash + ./08-configure-gpu-platform.sh + ``` + + The script: + - Creates a `gpu_tolerations` pod template with `nvidia.com/gpu.present: true` node selector + - Updates the GPU platform to reference this pod template + - Verifies GPU nodes are visible in OSMO + + You can verify the configuration: + ```bash + osmo config show POOL default + osmo config show POD_TEMPLATE gpu_tolerations + ``` + +12. Run a test workflow (optional): + + Verify the complete setup by running a test workflow from the `workflows/osmo/` directory: + + ```bash + # Set the default pool (required before submitting workflows) + osmo profile set pool default + + # Submit the hello world workflow (requires GPU) + osmo workflow submit ../../workflows/osmo/hello_nebius.yaml + + # Or specify the pool explicitly + osmo workflow submit ../../workflows/osmo/hello_nebius.yaml --pool default + + # Check workflow status + osmo workflow list + osmo workflow query <workflow-id> + + # View workflow logs + osmo workflow query <workflow-id> --logs + ``` + + Available test workflows in `workflows/osmo/`: + - `hello_nebius.yaml` - Simple GPU hello world + - `gpu_test.yaml` - GPU validation test + + + +## Configuration Tiers + +| Tier | GPU Type | GPU Nodes | Security | Est. Cost/6h | +|------|----------|-----------|----------|--------------| +| **Cost-Optimized Secure** (recommended) | 1x L40S | 1 | WireGuard VPN | **~$15-25** | +| **Cost-Optimized** | 1x L40S | 1 | Public endpoints | ~$10-15 | +| **Standard** | 1x H100 | 1 | Public endpoints | ~$30-40 | +| **Production** | 8x H200 | 4+ | WireGuard VPN | ~$1000+ | + +**Recommended:** Use `terraform.tfvars.cost-optimized-secure.example` for development. + +See `deploy/001-iac/terraform.tfvars.*.example` files for all configuration options. 
+ +## GPU Options + +| Platform | Preset | GPUs | vCPUs | RAM | InfiniBand | Regions | +|----------|--------|------|-------|-----|------------|---------| +| `gpu-l40s-a` | `1gpu-8vcpu-32gb` | 1 | 8 | 32GB | No | eu-north1 | +| `gpu-l40s-d` | `1gpu-8vcpu-32gb` | 1 | 8 | 32GB | No | eu-north1 | +| `gpu-h100-sxm` | `1gpu-16vcpu-200gb` | 1 | 16 | 200GB | No | eu-north1 | +| `gpu-h100-sxm` | `8gpu-128vcpu-1600gb` | 8 | 128 | 1600GB | Yes | eu-north1 | +| `gpu-h200-sxm` | `1gpu-16vcpu-200gb` | 1 | 16 | 200GB | No | eu-north1, eu-north2, eu-west1, us-central1 | +| `gpu-h200-sxm` | `8gpu-128vcpu-1600gb` | 8 | 128 | 1600GB | Yes | eu-north1, eu-north2, eu-west1, us-central1 | +| `gpu-b200-sxm` | `1gpu-20vcpu-224gb` | 1 | 20 | 224GB | No | us-central1 | +| `gpu-b200-sxm` | `8gpu-160vcpu-1792gb` | 8 | 160 | 1792GB | Yes | us-central1 | +| `gpu-b200-sxm-a` | `1gpu-20vcpu-224gb` | 1 | 20 | 224GB | No | me-west1 | +| `gpu-b200-sxm-a` | `8gpu-160vcpu-1792gb` | 8 | 160 | 1792GB | Yes | me-west1 | +| `gpu-b300-sxm` | `1gpu-24vcpu-346gb` | 1 | 24 | 346GB | No | uk-south1 | +| `gpu-b300-sxm` | `8gpu-192vcpu-2768gb` | 8 | 192 | 2768GB | Yes | uk-south1 | + +**Recommendation:** Use `gpu-l40s-a` for development/testing in eu-north1 (cheapest option). + +## Required Permissions + +This deployment uses the [Nebius Terraform Provider](https://docs.nebius.com/terraform-provider/quickstart) to provision cloud resources. Your Nebius account needs the following IAM roles to create and manage the required infrastructure. 
+ +### Minimum Required Roles +| Role | Purpose | +|------|---------| +| `compute.admin` | VMs, disks, shared filesystems, GPU clusters | +| `vpc.admin` | VPC networks and subnets | +| `mk8s.admin` | Managed Kubernetes clusters and node groups | +| `storage.admin` | Object Storage buckets | +| `mdb.admin` | Managed PostgreSQL clusters | +| `iam.serviceAccounts.admin` | Service accounts and access keys | +| `container-registry.admin` | Container registries | + +### For WireGuard VPN (Optional) +| Role | Purpose | +|------|---------| +| `vpc.publicIpAllocations.admin` | Allocate public IPs for VPN endpoint | + +For more information, see [Nebius IAM Roles](https://docs.nebius.com/iam/authorization/roles) and the [Terraform Provider Quickstart](https://docs.nebius.com/terraform-provider/quickstart). + +## Security Options + +### Option A: WireGuard VPN (Recommended for Production) + +Enable private-only access with WireGuard VPN: + +```hcl +# In terraform.tfvars +enable_wireguard = true +enable_public_endpoint = false +``` + +After deployment: +```bash +cd deploy/000-prerequisites +./wireguard-client-setup.sh +``` + +### Option B: Public Endpoints + +For development/testing with public access: + +```hcl +# In terraform.tfvars +enable_wireguard = false +enable_public_endpoint = true +``` + +## Cost Optimization Tips + +1. **Use preemptible GPU nodes** for non-critical workloads (up to 70% savings) +2. **Start with single-GPU nodes** for development +3. **Disable unused components** (Loki, multi-GPU support) +4. **Scale down when not in use** + +## Documentation + +- [Terraform Infrastructure](deploy/001-iac/README.md) +- [Setup Scripts](deploy/002-setup/README.md) +- [Troubleshooting Guide](docs/troubleshooting.md) +- [Security Guide](docs/SECURITY.md) + +## License + +Apache License 2.0 - See [LICENSE](LICENSE) for details. 
+ +--- + +## Appendix A: Terraform Configuration Presets + +The `deploy/001-iac/` directory includes several pre-configured `terraform.tfvars` examples for different use cases: + +| Preset | GPU | WireGuard | Public API | Use Case | +|--------|-----|-----------|------------|----------| +| `terraform.tfvars.cost-optimized.example` | L40S | No | Yes | **Recommended for development** - Lowest cost, quick testing | +| `terraform.tfvars.cost-optimized-secure.example` | L40S | Yes | No | Development with VPN-only access | +| `terraform.tfvars.secure.example` | H100 | Yes | No | Staging with full security | +| `terraform.tfvars.production.example` | H200 | Yes | No | Production with maximum performance | +| `terraform.tfvars.example` | H100 | No | Yes | Basic template with all options documented | + +> **Note:** All configurations use **private nodes** (no public IPs on K8s nodes). Access is via WireGuard VPN or public K8s API endpoint. + +### Key Differences + +| Preset | GPU Nodes | CPU Nodes | etcd Size | Preemptible | Security | +|--------|-----------|-----------|-----------|-------------|----------| +| **cost-optimized-secure** | 1x L40S | 2x small | 1 | Yes | VPN only | +| **cost-optimized** | 1x L40S | 2x small | 1 | Yes | Public endpoints | +| **secure** | 8x H100 | 3x medium | 3 | No | VPN only | +| **production** | 4x 8-GPU H200 | 3x large | 3 | No | VPN only | + +**Recommendation:** Start with `terraform.tfvars.cost-optimized-secure.example` for development, then scale up as needed. + +## Cleanup + +To tear down the deployment, see [deploy/README.md](deploy/README.md#cleanup) for detailed instructions. The process involves: + +1. Uninstalling Kubernetes components (in reverse order) via scripts in `deploy/002-setup/cleanup/` +2. 
Destroying infrastructure with `terraform destroy` in `deploy/001-iac/` diff --git a/applications/osmo/deploy/example/000-prerequisites/README.md b/applications/osmo/deploy/example/000-prerequisites/README.md new file mode 100755 index 000000000..459b03d8d --- /dev/null +++ b/applications/osmo/deploy/example/000-prerequisites/README.md @@ -0,0 +1,291 @@ +# Prerequisites + +This directory contains scripts to set up your environment for deploying OSMO on Nebius. + +## Scripts + +| Script | Purpose | +|--------|---------| +| `install-tools.sh` | Install required tools (Terraform, kubectl, Helm, Nebius CLI, OSMO CLI) | +| `nebius-env-init.sh` | Configure Nebius environment variables | +| `secrets-init.sh` | **NEW** Generate and store secrets in MysteryBox | +| `wireguard-client-setup.sh` | Set up WireGuard VPN client | + +## Quick Start + +### 1. Install Required Tools + +```bash +# Install all required tools +./install-tools.sh + +# Or check what's already installed +./install-tools.sh --check +``` + +### 2. Configure Nebius Environment + +```bash +# Source the script (don't just run it) +source ./nebius-env-init.sh +``` + +This will: +1. Check Nebius CLI installation +2. Verify authentication status +3. Prompt for tenant ID +4. Let you choose to use an existing project OR create a new one +5. Set environment variables for Terraform + +#### Project Options + +When prompted for a project, you can: +- **Option 1**: Enter an existing project ID directly +- **Option 2**: Create a new project (enter a name) +- **Option 3**: List existing projects first, then choose + +Example creating a new project: +``` +Project Configuration + +Options: + 1) Use existing project (enter project ID) + 2) Create new project + 3) List existing projects first + +Choose option [1/2/3]: 2 +Enter new project name: osmo-dev +Creating new project: osmo-dev +[✓] Project created successfully + Project ID: project-abc123xyz +``` + +### 3. 
Initialize Secrets (Recommended) + +```bash +# Generate secrets and store in MysteryBox +source ./secrets-init.sh +``` + +This creates: +- **PostgreSQL password** - Stored in MysteryBox, NOT in Terraform state +- **MEK (Master Encryption Key)** - For OSMO service authentication + +> **Why?** Storing secrets in MysteryBox keeps them out of Terraform state, providing better security and enabling rotation without re-deploying. + +## Nebius CLI Authentication + +### First-Time Setup + +If you haven't authenticated the Nebius CLI yet: + +```bash +# Create a profile (interactive) +nebius profile create +``` + +The CLI will: +1. Ask for a profile name +2. Open a browser for authentication +3. Ask you to select tenant and project + +### WSL Users + +If the browser doesn't open automatically in WSL: +1. Copy the URL displayed in the terminal +2. Paste it into your Windows browser +3. Complete the authentication +4. Return to the terminal + +### Service Account Authentication + +For CI/CD or automated deployments, use service account authentication: + +1. **Create a service account** in Nebius Console +2. **Create an authorized key** (PEM file) +3. **Configure the CLI**: + ```bash + nebius profile create --auth-type service-account \ + --service-account-id \ + --key-file + ``` + +See [Nebius Service Accounts Documentation](https://docs.nebius.com/iam/service-accounts) for details. 
+ +## Required Permissions + +Your Nebius account needs these permissions: + +### Compute +- `compute.instances.create/delete` - VMs for WireGuard, bastion +- `compute.disks.create/delete` - Boot and data disks +- `compute.filesystems.create/delete` - Shared filesystems + +### Kubernetes +- `mk8s.clusters.create/delete` - Kubernetes clusters +- `mk8s.nodeGroups.create/delete` - Node groups + +### Networking +- `vpc.networks.create/delete` - VPC networks +- `vpc.subnets.create/delete` - Subnets +- `vpc.publicIpAllocations.create/delete` - Public IPs (for WireGuard) + +### Storage +- `storage.buckets.create/delete` - Object storage + +### Database +- `mdb.clusters.create/delete` - Managed PostgreSQL + +### IAM +- `iam.serviceAccounts.create/delete` - Service accounts +- `iam.accessKeys.create/delete` - Access keys for S3 + +### Container Registry +- `container-registry.registries.create/delete` - Container registries + +See [Nebius IAM Roles](https://docs.nebius.com/iam/authorization/roles) for predefined roles. + +## Secrets Management + +### Using MysteryBox (Recommended) + +The `secrets-init.sh` script generates secrets and stores them in Nebius MysteryBox: + +```bash +source ./secrets-init.sh +``` + +This will: +1. Check if secrets already exist in MysteryBox (by name) +2. If not, generate a secure PostgreSQL password (32 chars) and MEK +3. Store new secrets in MysteryBox (Nebius secrets manager) +4. Export `TF_VAR_*` environment variables for Terraform + +### New Terminal Session + +If you start a new terminal session, simply run the script again: + +```bash +source ./secrets-init.sh +``` + +The script will detect existing secrets by name and export their IDs without regenerating them. 
+ +### Retrieving Secrets + +To retrieve secrets from MysteryBox: + +```bash +# PostgreSQL password +nebius mysterybox v1 payload get-by-key \ + --secret-id $OSMO_POSTGRESQL_SECRET_ID \ + --key password \ + --format json | jq -r '.data.string_value' + +# MEK +nebius mysterybox v1 payload get-by-key \ + --secret-id $OSMO_MEK_SECRET_ID \ + --key mek \ + --format json | jq -r '.data.string_value' +``` + +### Security Considerations + +When using MysteryBox secrets: +- Secrets are **NOT** stored in Terraform state +- Only secret IDs are stored in Terraform +- Secrets are fetched at runtime using ephemeral resources +- The password output will be `null` (retrieve via CLI instead) + +### Without MysteryBox + +If you don't run `secrets-init.sh`, Terraform will: +1. Generate a random password for PostgreSQL +2. Store the password in Terraform state (less secure) +3. Output the password via `terraform output -json` + +## Environment Variables + +After running `nebius-env-init.sh`, these variables are set: + +| Variable | Description | +|----------|-------------| +| `NEBIUS_TENANT_ID` | Your Nebius tenant ID | +| `NEBIUS_PROJECT_ID` | Your Nebius project ID | +| `NEBIUS_REGION` | Deployment region (default: eu-north1) | +| `TF_VAR_tenant_id` | Terraform variable for tenant | +| `TF_VAR_parent_id` | Terraform variable for project | +| `TF_VAR_region` | Terraform variable for region | + +After running `secrets-init.sh`, these additional variables are set: + +| Variable | Description | +|----------|-------------| +| `OSMO_POSTGRESQL_SECRET_ID` | MysteryBox secret ID for PostgreSQL password | +| `OSMO_MEK_SECRET_ID` | MysteryBox secret ID for MEK | +| `TF_VAR_postgresql_mysterybox_secret_id` | Terraform variable for PostgreSQL secret | +| `TF_VAR_mek_mysterybox_secret_id` | Terraform variable for MEK secret | + +## WireGuard VPN Setup + +If you enabled WireGuard VPN in your deployment: + +```bash +./wireguard-client-setup.sh +``` + +This will: +1. 
Check if WireGuard is installed locally +2. Get server information from Terraform outputs +3. Generate client configuration template +4. Provide instructions for completing setup + +### Windows/WSL + +For WSL users, install WireGuard on Windows: +1. Download from https://www.wireguard.com/install/ +2. Import the generated configuration file +3. Connect through the Windows WireGuard app + +## Troubleshooting + +### "Nebius CLI not installed" + +Run the installer: +```bash +./install-tools.sh +``` + +Or install manually: +```bash +curl -sSL https://storage.eu-north1.nebius.cloud/nebius/install.sh | bash +export PATH="$HOME/.nebius/bin:$PATH" +``` + +### "Nebius CLI not authenticated" + +Authenticate with: +```bash +nebius profile create +``` + +### "Permission denied" + +Ensure scripts are executable: +```bash +chmod +x *.sh +``` + +### "Token error" or corrupted token + +Clear the token and re-authenticate: +```bash +unset NEBIUS_IAM_TOKEN +nebius profile create +``` + +### WSL browser doesn't open + +1. Copy the URL from the terminal output +2. Paste into your Windows browser manually +3. 
Complete authentication and return to terminal diff --git a/applications/osmo/deploy/example/000-prerequisites/install-tools.sh b/applications/osmo/deploy/example/000-prerequisites/install-tools.sh new file mode 100755 index 000000000..4c2c6a829 --- /dev/null +++ b/applications/osmo/deploy/example/000-prerequisites/install-tools.sh @@ -0,0 +1,311 @@ +#!/bin/bash +# +# Install required tools for OSMO on Nebius deployment +# + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +print_status() { + echo -e "${GREEN}[✓]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[!]${NC} $1" +} + +print_error() { + echo -e "${RED}[✗]${NC} $1" +} + +# Version requirements +TERRAFORM_MIN_VERSION="1.5.0" +KUBECTL_MIN_VERSION="1.28.0" +HELM_MIN_VERSION="3.12.0" + +# Detect OS +detect_os() { + if [[ "$OSTYPE" == "linux-gnu"* ]]; then + if grep -q Microsoft /proc/version 2>/dev/null; then + echo "wsl" + else + echo "linux" + fi + elif [[ "$OSTYPE" == "darwin"* ]]; then + echo "macos" + else + echo "unknown" + fi +} + +OS=$(detect_os) + +# Check if command exists (including Nebius in custom locations) +check_command() { + local cmd=$1 + if [[ "$cmd" == "nebius" ]]; then + # Check PATH first, then common installation locations + if command -v nebius &>/dev/null; then + return 0 + elif [[ -x "$HOME/.nebius/bin/nebius" ]]; then + return 0 + fi + return 1 + else + command -v "$cmd" &>/dev/null + fi +} + +# Get Nebius CLI path +get_nebius_path() { + if command -v nebius &>/dev/null; then + command -v nebius + elif [[ -x "$HOME/.nebius/bin/nebius" ]]; then + echo "$HOME/.nebius/bin/nebius" + fi +} + +# Compare versions (returns 0 if version >= required) +version_ge() { + local version=$1 + local required=$2 + printf '%s\n%s' "$required" "$version" | sort -V -C +} + +check_terraform() { + if check_command terraform; then + local version=$(terraform version -json 2>/dev/null | grep -o '"terraform_version": *"[^"]*"' | cut 
-d'"' -f4) + if [[ -z "$version" ]]; then + version=$(terraform version | head -1 | grep -oE '[0-9]+\.[0-9]+\.[0-9]+') + fi + if version_ge "$version" "$TERRAFORM_MIN_VERSION"; then + print_status "Terraform $version installed" + return 0 + else + print_warning "Terraform $version installed, but >= $TERRAFORM_MIN_VERSION recommended" + return 0 + fi + fi + return 1 +} + +check_kubectl() { + if check_command kubectl; then + # Use --client flag with timeout to prevent hanging + # Some kubectl versions try to contact server even with --client + local version=$(timeout 5 kubectl version --client --short 2>/dev/null | grep -oE 'v[0-9]+\.[0-9]+\.[0-9]+' | head -1 | tr -d 'v') + if [[ -z "$version" ]]; then + # Fallback: try without --short flag + version=$(timeout 5 kubectl version --client 2>/dev/null | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -1) + fi + if [[ -n "$version" ]]; then + print_status "kubectl $version installed" + return 0 + else + # kubectl exists but version check failed - still report as installed + print_status "kubectl installed (version check skipped)" + return 0 + fi + fi + return 1 +} + +check_helm() { + if check_command helm; then + local version=$(helm version --short 2>/dev/null | grep -oE '[0-9]+\.[0-9]+\.[0-9]+') + if [[ -n "$version" ]]; then + print_status "Helm $version installed" + return 0 + fi + fi + return 1 +} + +check_nebius() { + if check_command nebius; then + local nebius_path=$(get_nebius_path) + local version=$("$nebius_path" version 2>/dev/null | head -1 || echo "unknown") + print_status "Nebius CLI installed ($version)" + if [[ "$nebius_path" == "$HOME/.nebius/bin/nebius" ]] && ! command -v nebius &>/dev/null; then + print_warning "Nebius CLI not in PATH. 
Run this first:" + echo "" + echo " export PATH=\"\$HOME/.nebius/bin:\$PATH\"" + echo "" + fi + return 0 + fi + return 1 +} + +check_osmo() { + if check_command osmo; then + local version=$(osmo --version 2>/dev/null | head -1 || echo "unknown") + print_status "OSMO CLI installed ($version)" + return 0 + fi + return 1 +} + +install_terraform() { + echo "Installing Terraform..." + case $OS in + linux|wsl) + wget -O- https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg + echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list + sudo apt-get update && sudo apt-get install -y terraform + ;; + macos) + brew tap hashicorp/tap + brew install hashicorp/tap/terraform + ;; + esac +} + +install_kubectl() { + echo "Installing kubectl..." + case $OS in + linux|wsl) + curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + chmod +x kubectl + sudo mv kubectl /usr/local/bin/ + ;; + macos) + brew install kubectl + ;; + esac +} + +install_helm() { + echo "Installing Helm..." + case $OS in + linux|wsl) + curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + ;; + macos) + brew install helm + ;; + esac +} + +install_nebius() { + echo "Installing Nebius CLI..." + # Note: URL updated per https://docs.nebius.com/cli/install + curl -sSL https://storage.eu-north1.nebius.cloud/cli/install.sh | bash + + # Add to PATH for current session + export PATH="$HOME/.nebius/bin:$PATH" + + print_warning "Nebius CLI installed to ~/.nebius/bin/" + print_warning "Add to your shell profile: export PATH=\"\$HOME/.nebius/bin:\$PATH\"" +} + +install_osmo() { + echo "Installing OSMO CLI..." 
+ # Install via official NVIDIA install script + # See: https://nvidia.github.io/OSMO/main/user_guide/getting_started/install/index.html + curl -fsSL https://raw.githubusercontent.com/NVIDIA/OSMO/refs/heads/main/install.sh | bash + + # The install script typically adds osmo to ~/.local/bin or similar + if [[ ":$PATH:" != *":$HOME/.local/bin:"* ]]; then + export PATH="$HOME/.local/bin:$PATH" + fi + print_status "OSMO CLI installed" +} + +# Main logic +main() { + echo "========================================" + echo "OSMO on Nebius - Tool Installer" + echo "========================================" + echo "" + echo "Detected OS: $OS" + echo "" + + local check_only=false + if [[ "$1" == "--check" ]]; then + check_only=true + echo "Checking installed tools..." + echo "" + fi + + local all_installed=true + + # Check/Install Terraform + if ! check_terraform; then + all_installed=false + if $check_only; then + print_error "Terraform not installed" + else + install_terraform + check_terraform || print_error "Failed to install Terraform" + fi + fi + + # Check/Install kubectl + if ! check_kubectl; then + all_installed=false + if $check_only; then + print_error "kubectl not installed" + else + install_kubectl + check_kubectl || print_error "Failed to install kubectl" + fi + fi + + # Check/Install Helm + if ! check_helm; then + all_installed=false + if $check_only; then + print_error "Helm not installed" + else + install_helm + check_helm || print_error "Failed to install Helm" + fi + fi + + # Check/Install Nebius CLI + if ! check_nebius; then + all_installed=false + if $check_only; then + print_error "Nebius CLI not installed" + else + install_nebius + check_nebius || print_error "Failed to install Nebius CLI" + fi + fi + + # Check/Install OSMO CLI (for backend deployment and workflow management) + if ! 
check_osmo; then + all_installed=false + if $check_only; then + print_error "OSMO CLI not installed" + else + install_osmo + check_osmo || print_error "Failed to install OSMO CLI" + fi + fi + + echo "" + if $all_installed; then + echo "========================================" + print_status "All required tools are installed!" + echo "========================================" + echo "" + echo "Next step: Configure your Nebius environment" + echo " source ./nebius-env-init.sh" + else + if $check_only; then + echo "========================================" + print_warning "Some tools are missing. Run without --check to install." + echo "========================================" + exit 1 + fi + fi +} + +main "$@" diff --git a/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh b/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh new file mode 100755 index 000000000..cd217a6d7 --- /dev/null +++ b/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh @@ -0,0 +1,270 @@ +#!/bin/bash +# +# Nebius Environment Initialization Script +# +# This script sets up environment variables needed for Terraform deployment. +# Run with: source ./nebius-env-init.sh +# +# Configure your deployment by setting the values below. +# +# NOTE: Do NOT use 'set -e' as this script is meant to be sourced +# + +# ======================================== +# CONFIGURATION - Set your values here +# ======================================== +NEBIUS_TENANT_ID="${NEBIUS_TENANT_ID:-}" # e.g. tenant-abc123def456 +NEBIUS_PROJECT_ID="${NEBIUS_PROJECT_ID:-}" # e.g. project-abc123def456 +NEBIUS_REGION="${NEBIUS_REGION:-eu-north1}" # eu-north1, eu-north2, eu-west1, me-west1, uk-south1, us-central1 + +OSMO_INGRESS_HOSTNAME="${OSMO_INGRESS_HOSTNAME:-}" # e.g. myapp.eu-north1.osmo.nebius.cloud +KEYCLOAK_HOSTNAME="${KEYCLOAK_HOSTNAME:-}" # e.g. 
auth.myapp.eu-north1.osmo.nebius.cloud +# ======================================== + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +echo "" +echo "========================================" +echo " Nebius Environment Initialization" +echo "========================================" +echo "" + +# Detect WSL +is_wsl() { + grep -qi microsoft /proc/version 2>/dev/null +} + +# Get Nebius CLI path +get_nebius_path() { + if command -v nebius &>/dev/null; then + command -v nebius + elif [[ -x "$HOME/.nebius/bin/nebius" ]]; then + echo "$HOME/.nebius/bin/nebius" + fi +} + +# Check if Nebius CLI is installed +check_nebius_cli() { + local nebius_path=$(get_nebius_path) + if [[ -z "$nebius_path" ]]; then + echo -e "${RED}[ERROR]${NC} Nebius CLI is not installed." + echo "" + echo "Install it by running: ./install-tools.sh" + echo "Or manually: curl -sSL https://storage.eu-north1.nebius.cloud/nebius/install.sh | bash" + return 1 + fi + + # Add to PATH if needed + if ! command -v nebius &>/dev/null && [[ -x "$HOME/.nebius/bin/nebius" ]]; then + export PATH="$HOME/.nebius/bin:$PATH" + echo -e "${YELLOW}[INFO]${NC} Added ~/.nebius/bin to PATH" + fi + + return 0 +} + +# Check if Nebius CLI is authenticated +check_nebius_auth() { + local nebius_path=$(get_nebius_path) + if [[ -z "$nebius_path" ]]; then + return 1 + fi + + # Clear potentially corrupted token + if [[ -n "$NEBIUS_IAM_TOKEN" ]]; then + echo -e "${YELLOW}[INFO]${NC} Clearing NEBIUS_IAM_TOKEN environment variable" + unset NEBIUS_IAM_TOKEN + fi + + # Test authentication by listing profiles + if "$nebius_path" profile list &>/dev/null; then + return 0 + fi + return 1 +} + +# Main initialization +main() { + # Step 1: Check Nebius CLI + echo -e "${BLUE}Step 1: Checking Nebius CLI${NC}" + if ! 
check_nebius_cli; then + return 1 + fi + echo -e "${GREEN}[✓]${NC} Nebius CLI found" + echo "" + + # Step 2: Check authentication + echo -e "${BLUE}Step 2: Checking authentication${NC}" + if ! check_nebius_auth; then + echo -e "${YELLOW}[!]${NC} Nebius CLI not authenticated" + echo "" + echo "Please authenticate with Nebius CLI before running this script." + echo "" + echo "Authentication steps:" + echo " 1. Run: nebius profile create" + echo " 2. Follow the interactive prompts" + echo " 3. Complete browser-based authentication" + if is_wsl; then + echo "" + echo -e "${YELLOW}WSL Note:${NC} If browser doesn't open automatically," + echo " copy the URL from the terminal and paste it in your browser." + fi + echo "" + echo "After authentication, run this script again:" + echo " source ./nebius-env-init.sh" + return 1 + fi + echo -e "${GREEN}[✓]${NC} Nebius CLI authenticated" + echo "" + + # Step 3: Validate configuration + echo -e "${BLUE}Step 3: Validating configuration${NC}" + + if [[ -z "$NEBIUS_TENANT_ID" ]]; then + echo -e "${RED}[ERROR]${NC} NEBIUS_TENANT_ID is not set." + echo " Edit the CONFIGURATION section at the top of this script." + return 1 + fi + + if [[ ! "$NEBIUS_TENANT_ID" =~ ^tenant-[a-z0-9]+ ]]; then + echo -e "${RED}[ERROR]${NC} Invalid tenant ID format: '$NEBIUS_TENANT_ID'" + echo " Tenant IDs should look like: tenant-e00abc123def456" + return 1 + fi + + if [[ -z "$NEBIUS_PROJECT_ID" ]]; then + echo -e "${RED}[ERROR]${NC} NEBIUS_PROJECT_ID is not set." + echo " Edit the CONFIGURATION section at the top of this script." + return 1 + fi + + if [[ ! "$NEBIUS_PROJECT_ID" =~ ^project-[a-z0-9]+ ]]; then + echo -e "${RED}[ERROR]${NC} Invalid project ID format: '$NEBIUS_PROJECT_ID'" + echo " Project IDs should look like: project-e00abc123def456" + return 1 + fi + + if [[ -z "$NEBIUS_REGION" ]]; then + echo -e "${RED}[ERROR]${NC} NEBIUS_REGION is not set." + echo " Edit the CONFIGURATION section at the top of this script." 
+ return 1 + fi + + echo -e "${GREEN}[✓]${NC} Configuration valid" + echo "" + + # Step 4: Export environment variables + echo -e "${BLUE}Step 4: Setting environment variables${NC}" + + local nebius_path=$(get_nebius_path) + + export NEBIUS_TENANT_ID + export NEBIUS_PROJECT_ID + export NEBIUS_REGION + export OSMO_INGRESS_HOSTNAME + export KEYCLOAK_HOSTNAME + + # Get IAM token for Terraform provider authentication + echo "Getting IAM token for Terraform..." + unset NEBIUS_IAM_TOKEN # Clear any old/corrupted token + export NEBIUS_IAM_TOKEN=$("$nebius_path" iam get-access-token) + + if [[ -z "$NEBIUS_IAM_TOKEN" ]]; then + echo -e "${RED}[ERROR]${NC} Failed to get IAM token" + return 1 + fi + echo -e "${GREEN}[✓]${NC} IAM token obtained" + + # Terraform variables + export TF_VAR_tenant_id="$NEBIUS_TENANT_ID" + export TF_VAR_parent_id="$NEBIUS_PROJECT_ID" + export TF_VAR_region="$NEBIUS_REGION" + + echo -e "${GREEN}[✓]${NC} Core environment variables set" + + # Step 5: Discover default network and subnet + echo "" + echo -e "${BLUE}Step 5: Discovering default network and subnet${NC}" + + local network_json subnet_json + network_json=$("$nebius_path" vpc v1 network list --parent-id "$NEBIUS_PROJECT_ID" --format json 2>/dev/null) + + local network_id network_name subnet_id subnet_name + + if [[ -n "$network_json" ]]; then + network_id=$(echo "$network_json" | jq -r '(.items // .) | map(select(.metadata.name | startswith("default"))) | .[0].metadata.id // empty' 2>/dev/null) + network_name=$(echo "$network_json" | jq -r '(.items // .) | map(select(.metadata.name | startswith("default"))) | .[0].metadata.name // empty' 2>/dev/null) + fi + + if [[ -z "$network_id" ]]; then + echo -e "${RED}[ERROR]${NC} No default network found in project $NEBIUS_PROJECT_ID" + echo " Expected a network whose name starts with 'default'." 
+ return 1 + fi + + echo -e "${GREEN}[✓]${NC} Found network: $network_name ($network_id)" + + subnet_json=$("$nebius_path" vpc v1 subnet list --parent-id "$NEBIUS_PROJECT_ID" --format json 2>/dev/null) + + if [[ -n "$subnet_json" ]]; then + subnet_id=$(echo "$subnet_json" | jq -r '(.items // .) | map(select(.metadata.name | startswith("default"))) | .[0].metadata.id // empty' 2>/dev/null) + subnet_name=$(echo "$subnet_json" | jq -r '(.items // .) | map(select(.metadata.name | startswith("default"))) | .[0].metadata.name // empty' 2>/dev/null) + fi + + if [[ -z "$subnet_id" ]]; then + echo -e "${RED}[ERROR]${NC} No default subnet found in project $NEBIUS_PROJECT_ID" + echo " Expected a subnet whose name starts with 'default'." + return 1 + fi + + echo -e "${GREEN}[✓]${NC} Found subnet: $subnet_name ($subnet_id)" + + export NEBIUS_NETWORK_ID="$network_id" + export NEBIUS_SUBNET_ID="$subnet_id" + export TF_VAR_network_id="$network_id" + export TF_VAR_subnet_id="$subnet_id" + + # Step 6: Verify connectivity + echo "" + echo -e "${BLUE}Step 6: Verifying connectivity${NC}" + + if "$nebius_path" iam project get --id "$NEBIUS_PROJECT_ID" &>/dev/null; then + echo -e "${GREEN}[✓]${NC} Successfully connected to Nebius project" + else + echo -e "${YELLOW}[!]${NC} Could not verify project access (this may be normal for new projects)" + fi + + echo "" + echo "========================================" + echo -e "${GREEN}Environment initialization complete!${NC}" + echo "========================================" + echo "" + echo -e "${GREEN}[✓]${NC} Environment variables set:" + echo " NEBIUS_TENANT_ID = $NEBIUS_TENANT_ID" + echo " NEBIUS_PROJECT_ID = $NEBIUS_PROJECT_ID" + echo " NEBIUS_REGION = $NEBIUS_REGION" + echo " NEBIUS_IAM_TOKEN = ${NEBIUS_IAM_TOKEN:0:20}... 
(truncated)" + echo " NEBIUS_NETWORK_ID = $NEBIUS_NETWORK_ID" + echo " NEBIUS_SUBNET_ID = $NEBIUS_SUBNET_ID" + echo " OSMO_INGRESS_HOSTNAME = $OSMO_INGRESS_HOSTNAME" + echo " KEYCLOAK_HOSTNAME = $KEYCLOAK_HOSTNAME" + echo "" + echo " Network: $network_name ($network_id)" + echo " Subnet: $subnet_name ($subnet_id)" + echo "" + echo "Next steps:" + echo " 1. cd ../001-iac" + echo " 2. cp terraform.tfvars.cost-optimized-secure.example terraform.tfvars" + echo " 3. terraform init && terraform apply" + echo "" + + return 0 +} + +# Run main function +main diff --git a/applications/osmo/deploy/example/000-prerequisites/secrets-init_deprecated.sh b/applications/osmo/deploy/example/000-prerequisites/secrets-init_deprecated.sh new file mode 100755 index 000000000..8a37f4203 --- /dev/null +++ b/applications/osmo/deploy/example/000-prerequisites/secrets-init_deprecated.sh @@ -0,0 +1,473 @@ +#!/bin/bash +# +# OSMO on Nebius - Secrets Initialization Script +# +# This script generates secrets and stores them in Nebius MysteryBox. +# Secrets are NOT stored in Terraform state - only the secret IDs are used. +# +# Usage: +# source ./secrets-init.sh +# +# Prerequisites: +# - Nebius CLI installed and authenticated +# - Environment variables set (run nebius-env-init.sh first) +# - jq installed +# + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +CYAN='\033[0;36m' +NC='\033[0m' + +# Default secret names +POSTGRESQL_SECRET_NAME="${OSMO_POSTGRESQL_SECRET_NAME:-osmo-postgresql-password}" +MEK_SECRET_NAME="${OSMO_MEK_SECRET_NAME:-osmo-mek}" + +echo "" +echo "========================================" +echo " OSMO Secrets Initialization" +echo "========================================" +echo "" + +# ----------------------------------------------------------------------------- +# Helper Functions +# ----------------------------------------------------------------------------- + +# Read input with a prompt into a variable (bash/zsh compatible). 
+read_prompt_var() { + local prompt=$1 + local var_name=$2 + local default=$3 + local value="" + local read_from="/dev/tty" + local write_to="/dev/tty" + + if [[ ! -r "/dev/tty" || ! -w "/dev/tty" ]]; then + read_from="/dev/stdin" + write_to="/dev/stdout" + fi + + if [[ -n "$default" ]]; then + printf "%s [%s]: " "$prompt" "$default" >"$write_to" + else + printf "%s: " "$prompt" >"$write_to" + fi + + IFS= read -r value <"$read_from" + if [[ -z "$value" && -n "$default" ]]; then + value="$default" + fi + + eval "$var_name='$value'" +} + +# Return a random integer in range [min, max] using /dev/urandom. +rand_int() { + local min=$1 + local max=$2 + local range=$((max - min + 1)) + local num="" + + while :; do + num=$(od -An -N2 -tu2 /dev/urandom | tr -d ' ') + if [[ -n "$num" ]]; then + echo $((min + num % range)) + return 0 + fi + done +} + +# Pick a random character from a set. +rand_char_from_set() { + local set=$1 + local idx + idx=$(rand_int 0 $((${#set} - 1))) + printf "%s" "${set:$idx:1}" +} + +# Shuffle a string using Fisher-Yates. 
+shuffle_string() { + local input=$1 + local -a chars + local i j tmp + local len=${#input} + + if [[ -n "${BASH_VERSION:-}" ]]; then + for ((i = 0; i < len; i++)); do + chars[i]="${input:i:1}" + done + for ((i = len - 1; i > 0; i--)); do + j=$(rand_int 0 "$i") + tmp="${chars[i]}" + chars[i]="${chars[j]}" + chars[j]="$tmp" + done + local out="" + for ((i = 0; i < len; i++)); do + out+="${chars[i]}" + done + printf "%s" "$out" + else + # zsh uses 1-based indexing for arrays and string subscripts + for ((i = 1; i <= len; i++)); do + chars[i]="${input[$i]}" + done + for ((i = len; i > 1; i--)); do + j=$(rand_int 1 "$i") + tmp="${chars[i]}" + chars[i]="${chars[j]}" + chars[j]="$tmp" + done + local out="" + for ((i = 1; i <= len; i++)); do + out+="${chars[i]}" + done + printf "%s" "$out" + fi +} + +get_nebius_path() { + if command -v nebius &>/dev/null; then + command -v nebius + elif [[ -x "$HOME/.nebius/bin/nebius" ]]; then + echo "$HOME/.nebius/bin/nebius" + fi +} + +check_prerequisites() { + echo -e "${BLUE}Step 1: Checking prerequisites${NC}" + + # Check Nebius CLI + local nebius_path=$(get_nebius_path) + if [[ -z "$nebius_path" ]]; then + echo -e "${RED}[ERROR]${NC} Nebius CLI not found" + echo " Run: ./install-tools.sh" + return 1 + fi + echo -e "${GREEN}[✓]${NC} Nebius CLI found" + + # Check jq + if ! command -v jq &>/dev/null; then + echo -e "${RED}[ERROR]${NC} jq not found" + echo " Install: sudo apt-get install jq" + return 1 + fi + echo -e "${GREEN}[✓]${NC} jq found" + + # Check openssl + if ! 
command -v openssl &>/dev/null; then + echo -e "${RED}[ERROR]${NC} openssl not found" + return 1 + fi + echo -e "${GREEN}[✓]${NC} openssl found" + + # Check environment variables + if [[ -z "$NEBIUS_PROJECT_ID" ]]; then + echo -e "${RED}[ERROR]${NC} NEBIUS_PROJECT_ID not set" + echo " Run: source ./nebius-env-init.sh" + return 1 + fi + echo -e "${GREEN}[✓]${NC} NEBIUS_PROJECT_ID set: $NEBIUS_PROJECT_ID" + + echo "" + return 0 +} + +# Generate secure password meeting Nebius PostgreSQL requirements +generate_postgresql_password() { + # Requirements: + # - Min 8 characters (we use 32) + # - At least one lowercase, uppercase, digit, special char + # - No % character + + local password="" + local attempts=0 + local max_attempts=10 + + while [[ $attempts -lt $max_attempts ]]; do + # Generate base password + password=$(openssl rand -base64 32 | tr -d '/+=\n' | head -c 28) + + # Add required character types + local lower=$(rand_char_from_set "abcdefghijklmnopqrstuvwxyz") + local upper=$(rand_char_from_set "ABCDEFGHIJKLMNOPQRSTUVWXYZ") + local digit=$(rand_char_from_set "0123456789") + local special=$(rand_char_from_set '!#$^&*()-_=+') + + password="${password}${lower}${upper}${digit}${special}" + + # Shuffle the password + password=$(shuffle_string "$password") + + # Verify requirements + if [[ ${#password} -ge 32 ]] && \ + [[ "$password" =~ [a-z] ]] && \ + [[ "$password" =~ [A-Z] ]] && \ + [[ "$password" =~ [0-9] ]] && \ + [[ "$password" =~ [\!\#\$\^\&\*\(\)\-\_\=\+] ]] && \ + [[ ! "$password" =~ [%@:\/\;\[\]\{\}\|\<\>\,\.\?] 
]]; then + echo "$password" + return 0 + fi + + ((attempts++)) + done + + echo -e "${RED}[ERROR]${NC} Failed to generate valid password after $max_attempts attempts" + return 1 +} + +# Generate MEK (Master Encryption Key) for OSMO +generate_mek() { + # MEK is a JWK (JSON Web Key) format + # OSMO expects: {"currentMek": "key1", "meks": {"key1": ""}} + + # Generate a 256-bit key + local key_bytes=$(openssl rand 32) + local key_base64=$(echo -n "$key_bytes" | base64 | tr -d '\n') + + # Create JWK structure (symmetric key) + local jwk=$(cat </dev/null) + + # Extract JSON from output (CLI may print info messages before JSON) + local json_result=$(echo "$result" | awk '/^{/,0') + + if [[ -n "$json_result" && "$json_result" != "null" ]]; then + echo "$json_result" | jq -r '.metadata.id' + return 0 + fi + return 1 +} + +# Create secret in MysteryBox +create_secret() { + local parent_id=$1 + local secret_name=$2 + local key=$3 + local value=$4 + local nebius_path=$(get_nebius_path) + + # Escape special characters in value for JSON + local escaped_value=$(echo -n "$value" | jq -Rs '.') + # Remove surrounding quotes added by jq + escaped_value=${escaped_value:1:-1} + + local payload="[{\"key\":\"$key\",\"string_value\":\"$escaped_value\"}]" + + local result=$("$nebius_path" mysterybox v1 secret create \ + --parent-id "$parent_id" \ + --name "$secret_name" \ + --secret-version-payload "$payload" \ + --format json 2>&1) + + local exit_code=$? 
+ + # Extract JSON from output (CLI may print info messages before JSON) + # Find the first line starting with '{' and print everything from there + local json_result=$(echo "$result" | awk '/^{/,0') + + if [[ $exit_code -eq 0 && -n "$json_result" ]]; then + echo "$json_result" | jq -r '.metadata.id' + return 0 + else + echo -e "${RED}[ERROR]${NC} Failed to create secret: $result" + return 1 + fi +} + +# Delete secret from MysteryBox +delete_secret() { + local secret_id=$1 + local nebius_path=$(get_nebius_path) + + "$nebius_path" mysterybox v1 secret delete --id "$secret_id" 2>/dev/null +} + +# ----------------------------------------------------------------------------- +# Main Secret Creation Functions +# ----------------------------------------------------------------------------- + +create_postgresql_secret() { + echo -e "${BLUE}Creating PostgreSQL password secret...${NC}" + + # Check if secret already exists + local existing_id=$(secret_exists "$NEBIUS_PROJECT_ID" "$POSTGRESQL_SECRET_NAME") + + if [[ -n "$existing_id" ]]; then + echo -e "${YELLOW}[!]${NC} Secret '$POSTGRESQL_SECRET_NAME' already exists (ID: $existing_id)" + read_prompt_var " Replace existing secret? (y/N)" replace "" + if [[ "$replace" =~ ^[Yy]$ ]]; then + echo " Deleting existing secret..." + delete_secret "$existing_id" + sleep 2 + else + echo " Using existing secret" + export OSMO_POSTGRESQL_SECRET_ID="$existing_id" + export TF_VAR_postgresql_mysterybox_secret_id="$existing_id" + return 0 + fi + fi + + # Generate password + echo " Generating secure password..." + local password=$(generate_postgresql_password) + if [[ $? -ne 0 || -z "$password" ]]; then + echo -e "${RED}[ERROR]${NC} Failed to generate password" + return 1 + fi + echo -e "${GREEN}[✓]${NC} Password generated (length: ${#password})" + + # Store in MysteryBox + echo " Storing in MysteryBox..." + local secret_id=$(create_secret "$NEBIUS_PROJECT_ID" "$POSTGRESQL_SECRET_NAME" "password" "$password") + + if [[ $? 
# Create (or reuse) the MEK secret in MysteryBox.
# Exports on success: OSMO_MEK_SECRET_ID, TF_VAR_mek_mysterybox_secret_id.
# Returns 0 on success (including reuse of an existing secret), 1 on failure.
create_mek_secret() {
    echo -e "${BLUE}Creating MEK (Master Encryption Key) secret...${NC}"

    # Reuse an existing secret unless the operator opts to replace it.
    local existing_id
    existing_id=$(secret_exists "$NEBIUS_PROJECT_ID" "$MEK_SECRET_NAME")

    if [[ -n "$existing_id" ]]; then
        echo -e "${YELLOW}[!]${NC} Secret '$MEK_SECRET_NAME' already exists (ID: $existing_id)"
        read_prompt_var "  Replace existing secret? (y/N)" replace ""
        if [[ "$replace" =~ ^[Yy]$ ]]; then
            echo "  Deleting existing secret..."
            delete_secret "$existing_id"
            sleep 2  # give the API a moment before re-creating the same name
        else
            echo "  Using existing secret"
            export OSMO_MEK_SECRET_ID="$existing_id"
            export TF_VAR_mek_mysterybox_secret_id="$existing_id"
            return 0
        fi
    fi

    # Generate MEK. Declaration is split from assignment: with
    # `local mek=$(generate_mek)` the subsequent `$?` check reads the status
    # of `local` (always 0, SC2155) and failures would go unnoticed.
    echo "  Generating Master Encryption Key..."
    local mek
    if ! mek=$(generate_mek) || [[ -z "$mek" ]]; then
        echo -e "${RED}[ERROR]${NC} Failed to generate MEK" >&2
        return 1
    fi
    echo -e "${GREEN}[✓]${NC} MEK generated"

    # Store in MysteryBox (same $?-masking fix as above).
    echo "  Storing in MysteryBox..."
    local secret_id
    if secret_id=$(create_secret "$NEBIUS_PROJECT_ID" "$MEK_SECRET_NAME" "mek" "$mek") \
        && [[ -n "$secret_id" ]]; then
        echo -e "${GREEN}[✓]${NC} MEK secret created: $secret_id"
        export OSMO_MEK_SECRET_ID="$secret_id"
        export TF_VAR_mek_mysterybox_secret_id="$secret_id"
        return 0
    else
        echo -e "${RED}[ERROR]${NC} Failed to create MEK secret" >&2
        return 1
    fi
}
check_prerequisites; then + return 1 + fi + + echo -e "${BLUE}Step 2: Creating secrets in MysteryBox${NC}" + echo "" + + local success=true + + # Create PostgreSQL secret + if ! create_postgresql_secret; then + success=false + fi + echo "" + + # Create MEK secret + if ! create_mek_secret; then + success=false + fi + echo "" + + if ! $success; then + echo -e "${RED}[ERROR]${NC} Some secrets failed to create" + return 1 + fi + + # Summary + echo "========================================" + echo -e "${GREEN}Secrets initialization complete!${NC}" + echo "========================================" + echo "" + echo "Environment variables exported:" + echo " TF_VAR_postgresql_mysterybox_secret_id = $TF_VAR_postgresql_mysterybox_secret_id" + echo " TF_VAR_mek_mysterybox_secret_id = $TF_VAR_mek_mysterybox_secret_id" + echo "" + echo "Secrets are stored in MysteryBox. Run this script again in a new" + echo "terminal session to retrieve existing secrets by name." + echo "" + echo "To retrieve secret values manually:" + echo " # PostgreSQL password:" + echo " nebius mysterybox v1 payload get-by-key --secret-id $TF_VAR_postgresql_mysterybox_secret_id --key password --format json | jq -r '.data.string_value'" + echo "" + echo " # MEK:" + echo " nebius mysterybox v1 payload get-by-key --secret-id $TF_VAR_mek_mysterybox_secret_id --key mek --format json | jq -r '.data.string_value'" + echo "" + echo "Next steps:" + echo " 1. cd ../001-iac" + echo " 2. cp terraform.tfvars.cost-optimized.example terraform.tfvars # or another preset" + echo " 3. 
# Classify the host operating system for installer selection.
# Prints exactly one of: linux, wsl, macos, windows, unknown.
detect_os() {
    case "$OSTYPE" in
        linux-gnu*)
            # WSL kernels advertise "Microsoft" in /proc/version.
            if grep -q Microsoft /proc/version 2>/dev/null; then
                echo "wsl"
            else
                echo "linux"
            fi
            ;;
        darwin*)
            echo "macos"
            ;;
        msys|cygwin)
            echo "windows"
            ;;
        *)
            echo "unknown"
            ;;
    esac
}
# Print the value of a Terraform output from ../001-iac on stdout.
# Diagnostics go to stderr so command substitution never captures them as
# the value (the original echoed the error to stdout, which callers then
# stored as the "output"). Returns non-zero if the directory or output is
# missing.
get_terraform_output() {
    local output_name=$1
    # Run inside a subshell so the caller's working directory is never
    # changed, even if terraform exits part-way (no cd/cd - pairing needed).
    (
        cd ../001-iac 2>/dev/null || {
            echo -e "${RED}[ERROR]${NC} Cannot find ../001-iac directory" >&2
            exit 1
        }
        terraform output -raw "$output_name" 2>/dev/null
    )
}

# Generate a WireGuard keypair.
# Outputs "<private_key>|<public_key>" on a single line.
generate_client_keys() {
    local private_key public_key
    private_key=$(wg genkey)
    public_key=$(wg pubkey <<<"$private_key")
    echo "${private_key}|${public_key}"
}
+ echo "" + echo "To enable WireGuard, set in terraform.tfvars:" + echo " enable_wireguard = true" + echo "" + echo "Then run: terraform apply" + exit 1 + fi + + local wg_ui_url=$(get_terraform_output "wireguard.ui_url" 2>/dev/null || echo "") + + echo "" + echo -e "${GREEN}[✓]${NC} WireGuard server found" + echo " Public IP: $wg_public_ip" + if [[ -n "$wg_ui_url" && "$wg_ui_url" != "null" ]]; then + echo " Web UI: $wg_ui_url" + fi + echo "" + + # Instructions for manual configuration + echo "========================================" + echo -e "${BLUE}Configuration Instructions${NC}" + echo "========================================" + echo "" + echo "Option 1: Use WireGuard Web UI (Recommended)" + echo " 1. Open in browser: $wg_ui_url" + echo " 2. Login with the generated password (check Terraform output)" + echo " 3. Create a new client configuration" + echo " 4. Download the configuration file" + echo " 5. Import into WireGuard client" + echo "" + echo "Option 2: Manual Configuration" + echo " 1. Generate client keys: wg genkey | tee privatekey | wg pubkey > publickey" + echo " 2. SSH to WireGuard server and add peer" + echo " 3. 
Create local configuration file" + echo "" + + # Create config template + local config_file="wg-client-osmo.conf" + + if [[ "$OS" == "linux" ]] && command -v wg &>/dev/null; then + echo -e "${BLUE}Generating client configuration template...${NC}" + + local keys=$(generate_client_keys) + local client_private_key=$(echo "$keys" | cut -d'|' -f1) + local client_public_key=$(echo "$keys" | cut -d'|' -f2) + + cat > "$config_file" << EOF +[Interface] +# Client private key (generated) +PrivateKey = $client_private_key +# Client IP address in VPN network (change if needed) +Address = 10.8.0.2/24 +DNS = 8.8.8.8 + +[Peer] +# WireGuard server public key (get from server) +PublicKey = +# Allowed IPs - route all traffic through VPN +AllowedIPs = 10.8.0.0/24, 10.0.0.0/16 +# WireGuard server endpoint +Endpoint = $wg_public_ip:51820 +# Keep connection alive +PersistentKeepalive = 25 +EOF + + echo "" + echo -e "${GREEN}[✓]${NC} Configuration template created: $config_file" + echo "" + echo "Your client public key (add this to server):" + echo " $client_public_key" + echo "" + echo "Next steps:" + echo " 1. Get server public key from WireGuard Web UI or server" + echo " 2. Add your client public key to server" + echo " 3. Update in $config_file" + echo " 4. Start VPN: sudo wg-quick up ./$config_file" + fi + + echo "" + echo "========================================" + echo -e "${GREEN}Setup guide complete!${NC}" + echo "========================================" +} + +main "$@" diff --git a/applications/osmo/deploy/example/001-iac/README.md b/applications/osmo/deploy/example/001-iac/README.md new file mode 100755 index 000000000..6f2ae927d --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/README.md @@ -0,0 +1,245 @@ +# Infrastructure as Code (Terraform) + +This directory contains Terraform configurations for deploying OSMO infrastructure on Nebius. + +## Prerequisites + +1. Install required tools: + ```bash + cd ../000-prerequisites + ./install-tools.sh + ``` + +2. 
Configure Nebius environment: + ```bash + source ../000-prerequisites/nebius-env-init.sh + ``` + +3. **(Recommended)** Initialize secrets in MysteryBox: + ```bash + source ../000-prerequisites/secrets-init.sh + ``` + This generates secure passwords/keys and stores them in MysteryBox, keeping them OUT of Terraform state. + +## Quick Start + +```bash +# Recommended: Cost-optimized with secure private access +cp terraform.tfvars.cost-optimized-secure.example terraform.tfvars + +# Edit if needed (tenant_id and parent_id set via environment) +vim terraform.tfvars + +# Deploy +terraform init +terraform plan +terraform apply +``` + +## Configuration Tiers + +| File | Use Case | GPU | Security | Est. Cost/6h | +|------|----------|-----|----------|--------------| +| `terraform.tfvars.cost-optimized-secure.example` (recommended) | Dev | 1x L40S | WireGuard | **~$15-25** | +| `terraform.tfvars.cost-optimized.example` | Dev (cheapest) | 1x L40S | Public | ~$10-15 | +| `terraform.tfvars.secure.example` | Staging | 8x H100 | WireGuard | ~$300-400 | +| `terraform.tfvars.production.example` | Production | 32x H200 | WireGuard | ~$1000+ | + +## Resources Created + +### Network +- VPC Network +- Subnet with configurable CIDR + +### Kubernetes +- Managed Kubernetes Cluster (MK8s) +- CPU Node Group (for system workloads) +- GPU Node Group(s) (for training) +- Service Account for node groups + +### Storage +- Object Storage Bucket (S3-compatible) +- Shared Filesystem (Filestore) +- Service Account with access keys + +### Database +- Managed PostgreSQL Cluster + +### Container Registry +- Nebius Container Registry (when `enable_container_registry = true`) + +### Optional +- WireGuard VPN Server (when `enable_wireguard = true`) +- GPU Cluster for InfiniBand (when `enable_gpu_cluster = true`) + +## Module Structure + +``` +001-iac/ +├── main.tf # Root module +├── variables.tf # Input variables +├── outputs.tf # Output values +├── locals.tf # Local values +├── versions.tf # Provider 
versions +├── terraform.tfvars.*.example +└── modules/ + ├── platform/ # VPC, Storage, DB, Container Registry + ├── k8s/ # Kubernetes cluster + └── wireguard/ # VPN server +``` + +## GPU Options + +### Available Platforms (eu-north1) + +| Platform | GPU | VRAM | ~Cost/hr | Best For | +|----------|-----|------|----------|----------| +| `gpu-l40s-a` | L40S Intel | 48GB | **$1.55** | Dev/Testing (cheapest) | +| `gpu-l40s-d` | L40S AMD | 48GB | **$1.55** | Dev/Testing | +| `gpu-h100-sxm` | H100 | 80GB | ~$4-5 | Training | +| `gpu-h200-sxm` | H200 | 141GB | ~$5-6 | Large models | + +### Presets + +| Platform | Preset | GPUs | vCPUs | RAM | +|----------|--------|------|-------|-----| +| L40S | `1gpu-8vcpu-32gb` | 1 | 8 | 32GB | +| L40S | `2gpu-16vcpu-64gb` | 2 | 16 | 64GB | +| H100/H200 | `1gpu-16vcpu-200gb` | 1 | 16 | 200GB | +| H100/H200 | `8gpu-128vcpu-1600gb` | 8 | 128 | 1600GB | + +## Security Options + +### Public Access (Default) + +```hcl +enable_public_endpoint = true +cpu_nodes_assign_public_ip = true +enable_wireguard = false +``` + +### Private Access (WireGuard) + +```hcl +enable_public_endpoint = false +cpu_nodes_assign_public_ip = false +gpu_nodes_assign_public_ip = false +enable_wireguard = true +``` + +After deployment, set up VPN client: +```bash +cd ../000-prerequisites +./wireguard-client-setup.sh +``` + +## Cost Optimization + +### Use Preemptible GPUs +```hcl +gpu_nodes_preemptible = true # Up to 70% savings +``` + +### Use Single-GPU Nodes for Dev +```hcl +gpu_nodes_preset = "1gpu-16vcpu-200gb" +enable_gpu_cluster = false +``` + +### Minimize Storage +```hcl +filestore_size_gib = 256 +postgresql_disk_size_gib = 20 +``` + +## Secrets Management (MysteryBox) + +This module supports two approaches for secrets: + +### Option A: MysteryBox (Recommended) +Secrets are stored in Nebius MysteryBox and read at runtime. 
**Not stored in Terraform state.** + +```bash +# Before terraform apply: +cd ../000-prerequisites +source ./secrets-init.sh # Creates secrets in MysteryBox +cd ../001-iac +terraform apply # Uses TF_VAR_* env vars set by script +``` + +**Benefits:** +- Secrets never in Terraform state file +- Centralized secret management +- Easier rotation without re-deploying +- Better audit trail + +**Retrieving Secrets:** +```bash +# PostgreSQL password +nebius mysterybox v1 payload get-by-key \ + --secret-id $OSMO_POSTGRESQL_SECRET_ID \ + --key password \ + --format json | jq -r '.data.string_value' + +# MEK +nebius mysterybox v1 payload get-by-key \ + --secret-id $OSMO_MEK_SECRET_ID \ + --key mek \ + --format json | jq -r '.data.string_value' +``` + +### Option B: Terraform-Generated (Fallback) +If MysteryBox secret IDs are not set, Terraform generates secrets automatically. + +```hcl +# Secrets stored in Terraform state (less secure) +postgresql_mysterybox_secret_id = null # Default +mek_mysterybox_secret_id = null # Default +``` + +**Retrieving Secrets:** +```bash +terraform output -json postgresql_password +``` + +### MysteryBox Variables + +| Variable | Description | +|----------|-------------| +| `postgresql_mysterybox_secret_id` | Secret ID for PostgreSQL password | +| `mek_mysterybox_secret_id` | Secret ID for OSMO MEK | + +## Outputs + +After `terraform apply`, you'll see: + +- `cluster_id` - Kubernetes cluster ID +- `cluster_endpoint` - Kubernetes API endpoint +- `storage_bucket` - Object storage details +- `container_registry` - Container Registry details (endpoint, name) +- `postgresql` - Database connection info +- `wireguard` - VPN details (if enabled) +- `next_steps` - Instructions for next deployment phase + +## Cleanup + +```bash +terraform destroy +``` + +**Warning**: This will delete all resources including data in PostgreSQL and Object Storage. 
+ +## Troubleshooting + +### Authentication Error +```bash +source ../000-prerequisites/nebius-env-init.sh +``` + +### Resource Quota Exceeded +Check your Nebius quota in the console and request increases if needed. + +### Invalid GPU Platform +Verify the platform is available in your region: +- `eu-north1`: H100 +- `eu-west1`: H200 diff --git a/applications/osmo/deploy/example/001-iac/locals.tf b/applications/osmo/deploy/example/001-iac/locals.tf new file mode 100755 index 000000000..f1b9e0dd6 --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/locals.tf @@ -0,0 +1,124 @@ +# ============================================================================= +# Local Values +# ============================================================================= + +locals { + # Resource naming + name_prefix = "${var.project_name}-${var.environment}" + + # SSH key handling + ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( + fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null + ) + + # Region-specific defaults + region_defaults = { + eu-north1 = { + gpu_nodes_platform = "gpu-h100-sxm" + gpu_nodes_preset = "8gpu-128vcpu-1600gb" + infiniband_fabric = "fabric-3" + postgresql_platform = "cpu-e2" + postgresql_disk_type = "network-ssd" + } + eu-north2 = { + gpu_nodes_platform = "gpu-h200-sxm" + gpu_nodes_preset = "8gpu-128vcpu-1600gb" + infiniband_fabric = "eu-north2-a" + postgresql_platform = "cpu-d3" + postgresql_disk_type = "network-ssd" + } + eu-west1 = { + gpu_nodes_platform = "gpu-h200-sxm" + gpu_nodes_preset = "8gpu-128vcpu-1600gb" + infiniband_fabric = "fabric-5" + postgresql_platform = "cpu-d3" + postgresql_disk_type = "network-ssd" + } + me-west1 = { + gpu_nodes_platform = "gpu-b200-sxm-a" + gpu_nodes_preset = "8gpu-160vcpu-1792gb" + infiniband_fabric = "me-west1-a" + postgresql_platform = "cpu-d3" + postgresql_disk_type = "network-ssd" + } + uk-south1 = { + gpu_nodes_platform = "gpu-b300-sxm" + gpu_nodes_preset = 
"8gpu-192vcpu-2768gb" + infiniband_fabric = "uk-south1-a" + postgresql_platform = "cpu-d3" + postgresql_disk_type = "network-ssd" + } + us-central1 = { + gpu_nodes_platform = "gpu-h200-sxm" + gpu_nodes_preset = "8gpu-128vcpu-1600gb" + infiniband_fabric = "us-central1-a" + postgresql_platform = "cpu-d3" + postgresql_disk_type = "network-ssd" + } + } + + # Available GPU platforms by region (for reference) + # + # eu-north1: + # - gpu-h100-sxm (NVIDIA H100 80GB HBM3) presets: 1gpu-16vcpu-200gb, 8gpu-128vcpu-1600gb + # - gpu-h200-sxm (NVIDIA H200) presets: 1gpu-16vcpu-200gb, 8gpu-128vcpu-1600gb + # - gpu-l40s-a (L40S Intel, 48GB VRAM) presets: 1gpu-8vcpu-32gb, 2gpu-16vcpu-64gb + # - gpu-l40s-d (L40S AMD, 48GB VRAM) presets: 1gpu-8vcpu-32gb, 2gpu-16vcpu-64gb + # + # eu-north2: + # - gpu-h200-sxm (NVIDIA H200) presets: 1gpu-16vcpu-200gb, 8gpu-128vcpu-1600gb + # + # eu-west1: + # - gpu-h200-sxm (NVIDIA H200) presets: 1gpu-16vcpu-200gb, 8gpu-128vcpu-1600gb + # + # me-west1: + # - gpu-b200-sxm-a (NVIDIA B200) presets: 1gpu-20vcpu-224gb, 8gpu-160vcpu-1792gb + # + # uk-south1: + # - gpu-b300-sxm (NVIDIA B300 SXM6 AC) presets: 1gpu-24vcpu-346gb, 8gpu-192vcpu-2768gb + # + # us-central1: + # - gpu-h200-sxm (NVIDIA H200) presets: 1gpu-16vcpu-200gb, 8gpu-128vcpu-1600gb + # - gpu-b200-sxm (NVIDIA B200) presets: 1gpu-20vcpu-224gb, 8gpu-160vcpu-1792gb + + # Current region config with overrides + current_region = local.region_defaults[var.region] + + gpu_nodes_platform = coalesce(var.gpu_nodes_platform, local.current_region.gpu_nodes_platform) + gpu_nodes_preset = coalesce(var.gpu_nodes_preset, local.current_region.gpu_nodes_preset) + infiniband_fabric = coalesce(var.infiniband_fabric, local.current_region.infiniband_fabric) + + postgresql_platform = coalesce(var.postgresql_platform, local.current_region.postgresql_platform) + postgresql_disk_type = coalesce(var.postgresql_disk_type, local.current_region.postgresql_disk_type) + + # Driverfull image: map GPU platform to CUDA driver 
# Random suffix used to make globally-unique names (e.g. the object storage
# bucket) collision-free across deployments.
resource "random_string" "suffix" {
  length  = 8
  lower   = true
  upper   = false
  numeric = true
  special = false

  # Regenerate the suffix only when the target project changes, so repeated
  # applies in the same project keep stable resource names.
  keepers = {
    project_id = var.parent_id
  }
}
(optional - can use in-cluster PostgreSQL instead) + enable_managed_postgresql = var.enable_managed_postgresql + postgresql_version = var.postgresql_version + postgresql_public_access = var.postgresql_public_access + postgresql_platform = local.postgresql_platform + postgresql_preset = var.postgresql_preset + postgresql_disk_type = local.postgresql_disk_type + postgresql_disk_size_gib = var.postgresql_disk_size_gib + postgresql_host_count = var.postgresql_host_count + postgresql_database_name = var.postgresql_database_name + postgresql_username = var.postgresql_username + + # Container Registry + enable_container_registry = var.enable_container_registry + container_registry_name = var.container_registry_name +} + +# ----------------------------------------------------------------------------- +# Kubernetes Module +# ----------------------------------------------------------------------------- +module "k8s" { + source = "./modules/k8s" + + parent_id = var.parent_id + tenant_id = var.tenant_id + region = var.region + name_prefix = local.name_prefix + + # Network + subnet_id = var.subnet_id + + # Cluster config + k8s_version = var.k8s_version + etcd_cluster_size = var.etcd_cluster_size + enable_public_endpoint = var.enable_public_endpoint + + # SSH + ssh_user_name = var.ssh_user_name + ssh_public_key = local.ssh_public_key + + # CPU nodes + cpu_nodes_count = var.cpu_nodes_count + cpu_nodes_platform = var.cpu_nodes_platform + cpu_nodes_preset = var.cpu_nodes_preset + cpu_disk_type = var.cpu_disk_type + cpu_disk_size_gib = var.cpu_disk_size_gib + cpu_nodes_assign_public_ip = var.cpu_nodes_assign_public_ip + + # GPU nodes + gpu_nodes_count_per_group = var.gpu_nodes_count_per_group + gpu_node_groups = var.gpu_node_groups + gpu_nodes_platform = local.gpu_nodes_platform + gpu_nodes_preset = local.gpu_nodes_preset + gpu_disk_type = var.gpu_disk_type + gpu_disk_size_gib = var.gpu_disk_size_gib + gpu_nodes_assign_public_ip = var.gpu_nodes_assign_public_ip + enable_gpu_cluster 
= var.enable_gpu_cluster + infiniband_fabric = local.infiniband_fabric + enable_gpu_taints = var.enable_gpu_taints + gpu_nodes_preemptible = var.gpu_nodes_preemptible + gpu_nodes_driverfull_image = var.gpu_nodes_driverfull_image + gpu_drivers_preset = local.gpu_drivers_preset + gpu_reservation_ids = var.gpu_reservation_ids + + # Filestore + enable_filestore = var.enable_filestore + filestore_id = var.enable_filestore ? module.platform.filestore_id : null + + # Note: No explicit depends_on needed - Terraform infers dependencies from: + # - subnet_id (waits for subnet) + # - filestore_id (waits for filestore if enabled) + # This allows k8s to start as soon as subnet/filestore are ready, + # without waiting for PostgreSQL (which takes 5-15 min) +} + +# ----------------------------------------------------------------------------- +# WireGuard VPN Module (Optional) +# ----------------------------------------------------------------------------- +module "wireguard" { + count = var.enable_wireguard ? 
# ---------------------------------------------------------------------------
# Service account assumed by all Kubernetes node groups in this cluster.
# ---------------------------------------------------------------------------
data "nebius_iam_v1_group" "editors" {
  name      = "editors"
  parent_id = var.tenant_id
}

resource "nebius_iam_v1_service_account" "k8s_nodes" {
  parent_id = var.parent_id
  name      = "${var.name_prefix}-k8s-nodes-sa"
}

# NOTE(review): membership in the tenant-wide "editors" group grants the node
# service account broad write access across the tenant — confirm a narrower
# role is not sufficient for node-group operation.
resource "nebius_iam_v1_group_membership" "k8s_nodes" {
  parent_id = data.nebius_iam_v1_group.editors.id
  member_id = nebius_iam_v1_service_account.k8s_nodes.id
}
# ---------------------------------------------------------------------------
# Managed Kubernetes control plane.
# ---------------------------------------------------------------------------
resource "nebius_mk8s_v1_cluster" "main" {
  parent_id = var.parent_id
  name      = "${var.name_prefix}-cluster"

  control_plane = {
    subnet_id         = var.subnet_id
    version           = var.k8s_version
    etcd_cluster_size = var.etcd_cluster_size

    # Expose a public API endpoint only when requested; otherwise the API is
    # reachable only from the private network (e.g. via WireGuard).
    endpoints = var.enable_public_endpoint ? {
      public_endpoint = {}
    } : {}
  }

  lifecycle {
    # Labels may be managed out-of-band (console or other tooling); do not
    # fight over them on subsequent applies.
    ignore_changes = [labels]
  }
}
[ + { + attach_mode = "READ_WRITE" + mount_tag = "data" + existing_filesystem = { + id = var.filestore_id + } + } + ] : null + + cloud_init_user_data = templatefile("${path.module}/templates/cloud-init.yaml", { + ssh_user_name = var.ssh_user_name + ssh_public_key = var.ssh_public_key + enable_filestore = var.enable_filestore + }) + } +} + +# ----------------------------------------------------------------------------- +# GPU Node Groups +# ----------------------------------------------------------------------------- +resource "nebius_mk8s_v1_node_group" "gpu" { + count = var.gpu_nodes_count_per_group > 0 ? var.gpu_node_groups : 0 + + parent_id = nebius_mk8s_v1_cluster.main.id + name = "${var.name_prefix}-gpu-nodes-${count.index}" + fixed_node_count = var.gpu_nodes_count_per_group + version = var.k8s_version + + labels = { + "node-type" = "gpu" + } + + template = { + boot_disk = { + size_gibibytes = var.gpu_disk_size_gib + type = var.gpu_disk_type + } + + service_account_id = nebius_iam_v1_service_account.k8s_nodes.id + + network_interfaces = [ + { + subnet_id = var.subnet_id + public_ip_address = var.gpu_nodes_assign_public_ip ? {} : null + } + ] + + resources = { + platform = var.gpu_nodes_platform + preset = var.gpu_nodes_preset + } + + # GPU cluster for InfiniBand + gpu_cluster = var.enable_gpu_cluster ? nebius_compute_v1_gpu_cluster.main[0] : null + + # Driverfull images (pre-installed NVIDIA drivers, no GPU Operator driver needed) + gpu_settings = var.gpu_nodes_driverfull_image ? { drivers_preset = var.gpu_drivers_preset } : null + + # Reservation policy for capacity block groups + reservation_policy = length(var.gpu_reservation_ids) > 0 ? { + policy = "STRICT" + reservation_ids = var.gpu_reservation_ids + } : null + + # Preemptible configuration + preemptible = var.gpu_nodes_preemptible ? { + on_preemption = "STOP" + priority = 3 + } : null + + # Taints for GPU nodes + taints = var.enable_gpu_taints ? 
# =============================================================================
# Kubernetes Module Outputs
# =============================================================================

output "cluster_id" {
  description = "Kubernetes cluster ID"
  value       = nebius_mk8s_v1_cluster.main.id
}

output "cluster_name" {
  description = "Kubernetes cluster name"
  value       = nebius_mk8s_v1_cluster.main.name
}

# Public endpoint when exposed; otherwise the private endpoint (empty string
# if the provider has not populated it yet).
output "cluster_endpoint" {
  description = "Kubernetes API endpoint"
  value = var.enable_public_endpoint ? (
    nebius_mk8s_v1_cluster.main.status.control_plane.endpoints.public_endpoint
    ) : (
    try(nebius_mk8s_v1_cluster.main.status.control_plane.endpoints.private_endpoint, "")
  )
}

output "cluster_ca_certificate" {
  description = "Kubernetes cluster CA certificate"
  value       = nebius_mk8s_v1_cluster.main.status.control_plane.auth.cluster_ca_certificate
  sensitive   = true
}

output "service_account_id" {
  description = "Service account ID for node groups"
  value       = nebius_iam_v1_service_account.k8s_nodes.id
}
nebius_compute_v1_gpu_cluster.main[0].id : null +} diff --git a/applications/osmo/deploy/example/001-iac/modules/k8s/templates/cloud-init.yaml b/applications/osmo/deploy/example/001-iac/modules/k8s/templates/cloud-init.yaml new file mode 100755 index 000000000..66ba3fa2a --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/modules/k8s/templates/cloud-init.yaml @@ -0,0 +1,28 @@ +#cloud-config +%{ if ssh_public_key != null && ssh_public_key != "" ~} +users: + - name: ${ssh_user_name} + sudo: ALL=(ALL) NOPASSWD:ALL + shell: /bin/bash + ssh_authorized_keys: + - ${ssh_public_key} +%{ endif ~} + +package_update: true +packages: + - nfs-common + - curl + - jq + +%{ if enable_filestore ~} +runcmd: + # Mount filestore if attached (virtiofs is not a block device, check sysfs tags) + - | + if grep -qs '^data$' /sys/fs/virtiofs/*/tag 2>/dev/null; then + mkdir -p /mnt/data + mount -t virtiofs data /mnt/data || true + if ! grep -qs 'virtiofs.*mnt/data' /etc/fstab; then + echo "data /mnt/data virtiofs defaults 0 0" >> /etc/fstab + fi + fi +%{ endif ~} diff --git a/applications/osmo/deploy/example/001-iac/modules/k8s/variables.tf b/applications/osmo/deploy/example/001-iac/modules/k8s/variables.tf new file mode 100755 index 000000000..9e10cf9d1 --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/modules/k8s/variables.tf @@ -0,0 +1,210 @@ +# ============================================================================= +# Kubernetes Module Variables +# ============================================================================= + +variable "parent_id" { + description = "Nebius project ID" + type = string +} + +variable "tenant_id" { + description = "Nebius tenant ID" + type = string +} + +variable "region" { + description = "Nebius region" + type = string +} + +variable "name_prefix" { + description = "Prefix for resource names" + type = string +} + +# ----------------------------------------------------------------------------- +# Network Configuration +# 
----------------------------------------------------------------------------- + +variable "subnet_id" { + description = "Subnet ID for the cluster" + type = string +} + +# ----------------------------------------------------------------------------- +# Cluster Configuration +# ----------------------------------------------------------------------------- + +variable "k8s_version" { + description = "Kubernetes version" + type = string + default = null +} + +variable "etcd_cluster_size" { + description = "Size of etcd cluster" + type = number + default = 3 +} + +variable "enable_public_endpoint" { + description = "Enable public endpoint for Kubernetes API" + type = bool + default = true +} + +# ----------------------------------------------------------------------------- +# SSH Access +# ----------------------------------------------------------------------------- + +variable "ssh_user_name" { + description = "SSH username for node access" + type = string + default = "ubuntu" +} + +variable "ssh_public_key" { + description = "SSH public key for node access" + type = string +} + +# ----------------------------------------------------------------------------- +# CPU Node Group Configuration +# ----------------------------------------------------------------------------- + +variable "cpu_nodes_count" { + description = "Number of CPU nodes" + type = number + default = 3 +} + +variable "cpu_nodes_platform" { + description = "Platform for CPU nodes" + type = string + default = "cpu-d3" +} + +variable "cpu_nodes_preset" { + description = "Resource preset for CPU nodes" + type = string + default = "16vcpu-64gb" +} + +variable "cpu_disk_type" { + description = "Disk type for CPU nodes" + type = string + default = "NETWORK_SSD" +} + +variable "cpu_disk_size_gib" { + description = "Disk size in GiB for CPU nodes" + type = number + default = 128 +} + +variable "cpu_nodes_assign_public_ip" { + description = "Assign public IPs to CPU nodes" + type = bool + default = true +} + +# 
----------------------------------------------------------------------------- +# GPU Node Group Configuration +# ----------------------------------------------------------------------------- + +variable "gpu_nodes_count_per_group" { + description = "Number of GPU nodes per group" + type = number + default = 1 +} + +variable "gpu_node_groups" { + description = "Number of GPU node groups" + type = number + default = 1 +} + +variable "gpu_nodes_platform" { + description = "Platform for GPU nodes" + type = string +} + +variable "gpu_nodes_preset" { + description = "Resource preset for GPU nodes" + type = string +} + +variable "gpu_disk_type" { + description = "Disk type for GPU nodes" + type = string + default = "NETWORK_SSD" +} + +variable "gpu_disk_size_gib" { + description = "Disk size in GiB for GPU nodes" + type = number + default = 1023 +} + +variable "gpu_nodes_assign_public_ip" { + description = "Assign public IPs to GPU nodes" + type = bool + default = false +} + +variable "enable_gpu_cluster" { + description = "Enable GPU cluster with InfiniBand" + type = bool + default = true +} + +variable "infiniband_fabric" { + description = "InfiniBand fabric name" + type = string +} + +variable "enable_gpu_taints" { + description = "Add NoSchedule taint to GPU nodes" + type = bool + default = true +} + +variable "gpu_nodes_preemptible" { + description = "Use preemptible GPU nodes" + type = bool + default = false +} + +variable "gpu_reservation_ids" { + description = "List of capacity block group IDs for GPU reservations" + type = list(string) + default = [] +} + +variable "gpu_nodes_driverfull_image" { + description = "Use Nebius driverfull images with pre-installed NVIDIA drivers" + type = bool + default = false +} + +variable "gpu_drivers_preset" { + description = "CUDA driver preset for driverfull images (e.g. 
cuda12, cuda12.8, cuda13.0)" + type = string + default = "cuda12" +} + +# ----------------------------------------------------------------------------- +# Filestore Configuration +# ----------------------------------------------------------------------------- + +variable "enable_filestore" { + description = "Enable filestore attachment" + type = bool + default = true +} + +variable "filestore_id" { + description = "Filestore ID to attach" + type = string + default = null +} diff --git a/applications/osmo/deploy/example/001-iac/modules/k8s/versions.tf b/applications/osmo/deploy/example/001-iac/modules/k8s/versions.tf new file mode 100755 index 000000000..4505d171a --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/modules/k8s/versions.tf @@ -0,0 +1,7 @@ +terraform { + required_providers { + nebius = { + source = "terraform-provider.storage.eu-north1.nebius.cloud/nebius/nebius" + } + } +} diff --git a/applications/osmo/deploy/example/001-iac/modules/platform/main.tf b/applications/osmo/deploy/example/001-iac/modules/platform/main.tf new file mode 100755 index 000000000..38af4f957 --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/modules/platform/main.tf @@ -0,0 +1,238 @@ +# ============================================================================= +# Platform Module - VPC, Storage, PostgreSQL, Container Registry +# ============================================================================= + +# ----------------------------------------------------------------------------- +# VPC Network (uses existing default network and subnet from the project) +# Set via nebius-env-init.sh -> TF_VAR_network_id / TF_VAR_subnet_id +# ----------------------------------------------------------------------------- + +# ----------------------------------------------------------------------------- +# Service Account for Storage +# ----------------------------------------------------------------------------- +resource "nebius_iam_v1_service_account" "storage" { + 
parent_id = var.parent_id + name = "${var.name_prefix}-storage-sa" +} + +# Get the "editors" group from tenant (grants storage.editor permissions) +# Reference: nebius-solutions-library/anyscale/deploy/bucket_key.tf +data "nebius_iam_v1_group" "editors" { + name = "editors" + parent_id = var.tenant_id +} + +# Add the storage service account to the editors group +# This grants the service account permissions to write to storage buckets +resource "nebius_iam_v1_group_membership" "storage_editor" { + parent_id = data.nebius_iam_v1_group.editors.id + member_id = nebius_iam_v1_service_account.storage.id +} + +resource "nebius_iam_v2_access_key" "storage" { + parent_id = var.parent_id + name = "${var.name_prefix}-storage-key" + description = "Access key for OSMO storage bucket" + + # Store secret in MysteryBox instead of returning directly + # Reference: nebius-solutions-library/modules/o11y/loki.tf + secret_delivery_mode = "MYSTERY_BOX" + + account = { + service_account = { + id = nebius_iam_v1_service_account.storage.id + } + } + + depends_on = [nebius_iam_v1_group_membership.storage_editor] +} + +# ----------------------------------------------------------------------------- +# MysteryBox - Read storage secret (ephemeral, NOT stored in state) +# Reference: nebius-solutions-library/modules/o11y/mysterybox.tf +# Requires Terraform >= 1.10.0 +# ----------------------------------------------------------------------------- +ephemeral "nebius_mysterybox_v1_secret_payload_entry" "storage_secret" { + secret_id = nebius_iam_v2_access_key.storage.status.secret_reference_id + key = "secret" +} + +# ----------------------------------------------------------------------------- +# Object Storage Bucket +# ----------------------------------------------------------------------------- +resource "nebius_storage_v1_bucket" "main" { + parent_id = var.parent_id + name = var.storage_bucket_name + versioning_policy = "ENABLED" +} + +# 
----------------------------------------------------------------------------- +# Shared Filesystem (Filestore) +# ----------------------------------------------------------------------------- +resource "nebius_compute_v1_filesystem" "shared" { + count = var.enable_filestore ? 1 : 0 + + parent_id = var.parent_id + name = "${var.name_prefix}-filestore" + type = var.filestore_disk_type + size_bytes = var.filestore_size_gib * 1024 * 1024 * 1024 + block_size_bytes = var.filestore_block_size_kib * 1024 + + lifecycle { + ignore_changes = [labels] + } +} + +# ----------------------------------------------------------------------------- +# PostgreSQL Password (generated + stored in MysteryBox) +# ----------------------------------------------------------------------------- +# Password is generated by Terraform and stored in MysteryBox using write-only +# fields, so the password is NOT stored in MysteryBox's Terraform state. +# The random_password IS in state (marked sensitive) — acceptable since state +# is stored encrypted in Nebius Object Storage. +# ----------------------------------------------------------------------------- + +# Generate PostgreSQL password +resource "random_password" "postgresql" { + count = var.enable_managed_postgresql ? 1 : 0 + length = 32 + special = true + # Nebius PostgreSQL forbids % @ : / ; [ ] { } | < > , . ? + override_special = "!#$^&*()-_=+" + + keepers = { + # Password stays stable unless you taint this resource + parent_id = var.parent_id + } +} + +# Store PostgreSQL password in MysteryBox (write-only — not in TF state) +resource "nebius_mysterybox_v1_secret" "postgresql_password" { + count = var.enable_managed_postgresql ? 
1 : 0 + parent_id = var.parent_id + name = "${var.name_prefix}-postgresql-password" + + sensitive = { + secret_version = { + payload = [{ + key = "password" + string_value = random_password.postgresql[0].result + }] + } + version = random_password.postgresql[0].result + } +} + +# Read password back from MysteryBox (ephemeral - NOT stored in state) +ephemeral "nebius_mysterybox_v1_secret_payload_entry" "postgresql_password" { + count = var.enable_managed_postgresql ? 1 : 0 + secret_id = nebius_mysterybox_v1_secret.postgresql_password[0].id + key = "password" +} + +# Local to get the password from MysteryBox +locals { + postgresql_password = ( + var.enable_managed_postgresql + ? ephemeral.nebius_mysterybox_v1_secret_payload_entry.postgresql_password[0].data.string_value + : null + ) +} + +# ----------------------------------------------------------------------------- +# MEK (Master Encryption Key) — generated + stored in MysteryBox +# ----------------------------------------------------------------------------- + +# Generate MEK +resource "random_bytes" "mek_key" { + length = 32 + + keepers = { + parent_id = var.parent_id + } +} + +locals { + mek_jwk = jsonencode({ + kty = "oct" + k = random_bytes.mek_key.base64 + alg = "A256GCM" + use = "enc" + }) + mek_json = jsonencode({ + currentMek = "key1" + meks = { key1 = base64encode(local.mek_jwk) } + }) +} + +# Store MEK in MysteryBox (write-only) +resource "nebius_mysterybox_v1_secret" "mek" { + parent_id = var.parent_id + name = "${var.name_prefix}-mek" + + sensitive = { + secret_version = { + payload = [{ + key = "mek" + string_value = local.mek_json + }] + } + version = random_bytes.mek_key.base64 + } +} + +# ----------------------------------------------------------------------------- +# Managed PostgreSQL (MSP) - Nebius Managed Service for PostgreSQL +# Enabled by default for production-ready database service +# ----------------------------------------------------------------------------- +resource 
"nebius_msp_postgresql_v1alpha1_cluster" "main" { + count = var.enable_managed_postgresql ? 1 : 0 + parent_id = var.parent_id + name = "${var.name_prefix}-postgresql" + network_id = var.network_id + + config = { + version = var.postgresql_version + public_access = var.postgresql_public_access + + template = { + disk = { + size_gibibytes = var.postgresql_disk_size_gib + type = var.postgresql_disk_type + } + resources = { + platform = var.postgresql_platform + preset = var.postgresql_preset + } + hosts = { + count = var.postgresql_host_count + } + } + } + + bootstrap = { + db_name = var.postgresql_database_name + user_name = var.postgresql_username + # NOTE: user_password moved to sensitive block (write-only, not stored in state) + } + + # Write-only field - password is NOT stored in Terraform state (more secure) + # Requires Terraform >= 1.11.0 + sensitive = { + bootstrap = { + user_password = local.postgresql_password + } + } +} + +# ----------------------------------------------------------------------------- +# Container Registry +# Reference: https://docs.nebius.com/terraform-provider/reference/resources/registry_v1_registry +# Registry endpoint format: cr..nebius.cloud/ +# ----------------------------------------------------------------------------- +resource "nebius_registry_v1_registry" "main" { + count = var.enable_container_registry ? 1 : 0 + + parent_id = var.parent_id + name = var.container_registry_name != "" ? 
var.container_registry_name : "${var.name_prefix}-registry" +} diff --git a/applications/osmo/deploy/example/001-iac/modules/platform/outputs.tf b/applications/osmo/deploy/example/001-iac/modules/platform/outputs.tf new file mode 100755 index 000000000..01aea7674 --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/modules/platform/outputs.tf @@ -0,0 +1,140 @@ +# ============================================================================= +# Platform Module Outputs +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Network Outputs +# ----------------------------------------------------------------------------- +output "network_id" { + description = "VPC network ID" + value = var.network_id +} + +output "subnet_id" { + description = "VPC subnet ID" + value = var.subnet_id +} + +# ----------------------------------------------------------------------------- +# Storage Outputs +# ----------------------------------------------------------------------------- +output "storage_bucket_name" { + description = "Storage bucket name" + value = nebius_storage_v1_bucket.main.name +} + +output "storage_endpoint" { + description = "S3-compatible storage endpoint (dynamic from region)" + value = "https://storage.${var.region}.nebius.cloud" +} + +# TOS format endpoint for OSMO workflows +# See: TODO.md Issue #9 - s3:// doesn't work, tos:// does +output "storage_tos_endpoint" { + description = "TOS-format endpoint for OSMO workflow configuration" + value = "tos://storage.${var.region}.nebius.cloud/${nebius_storage_v1_bucket.main.name}" +} + +output "storage_access_key_id" { + description = "Storage access key ID" + value = nebius_iam_v2_access_key.storage.status.aws_access_key_id + sensitive = true +} + +# Storage secret is ephemeral (not stored in state) - retrieve via CLI: +# nebius mysterybox v1 payload get-by-key \ +# --secret-id $(terraform output -raw 
storage_secret_reference_id) \ +# --key secret_access_key --format json | jq -r '.data.string_value' +output "storage_secret_access_key" { + description = "Storage secret access key - use CLI command above to retrieve (ephemeral, not in state)" + value = null # Ephemeral values cannot be output; use MysteryBox CLI + sensitive = true +} + +# MysteryBox secret reference ID (for external secret management tools) +output "storage_secret_reference_id" { + description = "MysteryBox secret reference ID for storage credentials" + value = nebius_iam_v2_access_key.storage.status.secret_reference_id +} + +# ----------------------------------------------------------------------------- +# Filestore Outputs +# ----------------------------------------------------------------------------- +output "filestore_id" { + description = "Filestore ID" + value = var.enable_filestore ? nebius_compute_v1_filesystem.shared[0].id : null +} + +output "filestore_size_bytes" { + description = "Filestore size in bytes" + value = var.enable_filestore ? nebius_compute_v1_filesystem.shared[0].size_bytes : null +} + +# ----------------------------------------------------------------------------- +# PostgreSQL Outputs (Nebius Managed Service) +# ----------------------------------------------------------------------------- +output "enable_managed_postgresql" { + description = "Whether managed PostgreSQL is enabled" + value = var.enable_managed_postgresql +} + +output "postgresql_host" { + description = "PostgreSQL host (null if using in-cluster PostgreSQL)" + value = var.enable_managed_postgresql ? nebius_msp_postgresql_v1alpha1_cluster.main[0].status.connection_endpoints.private_read_write : null +} + +output "postgresql_port" { + description = "PostgreSQL port" + value = 5432 +} + +output "postgresql_database" { + description = "PostgreSQL database name" + value = var.enable_managed_postgresql ? 
nebius_msp_postgresql_v1alpha1_cluster.main[0].bootstrap.db_name : var.postgresql_database_name +} + +output "postgresql_username" { + description = "PostgreSQL username" + value = var.enable_managed_postgresql ? nebius_msp_postgresql_v1alpha1_cluster.main[0].bootstrap.user_name : var.postgresql_username +} + +output "postgresql_password" { + description = "PostgreSQL password (null - always use MysteryBox to retrieve)" + # Note: Password is stored in MysteryBox and cannot be output directly. + # Use the CLI to retrieve: nebius mysterybox v1 payload get-by-key --secret-id --key password + value = null + sensitive = true +} + +output "postgresql_mysterybox_secret_id" { + description = "MysteryBox secret ID for PostgreSQL password" + value = var.enable_managed_postgresql ? nebius_mysterybox_v1_secret.postgresql_password[0].id : null +} + +output "mek_mysterybox_secret_id" { + description = "MysteryBox secret ID for MEK" + value = nebius_mysterybox_v1_secret.mek.id +} + +# ----------------------------------------------------------------------------- +# Container Registry Outputs +# ----------------------------------------------------------------------------- +output "enable_container_registry" { + description = "Whether Container Registry is enabled" + value = var.enable_container_registry +} + +output "container_registry_id" { + description = "Container Registry ID" + value = var.enable_container_registry ? nebius_registry_v1_registry.main[0].id : null +} + +output "container_registry_name" { + description = "Container Registry name" + value = var.enable_container_registry ? nebius_registry_v1_registry.main[0].name : null +} + +output "container_registry_endpoint" { + description = "Container Registry endpoint for docker login/push" + value = var.enable_container_registry ? 
nebius_registry_v1_registry.main[0].status.registry_fqdn : null +} diff --git a/applications/osmo/deploy/example/001-iac/modules/platform/variables.tf b/applications/osmo/deploy/example/001-iac/modules/platform/variables.tf new file mode 100755 index 000000000..a832a9d66 --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/modules/platform/variables.tf @@ -0,0 +1,175 @@ +# ============================================================================= +# Platform Module Variables +# ============================================================================= + +variable "parent_id" { + description = "Nebius project ID" + type = string +} + +variable "tenant_id" { + description = "Nebius tenant ID (required for IAM group membership)" + type = string +} + +variable "region" { + description = "Nebius region" + type = string +} + +variable "name_prefix" { + description = "Prefix for resource names" + type = string +} + +# ----------------------------------------------------------------------------- +# Network Configuration (existing default network and subnet) +# ----------------------------------------------------------------------------- + +variable "network_id" { + description = "Existing VPC network ID (set by nebius-env-init.sh)" + type = string +} + +variable "subnet_id" { + description = "Existing VPC subnet ID (set by nebius-env-init.sh)" + type = string +} + +# ----------------------------------------------------------------------------- +# Storage Configuration +# ----------------------------------------------------------------------------- + +variable "storage_bucket_name" { + description = "Name for storage bucket" + type = string +} + +# ----------------------------------------------------------------------------- +# Filestore Configuration +# ----------------------------------------------------------------------------- + +variable "enable_filestore" { + description = "Enable shared filesystem" + type = bool + default = true +} + +variable 
"filestore_disk_type" { + description = "Filestore disk type" + type = string + default = "NETWORK_SSD" +} + +variable "filestore_size_gib" { + description = "Filestore size in GiB" + type = number + default = 1024 +} + +variable "filestore_block_size_kib" { + description = "Filestore block size in KiB" + type = number + default = 4 +} + +# ----------------------------------------------------------------------------- +# PostgreSQL Configuration +# ----------------------------------------------------------------------------- + +variable "enable_managed_postgresql" { + description = "Enable Nebius Managed PostgreSQL deployment" + type = bool + default = true +} + +variable "postgresql_version" { + description = "PostgreSQL version" + type = number + default = 16 + + validation { + condition = contains([14, 15, 16], var.postgresql_version) + error_message = "PostgreSQL version must be 14, 15, or 16." + } +} + +variable "postgresql_public_access" { + description = "Enable public access to PostgreSQL (for testing only, not recommended for production)" + type = bool + default = false +} + +variable "postgresql_platform" { + description = "PostgreSQL platform (cpu-e2 for managed PostgreSQL in all regions)" + type = string + default = "cpu-e2" + + validation { + condition = contains(["cpu-d3", "cpu-e2"], var.postgresql_platform) + error_message = "PostgreSQL platform must be cpu-e2 (recommended) or cpu-d3." + } +} + +variable "postgresql_preset" { + description = "PostgreSQL resource preset (2vcpu-8gb is minimum)" + type = string + default = "2vcpu-8gb" + + validation { + condition = contains(["2vcpu-8gb", "4vcpu-16gb", "8vcpu-32gb"], var.postgresql_preset) + error_message = "PostgreSQL preset must be 2vcpu-8gb, 4vcpu-16gb, or 8vcpu-32gb." 
+ } +} + +variable "postgresql_disk_type" { + description = "PostgreSQL disk type (network-ssd for managed PostgreSQL in all regions)" + type = string + default = "network-ssd" + + validation { + condition = contains(["nbs-csi-sc", "network-ssd"], var.postgresql_disk_type) + error_message = "PostgreSQL disk type must be network-ssd (recommended) or nbs-csi-sc." + } +} + +variable "postgresql_disk_size_gib" { + description = "PostgreSQL disk size in GiB" + type = number + default = 50 +} + +variable "postgresql_host_count" { + description = "Number of PostgreSQL hosts" + type = number + default = 1 +} + +variable "postgresql_database_name" { + description = "PostgreSQL database name" + type = string + default = "osmo" +} + +variable "postgresql_username" { + description = "PostgreSQL admin username" + type = string + default = "osmo_admin" +} + +# ----------------------------------------------------------------------------- +# Container Registry Configuration +# Reference: https://docs.nebius.com/terraform-provider/reference/resources/registry_v1_registry +# ----------------------------------------------------------------------------- + +variable "enable_container_registry" { + description = "Enable Nebius Container Registry for storing container images" + type = bool + default = true +} + +variable "container_registry_name" { + description = "Custom name for container registry (defaults to -registry)" + type = string + default = "" +} diff --git a/applications/osmo/deploy/example/001-iac/modules/platform/versions.tf b/applications/osmo/deploy/example/001-iac/modules/platform/versions.tf new file mode 100755 index 000000000..d056669b0 --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/modules/platform/versions.tf @@ -0,0 +1,11 @@ +terraform { + required_providers { + nebius = { + source = "terraform-provider.storage.eu-north1.nebius.cloud/nebius/nebius" + } + random = { + source = "hashicorp/random" + version = ">= 3.0" + } + } +} diff --git 
a/applications/osmo/deploy/example/001-iac/modules/wireguard/main.tf b/applications/osmo/deploy/example/001-iac/modules/wireguard/main.tf new file mode 100755 index 000000000..5cb6027db --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/modules/wireguard/main.tf @@ -0,0 +1,70 @@ +# ============================================================================= +# WireGuard VPN Module +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Public IP Allocation +# ----------------------------------------------------------------------------- +resource "nebius_vpc_v1_allocation" "wireguard" { + parent_id = var.parent_id + name = "${var.name_prefix}-wireguard-ip" + + ipv4_public = { + cidr = "/32" + subnet_id = var.subnet_id + } + + lifecycle { + create_before_destroy = true + } +} + +# ----------------------------------------------------------------------------- +# Boot Disk +# ----------------------------------------------------------------------------- +resource "nebius_compute_v1_disk" "wireguard" { + parent_id = var.parent_id + name = "${var.name_prefix}-wireguard-boot" + size_bytes = var.disk_size_gib * 1024 * 1024 * 1024 + block_size_bytes = 4096 + type = "NETWORK_SSD" + source_image_family = { image_family = "ubuntu22.04-driverless" } +} + +# ----------------------------------------------------------------------------- +# WireGuard Instance +# ----------------------------------------------------------------------------- +resource "nebius_compute_v1_instance" "wireguard" { + parent_id = var.parent_id + name = "${var.name_prefix}-wireguard" + + boot_disk = { + attach_mode = "READ_WRITE" + existing_disk = nebius_compute_v1_disk.wireguard + } + + network_interfaces = [ + { + name = "eth0" + subnet_id = var.subnet_id + ip_address = {} + public_ip_address = { + allocation_id = nebius_vpc_v1_allocation.wireguard.id + } + } + ] + + resources = { + 
platform = var.platform + preset = var.preset + } + + cloud_init_user_data = templatefile("${path.module}/templates/cloud-init.yaml", { + ssh_user_name = var.ssh_user_name + ssh_public_key = var.ssh_public_key + wg_port = var.wg_port + wg_network = var.wg_network + vpc_cidr = var.vpc_cidr + ui_port = var.ui_port + }) +} diff --git a/applications/osmo/deploy/example/001-iac/modules/wireguard/outputs.tf b/applications/osmo/deploy/example/001-iac/modules/wireguard/outputs.tf new file mode 100755 index 000000000..93a54f4f5 --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/modules/wireguard/outputs.tf @@ -0,0 +1,28 @@ +# ============================================================================= +# WireGuard Module Outputs +# ============================================================================= + +output "public_ip" { + description = "WireGuard server public IP" + value = nebius_vpc_v1_allocation.wireguard.status.details.allocated_cidr +} + +output "private_ip" { + description = "WireGuard server private IP" + value = nebius_compute_v1_instance.wireguard.status.network_interfaces[0].ip_address.address +} + +output "ui_url" { + description = "WireGuard Web UI URL" + value = "http://${nebius_vpc_v1_allocation.wireguard.status.details.allocated_cidr}:${var.ui_port}" +} + +output "ssh_command" { + description = "SSH command to connect" + value = "ssh ${var.ssh_user_name}@${nebius_vpc_v1_allocation.wireguard.status.details.allocated_cidr}" +} + +output "instance_id" { + description = "WireGuard instance ID" + value = nebius_compute_v1_instance.wireguard.id +} diff --git a/applications/osmo/deploy/example/001-iac/modules/wireguard/templates/cloud-init.yaml b/applications/osmo/deploy/example/001-iac/modules/wireguard/templates/cloud-init.yaml new file mode 100755 index 000000000..a8a3dca80 --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/modules/wireguard/templates/cloud-init.yaml @@ -0,0 +1,122 @@ +#cloud-config +users: + - name: 
${ssh_user_name} + sudo: ALL=(ALL) NOPASSWD:ALL + shell: /bin/bash + ssh_authorized_keys: + - ${ssh_public_key} + +package_update: true +package_upgrade: true +packages: + - wireguard + - wireguard-tools + - ufw + +write_files: + - content: | + [Unit] + Description=Restart WireGuard + After=network.target + [Service] + Type=oneshot + ExecStart=/usr/bin/systemctl restart wg-quick@wg0.service + [Install] + RequiredBy=wgui.path + path: /etc/systemd/system/wgui.service + permissions: "0400" + owner: root:root + - content: | + [Unit] + Description=Watch /etc/wireguard/wg0.conf for changes + [Path] + PathModified=/etc/wireguard/wg0.conf + [Install] + WantedBy=multi-user.target + path: /etc/systemd/system/wgui.path + permissions: "0400" + owner: root:root + - content: | + [Unit] + Description=WireGuard UI Server + After=network.target + Wants=network-online.target systemd-networkd-wait-online.service + + [Service] + ExecStart=/opt/wireguard-ui + Restart=on-abnormal + User=root + Group=root + WorkingDirectory=/var/lib/wireguard-ui + Environment="WGUI_PASSWORD_FILE=/var/lib/wireguard-ui/initial_password" + Environment="WGUI_LOG_LEVEL=DEBUG" + + [Install] + WantedBy=multi-user.target + path: /etc/systemd/system/wgui_server.service + permissions: "0400" + owner: root:root + +runcmd: + # Generate WireGuard keys + - wg genkey | sudo tee /etc/wireguard/private.key + - sudo chmod go= /etc/wireguard/private.key + - sudo cat /etc/wireguard/private.key | wg pubkey | sudo tee /etc/wireguard/public.key + + # Create WireGuard configuration + - export PRIVATE_KEY=$(sudo cat /etc/wireguard/private.key) + - export INTERFACE=$(ip route list default | awk '{for(i=1;i<=NF;i++) if($i=="dev") print $(i+1)}') + - | + sudo tee /etc/wireguard/wg0.conf < + ./05-deploy-osmo-control-plane.sh + ./06-deploy-osmo-backend.sh + + ${var.enable_managed_postgresql ? 
"PostgreSQL Connection (Managed):\n Host: ${module.platform.postgresql_host}\n Port: ${module.platform.postgresql_port}\n Database: ${module.platform.postgresql_database}\n Username: ${module.platform.postgresql_username}" : "PostgreSQL: Using in-cluster PostgreSQL (deployed via Helm in 05-deploy-osmo-control-plane.sh)"} + + Object Storage: + Bucket: ${module.platform.storage_bucket_name} + Endpoint: ${module.platform.storage_endpoint} + + ${var.enable_container_registry ? "Container Registry:\n Name: ${module.platform.container_registry_name}\n Endpoint: ${module.platform.container_registry_endpoint}\n Docker login: docker login ${module.platform.container_registry_endpoint}" : "Container Registry: Disabled (set enable_container_registry = true to enable)"} + + EOT +} diff --git a/applications/osmo/deploy/example/001-iac/terraform.tfvars.cost-optimized-secure.example b/applications/osmo/deploy/example/001-iac/terraform.tfvars.cost-optimized-secure.example new file mode 100755 index 000000000..d4eecdb5a --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/terraform.tfvars.cost-optimized-secure.example @@ -0,0 +1,104 @@ +# ============================================================================= +# OSMO on Nebius - Cost-Optimized & Secure Configuration (RECOMMENDED) +# ============================================================================= +# This configuration balances cost optimization with security best practices. 
+# - Uses L40S GPU (cheapest) with preemptible instances +# - All resources in private network (no public IPs on nodes) +# - WireGuard VPN for secure access +# +# Estimated cost: ~$15-25 per 6-hour session +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Required Settings (set via environment or uncomment) +# ----------------------------------------------------------------------------- +# Run: source ../000-prerequisites/nebius-env-init.sh +# This will set TF_VAR_tenant_id and TF_VAR_parent_id automatically +# +# tenant_id = "your-tenant-id" +# parent_id = "your-project-id" + +# ----------------------------------------------------------------------------- +# Environment Settings +# ----------------------------------------------------------------------------- +region = "eu-north1" +environment = "dev" +project_name = "osmo" + +# ----------------------------------------------------------------------------- +# Kubernetes Cluster (PRIVATE - access via WireGuard) +# ----------------------------------------------------------------------------- +k8s_version = null # Use latest +etcd_cluster_size = 1 # Single node for dev (use 3 for prod) +enable_public_endpoint = false # Private API - access via WireGuard + +# ----------------------------------------------------------------------------- +# CPU Nodes (minimal, private) +# ----------------------------------------------------------------------------- +cpu_nodes_count = 2 +cpu_nodes_platform = "cpu-d3" +cpu_nodes_preset = "4vcpu-16gb" # Smallest viable for K8s +cpu_disk_size_gib = 64 +cpu_nodes_assign_public_ip = false # Private only + +# CPU preset options (cpu-d3): +# 2vcpu-8gb - ~$0.08/hr (may be too small for K8s) +# 4vcpu-16gb - ~$0.11/hr (minimum recommended) +# 8vcpu-32gb - ~$0.22/hr (comfortable for dev) +# 16vcpu-64gb - ~$0.44/hr (production) + +# 
----------------------------------------------------------------------------- +# GPU Nodes (L40S - cheapest, private) +# ----------------------------------------------------------------------------- +gpu_nodes_count_per_group = 1 +gpu_node_groups = 1 +gpu_nodes_platform = "gpu-l40s-a" # L40S Intel (cheapest) +gpu_nodes_preset = "1gpu-8vcpu-32gb" # Single L40S GPU +gpu_disk_size_gib = 256 +gpu_nodes_assign_public_ip = false # Private only +enable_gpu_cluster = false # No InfiniBand for L40S +enable_gpu_taints = true +gpu_nodes_preemptible = false # Set true if your project allows preemptible GPUs +gpu_nodes_driverfull_image = false # Set true for Nebius pre-installed drivers (recommended for B200/B300) + +# GPU options by region (see locals.tf for full list): +# eu-north1: gpu-l40s-a, gpu-l40s-d, gpu-h100-sxm, gpu-h200-sxm +# eu-north2: gpu-h200-sxm +# eu-west1: gpu-h200-sxm +# me-west1: gpu-b200-sxm-a (NVIDIA B200) +# uk-south1: gpu-b300-sxm (NVIDIA B300) +# us-central1: gpu-h200-sxm, gpu-b200-sxm (NVIDIA B200) + +# ----------------------------------------------------------------------------- +# Storage (minimal) +# ----------------------------------------------------------------------------- +enable_filestore = true +filestore_size_gib = 256 + +# ----------------------------------------------------------------------------- +# PostgreSQL (Nebius Managed Service - minimal) +# ----------------------------------------------------------------------------- +# postgresql_platform = null # Auto: cpu-e2 (eu-north1), cpu-d3 (other regions) +postgresql_preset = "2vcpu-8gb" # Minimum preset (cheapest) +# postgresql_disk_type = null # Auto: network-ssd (eu-north1), nbs-csi-sc (other regions) +postgresql_disk_size_gib = 20 +postgresql_host_count = 1 + +# ----------------------------------------------------------------------------- +# WireGuard VPN (ENABLED for secure access) +# ----------------------------------------------------------------------------- +enable_wireguard = 
true +wireguard_platform = "cpu-d3" +wireguard_preset = "2vcpu-8gb" # Smallest for VPN +wireguard_disk_size_gib = 32 +wireguard_port = 51820 +wireguard_network = "10.8.0.0/24" +wireguard_ui_port = 5000 + +# ============================================================================= +# After deployment: +# 1. Set up WireGuard client: cd ../000-prerequisites && ./wireguard-client-setup.sh +# 2. Connect to VPN +# 3. Get kubectl credentials: nebius mk8s cluster get-credentials --id +# 4. Access cluster via private endpoint +# ============================================================================= diff --git a/applications/osmo/deploy/example/001-iac/terraform.tfvars.cost-optimized.example b/applications/osmo/deploy/example/001-iac/terraform.tfvars.cost-optimized.example new file mode 100755 index 000000000..5da602bef --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/terraform.tfvars.cost-optimized.example @@ -0,0 +1,83 @@ +# ============================================================================= +# OSMO on Nebius - Cost-Optimized Configuration +# ============================================================================= +# This configuration minimizes costs for development and testing. 
+# Estimated cost: ~$15-25 per 6-hour session (with L40S + small CPU nodes) +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Required Settings (get from nebius-env-init.sh) +# ----------------------------------------------------------------------------- +# tenant_id = "your-tenant-id" +# parent_id = "your-project-id" + +# ----------------------------------------------------------------------------- +# Environment Settings +# ----------------------------------------------------------------------------- +region = "eu-north1" +environment = "dev" +project_name = "osmo-dev" + +# ----------------------------------------------------------------------------- +# Kubernetes Cluster +# ----------------------------------------------------------------------------- +k8s_version = null # Use latest +etcd_cluster_size = 1 # Single node for dev +enable_public_endpoint = true # Direct API access + +# ----------------------------------------------------------------------------- +# CPU Nodes (minimal for system workloads) +# ----------------------------------------------------------------------------- +cpu_nodes_count = 2 +cpu_nodes_platform = "cpu-d3" +cpu_nodes_preset = "4vcpu-16gb" # Smallest viable for K8s +cpu_disk_size_gib = 64 +cpu_nodes_assign_public_ip = false # Private nodes only + +# CPU preset options (cpu-d3): +# 2vcpu-8gb - ~$0.08/hr (may be too small for K8s) +# 4vcpu-16gb - ~$0.11/hr (minimum recommended) +# 8vcpu-32gb - ~$0.22/hr (comfortable for dev) +# 16vcpu-64gb - ~$0.44/hr (production) + +# ----------------------------------------------------------------------------- +# GPU Nodes (L40S - cheapest option ~$1.55/hr vs H100 ~$4-5/hr) +# ----------------------------------------------------------------------------- +gpu_nodes_count_per_group = 1 +gpu_node_groups = 1 +gpu_nodes_platform = "gpu-l40s-a" # L40S Intel (cheapest) +gpu_nodes_preset = 
"1gpu-8vcpu-32gb" # Single L40S GPU +gpu_disk_size_gib = 256 +gpu_nodes_assign_public_ip = false +enable_gpu_cluster = false # No InfiniBand for L40S +enable_gpu_taints = true +gpu_nodes_preemptible = false # Set true if your project allows preemptible GPUs +gpu_nodes_driverfull_image = false # Set true for Nebius pre-installed drivers (recommended for B200/B300) + +# GPU options by region (see locals.tf for full list): +# eu-north1: gpu-l40s-a, gpu-l40s-d, gpu-h100-sxm, gpu-h200-sxm +# eu-north2: gpu-h200-sxm +# eu-west1: gpu-h200-sxm +# me-west1: gpu-b200-sxm-a (NVIDIA B200) +# uk-south1: gpu-b300-sxm (NVIDIA B300) +# us-central1: gpu-h200-sxm, gpu-b200-sxm (NVIDIA B200) + +# ----------------------------------------------------------------------------- +# Storage (minimal) +# ----------------------------------------------------------------------------- +enable_filestore = true +filestore_size_gib = 256 # Smaller filestore + +# ----------------------------------------------------------------------------- +# PostgreSQL (Nebius Managed Service - minimal) +# ----------------------------------------------------------------------------- +# postgresql_platform = null # Auto: cpu-e2 (eu-north1), cpu-d3 (other regions) +postgresql_preset = "2vcpu-8gb" # Minimum preset (cheapest) +# postgresql_disk_type = null # Auto: network-ssd (eu-north1), nbs-csi-sc (other regions) +postgresql_disk_size_gib = 20 +postgresql_host_count = 1 + +# ----------------------------------------------------------------------------- +# WireGuard VPN (disabled for dev) +# ----------------------------------------------------------------------------- +enable_wireguard = false diff --git a/applications/osmo/deploy/example/001-iac/terraform.tfvars.example b/applications/osmo/deploy/example/001-iac/terraform.tfvars.example new file mode 100755 index 000000000..aa12368d1 --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/terraform.tfvars.example @@ -0,0 +1,73 @@ +# 
============================================================================= +# OSMO on Nebius - Terraform Variables +# ============================================================================= +# Copy this file to terraform.tfvars and customize for your deployment. +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Required Settings +# ----------------------------------------------------------------------------- +# Get these from: source ../000-prerequisites/nebius-env-init.sh + +tenant_id = "your-tenant-id" # From NEBIUS_TENANT_ID +parent_id = "your-project-id" # From NEBIUS_PROJECT_ID + +# ----------------------------------------------------------------------------- +# Environment Settings +# ----------------------------------------------------------------------------- +region = "eu-north1" # eu-north1, eu-north2, eu-west1, me-west1, uk-south1, us-central1 +environment = "dev" # dev, stg, tst, pro (must match validation in variables.tf) +project_name = "osmo" # Used for resource naming + +# ----------------------------------------------------------------------------- +# Kubernetes Cluster +# ----------------------------------------------------------------------------- +k8s_version = null # null for latest +etcd_cluster_size = 3 # 1, 3, or 5 +enable_public_endpoint = true # Set false if using WireGuard + +# ----------------------------------------------------------------------------- +# CPU Node Group +# ----------------------------------------------------------------------------- +cpu_nodes_count = 3 +cpu_nodes_platform = "cpu-d3" +cpu_nodes_preset = "16vcpu-64gb" +cpu_disk_size_gib = 128 +cpu_nodes_assign_public_ip = false # Private nodes only + +# ----------------------------------------------------------------------------- +# GPU Node Group +# ----------------------------------------------------------------------------- +gpu_nodes_count_per_group = 1 +gpu_node_groups = 1
+gpu_nodes_platform = "gpu-h100-sxm" # See locals.tf for all GPU platforms per region +gpu_nodes_preset = "8gpu-128vcpu-1600gb" +gpu_disk_size_gib = 1023 +gpu_nodes_assign_public_ip = false # Private nodes only +enable_gpu_cluster = true # InfiniBand +enable_gpu_taints = true +gpu_nodes_preemptible = false # Preemptible requires project permissions +gpu_nodes_driverfull_image = false # Set true for Nebius pre-installed drivers (recommended for B200/B300) + +# ----------------------------------------------------------------------------- +# Storage +# ----------------------------------------------------------------------------- +enable_filestore = true +filestore_size_gib = 1024 + +# ----------------------------------------------------------------------------- +# PostgreSQL (Nebius Managed Service) +# ----------------------------------------------------------------------------- +# Platform depends on region: cpu-e2 (eu-north1), cpu-d3 (all other regions) +# postgresql_platform = null # Auto: cpu-e2 (eu-north1), cpu-d3 (other regions) +postgresql_preset = "4vcpu-16gb" # Available presets vary by region +# postgresql_disk_type = null # Auto: network-ssd (eu-north1), nbs-csi-sc (other regions) +postgresql_disk_size_gib = 50 +postgresql_host_count = 1 + +# ----------------------------------------------------------------------------- +# WireGuard VPN (Optional) +# ----------------------------------------------------------------------------- +enable_wireguard = false +# wireguard_port = 51820 +# wireguard_ui_port = 5000 diff --git a/applications/osmo/deploy/example/001-iac/terraform.tfvars.production.example b/applications/osmo/deploy/example/001-iac/terraform.tfvars.production.example new file mode 100755 index 000000000..ee61b2716 --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/terraform.tfvars.production.example @@ -0,0 +1,79 @@ +# ============================================================================= +# OSMO on Nebius - Production Configuration +# 
============================================================================= +# This configuration provides full redundancy and performance. +# Estimated cost: ~$1000+ per 6-hour session +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Required Settings (get from nebius-env-init.sh) +# ----------------------------------------------------------------------------- +# tenant_id = "your-tenant-id" +# parent_id = "your-project-id" + +# ----------------------------------------------------------------------------- +# Environment Settings +# ----------------------------------------------------------------------------- +region = "eu-north1" +environment = "pro" +project_name = "osmo-prod" + +# ----------------------------------------------------------------------------- +# Kubernetes Cluster +# ----------------------------------------------------------------------------- +k8s_version = null # Use latest +etcd_cluster_size = 3 # HA etcd +enable_public_endpoint = false # Private endpoint only + +# ----------------------------------------------------------------------------- +# CPU Nodes (production grade) +# ----------------------------------------------------------------------------- +cpu_nodes_count = 3 +cpu_nodes_platform = "cpu-d3" +cpu_nodes_preset = "16vcpu-64gb" +cpu_disk_type = "NETWORK_SSD" +cpu_disk_size_gib = 256 +cpu_nodes_assign_public_ip = false + +# ----------------------------------------------------------------------------- +# GPU Nodes (full 8-GPU nodes with InfiniBand) +# ----------------------------------------------------------------------------- +gpu_nodes_count_per_group = 2 +gpu_node_groups = 2 +gpu_nodes_platform = "gpu-h200-sxm" +gpu_nodes_preset = "8gpu-128vcpu-1600gb" +gpu_disk_type = "NETWORK_SSD" +gpu_disk_size_gib = 1023 +gpu_nodes_assign_public_ip = false +enable_gpu_cluster = true # InfiniBand enabled +infiniband_fabric = 
null # Use region default +enable_gpu_taints = true +gpu_nodes_preemptible = false # Preemptible requires project permissions +gpu_nodes_driverfull_image = true # Nebius pre-installed drivers (recommended for B200/B300) + +# ----------------------------------------------------------------------------- +# Storage (production grade) +# ----------------------------------------------------------------------------- +enable_filestore = true +filestore_disk_type = "NETWORK_SSD" +filestore_size_gib = 4096 # 4 TiB +filestore_block_size_kib = 4 + +# ----------------------------------------------------------------------------- +# PostgreSQL (Nebius Managed Service - HA) +# ----------------------------------------------------------------------------- +# postgresql_platform = null # Auto: cpu-e2 (eu-north1), cpu-d3 (other regions) +postgresql_preset = "4vcpu-16gb" # Production size (available in all regions) +# postgresql_disk_type = null # Auto: network-ssd (eu-north1), nbs-csi-sc (other regions) +postgresql_disk_size_gib = 100 +postgresql_host_count = 3 # HA with replicas + +# ----------------------------------------------------------------------------- +# WireGuard VPN (enabled for secure access) +# ----------------------------------------------------------------------------- +enable_wireguard = true +wireguard_platform = "cpu-d3" +wireguard_preset = "2vcpu-8gb" +wireguard_disk_size_gib = 64 +wireguard_port = 51820 +wireguard_ui_port = 5000 diff --git a/applications/osmo/deploy/example/001-iac/terraform.tfvars.reserved.example b/applications/osmo/deploy/example/001-iac/terraform.tfvars.reserved.example new file mode 100644 index 000000000..1da5a72ee --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/terraform.tfvars.reserved.example @@ -0,0 +1,100 @@ +# ============================================================================= +# OSMO on Nebius - Reserved GPU Configuration +# ============================================================================= +# This 
configuration uses reserved GPU capacity (Capacity Block Groups). +# Reserved instances guarantee availability and are billed at a fixed rate. +# +# Prerequisites: +# - A Capacity Block Group must be created in the Nebius Console +# - Set the capacity block group ID below +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Required Settings (set via environment or uncomment) +# ----------------------------------------------------------------------------- +# Run: source ../000-prerequisites/nebius-env-init.sh +# This will set TF_VAR_tenant_id and TF_VAR_parent_id automatically +# +# tenant_id = "your-tenant-id" +# parent_id = "your-project-id" + +# ----------------------------------------------------------------------------- +# Environment Settings +# ----------------------------------------------------------------------------- +region = "eu-north1" +environment = "pro" +project_name = "osmo" + +# ----------------------------------------------------------------------------- +# Kubernetes Cluster +# ----------------------------------------------------------------------------- +k8s_version = null +etcd_cluster_size = 3 +enable_public_endpoint = false # Private API - access via WireGuard + +# ----------------------------------------------------------------------------- +# CPU Nodes +# ----------------------------------------------------------------------------- +cpu_nodes_count = 3 +cpu_nodes_platform = "cpu-d3" +cpu_nodes_preset = "16vcpu-64gb" +cpu_disk_size_gib = 128 +cpu_nodes_assign_public_ip = false + +# ----------------------------------------------------------------------------- +# GPU Nodes (RESERVED) +# ----------------------------------------------------------------------------- +gpu_nodes_count_per_group = 2 # Must match your reservation size +gpu_node_groups = 1 +gpu_nodes_platform = "gpu-h100-sxm" +gpu_nodes_preset = "8gpu-128vcpu-1600gb" 
+gpu_disk_size_gib = 1023 +gpu_nodes_assign_public_ip = false +enable_gpu_cluster = true # InfiniBand for multi-node training +enable_gpu_taints = true +gpu_nodes_preemptible = false # Reserved nodes are not preemptible +gpu_nodes_driverfull_image = false +infiniband_fabric = null # Use region default + +# RESERVATION: Set your Capacity Block Group ID here +gpu_reservation_ids = ["capacityblockgroup-e00xxxxx"] + +# GPU reservation options by region: +# eu-north1: gpu-h100-sxm (8gpu-128vcpu-1600gb), gpu-h200-sxm (8gpu-128vcpu-1600gb) +# eu-north2: gpu-h200-sxm (8gpu-128vcpu-1600gb) +# eu-west1: gpu-h200-sxm (8gpu-128vcpu-1600gb) +# me-west1: gpu-b200-sxm-a (8gpu-160vcpu-1792gb) +# uk-south1: gpu-b300-sxm (8gpu-192vcpu-2768gb) +# us-central1: gpu-h200-sxm (8gpu-128vcpu-1600gb), gpu-b200-sxm (8gpu-160vcpu-1792gb) + +# ----------------------------------------------------------------------------- +# Storage +# ----------------------------------------------------------------------------- +enable_filestore = true +filestore_size_gib = 1024 + +# ----------------------------------------------------------------------------- +# PostgreSQL (Nebius Managed Service) +# ----------------------------------------------------------------------------- +postgresql_preset = "4vcpu-16gb" +postgresql_disk_size_gib = 50 +postgresql_host_count = 1 + +# ----------------------------------------------------------------------------- +# WireGuard VPN (recommended for private clusters) +# ----------------------------------------------------------------------------- +enable_wireguard = true +wireguard_platform = "cpu-d3" +wireguard_preset = "2vcpu-8gb" +wireguard_disk_size_gib = 32 +wireguard_port = 51820 +wireguard_network = "10.8.0.0/24" +wireguard_ui_port = 5000 + +# ============================================================================= +# After deployment: +# 1. Set up WireGuard client: cd ../000-prerequisites && ./wireguard-client-setup.sh +# 2. Connect to VPN +# 3. 
Get kubectl credentials: nebius mk8s cluster get-credentials --id +# 4. Access cluster via private endpoint +# ============================================================================= diff --git a/applications/osmo/deploy/example/001-iac/terraform.tfvars.secure.example b/applications/osmo/deploy/example/001-iac/terraform.tfvars.secure.example new file mode 100755 index 000000000..00d9eaf29 --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/terraform.tfvars.secure.example @@ -0,0 +1,82 @@ +# ============================================================================= +# OSMO on Nebius - Secure Configuration with WireGuard +# ============================================================================= +# This configuration prioritizes security with private-only access. +# All cluster access goes through WireGuard VPN. +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Required Settings (get from nebius-env-init.sh) +# ----------------------------------------------------------------------------- +# tenant_id = "your-tenant-id" +# parent_id = "your-project-id" + +# ----------------------------------------------------------------------------- +# Environment Settings +# ----------------------------------------------------------------------------- +region = "eu-north1" +environment = "stg" +project_name = "osmo-secure" + +# ----------------------------------------------------------------------------- +# Kubernetes Cluster (PRIVATE ONLY) +# ----------------------------------------------------------------------------- +k8s_version = null +etcd_cluster_size = 3 +enable_public_endpoint = false # No public K8s API + +# ----------------------------------------------------------------------------- +# CPU Nodes (NO public IPs) +# ----------------------------------------------------------------------------- +cpu_nodes_count = 3
+cpu_nodes_platform = "cpu-d3" +cpu_nodes_preset = "16vcpu-64gb" +cpu_disk_size_gib = 128 +cpu_nodes_assign_public_ip = false # Private only + +# ----------------------------------------------------------------------------- +# GPU Nodes (NO public IPs) +# ----------------------------------------------------------------------------- +gpu_nodes_count_per_group = 1 +gpu_node_groups = 1 +gpu_nodes_platform = "gpu-h100-sxm" +gpu_nodes_preset = "8gpu-128vcpu-1600gb" +gpu_disk_size_gib = 1023 +gpu_nodes_assign_public_ip = false # Private only +enable_gpu_cluster = true +enable_gpu_taints = true +gpu_nodes_preemptible = false # Preemptible requires project permissions +gpu_nodes_driverfull_image = false # Set true for Nebius pre-installed drivers (recommended for B200/B300) + +# ----------------------------------------------------------------------------- +# Storage +# ----------------------------------------------------------------------------- +enable_filestore = true +filestore_size_gib = 1024 + +# ----------------------------------------------------------------------------- +# PostgreSQL (Nebius Managed Service) +# ----------------------------------------------------------------------------- +# postgresql_platform = null # Auto: cpu-e2 (eu-north1), cpu-d3 (other regions) +postgresql_preset = "2vcpu-8gb" # Minimum preset +# postgresql_disk_type = null # Auto: network-ssd (eu-north1), nbs-csi-sc (other regions) +postgresql_disk_size_gib = 50 +postgresql_host_count = 1 + +# ----------------------------------------------------------------------------- +# WireGuard VPN (REQUIRED for this config) +# ----------------------------------------------------------------------------- +enable_wireguard = true +wireguard_platform = "cpu-d3" +wireguard_preset = "2vcpu-8gb" +wireguard_disk_size_gib = 64 +wireguard_port = 51820 +wireguard_network = "10.8.0.0/24" +wireguard_ui_port = 5000 + +# ============================================================================= +# After 
deployment: +# 1. Set up WireGuard client: ./000-prerequisites/wireguard-client-setup.sh +# 2. Connect to VPN +# 3. Access cluster via private endpoint +# ============================================================================= diff --git a/applications/osmo/deploy/example/001-iac/variables.tf b/applications/osmo/deploy/example/001-iac/variables.tf new file mode 100755 index 000000000..cd251d6cb --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/variables.tf @@ -0,0 +1,413 @@ +# ============================================================================= +# Global Configuration +# ============================================================================= + +variable "tenant_id" { + description = "Nebius tenant ID" + type = string +} + +variable "parent_id" { + description = "Nebius project ID" + type = string +} + +variable "region" { + description = "Nebius region for deployment" + type = string + default = "eu-north1" + + validation { + condition = contains(["eu-north1", "eu-north2", "eu-west1", "me-west1", "uk-south1", "us-central1"], var.region) + error_message = "Region must be one of: eu-north1, eu-north2, eu-west1, me-west1, uk-south1, us-central1" + } +} + +variable "environment" { + description = "Environment name (dev, stg, tst, pro)" + type = string + default = "dev" + + validation { + condition = contains(["dev", "stg", "tst", "pro"], var.environment) + error_message = "Environment must be one of: dev, stg, tst, pro" + } +} + +variable "project_name" { + description = "Project name used for resource naming" + type = string + default = "osmo" +} + +# ============================================================================= +# Network Configuration (existing default network and subnet) +# Set automatically by nebius-env-init.sh via TF_VAR_network_id / TF_VAR_subnet_id +# ============================================================================= + +variable "network_id" { + description = "Existing VPC network ID (set by 
nebius-env-init.sh)" + type = string +} + +variable "subnet_id" { + description = "Existing VPC subnet ID (set by nebius-env-init.sh)" + type = string +} + +# ============================================================================= +# SSH Access +# ============================================================================= + +variable "ssh_user_name" { + description = "SSH username for node access" + type = string + default = "ubuntu" +} + +variable "ssh_public_key" { + description = "SSH public key for node access" + type = object({ + key = optional(string) + path = optional(string, "~/.ssh/id_rsa.pub") + }) + default = {} +} + +# ============================================================================= +# Kubernetes Cluster Configuration +# ============================================================================= + +variable "k8s_version" { + description = "Kubernetes version (null for latest)" + type = string + default = null +} + +variable "etcd_cluster_size" { + description = "Size of etcd cluster (1, 3, or 5)" + type = number + default = 3 + + validation { + condition = contains([1, 3, 5], var.etcd_cluster_size) + error_message = "etcd cluster size must be 1, 3, or 5" + } +} + +variable "enable_public_endpoint" { + description = "Enable public endpoint for Kubernetes API" + type = bool + default = true +} + +# ============================================================================= +# CPU Node Group Configuration +# ============================================================================= + +variable "cpu_nodes_count" { + description = "Number of CPU nodes" + type = number + default = 3 + + validation { + condition = var.cpu_nodes_count >= 1 && var.cpu_nodes_count <= 100 + error_message = "CPU nodes count must be between 1 and 100" + } +} + +variable "cpu_nodes_platform" { + description = "Platform for CPU nodes" + type = string + default = "cpu-d3" +} + +variable "cpu_nodes_preset" { + description = "Resource preset for CPU nodes" + 
type = string + default = "16vcpu-64gb" +} + +variable "cpu_disk_type" { + description = "Disk type for CPU nodes" + type = string + default = "NETWORK_SSD" +} + +variable "cpu_disk_size_gib" { + description = "Disk size in GiB for CPU nodes" + type = number + default = 128 +} + +variable "cpu_nodes_assign_public_ip" { + description = "Assign public IPs to CPU nodes" + type = bool + default = false # Private by default for security +} + +# ============================================================================= +# GPU Node Group Configuration +# ============================================================================= + +variable "gpu_nodes_count_per_group" { + description = "Number of GPU nodes per group" + type = number + default = 1 + + validation { + condition = var.gpu_nodes_count_per_group >= 0 && var.gpu_nodes_count_per_group <= 32 + error_message = "GPU nodes per group must be between 0 and 32" + } +} + +variable "gpu_node_groups" { + description = "Number of GPU node groups" + type = number + default = 1 +} + +variable "gpu_nodes_platform" { + description = "Platform for GPU nodes" + type = string + default = null +} + +variable "gpu_nodes_preset" { + description = "Resource preset for GPU nodes" + type = string + default = null +} + +variable "gpu_disk_type" { + description = "Disk type for GPU nodes" + type = string + default = "NETWORK_SSD" +} + +variable "gpu_disk_size_gib" { + description = "Disk size in GiB for GPU nodes" + type = number + default = 1023 +} + +variable "gpu_nodes_assign_public_ip" { + description = "Assign public IPs to GPU nodes" + type = bool + default = false +} + +variable "enable_gpu_cluster" { + description = "Enable GPU cluster with InfiniBand" + type = bool + default = true +} + +variable "infiniband_fabric" { + description = "InfiniBand fabric name (null for region default)" + type = string + default = null +} + +variable "enable_gpu_taints" { + description = "Add NoSchedule taint to GPU nodes" + type = bool + 
default = true +} + +variable "gpu_nodes_preemptible" { + description = "Use preemptible GPU nodes (up to 70% cost savings)" + type = bool + default = false +} + +variable "gpu_reservation_ids" { + description = "List of capacity block group IDs for GPU reservations (e.g. [\"capacityblockgroup-e00xxxxx\"]). When set, reservation_policy is STRICT." + type = list(string) + default = [] +} + +variable "gpu_nodes_driverfull_image" { + description = "Use Nebius driverfull images (pre-installed NVIDIA drivers). When true, GPU Operator driver installation is not needed." + type = bool + default = false +} + +# ============================================================================= +# Filestore Configuration +# ============================================================================= + +variable "enable_filestore" { + description = "Enable shared filesystem" + type = bool + default = true +} + +variable "filestore_disk_type" { + description = "Filestore disk type" + type = string + default = "NETWORK_SSD" +} + +variable "filestore_size_gib" { + description = "Filestore size in GiB" + type = number + default = 1024 +} + +variable "filestore_block_size_kib" { + description = "Filestore block size in KiB" + type = number + default = 4 +} + +# ============================================================================= +# Object Storage Configuration +# ============================================================================= + +variable "storage_bucket_name" { + description = "Name for the storage bucket (must be globally unique)" + type = string + default = "" +} + +# ============================================================================= +# PostgreSQL Configuration +# Region-specific defaults (auto-selected when set to null): +# eu-north1: platform=cpu-e2, disk=network-ssd +# All other regions: platform=cpu-d3, disk=network-ssd +# Safe preset across all regions: 2vcpu-8gb or 4vcpu-16gb +# 
============================================================================= + +variable "enable_managed_postgresql" { + description = "Enable Nebius Managed PostgreSQL deployment" + type = bool + default = true +} + +variable "postgresql_version" { + description = "PostgreSQL version (14, 15, or 16)" + type = number + default = 16 + + validation { + condition = contains([14, 15, 16], var.postgresql_version) + error_message = "PostgreSQL version must be 14, 15, or 16." + } +} + +variable "postgresql_public_access" { + description = "Enable public access to PostgreSQL (for testing only, not recommended for production)" + type = bool + default = false +} + +variable "postgresql_platform" { + description = "PostgreSQL platform (null for region default: cpu-e2 in eu-north1, cpu-d3 elsewhere)" + type = string + default = null +} + +variable "postgresql_preset" { + description = "PostgreSQL resource preset (2vcpu-8gb is minimum)" + type = string + default = "4vcpu-16gb" +} + +variable "postgresql_disk_type" { + description = "PostgreSQL disk type (null for region default: network-ssd in eu-north1, nbs-csi-sc elsewhere)" + type = string + default = null +} + +variable "postgresql_disk_size_gib" { + description = "PostgreSQL disk size in GiB" + type = number + default = 50 +} + +variable "postgresql_host_count" { + description = "Number of PostgreSQL hosts" + type = number + default = 1 + + validation { + condition = var.postgresql_host_count >= 1 && var.postgresql_host_count <= 3 + error_message = "PostgreSQL host count must be between 1 and 3" + } +} + +variable "postgresql_database_name" { + description = "PostgreSQL database name" + type = string + default = "osmo" +} + +variable "postgresql_username" { + description = "PostgreSQL admin username" + type = string + default = "osmo_admin" +} + +# ============================================================================= +# Container Registry Configuration +# Reference: 
https://docs.nebius.com/terraform-provider/reference/resources/registry_v1_registry +# ============================================================================= + +variable "enable_container_registry" { + description = "Enable Nebius Container Registry for storing container images" + type = bool + default = true +} + +variable "container_registry_name" { + description = "Custom name for the container registry (defaults to --registry)" + type = string + default = "" +} + +# ============================================================================= +# WireGuard VPN Configuration +# ============================================================================= + +variable "enable_wireguard" { + description = "Enable WireGuard VPN for private access" + type = bool + default = false +} + +variable "wireguard_platform" { + description = "Platform for WireGuard instance (cpu-d3 available in all regions, cpu-e2 only in eu-north1)" + type = string + default = "cpu-d3" +} + +variable "wireguard_preset" { + description = "Resource preset for WireGuard instance" + type = string + default = "2vcpu-8gb" +} + +variable "wireguard_disk_size_gib" { + description = "Disk size for WireGuard instance" + type = number + default = 64 +} + +variable "wireguard_port" { + description = "WireGuard UDP port" + type = number + default = 51820 +} + +variable "wireguard_network" { + description = "WireGuard VPN network CIDR" + type = string + default = "10.8.0.0/24" +} + +variable "wireguard_ui_port" { + description = "WireGuard Web UI port" + type = number + default = 5000 +} diff --git a/applications/osmo/deploy/example/001-iac/versions.tf b/applications/osmo/deploy/example/001-iac/versions.tf new file mode 100755 index 000000000..6042f66dd --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/versions.tf @@ -0,0 +1,25 @@ +terraform { + # Requires >= 1.10.0 for ephemeral resources (MysteryBox integration) + # Requires >= 1.11.0 for write-only sensitive fields (PostgreSQL 
password) + required_version = ">= 1.11.0" + + required_providers { + nebius = { + source = "terraform-provider.storage.eu-north1.nebius.cloud/nebius/nebius" + } + random = { + source = "hashicorp/random" + version = ">= 3.0" + } + units = { + source = "dstaroff/units" + version = ">= 1.1.1" + } + } +} + +provider "nebius" { + domain = "api.eu.nebius.cloud:443" +} + +provider "random" {} diff --git a/applications/osmo/deploy/example/002-setup/01-deploy-gpu-infrastructure.sh b/applications/osmo/deploy/example/002-setup/01-deploy-gpu-infrastructure.sh new file mode 100755 index 000000000..b257ddb40 --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/01-deploy-gpu-infrastructure.sh @@ -0,0 +1,145 @@ +#!/bin/bash +# +# Deploy GPU Infrastructure (GPU Operator, Network Operator, KAI Scheduler) +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +echo "" +echo "========================================" +echo " GPU Infrastructure Deployment" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 +check_helm || exit 1 + +# Add Helm repos +log_info "Adding Helm repositories..." 
+helm repo add nvidia https://helm.ngc.nvidia.com/nvidia --force-update +helm repo update + +# Auto-detect driverfull images from Terraform config +if [[ -z "${USE_DRIVERFULL_IMAGES:-}" ]]; then + TF_DRIVERFULL=$(get_tf_output "gpu_nodes_driverfull_image" "../001-iac" || echo "") + if [[ "$TF_DRIVERFULL" == "true" ]]; then + USE_DRIVERFULL_IMAGES="true" + log_info "Auto-detected driverfull images from Terraform" + fi +fi + +# ----------------------------------------------------------------------------- +# Deploy GPU Operator (skipped when using driverfull images) +# ----------------------------------------------------------------------------- +if [[ "${USE_DRIVERFULL_IMAGES:-false}" == "true" ]]; then + log_info "Skipping GPU Operator (using Nebius driverfull images with pre-installed drivers)" + log_info "Installing NVIDIA device plugin for driverfull mode..." + + kubectl create namespace "${GPU_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + + # With driverfull images, we still need the GPU Operator for toolkit, device-plugin, + # dcgm, etc. - but driver installation is disabled. + helm upgrade --install gpu-operator nvidia/gpu-operator \ + --namespace "${GPU_OPERATOR_NAMESPACE}" \ + --values "${VALUES_DIR}/gpu-operator.yaml" \ + --set driver.enabled=false \ + --timeout 10m + + log_success "GPU Operator deployed (driver disabled - using driverfull images)" +else + log_info "Deploying NVIDIA GPU Operator (with driver installation)..." 
+
+  kubectl create namespace "${GPU_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -
+
+  DRIVER_VERSION_SET=()
+  if [[ -n "${GPU_DRIVER_VERSION:-}" ]]; then
+    log_info "Using pinned driver version: ${GPU_DRIVER_VERSION}"
+    DRIVER_VERSION_SET=(--set "driver.version=${GPU_DRIVER_VERSION}")
+  fi
+
+  # Quoted array: expands to zero words when empty, one word per element otherwise.
+  helm upgrade --install gpu-operator nvidia/gpu-operator \
+    --namespace "${GPU_OPERATOR_NAMESPACE}" \
+    --values "${VALUES_DIR}/gpu-operator.yaml" \
+    "${DRIVER_VERSION_SET[@]}" \
+    --timeout 10m
+
+  log_success "GPU Operator deployed (pods will become ready when GPU nodes are available)"
+fi
+
+# Brief wait for core operator pod only (not GPU node components)
+sleep 10
+kubectl get pods -n "${GPU_OPERATOR_NAMESPACE}" --no-headers 2>/dev/null | head -5 || true
+
+# -----------------------------------------------------------------------------
+# Deploy Network Operator (for InfiniBand) - OPTIONAL
+# -----------------------------------------------------------------------------
+if [[ "${ENABLE_NETWORK_OPERATOR:-false}" == "true" ]]; then
+  log_info "Deploying NVIDIA Network Operator (InfiniBand support)..."
+ + kubectl create namespace "${NETWORK_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + + helm upgrade --install network-operator nvidia/network-operator \ + --namespace "${NETWORK_OPERATOR_NAMESPACE}" \ + --values "${VALUES_DIR}/network-operator.yaml" \ + --timeout 10m + + log_success "Network Operator deployed" + + # Brief wait and show status + sleep 5 + kubectl get pods -n "${NETWORK_OPERATOR_NAMESPACE}" --no-headers 2>/dev/null | head -5 || true +else + log_info "Skipping Network Operator (set ENABLE_NETWORK_OPERATOR=true to install)" +fi + +# ----------------------------------------------------------------------------- +# Deploy KAI Scheduler (from NVIDIA OCI registry) +# https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/dependencies/dependencies.html +# ----------------------------------------------------------------------------- +log_info "Deploying KAI Scheduler..." + +kubectl create namespace "${KAI_SCHEDULER_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + +# Install directly from OCI registry +KAI_VERSION="${KAI_SCHEDULER_VERSION:-0.4.0}" +helm upgrade --install kai-scheduler \ + oci://ghcr.io/nvidia/kai-scheduler/kai-scheduler \ + --version "${KAI_VERSION}" \ + --namespace "${KAI_SCHEDULER_NAMESPACE}" \ + --values "${VALUES_DIR}/kai-scheduler.yaml" \ + --timeout 5m + +log_success "KAI Scheduler deployed" + +# Brief wait and show status +sleep 5 +kubectl get pods -n "${KAI_SCHEDULER_NAMESPACE}" --no-headers 2>/dev/null | head -5 || true + +# ----------------------------------------------------------------------------- +# Verify Installation +# ----------------------------------------------------------------------------- +echo "" +log_info "Verifying GPU infrastructure..." 
+ +# Check GPU nodes +GPU_NODES=$(kubectl get nodes -l node-type=gpu -o name 2>/dev/null | wc -l) +if [[ $GPU_NODES -gt 0 ]]; then + log_success "Found $GPU_NODES GPU node(s)" + kubectl get nodes -l node-type=gpu -o wide +else + log_warning "No GPU nodes found yet (they may still be provisioning)" +fi + +echo "" +echo "========================================" +log_success "GPU Infrastructure deployment complete!" +echo "========================================" +echo "" +echo "Next step: ./02-deploy-observability.sh" +echo "" diff --git a/applications/osmo/deploy/example/002-setup/02-deploy-observability.sh b/applications/osmo/deploy/example/002-setup/02-deploy-observability.sh new file mode 100755 index 000000000..c77761c90 --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/02-deploy-observability.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# +# Deploy Observability Stack (Prometheus, Grafana, Loki) +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +echo "" +echo "========================================" +echo " Observability Stack Deployment" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 +check_helm || exit 1 + +# Add Helm repos +log_info "Adding Helm repositories..." 
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts --force-update +helm repo add grafana https://grafana.github.io/helm-charts --force-update +helm repo update + +# Create namespace +kubectl create namespace "${MONITORING_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + +# Generate Grafana password if not set +if [[ -z "$GRAFANA_ADMIN_PASSWORD" ]]; then + GRAFANA_ADMIN_PASSWORD=$(openssl rand -base64 16) + log_info "Generated Grafana admin password" +fi + +# ----------------------------------------------------------------------------- +# Deploy Prometheus +# ----------------------------------------------------------------------------- +log_info "Deploying Prometheus..." + +helm upgrade --install prometheus prometheus-community/kube-prometheus-stack \ + --namespace "${MONITORING_NAMESPACE}" \ + --values "${VALUES_DIR}/prometheus.yaml" \ + --set grafana.adminPassword="${GRAFANA_ADMIN_PASSWORD}" \ + --wait --timeout 10m + +log_success "Prometheus stack deployed" + +# ----------------------------------------------------------------------------- +# Deploy Loki +# ----------------------------------------------------------------------------- +log_info "Deploying Loki..." + +helm upgrade --install loki grafana/loki-stack \ + --namespace "${MONITORING_NAMESPACE}" \ + --values "${VALUES_DIR}/loki.yaml" \ + --wait --timeout 10m + +log_success "Loki deployed" + +# ----------------------------------------------------------------------------- +# Deploy Promtail +# ----------------------------------------------------------------------------- +log_info "Deploying Promtail..." 
+ +helm upgrade --install promtail grafana/promtail \ + --namespace "${MONITORING_NAMESPACE}" \ + --values "${VALUES_DIR}/promtail.yaml" \ + --wait --timeout 5m + +log_success "Promtail deployed" + +# ----------------------------------------------------------------------------- +# Configure Grafana Datasources +# ----------------------------------------------------------------------------- +log_info "Configuring Grafana datasources..." + +# Loki datasource is auto-configured via values + +# Wait for Grafana +wait_for_pods "${MONITORING_NAMESPACE}" "app.kubernetes.io/name=grafana" 180 + +# ----------------------------------------------------------------------------- +# Output Access Information +# ----------------------------------------------------------------------------- +echo "" +echo "========================================" +log_success "Observability stack deployment complete!" +echo "========================================" +echo "" +echo "Access Grafana:" +echo " kubectl port-forward -n ${MONITORING_NAMESPACE} svc/prometheus-grafana 3000:80" +echo " URL: http://localhost:3000" +echo " Username: admin" +echo " Password: ${GRAFANA_ADMIN_PASSWORD}" +echo "" +echo "Access Prometheus:" +echo " kubectl port-forward -n ${MONITORING_NAMESPACE} svc/prometheus-kube-prometheus-prometheus 9090:9090" +echo " URL: http://localhost:9090" +echo "" +echo "Next step: ./03-deploy-nginx-ingress.sh" +echo "" diff --git a/applications/osmo/deploy/example/002-setup/03-deploy-nginx-ingress.sh b/applications/osmo/deploy/example/002-setup/03-deploy-nginx-ingress.sh new file mode 100755 index 000000000..87cfd5fde --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/03-deploy-nginx-ingress.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# +# Deploy NGINX Ingress Controller (community) +# Provides path-based routing for all OSMO services (API, router, Web UI). 
+# +# This installs the same controller OSMO uses elsewhere: +# - OSMO quick-start chart (Chart.yaml) depends on ingress-nginx from the same Helm repo. +# - OSMO Kind runner (run/start_service_kind.py) installs ingress-nginx the same way. +# We do not use the quick-start umbrella chart here (Nebius uses managed DB, etc.), +# so we install the controller explicitly. Not a duplicate of OSMO—same upstream chart. +# +# Run before 05-deploy-osmo-control-plane.sh. +# See: https://kubernetes.github.io/ingress-nginx/deploy/ + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +INGRESS_NAMESPACE="${INGRESS_NAMESPACE:-ingress-nginx}" +INGRESS_RELEASE_NAME="${INGRESS_RELEASE_NAME:-ingress-nginx}" + +echo "" +echo "========================================" +echo " NGINX Ingress Controller Deployment" +echo "========================================" +echo "" + +check_kubectl || exit 1 +check_helm || exit 1 + +# ----------------------------------------------------------------------------- +# Add Helm repo +# ----------------------------------------------------------------------------- +log_info "Adding ingress-nginx Helm repository..." +helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx --force-update +helm repo update + +# ----------------------------------------------------------------------------- +# Create namespace and install +# ----------------------------------------------------------------------------- +log_info "Creating namespace ${INGRESS_NAMESPACE}..." +kubectl create namespace "${INGRESS_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + +log_info "Installing NGINX Ingress Controller..." +# --set controller.progressDeadlineSeconds=600: chart v4.14+ defaults to 0 which +# K8s 1.32+ rejects ("must be greater than minReadySeconds"). 
Without this fix the +# Deployment is invalid, the controller never starts, and the admission webhook +# blocks all Ingress resource creation in downstream scripts. +helm upgrade --install "${INGRESS_RELEASE_NAME}" ingress-nginx/ingress-nginx \ + --namespace "${INGRESS_NAMESPACE}" \ + --set controller.service.type=LoadBalancer \ + --set controller.progressDeadlineSeconds=600 \ + --wait --timeout 5m || { + log_warning "Helm install returned non-zero; controller may still be starting." +} + +log_success "NGINX Ingress Controller deployed" + +# ----------------------------------------------------------------------------- +# Wait for LoadBalancer IP (optional; may take 1–2 min on cloud) +# ----------------------------------------------------------------------------- +log_info "Waiting for LoadBalancer IP (up to 120s)..." +for i in $(seq 1 24); do + LB_IP=$(kubectl get svc -n "${INGRESS_NAMESPACE}" -l app.kubernetes.io/name=ingress-nginx -o jsonpath='{.items[0].status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) + if [[ -n "$LB_IP" ]]; then + log_success "LoadBalancer IP: ${LB_IP}" + echo "" + echo "OSMO will be accessible at:" + echo " http://${LB_IP}" + echo "" + echo "This URL is auto-detected by 05-deploy-osmo-control-plane.sh." + echo "" + break + fi + sleep 5 +done +if [[ -z "${LB_IP:-}" ]]; then + log_warning "LoadBalancer IP not yet assigned. 
Check: kubectl get svc -n ${INGRESS_NAMESPACE}"
+fi
+
+echo "========================================"
+log_success "NGINX Ingress deployment complete"
+echo "========================================"
+echo ""
+echo "Next: run 04-enable-tls.sh (optional, recommended)"
+echo "      then 05-deploy-osmo-control-plane.sh"
+echo ""
diff --git a/applications/osmo/deploy/example/002-setup/04-enable-tls.sh b/applications/osmo/deploy/example/002-setup/04-enable-tls.sh
new file mode 100755
index 000000000..b6b25fb47
--- /dev/null
+++ b/applications/osmo/deploy/example/002-setup/04-enable-tls.sh
@@ -0,0 +1,741 @@
+#!/bin/bash
+#
+# Enable TLS/HTTPS for OSMO using Let's Encrypt
+#
+# Supports two certificate methods:
+#   1) cert-manager (default) — automated HTTP-01 challenges via in-cluster cert-manager
+#   2) certbot — interactive manual DNS-01 challenges via local certbot binary
+#
+# Set OSMO_TLS_MODE=certbot or OSMO_TLS_MODE=cert-manager to skip the prompt.
+#
+# Can be run at two points in the deployment flow:
+#
+#   A) Right after 03-deploy-nginx-ingress.sh (RECOMMENDED):
+#      Issues the TLS certificate early. When 05-deploy-osmo-control-plane.sh
+#      runs later, it auto-detects the certificate and creates TLS-enabled Ingress.
+#
+#   B) After 05-deploy-osmo-control-plane.sh (retrofit existing deployment):
+#      Does everything in (A) plus patches existing OSMO Ingress resources
+#      and updates service_base_url to HTTPS.
+#
+# Prerequisites:
+#   1. NGINX Ingress Controller deployed (03-deploy-nginx-ingress.sh)
+#   2. 
A DNS record pointing your domain to the LoadBalancer IP +# (A record for cert-manager/HTTP-01; TXT record for certbot/DNS-01) +# +# Usage: +# ./04-enable-tls.sh [hostname] +# +# Optional environment variables: +# OSMO_TLS_MODE - "cert-manager" or "certbot" (skips prompt) +# OSMO_TLS_EMAIL - Email for Let's Encrypt (default: noreply@) +# OSMO_TLS_SECRET_NAME - K8s Secret name for certificate (default: osmo-tls) +# LETSENCRYPT_EMAIL - Alias for OSMO_TLS_EMAIL (certbot path) +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +MAIN_HOSTNAME="${1:-${OSMO_INGRESS_HOSTNAME:-}}" +MAIN_HOSTNAME="${MAIN_HOSTNAME%.}" # Strip trailing dot (FQDN notation) +TLS_SECRET="${OSMO_TLS_SECRET_NAME:-osmo-tls}" +OSMO_NS="${OSMO_NAMESPACE:-osmo}" +INGRESS_NS="${INGRESS_NAMESPACE:-ingress-nginx}" +CERT_DIR="${OSMO_TLS_CERT_DIR:-$HOME/.osmo-certs}" + +echo "" +echo "========================================" +echo " Enable TLS/HTTPS" +echo "========================================" +echo "" + +# ----------------------------------------------------------------------------- +# Validate inputs +# ----------------------------------------------------------------------------- +if [[ -z "$MAIN_HOSTNAME" ]]; then + log_error "Hostname is required." + echo "" + echo "Usage: $0 " + echo " or: export OSMO_INGRESS_HOSTNAME=osmo.example.com" + echo "" + LB_IP=$(kubectl get svc -n "${INGRESS_NS}" ingress-nginx-controller \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) + if [[ -n "$LB_IP" ]]; then + echo "Your LoadBalancer IP is: ${LB_IP}" + echo "Create a DNS A record pointing your domain to this IP, then re-run this script." 
+ fi + exit 1 +fi + +check_kubectl || exit 1 + +log_info "Hostname: ${MAIN_HOSTNAME}" +log_info "TLS secret: ${TLS_SECRET}" + +# Keycloak auth subdomain support +KC_TLS_SECRET="${KEYCLOAK_TLS_SECRET_NAME:-osmo-tls-auth}" +AUTH_HOSTNAME="" +if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then + if [[ -n "${KEYCLOAK_HOSTNAME:-}" ]]; then + AUTH_HOSTNAME="${KEYCLOAK_HOSTNAME}" + else + AUTH_HOSTNAME="auth.${MAIN_HOSTNAME}" + fi + log_info "Keycloak auth hostname: ${AUTH_HOSTNAME}" + log_info "Keycloak TLS secret: ${KC_TLS_SECRET}" +fi + +# Get LoadBalancer IP +LB_IP=$(kubectl get svc -n "${INGRESS_NS}" ingress-nginx-controller \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) + +# ----------------------------------------------------------------------------- +# Select TLS method +# ----------------------------------------------------------------------------- +TLS_MODE="${OSMO_TLS_MODE:-}" +if [[ -z "$TLS_MODE" ]]; then + echo "" + echo "Select TLS certificate method:" + echo "" + echo " 1) cert-manager — automated HTTP-01 challenges (requires DNS A record)" + echo " 2) certbot — interactive DNS-01 challenges (requires DNS TXT record)" + echo "" + while true; do + printf "Enter choice [1-2] (default: 1): " + read -r _tls_choice + case "${_tls_choice:-1}" in + 1) TLS_MODE="cert-manager"; break ;; + 2) TLS_MODE="certbot"; break ;; + *) echo "Invalid selection." 
;; + esac + done +fi + +log_info "TLS method: ${TLS_MODE}" + +# ----------------------------------------------------------------------------- +# DNS info +# ----------------------------------------------------------------------------- +echo "" +echo "========================================" +echo " DNS Record Setup Required" +echo "========================================" +echo "" +if [[ -n "$LB_IP" ]]; then + echo "Create the following DNS A record(s) pointing to your LoadBalancer IP:" + echo "" + echo " ${MAIN_HOSTNAME} -> ${LB_IP}" + if [[ -n "$AUTH_HOSTNAME" ]]; then + echo " ${AUTH_HOSTNAME} -> ${LB_IP}" + fi +else + echo "LoadBalancer IP not yet assigned. Check with:" + echo " kubectl get svc -n ${INGRESS_NS} ingress-nginx-controller" + echo "" + echo "Once the IP is available, create DNS A record(s) for:" + echo " ${MAIN_HOSTNAME}" + if [[ -n "$AUTH_HOSTNAME" ]]; then + echo " ${AUTH_HOSTNAME}" + fi +fi +echo "" +if [[ "$TLS_MODE" == "certbot" ]]; then + echo "Certbot DNS-01 challenges require you to create TXT records when prompted." +else + echo "Let's Encrypt HTTP-01 challenges require DNS to resolve to the LoadBalancer." +fi +echo "" +read_prompt_var "Press Enter once DNS records are configured (or type 'skip' to skip DNS check)" DNS_CONFIRM "" + +# Verify DNS resolves to the LoadBalancer IP +if [[ "$DNS_CONFIRM" != "skip" ]]; then + DNS_IP=$(dig +short "$MAIN_HOSTNAME" 2>/dev/null | tail -1 || true) + + if [[ -n "$LB_IP" && -n "$DNS_IP" ]]; then + if [[ "$DNS_IP" == "$LB_IP" ]]; then + log_success "DNS check: ${MAIN_HOSTNAME} -> ${DNS_IP} (matches LoadBalancer)" + else + log_warning "DNS mismatch: ${MAIN_HOSTNAME} -> ${DNS_IP}, but LoadBalancer IP is ${LB_IP}" + log_warning "Let's Encrypt challenge may fail if DNS doesn't point to the LoadBalancer." + fi + elif [[ -z "$DNS_IP" ]]; then + log_warning "Could not resolve ${MAIN_HOSTNAME}. Make sure the DNS record exists." 
+
+  fi
+
+  if [[ -n "$AUTH_HOSTNAME" ]]; then
+    AUTH_DNS_IP=$(dig +short "$AUTH_HOSTNAME" 2>/dev/null | tail -1 || true)
+    if [[ -n "$LB_IP" && -n "$AUTH_DNS_IP" ]]; then
+      if [[ "$AUTH_DNS_IP" == "$LB_IP" ]]; then
+        log_success "DNS check: ${AUTH_HOSTNAME} -> ${AUTH_DNS_IP} (matches LoadBalancer)"
+      else
+        log_warning "DNS mismatch: ${AUTH_HOSTNAME} -> ${AUTH_DNS_IP}, but LoadBalancer IP is ${LB_IP}"
+      fi
+    elif [[ -z "$AUTH_DNS_IP" ]]; then
+      log_warning "Could not resolve ${AUTH_HOSTNAME}. Keycloak TLS cert may fail."
+    fi
+  fi
+fi
+
+# Check if OSMO is already deployed (determines whether to patch Ingress / update config)
+INGRESS_COUNT=$(kubectl get ingress -n "${OSMO_NS}" --no-headers 2>/dev/null | wc -l | tr -d ' ')
+if [[ "$INGRESS_COUNT" -gt 0 ]]; then
+  log_info "Found ${INGRESS_COUNT} Ingress resource(s) in ${OSMO_NS} (will patch with TLS)"
+  OSMO_DEPLOYED="true"
+else
+  log_info "No OSMO Ingress resources yet — preparing certificate"
+  log_info "05-deploy-osmo-control-plane.sh will auto-detect the TLS cert"
+  OSMO_DEPLOYED="false"
+fi
+
+# Ensure the OSMO namespace exists
+kubectl create namespace "${OSMO_NS}" --dry-run=client -o yaml | kubectl apply -f -
+
+# =============================================================================
+# Helper: create K8s TLS secret in both namespaces from cert/key files
+# =============================================================================
+create_tls_secret_from_files() {
+  local secret_name="$1"
+  local cert_path="$2"
+  local key_path="$3"
+
+  log_info "Creating TLS secret '${secret_name}' in namespace '${INGRESS_NS}'..."
+  kubectl create secret tls "${secret_name}" \
+    --cert="${cert_path}" \
+    --key="${key_path}" \
+    --namespace "${INGRESS_NS}" \
+    --dry-run=client -o yaml | kubectl apply -f -
+
+  if [[ "$OSMO_NS" != "$INGRESS_NS" ]]; then
+    log_info "Creating TLS secret '${secret_name}' in namespace '${OSMO_NS}'..."
+ kubectl create secret tls "${secret_name}" \ + --cert="${cert_path}" \ + --key="${key_path}" \ + --namespace "${OSMO_NS}" \ + --dry-run=client -o yaml | kubectl apply -f - + fi + log_success "TLS secret '${secret_name}' created" +} + +# ============================================================================= +# Helper: copy cert-manager secret to the other namespace if needed +# ============================================================================= +copy_secret_across_namespaces() { + local secret_name="$1" + if [[ "$OSMO_NS" != "$INGRESS_NS" ]]; then + # cert-manager creates the secret in the Certificate's namespace (OSMO_NS). + # Copy it to the ingress namespace so both can reference it. + if kubectl get secret "${secret_name}" -n "${OSMO_NS}" &>/dev/null; then + if ! kubectl get secret "${secret_name}" -n "${INGRESS_NS}" &>/dev/null; then + log_info "Copying secret '${secret_name}' to namespace '${INGRESS_NS}'..." + kubectl get secret "${secret_name}" -n "${OSMO_NS}" -o json \ + | jq 'del(.metadata.namespace,.metadata.resourceVersion,.metadata.uid,.metadata.creationTimestamp)' \ + | kubectl apply -n "${INGRESS_NS}" -f - + fi + fi + fi +} + +# ############################################################################# +# CERTBOT PATH +# ############################################################################# +if [[ "$TLS_MODE" == "certbot" ]]; then + + # Check certbot + if ! command -v certbot &>/dev/null; then + log_error "certbot is not installed." 
+    echo ""
+    echo "Install certbot using one of these methods:"
+    echo " Ubuntu/Debian: sudo apt install certbot"
+    echo " macOS: brew install certbot"
+    echo " pip: pip install certbot"
+    echo " snap: sudo snap install certbot --classic"
+    echo ""
+    exit 1
+  fi
+  log_success "certbot found: $(certbot --version 2>&1 | head -1)"
+
+  # Registration email: env override first, otherwise prompt interactively.
+  TLS_EMAIL="${LETSENCRYPT_EMAIL:-${OSMO_TLS_EMAIL:-}}"
+  if [[ -z "$TLS_EMAIL" ]]; then
+    echo "Enter your email for Let's Encrypt registration:"
+    printf " Email: "
+    read -r TLS_EMAIL
+    echo ""
+    [[ -n "$TLS_EMAIL" ]] || { log_error "Email is required for certbot."; exit 1; }
+  fi
+
+  # Build list of domains to process: "domain:secret_name"
+  DOMAINS_TO_PROCESS=("${MAIN_HOSTNAME}:${TLS_SECRET}")
+  if [[ -n "$AUTH_HOSTNAME" ]]; then
+    DOMAINS_TO_PROCESS+=("${AUTH_HOSTNAME}:${KC_TLS_SECRET}")
+  fi
+
+  # Show plan
+  echo ""
+  echo "========================================"
+  echo " Certificate Plan (certbot DNS-01)"
+  echo "========================================"
+  echo ""
+  echo " Email: ${TLS_EMAIL}"
+  echo " Cert directory: ${CERT_DIR}"
+  echo ""
+  echo " Certificates to obtain:"
+  for pair in "${DOMAINS_TO_PROCESS[@]}"; do
+    echo " ${pair%%:*} -> secret '${pair##*:}'"
+  done
+  echo ""
+  if [[ ${#DOMAINS_TO_PROCESS[@]} -gt 1 ]]; then
+    echo " Certbot will run once per domain. Each requires a separate DNS TXT record."
+    echo ""
+  fi
+  printf " Press Enter to continue (or Ctrl-C to abort)..."
+  read -r
+  echo ""
+
+  # Process each domain
+  FAILED=()
+  for pair in "${DOMAINS_TO_PROCESS[@]}"; do
+    dom="${pair%%:*}"
+    sec="${pair##*:}"
+
+    echo ""
+    echo "========================================"
+    echo " Certificate: ${dom}"
+    echo " Secret: ${sec}"
+    echo "========================================"
+    echo ""
+
+    mkdir -p "${CERT_DIR}/work" "${CERT_DIR}/logs"
+
+    echo "Certbot will ask you to create a DNS TXT record."
+    echo "When prompted:"
+    echo " 1. Log in to your DNS provider"
+    echo " 2. Create a TXT record for _acme-challenge.${dom}"
+    echo " 3. Wait for DNS propagation (1-5 minutes)"
+    echo " 4. Press Enter in this terminal to continue"
+    echo ""
+    log_info "Starting certbot for ${dom}..."
+
+    # Manual DNS-01 flow; state is kept under CERT_DIR so no root is needed.
+    if ! certbot certonly \
+      --manual \
+      --preferred-challenges dns \
+      -d "${dom}" \
+      --email "${TLS_EMAIL}" \
+      --agree-tos \
+      --no-eff-email \
+      --config-dir "${CERT_DIR}" \
+      --work-dir "${CERT_DIR}/work" \
+      --logs-dir "${CERT_DIR}/logs"; then
+      log_error "certbot failed for ${dom}. Check the output above."
+      FAILED+=("$dom")
+      continue
+    fi
+
+    fullchain="${CERT_DIR}/live/${dom}/fullchain.pem"
+    privkey="${CERT_DIR}/live/${dom}/privkey.pem"
+
+    if [[ ! -f "$fullchain" || ! -f "$privkey" ]]; then
+      log_error "Certificate files not found for ${dom}."
+      echo " Expected cert: ${fullchain}"
+      echo " Expected key: ${privkey}"
+      FAILED+=("$dom")
+      continue
+    fi
+
+    log_success "Certificate obtained for ${dom}"
+    echo " Full chain: ${fullchain}"
+    echo " Private key: ${privkey}"
+    echo ""
+    log_info "Certificate details:"
+    openssl x509 -in "${fullchain}" -noout -subject -issuer -dates 2>/dev/null || true
+
+    # Create K8s TLS secrets in both namespaces
+    create_tls_secret_from_files "$sec" "$fullchain" "$privkey"
+  done
+
+  if [[ ${#FAILED[@]} -gt 0 ]]; then
+    log_warning "Some certificates failed:"
+    for dom in "${FAILED[@]}"; do
+      echo " - ${dom}"
+    done
+    echo " Fix the issues above and re-run this script."
+  fi
+
+# #############################################################################
+# CERT-MANAGER PATH
+# #############################################################################
+else
+
+  check_helm || exit 1
+
+  # -------------------------------------------------------------------------
+  # Install cert-manager
+  # -------------------------------------------------------------------------
+  log_info "Installing cert-manager..."
+ helm repo add jetstack https://charts.jetstack.io --force-update
+ helm repo update jetstack
+
+ # Idempotent: skip the install when a cert-manager release already exists.
+ if helm status cert-manager -n cert-manager &>/dev/null; then
+ log_info "cert-manager already installed"
+ else
+ helm install cert-manager jetstack/cert-manager \
+ --namespace cert-manager --create-namespace \
+ --set crds.enabled=true \
+ --wait --timeout 5m
+ fi
+ log_success "cert-manager ready"
+
+ # -------------------------------------------------------------------------
+ # Create Let's Encrypt ClusterIssuer
+ # -------------------------------------------------------------------------
+ # NOTE(review): email precedence here (OSMO_TLS_EMAIL before LETSENCRYPT_EMAIL)
+ # is the reverse of the certbot path earlier in this script — confirm which
+ # order is intended and make the two paths consistent.
+ TLS_EMAIL="${OSMO_TLS_EMAIL:-${LETSENCRYPT_EMAIL:-noreply@${MAIN_HOSTNAME#*.}}}"
+ log_info "Creating Let's Encrypt ClusterIssuer (email: ${TLS_EMAIL})..."
+
+ # NOTE(review): extraction artifact — the ClusterIssuer manifest heredoc was
+ # lost from this patch; "kubectl apply -f - </dev/null || true" below is a
+ # truncated remnant of "kubectl apply -f - <<EOF ... EOF" (probably fused with
+ # a leading "kubectl delete challenge ... 2>/dev/null" cleanup). Restore the
+ # manifest from the original script before applying this patch.
+ kubectl apply -f - </dev/null || true
+ kubectl delete order --all -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true
+ kubectl delete certificaterequest --all -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true
+ kubectl delete certificate "${TLS_SECRET}" -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true
+ kubectl delete secret "${TLS_SECRET}" -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true
+ kubectl delete ingress osmo-tls-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true
+ kubectl delete ingress osmo-tls-auth-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true
+ # Clean up any lingering solver pods from previous attempts
+ kubectl delete pods -n "${OSMO_NS}" -l acme.cert-manager.io/http01-solver=true --ignore-not-found 2>/dev/null || true
+
+ # -------------------------------------------------------------------------
+ # Helper: wait for a certificate to become ready
+ # Arguments: $1 - Certificate resource name (in OSMO_NS)
+ #            $2 - max seconds to wait (default 300)
+ # Returns:   0 once Ready=True, 1 on timeout (after dumping debug hints)
+ # -------------------------------------------------------------------------
+ wait_for_certificate() {
+ local cert_name="$1"
+ local max_wait="${2:-300}"
+ local interval=5
+ local elapsed=0
+
+ log_info "Waiting for certificate '${cert_name}' (up to ${max_wait}s)..."
+ while [[ $elapsed -lt $max_wait ]]; do + local cert_status + cert_status=$(kubectl get certificate "${cert_name}" -n "${OSMO_NS}" \ + -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "") + if [[ "$cert_status" == "True" ]]; then + log_success "Certificate '${cert_name}' issued and ready" + return 0 + fi + if (( elapsed > 0 && elapsed % 30 == 0 )); then + local challenge_state + challenge_state=$(kubectl get challenge -n "${OSMO_NS}" \ + -o jsonpath='{.items[0].status.state}' 2>/dev/null || echo "unknown") + log_info " Still waiting... (elapsed: ${elapsed}s, challenge: ${challenge_state})" + fi + sleep $interval + elapsed=$((elapsed + interval)) + done + + log_warning "Certificate '${cert_name}' not ready after ${max_wait}s" + kubectl describe certificate "${cert_name}" -n "${OSMO_NS}" 2>/dev/null | tail -15 + echo "" + echo "Debugging commands:" + echo " kubectl get certificate,certificaterequest,order,challenge -n ${OSMO_NS}" + echo " kubectl describe challenge -n ${OSMO_NS}" + return 1 + } + + # ------------------------------------------------------------------------- + # Issue TLS certificate for main domain + # + # When OSMO is already deployed (Mode B), the Envoy sidecar on OSMO + # services intercepts HTTP requests (including the ACME challenge path) + # and redirects them to Keycloak OAuth, which breaks Let's Encrypt. + # + # To work around this, we temporarily remove OSMO Ingress resources + # that have catch-all paths, create a clean bootstrap Ingress for the + # challenge, and restore everything with TLS once the cert is ready. + # ------------------------------------------------------------------------- + REMOVED_INGRESSES=() + + if [[ "$OSMO_DEPLOYED" == "true" ]]; then + log_info "Temporarily removing OSMO Ingress resources for certificate issuance..." 
+ log_info "(Envoy sidecars intercept ACME challenges; we need a clean path)" + + # Save and remove all OSMO ingresses to prevent Envoy from intercepting + mkdir -p /tmp/osmo-tls-backup + for ing in $(kubectl get ingress -n "${OSMO_NS}" -o name 2>/dev/null); do + ing_name="${ing#*/}" + kubectl get "$ing" -n "${OSMO_NS}" -o yaml > "/tmp/osmo-tls-backup/${ing_name}.yaml" + kubectl delete "$ing" -n "${OSMO_NS}" 2>/dev/null || true + REMOVED_INGRESSES+=("$ing_name") + log_info " Removed ingress/${ing_name} (backed up)" + done + fi + + # Create bootstrap Ingress — no Envoy, no auth, just cert-manager + log_info "Creating bootstrap Ingress for certificate issuance..." + kubectl apply -f - </dev/null || true + + # ------------------------------------------------------------------------- + # Issue TLS certificate for Keycloak auth subdomain + # ------------------------------------------------------------------------- + AUTH_CERT_READY="False" + if [[ -n "$AUTH_HOSTNAME" ]]; then + log_info "Issuing TLS certificate for Keycloak auth subdomain: ${AUTH_HOSTNAME}..." + + # Clean up previous auth cert attempts + kubectl delete certificate "${KC_TLS_SECRET}" -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true + kubectl delete secret "${KC_TLS_SECRET}" -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true + + kubectl apply -f - </dev/null || true + fi + + # ------------------------------------------------------------------------- + # Restore OSMO Ingress resources with TLS (Mode B) + # ------------------------------------------------------------------------- + if [[ "$OSMO_DEPLOYED" == "true" && "$CERT_READY" == "True" ]]; then + log_info "Restoring OSMO Ingress resources with TLS..." + + for ing_name in "${REMOVED_INGRESSES[@]}"; do + backup_file="/tmp/osmo-tls-backup/${ing_name}.yaml" + [[ ! 
-f "$backup_file" ]] && continue + + # Determine which hostname/secret this ingress should use + local_host=$(yq -r '.spec.rules[0].host // ""' "$backup_file" 2>/dev/null || \ + python3 -c "import yaml,sys; d=yaml.safe_load(open('$backup_file')); print(d.get('spec',{}).get('rules',[{}])[0].get('host',''))" 2>/dev/null || echo "") + tls_secret_name="${TLS_SECRET}" + tls_host="${MAIN_HOSTNAME}" + if [[ "$local_host" == *"auth."* && -n "$AUTH_HOSTNAME" && "$AUTH_CERT_READY" == "True" ]]; then + tls_secret_name="${KC_TLS_SECRET}" + tls_host="${AUTH_HOSTNAME}" + fi + + # Re-apply the backup, then patch in TLS (no cert-manager annotation) + kubectl apply -f "$backup_file" 2>/dev/null || true + kubectl patch ingress "$ing_name" -n "${OSMO_NS}" --type=merge -p "$(cat </dev/null || true + log_info " ${ing_name}: restored (no TLS)" + done + rm -rf /tmp/osmo-tls-backup + log_info "Fix the certificate issue and re-run this script." + fi + + # ------------------------------------------------------------------------- + # Final cleanup: remove any lingering solver pods + # ------------------------------------------------------------------------- + kubectl delete pods -n "${OSMO_NS}" -l acme.cert-manager.io/http01-solver=true --ignore-not-found 2>/dev/null || true + kubectl delete pods -n "${INGRESS_NS}" -l acme.cert-manager.io/http01-solver=true --ignore-not-found 2>/dev/null || true + +fi # end TLS_MODE + +# ============================================================================= +# Update OSMO service_base_url to HTTPS (only if OSMO is already deployed) +# ============================================================================= +if [[ "$OSMO_DEPLOYED" == "true" ]]; then + log_info "Updating OSMO service_base_url to https://${MAIN_HOSTNAME}..." + + kubectl port-forward -n "${OSMO_NS}" svc/osmo-service 8080:80 &>/dev/null & + _PF_PID=$! 
+  # Ensure the port-forward is reaped on any exit path.
+  # NOTE(review): this overwrites any previously-installed EXIT trap — confirm
+  # no earlier trap in this script needs to be chained here.
+  trap 'kill $_PF_PID 2>/dev/null; wait $_PF_PID 2>/dev/null' EXIT
+
+  # Poll until the forwarded API answers; an auth error still proves liveness.
+  _pf_ready=false
+  for _ in {1..15}; do
+    if curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; then
+      _pf_ready=true
+      break
+    fi
+    sleep 1
+  done
+
+  if [[ "$_pf_ready" == "true" ]]; then
+    if osmo login http://localhost:8080 --method dev --username admin 2>/dev/null; then
+      # NOTE(review): extraction artifact — the JSON payload heredoc and the
+      # request that submitted it were lost from this patch; the line below is a
+      # truncated remnant of "cat > /tmp/service_url_tls.json <<EOF ... EOF"
+      # followed by an "if <submit the payload>; then". Restore both from the
+      # original script.
+      cat > /tmp/service_url_tls.json </dev/null; then
+        NEW_URL=$(curl -s "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""')
+        log_success "service_base_url updated to: ${NEW_URL}"
+      else
+        log_warning "Could not update service_base_url automatically."
+        log_info "Run: ./08-configure-service-url.sh https://${MAIN_HOSTNAME}"
+      fi
+      rm -f /tmp/service_url_tls.json
+    else
+      log_warning "Could not login to OSMO API. Update service_base_url manually:"
+      log_info " ./08-configure-service-url.sh https://${MAIN_HOSTNAME}"
+    fi
+  else
+    log_warning "Could not connect to OSMO API. Update service_base_url manually:"
+    log_info " ./08-configure-service-url.sh https://${MAIN_HOSTNAME}"
+  fi
+else
+  log_info "Skipping service_base_url update (OSMO not deployed yet)"
+  # Fixed: this handoff referenced "04-deploy-osmo-control-plane.sh", but the
+  # deploy script added by this change set (and named in "Next steps" below) is
+  # 05-deploy-osmo-control-plane.sh.
+  log_info "05-deploy-osmo-control-plane.sh will auto-detect TLS and use https://"
+fi
+
+# =============================================================================
+# Done
+# =============================================================================
+echo ""
+echo "========================================"
+log_success "TLS setup complete (${TLS_MODE})"
+echo "========================================"
+echo ""
+
+if [[ "$OSMO_DEPLOYED" == "true" ]]; then
+  echo "OSMO is now accessible at:"
+  echo " https://${MAIN_HOSTNAME}"
+  echo " https://${MAIN_HOSTNAME}/api/version"
+  echo ""
+  echo "CLI login:"
+  echo " osmo login https://${MAIN_HOSTNAME} --method dev --username admin"
+else
+  echo "TLS certificate prepared for: ${MAIN_HOSTNAME}"
+  if [[ -n "$AUTH_HOSTNAME" ]]; then
+    echo "Auth TLS certificate prepared for: ${AUTH_HOSTNAME}"
+  fi
+  echo ""
+  echo "Next steps:"
+  if [[ "$TLS_MODE" == "cert-manager" ]]; then
+    echo " 1. Wait for certificate(s) to be ready: kubectl get certificate -n ${OSMO_NS}"
+  else
+    echo " 1. Certificates stored in: ${CERT_DIR}"
+    echo " Renewal: re-run this script before the 90-day expiry"
+  fi
+  echo " 2. Deploy OSMO: ./05-deploy-osmo-control-plane.sh"
+  echo " (It will auto-detect the TLS cert and create HTTPS Ingress)"
+  if [[ -n "$AUTH_HOSTNAME" ]]; then
+    echo " 3. Keycloak will be exposed at https://${AUTH_HOSTNAME}"
+  fi
+fi
+echo ""
diff --git a/applications/osmo/deploy/example/002-setup/05-deploy-osmo-control-plane.sh b/applications/osmo/deploy/example/002-setup/05-deploy-osmo-control-plane.sh
new file mode 100755
index 000000000..a97c5990a
--- /dev/null
+++ b/applications/osmo/deploy/example/002-setup/05-deploy-osmo-control-plane.sh
@@ -0,0 +1,2013 @@
+#!/bin/bash
+#
+# Deploy OSMO Service (Control Plane)
+# https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html
+#
+# Components: API Service, Router, Web UI, Worker, Logger, Agent, Keycloak
+#
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)"
+source "${SCRIPT_DIR}/lib/common.sh"
+source "${SCRIPT_DIR}/defaults.sh"
+
+echo ""
+echo "========================================"
+echo " OSMO Service Deployment"
+echo "========================================"
+echo ""
+
+# Check prerequisites
+check_kubectl || exit 1
+check_helm || exit 1
+
+# -----------------------------------------------------------------------------
+# Validate hostname requirements (early, before any deployment work)
+# -----------------------------------------------------------------------------
+if [[ "${OSMO_TLS_ENABLED:-false}" == "true" && -z "${OSMO_INGRESS_HOSTNAME:-}" ]]; then
+  log_error "OSMO_TLS_ENABLED=true requires OSMO_INGRESS_HOSTNAME to be set."
+  echo " TLS certificates are issued for a domain name, not a bare IP."
+  echo " Set your domain: export OSMO_INGRESS_HOSTNAME=osmo.example.com"
+  exit 1
+fi
+
+if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" && -z "${OSMO_INGRESS_HOSTNAME:-}" && -z "${KEYCLOAK_HOSTNAME:-}" ]]; then
+  log_error "DEPLOY_KEYCLOAK=true requires OSMO_INGRESS_HOSTNAME or KEYCLOAK_HOSTNAME to be set."
+  # Fixed: the hint had lost its placeholder (looks markup-stripped); the host
+  # is derived below as AUTH_DOMAIN="auth-${OSMO_INGRESS_HOSTNAME}".
+  echo " KEYCLOAK_HOSTNAME is auto-derived as auth-<OSMO_INGRESS_HOSTNAME> if not set explicitly."
+  echo " Set your domain: export OSMO_INGRESS_HOSTNAME=osmo.example.com"
+  exit 1
+fi
+
+# -----------------------------------------------------------------------------
+# Configuration
+# -----------------------------------------------------------------------------
+OSMO_NAMESPACE="${OSMO_NAMESPACE:-osmo}"
+# Deploy Keycloak in same namespace as PostgreSQL to simplify DNS resolution
+KEYCLOAK_NAMESPACE="${OSMO_NAMESPACE}"
+OSMO_DOMAIN="${OSMO_DOMAIN:-osmo.local}"
+
+# Keycloak admin password: reuse the value stored by a previous run so repeated
+# deployments stay consistent; generate a fresh one only when neither the env
+# var nor the stored secret exists.
+if [[ -z "${KEYCLOAK_ADMIN_PASSWORD:-}" ]]; then
+  _stored_kc_pass=$(kubectl get secret keycloak-admin-secret -n "${OSMO_NAMESPACE}" -o jsonpath='{.data.password}' 2>/dev/null | base64 -d 2>/dev/null || true)
+  if [[ -n "${_stored_kc_pass}" ]]; then
+    KEYCLOAK_ADMIN_PASSWORD="${_stored_kc_pass}"
+    log_info "Using existing Keycloak admin password from secret"
+  else
+    KEYCLOAK_ADMIN_PASSWORD="$(openssl rand -base64 12)"
+    log_info "Generated new Keycloak admin password"
+  fi
+fi
+
+# -----------------------------------------------------------------------------
+# Get Database Configuration from Terraform (Nebius Managed PostgreSQL)
+# -----------------------------------------------------------------------------
+log_info "Using Nebius Managed PostgreSQL..."
+log_info "Retrieving database configuration..."
+
+# Connection details come from the 001-iac Terraform outputs.
+POSTGRES_HOST=$(get_tf_output "postgresql.host" "../001-iac" || echo "")
+POSTGRES_PORT=$(get_tf_output "postgresql.port" "../001-iac" || echo "5432")
+POSTGRES_DB=$(get_tf_output "postgresql.database" "../001-iac" || echo "osmo")
+POSTGRES_USER=$(get_tf_output "postgresql.username" "../001-iac" || echo "osmo_admin")
+
+# Password lookup order: MysteryBox, then Terraform output, then env/prompt.
+# The MysteryBox secret ID is set by secrets-init.sh.
+POSTGRES_SECRET_ID="${TF_VAR_postgresql_mysterybox_secret_id:-${OSMO_POSTGRESQL_SECRET_ID:-}}"
+
+if [[ -n "$POSTGRES_SECRET_ID" ]]; then
+  log_info "Reading PostgreSQL password from MysteryBox (secret: $POSTGRES_SECRET_ID)..."
+  POSTGRES_PASSWORD=$(get_mysterybox_secret "$POSTGRES_SECRET_ID" "password" || echo "")
+  if [[ -n "$POSTGRES_PASSWORD" ]]; then
+    log_success "PostgreSQL password retrieved from MysteryBox"
+  else
+    log_warning "Failed to read password from MysteryBox"
+  fi
+fi
+
+# Fall back to Terraform output (only works if not using MysteryBox)
+if [[ -z "$POSTGRES_PASSWORD" ]]; then
+  POSTGRES_PASSWORD=$(get_tf_output "postgresql_password" "../001-iac" || echo "")
+fi
+
+# Last resort: environment variables, then an interactive prompt.
+if [[ -z "$POSTGRES_HOST" || -z "$POSTGRES_PASSWORD" ]]; then
+  log_warning "Could not retrieve PostgreSQL configuration automatically"
+  log_info "Checking environment variables..."
+
+  POSTGRES_HOST=${POSTGRES_HOST:-${OSMO_POSTGRES_HOST:-""}}
+  POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-${OSMO_POSTGRES_PASSWORD:-""}}
+
+  if [[ -z "$POSTGRES_HOST" ]]; then
+    read_prompt_var "PostgreSQL Host" POSTGRES_HOST ""
+  fi
+  if [[ -z "$POSTGRES_PASSWORD" ]]; then
+    read_secret_var "PostgreSQL Password" POSTGRES_PASSWORD
+  fi
+fi
+
+log_success "Database: ${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}"
+
+# -----------------------------------------------------------------------------
+# Select Nebius Region
+# -----------------------------------------------------------------------------
+VALID_REGIONS=("eu-north1" "me-west1")
+
+if [[ -n "${NEBIUS_REGION:-}" ]]; then
+  NEBIUS_SELECTED_REGION="$NEBIUS_REGION"
+  # Validate against the allow-list via a pattern match on the joined list
+  # (region names contain no whitespace, so this is unambiguous).
+  case " ${VALID_REGIONS[*]} " in
+    *" ${NEBIUS_SELECTED_REGION} "*) ;;
+    *)
+      log_error "Invalid NEBIUS_REGION '${NEBIUS_SELECTED_REGION}'. Valid options: ${VALID_REGIONS[*]}"
+      exit 1
+      ;;
+  esac
+  log_info "Using region from NEBIUS_REGION: ${NEBIUS_SELECTED_REGION}"
+else
+  echo "Select the Nebius region for storage:"
+  echo ""
+  menu_no=1
+  for region in "${VALID_REGIONS[@]}"; do
+    echo " ${menu_no}) ${region}"
+    menu_no=$((menu_no + 1))
+  done
+  echo ""
+  while true; do
+    printf "Enter choice [1-${#VALID_REGIONS[@]}]: "
+    read -r answer
+    if [[ "$answer" =~ ^[0-9]+$ ]] && (( answer >= 1 && answer <= ${#VALID_REGIONS[@]} )); then
+      NEBIUS_SELECTED_REGION="${VALID_REGIONS[$((answer - 1))]}"
+      break
+    fi
+    echo "Invalid selection. Please enter a number between 1 and ${#VALID_REGIONS[@]}."
+  done
+  log_info "Selected region: ${NEBIUS_SELECTED_REGION}"
+fi
+
+S3_NEBIUS_ENDPOINT="https://storage.${NEBIUS_SELECTED_REGION}.nebius.cloud"
+
+# -----------------------------------------------------------------------------
+# Get Storage Configuration
+# -----------------------------------------------------------------------------
+log_info "Retrieving storage configuration..."
+
+# Bucket/endpoint/key-id come from Terraform; the secret key lives in MysteryBox.
+S3_BUCKET=$(get_tf_output "storage_bucket.name" "../001-iac" || echo "")
+S3_ENDPOINT=$(get_tf_output "storage_bucket.endpoint" "../001-iac" || echo "${S3_NEBIUS_ENDPOINT}")
+S3_ACCESS_KEY=$(get_tf_output "storage_credentials.access_key_id" "../001-iac" || echo "")
+
+# Secret access key is stored in MysteryBox (ephemeral, not in Terraform state)
+S3_SECRET_REF_ID=$(get_tf_output "storage_secret_reference_id" "../001-iac" || echo "")
+S3_SECRET_KEY=""
+
+if [[ -n "$S3_SECRET_REF_ID" ]]; then
+  log_info "Retrieving storage secret from MysteryBox..."
+  # IAM access key secrets are stored with key "secret" in MysteryBox
+  S3_SECRET_KEY=$(get_mysterybox_secret "$S3_SECRET_REF_ID" "secret" || echo "")
+  if [[ -n "$S3_SECRET_KEY" ]]; then
+    log_success "Storage secret retrieved from MysteryBox"
+  else
+    log_warning "Could not retrieve storage secret from MysteryBox"
+  fi
+fi
+
+[[ -n "$S3_BUCKET" ]] && log_success "Storage: ${S3_BUCKET} @ ${S3_ENDPOINT}"
+
+# -----------------------------------------------------------------------------
+# Add Helm Repositories
+# -----------------------------------------------------------------------------
+log_info "Adding Helm repositories..."
+helm repo add osmo https://helm.ngc.nvidia.com/nvidia/osmo --force-update
+helm repo add bitnami https://charts.bitnami.com/bitnami --force-update
+helm repo update
+
+# -----------------------------------------------------------------------------
+# Step 1: Create Namespaces
+# -----------------------------------------------------------------------------
+log_info "Creating namespace..."
+kubectl create namespace "${OSMO_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -
+# Note: Keycloak is deployed in the same namespace as OSMO (no separate namespace needed)
+
+# -----------------------------------------------------------------------------
+# Step 2: Configure PostgreSQL - Verify Connection and Create Databases
+# -----------------------------------------------------------------------------
+log_info "Verifying PostgreSQL connection..."
+
+ # Delete any existing test/init pods
+ kubectl delete pod osmo-db-test -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null
+ kubectl delete pod osmo-db-init -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null
+
+ # Create a temporary secret with DB credentials
+ # NOTE: PGDATABASE must be the bootstrap database ('osmo') for Nebius MSP PostgreSQL
+ kubectl create secret generic osmo-db-init-creds \
+ --namespace "${OSMO_NAMESPACE}" \
+ --from-literal=PGPASSWORD="${POSTGRES_PASSWORD}" \
+ --from-literal=PGHOST="${POSTGRES_HOST}" \
+ --from-literal=PGPORT="${POSTGRES_PORT}" \
+ --from-literal=PGUSER="${POSTGRES_USER}" \
+ --from-literal=PGDATABASE="${POSTGRES_DB}" \
+ --dry-run=client -o yaml | kubectl apply -f -
+
+ # -----------------------------------------------------------------------------
+ # Connection Test - Verify credentials before proceeding
+ # -----------------------------------------------------------------------------
+ log_info "Testing PostgreSQL connection (this may take a moment)..."
+
+ # NOTE(review): the password is passed via --env, so it is visible in this
+ # process's argv and in the resulting pod spec. The osmo-db-init-creds secret
+ # created above carries the same values — consider sourcing the pod's env from
+ # that secret instead.
+ kubectl run osmo-db-test \
+ --namespace "${OSMO_NAMESPACE}" \
+ --image=postgres:16-alpine \
+ --restart=Never \
+ --env="PGPASSWORD=${POSTGRES_PASSWORD}" \
+ --env="PGHOST=${POSTGRES_HOST}" \
+ --env="PGPORT=${POSTGRES_PORT}" \
+ --env="PGUSER=${POSTGRES_USER}" \
+ --env="PGDATABASE=${POSTGRES_DB}" \
+ --command -- sh -c 'psql -c "SELECT 1" >/dev/null 2>&1 && echo "CONNECTION_OK" || echo "CONNECTION_FAILED"' \
+ >/dev/null 2>&1
+
+ # Wait for test pod to complete
+ test_elapsed=0
+ test_status=""
+ while [[ $test_elapsed -lt 60 ]]; do
+ test_status=$(kubectl get pod osmo-db-test -n "${OSMO_NAMESPACE}" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Pending")
+ if [[ "$test_status" == "Succeeded" || "$test_status" == "Failed" ]]; then
+ break
+ fi
+ sleep 2
+ test_elapsed=$((test_elapsed + 2))
+ done
+
+ # Check test result
+ test_result=$(kubectl logs osmo-db-test -n "${OSMO_NAMESPACE}" 2>/dev/null || echo "")
+ kubectl delete pod osmo-db-test -n "${OSMO_NAMESPACE}" --ignore-not-found >/dev/null 2>&1
+
+ if [[ "$test_result" != *"CONNECTION_OK"* ]]; then
+ log_error "PostgreSQL connection test failed!"
+ echo ""
+ echo "Connection details:"
+ echo " Host: ${POSTGRES_HOST}"
+ echo " Port: ${POSTGRES_PORT}"
+ echo " Database: ${POSTGRES_DB}"
+ echo " User: ${POSTGRES_USER}"
+ echo " Password: (from MysteryBox secret ${TF_VAR_postgresql_mysterybox_secret_id:-'not set'})"
+ echo ""
+ echo "Possible causes:"
+ echo " 1. Password mismatch - MysteryBox password doesn't match PostgreSQL"
+ echo " Fix: Update MysteryBox or recreate PostgreSQL cluster"
+ echo " 2. Network issue - Cluster cannot reach PostgreSQL"
+ echo " 3. PostgreSQL not ready - Wait and retry"
+ echo ""
+ echo "To debug manually:"
+ echo " kubectl run psql-debug --rm -it --image=postgres:16-alpine -n osmo -- sh"
+ echo " PGPASSWORD='' psql -h ${POSTGRES_HOST} -U ${POSTGRES_USER} -d ${POSTGRES_DB}"
+ exit 1
+ fi
+
+ log_success "PostgreSQL connection verified"
+
+ # -----------------------------------------------------------------------------
+ # Database Creation
+ # -----------------------------------------------------------------------------
+ if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then
+ log_info "Creating OSMO and Keycloak databases (if not exist)..."
+ else
+ log_info "Verifying OSMO database..."
+ fi
+
+ # NOTE: Nebius MSP PostgreSQL creates the bootstrap database ('osmo') automatically.
+ # The bootstrap user can only connect to this database, not 'postgres'.
+ # We connect to 'osmo' and create additional databases from there.
+ # Pass DEPLOY_KEYCLOAK to the init pod
+ # NOTE(review): extraction artifact — the osmo-db-init Pod manifest heredoc
+ # ("kubectl apply ... -f - <<EOF" plus the Pod YAML wrapper) was lost from this
+ # patch; what remains below is the escaped shell script that ran INSIDE the
+ # pod, fused onto the truncated apply line. Restore the Pod manifest from the
+ # original script before applying this patch.
+ kubectl apply -n "${OSMO_NAMESPACE}" -f - </dev/null 2>&1; then
+ echo "ERROR: Cannot connect to PostgreSQL"
+ echo "Debug: PGHOST=\$PGHOST, PGPORT=\$PGPORT, PGUSER=\$PGUSER, PGDATABASE=\${PGDATABASE:-osmo}"
+ # Try with verbose error
+ psql -d "\${PGDATABASE:-osmo}" -c "SELECT 1" 2>&1 || true
+ exit 1
+ fi
+ echo "Connection successful to database '\${PGDATABASE:-osmo}'"
+
+ # The 'osmo' database already exists (created by Nebius bootstrap)
+ echo "Database 'osmo' exists (created by Nebius MSP bootstrap)"
+
+ # Create keycloak database only if Keycloak deployment is enabled
+ DEPLOY_KEYCLOAK="${DEPLOY_KEYCLOAK:-false}"
+ if [ "\$DEPLOY_KEYCLOAK" = "true" ]; then
+ # Note: This requires the user to have CREATEDB privilege
+ if psql -d "\${PGDATABASE:-osmo}" -tAc "SELECT 1 FROM pg_database WHERE datname='keycloak'" | grep -q 1; then
+ echo "Database 'keycloak' already exists"
+ else
+ echo "Creating database 'keycloak'..."
+ psql -d "\${PGDATABASE:-osmo}" -c "CREATE DATABASE keycloak;" || {
+ echo "WARNING: Could not create 'keycloak' database."
+ echo "The bootstrap user may not have CREATEDB privilege."
+ echo "Keycloak will use a schema in the 'osmo' database instead."
+ }
+ fi
+ fi
+
+ # Verify databases exist
+ echo ""
+ echo "Verifying databases..."
+ psql -d "\${PGDATABASE:-osmo}" -c "\l" | grep -E "osmo" || true
+
+ echo ""
+ echo "SUCCESS: Database initialization complete"
+ restartPolicy: Never
+EOF
+
+ # Wait for pod to complete (init pods may finish before Ready condition is detected)
+ log_info "Running database initialization..."
+
+ # Poll for completion - init pods go directly to Completed/Succeeded very quickly
+ max_wait=120
+ elapsed=0
+ pod_status=""
+
+ while [[ $elapsed -lt $max_wait ]]; do
+ pod_status=$(kubectl get pod osmo-db-init -n "${OSMO_NAMESPACE}" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Pending")
+
+ if [[ "$pod_status" == "Succeeded" ]]; then
+ break
+ elif [[ "$pod_status" == "Failed" ]]; then
+ log_error "Database initialization failed. Checking logs..."
+ kubectl logs osmo-db-init -n "${OSMO_NAMESPACE}"
+ kubectl delete pod osmo-db-init -n "${OSMO_NAMESPACE}" --ignore-not-found
+ exit 1
+ fi
+
+ sleep 2
+ elapsed=$((elapsed + 2))
+ done
+
+ if [[ "$pod_status" != "Succeeded" ]]; then
+ log_error "Database initialization timed out (status: $pod_status). Checking logs..."
+    kubectl logs osmo-db-init -n "${OSMO_NAMESPACE}" 2>/dev/null || true
+    kubectl delete pod osmo-db-init -n "${OSMO_NAMESPACE}" --ignore-not-found
+    exit 1
+  fi
+
+  # Show logs for verification
+  log_info "Database initialization output:"
+  kubectl logs osmo-db-init -n "${OSMO_NAMESPACE}"
+
+  # Cleanup
+  kubectl delete pod osmo-db-init -n "${OSMO_NAMESPACE}" --ignore-not-found
+
+log_success "Databases verified and ready"
+
+# -----------------------------------------------------------------------------
+# Step 3: Create Secrets
+# -----------------------------------------------------------------------------
+log_info "Creating secrets..."
+
+# keycloak-db-secret is created later in Step 4 when DEPLOY_KEYCLOAK=true (with other Keycloak secrets)
+
+# Create the postgres-secret that OSMO chart expects
+# The chart looks for passwordSecretName: postgres-secret, passwordSecretKey: password
+kubectl create secret generic postgres-secret \
+  --namespace "${OSMO_NAMESPACE}" \
+  --from-literal=password="${POSTGRES_PASSWORD}" \
+  --dry-run=client -o yaml | kubectl apply -f -
+
+# OIDC secrets (only needed if Keycloak is deployed)
+# These are placeholder values that get overwritten with real Keycloak client secrets
+if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then
+  HMAC_SECRET=$(openssl rand -base64 32)
+  CLIENT_SECRET=$(openssl rand -base64 32)
+  kubectl create secret generic oidc-secrets \
+    --namespace "${OSMO_NAMESPACE}" \
+    --from-literal=client_secret="${CLIENT_SECRET}" \
+    --from-literal=hmac_secret="${HMAC_SECRET}" \
+    --dry-run=client -o yaml | kubectl apply -f -
+fi
+
+# Storage secret (if available)
+if [[ -n "$S3_ACCESS_KEY" && -n "$S3_SECRET_KEY" ]]; then
+  kubectl create secret generic osmo-storage \
+    --namespace "${OSMO_NAMESPACE}" \
+    --from-literal=access-key-id="${S3_ACCESS_KEY}" \
+    --from-literal=secret-access-key="${S3_SECRET_KEY}" \
+    --dry-run=client -o yaml | kubectl apply -f -
+fi
+
+# MEK (Master Encryption Key) Configuration
+# OSMO expects MEK in JWK (JSON Web Key) format, base64-encoded
+# Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html
+MEK_ID="${MEK_ID:-key1}"
+log_info "Configuring MEK (Master Encryption Key)..."
+
+# Try to read MEK from MysteryBox first (set by secrets-init.sh)
+# MysteryBox secret ID is set as TF_VAR_mek_mysterybox_secret_id
+MEK_SECRET_ID="${TF_VAR_mek_mysterybox_secret_id:-${OSMO_MEK_SECRET_ID:-}}"
+MEK_DATA=""
+# Initialize alongside MEK_DATA so the emptiness check below never reads an
+# unset variable when the MysteryBox lookup is skipped.
+MEK_ENCODED=""
+
+if [[ -n "$MEK_SECRET_ID" ]]; then
+  log_info "Reading MEK from MysteryBox (secret: $MEK_SECRET_ID)..."
+  MEK_DATA=$(get_mysterybox_secret "$MEK_SECRET_ID" "mek" || echo "")
+  if [[ -n "$MEK_DATA" ]]; then
+    log_success "MEK retrieved from MysteryBox"
+    # MEK from secrets-init.sh is in format: {"currentMek":"key1","meks":{"key1":""}}
+    # Extract the key ID and encoded value
+    MEK_ID=$(echo "$MEK_DATA" | jq -r '.currentMek // "key1"' 2>/dev/null || echo "key1")
+    # Fixed: look the key up with --arg and the generic index .meks[$kid]
+    # instead of interpolating ${MEK_ID} into the jq program — ".meks.${MEK_ID}"
+    # is invalid jq for ids containing '-' or '.' and is a jq-injection hazard.
+    MEK_ENCODED=$(echo "$MEK_DATA" | jq -r --arg kid "$MEK_ID" '.meks[$kid] // empty' 2>/dev/null || echo "")
+
+    if [[ -z "$MEK_ENCODED" ]]; then
+      log_warning "Could not parse MEK from MysteryBox, will generate new one"
+      MEK_DATA=""
+    fi
+  else
+    log_warning "Failed to read MEK from MysteryBox"
+  fi
+fi
+
+# Generate new MEK if not retrieved from MysteryBox
+if [[ -z "$MEK_DATA" || -z "$MEK_ENCODED" ]]; then
+  log_info "Generating new MEK in JWK format..."
+ # Build a symmetric (kty=oct) JWK from 32 random bytes and base64-encode it.
+ MEK_KEY_RAW="$(openssl rand -base64 32 | tr -d '\n')"
+ MEK_JWK="{\"k\":\"${MEK_KEY_RAW}\",\"kid\":\"${MEK_ID}\",\"kty\":\"oct\"}"
+ MEK_ENCODED="$(echo -n "$MEK_JWK" | base64 | tr -d '\n')"
+ log_success "New MEK generated"
+fi
+
+# Create MEK ConfigMap (OSMO expects ConfigMap, not Secret)
+# NOTE(review): extraction artifact — the ConfigMap manifest heredoc was lost;
+# the line below is a truncated remnant of "kubectl apply ... -f - <<EOF ... EOF"
+# fused with the Redis idempotency check that followed it (most likely
+# 'if helm status redis -n "${OSMO_NAMESPACE}" &>/dev/null; then'). Restore
+# both from the original script before applying this patch.
+kubectl apply -n "${OSMO_NAMESPACE}" -f - </dev/null; then
+ log_info "Redis already deployed"
+else
+ helm upgrade --install redis bitnami/redis \
+ --namespace "${OSMO_NAMESPACE}" \
+ --version 25.3.1 \
+ --set architecture=standalone \
+ --set auth.enabled=false \
+ --set networkPolicy.enabled=false \
+ --set master.persistence.size=50Gi \
+ --set master.resources.requests.cpu=8 \
+ --set master.resources.requests.memory=52820Mi \
+ --set master.resources.limits.cpu=8 \
+ --set master.resources.limits.memory=52820Mi \
+ --set commonConfiguration="aof-load-corrupt-tail-max-size 10000000" \
+ --wait --timeout 10m
+
+ log_success "Redis deployed"
+fi
+
+REDIS_HOST="redis-master.${OSMO_NAMESPACE}.svc.cluster.local"
+
+# -----------------------------------------------------------------------------
+# Step 4: Deploy Keycloak (Enable with DEPLOY_KEYCLOAK=true)
+# -----------------------------------------------------------------------------
+# Keycloak provides authentication for OSMO
+# Required for: osmo login, osmo token, backend operator
+# Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#step-2-configure-keycloak
+
+# Keycloak service URL (same namespace as OSMO)
+KEYCLOAK_HOST="keycloak.${OSMO_NAMESPACE}.svc.cluster.local"
+KEYCLOAK_URL="http://${KEYCLOAK_HOST}:80"
+
+# Derive Keycloak external hostname
+# Priority: KEYCLOAK_HOSTNAME env var > auto-derive from OSMO_INGRESS_HOSTNAME
+if [[ -n "${KEYCLOAK_HOSTNAME:-}" ]]; then
+ AUTH_DOMAIN="${KEYCLOAK_HOSTNAME}"
+elif [[ -n "${OSMO_INGRESS_HOSTNAME:-}" ]]; then
+ AUTH_DOMAIN="auth-${OSMO_INGRESS_HOSTNAME}"
+else
+ AUTH_DOMAIN="auth-${OSMO_DOMAIN}"
+fi
+KC_TLS_SECRET="${KEYCLOAK_TLS_SECRET_NAME:-osmo-tls-auth}" + +if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then + log_info "Deploying Keycloak for OSMO authentication..." + log_info "Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#step-2-configure-keycloak" + + # Keycloak database was already created in Step 2 (osmo-db-init pod) when DEPLOY_KEYCLOAK=true + + # ------------------------------------------------------------------------- + # Step 1: Create secrets for Keycloak + # ------------------------------------------------------------------------- + log_info "Creating Keycloak secrets..." + + # Save admin password to secret for future re-runs + kubectl create secret generic keycloak-admin-secret \ + --namespace "${OSMO_NAMESPACE}" \ + --from-literal=password="${KEYCLOAK_ADMIN_PASSWORD}" \ + --dry-run=client -o yaml | kubectl apply -f - + + # Create keycloak-db-secret for external database (per OSMO docs) + # Uses the managed PostgreSQL credentials + kubectl create secret generic keycloak-db-secret \ + --namespace "${OSMO_NAMESPACE}" \ + --from-literal=postgres-password="${POSTGRES_PASSWORD}" \ + --dry-run=client -o yaml | kubectl apply -f - + + log_success "Keycloak secrets created" + + # ------------------------------------------------------------------------- + # Step 2: Install Keycloak using Bitnami Helm chart + # Per OSMO docs: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#install-keycloak-using-bitnami-helm-chart + # ------------------------------------------------------------------------- + log_info "Installing Keycloak using Bitnami Helm chart..." 
+ + # Add Bitnami repo + helm repo add bitnami https://charts.bitnami.com/bitnami --force-update 2>/dev/null || true + helm repo update bitnami + + # Determine if Keycloak should use external TLS ingress + KC_EXTERNAL="false" + if [[ "${OSMO_TLS_ENABLED:-false}" == "true" && -n "${OSMO_INGRESS_HOSTNAME:-}" ]]; then + # Check TLS secret for auth domain exists + if kubectl get secret "${KC_TLS_SECRET}" -n "${OSMO_NAMESPACE}" &>/dev/null || \ + kubectl get secret "${KC_TLS_SECRET}" -n "${INGRESS_NAMESPACE:-ingress-nginx}" &>/dev/null; then + KC_EXTERNAL="true" + log_info "Keycloak will be exposed externally at: https://${AUTH_DOMAIN}" + else + # Auto-recover: if local cert files exist for the auth domain, recreate the secret + KC_CERT_DIR="${OSMO_TLS_CERT_DIR:-$HOME/.osmo-certs}" + KC_LOCAL_CERT="${KC_CERT_DIR}/live/${AUTH_DOMAIN}/fullchain.pem" + KC_LOCAL_KEY="${KC_CERT_DIR}/live/${AUTH_DOMAIN}/privkey.pem" + if [[ -f "$KC_LOCAL_CERT" && -f "$KC_LOCAL_KEY" ]]; then + log_warning "TLS secret '${KC_TLS_SECRET}' for Keycloak not found, but local certs exist." + log_info "Auto-recovering: recreating secret from ${KC_CERT_DIR}/live/${AUTH_DOMAIN}/..." + kubectl create secret tls "${KC_TLS_SECRET}" \ + --cert="${KC_LOCAL_CERT}" \ + --key="${KC_LOCAL_KEY}" \ + --namespace "${OSMO_NAMESPACE}" \ + --dry-run=client -o yaml | kubectl apply -f - + kubectl create secret tls "${KC_TLS_SECRET}" \ + --cert="${KC_LOCAL_CERT}" \ + --key="${KC_LOCAL_KEY}" \ + --namespace "${INGRESS_NAMESPACE:-ingress-nginx}" \ + --dry-run=client -o yaml | kubectl apply -f - + log_success "TLS secret '${KC_TLS_SECRET}' recreated from local cert files" + KC_EXTERNAL="true" + log_info "Keycloak will be exposed externally at: https://${AUTH_DOMAIN}" + else + log_warning "TLS secret '${KC_TLS_SECRET}' for Keycloak not found." 
+ log_warning "Run: OSMO_INGRESS_HOSTNAME=${AUTH_DOMAIN} ./03a-setup-tls-certificate.sh" + log_warning "Keycloak will be internal-only (port-forward access)" + fi + fi + fi + + # Create keycloak-values.yaml per OSMO documentation + cat > /tmp/keycloak-values.yaml </dev/null | grep -q keycloak; then + break + fi + echo " Waiting for Keycloak pod to be created... ($i/30)" + sleep 5 + done + + # Now wait for it to be ready + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=keycloak \ + -n "${OSMO_NAMESPACE}" --timeout=300s || { + log_warning "Keycloak pod not ready yet, checking logs..." + kubectl logs -n "${OSMO_NAMESPACE}" -l app.kubernetes.io/name=keycloak --tail=30 || true + } + + # Additional wait for Keycloak to fully initialize + log_info "Waiting for Keycloak to fully initialize..." + sleep 30 + + # Configure Keycloak realm using the official OSMO realm JSON + # Source: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#post-installation-keycloak-configuration + # The official sample_osmo_realm.json includes everything needed for OSMO RBAC: + # - Roles: osmo-user, osmo-admin, osmo-backend, grafana-*, dashboard-* + # - Groups: Admin, User, Backend Operator (with proper client-role mappings) + # - Clients: osmo-device (public, device code flow), osmo-browser-flow (confidential, auth code) + # - Mappers: "Create roles claim" protocol mapper on both clients (JWT roles claim) + # - Scopes: Standard OIDC scopes (profile, email, roles, etc.) + log_info "Configuring Keycloak realm using official OSMO realm JSON..." 
+ + # Generate client secret for osmo-browser-flow (confidential client) + OIDC_CLIENT_SECRET=$(openssl rand -hex 16) + + # Determine OSMO base URL for client redirect URIs + if [[ "$KC_EXTERNAL" == "true" ]]; then + OSMO_BASE_URL="https://${OSMO_INGRESS_HOSTNAME}" + else + OSMO_BASE_URL="http://localhost:8080" + fi + + # Upload the official realm JSON as a ConfigMap (so the job can mount it) + log_info "Creating ConfigMap from sample_osmo_realm.json..." + kubectl create configmap keycloak-realm-json \ + --namespace "${OSMO_NAMESPACE}" \ + --from-file=realm.json="${SCRIPT_DIR}/sample_osmo_realm.json" \ + --dry-run=client -o yaml | kubectl apply -f - + + # Create a job to import the realm and configure a test user + cat > /tmp/keycloak-config-job.yaml < /dev/null 2>&1; then + echo "Keycloak is ready" + break + fi + echo " Attempt \$i: Keycloak not ready yet..." + sleep 15 + done + echo "" + + # ── Step 3: Get admin token ───────────────────────────── + echo "=== Step 3: Get admin token ===" + for i in 1 2 3 4 5; do + TOKEN=\$(curl -s -X POST "\${KEYCLOAK_URL}/realms/master/protocol/openid-connect/token" \ + --data-urlencode "client_id=admin-cli" \ + --data-urlencode "username=admin" \ + --data-urlencode "password=${KEYCLOAK_ADMIN_PASSWORD}" \ + --data-urlencode "grant_type=password" | grep -o '"access_token":"[^"]*"' | cut -d'"' -f4) + if [ -n "\$TOKEN" ]; then break; fi + echo " Retry \$i: waiting for token..." + sleep 10 + done + + if [ -z "\$TOKEN" ]; then + echo "FATAL: Failed to get admin token" + exit 1 + fi + echo "Got admin token" + echo "" + + # ── Step 4: Import OSMO realm ─────────────────────────── + echo "=== Step 4: Import OSMO realm ===" + + # Delete existing realm if present (idempotent re-runs) + REALM_STATUS=\$(curl -s -o /dev/null -w "%{http_code}" "\${KEYCLOAK_URL}/admin/realms/osmo" \ + -H "Authorization: Bearer \$TOKEN") + if [ "\$REALM_STATUS" = "200" ]; then + echo " Existing 'osmo' realm found – deleting for fresh import..." 
+ curl -s -X DELETE "\${KEYCLOAK_URL}/admin/realms/osmo" \ + -H "Authorization: Bearer \$TOKEN" + echo " Old realm deleted" + sleep 5 + fi + + echo "Importing official OSMO realm from sample_osmo_realm.json..." + IMPORT_HTTP=\$(curl -s -o /tmp/import-resp.txt -w "%{http_code}" \ + -X POST "\${KEYCLOAK_URL}/admin/realms" \ + -H "Authorization: Bearer \$TOKEN" \ + -H "Content-Type: application/json" \ + -d @/tmp/realm-import.json) + + if [ "\$IMPORT_HTTP" = "201" ] || [ "\$IMPORT_HTTP" = "204" ]; then + echo "Realm imported successfully (HTTP \$IMPORT_HTTP)" + else + echo "WARNING: Realm import returned HTTP \$IMPORT_HTTP" + cat /tmp/import-resp.txt 2>/dev/null || true + echo "" + # Attempt partial import as fallback + echo "Trying partial import as fallback..." + curl -s -X POST "\${KEYCLOAK_URL}/admin/realms/osmo/partialImport" \ + -H "Authorization: Bearer \$TOKEN" \ + -H "Content-Type: application/json" \ + -d @/tmp/realm-import.json || echo "Partial import also failed" + fi + + # Verify realm exists + sleep 3 + VERIFY=\$(curl -s -o /dev/null -w "%{http_code}" "\${KEYCLOAK_URL}/admin/realms/osmo" \ + -H "Authorization: Bearer \$TOKEN") + if [ "\$VERIFY" != "200" ]; then + echo "FATAL: Realm 'osmo' not found after import (HTTP \$VERIFY)" + exit 1 + fi + echo "Realm 'osmo' verified" + echo "" + + # ── Step 4b: Set client secret for osmo-browser-flow ─── + # Keycloak ignores the "secret" field during realm import and + # generates its own random secret. We MUST explicitly set it via the + # admin API so it matches the oidc-secrets Kubernetes secret that + # Envoy reads at runtime. 
+ echo "=== Step 4b: Set osmo-browser-flow client secret ===" + + # Refresh token (import may have been slow) + TOKEN=\$(curl -s -X POST "\${KEYCLOAK_URL}/realms/master/protocol/openid-connect/token" \ + --data-urlencode "client_id=admin-cli" \ + --data-urlencode "username=admin" \ + --data-urlencode "password=${KEYCLOAK_ADMIN_PASSWORD}" \ + --data-urlencode "grant_type=password" | grep -o '"access_token":"[^"]*"' | cut -d'"' -f4) + + # Find the internal UUID for the osmo-browser-flow client + BROWSER_CLIENT_UUID=\$(curl -s "\${KEYCLOAK_URL}/admin/realms/osmo/clients?clientId=osmo-browser-flow" \ + -H "Authorization: Bearer \$TOKEN" | grep -o '"id":"[^"]*"' | head -1 | cut -d'"' -f4) + + if [ -n "\$BROWSER_CLIENT_UUID" ]; then + echo " Client UUID: \$BROWSER_CLIENT_UUID" + + # GET the full client representation, replace ONLY the secret field, PUT it back. + # This preserves redirect URIs, scopes, mappers, and all other config. + curl -s "\${KEYCLOAK_URL}/admin/realms/osmo/clients/\${BROWSER_CLIENT_UUID}" \ + -H "Authorization: Bearer \$TOKEN" > /tmp/browser-client.json + + # Replace the masked secret with our generated secret + # Handle both compact ("secret":"...") and spaced ("secret" : "...") JSON + sed -i 's/"secret"[ ]*:[ ]*"[^"]*"/"secret":"${OIDC_CLIENT_SECRET}"/' /tmp/browser-client.json + + SET_SECRET_HTTP=\$(curl -s -o /dev/null -w "%{http_code}" \ + -X PUT "\${KEYCLOAK_URL}/admin/realms/osmo/clients/\${BROWSER_CLIENT_UUID}" \ + -H "Authorization: Bearer \$TOKEN" \ + -H "Content-Type: application/json" \ + -d @/tmp/browser-client.json) + + if [ "\$SET_SECRET_HTTP" = "204" ] || [ "\$SET_SECRET_HTTP" = "200" ]; then + echo " Client secret set successfully (HTTP \$SET_SECRET_HTTP)" + else + echo " WARNING: Failed to set client secret (HTTP \$SET_SECRET_HTTP)" + echo " OAuth browser flow may fail – check Keycloak logs" + fi + + # Verify: read back the secret and compare + ACTUAL_SECRET=\$(curl -s 
"\${KEYCLOAK_URL}/admin/realms/osmo/clients/\${BROWSER_CLIENT_UUID}/client-secret" \ + -H "Authorization: Bearer \$TOKEN" | grep -o '"value":"[^"]*"' | cut -d'"' -f4) + if [ "\$ACTUAL_SECRET" = "${OIDC_CLIENT_SECRET}" ]; then + echo " Verified: client secret matches oidc-secrets" + else + echo " WARNING: Client secret mismatch!" + echo " Expected: ${OIDC_CLIENT_SECRET:0:8}..." + echo " Got: \${ACTUAL_SECRET:0:8}..." + echo " This will cause 'OAuth flow failed' errors" + fi + else + echo " WARNING: osmo-browser-flow client not found after import" + echo " OAuth browser flow will not work" + fi + echo "" + + # ── Step 5: Create test user ──────────────────────────── + echo "=== Step 5: Create test user ===" + + # Refresh admin token (import may have taken a while) + TOKEN=\$(curl -s -X POST "\${KEYCLOAK_URL}/realms/master/protocol/openid-connect/token" \ + --data-urlencode "client_id=admin-cli" \ + --data-urlencode "username=admin" \ + --data-urlencode "password=${KEYCLOAK_ADMIN_PASSWORD}" \ + --data-urlencode "grant_type=password" | grep -o '"access_token":"[^"]*"' | cut -d'"' -f4) + + echo "Creating osmo-admin test user..." 
+ curl -s -X POST "\${KEYCLOAK_URL}/admin/realms/osmo/users" \ + -H "Authorization: Bearer \$TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "username": "osmo-admin", + "enabled": true, + "emailVerified": true, + "firstName": "OSMO", + "lastName": "Admin", + "email": "osmo-admin@example.com", + "credentials": [{"type":"password","value":"osmo-admin","temporary":false}] + }' || echo "User may already exist" + echo "" + + # ── Step 6: Assign user to Admin group ────────────────── + echo "=== Step 6: Assign user to Admin group ===" + + # Get user internal ID + USER_ID=\$(curl -s "\${KEYCLOAK_URL}/admin/realms/osmo/users?username=osmo-admin" \ + -H "Authorization: Bearer \$TOKEN" | grep -o '"id":"[^"]*"' | head -1 | cut -d'"' -f4) + + if [ -n "\$USER_ID" ]; then + echo " User ID: \$USER_ID" + + # Get Admin group internal ID + ADMIN_GROUP_ID=\$(curl -s "\${KEYCLOAK_URL}/admin/realms/osmo/groups?search=Admin" \ + -H "Authorization: Bearer \$TOKEN" | grep -o '"id":"[^"]*"' | head -1 | cut -d'"' -f4) + + if [ -n "\$ADMIN_GROUP_ID" ]; then + echo " Admin Group ID: \$ADMIN_GROUP_ID" + curl -s -X PUT "\${KEYCLOAK_URL}/admin/realms/osmo/users/\${USER_ID}/groups/\${ADMIN_GROUP_ID}" \ + -H "Authorization: Bearer \$TOKEN" \ + -H "Content-Type: application/json" \ + -d '{}' || echo "Failed to assign group" + echo " User 'osmo-admin' assigned to Admin group (osmo-admin + osmo-user roles)" + else + echo " WARNING: Admin group not found – user roles may need manual assignment" + fi + else + echo " WARNING: Could not find osmo-admin user ID" + fi + echo "" + + # ── Done ──────────────────────────────────────────────── + echo "=========================================" + echo " Keycloak OSMO Configuration Complete" + echo "=========================================" + echo "" + echo "Realm: osmo (imported from official sample_osmo_realm.json)" + echo "Clients: osmo-device (public, device code + direct access)" + echo " osmo-browser-flow (confidential, authorization code)" + 
echo "Groups: Admin, User, Backend Operator" + echo "Roles: osmo-admin, osmo-user, osmo-backend, grafana-*, dashboard-*" + echo "Mappers: JWT 'roles' claim configured on both clients" + echo "Test user: osmo-admin / osmo-admin (Admin group)" + echo "" +EOF + + # Delete any previous config job + kubectl delete job keycloak-osmo-setup -n "${KEYCLOAK_NAMESPACE}" --ignore-not-found 2>/dev/null || true + + kubectl apply -f /tmp/keycloak-config-job.yaml + + log_info "Waiting for Keycloak realm import job..." + kubectl wait --for=condition=complete job/keycloak-osmo-setup \ + -n "${KEYCLOAK_NAMESPACE}" --timeout=300s || { + log_warning "Keycloak configuration may have failed, check logs:" + kubectl logs -n "${KEYCLOAK_NAMESPACE}" -l job-name=keycloak-osmo-setup --tail=50 || true + } + + # Store the client secret for OIDC (used by Envoy sidecar) + kubectl create secret generic oidc-secrets \ + --namespace "${OSMO_NAMESPACE}" \ + --from-literal=client_secret="${OIDC_CLIENT_SECRET}" \ + --from-literal=hmac_secret="$(openssl rand -base64 32)" \ + --dry-run=client -o yaml | kubectl apply -f - + + # Clean up temporary files and ConfigMap + rm -f /tmp/keycloak-values.yaml /tmp/keycloak-config-job.yaml + kubectl delete configmap keycloak-realm-json -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null || true + + log_success "Keycloak deployed and configured" + echo "" + if [[ "$KC_EXTERNAL" == "true" ]]; then + echo "Keycloak Access (external):" + echo " URL: https://${AUTH_DOMAIN}" + echo " Admin console: https://${AUTH_DOMAIN}/admin" + echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" + echo " Test User: osmo-admin / osmo-admin" + echo "" + echo "OSMO Auth Endpoints:" + echo " Token: https://${AUTH_DOMAIN}/realms/osmo/protocol/openid-connect/token" + echo " Auth: https://${AUTH_DOMAIN}/realms/osmo/protocol/openid-connect/auth" + echo "" + # Enable OSMO auth with Envoy sidecars (production mode) + AUTH_ENABLED="true" + KEYCLOAK_EXTERNAL_URL="https://${AUTH_DOMAIN}" + 
log_success "OSMO authentication will be ENABLED with Envoy sidecars" + else + echo "Keycloak Access (port-forward only):" + echo " kubectl port-forward -n ${KEYCLOAK_NAMESPACE} svc/keycloak 8081:80" + echo " URL: http://localhost:8081" + echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" + echo " Test User: osmo-admin / osmo-admin" + echo "" + echo "OSMO Auth Endpoints (in-cluster):" + echo " Token: ${KEYCLOAK_URL}/realms/osmo/protocol/openid-connect/token" + echo " Auth: ${KEYCLOAK_URL}/realms/osmo/protocol/openid-connect/auth" + echo "" + # Auth disabled when Keycloak is internal-only (no Envoy, open API) + AUTH_ENABLED="false" + KEYCLOAK_EXTERNAL_URL="" + log_info "Note: OSMO auth disabled (Keycloak is internal-only, no TLS ingress)" + log_info "To enable auth, set up TLS for the auth subdomain and re-run." + fi +else + log_info "Skipping Keycloak (set DEPLOY_KEYCLOAK=true to enable)" + log_warning "Without Keycloak, 'osmo login' and token creation will not work" + log_info "Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#step-2-configure-keycloak" + AUTH_ENABLED="false" + KEYCLOAK_EXTERNAL_URL="" +fi + +# ----------------------------------------------------------------------------- +# Step 5: Create OSMO Values File +# ----------------------------------------------------------------------------- +log_info "Creating OSMO values file..." + +# NGINX Ingress – run 03-deploy-nginx-ingress.sh before this script +# When OSMO_INGRESS_HOSTNAME is empty (default), ingress matches any Host header, +# allowing direct IP-based access. Set it to a real domain for host-based routing. 
+INGRESS_HOSTNAME="${OSMO_INGRESS_HOSTNAME:-}" +TLS_ENABLED="${OSMO_TLS_ENABLED:-false}" +TLS_SECRET_NAME="${OSMO_TLS_SECRET_NAME:-osmo-tls}" +TLS_MODE="${OSMO_TLS_MODE:-}" + +if [[ -n "$INGRESS_HOSTNAME" ]]; then + log_info "Ingress hostname: ${INGRESS_HOSTNAME}" +else + log_info "Ingress hostname: (any — IP-based access)" +fi + +# TLS validation +if [[ "$TLS_ENABLED" == "true" ]]; then + log_info "TLS is ENABLED" + if [[ -z "$INGRESS_HOSTNAME" ]]; then + log_error "TLS is enabled but OSMO_INGRESS_HOSTNAME is not set." + echo " TLS certificates are issued for a domain name, not a bare IP." + echo " Set your domain: export OSMO_INGRESS_HOSTNAME=osmo.example.com" + exit 1 + fi + # Check that the TLS secret exists (created by 03a or 03c) + OSMO_NS_CHECK="${OSMO_NAMESPACE:-osmo}" + INGRESS_NS_CHECK="${INGRESS_NAMESPACE:-ingress-nginx}" + CERT_DIR="${OSMO_TLS_CERT_DIR:-$HOME/.osmo-certs}" + TLS_SECRET_FOUND="false" + if kubectl get secret "${TLS_SECRET_NAME}" -n "${OSMO_NS_CHECK}" &>/dev/null || \ + kubectl get secret "${TLS_SECRET_NAME}" -n "${INGRESS_NS_CHECK}" &>/dev/null; then + TLS_SECRET_FOUND="true" + fi + + log_success "TLS secret '${TLS_SECRET_NAME}' found" +else + log_info "TLS is disabled (HTTP only). Set OSMO_TLS_ENABLED=true to enable." 
+fi + +# Create the values file with proper extraEnv and extraVolumes for each service +# This configures PostgreSQL password via env var and MEK via volume mount +cat > /tmp/osmo_values.yaml < Keycloak) + oauth2Filter: + enabled: true + tokenEndpoint: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/token + authEndpoint: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/auth + clientId: osmo-browser-flow + authProvider: ${AUTH_DOMAIN} + secretName: oidc-secrets + clientSecretKey: client_secret + hmacSecretKey: hmac_secret + + # JWT Filter config -- three providers + jwt: + user_header: x-osmo-user + providers: + # Provider 1: Keycloak device flow (CLI) + - issuer: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo + audience: osmo-device + jwks_uri: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/certs + user_claim: preferred_username + cluster: oauth + # Provider 2: Keycloak browser flow (Web UI) + - issuer: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo + audience: osmo-browser-flow + jwks_uri: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/certs + user_claim: preferred_username + cluster: oauth + # Provider 3: OSMO-signed JWTs (service accounts) + - issuer: osmo + audience: osmo + jwks_uri: http://localhost:8000/api/auth/keys + user_claim: unique_name + cluster: service +ENVOY_ENABLED +else +cat < /tmp/vault-patch.json << 'PATCH_EOF' +[ + {"op": "add", "path": "/spec/template/spec/volumes/-", "value": {"name": "vault-secrets", "secret": {"secretName": "vault-secrets"}}}, + {"op": "add", "path": "/spec/template/spec/containers/0/volumeMounts/-", "value": {"name": "vault-secrets", "mountPath": "/home/osmo/vault-agent/secrets", "readOnly": true}} +] +PATCH_EOF + +# All OSMO deployments that need the vault-secrets volume for MEK +OSMO_DEPLOYMENTS=(osmo-service osmo-worker osmo-agent osmo-logger osmo-delayed-job-monitor osmo-router) + +for deploy in "${OSMO_DEPLOYMENTS[@]}"; do + if kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" 
&>/dev/null; then + # Check if vault-secrets volume already exists + EXISTING_VOL=$(kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" \ + -o jsonpath='{.spec.template.spec.volumes[*].name}' 2>/dev/null | grep -w "vault-secrets" || true) + + if [[ -z "$EXISTING_VOL" ]]; then + log_info " Patching $deploy to add vault-secrets volume..." + if kubectl patch deployment/$deploy -n "${OSMO_NAMESPACE}" --type=json --patch-file=/tmp/vault-patch.json; then + log_success " $deploy patched successfully" + else + log_warning " Failed to patch $deploy" + fi + else + log_info " $deploy already has vault-secrets volume, skipping" + fi + else + log_info " $deploy not found, skipping" + fi +done + +# Cleanup patch file +rm -f /tmp/vault-patch.json + +# Wait for rollouts to complete +log_info "Waiting for deployments to roll out with new configuration..." +for deploy in "${OSMO_DEPLOYMENTS[@]}"; do + if kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" &>/dev/null; then + kubectl rollout status deployment/$deploy -n "${OSMO_NAMESPACE}" --timeout=180s || \ + log_warning " Timeout waiting for $deploy rollout" + fi +done + +log_success "All OSMO deployments patched with vault-secrets volume" + +# ----------------------------------------------------------------------------- +# Step 10: Patch Services for Direct Access (without Envoy) +# ----------------------------------------------------------------------------- +# When Envoy sidecar is disabled, services need to target port 8000 directly +# instead of the 'envoy-http' named port which doesn't exist. +# When Envoy IS enabled, the 'envoy-http' targetPort is correct -- skip patching. + +if [[ "$AUTH_ENABLED" == "true" ]]; then + log_info "Envoy sidecar is ENABLED -- skipping targetPort patches (envoy-http is correct)" +else + log_info "Verifying service ports (Envoy disabled)..." 
+ + OSMO_SERVICES=(osmo-service osmo-router osmo-logger osmo-agent) + + for svc in "${OSMO_SERVICES[@]}"; do + if kubectl get svc "$svc" -n "${OSMO_NAMESPACE}" &>/dev/null; then + CURRENT_TARGET=$(kubectl get svc "$svc" -n "${OSMO_NAMESPACE}" \ + -o jsonpath='{.spec.ports[0].targetPort}' 2>/dev/null || echo "") + + if [[ "$CURRENT_TARGET" == "envoy-http" || "$CURRENT_TARGET" == "envoy" ]]; then + log_info " Patching $svc: targetPort envoy-http -> 8000" + kubectl patch svc "$svc" -n "${OSMO_NAMESPACE}" --type='json' \ + -p='[{"op": "replace", "path": "/spec/ports/0/targetPort", "value": 8000}]' || \ + log_warning " Failed to patch $svc" + else + log_info " $svc: targetPort = $CURRENT_TARGET (OK)" + fi + fi + done + + log_success "Service ports verified" +fi + +# ----------------------------------------------------------------------------- +# Step 11: Verify Deployment +# ----------------------------------------------------------------------------- +echo "" +log_info "Verifying deployment configuration..." 
+ +# Verify vault-secrets volumes are mounted +echo "" +echo "Volume configuration verification:" +for deploy in "${OSMO_DEPLOYMENTS[@]}"; do + if kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" &>/dev/null; then + VOL_CHECK=$(kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" \ + -o jsonpath='{.spec.template.spec.volumes[*].name}' 2>/dev/null | grep -w "vault-secrets" || echo "") + ENV_CHECK=$(kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" \ + -o jsonpath='{.spec.template.spec.containers[0].env[*].name}' 2>/dev/null | grep -w "OSMO_POSTGRES_PASSWORD" || echo "") + + VOL_STATUS="✗" + ENV_STATUS="✗" + [[ -n "$VOL_CHECK" ]] && VOL_STATUS="✓" + [[ -n "$ENV_CHECK" ]] && ENV_STATUS="✓" + + echo " $deploy: vault-secrets=$VOL_STATUS, postgres_env=$ENV_STATUS" + fi +done + +echo "" +echo "Pods:" +kubectl get pods -n "${OSMO_NAMESPACE}" + +echo "" +echo "Services:" +kubectl get svc -n "${OSMO_NAMESPACE}" + +# ----------------------------------------------------------------------------- +# Step 12: Configure service_base_url (required for workflow execution) +# ----------------------------------------------------------------------------- +# The osmo-ctrl sidecar in every workflow pod needs service_base_url to +# stream logs, report task status, and refresh tokens. +# This is an application-level config that must be set via the OSMO API. + +echo "" +log_info "Configuring service_base_url for workflow execution..." + +# Detect target URL from Ingress +INGRESS_URL=$(detect_service_url 2>/dev/null || true) + +if [[ -n "${OSMO_INGRESS_BASE_URL:-}" ]]; then + TARGET_SERVICE_URL="${OSMO_INGRESS_BASE_URL}" + log_info "Using explicit Ingress base URL: ${TARGET_SERVICE_URL}" +elif [[ -n "$INGRESS_URL" ]]; then + TARGET_SERVICE_URL="${INGRESS_URL}" + log_info "Auto-detected service URL: ${TARGET_SERVICE_URL}" +else + log_warning "Could not detect Ingress URL. Skipping service_base_url configuration." 
+ log_warning "Run ./08-configure-service-url.sh manually after verifying the Ingress." + TARGET_SERVICE_URL="" +fi + +if [[ -n "$TARGET_SERVICE_URL" ]]; then + # Start port-forward using the shared helper (auto-detects Envoy) + start_osmo_port_forward "${OSMO_NAMESPACE}" 8080 + _PF_PID=$PORT_FORWARD_PID + + _cleanup_pf() { + if [[ -n "${_PF_PID:-}" ]]; then + kill $_PF_PID 2>/dev/null || true + wait $_PF_PID 2>/dev/null || true + fi + } + + # Wait for port-forward to be ready + _pf_ready=false + for i in $(seq 1 30); do + if curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; then + _pf_ready=true + break + fi + sleep 1 + done + + if [[ "$_pf_ready" == "true" ]]; then + # Login (no-op when bypassing Envoy -- osmo_curl handles auth headers) + osmo_login 8080 || true + + # Check current value + CURRENT_SVC_URL=$(osmo_curl GET "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') + + if [[ "$CURRENT_SVC_URL" == "$TARGET_SERVICE_URL" ]]; then + log_success "service_base_url already configured: ${CURRENT_SVC_URL}" + else + if [[ -n "$CURRENT_SVC_URL" && "$CURRENT_SVC_URL" != "null" ]]; then + log_warning "Updating service_base_url from '${CURRENT_SVC_URL}' to '${TARGET_SERVICE_URL}'" + fi + + # Write config and use PATCH API + cat > /tmp/service_url_fix.json << SVCEOF +{ + "service_base_url": "${TARGET_SERVICE_URL}" +} +SVCEOF + if osmo_config_update SERVICE /tmp/service_url_fix.json "Set service_base_url for osmo-ctrl sidecar"; then + # Verify + NEW_SVC_URL=$(osmo_curl GET "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') + if [[ "$NEW_SVC_URL" == "$TARGET_SERVICE_URL" ]]; then + log_success "service_base_url configured: ${NEW_SVC_URL}" + else + log_warning "service_base_url verification failed. Run ./08-configure-service-url.sh manually." + fi + else + log_warning "Failed to set service_base_url. 
Run ./08-configure-service-url.sh manually." + fi + rm -f /tmp/service_url_fix.json + fi + else + log_warning "Port-forward not ready. Run ./08-configure-service-url.sh manually." + fi + + _cleanup_pf +fi + +echo "" +echo "========================================" +log_success "OSMO Control Plane deployment complete!" +echo "========================================" +echo "" + +if [[ "$AUTH_ENABLED" == "true" ]]; then + # --- Auth-enabled output --- + echo "Authentication: ENABLED (Keycloak + Envoy sidecars)" + echo "" + echo "Keycloak Admin Console:" + echo " URL: https://${AUTH_DOMAIN}/admin" + echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" + echo "" + echo "OSMO Access:" + if [[ -n "$INGRESS_URL" ]]; then + echo " OSMO API: ${INGRESS_URL}/api/version (unauthenticated -- skipAuthPath)" + echo " OSMO Web UI: ${INGRESS_URL} (redirects to Keycloak login)" + fi + echo "" + echo "Login methods:" + echo " Browser: Visit ${INGRESS_URL:-https://} -- you will be redirected to Keycloak" + echo " CLI: osmo login ${INGRESS_URL:-https://}" + echo " (Opens browser for device authorization flow)" + echo "" + echo "Test user: osmo-admin / osmo-admin" + echo "" + echo "Keycloak realm management (groups, roles, users):" + echo " https://nvidia.github.io/OSMO/main/deployment_guide/appendix/authentication/keycloak_setup.html" + echo "" +else + # --- No-auth output --- + if [[ -n "$INGRESS_URL" ]]; then + echo "OSMO Access (via NGINX Ingress LoadBalancer):" + echo " OSMO API: ${INGRESS_URL}/api/version" + echo " OSMO UI: ${INGRESS_URL}" + echo " OSMO CLI: osmo login ${INGRESS_URL} --method dev --username admin" + echo "" + else + log_warning "Could not detect Ingress LoadBalancer IP." + echo " Check: kubectl get svc -n ${INGRESS_NAMESPACE:-ingress-nginx}" + echo "" + echo " Fallback (port-forward):" + echo " kubectl port-forward -n ${OSMO_NAMESPACE} svc/osmo-service 8080:80" + echo " URL: http://localhost:8080" + echo "" + fi + + echo "NOTE: OSMO API authentication is DISABLED." 
+ echo " The API is accessible without tokens." + echo " Set DEPLOY_KEYCLOAK=true with TLS to enable Keycloak + Envoy auth." + echo "" + echo "Test the API:" + if [[ -n "$INGRESS_URL" ]]; then + echo " curl ${INGRESS_URL}/api/version" + echo " curl ${INGRESS_URL}/api/workflow" + else + echo " curl http://localhost:8080/api/version" + echo " curl http://localhost:8080/api/workflow" + fi + echo "" + if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then + echo "Keycloak Access (internal only, auth not enforced):" + echo " kubectl port-forward -n ${KEYCLOAK_NAMESPACE} svc/keycloak 8081:80" + echo " URL: http://localhost:8081" + echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" + echo " Test User: osmo-admin / osmo-admin" + echo "" + fi +fi + +echo "Ingress resources:" +kubectl get ingress -n "${OSMO_NAMESPACE}" 2>/dev/null || true +echo "" +echo "Next step - Deploy Backend Operator:" +echo " ./06-deploy-osmo-backend.sh" +echo "" diff --git a/applications/osmo/deploy/example/002-setup/06-deploy-osmo-backend.sh b/applications/osmo/deploy/example/002-setup/06-deploy-osmo-backend.sh new file mode 100755 index 000000000..e720c9cb5 --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/06-deploy-osmo-backend.sh @@ -0,0 +1,410 @@ +#!/bin/bash +# +# Deploy OSMO Backend Operator +# https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/deploy_backend.html +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +echo "" +echo "========================================" +echo " OSMO Backend Operator Deployment" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 +check_helm || exit 1 + +# ----------------------------------------------------------------------------- +# Configuration +# ----------------------------------------------------------------------------- +OSMO_OPERATOR_NAMESPACE="osmo-operator" 
+OSMO_WORKFLOWS_NAMESPACE="osmo-workflows"
+OSMO_IMAGE_TAG="${OSMO_IMAGE_TAG:-6.0.0}"
+OSMO_CHART_VERSION="${OSMO_CHART_VERSION:-}"
+BACKEND_NAME="${OSMO_BACKEND_NAME:-default}"
+
+# Check for OSMO Service URL (in-cluster URL for the backend operator pods)
+# IMPORTANT: Backend operators connect via WebSocket to osmo-agent, NOT osmo-service!
+# The osmo-service handles REST API, osmo-agent handles WebSocket connections for backends
+if [[ -z "${OSMO_SERVICE_URL:-}" ]]; then
+    log_info "Auto-detecting in-cluster OSMO Agent URL..."
+
+    # Backend operators MUST connect to osmo-agent for WebSocket connections
+    # The osmo-service WebSocket routes only exist in dev mode
+    # Use the configurable namespace (consistent with the rest of this script,
+    # which already uses "${OSMO_NAMESPACE:-osmo}") instead of hard-coding
+    # "osmo" -- a control plane deployed into a custom namespace would
+    # otherwise never be detected here.
+    OSMO_NS="${OSMO_NAMESPACE:-osmo}"
+    OSMO_AGENT=$(kubectl get svc -n "${OSMO_NS}" osmo-agent -o jsonpath='{.metadata.name}' 2>/dev/null || echo "")
+
+    if [[ -n "$OSMO_AGENT" ]]; then
+        OSMO_SERVICE_URL="http://osmo-agent.${OSMO_NS}.svc.cluster.local:80"
+        log_success "In-cluster Agent URL: ${OSMO_SERVICE_URL}"
+    else
+        # Fallback: try to detect from any osmo-agent service
+        OSMO_AGENT=$(kubectl get svc -n "${OSMO_NS}" -l app.kubernetes.io/name=agent -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+        if [[ -n "$OSMO_AGENT" ]]; then
+            OSMO_SERVICE_URL="http://${OSMO_AGENT}.${OSMO_NS}.svc.cluster.local:80"
+            log_success "In-cluster Agent URL: ${OSMO_SERVICE_URL}"
+        else
+            echo ""
+            log_error "Could not detect OSMO Agent service. Deploy OSMO first: ./05-deploy-osmo-control-plane.sh"
+            log_error "Note: Backend operators require osmo-agent service for WebSocket connections"
+            exit 1
+        fi
+    fi
+fi
+
+# Check for OSMO Service Token
+if [[ -z "${OSMO_SERVICE_TOKEN:-}" ]]; then
+    # First, ensure namespace exists so we can check for existing secret
+    kubectl create namespace "${OSMO_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - 2>/dev/null || true
+
+    # Check if token secret already exists in cluster (idempotent re-runs)
+    EXISTING_TOKEN=$(kubectl get secret osmo-operator-token -n "${OSMO_OPERATOR_NAMESPACE}" -o jsonpath='{.data.token}' 2>/dev/null | base64 -d || echo "")
+
+    if [[ -n "$EXISTING_TOKEN" ]]; then
+        log_info "Using existing token from secret osmo-operator-token"
+        OSMO_SERVICE_TOKEN="$EXISTING_TOKEN"
+    elif command -v osmo &>/dev/null; then
+        # Check if osmo CLI is already logged in (don't try to login with in-cluster URL)
+        log_info "Checking if OSMO CLI is already logged in..."
+
+        # Try to generate token - this only works if CLI is already logged in.
+        # EXPIRY_DATE: GNU date first, BSD/macOS date second, fixed fallback last.
+        TOKEN_NAME="backend-token-$(date -u +%Y%m%d%H%M%S)"
+        EXPIRY_DATE=$(date -u -d "+1 year" +%F 2>/dev/null || date -u -v+1y +%F 2>/dev/null || echo "2027-01-01")
+
+        TOKEN_JSON=$(osmo token set "$TOKEN_NAME" \
+            --expires-at "$EXPIRY_DATE" \
+            --description "Backend Operator Token" \
+            --service --roles osmo-backend -t json 2>/dev/null || echo "")
+
+        if [[ -n "$TOKEN_JSON" ]]; then
+            OSMO_SERVICE_TOKEN=$(echo "$TOKEN_JSON" | jq -r '.token // empty' 2>/dev/null || echo "")
+        fi
+
+        if [[ -n "$OSMO_SERVICE_TOKEN" ]]; then
+            log_success "Service token generated: $TOKEN_NAME (expires: $EXPIRY_DATE)"
+        fi
+    fi
+
+    # If still no token, automatically create one using port-forward
+    if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then
+        log_info "No token found - automatically creating service token..."
+ + TOKEN_NAME="backend-token-$(date -u +%Y%m%d%H%M%S)" + EXPIRY_DATE=$(date -u -d "+1 year" +%F 2>/dev/null || date -u -v+1y +%F 2>/dev/null || echo "2027-01-01") + + # Cleanup function to kill port-forwards on exit + PF_PIDS=() + cleanup_port_forwards() { + for pid in "${PF_PIDS[@]}"; do + kill "$pid" 2>/dev/null || true + wait "$pid" 2>/dev/null || true + done + } + trap cleanup_port_forwards EXIT + + # Detect if Keycloak auth is enabled + KEYCLOAK_ENABLED="false" + if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then + KEYCLOAK_ENABLED="true" + elif kubectl get svc -n "${OSMO_NAMESPACE:-osmo}" keycloak &>/dev/null; then + KEYCLOAK_ENABLED="true" + fi + + if [[ "$KEYCLOAK_ENABLED" == "true" ]]; then + # --------------------------------------------------------------- + # Keycloak-enabled: use Resource Owner Password Grant to get JWT, + # then call OSMO REST API with Bearer token + # --------------------------------------------------------------- + log_info "Keycloak detected - using password grant for token creation..." + + # Derive Keycloak external URL from the ingress (ensures JWT issuer matches + # what Envoy expects -- using port-forward would produce a wrong issuer) + KC_INGRESS_HOST=$(kubectl get ingress -n "${OSMO_NAMESPACE:-osmo}" keycloak -o jsonpath='{.spec.rules[0].host}' 2>/dev/null || echo "") + if [[ -z "$KC_INGRESS_HOST" ]]; then + log_error "Could not detect Keycloak ingress hostname" + exit 1 + fi + KEYCLOAK_TOKEN_URL="https://${KC_INGRESS_HOST}/realms/osmo/protocol/openid-connect/token" + log_info "Keycloak token endpoint: ${KEYCLOAK_TOKEN_URL}" + + # Port-forward to OSMO service (for the token creation API) + log_info "Starting port-forward to OSMO service..." + kubectl port-forward -n "${OSMO_NAMESPACE:-osmo}" svc/osmo-service 8080:80 &>/dev/null & + PF_PIDS+=($!) + + # Wait for port-forward to be ready + log_info "Waiting for port-forward to be ready..." 
+ max_wait=30 + elapsed=0 + while true; do + SVC_READY=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null || echo "000") + if [[ "$SVC_READY" =~ ^(200|401|403)$ ]]; then + break + fi + sleep 1 + elapsed=$((elapsed + 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s (service=$SVC_READY)" + exit 1 + fi + done + log_success "Port-forward ready" + + # Get Keycloak JWT via Resource Owner Password Grant + # Uses osmo-device client (public, directAccessGrantsEnabled=true) + # MUST use external Keycloak URL so the JWT issuer matches what Envoy expects + KC_ADMIN_USER="${OSMO_KC_ADMIN_USER:-osmo-admin}" + KC_ADMIN_PASS="${OSMO_KC_ADMIN_PASS:-osmo-admin}" + + log_info "Authenticating with Keycloak as '${KC_ADMIN_USER}'..." + KC_RESPONSE=$(curl -s -X POST "${KEYCLOAK_TOKEN_URL}" \ + -d "grant_type=password" \ + -d "client_id=osmo-device" \ + -d "username=${KC_ADMIN_USER}" \ + -d "password=${KC_ADMIN_PASS}") + + KC_JWT=$(echo "$KC_RESPONSE" | jq -r '.access_token // empty' 2>/dev/null || echo "") + if [[ -z "$KC_JWT" ]]; then + KC_ERROR=$(echo "$KC_RESPONSE" | jq -r '.error_description // .error // empty' 2>/dev/null || echo "unknown error") + log_error "Keycloak authentication failed: $KC_ERROR" + log_error "Ensure OSMO_KC_ADMIN_USER and OSMO_KC_ADMIN_PASS are set, or that osmo-admin/osmo-admin is valid" + exit 1 + fi + log_success "Keycloak authentication successful" + + # Create service token via OSMO REST API + # NOTE: Must use "x-osmo-auth" header (not Authorization), because: + # 1. Envoy's OAuth2 filter runs first and would redirect to Keycloak + # if it doesn't see OAuth cookies. The "x-osmo-auth" header triggers + # the pass_through_matcher, bypassing the OAuth2 redirect. + # 2. Envoy's JWT filter reads from "x-osmo-auth" (not Authorization). + # 3. No "Bearer " prefix -- the JWT filter has no value_prefix configured, + # so it expects the raw JWT directly. 
+ log_info "Creating service token: $TOKEN_NAME (expires: $EXPIRY_DATE)..." + TOKEN_RESPONSE=$(curl -s -w "\n%{http_code}" -X POST \ + "http://localhost:8080/api/auth/access_token/service/${TOKEN_NAME}?expires_at=${EXPIRY_DATE}&roles=osmo-backend" \ + -H "x-osmo-auth: ${KC_JWT}" \ + -H "Content-Type: application/json") + + # Separate response body from HTTP status code + HTTP_CODE=$(echo "$TOKEN_RESPONSE" | tail -1) + TOKEN_BODY=$(echo "$TOKEN_RESPONSE" | sed '$d') + + if [[ "$HTTP_CODE" != "200" && "$HTTP_CODE" != "201" ]]; then + log_error "Token creation API returned HTTP $HTTP_CODE" + log_error "Response: $TOKEN_BODY" + exit 1 + fi + + # Response is the raw token string (quoted JSON string) + OSMO_SERVICE_TOKEN=$(echo "$TOKEN_BODY" | jq -r '. // empty' 2>/dev/null || echo "") + # If jq fails (response might be a plain string, not JSON), use raw + if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then + OSMO_SERVICE_TOKEN=$(echo "$TOKEN_BODY" | tr -d '"' | tr -d '\r' | xargs) + fi + + else + # --------------------------------------------------------------- + # No Keycloak: use dev auth method (original approach) + # --------------------------------------------------------------- + # Check if osmo CLI is available + if ! command -v osmo &>/dev/null; then + log_error "osmo CLI not found. Please install it first." + exit 1 + fi + + # Start port-forward in background + log_info "Starting port-forward to OSMO service..." + kubectl port-forward -n "${OSMO_NAMESPACE:-osmo}" svc/osmo-service 8080:80 &>/dev/null & + PF_PIDS+=($!) + + # Wait for port-forward to be ready + log_info "Waiting for port-forward to be ready..." + max_wait=30 + elapsed=0 + while ! 
curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; do + sleep 1 + elapsed=$((elapsed + 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s" + exit 1 + fi + done + log_success "Port-forward ready" + + # Login with dev method (auth is disabled) + log_info "Logging in to OSMO (dev method)..." + if ! osmo login http://localhost:8080 --method dev --username admin 2>/dev/null; then + log_error "Failed to login to OSMO. If Keycloak is enabled, set DEPLOY_KEYCLOAK=true" + exit 1 + fi + log_success "Logged in successfully" + + # Create service token + log_info "Creating service token: $TOKEN_NAME (expires: $EXPIRY_DATE)..." + TOKEN_OUTPUT=$(osmo token set "$TOKEN_NAME" \ + --expires-at "$EXPIRY_DATE" \ + --description "Backend Operator Token (auto-generated)" \ + --service --roles osmo-backend 2>&1) + + # Extract token from output (format: "Access token: ") + OSMO_SERVICE_TOKEN=$(echo "$TOKEN_OUTPUT" | sed -n 's/.*Access token: //p' | tr -d '\r' | xargs) + fi + + if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then + log_error "Failed to create service token" + echo "Response: ${TOKEN_RESPONSE:-$TOKEN_OUTPUT}" + exit 1 + fi + + log_success "Service token created: $TOKEN_NAME (expires: $EXPIRY_DATE)" + + # Stop port-forwards + cleanup_port_forwards + trap - EXIT + fi +fi + +# ----------------------------------------------------------------------------- +# Add OSMO Helm Repository +# ----------------------------------------------------------------------------- +log_info "Adding OSMO Helm repository..." +helm repo add osmo https://helm.ngc.nvidia.com/nvidia/osmo --force-update +helm repo update + +# ----------------------------------------------------------------------------- +# Create Namespaces +# ----------------------------------------------------------------------------- +log_info "Creating namespaces..." 
+kubectl create namespace "${OSMO_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - +kubectl create namespace "${OSMO_WORKFLOWS_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + +# ----------------------------------------------------------------------------- +# Create Secrets +# ----------------------------------------------------------------------------- +log_info "Creating operator token secret..." +kubectl create secret generic osmo-operator-token \ + --namespace "${OSMO_OPERATOR_NAMESPACE}" \ + --from-literal=token="${OSMO_SERVICE_TOKEN}" \ + --dry-run=client -o yaml | kubectl apply -f - + +# ----------------------------------------------------------------------------- +# Create Values File +# ----------------------------------------------------------------------------- +log_info "Creating Helm values file..." + +# Note: services.backendListener/Worker are at root level, not under global +# See: osmo-helm-charts/backend-operator/values.yaml +cat > /tmp/backend_operator_values.yaml </dev/null || true + +echo "" +echo "========================================" +log_success "OSMO Backend Operator deployment complete!" 
+echo "========================================" +echo "" +echo "Backend Name: ${BACKEND_NAME}" +echo "Agent URL (WebSocket): ${OSMO_SERVICE_URL}" +echo "" +# Detect Ingress URL for verification instructions +INGRESS_URL=$(detect_service_url 2>/dev/null || true) + +echo "To verify the backend registration:" +echo "" +if [[ -n "$INGRESS_URL" ]]; then + echo " Check backend status:" + echo " osmo config show BACKEND ${BACKEND_NAME}" + echo "" + echo " Or via curl (using NGINX Ingress LoadBalancer):" + echo " curl ${INGRESS_URL}/api/configs/backend" +else + echo " Terminal 1 - Start port-forward (keep running):" + echo " kubectl port-forward -n osmo svc/osmo-service 8080:80" + echo "" + echo " Terminal 2 - Check backend status:" + echo " osmo config show BACKEND ${BACKEND_NAME}" + echo "" + echo " Or via curl:" + echo " curl http://localhost:8080/api/configs/backend" +fi +echo "" +echo "Next step - Configure Storage:" +echo " ./07-configure-storage.sh" +echo "" diff --git a/applications/osmo/deploy/example/002-setup/07-configure-storage.sh b/applications/osmo/deploy/example/002-setup/07-configure-storage.sh new file mode 100755 index 000000000..c4ef993e5 --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/07-configure-storage.sh @@ -0,0 +1,272 @@ +#!/bin/bash +# +# Configure OSMO Storage +# https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/configure_data_storage.html +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +echo "" +echo "========================================" +echo " OSMO Storage Configuration" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 + +# ----------------------------------------------------------------------------- +# Get Storage Configuration from Terraform +# ----------------------------------------------------------------------------- +log_info 
"Retrieving storage configuration from Terraform..." + +S3_BUCKET=$(get_tf_output "storage_bucket.name" "../001-iac" 2>/dev/null || echo "") +S3_ENDPOINT=$(get_tf_output "storage_bucket.endpoint" "../001-iac" 2>/dev/null || echo "") + +# Require NEBIUS_REGION (set by nebius-env-init.sh) +if [[ -z "${NEBIUS_REGION:-}" ]]; then + log_error "NEBIUS_REGION is not set. Run 'source ../000-prerequisites/nebius-env-init.sh' first." + exit 1 +fi + +# Default endpoint if not set +if [[ -z "$S3_ENDPOINT" ]]; then + S3_ENDPOINT="https://storage.${NEBIUS_REGION}.nebius.cloud" +fi + +if [[ -z "$S3_BUCKET" ]]; then + log_error "Could not retrieve storage bucket name from Terraform" + echo "" + echo "Make sure you have run 'terraform apply' in deploy/001-iac" + echo "and that storage is enabled in your terraform.tfvars" + exit 1 +fi + +log_success "Storage bucket: ${S3_BUCKET}" +log_success "Storage endpoint: ${S3_ENDPOINT}" + +# ----------------------------------------------------------------------------- +# Check/Create osmo-storage secret +# ----------------------------------------------------------------------------- +log_info "Checking for osmo-storage secret..." + +if ! kubectl get secret osmo-storage -n osmo &>/dev/null; then + log_warning "osmo-storage secret not found - attempting to create from MysteryBox..." + + # Get credentials from Terraform/MysteryBox + S3_ACCESS_KEY=$(get_tf_output "storage_credentials.access_key_id" "../001-iac" 2>/dev/null || echo "") + S3_SECRET_REF_ID=$(get_tf_output "storage_secret_reference_id" "../001-iac" 2>/dev/null || echo "") + S3_SECRET_KEY="" + + if [[ -n "$S3_SECRET_REF_ID" ]]; then + log_info "Retrieving storage secret from MysteryBox..." 
+ # IAM access key secrets are stored with key "secret" in MysteryBox + S3_SECRET_KEY=$(get_mysterybox_secret "$S3_SECRET_REF_ID" "secret" 2>/dev/null || echo "") + fi + + if [[ -z "$S3_ACCESS_KEY" || -z "$S3_SECRET_KEY" ]]; then + log_error "Could not retrieve storage credentials" + echo "" + echo "Either re-run 04-deploy-osmo-control-plane.sh or create the secret manually:" + echo "" + echo " kubectl create secret generic osmo-storage \\" + echo " --namespace osmo \\" + echo " --from-literal=access-key-id= \\" + echo " --from-literal=secret-access-key=" + exit 1 + fi + + # Create the secret + kubectl create secret generic osmo-storage \ + --namespace osmo \ + --from-literal=access-key-id="${S3_ACCESS_KEY}" \ + --from-literal=secret-access-key="${S3_SECRET_KEY}" \ + --dry-run=client -o yaml | kubectl apply -f - + + log_success "osmo-storage secret created" +else + log_success "osmo-storage secret exists" +fi + +# ----------------------------------------------------------------------------- +# Start port-forward and configure storage +# ----------------------------------------------------------------------------- +log_info "Starting port-forward to OSMO service..." + +OSMO_NS="${OSMO_NAMESPACE:-osmo}" + +start_osmo_port_forward "${OSMO_NS}" 8080 + +# Cleanup function +cleanup_port_forward() { + if [[ -n "${PORT_FORWARD_PID:-}" ]]; then + kill $PORT_FORWARD_PID 2>/dev/null || true + wait $PORT_FORWARD_PID 2>/dev/null || true + fi +} +trap cleanup_port_forward EXIT + +# Wait for port-forward to be ready +log_info "Waiting for port-forward to be ready..." +max_wait=30 +elapsed=0 +while ! 
curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; do + sleep 1 + ((elapsed += 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s" + exit 1 + fi +done +log_success "Port-forward ready" + +# Login (no-op when bypassing Envoy -- curl headers handle auth) +osmo_login 8080 || exit 1 + +# ----------------------------------------------------------------------------- +# Get Storage Credentials +# ----------------------------------------------------------------------------- +log_info "Retrieving storage credentials..." + +# Get access key from Terraform +S3_ACCESS_KEY=$(get_tf_output "storage_credentials.access_key_id" "../001-iac" 2>/dev/null || echo "") + +# Get secret key from osmo-storage secret (already created) +S3_SECRET_KEY=$(kubectl get secret osmo-storage -n osmo -o jsonpath='{.data.secret-access-key}' 2>/dev/null | base64 -d 2>/dev/null || echo "") + +if [[ -z "$S3_ACCESS_KEY" || -z "$S3_SECRET_KEY" ]]; then + log_error "Could not retrieve storage credentials" + exit 1 +fi + +# Nebius Object Storage uses S3-compatible API +# OSMO uses TOS (Torch Object Storage) scheme for S3-compatible storage with custom endpoints +# Format: tos:/// +S3_HOST=$(echo "$S3_ENDPOINT" | sed 's|https://||') +BACKEND_URI="tos://${S3_HOST}/${S3_BUCKET}" +REGION="${NEBIUS_REGION}" + +log_success "Storage credentials retrieved" + +# ----------------------------------------------------------------------------- +# Configure Workflow Log Storage in OSMO +# ----------------------------------------------------------------------------- +log_info "Configuring workflow log storage..." 
+ +# Create workflow log config JSON +WORKFLOW_LOG_CONFIG=$(cat < /tmp/workflow_log_config.json + +if osmo_config_update WORKFLOW /tmp/workflow_log_config.json "Configure workflow log storage"; then + log_success "Workflow log storage configured" +else + log_error "Failed to configure workflow log storage" + rm -f /tmp/workflow_log_config.json + exit 1 +fi + +# ----------------------------------------------------------------------------- +# Configure Workflow Data Storage in OSMO +# ----------------------------------------------------------------------------- +log_info "Configuring workflow data storage..." + +# Create workflow data config JSON +WORKFLOW_DATA_CONFIG=$(cat < /tmp/workflow_data_config.json + +if osmo_config_update WORKFLOW /tmp/workflow_data_config.json "Configure workflow data storage"; then + log_success "Workflow data storage configured" +else + log_error "Failed to configure workflow data storage" + rm -f /tmp/workflow_log_config.json /tmp/workflow_data_config.json + exit 1 +fi + +# Cleanup temp files +rm -f /tmp/workflow_log_config.json /tmp/workflow_data_config.json + +# ----------------------------------------------------------------------------- +# Configure Workflow Limits +# ----------------------------------------------------------------------------- +log_info "Configuring workflow limits (max_num_tasks=200)..." + +WORKFLOW_LIMITS_CONFIG=$(cat < /tmp/workflow_limits_config.json + +if osmo_config_update WORKFLOW /tmp/workflow_limits_config.json "Configure workflow limits"; then + log_success "Workflow limits configured (max_num_tasks=200)" +else + log_warning "Failed to configure workflow limits (may require newer OSMO version)" +fi + +rm -f /tmp/workflow_limits_config.json + +# ----------------------------------------------------------------------------- +# Verify Configuration +# ----------------------------------------------------------------------------- +log_info "Verifying storage configuration..." 
+ +echo "" +echo "Workflow configuration:" +osmo_curl GET "http://localhost:8080/api/configs/workflow" 2>/dev/null | jq '.' || \ + log_warning "Could not retrieve workflow config for verification" + +# Cleanup +cleanup_port_forward +trap - EXIT + +echo "" +echo "========================================" +log_success "OSMO Storage configuration complete!" +echo "========================================" +echo "" +echo "Storage Details:" +echo " Bucket: ${S3_BUCKET}" +echo " Endpoint: ${S3_ENDPOINT}" +echo " Backend URI: ${BACKEND_URI}" +echo " Region: ${REGION}" +echo "" +echo "Configured:" +echo " - workflow_log: For storing workflow logs" +echo " - workflow_data: For storing intermediate task data" +echo "" +echo "OSMO can now store workflow artifacts in Nebius Object Storage." +echo "" diff --git a/applications/osmo/deploy/example/002-setup/08-configure-service-url.sh b/applications/osmo/deploy/example/002-setup/08-configure-service-url.sh new file mode 100755 index 000000000..d0a291731 --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/08-configure-service-url.sh @@ -0,0 +1,156 @@ +#!/bin/bash +# +# Configure OSMO Service URL +# Required for osmo-ctrl sidecar to communicate with OSMO service +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +echo "" +echo "========================================" +echo " OSMO Service URL Configuration" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 + +if [[ -z "${OSMO_INGRESS_HOSTNAME:-}" ]]; then + log_error "OSMO_INGRESS_HOSTNAME is not set." 
+ echo " Source your environment first: source ../000-prerequisites/nebius-env-init.sh" + echo " Or set it manually: export OSMO_INGRESS_HOSTNAME=" + exit 1 +fi + +# ----------------------------------------------------------------------------- +# Start port-forward +# ----------------------------------------------------------------------------- +log_info "Starting port-forward to OSMO service..." + +OSMO_NS="${OSMO_NAMESPACE:-osmo}" + +start_osmo_port_forward "${OSMO_NS}" 8080 + +cleanup_port_forward() { + if [[ -n "${PORT_FORWARD_PID:-}" ]]; then + kill $PORT_FORWARD_PID 2>/dev/null || true + wait $PORT_FORWARD_PID 2>/dev/null || true + fi +} +trap cleanup_port_forward EXIT + +# Wait for port-forward to be ready +log_info "Waiting for port-forward to be ready..." +max_wait=30 +elapsed=0 +while ! curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; do + sleep 1 + ((elapsed += 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s" + exit 1 + fi +done +log_success "Port-forward ready" + +# Login (no-op when bypassing Envoy -- curl headers handle auth) +osmo_login 8080 || exit 1 + +# ----------------------------------------------------------------------------- +# Determine the target service URL +# ----------------------------------------------------------------------------- +log_info "Determining target service URL..." + +# Priority: +# 1. Explicit OSMO_INGRESS_BASE_URL (user override) +# 2. Auto-detect from NGINX Ingress Controller LoadBalancer +if [[ -n "${OSMO_INGRESS_BASE_URL:-}" ]]; then + SERVICE_URL="${OSMO_INGRESS_BASE_URL}" + log_info "Using explicit Ingress base URL: ${SERVICE_URL}" +elif DETECTED_URL=$(detect_service_url 2>/dev/null) && [[ -n "$DETECTED_URL" ]]; then + SERVICE_URL="${DETECTED_URL}" + log_info "Auto-detected service URL: ${SERVICE_URL}" +else + log_error "Could not detect NGINX Ingress Controller URL." 
+ log_error "Ensure 03-deploy-nginx-ingress.sh was run and the LoadBalancer has an IP." + if [[ "${OSMO_TLS_ENABLED:-false}" == "true" ]]; then + log_error "Or set OSMO_INGRESS_BASE_URL manually: export OSMO_INGRESS_BASE_URL=https://" + else + log_error "Or set OSMO_INGRESS_BASE_URL manually: export OSMO_INGRESS_BASE_URL=http://" + fi + exit 1 +fi + +# ----------------------------------------------------------------------------- +# Check current service_base_url +# ----------------------------------------------------------------------------- +log_info "Checking current service_base_url..." + +CURRENT_URL=$(osmo_curl GET "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') +echo "Current service_base_url: '${CURRENT_URL}'" + +if [[ -n "$CURRENT_URL" && "$CURRENT_URL" != "null" && "$CURRENT_URL" == "$SERVICE_URL" ]]; then + log_success "service_base_url is already correctly configured: ${CURRENT_URL}" + cleanup_port_forward + trap - EXIT + exit 0 +elif [[ -n "$CURRENT_URL" && "$CURRENT_URL" != "null" ]]; then + log_warning "service_base_url is set to '${CURRENT_URL}' but should be '${SERVICE_URL}'" + log_info "Updating service_base_url..." 
+fi + +# ----------------------------------------------------------------------------- +# Configure service_base_url +# ----------------------------------------------------------------------------- +log_info "Configuring service_base_url to: ${SERVICE_URL}" + +cat > /tmp/service_url_fix.json << EOF +{ + "service_base_url": "${SERVICE_URL}" +} +EOF + +if osmo_config_update SERVICE /tmp/service_url_fix.json "Set service_base_url for osmo-ctrl sidecar"; then + log_success "service_base_url configured" +else + log_error "Failed to configure service_base_url" + rm -f /tmp/service_url_fix.json + exit 1 +fi + +rm -f /tmp/service_url_fix.json + +# ----------------------------------------------------------------------------- +# Verify Configuration +# ----------------------------------------------------------------------------- +log_info "Verifying configuration..." + +NEW_URL=$(osmo_curl GET "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') + +if [[ "$NEW_URL" == "$SERVICE_URL" ]]; then + log_success "service_base_url verified: ${NEW_URL}" +else + log_error "Verification failed. Expected: ${SERVICE_URL}, Got: ${NEW_URL}" + exit 1 +fi + +# Cleanup +cleanup_port_forward +trap - EXIT + +echo "" +echo "========================================" +log_success "OSMO Service URL configuration complete!" 
+echo "========================================" +echo "" +echo "Service URL: ${SERVICE_URL}" +echo "" +echo "This URL is used by the osmo-ctrl sidecar container to:" +echo " - Stream workflow logs to the OSMO service" +echo " - Report task status and completion" +echo " - Fetch authentication tokens" +echo "" diff --git a/applications/osmo/deploy/example/002-setup/09-configure-gpu-platform.sh b/applications/osmo/deploy/example/002-setup/09-configure-gpu-platform.sh new file mode 100755 index 000000000..e98fca791 --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/09-configure-gpu-platform.sh @@ -0,0 +1,210 @@ +#!/bin/bash +# Configure OSMO GPU platform with tolerations via pod templates +# Based on OSMO documentation: https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/resource_pools.html + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +OSMO_URL="${OSMO_URL:-http://localhost:8080}" +OSMO_NAMESPACE="${OSMO_NAMESPACE:-osmo}" + +# Require NEBIUS_REGION (set by nebius-env-init.sh) +if [[ -z "${NEBIUS_REGION:-}" ]]; then + echo "ERROR: NEBIUS_REGION is not set. Run 'source ../000-prerequisites/nebius-env-init.sh' first." + exit 1 +fi + +# ----------------------------------------------------------------------------- +# Determine GPU platform name +# ----------------------------------------------------------------------------- +# Try to read the GPU platform from Terraform output and derive a friendly name. +# Maps: gpu-h100-sxm -> H100, gpu-h200-sxm -> H200, gpu-b200-sxm-a -> B200, etc. +# Falls back to user input if Terraform is unavailable. 
+if [[ -z "${GPU_PLATFORM_NAME:-}" ]]; then + TF_GPU_PLATFORM=$(get_tf_output "gpu_nodes_platform" "../001-iac" 2>/dev/null || echo "") + if [[ -n "$TF_GPU_PLATFORM" ]]; then + # Extract friendly name: gpu-h100-sxm -> H100, gpu-b200-sxm-a -> B200, gpu-l40s-a -> L40S + GPU_PLATFORM_NAME=$(echo "$TF_GPU_PLATFORM" | sed -E 's/^gpu-([a-zA-Z0-9]+).*/\1/' | tr '[:lower:]' '[:upper:]') + log_info "Auto-detected GPU platform from Terraform: ${TF_GPU_PLATFORM} -> ${GPU_PLATFORM_NAME}" + else + echo "" + echo "Could not auto-detect GPU platform from Terraform." + read -r -p "Enter GPU platform name (e.g. H100, H200, B200, L40S): " GPU_PLATFORM_NAME + if [[ -z "$GPU_PLATFORM_NAME" ]]; then + log_error "GPU platform name is required." + exit 1 + fi + fi +fi + +echo "" +echo "========================================" +echo " OSMO GPU Platform Configuration" +echo " Platform name: ${GPU_PLATFORM_NAME}" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 + +# ----------------------------------------------------------------------------- +# Start port-forward +# ----------------------------------------------------------------------------- +log_info "Starting port-forward to OSMO service..." + +start_osmo_port_forward "${OSMO_NAMESPACE}" 8080 + +cleanup_port_forward() { + if [[ -n "${PORT_FORWARD_PID:-}" ]]; then + kill $PORT_FORWARD_PID 2>/dev/null || true + wait $PORT_FORWARD_PID 2>/dev/null || true + fi +} +trap cleanup_port_forward EXIT + +# Wait for port-forward to be ready +log_info "Waiting for port-forward to be ready..." +max_wait=30 +elapsed=0 +while ! 
curl -s -o /dev/null -w "%{http_code}" "${OSMO_URL}/api/version" 2>/dev/null | grep -q "200\|401\|403"; do + sleep 1 + ((elapsed += 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s" + exit 1 + fi +done +log_success "Port-forward ready" + +# ----------------------------------------------------------------------------- +# Step 1: Fix default_user pod template (remove GPU resources) +# ----------------------------------------------------------------------------- +# The built-in default_user template includes nvidia.com/gpu which causes ALL +# workflows (including CPU-only) to request the nvidia RuntimeClass. This fails +# on CPU nodes. We move GPU resources to the gpu_tolerations template instead. +log_info "Updating default_user pod template (removing GPU resources)..." + +RESPONSE=$(osmo_curl PUT "${OSMO_URL}/api/configs/pod_template/default_user" \ + -w "\n%{http_code}" \ + -d @"${SCRIPT_DIR}/default_user_pod_template.json") +HTTP_CODE=$(echo "$RESPONSE" | tail -n1) +BODY=$(echo "$RESPONSE" | sed '$d') + +if [[ "$HTTP_CODE" -ge 200 && "$HTTP_CODE" -lt 300 ]]; then + log_success "default_user pod template updated (HTTP ${HTTP_CODE})" +else + log_error "Failed to update default_user pod template (HTTP ${HTTP_CODE})" + echo "Response: ${BODY}" + exit 1 +fi + +# ----------------------------------------------------------------------------- +# Step 2: Create GPU pod template +# ----------------------------------------------------------------------------- +log_info "Creating gpu_tolerations pod template..." 
+ +# Substitute {{NEBIUS_REGION}} placeholder in the template +GPU_POD_TEMPLATE_RESOLVED="/tmp/gpu_pod_template_resolved.json" +sed "s/{{NEBIUS_REGION}}/${NEBIUS_REGION}/g" "${SCRIPT_DIR}/gpu_pod_template.json" > "${GPU_POD_TEMPLATE_RESOLVED}" + +RESPONSE=$(osmo_curl PUT "${OSMO_URL}/api/configs/pod_template/gpu_tolerations" \ + -w "\n%{http_code}" \ + -d @"${GPU_POD_TEMPLATE_RESOLVED}") +rm -f "${GPU_POD_TEMPLATE_RESOLVED}" +HTTP_CODE=$(echo "$RESPONSE" | tail -n1) +BODY=$(echo "$RESPONSE" | sed '$d') + +if [[ "$HTTP_CODE" -ge 200 && "$HTTP_CODE" -lt 300 ]]; then + log_success "Pod template created (HTTP ${HTTP_CODE})" +else + log_error "Failed to create pod template (HTTP ${HTTP_CODE})" + echo "Response: ${BODY}" + exit 1 +fi + +# ----------------------------------------------------------------------------- +# Step 2b: Create shared memory pod template +# ----------------------------------------------------------------------------- +log_info "Creating shm pod template (shared memory for vLLM, PyTorch, etc.)..." + +RESPONSE=$(osmo_curl PUT "${OSMO_URL}/api/configs/pod_template/shm" \ + -w "\n%{http_code}" \ + -d @"${SCRIPT_DIR}/shm_pod_template.json") +HTTP_CODE=$(echo "$RESPONSE" | tail -n1) +BODY=$(echo "$RESPONSE" | sed '$d') + +if [[ "$HTTP_CODE" -ge 200 && "$HTTP_CODE" -lt 300 ]]; then + log_success "Shared memory pod template created (HTTP ${HTTP_CODE})" +else + log_error "Failed to create shm pod template (HTTP ${HTTP_CODE})" + echo "Response: ${BODY}" + exit 1 +fi + +# ----------------------------------------------------------------------------- +# Step 3: Create GPU platform +# ----------------------------------------------------------------------------- +log_info "Creating platform '${GPU_PLATFORM_NAME}' in default pool..." 
+ +RESPONSE=$(osmo_curl PUT "${OSMO_URL}/api/configs/pool/default/platform/${GPU_PLATFORM_NAME}" \ + -w "\n%{http_code}" \ + -d @"${SCRIPT_DIR}/gpu_platform_update.json") +HTTP_CODE=$(echo "$RESPONSE" | tail -n1) +BODY=$(echo "$RESPONSE" | sed '$d') + +if [[ "$HTTP_CODE" -ge 200 && "$HTTP_CODE" -lt 300 ]]; then + log_success "Platform '${GPU_PLATFORM_NAME}' created (HTTP ${HTTP_CODE})" +else + log_error "Failed to create platform '${GPU_PLATFORM_NAME}' (HTTP ${HTTP_CODE})" + echo "Response: ${BODY}" + exit 1 +fi + +# ----------------------------------------------------------------------------- +# Step 4: Verify configuration +# ----------------------------------------------------------------------------- +log_info "Verifying configuration..." + +echo "" +echo "Pod templates:" +osmo_curl GET "${OSMO_URL}/api/configs/pod_template" | jq 'keys' + +echo "" +echo "Platform '${GPU_PLATFORM_NAME}' config:" +osmo_curl GET "${OSMO_URL}/api/configs/pool/default" | jq ".platforms.${GPU_PLATFORM_NAME}" + +# ----------------------------------------------------------------------------- +# Step 5: Check GPU resources +# ----------------------------------------------------------------------------- +log_info "Checking GPU resources..." 
+sleep 3 # Wait for backend to pick up changes + +RESOURCE_JSON=$(osmo_curl GET "${OSMO_URL}/api/resources" 2>/dev/null || echo '{}') +RESOURCE_COUNT=$(echo "$RESOURCE_JSON" | jq '[(.resources // [])[] | select(.allocatable_fields.gpu != null)] | length' 2>/dev/null || echo "0") +echo "GPU nodes visible to OSMO: ${RESOURCE_COUNT}" + +if [[ "$RESOURCE_COUNT" -gt 0 ]]; then + echo "" + echo "GPU resources:" + echo "$RESOURCE_JSON" | jq '.resources[] | select(.allocatable_fields.gpu != null) | {name: .name, gpu: .allocatable_fields.gpu, cpu: .allocatable_fields.cpu, memory: .allocatable_fields.memory}' +fi + +# ----------------------------------------------------------------------------- +# Step 6: Set default pool profile +# ----------------------------------------------------------------------------- +log_info "Setting default pool to 'default'..." +osmo profile set pool default 2>/dev/null && log_success "Default pool set" || log_warning "Could not set default pool (set manually: osmo profile set pool default)" + +# ----------------------------------------------------------------------------- +# Done +# ----------------------------------------------------------------------------- +log_success "GPU platform configuration complete" +echo "" +echo "To submit a GPU workflow:" +echo " osmo workflow submit workflows/osmo/gpu_test.yaml -p default --platform ${GPU_PLATFORM_NAME}" +echo "" +echo "Or test via curl:" +echo " curl -X POST ${OSMO_URL}/api/workflow -H 'Content-Type: application/yaml' --data-binary @workflows/osmo/gpu_test.yaml" diff --git a/applications/osmo/deploy/example/002-setup/10-configure-backend-scheduler.sh b/applications/osmo/deploy/example/002-setup/10-configure-backend-scheduler.sh new file mode 100755 index 000000000..e47a6773e --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/10-configure-backend-scheduler.sh @@ -0,0 +1,118 @@ +#!/bin/bash +# Configure BACKEND scheduler_settings (KAI scheduler + coscheduling) for Nebius OSMO. 
+# Run after 05-deploy-osmo-backend.sh once the backend is ONLINE. +# Option A: Patch existing backend (keeps router_address, etc.) – default. +# Option B: Apply config from config/scheduler-config.template.json (set ROUTER_ADDRESS, etc.). + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +CONFIG_DIR="${SCRIPT_DIR}/config" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +OSMO_URL="${OSMO_URL:-http://localhost:8080}" +OSMO_NAMESPACE="${OSMO_NAMESPACE:-osmo}" +BACKEND_NAME="${OSMO_BACKEND_NAME:-default}" +K8S_NAMESPACE="${OSMO_WORKFLOWS_NAMESPACE:-osmo-workflows}" + +# Use template (Option B) if --from-template and template exists +USE_TEMPLATE=false +[[ "${1:-}" == "--from-template" ]] && USE_TEMPLATE=true + +echo "" +echo "========================================" +echo " Configure BACKEND scheduler (KAI + coscheduling)" +echo "========================================" +echo "" + +check_kubectl || exit 1 +command -v jq &>/dev/null || { log_error "jq is required"; exit 1; } + +# ----------------------------------------------------------------------------- +# Start port-forward and login +# ----------------------------------------------------------------------------- +log_info "Starting port-forward to OSMO service..." +start_osmo_port_forward "${OSMO_NAMESPACE}" 8080 + +cleanup_port_forward() { + if [[ -n "${PORT_FORWARD_PID:-}" ]]; then + kill "$PORT_FORWARD_PID" 2>/dev/null || true + wait "$PORT_FORWARD_PID" 2>/dev/null || true + fi +} +trap cleanup_port_forward EXIT + +log_info "Waiting for port-forward..." +max_wait=30 +elapsed=0 +while ! 
curl -s -o /dev/null -w "%{http_code}" "${OSMO_URL}/api/version" 2>/dev/null | grep -q "200\|401\|403"; do + sleep 1 + ((elapsed += 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s" + exit 1 + fi +done +log_success "Port-forward ready" + +osmo_login 8080 || true + +# ----------------------------------------------------------------------------- +# Build backend config and apply +# ----------------------------------------------------------------------------- +if [[ "$USE_TEMPLATE" == "true" && -f "${CONFIG_DIR}/scheduler-config.template.json" ]]; then + # Option B: Render template and apply (set ROUTER_ADDRESS before running) + log_info "Using config from scheduler-config.template.json..." + if [[ -z "${ROUTER_ADDRESS:-}" ]]; then + # Derive from ingress: https://host -> wss://host + INGRESS_URL=$(detect_service_url 2>/dev/null || true) + if [[ -n "$INGRESS_URL" ]]; then + ROUTER_ADDRESS="wss://$(echo "$INGRESS_URL" | sed -e 's|https\?://||' -e 's|/.*||')" + log_info "Derived ROUTER_ADDRESS from ingress: ${ROUTER_ADDRESS}" + else + log_error "Set ROUTER_ADDRESS (e.g. wss://your-osmo-host) or run without --from-template to patch existing backend" + exit 1 + fi + fi + export BACKEND_NAME + export K8S_NAMESPACE + export ROUTER_ADDRESS + mkdir -p "${CONFIG_DIR}/out" + envsubst < "${CONFIG_DIR}/scheduler-config.template.json" > "${CONFIG_DIR}/out/scheduler-config.json" + BACKEND_FILE="${CONFIG_DIR}/out/scheduler-config.json" + if ! osmo config update BACKEND "$BACKEND_NAME" --file "$BACKEND_FILE" --description "Backend $BACKEND_NAME scheduler (KAI + coscheduling)"; then + log_error "Failed to apply backend config from template" + exit 1 + fi +else + # Option A: Patch existing backend (keep router_address and other fields) + log_info "Patching existing backend '$BACKEND_NAME' scheduler_settings (KAI + coscheduling)..." 
+ BACKEND_JSON=$(osmo_curl GET "${OSMO_URL}/api/configs/backend" 2>/dev/null || true) + if [[ -z "$BACKEND_JSON" ]]; then + log_error "Could not get backend config. Is the backend registered? Run: osmo config show BACKEND" + exit 1 + fi + BACKEND_OBJECT=$(echo "$BACKEND_JSON" | jq -c --arg name "$BACKEND_NAME" \ + '.backends[] | select(.name == $name) | . + {scheduler_settings: {"scheduler_type":"kai","scheduler_name":"kai-scheduler","coscheduling":true,"scheduler_timeout":30}}') + if [[ -z "$BACKEND_OBJECT" || "$BACKEND_OBJECT" == "null" ]]; then + log_error "Backend '$BACKEND_NAME' not found in config. Available: $(echo "$BACKEND_JSON" | jq -r '.backends[].name' 2>/dev/null | tr '\n' ' ')" + exit 1 + fi + TMP_FILE=$(mktemp) + echo "$BACKEND_OBJECT" > "$TMP_FILE" + if ! osmo config update BACKEND "$BACKEND_NAME" --file "$TMP_FILE" --description "Backend $BACKEND_NAME scheduler (KAI + coscheduling)"; then + rm -f "$TMP_FILE" + log_error "Failed to update backend config" + exit 1 + fi + rm -f "$TMP_FILE" +fi + +log_success "BACKEND scheduler configuration applied" +echo "" +echo "Verify:" +echo " osmo config show BACKEND ${BACKEND_NAME}" +echo "" +echo "You should see scheduler_settings: scheduler_type=kai, coscheduling=true" +echo "" diff --git a/applications/osmo/deploy/example/002-setup/11-configure-dataset-bucket.sh b/applications/osmo/deploy/example/002-setup/11-configure-dataset-bucket.sh new file mode 100755 index 000000000..bf676e073 --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/11-configure-dataset-bucket.sh @@ -0,0 +1,218 @@ +#!/bin/bash +# +# Register the Nebius storage bucket as an OSMO dataset bucket. +# This allows using the bucket for OSMO datasets (e.g. osmo dataset upload/list) +# with a short name (e.g. nebius/my-dataset) instead of full URIs. +# +# Requires: 06-configure-storage.sh (port-forward and workflow storage) and +# OSMO control plane running. Uses the same bucket and credentials as workflow storage. 
+#
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)"
+source "${SCRIPT_DIR}/lib/common.sh"
+source "${SCRIPT_DIR}/defaults.sh"
+
+echo ""
+echo "========================================"
+echo " OSMO Dataset Bucket Configuration"
+echo "========================================"
+echo ""
+
+# Optional: name for the bucket in OSMO (default: nebius)
+DATASET_BUCKET_NAME="${DATASET_BUCKET_NAME:-nebius}"
+
+# Check prerequisites
+check_kubectl || exit 1
+
+# -----------------------------------------------------------------------------
+# Nebius Region (from nebius-env-init.sh)
+# -----------------------------------------------------------------------------
+if [[ -z "${NEBIUS_REGION:-}" ]]; then
+  log_error "NEBIUS_REGION is not set. Run 'source ../000-prerequisites/nebius-env-init.sh' first."
+  exit 1
+fi
+REGION="$NEBIUS_REGION"
+log_info "Using region: ${REGION}"
+
+S3_REGION_FOR_BOTO="${REGION}"
+
+# -----------------------------------------------------------------------------
+# Get Storage Configuration from Terraform
+# -----------------------------------------------------------------------------
+log_info "Retrieving storage configuration from Terraform..."
+
+S3_BUCKET=$(get_tf_output "storage_bucket.name" "../001-iac" 2>/dev/null || echo "")
+S3_ENDPOINT=$(get_tf_output "storage_bucket.endpoint" "../001-iac" 2>/dev/null || echo "")
+
+if [[ -z "$S3_ENDPOINT" ]]; then
+  S3_ENDPOINT="https://storage.${REGION}.nebius.cloud"
+fi
+
+if [[ -z "$S3_BUCKET" ]]; then
+  log_error "Could not retrieve storage bucket name from Terraform"
+  echo ""
+  echo "Run 'terraform apply' in deploy/001-iac and ensure storage is enabled."
+  exit 1
+fi
+
+# Datasets are stored under the osmo-datasets prefix within the bucket.
+# The path uses the standard s3://<bucket>/<prefix> format; the actual endpoint
+# is configured separately via AWS_ENDPOINT_URL_S3 in the Helm chart / pod template.
+DATASET_PATH="s3://${S3_BUCKET}/osmo-datasets" + +# ----------------------------------------------------------------------------- +# Get storage credentials (for default_credential on the dataset bucket) +# ----------------------------------------------------------------------------- +log_info "Retrieving storage credentials for default_credential..." + +S3_ACCESS_KEY=$(get_tf_output "storage_credentials.access_key_id" "../001-iac" 2>/dev/null || echo "") +S3_SECRET_KEY=$(kubectl get secret osmo-storage -n "${OSMO_NAMESPACE:-osmo}" -o jsonpath='{.data.secret-access-key}' 2>/dev/null | base64 -d 2>/dev/null || echo "") + +if [[ -z "$S3_ACCESS_KEY" ]]; then + log_warning "Could not get access key from Terraform; bucket will have no default_credential" +fi +if [[ -z "$S3_SECRET_KEY" ]]; then + S3_SECRET_REF_ID=$(get_tf_output "storage_secret_reference_id" "../001-iac" 2>/dev/null || echo "") + if [[ -n "$S3_SECRET_REF_ID" ]]; then + S3_SECRET_KEY=$(get_mysterybox_secret "$S3_SECRET_REF_ID" "secret" 2>/dev/null || echo "") + fi +fi + +if [[ -n "$S3_ACCESS_KEY" && -n "$S3_SECRET_KEY" ]]; then + log_success "Storage credentials retrieved (default_credential will be set)" +else + log_warning "Missing credentials; registering bucket without default_credential (users must supply credentials)" +fi + +log_success "Bucket: ${S3_BUCKET}" +log_success "Dataset path: ${DATASET_PATH}" +log_success "Region: ${REGION}" +log_success "S3 endpoint: ${S3_ENDPOINT}" +log_success "OSMO bucket name: ${DATASET_BUCKET_NAME}" + +# ----------------------------------------------------------------------------- +# Start port-forward and configure dataset bucket +# ----------------------------------------------------------------------------- +log_info "Starting port-forward to OSMO service..." 
+ +OSMO_NS="${OSMO_NAMESPACE:-osmo}" +start_osmo_port_forward "${OSMO_NS}" 8080 + +cleanup_port_forward() { + if [[ -n "${PORT_FORWARD_PID:-}" ]]; then + kill $PORT_FORWARD_PID 2>/dev/null || true + wait $PORT_FORWARD_PID 2>/dev/null || true + fi +} +trap cleanup_port_forward EXIT + +log_info "Waiting for port-forward..." +max_wait=30 +elapsed=0 +while ! curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; do + sleep 1 + elapsed=$((elapsed + 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s" + exit 1 + fi +done +log_success "Port-forward ready" + +osmo_login 8080 || exit 1 + +# ----------------------------------------------------------------------------- +# Build dataset config: add/update Nebius bucket and set as default bucket +# See: https://nvidia.github.io/OSMO/main/deployment_guide/advanced_config/dataset_buckets.html +# ----------------------------------------------------------------------------- +log_info "Building DATASET config (bucket + default_bucket)..." + +# Build bucket config object (with optional default_credential) +# PATCH API accepts only access_key_id and access_key in default_credential; +# endpoint/region are taken from the bucket at runtime. 
+BUCKET_JSON="/tmp/osmo_dataset_bucket_obj.json" +if [[ -n "$S3_ACCESS_KEY" && -n "$S3_SECRET_KEY" ]]; then + jq -n \ + --arg path "$DATASET_PATH" \ + --arg region "$S3_REGION_FOR_BOTO" \ + --arg akid "$S3_ACCESS_KEY" \ + --arg ak "$S3_SECRET_KEY" \ + '{ + dataset_path: $path, + region: $region, + description: "Nebius Object Storage bucket", + mode: "read-write", + default_credential: { + access_key_id: $akid, + access_key: $ak + } + }' > "${BUCKET_JSON}" +else + jq -n \ + --arg path "$DATASET_PATH" \ + --arg region "$S3_REGION_FOR_BOTO" \ + '{ + dataset_path: $path, + region: $region, + description: "Nebius Object Storage bucket", + mode: "read-write" + }' > "${BUCKET_JSON}" +fi + +# Fetch current dataset config so we can merge (preserve other buckets if any) +CURRENT_DATASET="/tmp/osmo_dataset_current.json" +if osmo_curl GET "http://localhost:8080/api/configs/dataset" 2>/dev/null | jq -r '.configs_dict // . | if type == "object" then . else empty end' > "${CURRENT_DATASET}" 2>/dev/null && [[ -s "${CURRENT_DATASET}" ]]; then + # Merge: add/overwrite our bucket and set default_bucket (users can omit bucket prefix) + UPDATED_DATASET="/tmp/osmo_dataset_updated.json" + jq --arg name "$DATASET_BUCKET_NAME" \ + --slurpfile bucket "${BUCKET_JSON}" \ + '.buckets[$name] = $bucket[0] | .default_bucket = $name' \ + "${CURRENT_DATASET}" > "${UPDATED_DATASET}" +else + # No existing config: create new with single bucket and set as default_bucket + UPDATED_DATASET="/tmp/osmo_dataset_updated.json" + jq -n --arg name "$DATASET_BUCKET_NAME" \ + --slurpfile bucket "${BUCKET_JSON}" \ + '{ buckets: { ($name): $bucket[0] }, default_bucket: $name }' \ + > "${UPDATED_DATASET}" +fi + +if osmo_config_update DATASET "${UPDATED_DATASET}" "Register Nebius bucket and set as default dataset bucket"; then + log_success "Dataset bucket configured and set as default" +else + log_error "Failed to configure dataset bucket" + rm -f "${BUCKET_JSON}" "${CURRENT_DATASET}" "${UPDATED_DATASET}" + exit 1 
+fi + +rm -f "${BUCKET_JSON}" "${CURRENT_DATASET}" "${UPDATED_DATASET}" + +# ----------------------------------------------------------------------------- +# Verify +# ----------------------------------------------------------------------------- +log_info "Verifying..." +echo "" +osmo_curl GET "http://localhost:8080/api/configs/dataset" 2>/dev/null | jq '.configs_dict // .' || true + +cleanup_port_forward +trap - EXIT + +echo "" +echo "========================================" +log_success "OSMO dataset bucket configuration complete!" +echo "========================================" +echo "" +echo "Bucket '${DATASET_BUCKET_NAME}' is registered and set as the default bucket." +echo " dataset_path: ${DATASET_PATH}" +echo " default_bucket: ${DATASET_BUCKET_NAME}" +echo "" +echo "With default_bucket set, you can reference datasets without the bucket prefix:" +echo " my-dataset:latest (instead of ${DATASET_BUCKET_NAME}/my-dataset:latest)" +echo "" +echo "Usage:" +echo " osmo profile set bucket ${DATASET_BUCKET_NAME}" +echo " osmo bucket list" +echo " osmo dataset upload my-dataset:latest ./data" +echo "" diff --git a/applications/osmo/deploy/example/002-setup/12-verify-installation.sh b/applications/osmo/deploy/example/002-setup/12-verify-installation.sh new file mode 100755 index 000000000..624832d7a --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/12-verify-installation.sh @@ -0,0 +1,426 @@ +#!/bin/bash +# ============================================================================= +# OSMO Installation Verification Script +# ============================================================================= +# Checks that all required components are properly configured: +# 1. GPU Operator running with driver enabled, version 580.95.05 +# 2. /mnt/data mounted on all nodes +# 3. 64Gi /dev/shm pod template configured in OSMO +# 4. Redis sized correctly (8 vCPU, ~52.82Gi mem, 50Gi PVC) +# 5. max_num_tasks >= 200 in WORKFLOW config +# 6. 
Platform name is not the default "gpu" +# +# Prerequisites: +# - kubectl configured and connected to the target cluster +# (run: nebius mk8s cluster get-credentials --id --external) +# - helm CLI installed (for GPU Operator checks) +# - jq installed +# - curl installed +# - OSMO CLI installed and accessible (for osmo login) +# - Port 8080 available locally (used for port-forward to OSMO service) +# - NEBIUS_REGION set (run: source ../000-prerequisites/nebius-env-init.sh) +# +# Usage: +# ./12-verify-installation.sh +# +# Environment variables (optional overrides): +# OSMO_URL OSMO API URL (default: http://localhost:8080) +# OSMO_NAMESPACE Namespace where OSMO is deployed (default: osmo) +# GPU_OPERATOR_NAMESPACE Namespace for GPU Operator (default: gpu-operator) +# EXPECTED_DRIVER_VERSION Expected NVIDIA driver version (default: 580.95.05) +# ============================================================================= + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +OSMO_URL="${OSMO_URL:-http://localhost:8080}" +OSMO_NAMESPACE="${OSMO_NAMESPACE:-osmo}" +EXPECTED_DRIVER_VERSION="${EXPECTED_DRIVER_VERSION:-580.95.05}" +MIN_REDIS_CPU=8 +MIN_REDIS_MEMORY_GI=50 +MIN_REDIS_PVC_GI=50 +MIN_MAX_NUM_TASKS=200 +EXPECTED_SHM_SIZE="64Gi" + +PASS=0 +FAIL=0 +WARN=0 + +check_pass() { + ((PASS++)) + log_success "$1" +} + +check_fail() { + ((FAIL++)) + log_error "$1" +} + +check_warn() { + ((WARN++)) + log_warning "$1" +} + +echo "" +echo "========================================" +echo " OSMO Installation Verification" +echo "========================================" +echo "" + +# ----------------------------------------------------------------------------- +# Prerequisite checks +# ----------------------------------------------------------------------------- +log_info "Checking prerequisites..." + +PREREQ_OK=true + +if ! command -v kubectl &>/dev/null; then + log_error "kubectl not found. 
Install it and configure cluster access first." + PREREQ_OK=false +fi + +if ! command -v helm &>/dev/null; then + log_error "helm not found. Install helm to check GPU Operator configuration." + PREREQ_OK=false +fi + +if ! command -v jq &>/dev/null; then + log_error "jq not found. Install jq for JSON parsing." + PREREQ_OK=false +fi + +if ! command -v curl &>/dev/null; then + log_error "curl not found." + PREREQ_OK=false +fi + +if [[ "$PREREQ_OK" != "true" ]]; then + log_error "Missing prerequisites. Fix the above and re-run." + return 2>/dev/null || true +fi + +# Verify kubectl can reach the cluster +if ! kubectl cluster-info &>/dev/null; then + log_error "kubectl cannot reach the cluster. Connect first:" + echo " nebius mk8s cluster get-credentials --id --external" + return 2>/dev/null || true +fi + +CLUSTER_CONTEXT=$(kubectl config current-context 2>/dev/null || echo "unknown") +log_info "Connected to cluster: ${CLUSTER_CONTEXT}" +echo "" + +# ============================================================================= +# Check 1: GPU Operator with driver +# ============================================================================= +log_info "--- Check 1: GPU Operator & Driver ---" + +# Check GPU Operator is deployed +if helm list -n "${GPU_OPERATOR_NAMESPACE:-gpu-operator}" 2>/dev/null | grep -q gpu-operator; then + check_pass "GPU Operator helm release found" +else + check_fail "GPU Operator helm release NOT found in namespace ${GPU_OPERATOR_NAMESPACE:-gpu-operator}" +fi + +# Check driver is enabled (not disabled via --set driver.enabled=false) +DRIVER_ENABLED=$(helm get values gpu-operator -n "${GPU_OPERATOR_NAMESPACE:-gpu-operator}" -a -o json 2>/dev/null | jq -r '.driver.enabled // empty' || echo "unknown") +if [[ "$DRIVER_ENABLED" == "true" ]]; then + check_pass "GPU driver is enabled in GPU Operator" +elif [[ "$DRIVER_ENABLED" == "false" ]]; then + check_fail "GPU driver is DISABLED (driver.enabled=false) — driverless images need the operator to manage 
the driver" +else + check_warn "Could not determine if GPU driver is enabled" +fi + +# Check driver version by running nvidia-smi inside a nvidia-driver-daemonset pod +DRIVER_POD=$(kubectl get pods -n "${GPU_OPERATOR_NAMESPACE:-gpu-operator}" \ + -l app=nvidia-driver-daemonset --field-selector=status.phase=Running \ + -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") +if [[ -z "$DRIVER_POD" ]]; then + check_warn "No running nvidia-driver-daemonset pod found — cannot check driver version" +else + DRIVER_NODE=$(kubectl get pod -n "${GPU_OPERATOR_NAMESPACE:-gpu-operator}" "$DRIVER_POD" \ + -o jsonpath='{.spec.nodeName}' 2>/dev/null || echo "unknown") + log_info "Running nvidia-smi in pod ${DRIVER_POD} (node ${DRIVER_NODE})..." + + NVIDIA_SMI_OUTPUT=$(kubectl exec -n "${GPU_OPERATOR_NAMESPACE:-gpu-operator}" "$DRIVER_POD" -- \ + nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1 | tr -d '[:space:]' || echo "") + + if [[ -z "$NVIDIA_SMI_OUTPUT" ]]; then + check_fail "Could not run nvidia-smi in pod ${DRIVER_POD}" + elif [[ "$NVIDIA_SMI_OUTPUT" == "$EXPECTED_DRIVER_VERSION" ]]; then + check_pass "nvidia-smi driver version: ${NVIDIA_SMI_OUTPUT} (on ${DRIVER_NODE})" + else + check_fail "nvidia-smi driver version: ${NVIDIA_SMI_OUTPUT} (expected ${EXPECTED_DRIVER_VERSION}, on ${DRIVER_NODE})" + fi +fi + +# Check nvidia-driver-daemonset pods are running +DRIVER_PODS=$(kubectl get pods -n "${GPU_OPERATOR_NAMESPACE:-gpu-operator}" -l app=nvidia-driver-daemonset --no-headers 2>/dev/null | wc -l | tr -d ' ') +DRIVER_PODS_READY=$(kubectl get pods -n "${GPU_OPERATOR_NAMESPACE:-gpu-operator}" -l app=nvidia-driver-daemonset --no-headers 2>/dev/null | grep -c "Running" || echo "0") +if [[ "$DRIVER_PODS" -gt 0 ]]; then + if [[ "$DRIVER_PODS_READY" -eq "$DRIVER_PODS" ]]; then + check_pass "nvidia-driver-daemonset: ${DRIVER_PODS_READY}/${DRIVER_PODS} pods Running" + else + check_fail "nvidia-driver-daemonset: 
${DRIVER_PODS_READY}/${DRIVER_PODS} pods Running" + fi +else + check_warn "No nvidia-driver-daemonset pods found (expected when driver.enabled=true)" +fi + +# ============================================================================= +# Check 2: /mnt/data mounted on a GPU node +# ============================================================================= +echo "" +log_info "--- Check 2: /mnt/data on GPU node ---" + +GPU_NODE=$(kubectl get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) +if [[ -z "$GPU_NODE" ]]; then + check_warn "No GPU nodes found — cannot verify /mnt/data" +else + POD_NAME="mnt-check-verify" + kubectl delete pod "$POD_NAME" --force --grace-period=0 &>/dev/null || true + kubectl run "$POD_NAME" --image=busybox --restart=Never \ + --overrides="{ + \"spec\":{ + \"nodeName\":\"${GPU_NODE}\", + \"containers\":[{ + \"name\":\"check\", + \"image\":\"busybox\", + \"command\":[\"sh\",\"-c\",\"grep -q ' /host-mnt/data ' /host-proc/mounts && echo MOUNTED || echo NOT_MOUNTED\"], + \"volumeMounts\":[ + {\"name\":\"host-proc\",\"mountPath\":\"/host-proc\",\"readOnly\":true}, + {\"name\":\"host-mnt\",\"mountPath\":\"/host-mnt\",\"readOnly\":true} + ] + }], + \"volumes\":[ + {\"name\":\"host-proc\",\"hostPath\":{\"path\":\"/proc\",\"type\":\"Directory\"}}, + {\"name\":\"host-mnt\",\"hostPath\":{\"path\":\"/mnt\",\"type\":\"Directory\"}} + ], + \"tolerations\":[{\"operator\":\"Exists\"}], + \"restartPolicy\":\"Never\" + } + }" &>/dev/null + + kubectl wait --for=jsonpath='{.status.phase}'=Succeeded "pod/$POD_NAME" --timeout=30s &>/dev/null + + RESULT=$(kubectl logs "$POD_NAME" 2>/dev/null | tail -1) + kubectl delete pod "$POD_NAME" --force --grace-period=0 &>/dev/null || true + + if [[ "$RESULT" == "MOUNTED" ]]; then + check_pass "GPU node ${GPU_NODE}: /mnt/data mounted" + elif [[ "$RESULT" == "NOT_MOUNTED" ]]; then + check_fail "GPU node ${GPU_NODE}: /mnt/data NOT mounted" + else + check_warn "GPU node ${GPU_NODE}: 
could not verify /mnt/data" + fi +fi + +# ============================================================================= +# Check 3–6 require OSMO API access via port-forward +# ============================================================================= +echo "" +log_info "--- Setting up OSMO API access ---" + +# Start port-forward for OSMO API checks +start_osmo_port_forward "${OSMO_NAMESPACE}" 8080 + +cleanup_port_forward() { + if [[ -n "${PORT_FORWARD_PID:-}" ]]; then + kill $PORT_FORWARD_PID 2>/dev/null || true + wait $PORT_FORWARD_PID 2>/dev/null || true + fi +} +trap cleanup_port_forward EXIT RETURN + +# Wait for port-forward +max_wait=15 +elapsed=0 +while ! curl -s -o /dev/null -w "%{http_code}" "${OSMO_URL}/api/version" 2>/dev/null | grep -q "200\|401\|403"; do + sleep 1 + ((elapsed += 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s" + log_error "Skipping OSMO API checks (3–6). Ensure OSMO is running in namespace '${OSMO_NAMESPACE}'." 
+ cleanup_port_forward + # Print partial summary and return + echo "" + echo "========================================" + echo " Verification Summary (partial)" + echo "========================================" + echo "" + echo -e " ${GREEN}Passed: ${PASS}${NC}" + echo -e " ${RED}Failed: ${FAIL}${NC}" + echo -e " ${YELLOW}Warnings: ${WARN}${NC}" + echo -e " Skipped: checks 3–6 (OSMO API unreachable)" + echo "" + return 2>/dev/null || true + fi +done + +osmo_login 8080 || true + +# ============================================================================= +# Check 3: Shared memory pod template (64Gi /dev/shm) +# ============================================================================= +echo "" +log_info "--- Check 3: Shared memory pod template ---" + +SHM_TEMPLATE=$(osmo_curl GET "${OSMO_URL}/api/configs/pod_template/shm" 2>/dev/null || echo "") +if [[ -n "$SHM_TEMPLATE" && "$SHM_TEMPLATE" != "null" && "$SHM_TEMPLATE" != "{}" ]]; then + # Check sizeLimit (API may return under .configs.spec or .spec) + SHM_SIZE=$(echo "$SHM_TEMPLATE" | jq -r '(.configs.spec // .spec).volumes[]? | select(.name=="shm") | .emptyDir.sizeLimit // empty' 2>/dev/null || echo "") + if [[ "$SHM_SIZE" == "$EXPECTED_SHM_SIZE" ]]; then + check_pass "shm pod template: sizeLimit=${SHM_SIZE}" + elif [[ -n "$SHM_SIZE" ]]; then + check_fail "shm pod template: sizeLimit=${SHM_SIZE} (expected ${EXPECTED_SHM_SIZE})" + else + check_warn "shm pod template exists but could not read sizeLimit" + fi + + # Check /dev/shm mount + SHM_MOUNT=$(echo "$SHM_TEMPLATE" | jq -r '(.configs.spec // .spec).containers[]?.volumeMounts[]? 
| select(.mountPath=="/dev/shm") | .name // empty' 2>/dev/null || echo "") + if [[ "$SHM_MOUNT" == "shm" ]]; then + check_pass "shm pod template: /dev/shm volumeMount configured" + else + check_fail "shm pod template: /dev/shm volumeMount NOT found" + fi +else + check_fail "shm pod template NOT found in OSMO" +fi + +# ============================================================================= +# Check 4: Redis configuration (8 vCPU, ~52.82Gi mem, 50Gi PVC) +# ============================================================================= +echo "" +log_info "--- Check 4: Redis resources ---" + +REDIS_STS=$(kubectl get statefulset redis-master -n "${OSMO_NAMESPACE}" -o json 2>/dev/null || echo "") +if [[ -z "$REDIS_STS" || "$REDIS_STS" == "" ]]; then + check_fail "Redis statefulset 'redis-master' not found in namespace ${OSMO_NAMESPACE}" +else + # CPU requests + REDIS_CPU=$(echo "$REDIS_STS" | jq -r '.spec.template.spec.containers[] | select(.name=="redis") | .resources.requests.cpu // empty' 2>/dev/null || echo "") + REDIS_CPU_NUM=$(echo "$REDIS_CPU" | sed 's/m$//' || echo "0") + if [[ "$REDIS_CPU" =~ m$ ]]; then + REDIS_CPU_CORES=$((REDIS_CPU_NUM / 1000)) + else + REDIS_CPU_CORES=$REDIS_CPU_NUM + fi + + if [[ "$REDIS_CPU_CORES" -ge "$MIN_REDIS_CPU" ]] 2>/dev/null; then + check_pass "Redis CPU requests: ${REDIS_CPU} (>= ${MIN_REDIS_CPU} cores)" + else + check_fail "Redis CPU requests: ${REDIS_CPU} (expected >= ${MIN_REDIS_CPU} cores)" + fi + + # Memory requests + REDIS_MEM=$(echo "$REDIS_STS" | jq -r '.spec.template.spec.containers[] | select(.name=="redis") | .resources.requests.memory // empty' 2>/dev/null || echo "") + REDIS_MEM_NUM=$(echo "$REDIS_MEM" | sed -E 's/[A-Za-z]+$//') + REDIS_MEM_UNIT=$(echo "$REDIS_MEM" | sed -E 's/^[0-9.]+//') + case "$REDIS_MEM_UNIT" in + Gi) REDIS_MEM_GI=$REDIS_MEM_NUM ;; + Mi) REDIS_MEM_GI=$((REDIS_MEM_NUM / 1024)) ;; + *) REDIS_MEM_GI=0 ;; + esac + + if [[ "$REDIS_MEM_GI" -ge "$MIN_REDIS_MEMORY_GI" ]] 2>/dev/null; then + 
check_pass "Redis memory requests: ${REDIS_MEM} (>= ${MIN_REDIS_MEMORY_GI}Gi)" + else + check_fail "Redis memory requests: ${REDIS_MEM} (expected >= ${MIN_REDIS_MEMORY_GI}Gi)" + fi + + # PVC size + REDIS_PVC_SIZE=$(kubectl get pvc -n "${OSMO_NAMESPACE}" -l app.kubernetes.io/name=redis --no-headers -o jsonpath='{.items[0].spec.resources.requests.storage}' 2>/dev/null || echo "") + REDIS_PVC_NUM=$(echo "$REDIS_PVC_SIZE" | sed -E 's/[A-Za-z]+$//') + REDIS_PVC_UNIT=$(echo "$REDIS_PVC_SIZE" | sed -E 's/^[0-9.]+//') + case "$REDIS_PVC_UNIT" in + Gi) REDIS_PVC_GI=$REDIS_PVC_NUM ;; + Ti) REDIS_PVC_GI=$((REDIS_PVC_NUM * 1024)) ;; + *) REDIS_PVC_GI=0 ;; + esac + + if [[ "$REDIS_PVC_GI" -ge "$MIN_REDIS_PVC_GI" ]] 2>/dev/null; then + check_pass "Redis PVC size: ${REDIS_PVC_SIZE} (>= ${MIN_REDIS_PVC_GI}Gi)" + else + check_fail "Redis PVC size: ${REDIS_PVC_SIZE:-unknown} (expected >= ${MIN_REDIS_PVC_GI}Gi)" + fi +fi + +# ============================================================================= +# Check 5: max_num_tasks >= 200 +# ============================================================================= +echo "" +log_info "--- Check 5: WORKFLOW max_num_tasks ---" + +WORKFLOW_CONFIG=$(osmo_curl GET "${OSMO_URL}/api/configs/workflow" 2>/dev/null || echo "") +if [[ -n "$WORKFLOW_CONFIG" && "$WORKFLOW_CONFIG" != "null" ]]; then + MAX_NUM_TASKS=$(echo "$WORKFLOW_CONFIG" | jq -r '.max_num_tasks // .configs_dict.max_num_tasks // empty' 2>/dev/null || echo "") + if [[ -z "$MAX_NUM_TASKS" ]]; then + check_fail "max_num_tasks not set in WORKFLOW config (default is too low)" + elif [[ "$MAX_NUM_TASKS" -ge "$MIN_MAX_NUM_TASKS" ]] 2>/dev/null; then + check_pass "max_num_tasks: ${MAX_NUM_TASKS} (>= ${MIN_MAX_NUM_TASKS})" + else + check_fail "max_num_tasks: ${MAX_NUM_TASKS} (expected >= ${MIN_MAX_NUM_TASKS})" + fi +else + check_fail "Could not retrieve WORKFLOW config from OSMO API" +fi + +# ============================================================================= +# Check 6: 
Platform name is not default "gpu" +# ============================================================================= +echo "" +log_info "--- Check 6: Platform naming ---" + +POOL_CONFIG=$(osmo_curl GET "${OSMO_URL}/api/configs/pool/default" 2>/dev/null || echo "") +if [[ -n "$POOL_CONFIG" && "$POOL_CONFIG" != "null" ]]; then + PLATFORM_NAMES=$(echo "$POOL_CONFIG" | jq -r '.platforms // {} | keys[]' 2>/dev/null || echo "") + if [[ -z "$PLATFORM_NAMES" ]]; then + check_fail "No platforms found in default pool" + else + GPU_TYPE_FOUND=false + GENERIC_LIST="" + for NAME in $PLATFORM_NAMES; do + # Platform name must identify the GPU type (e.g. H100, H200, B200, L40S) + if echo "$NAME" | grep -qiE '^(h100|h200|b200|b300|l40s|a100|a10)'; then + check_pass "Platform '${NAME}': name identifies GPU type" + GPU_TYPE_FOUND=true + else + GENERIC_LIST="${GENERIC_LIST} ${NAME}" + fi + done + if [[ "$GPU_TYPE_FOUND" == "false" ]]; then + check_fail "No GPU-type platform found (only generic:${GENERIC_LIST}) — create one named after the GPU (e.g. H100)" + elif [[ -n "$GENERIC_LIST" ]]; then + log_info "Also found generic platforms:${GENERIC_LIST} (cannot be deleted, ignored)" + fi + fi +else + check_fail "Could not retrieve pool config from OSMO API" +fi + +# ============================================================================= +# Summary +# ============================================================================= +cleanup_port_forward +trap - EXIT RETURN + +echo "" +echo "========================================" +echo " Verification Summary" +echo "========================================" +echo "" +echo -e " ${GREEN}Passed: ${PASS}${NC}" +echo -e " ${RED}Failed: ${FAIL}${NC}" +echo -e " ${YELLOW}Warnings: ${WARN}${NC}" +echo "" + +if [[ "$FAIL" -gt 0 ]]; then + log_error "Installation has ${FAIL} issue(s) that need to be fixed." +elif [[ "$WARN" -gt 0 ]]; then + log_warning "Installation looks OK but has ${WARN} warning(s) to review." 
+else + log_success "All checks passed!" +fi diff --git a/applications/osmo/deploy/example/002-setup/99a-connect-remote-control-plane.sh b/applications/osmo/deploy/example/002-setup/99a-connect-remote-control-plane.sh new file mode 100755 index 000000000..824c2e1da --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/99a-connect-remote-control-plane.sh @@ -0,0 +1,190 @@ +#!/bin/bash +# +# Connect OSMO Backend to a Remote Control Plane +# +# Reconfigures an already-deployed backend operator to point to a remote +# control plane at a different URL (e.g. in another K8s cluster). +# +# Required inputs (env vars or positional args): +# REMOTE_CONTROL_PLANE_URL — external HTTPS URL of the remote control plane +# REMOTE_SERVICE_TOKEN — service token from the remote control plane +# +# Usage: +# export REMOTE_CONTROL_PLANE_URL=https://os1.eu-north1.osmo.nebius.cloud +# export REMOTE_SERVICE_TOKEN= +# ./99a-connect-remote-control-plane.sh +# +# Or with positional args: +# ./99a-connect-remote-control-plane.sh +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +echo "" +echo "========================================" +echo " Connect Backend to Remote Control Plane" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 +check_helm || exit 1 + +# ----------------------------------------------------------------------------- +# Configuration +# ----------------------------------------------------------------------------- +OSMO_OPERATOR_NAMESPACE="${OSMO_OPERATOR_NAMESPACE:-osmo-operator}" +OSMO_WORKFLOWS_NAMESPACE="${OSMO_WORKFLOWS_NAMESPACE:-osmo-workflows}" +OSMO_IMAGE_TAG="${OSMO_IMAGE_TAG:-6.0.0}" +BACKEND_NAME="${OSMO_BACKEND_NAME:-default}" + +# Accept positional args or env vars +REMOTE_CONTROL_PLANE_URL="${1:-${REMOTE_CONTROL_PLANE_URL:-}}" +REMOTE_SERVICE_TOKEN="${2:-${REMOTE_SERVICE_TOKEN:-}}" + +# 
Validate required inputs
+if [[ -z "$REMOTE_CONTROL_PLANE_URL" ]]; then
+ log_error "REMOTE_CONTROL_PLANE_URL is required."
+ echo ""
+ echo "Usage:"
+ echo " export REMOTE_CONTROL_PLANE_URL=https://os1.eu-north1.osmo.nebius.cloud"
+ echo " export REMOTE_SERVICE_TOKEN=<service-token>"
+ echo " ./99a-connect-remote-control-plane.sh"
+ echo ""
+ echo " Or: ./99a-connect-remote-control-plane.sh <control-plane-url> <service-token>"
+ exit 1
+fi
+
+if [[ -z "$REMOTE_SERVICE_TOKEN" ]]; then
+ log_error "REMOTE_SERVICE_TOKEN is required."
+ echo ""
+ echo "Generate a service token on the remote control plane:"
+ echo " osmo token set backend-token-\$(date +%s) --service --roles osmo-backend --expires-at 2027-01-01"
+ echo ""
+ echo "Then export it:"
+ echo " export REMOTE_SERVICE_TOKEN=<service-token>"
+ exit 1
+fi
+
+# Strip trailing slash from URL
+REMOTE_CONTROL_PLANE_URL="${REMOTE_CONTROL_PLANE_URL%/}"
+
+log_info "Remote control plane URL: ${REMOTE_CONTROL_PLANE_URL}"
+log_info "Backend name: ${BACKEND_NAME}"
+log_info "Operator namespace: ${OSMO_OPERATOR_NAMESPACE}"
+
+# -----------------------------------------------------------------------------
+# Verify kubectl is connected
+# -----------------------------------------------------------------------------
+log_info "Current kubectl context:"
+kubectl config current-context
+echo ""
+
+# -----------------------------------------------------------------------------
+# Test remote control plane reachability
+# -----------------------------------------------------------------------------
+log_info "Testing remote control plane reachability..."
+
+HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 10 "${REMOTE_CONTROL_PLANE_URL}/api/version" 2>/dev/null || echo "000")
+
+if [[ "$HTTP_CODE" == "000" ]]; then
+ log_error "Cannot reach ${REMOTE_CONTROL_PLANE_URL}/api/version (connection failed)"
+ log_error "Check the URL and ensure the remote control plane is accessible from this network." 
+ exit 1
+elif [[ "$HTTP_CODE" =~ ^(200|401|403)$ ]]; then
+ log_success "Remote control plane reachable (HTTP ${HTTP_CODE})"
+else
+ log_warning "Remote control plane returned HTTP ${HTTP_CODE} — proceeding anyway"
+fi
+
+# -----------------------------------------------------------------------------
+# Check that osmo-operator release exists
+# -----------------------------------------------------------------------------
+log_info "Checking for existing osmo-operator Helm release..."
+
+if ! helm status osmo-operator -n "${OSMO_OPERATOR_NAMESPACE}" &>/dev/null; then
+ log_error "No osmo-operator Helm release found in namespace ${OSMO_OPERATOR_NAMESPACE}"
+ log_error "Deploy the backend operator first: ./05-deploy-osmo-backend.sh"
+ exit 1
+fi
+log_success "osmo-operator release found"
+
+# -----------------------------------------------------------------------------
+# Create/update the osmo-operator-token secret
+# -----------------------------------------------------------------------------
+log_info "Updating osmo-operator-token secret..."
+
+kubectl create secret generic osmo-operator-token \
+ --namespace "${OSMO_OPERATOR_NAMESPACE}" \
+ --from-literal=token="${REMOTE_SERVICE_TOKEN}" \
+ --dry-run=client -o yaml | kubectl apply -f -
+
+log_success "osmo-operator-token secret updated"
+
+# -----------------------------------------------------------------------------
+# Helm upgrade — update global.serviceUrl, keep everything else
+# -----------------------------------------------------------------------------
+log_info "Updating osmo-operator Helm release with remote service URL..." 
+ +helm upgrade osmo-operator osmo/backend-operator \ + --namespace "${OSMO_OPERATOR_NAMESPACE}" \ + --reuse-values \ + --set "global.serviceUrl=${REMOTE_CONTROL_PLANE_URL}" \ + --wait \ + --timeout 5m + +log_success "Helm release updated with serviceUrl=${REMOTE_CONTROL_PLANE_URL}" + +# ----------------------------------------------------------------------------- +# Wait for backend-listener pod to restart +# ----------------------------------------------------------------------------- +log_info "Waiting for backend-listener pod to be ready..." + +# Give the rollout a moment to start +sleep 3 + +# Wait for all pods in the operator namespace to be ready +kubectl rollout status deployment -n "${OSMO_OPERATOR_NAMESPACE}" --timeout=120s 2>/dev/null || true + +# Check backend-listener pod status +LISTENER_POD=$(kubectl get pods -n "${OSMO_OPERATOR_NAMESPACE}" -l app=backend-listener -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + +if [[ -n "$LISTENER_POD" ]]; then + log_info "Checking backend-listener logs for connection status..." + # Wait a few seconds for connection attempt + sleep 5 + kubectl logs -n "${OSMO_OPERATOR_NAMESPACE}" "$LISTENER_POD" --tail=20 2>/dev/null || true + echo "" +else + log_warning "No backend-listener pod found — check deployment status" +fi + +# ----------------------------------------------------------------------------- +# Print status +# ----------------------------------------------------------------------------- +echo "" +kubectl get pods -n "${OSMO_OPERATOR_NAMESPACE}" + +echo "" +echo "========================================" +log_success "Backend connected to remote control plane!" 
+echo "========================================" +echo "" +echo "Remote Control Plane: ${REMOTE_CONTROL_PLANE_URL}" +echo "Backend Name: ${BACKEND_NAME}" +echo "Operator Namespace: ${OSMO_OPERATOR_NAMESPACE}" +echo "" +echo "To verify the backend is online on the remote control plane:" +echo "" +echo " curl ${REMOTE_CONTROL_PLANE_URL}/api/configs/backend" +echo "" +echo " Or using osmo CLI (logged into the remote control plane):" +echo " osmo config show BACKEND ${BACKEND_NAME}" +echo "" +echo "To check backend-listener logs:" +echo " kubectl logs -n ${OSMO_OPERATOR_NAMESPACE} -l app=backend-listener -f" +echo "" diff --git a/applications/osmo/deploy/example/002-setup/99b-show-keycloak-credentials.sh b/applications/osmo/deploy/example/002-setup/99b-show-keycloak-credentials.sh new file mode 100755 index 000000000..27716526d --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/99b-show-keycloak-credentials.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Show Keycloak admin credentials +# Retrieves the admin password from the keycloak-admin-secret Kubernetes secret. + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +OSMO_NAMESPACE="${OSMO_NAMESPACE:-osmo}" + +echo "" +echo "========================================" +echo " Keycloak Admin Credentials" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 + +# Retrieve admin password from Kubernetes secret +log_info "Retrieving Keycloak admin password..." + +ADMIN_PASSWORD=$(kubectl get secret keycloak-admin-secret -n "${OSMO_NAMESPACE}" \ + -o jsonpath='{.data.password}' 2>/dev/null | base64 -d 2>/dev/null) || true + +if [[ -z "${ADMIN_PASSWORD}" ]]; then + log_error "Could not retrieve Keycloak admin password from secret 'keycloak-admin-secret' in namespace '${OSMO_NAMESPACE}'." + echo " Make sure Keycloak has been deployed (04-deploy-osmo-control-plane.sh)." 
+ exit 1 +fi + +# Determine Keycloak URL +if [[ -n "${KEYCLOAK_HOSTNAME:-}" ]]; then + KEYCLOAK_URL="https://${KEYCLOAK_HOSTNAME}" +elif [[ -n "${OSMO_INGRESS_HOSTNAME:-}" ]]; then + KEYCLOAK_URL="https://auth-${OSMO_INGRESS_HOSTNAME}" +else + KEYCLOAK_URL="(unknown — set KEYCLOAK_HOSTNAME or OSMO_INGRESS_HOSTNAME)" +fi + +echo "" +echo " URL: ${KEYCLOAK_URL}" +echo " Username: admin" +echo " Password: ${ADMIN_PASSWORD}" +echo "" diff --git a/applications/osmo/deploy/example/002-setup/README.md b/applications/osmo/deploy/example/002-setup/README.md new file mode 100755 index 000000000..05ec8b55c --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/README.md @@ -0,0 +1,363 @@ +# Kubernetes Setup Scripts + +This directory contains scripts for configuring the Kubernetes cluster with GPU infrastructure and OSMO components. + +## Prerequisites + +1. Complete infrastructure deployment (001-iac) +2. kubectl configured with cluster access: + ```bash + nebius mk8s cluster get-credentials --id --external + ``` + +## Deployment Order + +Run scripts in order: + +```bash +# 1. GPU Infrastructure (GPU Operator, Network Operator, KAI Scheduler) +./01-deploy-gpu-infrastructure.sh + +# 2. Observability (Prometheus, Grafana, Loki) +./02-deploy-observability.sh + +# 3. NGINX Ingress Controller (required – provides routing for OSMO services) +./03-deploy-nginx-ingress.sh + +# 4. OSMO Control Plane +./04-deploy-osmo-control-plane.sh + +# 5. OSMO Backend +./05-deploy-osmo-backend.sh + +# 6. Configure Storage (requires port-forward, see main README) +./06-configure-storage.sh + +# 7. 
Configure GPU Platform (required for GPU workflows) +./08-configure-gpu-platform.sh +``` + +## Scripts + +| Script | Purpose | Duration | +|--------|---------|----------| +| `01-deploy-gpu-infrastructure.sh` | GPU Operator, Network Operator, KAI Scheduler | ~15 min | +| `02-deploy-observability.sh` | Prometheus, Grafana, Loki, Promtail | ~10 min | +| `03-deploy-nginx-ingress.sh` | NGINX Ingress Controller (routing for OSMO services) | ~2 min | +| `04-deploy-osmo-control-plane.sh` | OSMO Control Plane, Ingress resources, database secrets, service URL | ~5 min | +| `05-deploy-osmo-backend.sh` | OSMO Backend operator | ~5 min | +| `06-configure-storage.sh` | Configure S3-compatible storage for workflow logs/data | ~1 min | +| `07-configure-service-url.sh` | Reconfigure service URL manually (usually not needed) | ~1 min | +| `08-configure-gpu-platform.sh` | Configure GPU platform with tolerations/node selector | ~1 min | + +## Configuration + +### Helm Values + +Customize deployments by editing files in `values/`: + +| File | Component | +|------|-----------| +| `gpu-operator.yaml` | NVIDIA GPU Operator | +| `network-operator.yaml` | NVIDIA Network Operator | +| `kai-scheduler.yaml` | KAI GPU Scheduler | +| `prometheus.yaml` | Prometheus + Grafana | +| `loki.yaml` | Loki Log Aggregation | +| `promtail.yaml` | Log Collection | + +### Environment Variables + +Configure via `defaults.sh` or export before running: + +```bash +# Namespaces +GPU_OPERATOR_NAMESPACE="gpu-operator" +NETWORK_OPERATOR_NAMESPACE="network-operator" +MONITORING_NAMESPACE="monitoring" +OSMO_NAMESPACE="osmo" + +# Grafana password (auto-generated if empty) +GRAFANA_ADMIN_PASSWORD="" + +# NGINX Ingress (deploy 03-deploy-nginx-ingress.sh before 04-deploy-osmo-control-plane.sh) +OSMO_INGRESS_HOSTNAME="" # hostname for Ingress rules (e.g. 
osmo.example.com); leave empty for IP-based access +OSMO_INGRESS_BASE_URL="" # override for service_base_url; auto-detected from LoadBalancer if empty +``` + +### Secrets from MysteryBox + +If you ran `secrets-init.sh` in the prerequisites step, the following environment variables are set: + +| Variable | Description | +|----------|-------------| +| `TF_VAR_postgresql_mysterybox_secret_id` | MysteryBox secret ID for PostgreSQL password | +| `TF_VAR_mek_mysterybox_secret_id` | MysteryBox secret ID for MEK (Master Encryption Key) | + +The `04-deploy-osmo-control-plane.sh` script automatically reads these secrets from MysteryBox. This keeps sensitive credentials out of Terraform state and provides a secure secrets management workflow. + +**Secret retrieval order:** +1. **MysteryBox** (if secret ID is set via `TF_VAR_*` or `OSMO_*` env vars) +2. **Terraform outputs** (fallback) +3. **Environment variables** (fallback) +4. **Interactive prompt** (last resort) + +To manually retrieve secrets from MysteryBox: +```bash +# PostgreSQL password +nebius mysterybox v1 payload get-by-key \ + --secret-id $TF_VAR_postgresql_mysterybox_secret_id \ + --key password --format json | jq -r '.data.string_value' + +# MEK (Master Encryption Key) +nebius mysterybox v1 payload get-by-key \ + --secret-id $TF_VAR_mek_mysterybox_secret_id \ + --key mek --format json | jq -r '.data.string_value' +``` + +## Accessing Services + +### Grafana Dashboard + +```bash +kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80 +# Open http://localhost:3000 +# User: admin +# Password: (shown during deployment or in defaults.sh) +``` + +### Prometheus + +```bash +kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 +# Open http://localhost:9090 +``` + +### OSMO API + +```bash +kubectl port-forward -n osmo svc/osmo-service 8080:80 +# Open http://localhost:8080 +``` + +### OSMO Web UI + +```bash +kubectl port-forward -n osmo svc/osmo-ui 8081:80 +# Open 
http://localhost:8081 +``` + +## Cleanup + +Run cleanup scripts in reverse order: + +```bash +cd cleanup + +# Remove OSMO +./uninstall-osmo-backend.sh +./uninstall-osmo-control-plane.sh + +# Remove observability +./uninstall-observability.sh + +# Remove GPU infrastructure +./uninstall-gpu-infrastructure.sh +``` + +## Configure OSMO GPU Platform + +After deploying OSMO backend, configure the GPU platform so OSMO can schedule workloads on GPU nodes. + +### Why is this needed? + +Nebius GPU nodes have a taint `nvidia.com/gpu=true:NoSchedule` that prevents pods from being scheduled unless they have matching tolerations. OSMO needs to be configured with: + +1. A **pod template** with GPU tolerations and node selector +2. A **GPU platform** that references this pod template + +### Option 1: Run the Configuration Script (Recommended) + +```bash +./08-configure-gpu-platform.sh +``` + +### Option 2: Manual Configuration via API + +With port-forward running (`kubectl port-forward -n osmo svc/osmo-service 8080:80`): + +**Step 1: Create GPU Pod Template** + +```bash +curl -X PUT 'http://localhost:8080/api/configs/pod_template/gpu_tolerations' \ + -H 'Content-Type: application/json' \ + -d @gpu_pod_template.json +``` + +Where `gpu_pod_template.json` contains: + +```json +{ + "configs": { + "spec": { + "tolerations": [ + { + "key": "nvidia.com/gpu", + "operator": "Exists", + "effect": "NoSchedule" + } + ], + "nodeSelector": { + "nvidia.com/gpu.present": "true" + } + } + } +} +``` + +**Step 2: Create GPU Platform** + +```bash +curl -X PUT 'http://localhost:8080/api/configs/pool/default/platform/gpu' \ + -H 'Content-Type: application/json' \ + -d @gpu_platform_update.json +``` + +Where `gpu_platform_update.json` contains: + +```json +{ + "configs": { + "description": "GPU platform for L40S nodes", + "host_network_allowed": false, + "privileged_allowed": false, + "allowed_mounts": [], + "default_mounts": [], + "default_variables": { + "USER_GPU": 1 + }, + "resource_validations": 
[], + "override_pod_template": ["gpu_tolerations"] + } +} +``` + +### Verify Configuration + +```bash +# Check pod templates +curl -s http://localhost:8080/api/configs/pod_template | jq 'keys' +# Should include: "gpu_tolerations" + +# Check GPU platform +curl -s http://localhost:8080/api/configs/pool/default | jq '.platforms.gpu' + +# Check resources (GPU nodes should now be visible) +curl -s http://localhost:8080/api/resources | jq '.resources[] | {name: .name, gpu: .allocatable_fields.gpu}' +``` + +### Using GPU in Workflows + +Specify `platform: gpu` in your OSMO workflow: + +```yaml +workflow: + name: my-gpu-job + resources: + gpu-resource: + platform: gpu # <-- Selects GPU platform with tolerations + gpu: 1 + memory: 4Gi + tasks: + - name: train + image: nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04 + command: ["nvidia-smi"] + resource: gpu-resource +``` + +## Troubleshooting + +### GPU Nodes Not Ready + +1. Check GPU operator pods: + ```bash + kubectl get pods -n gpu-operator + ``` + +2. Check node labels: + ```bash + kubectl get nodes -l node-type=gpu --show-labels + ``` + +3. Check DCGM exporter: + ```bash + kubectl logs -n gpu-operator -l app=nvidia-dcgm-exporter + ``` + +### Pods Pending on GPU Nodes + +1. Verify tolerations: + ```bash + kubectl describe pod | grep -A5 Tolerations + ``` + +2. Check node taints: + ```bash + kubectl describe node | grep Taints + ``` + +### InfiniBand Issues + +1. Check Network Operator: + ```bash + kubectl get pods -n network-operator + ``` + +2. Verify RDMA devices: + ```bash + kubectl exec -n gpu-operator -- ibstat + ``` + +### Database Connection Failed + +1. Verify PostgreSQL is accessible: + ```bash + kubectl get secret osmo-database -n osmo -o yaml + ``` + +2. Test connection from a pod: + ```bash + kubectl run pg-test --rm -it --image=postgres:16 -- psql -h -U -d + ``` + +### OSMO Not Seeing GPU Resources + +If OSMO shows 0 GPUs or GPU workflows fail to schedule: + +1. 
Check if GPU platform is configured: + ```bash + curl -s http://localhost:8080/api/configs/pool/default | jq '.platforms | keys' + # Should include "gpu" + ``` + +2. Check if GPU pod template exists: + ```bash + curl -s http://localhost:8080/api/configs/pod_template | jq 'keys' + # Should include "gpu_tolerations" + ``` + +3. Check GPU node labels and taints: + ```bash + kubectl describe node | grep -E 'Taints:|nvidia.com/gpu' + # Should show taint: nvidia.com/gpu=true:NoSchedule + # Should show label: nvidia.com/gpu.present=true + ``` + +4. If missing, run the GPU configuration: + ```bash + ./08-configure-gpu-platform.sh + ``` + +5. Verify OSMO sees GPU resources: + ```bash + curl -s http://localhost:8080/api/resources | jq '.resources[] | select(.allocatable_fields.gpu != null)' + ``` diff --git a/applications/osmo/deploy/example/002-setup/cleanup/uninstall-gpu-infrastructure.sh b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-gpu-infrastructure.sh new file mode 100755 index 000000000..656c9f6d2 --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-gpu-infrastructure.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# +# Uninstall GPU Infrastructure +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +source "${SCRIPT_DIR}/../lib/common.sh" +source "${SCRIPT_DIR}/../defaults.sh" + +echo "" +echo "========================================" +echo " Uninstalling GPU Infrastructure" +echo "========================================" +echo "" + +log_warning "This will remove GPU Operator, Network Operator, and KAI Scheduler" +read_prompt_var "Continue? (y/N)" confirm "" +if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then + log_info "Cancelled" + exit 0 +fi + +log_info "Removing KAI Scheduler..." +helm uninstall kai-scheduler -n "${KAI_SCHEDULER_NAMESPACE}" 2>/dev/null || true +kubectl delete namespace "${KAI_SCHEDULER_NAMESPACE}" --ignore-not-found + +log_info "Removing Network Operator..." 
+helm uninstall network-operator -n "${NETWORK_OPERATOR_NAMESPACE}" 2>/dev/null || true +kubectl delete namespace "${NETWORK_OPERATOR_NAMESPACE}" --ignore-not-found + +log_info "Removing GPU Operator..." +helm uninstall gpu-operator -n "${GPU_OPERATOR_NAMESPACE}" 2>/dev/null || true + +# Remove GPU Operator CRDs +log_info "Removing GPU Operator CRDs..." +kubectl delete crd clusterpolicies.nvidia.com --ignore-not-found +kubectl delete crd nvidiadrivers.nvidia.com --ignore-not-found + +kubectl delete namespace "${GPU_OPERATOR_NAMESPACE}" --ignore-not-found + +log_success "GPU infrastructure uninstalled" diff --git a/applications/osmo/deploy/example/002-setup/cleanup/uninstall-keycloak.sh b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-keycloak.sh new file mode 100755 index 000000000..9a9b14170 --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-keycloak.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# Uninstall Keycloak and disable OSMO authentication +# This removes Keycloak and related secrets. After running this, re-deploy +# OSMO control plane without DEPLOY_KEYCLOAK to switch back to open API mode. +set -e +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")/.." && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +OSMO_NS="${OSMO_NAMESPACE:-osmo}" +KC_TLS_SECRET="${KEYCLOAK_TLS_SECRET_NAME:-osmo-tls-auth}" +INGRESS_NS="${INGRESS_NAMESPACE:-ingress-nginx}" + +echo "" +echo "========================================" +echo " Uninstall Keycloak" +echo "========================================" +echo "" + +check_kubectl || exit 1 +check_helm || exit 1 + +# Step 1: Uninstall Keycloak Helm release +log_info "Uninstalling Keycloak Helm release..." +helm uninstall keycloak --namespace "${OSMO_NS}" 2>/dev/null || log_info "Keycloak Helm release not found (already removed)" + +# Step 2: Delete Keycloak config job and realm ConfigMap +log_info "Cleaning up Keycloak configuration job and ConfigMap..." 
+kubectl delete job keycloak-osmo-setup -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true
+kubectl delete configmap keycloak-realm-json -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true
+
+# Step 3: Delete Keycloak-related secrets
+log_info "Deleting Keycloak secrets..."
+kubectl delete secret keycloak-admin-secret -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true
+kubectl delete secret keycloak-db-secret -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true
+kubectl delete secret oidc-secrets -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true
+log_success "Keycloak secrets deleted"
+
+# Step 4: Delete Keycloak TLS secret
+log_info "Deleting Keycloak TLS secret (${KC_TLS_SECRET})..."
+kubectl delete secret "${KC_TLS_SECRET}" -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true
+kubectl delete secret "${KC_TLS_SECRET}" -n "${INGRESS_NS}" --ignore-not-found 2>/dev/null || true
+log_success "Keycloak TLS secrets deleted"
+
+# Step 5: Delete Keycloak PVCs (if any)
+log_info "Cleaning up Keycloak PVCs..."
+kubectl delete pvc -l app.kubernetes.io/name=keycloak -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true
+
+echo ""
+log_success "Keycloak uninstalled"
+echo ""
+echo "Next steps:"
+echo " 1. Re-deploy OSMO control plane without authentication:"
+echo " unset DEPLOY_KEYCLOAK"
+echo " ./04-deploy-osmo-control-plane.sh"
+echo ""
+echo " 2. (Optional) Drop the Keycloak database from PostgreSQL:"
+echo " Connect to your Managed PostgreSQL and run:"
+echo " DROP DATABASE IF EXISTS keycloak;"
+echo ""
+echo " 3. 
(Optional) Remove the DNS A record for the auth subdomain" +echo "" diff --git a/applications/osmo/deploy/example/002-setup/cleanup/uninstall-nginx-ingress.sh b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-nginx-ingress.sh new file mode 100755 index 000000000..9b22947bf --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-nginx-ingress.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Uninstall NGINX Ingress Controller (deployed by 03-deploy-nginx-ingress.sh) +set -e +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")/.." && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +INGRESS_NAMESPACE="${INGRESS_NAMESPACE:-ingress-nginx}" +INGRESS_RELEASE_NAME="${INGRESS_RELEASE_NAME:-ingress-nginx}" +log_info "Uninstalling NGINX Ingress Controller..." +helm uninstall "${INGRESS_RELEASE_NAME}" -n "${INGRESS_NAMESPACE}" 2>/dev/null || true +kubectl delete namespace "${INGRESS_NAMESPACE}" --ignore-not-found --timeout=60s 2>/dev/null || true +log_success "NGINX Ingress Controller uninstalled" + +# Uninstall cert-manager (if installed) +if helm status cert-manager -n cert-manager &>/dev/null; then + log_info "Uninstalling cert-manager..." 
+ kubectl delete clusterissuer letsencrypt --ignore-not-found 2>/dev/null || true + helm uninstall cert-manager -n cert-manager 2>/dev/null || true + kubectl delete namespace cert-manager --ignore-not-found --timeout=60s 2>/dev/null || true + log_success "cert-manager uninstalled" +fi diff --git a/applications/osmo/deploy/example/002-setup/cleanup/uninstall-observability.sh b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-observability.sh new file mode 100755 index 000000000..bbafe9007 --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-observability.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# +# Uninstall Observability Stack +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +source "${SCRIPT_DIR}/../lib/common.sh" +source "${SCRIPT_DIR}/../defaults.sh" + +echo "" +echo "========================================" +echo " Uninstalling Observability Stack" +echo "========================================" +echo "" + +log_warning "This will remove Prometheus, Grafana, and Loki" +# Read input with a prompt into a variable (bash/zsh compatible). +read_prompt_var() { + local prompt=$1 + local var_name=$2 + local default=$3 + local value="" + local read_from="/dev/tty" + local write_to="/dev/tty" + + if [[ ! -r "/dev/tty" || ! -w "/dev/tty" ]]; then + read_from="/dev/stdin" + write_to="/dev/stdout" + fi + + if [[ -n "$default" ]]; then + printf "%s [%s]: " "$prompt" "$default" >"$write_to" + else + printf "%s: " "$prompt" >"$write_to" + fi + + IFS= read -r value <"$read_from" + if [[ -z "$value" && -n "$default" ]]; then + value="$default" + fi + + eval "$var_name='$value'" +} + +read_prompt_var "Continue? (y/N)" confirm "" +if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then + log_info "Cancelled" + exit 0 +fi + +log_info "Removing Promtail..." +helm uninstall promtail -n "${MONITORING_NAMESPACE}" 2>/dev/null || true + +log_info "Removing Loki..." 
+helm uninstall loki -n "${MONITORING_NAMESPACE}" 2>/dev/null || true + +log_info "Removing Prometheus stack..." +helm uninstall prometheus -n "${MONITORING_NAMESPACE}" 2>/dev/null || true + +# Remove CRDs +log_info "Removing Prometheus CRDs..." +kubectl delete crd alertmanagerconfigs.monitoring.coreos.com --ignore-not-found +kubectl delete crd alertmanagers.monitoring.coreos.com --ignore-not-found +kubectl delete crd podmonitors.monitoring.coreos.com --ignore-not-found +kubectl delete crd probes.monitoring.coreos.com --ignore-not-found +kubectl delete crd prometheuses.monitoring.coreos.com --ignore-not-found +kubectl delete crd prometheusrules.monitoring.coreos.com --ignore-not-found +kubectl delete crd servicemonitors.monitoring.coreos.com --ignore-not-found +kubectl delete crd thanosrulers.monitoring.coreos.com --ignore-not-found + +log_info "Removing monitoring namespace..." +kubectl delete namespace "${MONITORING_NAMESPACE}" --ignore-not-found + +log_success "Observability stack uninstalled" diff --git a/applications/osmo/deploy/example/002-setup/cleanup/uninstall-osmo-backend.sh b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-osmo-backend.sh new file mode 100755 index 000000000..dba6bc817 --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-osmo-backend.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# +# Uninstall OSMO Backend Operator +# Reverses everything deployed by 06-deploy-osmo-backend.sh +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +source "${SCRIPT_DIR}/../lib/common.sh" +source "${SCRIPT_DIR}/../defaults.sh" + +OSMO_OPERATOR_NAMESPACE="osmo-operator" +OSMO_WORKFLOWS_NAMESPACE="osmo-workflows" + +echo "" +echo "========================================" +echo " Uninstalling OSMO Backend Operator" +echo "========================================" +echo "" + +log_warning "This will remove:" +echo " - Helm release: osmo-operator (namespace: ${OSMO_OPERATOR_NAMESPACE})" +echo " - Secret: 
osmo-operator-token (namespace: ${OSMO_OPERATOR_NAMESPACE})"
+echo " - Namespace: ${OSMO_OPERATOR_NAMESPACE}"
+echo " - Namespace: ${OSMO_WORKFLOWS_NAMESPACE} (and all workflow pods)"
+echo ""
+read_prompt_var "Continue? (y/N)" confirm ""
+if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then
+ log_info "Cancelled"
+ exit 0
+fi
+
+# Uninstall Helm release
+if helm status osmo-operator -n "${OSMO_OPERATOR_NAMESPACE}" &>/dev/null; then
+ log_info "Uninstalling Helm release: osmo-operator..."
+ helm uninstall osmo-operator -n "${OSMO_OPERATOR_NAMESPACE}" --wait --timeout 5m
+ log_success "Helm release uninstalled"
+else
+ log_info "Helm release osmo-operator not found — skipping"
+fi
+
+# Delete secrets
+log_info "Removing secrets..."
+kubectl delete secret osmo-operator-token -n "${OSMO_OPERATOR_NAMESPACE}" --ignore-not-found
+
+# Delete the internal agent service (created by 04-deploy-osmo-control-plane.sh for backend operator)
+log_info "Removing osmo-agent-internal service..."
+kubectl delete svc osmo-agent-internal -n "${OSMO_NAMESPACE}" --ignore-not-found
+
+# Delete namespaces (this also removes any remaining resources inside them)
+log_info "Deleting namespace: ${OSMO_WORKFLOWS_NAMESPACE}..."
+kubectl delete namespace "${OSMO_WORKFLOWS_NAMESPACE}" --ignore-not-found --wait=false
+
+log_info "Deleting namespace: ${OSMO_OPERATOR_NAMESPACE}..."
+kubectl delete namespace "${OSMO_OPERATOR_NAMESPACE}" --ignore-not-found --wait=false
+
+echo ""
+log_success "OSMO Backend Operator uninstalled"
+echo ""
+echo "Note: Namespace deletion may continue in the background." 
+echo " kubectl get ns ${OSMO_OPERATOR_NAMESPACE} ${OSMO_WORKFLOWS_NAMESPACE} 2>/dev/null" +echo "" diff --git a/applications/osmo/deploy/example/002-setup/cleanup/uninstall-osmo-control-plane.sh b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-osmo-control-plane.sh new file mode 100755 index 000000000..e19f183bf --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-osmo-control-plane.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# +# Uninstall OSMO Control Plane +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +source "${SCRIPT_DIR}/../lib/common.sh" +source "${SCRIPT_DIR}/../defaults.sh" + +echo "" +echo "========================================" +echo " Uninstalling OSMO Control Plane" +echo "========================================" +echo "" + +log_warning "This will remove OSMO Control Plane and all OSMO resources" +read_prompt_var "Continue? (y/N)" confirm "" +if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then + log_info "Cancelled" + exit 0 +fi + +log_info "Removing OSMO Control Plane..." +kubectl delete deployment osmo-control-plane -n "${OSMO_NAMESPACE}" --ignore-not-found +kubectl delete service osmo-control-plane -n "${OSMO_NAMESPACE}" --ignore-not-found +kubectl delete secret osmo-database -n "${OSMO_NAMESPACE}" --ignore-not-found +kubectl delete secret osmo-storage -n "${OSMO_NAMESPACE}" --ignore-not-found + +log_info "Removing OSMO namespace..." 
+kubectl delete namespace "${OSMO_NAMESPACE}" --ignore-not-found + +log_success "OSMO Control Plane uninstalled" diff --git a/applications/osmo/deploy/example/002-setup/default_user_pod_template.json b/applications/osmo/deploy/example/002-setup/default_user_pod_template.json new file mode 100644 index 000000000..71eed214b --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/default_user_pod_template.json @@ -0,0 +1,24 @@ +{ + "configs": { + "spec": { + "containers": [ + { + "name": "{{USER_CONTAINER_NAME}}", + "resources": { + "limits": { + "cpu": "{{USER_CPU}}", + "memory": "{{USER_MEMORY}}", + "ephemeral-storage": "{{USER_STORAGE}}" + }, + "requests": { + "cpu": "{{USER_CPU}}", + "memory": "{{USER_MEMORY}}", + "ephemeral-storage": "{{USER_STORAGE}}" + } + } + } + ] + } + }, + "description": "Default user container template (GPU resources moved to gpu_tolerations template)" +} diff --git a/applications/osmo/deploy/example/002-setup/defaults.sh b/applications/osmo/deploy/example/002-setup/defaults.sh new file mode 100755 index 000000000..2c9f3feca --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/defaults.sh @@ -0,0 +1,72 @@ +# ============================================================================= +# Default Configuration for Setup Scripts +# ============================================================================= + +# Namespaces +export GPU_OPERATOR_NAMESPACE="gpu-operator" +export NETWORK_OPERATOR_NAMESPACE="network-operator" +export KAI_SCHEDULER_NAMESPACE="kai-scheduler" +export MONITORING_NAMESPACE="monitoring" +export OSMO_NAMESPACE="osmo" + +# Chart versions (leave empty for latest) +export GPU_OPERATOR_VERSION="" +export NETWORK_OPERATOR_VERSION="" +export KAI_SCHEDULER_VERSION="v0.12.4" # Check https://github.com/NVIDIA/KAI-Scheduler/releases +export PROMETHEUS_VERSION="" +export GRAFANA_VERSION="" +export LOKI_VERSION="" + +# GPU Operator settings +export GPU_DRIVER_ENABLED="false" # Use Nebius driver-full images +export 
TOOLKIT_ENABLED="true" +export DEVICE_PLUGIN_ENABLED="true" +export MIG_MANAGER_ENABLED="false" + +# Network Operator (only needed for InfiniBand/GPU clusters) +export ENABLE_NETWORK_OPERATOR="false" # Set to "true" if using InfiniBand + +# Observability settings +export PROMETHEUS_RETENTION_DAYS="15" +export LOKI_RETENTION_DAYS="7" +export GRAFANA_ADMIN_PASSWORD="" # Auto-generated if empty + +# NGINX Ingress Controller (deployed by 03-deploy-nginx-ingress.sh) +# Namespace where the NGINX Ingress Controller is deployed. +export INGRESS_NAMESPACE="${INGRESS_NAMESPACE:-ingress-nginx}" +# Hostname for Ingress rules (e.g. osmo.example.com). Leave empty to use the LoadBalancer IP directly. +export OSMO_INGRESS_HOSTNAME="${OSMO_INGRESS_HOSTNAME:-}" +# Override for the service_base_url used by osmo-ctrl. Auto-detected from the ingress LoadBalancer if empty. +export OSMO_INGRESS_BASE_URL="${OSMO_INGRESS_BASE_URL:-}" + +# TLS / SSL Configuration +# TLS enabled by default. Requires OSMO_INGRESS_HOSTNAME to be set. Set to false to disable. +export OSMO_TLS_ENABLED="${OSMO_TLS_ENABLED:-true}" +# Name of the Kubernetes TLS secret used by Ingress (both paths produce this secret). +# NOTE: The OSMO Helm chart generates ingress TLS with secretName "osmo-tls". +export OSMO_TLS_SECRET_NAME="${OSMO_TLS_SECRET_NAME:-osmo-tls}" +# Local directory where certbot stores certificate files (Path A only). +export OSMO_TLS_CERT_DIR="${OSMO_TLS_CERT_DIR:-$HOME/.osmo-certs}" +# Email for Let's Encrypt registration (required for 03a and 03c). +export LETSENCRYPT_EMAIL="${LETSENCRYPT_EMAIL:-}" +# cert-manager namespace (Path B / 03c only). +export CERT_MANAGER_NAMESPACE="${CERT_MANAGER_NAMESPACE:-cert-manager}" +# Name of the ClusterIssuer created by 03c (Path B only). +export CLUSTER_ISSUER_NAME="${CLUSTER_ISSUER_NAME:-letsencrypt-prod}" +# TLS mode: "certbot" or "cert-manager". Set automatically by 03a/03c. 
+export OSMO_TLS_MODE="${OSMO_TLS_MODE:-}" + +# Keycloak / Authentication +# Keycloak deployed by default. Requires OSMO_INGRESS_HOSTNAME or KEYCLOAK_HOSTNAME. Set to false to disable. +export DEPLOY_KEYCLOAK="${DEPLOY_KEYCLOAK:-true}" +# Keycloak hostname (e.g. auth-osmo-nebius.csptst.nvidia.com). +# Auto-derived from OSMO_INGRESS_HOSTNAME if empty: auth-<OSMO_INGRESS_HOSTNAME>. +export KEYCLOAK_HOSTNAME="${KEYCLOAK_HOSTNAME:-}" +# TLS secret name for the Keycloak ingress (separate from the main osmo-tls). +# Run 03a with OSMO_TLS_SECRET_NAME=osmo-tls-auth for the auth subdomain. +export KEYCLOAK_TLS_SECRET_NAME="${KEYCLOAK_TLS_SECRET_NAME:-osmo-tls-auth}" + +# Paths +export SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +export VALUES_DIR="${SCRIPT_DIR}/values" +export LIB_DIR="${SCRIPT_DIR}/lib" diff --git a/applications/osmo/deploy/example/002-setup/gpu_platform_update.json b/applications/osmo/deploy/example/002-setup/gpu_platform_update.json new file mode 100755 index 000000000..1237894e7 --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/gpu_platform_update.json @@ -0,0 +1,14 @@ +{ + "configs": { + "description": "GPU platform", + "host_network_allowed": false, + "privileged_allowed": false, + "allowed_mounts": ["/mnt/data"], + "default_mounts": [], + "default_variables": { + "USER_GPU": 1 + }, + "resource_validations": [], + "override_pod_template": ["gpu_tolerations", "shm"] + } +} diff --git a/applications/osmo/deploy/example/002-setup/gpu_pod_template.json b/applications/osmo/deploy/example/002-setup/gpu_pod_template.json new file mode 100755 index 000000000..d704f64fc --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/gpu_pod_template.json @@ -0,0 +1,77 @@ +{ + "configs": { + "spec": { + "tolerations": [ + { + "key": "nvidia.com/gpu", + "operator": "Exists", + "effect": "NoSchedule" + } + ], + "containers": [ + { + "name": "{{USER_CONTAINER_NAME}}", + "env": [ + { + "name": "AWS_ENDPOINT_URL_S3", + "value": 
"https://storage.{{NEBIUS_REGION}}.nebius.cloud:443" + }, + { + "name": "AWS_S3_FORCE_PATH_STYLE", + "value": "true" + }, + { + "name": "AWS_DEFAULT_REGION", + "value": "{{NEBIUS_REGION}}" + }, + { + "name": "OSMO_LOGIN_DEV", + "value": "true" + }, + { + "name": "OSMO_SKIP_DATA_AUTH", + "value": "1" + } + ], + "resources": { + "limits": { + "nvidia.com/gpu": "{{USER_GPU}}" + }, + "requests": { + "nvidia.com/gpu": "{{USER_GPU}}" + } + } + }, + { + "name": "osmo-ctrl", + "env": [ + { + "name": "AWS_ENDPOINT_URL_S3", + "value": "https://storage.{{NEBIUS_REGION}}.nebius.cloud:443" + }, + { + "name": "AWS_S3_FORCE_PATH_STYLE", + "value": "true" + }, + { + "name": "AWS_DEFAULT_REGION", + "value": "{{NEBIUS_REGION}}" + }, + { + "name": "OSMO_LOGIN_DEV", + "value": "true" + }, + { + "name": "OSMO_SKIP_DATA_AUTH", + "value": "1" + } + ] + } + ], + "nodeSelector": { + "nvidia.com/gpu.present": "true" + } + } + }, + "description": "Add compute pod template" +} diff --git a/applications/osmo/deploy/example/002-setup/lib/common.sh b/applications/osmo/deploy/example/002-setup/lib/common.sh new file mode 100755 index 000000000..17cc53533 --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/lib/common.sh @@ -0,0 +1,450 @@ +#!/bin/bash +# +# Common functions for setup scripts +# + +# Colors +export RED='\033[0;31m' +export GREEN='\033[0;32m' +export YELLOW='\033[1;33m' +export BLUE='\033[0;34m' +export NC='\033[0m' + +# Logging functions +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[✓]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[!]${NC} $1" +} + +log_error() { + echo -e "${RED}[✗]${NC} $1" +} + +# Read input with a prompt into a variable (bash/zsh compatible). +read_prompt_var() { + local prompt=$1 + local var_name=$2 + local default=$3 + local value="" + local read_from="/dev/tty" + local write_to="/dev/tty" + + if [[ ! -r "/dev/tty" || ! 
-w "/dev/tty" ]]; then + read_from="/dev/stdin" + write_to="/dev/stdout" + fi + + if [[ -n "$default" ]]; then + printf "%s [%s]: " "$prompt" "$default" >"$write_to" + else + printf "%s: " "$prompt" >"$write_to" + fi + + IFS= read -r value <"$read_from" + if [[ -z "$value" && -n "$default" ]]; then + value="$default" + fi + + eval "$var_name='$value'" +} + +# Read a secret value into a variable (no echo). +read_secret_var() { + local prompt=$1 + local var_name=$2 + local value="" + local read_from="/dev/tty" + local write_to="/dev/tty" + + if [[ ! -r "/dev/tty" || ! -w "/dev/tty" ]]; then + read_from="/dev/stdin" + write_to="/dev/stdout" + fi + + printf "%s: " "$prompt" >"$write_to" + stty -echo <"$read_from" + IFS= read -r value <"$read_from" + stty echo <"$read_from" + printf "\n" >"$write_to" + + eval "$var_name='$value'" +} + +# Check if command exists +check_command() { + command -v "$1" &>/dev/null +} + +# Retry with exponential backoff +retry_with_backoff() { + local max_attempts=${1:-5} + local delay=${2:-2} + local max_delay=${3:-60} + shift 3 + local cmd=("$@") + + local attempt=1 + while [[ $attempt -le $max_attempts ]]; do + log_info "Attempt $attempt/$max_attempts: ${cmd[*]}" + if "${cmd[@]}"; then + return 0 + fi + + if [[ $attempt -lt $max_attempts ]]; then + log_warning "Failed, retrying in ${delay}s..." + sleep "$delay" + delay=$((delay * 2)) + if [[ $delay -gt $max_delay ]]; then + delay=$max_delay + fi + fi + ((attempt++)) + done + + log_error "All $max_attempts attempts failed" + return 1 +} + +# Wait for a condition with timeout +wait_for_condition() { + local description=$1 + local timeout=${2:-300} + local interval=${3:-10} + shift 3 + local cmd=("$@") + + log_info "Waiting for $description (timeout: ${timeout}s)..." + + local elapsed=0 + while [[ $elapsed -lt $timeout ]]; do + if "${cmd[@]}" &>/dev/null; then + log_success "$description" + return 0 + fi + sleep "$interval" + ((elapsed += interval)) + echo -n "." 
+ done + + echo "" + log_error "Timeout waiting for $description" + return 1 +} + +# Check kubectl connection and verify we're targeting the correct cluster +check_kubectl() { + if ! check_command kubectl; then + log_error "kubectl not found" + return 1 + fi + + if ! kubectl cluster-info &>/dev/null; then + log_error "Cannot connect to Kubernetes cluster" + return 1 + fi + + # Verify current context matches the expected cluster from Terraform + local expected_cluster + expected_cluster=$(get_tf_output "cluster_name" "../001-iac" 2>/dev/null || true) + if [[ -n "$expected_cluster" ]]; then + local current_context + current_context=$(kubectl config current-context 2>/dev/null || true) + if [[ -n "$current_context" && "$current_context" != *"$expected_cluster"* ]]; then + log_error "Wrong Kubernetes context!" + log_error " Current context: $current_context" + log_error " Expected cluster: $expected_cluster" + log_info "Switch context with: nebius mk8s cluster get-credentials --id \$(terraform -chdir=../001-iac output -raw cluster_id) --external" + return 1 + fi + log_success "kubectl connected to cluster ($expected_cluster)" + else + log_success "kubectl connected to cluster" + fi + return 0 +} + +# Check Helm +check_helm() { + if ! 
check_command helm; then + log_error "helm not found" + return 1 + fi + + log_success "helm available" + return 0 +} + +# Install Helm chart with retry +helm_install() { + local name=$1 + local chart=$2 + local namespace=$3 + shift 3 + local extra_args=("$@") + + log_info "Installing Helm chart: $name" + + kubectl create namespace "$namespace" --dry-run=client -o yaml | kubectl apply -f - + + retry_with_backoff 3 5 30 helm upgrade --install "$name" "$chart" \ + --namespace "$namespace" \ + --wait --timeout 10m \ + "${extra_args[@]}" +} + +# Wait for pods to be ready +wait_for_pods() { + local namespace=$1 + local label_selector=$2 + local timeout=${3:-300} + + wait_for_condition "pods with label $label_selector in $namespace" \ + "$timeout" 10 \ + kubectl wait --for=condition=Ready pods \ + -n "$namespace" \ + -l "$label_selector" \ + --timeout=10s +} + +# Detect OSMO service URL from the NGINX Ingress Controller's LoadBalancer. +# +# When OSMO_TLS_ENABLED=true and OSMO_INGRESS_HOSTNAME is set, returns +# https://. Otherwise falls back to http://. +# +# Lookup order: +# 0. If TLS enabled + hostname set, return https:// immediately +# 1. LoadBalancer external IP (cloud assigns a public/internal IP) +# 2. LoadBalancer hostname (some clouds return a DNS name instead) +# 3. 
Controller ClusterIP (fallback – works from inside the cluster) +# +# Usage: +# url=$(detect_service_url) +# [[ -n "$url" ]] && echo "OSMO reachable at $url" +detect_service_url() { + local ns="${INGRESS_NAMESPACE:-ingress-nginx}" + local tls_enabled="${OSMO_TLS_ENABLED:-false}" + local hostname="${OSMO_INGRESS_HOSTNAME:-}" + local scheme="http" + + if [[ "$tls_enabled" == "true" ]]; then + scheme="https" + # If hostname is configured, prefer it (TLS certs are issued for the domain) + if [[ -n "$hostname" ]]; then + echo "${scheme}://${hostname}" + return 0 + fi + fi + + # Find the controller service (works for the community ingress-nginx chart) + local lb_ip lb_host cluster_ip svc_name + svc_name=$(kubectl get svc -n "$ns" \ + -l app.kubernetes.io/name=ingress-nginx,app.kubernetes.io/component=controller \ + -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) + + if [[ -n "$svc_name" ]]; then + # 1. LoadBalancer IP + lb_ip=$(kubectl get svc "$svc_name" -n "$ns" \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) + if [[ -n "$lb_ip" ]]; then + echo "${scheme}://${lb_ip}" + return 0 + fi + + # 2. LoadBalancer hostname (e.g. ELB on AWS) + lb_host=$(kubectl get svc "$svc_name" -n "$ns" \ + -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || true) + if [[ -n "$lb_host" ]]; then + echo "${scheme}://${lb_host}" + return 0 + fi + + # 3. 
ClusterIP of the controller + cluster_ip=$(kubectl get svc "$svc_name" -n "$ns" \ + -o jsonpath='{.spec.clusterIP}' 2>/dev/null || true) + if [[ -n "$cluster_ip" && "$cluster_ip" != "None" ]]; then + echo "${scheme}://${cluster_ip}" + return 0 + fi + fi + + # Nothing found + return 1 +} + +# Get Terraform output (supports nested values like "postgresql.host") +get_tf_output() { + local name=$1 + local tf_dir=${2:-../001-iac} + + # Check if name contains a dot (nested value) + if [[ "$name" == *.* ]]; then + local base_name="${name%%.*}" + local key="${name#*.}" + terraform -chdir="$tf_dir" output -json "$base_name" 2>/dev/null | jq -r ".$key // empty" + else + terraform -chdir="$tf_dir" output -json "$name" 2>/dev/null | jq -r '. // empty' + fi +} + +# Get Nebius CLI path +get_nebius_path() { + if command -v nebius &>/dev/null; then + command -v nebius + elif [[ -x "$HOME/.nebius/bin/nebius" ]]; then + echo "$HOME/.nebius/bin/nebius" + fi +} + +# Read secret from Nebius MysteryBox +# Usage: get_mysterybox_secret +# Returns the secret value or empty string if not found +get_mysterybox_secret() { + local secret_id=$1 + local key=$2 + local nebius_path=$(get_nebius_path) + + if [[ -z "$nebius_path" ]]; then + log_warning "Nebius CLI not found, cannot read from MysteryBox" + return 1 + fi + + if [[ -z "$secret_id" ]]; then + return 1 + fi + + local result=$("$nebius_path" mysterybox v1 payload get-by-key \ + --secret-id "$secret_id" \ + --key "$key" \ + --format json 2>/dev/null) + + if [[ -n "$result" ]]; then + echo "$result" | jq -r '.data.string_value // empty' 2>/dev/null + fi +} + +# ----------------------------------------------------------------------------- +# OSMO API helpers (for use when Envoy auth sidecar is present) +# ----------------------------------------------------------------------------- +# Per OSMO documentation, the OSMO service authorises requests by reading +# the x-osmo-user and x-osmo-roles headers. 
Envoy normally sets these from +# the JWT but when we bypass Envoy (port-forward to pod:8000) we must set +# them ourselves. +# +# Reference: https://nvidia.github.io/OSMO/main/deployment_guide/appendix/authentication/authentication_flow.html + +# Detect if a pod has an Envoy sidecar container +# Usage: has_envoy_sidecar +# Returns 0 (true) if envoy container is found, 1 (false) otherwise +has_envoy_sidecar() { + local ns="$1" + local label="$2" + local pod_name + pod_name=$(kubectl get pod -n "$ns" -l "$label" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) + if [[ -z "$pod_name" ]]; then + return 1 + fi + kubectl get pod -n "$ns" "$pod_name" -o jsonpath='{.spec.containers[*].name}' 2>/dev/null | grep -q envoy +} + +# Start a port-forward that bypasses Envoy when the sidecar is present. +# Sets PORT_FORWARD_PID and prints log messages. +# Usage: start_osmo_port_forward [local_port] +start_osmo_port_forward() { + local ns="${1:-osmo}" + local local_port="${2:-8080}" + + if has_envoy_sidecar "$ns" "app=osmo-service"; then + local pod_name + pod_name=$(kubectl get pod -n "$ns" -l app=osmo-service -o jsonpath='{.items[0].metadata.name}') + log_info "Envoy sidecar detected -- port-forwarding to pod/${pod_name}:8000 (bypassing auth)..." + kubectl port-forward -n "$ns" "pod/${pod_name}" "${local_port}:8000" &>/dev/null & + _OSMO_AUTH_BYPASS=true + else + log_info "No Envoy sidecar -- port-forwarding to svc/osmo-service:80..." + kubectl port-forward -n "$ns" svc/osmo-service "${local_port}:80" &>/dev/null & + _OSMO_AUTH_BYPASS=false + fi + PORT_FORWARD_PID=$! + export _OSMO_AUTH_BYPASS +} + +# Make an authenticated curl call to the OSMO API. +# When _OSMO_AUTH_BYPASS=true (Envoy bypassed), injects x-osmo-user and +# x-osmo-roles headers so the OSMO service authorises the request. +# Usage: osmo_curl [curl-args...] 
+# Example: osmo_curl GET "http://localhost:8080/api/configs/service" +# Example: osmo_curl PATCH "http://localhost:8080/api/configs/service" -d '{"configs_dict":{...}}' +osmo_curl() { + local method="$1"; shift + local url="$1"; shift + + local auth_args=() + if [[ "${_OSMO_AUTH_BYPASS:-false}" == "true" ]]; then + auth_args+=(-H "x-osmo-user: osmo-admin" -H "x-osmo-roles: osmo-admin,osmo-user") + fi + + curl -s -X "$method" "$url" \ + -H "Content-Type: application/json" \ + "${auth_args[@]}" \ + "$@" +} + +# Log in to OSMO using the appropriate method. +# When bypassing Envoy this is a no-op (curl headers handle auth). +# Otherwise uses `osmo login --method dev`. +# Usage: osmo_login [port] +osmo_login() { + local port="${1:-8080}" + if [[ "${_OSMO_AUTH_BYPASS:-false}" == "true" ]]; then + log_info "Auth bypass active -- using direct API headers (osmo-admin role)" + else + log_info "Logging in to OSMO..." + if ! osmo login "http://localhost:${port}" --method dev --username admin 2>/dev/null; then + log_error "Failed to login to OSMO" + return 1 + fi + log_success "Logged in successfully" + fi +} + +# Update an OSMO config via the PATCH API (partial merge). +# When _OSMO_AUTH_BYPASS=true, uses curl; otherwise uses osmo CLI. 
+# Usage: osmo_config_update +# Example: osmo_config_update WORKFLOW /tmp/config.json "Configure storage" +osmo_config_update() { + local config_type="$1" + local json_file="$2" + local description="${3:-Update config}" + local port="${4:-8080}" + + if [[ "${_OSMO_AUTH_BYPASS:-false}" == "true" ]]; then + local endpoint + endpoint="api/configs/$(echo "$config_type" | tr '[:upper:]' '[:lower:]')" + + # Build PATCH request body: {"description": "...", "configs_dict": } + local body + body=$(jq -n --arg desc "$description" --slurpfile cfg "$json_file" \ + '{description: $desc, configs_dict: $cfg[0]}') + + local http_code + http_code=$(osmo_curl PATCH "http://localhost:${port}/${endpoint}" \ + -d "$body" -o /tmp/_osmo_patch_resp.txt -w "%{http_code}") + + if [[ "$http_code" =~ ^2 ]]; then + return 0 + else + log_error "PATCH /${endpoint} returned HTTP ${http_code}" + cat /tmp/_osmo_patch_resp.txt 2>/dev/null || true + return 1 + fi + else + osmo config update "$config_type" --file "$json_file" --description "$description" 2>/dev/null + fi +} diff --git a/applications/osmo/deploy/example/002-setup/osmo-values-noauth.yaml b/applications/osmo/deploy/example/002-setup/osmo-values-noauth.yaml new file mode 100755 index 000000000..53eb46662 --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/osmo-values-noauth.yaml @@ -0,0 +1,170 @@ +# OSMO Service values - Auth Disabled +# For testing without authentication + +global: + osmoImageLocation: nvcr.io/nvidia/osmo + osmoImageTag: latest + imagePullPolicy: IfNotPresent + +services: + postgres: + enabled: false + serviceName: postgresql.osmo.svc.cluster.local + port: 5432 + db: osmo + user: osmo_admin + passwordSecretName: postgres-secret + passwordSecretKey: password + + redis: + enabled: false + serviceName: redis-master.osmo.svc.cluster.local + port: 6379 + tlsEnabled: false + + service: + scaling: + minReplicas: 1 + maxReplicas: 1 + ingress: + enabled: false + auth: + enabled: false + extraEnv: + - name: 
OSMO_POSTGRES_HOST + value: postgresql.osmo.svc.cluster.local + - name: OSMO_POSTGRES_PORT + value: "5432" + - name: OSMO_POSTGRES_USER + value: osmo_admin + - name: OSMO_POSTGRES_DATABASE + value: osmo + - name: OSMO_POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: postgres-secret + key: password + extraVolumes: + - name: vault-secrets + secret: + secretName: vault-secrets + extraVolumeMounts: + - name: vault-secrets + mountPath: /home/osmo/vault-agent/secrets + readOnly: true + + worker: + scaling: + minReplicas: 1 + maxReplicas: 1 + extraEnv: + - name: OSMO_POSTGRES_HOST + value: postgresql.osmo.svc.cluster.local + - name: OSMO_POSTGRES_PORT + value: "5432" + - name: OSMO_POSTGRES_USER + value: osmo_admin + - name: OSMO_POSTGRES_DATABASE + value: osmo + - name: OSMO_POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: postgres-secret + key: password + extraVolumes: + - name: vault-secrets + secret: + secretName: vault-secrets + extraVolumeMounts: + - name: vault-secrets + mountPath: /home/osmo/vault-agent/secrets + readOnly: true + + logger: + scaling: + minReplicas: 1 + maxReplicas: 1 + extraEnv: + - name: OSMO_POSTGRES_HOST + value: postgresql.osmo.svc.cluster.local + - name: OSMO_POSTGRES_PORT + value: "5432" + - name: OSMO_POSTGRES_USER + value: osmo_admin + - name: OSMO_POSTGRES_DATABASE + value: osmo + - name: OSMO_POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: postgres-secret + key: password + extraVolumes: + - name: vault-secrets + secret: + secretName: vault-secrets + extraVolumeMounts: + - name: vault-secrets + mountPath: /home/osmo/vault-agent/secrets + readOnly: true + + agent: + scaling: + minReplicas: 1 + maxReplicas: 1 + extraEnv: + - name: OSMO_POSTGRES_HOST + value: postgresql.osmo.svc.cluster.local + - name: OSMO_POSTGRES_PORT + value: "5432" + - name: OSMO_POSTGRES_USER + value: osmo_admin + - name: OSMO_POSTGRES_DATABASE + value: osmo + - name: OSMO_POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: postgres-secret + key: 
password + extraVolumes: + - name: vault-secrets + secret: + secretName: vault-secrets + extraVolumeMounts: + - name: vault-secrets + mountPath: /home/osmo/vault-agent/secrets + readOnly: true + + delayedJobMonitor: + replicas: 1 + extraEnv: + - name: OSMO_POSTGRES_HOST + value: postgresql.osmo.svc.cluster.local + - name: OSMO_POSTGRES_PORT + value: "5432" + - name: OSMO_POSTGRES_USER + value: osmo_admin + - name: OSMO_POSTGRES_DATABASE + value: osmo + - name: OSMO_POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: postgres-secret + key: password + extraVolumes: + - name: vault-secrets + secret: + secretName: vault-secrets + extraVolumeMounts: + - name: vault-secrets + mountPath: /home/osmo/vault-agent/secrets + readOnly: true + +sidecars: + envoy: + enabled: false + rateLimit: + enabled: false + logAgent: + enabled: false + otel: + enabled: false diff --git a/applications/osmo/deploy/example/002-setup/sample_osmo_realm.json b/applications/osmo/deploy/example/002-setup/sample_osmo_realm.json new file mode 100755 index 000000000..54a65ed77 --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/sample_osmo_realm.json @@ -0,0 +1,2636 @@ +{ + "id": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", + "realm": "osmo", + "notBefore": 0, + "defaultSignatureAlgorithm": "RS256", + "revokeRefreshToken": false, + "refreshTokenMaxReuse": 0, + "accessTokenLifespan": 300, + "accessTokenLifespanForImplicitFlow": 900, + "ssoSessionIdleTimeout": 604800, + "ssoSessionMaxLifespan": 604800, + "ssoSessionIdleTimeoutRememberMe": 0, + "ssoSessionMaxLifespanRememberMe": 0, + "offlineSessionIdleTimeout": 2592000, + "offlineSessionMaxLifespanEnabled": false, + "offlineSessionMaxLifespan": 5184000, + "clientSessionIdleTimeout": 0, + "clientSessionMaxLifespan": 0, + "clientOfflineSessionIdleTimeout": 0, + "clientOfflineSessionMaxLifespan": 0, + "accessCodeLifespan": 60, + "accessCodeLifespanUserAction": 300, + "accessCodeLifespanLogin": 1800, + "actionTokenGeneratedByAdminLifespan": 43200, + 
"actionTokenGeneratedByUserLifespan": 300, + "oauth2DeviceCodeLifespan": 600, + "oauth2DevicePollingInterval": 5, + "enabled": true, + "sslRequired": "external", + "registrationAllowed": false, + "registrationEmailAsUsername": false, + "rememberMe": false, + "verifyEmail": false, + "loginWithEmailAllowed": false, + "duplicateEmailsAllowed": false, + "resetPasswordAllowed": false, + "editUsernameAllowed": false, + "bruteForceProtected": true, + "permanentLockout": false, + "maxTemporaryLockouts": 0, + "bruteForceStrategy": "MULTIPLE", + "maxFailureWaitSeconds": 300, + "minimumQuickLoginWaitSeconds": 60, + "waitIncrementSeconds": 60, + "quickLoginCheckMilliSeconds": 1000, + "maxDeltaTimeSeconds": 43200, + "failureFactor": 30, + "roles": { + "realm": [ + { + "id": "2fbf71d8-d3c1-4de3-8c08-ae55b254e092", + "name": "uma_authorization", + "description": "${role_uma_authorization}", + "composite": false, + "clientRole": false, + "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", + "attributes": {} + }, + { + "id": "e22b93a7-88eb-4f66-a5cc-7c68a35d72fb", + "name": "offline_access", + "description": "${role_offline-access}", + "composite": false, + "clientRole": false, + "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", + "attributes": {} + }, + { + "id": "c3d524ce-b3c8-42fd-9e6b-777a32960bb2", + "name": "admin", + "description": "${role_admin}", + "composite": true, + "composites": { + "realm": [ + "create-realm" + ], + "client": { + "realm-management": [ + "manage-realm", + "query-clients", + "view-users", + "manage-identity-providers", + "impersonation", + "view-events", + "manage-authorization", + "query-realms", + "manage-clients", + "view-clients", + "create-client", + "query-groups", + "view-identity-providers", + "view-realm", + "view-authorization", + "manage-users", + "query-users", + "manage-events" + ] + } + }, + "clientRole": false, + "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", + "attributes": {} + }, + { + "id": 
"996ba034-02ae-40d4-8d14-735506151057", + "name": "default-roles-osmo", + "description": "${role_default-roles}", + "composite": true, + "composites": { + "realm": [ + "offline_access", + "uma_authorization" + ], + "client": { + "account": [ + "manage-account", + "view-profile" + ] + } + }, + "clientRole": false, + "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", + "attributes": {} + }, + { + "id": "f5584dff-7c44-4204-b387-e3caf8ea3f46", + "name": "create-realm", + "description": "${role_create-realm}", + "composite": false, + "clientRole": false, + "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", + "attributes": {} + } + ], + "client": { + "osmo-realm": [], + "realm-management": [ + { + "id": "b8b96d4c-fc77-4e20-bc64-4918144dfdcf", + "name": "manage-realm", + "description": "${role_manage-realm}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "1dbd9f8f-e5e6-41b3-ba7c-746835fd9b79", + "name": "query-clients", + "description": "${role_query-clients}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "d27fc846-afad-42f9-8b11-636f4c535a36", + "name": "view-users", + "description": "${role_view-users}", + "composite": true, + "composites": { + "client": { + "realm-management": [ + "query-groups", + "query-users" + ] + } + }, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "3c345b77-4bdb-4360-bf81-fe85a77cbff7", + "name": "manage-identity-providers", + "description": "${role_manage-identity-providers}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "4953639a-2db7-45d7-a734-c42b487647c5", + "name": "impersonation", + "description": "${role_impersonation}", + "composite": false, + "clientRole": true, + "containerId": 
"6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "ae14995a-6e23-4b1d-a10d-dd0feebf1d4a", + "name": "view-events", + "description": "${role_view-events}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "5ae16954-f8ad-4237-be92-1eb6916ce6cb", + "name": "manage-authorization", + "description": "${role_manage-authorization}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "7663ba0a-60f3-46bb-9232-3a2cc1832e62", + "name": "query-realms", + "description": "${role_query-realms}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "121f50ad-06c7-4541-a40f-400710228515", + "name": "manage-clients", + "description": "${role_manage-clients}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "d8c6a12c-240c-415c-9299-30f5292d2b90", + "name": "view-clients", + "description": "${role_view-clients}", + "composite": true, + "composites": { + "client": { + "realm-management": [ + "query-clients" + ] + } + }, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "70ebf14f-cf79-4ad7-b4c4-3d5289288ec0", + "name": "create-client", + "description": "${role_create-client}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "1abf94ab-c2a7-469c-b081-584fbbb66046", + "name": "query-groups", + "description": "${role_query-groups}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "f8e1d204-7b77-446a-84fb-675c8c85e1f1", + "name": "realm-admin", + "description": "${role_realm-admin}", + 
"composite": true, + "composites": { + "client": { + "realm-management": [ + "manage-realm", + "query-clients", + "view-users", + "manage-identity-providers", + "impersonation", + "view-events", + "manage-authorization", + "query-realms", + "manage-clients", + "view-clients", + "create-client", + "query-groups", + "view-identity-providers", + "view-realm", + "view-authorization", + "manage-users", + "query-users", + "manage-events" + ] + } + }, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "72066e7f-f80f-4008-a0b3-531d3aebd2f0", + "name": "view-identity-providers", + "description": "${role_view-identity-providers}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "29649597-fdc9-4330-a96d-94218a1e91b2", + "name": "view-realm", + "description": "${role_view-realm}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "12c80e9d-c3d9-4e61-91ab-c986e3aafe48", + "name": "view-authorization", + "description": "${role_view-authorization}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "bde16849-39b1-4c85-985d-40e9a178e873", + "name": "manage-users", + "description": "${role_manage-users}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "62463d22-8113-41e0-af6a-fa81883c475d", + "name": "query-users", + "description": "${role_query-users}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "e1afbd19-239f-4e78-abd9-5019b6baa7e2", + "name": "manage-events", + "description": "${role_manage-events}", + "composite": false, + "clientRole": true, + "containerId": 
"6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + } + ], + "osmo-browser-flow": [ + { + "id": "2cfce9e9-000e-4de8-a0b6-50f0a4252db3", + "name": "dashboard-admin", + "description": "Able to make change to the kubernetes dashboard", + "composite": false, + "clientRole": true, + "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "attributes": {} + }, + { + "id": "454726d1-4f76-47f6-bcfa-5d64f759134f", + "name": "grafana-user", + "description": "Able to view dashboards in grafana", + "composite": false, + "clientRole": true, + "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "attributes": {} + }, + { + "id": "9d91ae54-e69b-46e8-baee-7a16f044ded1", + "name": "osmo-user", + "description": "A regular user of osmo who can submit and query workflows and datasets", + "composite": false, + "clientRole": true, + "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "attributes": {} + }, + { + "id": "9ec3a04d-49a4-414b-9e2f-35b70bbea18b", + "name": "dashboard-user", + "description": "Able to view the kubernetes dashboard", + "composite": false, + "clientRole": true, + "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "attributes": {} + }, + { + "id": "dfd62581-88c7-4ebb-beac-7555d1aef105", + "name": "grafana-admin", + "description": "", + "composite": false, + "clientRole": true, + "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "attributes": {} + }, + { + "id": "aa86ac92-9df4-499c-9f78-e3ed600ddc15", + "name": "osmo-admin", + "description": "Admin access to the osmo service", + "composite": false, + "clientRole": true, + "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "attributes": {} + } + ], + "security-admin-console": [], + "admin-cli": [], + "account-console": [], + "broker": [ + { + "id": "44300967-5867-4c57-a59a-5b8302cb8323", + "name": "read-token", + "description": "${role_read-token}", + "composite": false, + "clientRole": true, + "containerId": "6fdf7b8e-1146-4dd9-a3dc-dd93e877cf2a", + "attributes": {} + } + ], + 
"osmo-device": [ + { + "id": "e126038f-20eb-4d31-a95b-e5267eb8c7f1", + "name": "osmo-user", + "description": "", + "composite": false, + "clientRole": true, + "containerId": "82b4ce74-5ff3-440f-b2f6-2c7786230e35", + "attributes": {} + }, + { + "id": "20874405-f96b-456b-a3b8-86cfe8740144", + "name": "osmo-admin", + "description": "Admin access to the osmo service", + "composite": false, + "clientRole": true, + "containerId": "82b4ce74-5ff3-440f-b2f6-2c7786230e35", + "attributes": {} + }, + { + "id": "94a41f7f-9927-489f-aa76-a9e3dafb4ed5", + "name": "osmo-backend", + "description": "", + "composite": false, + "clientRole": true, + "containerId": "82b4ce74-5ff3-440f-b2f6-2c7786230e35", + "attributes": {} + } + ], + "account": [ + { + "id": "358c4e88-41b8-458b-83d9-e4c86a357095", + "name": "manage-account-links", + "description": "${role_manage-account-links}", + "composite": false, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "499f54a7-ccc5-4fef-bece-9ccdc6a80308", + "name": "manage-consent", + "description": "${role_manage-consent}", + "composite": true, + "composites": { + "client": { + "account": [ + "view-consent" + ] + } + }, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "f14ea475-e733-4f69-8475-693da2992a72", + "name": "view-applications", + "description": "${role_view-applications}", + "composite": false, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "aea168f8-7115-468b-9118-aae87937dee9", + "name": "view-consent", + "description": "${role_view-consent}", + "composite": false, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "47acd969-e55d-4382-946b-67fb2e4bb119", + "name": "manage-account", + "description": "${role_manage-account}", + "composite": true, + "composites": { + "client": { + 
"account": [ + "manage-account-links" + ] + } + }, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "102cd4a5-8e95-4d3c-87de-a98c2958f5c0", + "name": "view-groups", + "description": "${role_view-groups}", + "composite": false, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "b6da542f-977e-437e-8d24-6cb4ed4612af", + "name": "delete-account", + "description": "${role_delete-account}", + "composite": false, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "2da758ad-a74d-43ef-b911-6b52c8b60d90", + "name": "view-profile", + "description": "${role_view-profile}", + "composite": false, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + } + ] + } + }, + "groups": [ + { + "id": "979a1cd5-b392-4905-a868-17603faf9ca9", + "name": "Admin", + "path": "/Admin", + "subGroups": [], + "attributes": {}, + "realmRoles": [], + "clientRoles": { + "osmo-browser-flow": [ + "osmo-user", + "osmo-admin" + ], + "osmo-device": [ + "osmo-user", + "osmo-admin" + ] + } + }, + { + "id": "2fc39861-b636-47c8-b57b-d1719466759c", + "name": "Backend Operator", + "path": "/Backend Operator", + "subGroups": [], + "attributes": {}, + "realmRoles": [], + "clientRoles": { + "osmo-device": [ + "osmo-backend" + ] + } + }, + { + "id": "57a9b7f0-36ec-46c5-9781-49d53b1c6468", + "name": "User", + "path": "/User", + "subGroups": [], + "attributes": {}, + "realmRoles": [], + "clientRoles": { + "osmo-browser-flow": [ + "osmo-user", + "grafana-user", + "dashboard-user" + ], + "osmo-device": [ + "osmo-user" + ] + } + } + ], + "defaultRole": { + "id": "996ba034-02ae-40d4-8d14-735506151057", + "name": "default-roles-osmo", + "description": "${role_default-roles}", + "composite": true, + "clientRole": false, + "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c" + }, + 
"requiredCredentials": [ + "password" + ], + "otpPolicyType": "totp", + "otpPolicyAlgorithm": "HmacSHA1", + "otpPolicyInitialCounter": 0, + "otpPolicyDigits": 6, + "otpPolicyLookAheadWindow": 1, + "otpPolicyPeriod": 30, + "otpPolicyCodeReusable": false, + "otpSupportedApplications": [ + "totpAppFreeOTPName", + "totpAppGoogleName", + "totpAppMicrosoftAuthenticatorName" + ], + "localizationTexts": {}, + "webAuthnPolicyRpEntityName": "keycloak", + "webAuthnPolicySignatureAlgorithms": [ + "ES256" + ], + "webAuthnPolicyRpId": "", + "webAuthnPolicyAttestationConveyancePreference": "not specified", + "webAuthnPolicyAuthenticatorAttachment": "not specified", + "webAuthnPolicyRequireResidentKey": "not specified", + "webAuthnPolicyUserVerificationRequirement": "not specified", + "webAuthnPolicyCreateTimeout": 0, + "webAuthnPolicyAvoidSameAuthenticatorRegister": false, + "webAuthnPolicyAcceptableAaguids": [], + "webAuthnPolicyExtraOrigins": [], + "webAuthnPolicyPasswordlessRpEntityName": "keycloak", + "webAuthnPolicyPasswordlessSignatureAlgorithms": [ + "ES256" + ], + "webAuthnPolicyPasswordlessRpId": "", + "webAuthnPolicyPasswordlessAttestationConveyancePreference": "not specified", + "webAuthnPolicyPasswordlessAuthenticatorAttachment": "not specified", + "webAuthnPolicyPasswordlessRequireResidentKey": "not specified", + "webAuthnPolicyPasswordlessUserVerificationRequirement": "not specified", + "webAuthnPolicyPasswordlessCreateTimeout": 0, + "webAuthnPolicyPasswordlessAvoidSameAuthenticatorRegister": false, + "webAuthnPolicyPasswordlessAcceptableAaguids": [], + "webAuthnPolicyPasswordlessExtraOrigins": [], + "scopeMappings": [ + { + "clientScope": "offline_access", + "roles": [ + "offline_access" + ] + } + ], + "clientScopeMappings": { + "account": [ + { + "client": "account-console", + "roles": [ + "manage-account", + "view-groups" + ] + } + ] + }, + "clients": [ + { + "id": "049b45a3-ba14-4735-8168-c9be73625a6f", + "clientId": "account", + "name": "${client_account}", + 
"rootUrl": "${authBaseUrl}", + "baseUrl": "/realms/osmo/account/", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [ + "/realms/osmo/account/*" + ], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": true, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "false", + "post.logout.redirect.uris": "+" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": false, + "nodeReRegistrationTimeout": 0, + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "basic", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "a18dadb1-a13d-4523-8e33-446ff5781676", + "clientId": "account-console", + "name": "${client_account-console}", + "rootUrl": "${authBaseUrl}", + "baseUrl": "/realms/osmo/account/", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [ + "/realms/osmo/account/*" + ], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": true, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "false", + "post.logout.redirect.uris": "+", + "pkce.code.challenge.method": "S256" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": false, + "nodeReRegistrationTimeout": 0, + "protocolMappers": [ + { + "id": "d3db99fd-64a1-48b8-82bd-a92533e2fd4c", + "name": "audience resolve", + 
"protocol": "openid-connect", + "protocolMapper": "oidc-audience-resolve-mapper", + "consentRequired": false, + "config": {} + } + ], + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "basic", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "14047566-1501-4403-92c7-418ef38e3ba4", + "clientId": "admin-cli", + "name": "${client_admin-cli}", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": false, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": true, + "serviceAccountsEnabled": false, + "publicClient": true, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "false", + "client.use.lightweight.access.token.enabled": "true", + "post.logout.redirect.uris": "+" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": true, + "nodeReRegistrationTimeout": 0, + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "basic", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "6fdf7b8e-1146-4dd9-a3dc-dd93e877cf2a", + "clientId": "broker", + "name": "${client_broker}", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": true, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": false, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "true", + 
"post.logout.redirect.uris": "+" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": false, + "nodeReRegistrationTimeout": 0, + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "clientId": "osmo-browser-flow", + "name": "Osmo Browser Flow", + "description": "Allow logging into osmo using the authorization code based browser flow", + "rootUrl": "https://default.com", + "adminUrl": "https://default.com", + "baseUrl": "https://default.com/docs", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "secret": "**********", + "redirectUris": [ + "", + "https://default.com/setup/getAToken", + "https://default.com/getAToken", + "https://default.com/api/auth/getAToken" + ], + "webOrigins": [ + "*", + "https://default.com" + ], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": false, + "frontchannelLogout": true, + "protocol": "openid-connect", + "attributes": { + "client.secret.creation.time": "1762965594", + "post.logout.redirect.uris": "+", + "frontchannel.logout.session.required": "true", + "oauth2.device.authorization.grant.enabled": "false", + "backchannel.logout.revoke.offline.tokens": "false", + "use.refresh.tokens": "true", + "realm_client": "false", + "oidc.ciba.grant.enabled": "false", + "backchannel.logout.session.required": "true", + "client_credentials.use_refresh_token": "false", + "acr.loa.map": "{}", + "require.pushed.authorization.requests": "false", + "tls.client.certificate.bound.access.tokens": "false", + "display.on.consent.screen": "false", + "token.response.type.bearer.lower-case": 
"false" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": true, + "nodeReRegistrationTimeout": -1, + "protocolMappers": [ + { + "id": "8fcbb19c-503b-4173-a35b-69cc23bc112f", + "name": "Create \"roles\" claim", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-client-role-mapper", + "consentRequired": false, + "config": { + "multivalued": "true", + "userinfo.token.claim": "true", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "roles", + "jsonType.label": "String", + "usermodel.clientRoleMapping.clientId": "osmo-browser-flow" + } + } + ], + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "basic", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "82b4ce74-5ff3-440f-b2f6-2c7786230e35", + "clientId": "osmo-device", + "name": "Osmo device flow", + "description": "Allow login with devices such as cli", + "rootUrl": "https://default.com", + "adminUrl": "https://default.com", + "baseUrl": "https://default.com", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [ + "https://default.com/*" + ], + "webOrigins": [ + "https://default.com" + ], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": false, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": true, + "serviceAccountsEnabled": false, + "publicClient": true, + "frontchannelLogout": true, + "protocol": "openid-connect", + "attributes": { + "realm_client": "false", + "oidc.ciba.grant.enabled": "false", + "backchannel.logout.session.required": "true", + "post.logout.redirect.uris": "+", + "frontchannel.logout.session.required": "true", + "display.on.consent.screen": "false", + "oauth2.device.authorization.grant.enabled": "true", + "backchannel.logout.revoke.offline.tokens": "false" + }, + 
"authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": true, + "nodeReRegistrationTimeout": -1, + "protocolMappers": [ + { + "id": "21f8be09-ffc5-4a26-855b-6be4ab297c67", + "name": "Create \"roles\" claim", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-client-role-mapper", + "consentRequired": false, + "config": { + "multivalued": "true", + "userinfo.token.claim": "true", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "roles", + "jsonType.label": "String", + "usermodel.clientRoleMapping.clientId": "osmo-device" + } + } + ], + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "basic", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "06a0fe4b-c247-4233-af67-78138bf5337a", + "clientId": "osmo-realm", + "name": "OSMO Realm", + "description": "", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": true, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": false, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "false", + "oidc.ciba.grant.enabled": "false", + "backchannel.logout.session.required": "true", + "post.logout.redirect.uris": "+", + "oauth2.device.authorization.grant.enabled": "false", + "backchannel.logout.revoke.offline.tokens": "false" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": false, + "nodeReRegistrationTimeout": 0, + "defaultClientScopes": [], + "optionalClientScopes": [] + }, + { + "id": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "clientId": "realm-management", + "name": "${client_realm-management}", + "surrogateAuthRequired": 
false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": true, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": false, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "true", + "post.logout.redirect.uris": "+" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": false, + "nodeReRegistrationTimeout": 0, + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "c70e9b76-96a2-41da-84da-df8b9e0d228d", + "clientId": "security-admin-console", + "name": "${client_security-admin-console}", + "rootUrl": "${authAdminUrl}", + "baseUrl": "/admin/osmo/console/", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [ + "/admin/osmo/console/*" + ], + "webOrigins": [ + "+" + ], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": true, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "false", + "client.use.lightweight.access.token.enabled": "true", + "post.logout.redirect.uris": "+", + "pkce.code.challenge.method": "S256" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": true, + "nodeReRegistrationTimeout": 0, + "protocolMappers": [ + { + "id": "e921764f-2d7f-4a08-833c-204801a096db", + "name": "locale", + "protocol": "openid-connect", + "protocolMapper": 
"oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "locale", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "locale", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + } + ], + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "basic", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + } + ], + "clientScopes": [ + { + "id": "e172a6de-ad7d-4cbd-be06-010d284b6806", + "name": "basic", + "description": "OpenID Connect scope for add all basic claims to the token", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "false", + "display.on.consent.screen": "false" + }, + "protocolMappers": [ + { + "id": "e67f2d9e-7cf0-4875-a72d-ce4a086adf7b", + "name": "auth_time", + "protocol": "openid-connect", + "protocolMapper": "oidc-usersessionmodel-note-mapper", + "consentRequired": false, + "config": { + "user.session.note": "AUTH_TIME", + "introspection.token.claim": "true", + "userinfo.token.claim": "true", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "auth_time", + "jsonType.label": "long" + } + }, + { + "id": "eba73e8f-7d13-46c7-9e6e-44e8839b1022", + "name": "sub", + "protocol": "openid-connect", + "protocolMapper": "oidc-sub-mapper", + "consentRequired": false, + "config": { + "access.token.claim": "true", + "introspection.token.claim": "true" + } + } + ] + }, + { + "id": "76307a43-d2c9-40df-a686-6c4c10e0f70d", + "name": "address", + "description": "OpenID Connect built-in scope: address", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "true", + "consent.screen.text": "${addressScopeConsentText}", + "display.on.consent.screen": "true" + }, + "protocolMappers": [ + { + "id": "32ac1e8f-3680-4c50-8bb4-7eed44c679b1", + "name": "address", + "protocol": "openid-connect", + "protocolMapper": "oidc-address-mapper", + 
"consentRequired": false, + "config": { + "user.attribute.formatted": "formatted", + "user.attribute.country": "country", + "user.attribute.postal_code": "postal_code", + "userinfo.token.claim": "true", + "user.attribute.street": "street", + "id.token.claim": "true", + "user.attribute.region": "region", + "access.token.claim": "true", + "user.attribute.locality": "locality" + } + } + ] + }, + { + "id": "67a444ee-3246-4878-a525-e0015e9b31cb", + "name": "offline_access", + "description": "OpenID Connect built-in scope: offline_access", + "protocol": "openid-connect", + "attributes": { + "consent.screen.text": "${offlineAccessScopeConsentText}", + "display.on.consent.screen": "true" + } + }, + { + "id": "1e8f098a-66fe-4df2-9547-47be0d040c53", + "name": "email", + "description": "OpenID Connect built-in scope: email", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "true", + "consent.screen.text": "${emailScopeConsentText}", + "display.on.consent.screen": "true" + }, + "protocolMappers": [ + { + "id": "00e95ac6-b825-4180-9558-4dffeac9584a", + "name": "email", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "email", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "email", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "9f5125d5-3b89-4f0f-a13e-b8fbb4d6afc1", + "name": "email verified", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-property-mapper", + "consentRequired": false, + "config": { + "user.attribute": "emailVerified", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "email_verified", + "jsonType.label": "boolean", + "userinfo.token.claim": "true" + } + } + ] + }, + { + "id": "988f9517-5cd2-4b66-90ba-3399d667d0f8", + "name": "role_list", + "description": "SAML role list", + "protocol": "saml", + "attributes": { + 
"consent.screen.text": "${samlRoleListScopeConsentText}", + "display.on.consent.screen": "true" + }, + "protocolMappers": [ + { + "id": "b78abf35-1108-40e2-a3c8-c6ea4200e817", + "name": "role list", + "protocol": "saml", + "protocolMapper": "saml-role-list-mapper", + "consentRequired": false, + "config": { + "single": "false", + "attribute.nameformat": "Basic", + "attribute.name": "Role" + } + } + ] + }, + { + "id": "f1dcc0f6-63be-4f85-a8cd-d43072e0eba4", + "name": "microprofile-jwt", + "description": "Microprofile - JWT built-in scope", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "true", + "display.on.consent.screen": "false" + }, + "protocolMappers": [ + { + "id": "bf488bdc-2622-45f0-95c2-df2d05fd3fab", + "name": "upn", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "username", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "upn", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "5aa8e8c1-f0d7-46c4-b2da-24aa9608da9f", + "name": "groups", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-realm-role-mapper", + "consentRequired": false, + "config": { + "multivalued": "true", + "userinfo.token.claim": "true", + "user.attribute": "foo", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "groups", + "jsonType.label": "String" + } + } + ] + }, + { + "id": "fe58e218-3aac-4780-8b5e-b61491cd457b", + "name": "profile", + "description": "OpenID Connect built-in scope: profile", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "true", + "consent.screen.text": "${profileScopeConsentText}", + "display.on.consent.screen": "true" + }, + "protocolMappers": [ + { + "id": "e0616aae-d3e0-4911-98b2-db72ad142938", + "name": "nickname", + "protocol": "openid-connect", + "protocolMapper": 
"oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "nickname", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "nickname", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "49cc1e1d-9401-4b57-b8a9-a37573f2eb06", + "name": "profile", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "profile", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "profile", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "e05eea05-f917-4ef3-a82f-501c82192bd6", + "name": "gender", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "gender", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "gender", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "89c031e1-bfad-4afd-af24-51db2c62a11f", + "name": "username", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "username", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "preferred_username", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "30d27d3e-3b72-49d1-a66f-0466b58dbf3b", + "name": "locale", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "locale", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "locale", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "9fc26d9e-c109-4b30-8ec2-2fc2d95b11d6", + "name": "picture", + "protocol": "openid-connect", + "protocolMapper": 
"oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "picture", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "picture", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "5c0dbd32-7a45-4dc9-9e4f-37570ebf5d38", + "name": "family name", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "lastName", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "family_name", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "2de0c290-124a-41be-b7d8-f61f63eed5ef", + "name": "full name", + "protocol": "openid-connect", + "protocolMapper": "oidc-full-name-mapper", + "consentRequired": false, + "config": { + "id.token.claim": "true", + "access.token.claim": "true", + "userinfo.token.claim": "true" + } + }, + { + "id": "369e67dd-fd5e-4d90-8d80-c945c7a0c049", + "name": "updated at", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "updatedAt", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "updated_at", + "jsonType.label": "long", + "userinfo.token.claim": "true" + } + }, + { + "id": "7557b943-11a1-42bb-a119-35e8da9fcb99", + "name": "birthdate", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "birthdate", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "birthdate", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "06359527-ce26-45f7-beba-7ccf5e71d6f5", + "name": "given name", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "firstName", + 
"id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "given_name", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "8f3bfe54-a74a-4eed-b2bd-4157fc574b57", + "name": "middle name", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "middleName", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "middle_name", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "a6cbf817-a0f5-483d-ae1e-c716d04e1645", + "name": "website", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "website", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "website", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "1322fc37-04e4-4e89-99d4-6c304ad36c96", + "name": "zoneinfo", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "zoneinfo", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "zoneinfo", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + } + ] + }, + { + "id": "6aec68b8-7178-449d-9ba6-b6e1c2a9be73", + "name": "service_account", + "description": "Specific scope for a client enabled for service accounts", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "false", + "display.on.consent.screen": "false" + }, + "protocolMappers": [ + { + "id": "91715642-086a-493b-8f01-5c64d408b7e3", + "name": "Client ID", + "protocol": "openid-connect", + "protocolMapper": "oidc-usersessionmodel-note-mapper", + "consentRequired": false, + "config": { + "user.session.note": "client_id", + "introspection.token.claim": "true", + "userinfo.token.claim": "true", + 
"id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "client_id", + "jsonType.label": "String" + } + }, + { + "id": "78dcf109-44bb-4aca-9540-a8896f26e864", + "name": "Client Host", + "protocol": "openid-connect", + "protocolMapper": "oidc-usersessionmodel-note-mapper", + "consentRequired": false, + "config": { + "user.session.note": "clientHost", + "introspection.token.claim": "true", + "userinfo.token.claim": "true", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "clientHost", + "jsonType.label": "String" + } + }, + { + "id": "e28a076d-9ee0-46ec-a2f0-a147bab66a09", + "name": "Client IP Address", + "protocol": "openid-connect", + "protocolMapper": "oidc-usersessionmodel-note-mapper", + "consentRequired": false, + "config": { + "user.session.note": "clientAddress", + "introspection.token.claim": "true", + "userinfo.token.claim": "true", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "clientAddress", + "jsonType.label": "String" + } + } + ] + }, + { + "id": "e728df12-1bff-418d-a68d-c2036d856db2", + "name": "roles", + "description": "OpenID Connect scope for add user roles to the access token", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "false", + "consent.screen.text": "${rolesScopeConsentText}", + "display.on.consent.screen": "true" + }, + "protocolMappers": [ + { + "id": "993f7f9d-55ba-4c1f-b84a-76e2c733bc94", + "name": "client roles", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-client-role-mapper", + "consentRequired": false, + "config": { + "user.attribute": "foo", + "access.token.claim": "true", + "claim.name": "resource_access.${client_id}.roles", + "jsonType.label": "String", + "multivalued": "true" + } + }, + { + "id": "f0b2b858-1cde-412b-a1c8-8ed3bd4e04d6", + "name": "realm roles", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-realm-role-mapper", + "consentRequired": false, + "config": { + 
"user.attribute": "foo", + "access.token.claim": "true", + "claim.name": "realm_access.roles", + "jsonType.label": "String", + "multivalued": "true" + } + }, + { + "id": "32ad3286-1486-4196-9232-533af4c10009", + "name": "audience resolve", + "protocol": "openid-connect", + "protocolMapper": "oidc-audience-resolve-mapper", + "consentRequired": false, + "config": {} + } + ] + }, + { + "id": "efee9fbd-1a06-41d4-94d1-16b59f8d9a68", + "name": "web-origins", + "description": "OpenID Connect scope for add allowed web origins to the access token", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "false", + "consent.screen.text": "", + "display.on.consent.screen": "false" + }, + "protocolMappers": [ + { + "id": "61110fbc-75c7-40cd-aca2-9b7a714b0b22", + "name": "allowed web origins", + "protocol": "openid-connect", + "protocolMapper": "oidc-allowed-origins-mapper", + "consentRequired": false, + "config": {} + } + ] + }, + { + "id": "4a0abefc-0423-403d-8383-10f989580c13", + "name": "phone", + "description": "OpenID Connect built-in scope: phone", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "true", + "consent.screen.text": "${phoneScopeConsentText}", + "display.on.consent.screen": "true" + }, + "protocolMappers": [ + { + "id": "acdce654-be20-4386-bd4f-edf2cd868f6b", + "name": "phone number", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "phoneNumber", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "phone_number", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "37082e43-4429-479d-bd80-7b8d11b17769", + "name": "phone number verified", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "phoneNumberVerified", + "id.token.claim": "true", + 
"access.token.claim": "true", + "claim.name": "phone_number_verified", + "jsonType.label": "boolean", + "userinfo.token.claim": "true" + } + } + ] + }, + { + "id": "1e5f680b-df5f-4d8c-b9c9-52b5445171ce", + "name": "acr", + "description": "OpenID Connect scope for add acr (authentication context class reference) to the token", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "false", + "display.on.consent.screen": "false" + }, + "protocolMappers": [ + { + "id": "590accb2-1b94-452e-bb20-51bc643fe860", + "name": "acr loa level", + "protocol": "openid-connect", + "protocolMapper": "oidc-acr-mapper", + "consentRequired": false, + "config": { + "id.token.claim": "true", + "access.token.claim": "true", + "userinfo.token.claim": "true" + } + } + ] + } + ], + "defaultDefaultClientScopes": [ + "role_list", + "profile", + "email", + "roles", + "web-origins", + "acr", + "basic" + ], + "defaultOptionalClientScopes": [ + "offline_access", + "address", + "phone", + "microprofile-jwt" + ], + "browserSecurityHeaders": { + "contentSecurityPolicyReportOnly": "", + "xContentTypeOptions": "nosniff", + "referrerPolicy": "no-referrer", + "xRobotsTag": "none", + "xFrameOptions": "SAMEORIGIN", + "contentSecurityPolicy": "frame-src 'self'; frame-ancestors 'self'; object-src 'none';", + "xXSSProtection": "1; mode=block", + "strictTransportSecurity": "max-age=31536000; includeSubDomains" + }, + "smtpServer": {}, + "eventsEnabled": false, + "eventsListeners": [ + "jboss-logging" + ], + "enabledEventTypes": [], + "adminEventsEnabled": false, + "adminEventsDetailsEnabled": false, + "identityProviders": [], + "identityProviderMappers": [], + "components": { + "org.keycloak.services.clientregistration.policy.ClientRegistrationPolicy": [ + { + "id": "76bd801e-c608-4338-8198-668c92446a35", + "name": "Full Scope Disabled", + "providerId": "scope", + "subType": "anonymous", + "subComponents": {}, + "config": {} + }, + { + "id": "06472a8f-7614-4022-b08e-62f023a5fe0a", + 
"name": "Allowed Client Scopes", + "providerId": "allowed-client-templates", + "subType": "anonymous", + "subComponents": {}, + "config": { + "allow-default-scopes": [ + "true" + ] + } + }, + { + "id": "3667ac91-1abf-4124-91e6-ffc803dc29aa", + "name": "Consent Required", + "providerId": "consent-required", + "subType": "anonymous", + "subComponents": {}, + "config": {} + }, + { + "id": "6e0c8a3f-b5f4-4a49-b44c-bde8ae314d89", + "name": "Max Clients Limit", + "providerId": "max-clients", + "subType": "anonymous", + "subComponents": {}, + "config": { + "max-clients": [ + "200" + ] + } + }, + { + "id": "62d78a88-78a2-4ea7-937b-9a062e946108", + "name": "Trusted Hosts", + "providerId": "trusted-hosts", + "subType": "anonymous", + "subComponents": {}, + "config": { + "host-sending-registration-request-must-match": [ + "true" + ], + "client-uris-must-match": [ + "true" + ] + } + }, + { + "id": "0ca9718d-bfca-4059-b7e8-e32ae3f70a7f", + "name": "Allowed Protocol Mapper Types", + "providerId": "allowed-protocol-mappers", + "subType": "authenticated", + "subComponents": {}, + "config": { + "allowed-protocol-mapper-types": [ + "oidc-address-mapper", + "saml-user-property-mapper", + "oidc-usermodel-attribute-mapper", + "oidc-usermodel-property-mapper", + "oidc-full-name-mapper", + "saml-role-list-mapper", + "saml-user-attribute-mapper", + "oidc-sha256-pairwise-sub-mapper" + ] + } + }, + { + "id": "9247c25c-ce3e-4858-8dda-b2c95b2f4d09", + "name": "Allowed Client Scopes", + "providerId": "allowed-client-templates", + "subType": "authenticated", + "subComponents": {}, + "config": { + "allow-default-scopes": [ + "true" + ] + } + }, + { + "id": "2d3e37a6-c167-4992-abf8-8cbe22f1bcb9", + "name": "Allowed Protocol Mapper Types", + "providerId": "allowed-protocol-mappers", + "subType": "anonymous", + "subComponents": {}, + "config": { + "allowed-protocol-mapper-types": [ + "saml-user-property-mapper", + "oidc-full-name-mapper", + "oidc-address-mapper", + "saml-role-list-mapper", + 
"oidc-usermodel-attribute-mapper", + "oidc-usermodel-property-mapper", + "oidc-sha256-pairwise-sub-mapper", + "saml-user-attribute-mapper" + ] + } + } + ], + "org.keycloak.userprofile.UserProfileProvider": [ + { + "id": "c12df2b1-cd7d-46b7-ba91-b4381a59f487", + "providerId": "declarative-user-profile", + "subComponents": {}, + "config": { + "kc.user.profile.config": [ + "{\"attributes\":[{\"name\":\"username\",\"displayName\":\"${username}\",\"validations\":{\"length\":{\"min\":3,\"max\":255},\"username-prohibited-characters\":{},\"up-username-not-idn-homograph\":{}},\"permissions\":{\"view\":[\"admin\",\"user\"],\"edit\":[\"admin\",\"user\"]},\"multivalued\":false},{\"name\":\"email\",\"displayName\":\"${email}\",\"validations\":{\"email\":{},\"length\":{\"max\":255}},\"required\":{\"roles\":[\"user\"]},\"permissions\":{\"view\":[\"admin\",\"user\"],\"edit\":[\"admin\",\"user\"]},\"multivalued\":false},{\"name\":\"firstName\",\"displayName\":\"${firstName}\",\"validations\":{\"length\":{\"max\":255},\"person-name-prohibited-characters\":{}},\"required\":{\"roles\":[\"user\"]},\"permissions\":{\"view\":[\"admin\",\"user\"],\"edit\":[\"admin\",\"user\"]},\"multivalued\":false},{\"name\":\"lastName\",\"displayName\":\"${lastName}\",\"validations\":{\"length\":{\"max\":255},\"person-name-prohibited-characters\":{}},\"required\":{\"roles\":[\"user\"]},\"permissions\":{\"view\":[\"admin\",\"user\"],\"edit\":[\"admin\",\"user\"]},\"multivalued\":false}],\"groups\":[{\"name\":\"user-metadata\",\"displayHeader\":\"User metadata\",\"displayDescription\":\"Attributes, which refer to user metadata\"}],\"unmanagedAttributePolicy\":\"ENABLED\"}" + ] + } + } + ], + "org.keycloak.keys.KeyProvider": [ + { + "id": "29577a17-9e8a-40cf-b804-cf36c2cf567c", + "name": "hmac-generated-hs512", + "providerId": "hmac-generated", + "subComponents": {}, + "config": { + "priority": [ + "100" + ], + "algorithm": [ + "HS512" + ] + } + }, + { + "id": "48051b03-e0a1-413d-af4a-d9c301f12662", + 
"name": "rsa-enc-generated", + "providerId": "rsa-enc-generated", + "subComponents": {}, + "config": { + "priority": [ + "100" + ], + "algorithm": [ + "RSA-OAEP" + ] + } + }, + { + "id": "04c1d0e1-6889-48d2-833a-449a2a9e6fe1", + "name": "hmac-generated", + "providerId": "hmac-generated", + "subComponents": {}, + "config": { + "priority": [ + "100" + ], + "algorithm": [ + "HS256" + ] + } + }, + { + "id": "500737be-f83b-4e67-954e-9e71ca7ed1b0", + "name": "rsa-generated", + "providerId": "rsa-generated", + "subComponents": {}, + "config": { + "priority": [ + "100" + ] + } + }, + { + "id": "7842aa88-a8fb-49a2-ac10-e437337e236a", + "name": "aes-generated", + "providerId": "aes-generated", + "subComponents": {}, + "config": { + "priority": [ + "100" + ] + } + } + ] + }, + "internationalizationEnabled": false, + "supportedLocales": [], + "authenticationFlows": [ + { + "id": "43f7c655-a9cd-4d53-8161-3b3d2008c126", + "alias": "Account verification options", + "description": "Method with which to verity the existing account", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "idp-email-verification", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "ALTERNATIVE", + "priority": 20, + "autheticatorFlow": true, + "flowAlias": "Verify Existing Account by Re-authentication", + "userSetupAllowed": false + } + ] + }, + { + "id": "0f5c2215-5f40-4509-bb6f-f28c9b743388", + "alias": "Browser - Conditional OTP", + "description": "Flow to determine if the OTP is required for the authentication", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "conditional-user-configured", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": 
false + }, + { + "authenticator": "auth-otp-form", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + }, + { + "id": "eb66c86a-efdc-4039-9153-cd4708f39ba7", + "alias": "Direct Grant - Conditional OTP", + "description": "Flow to determine if the OTP is required for the authentication", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "conditional-user-configured", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "direct-grant-validate-otp", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + }, + { + "id": "e68e679a-5fc1-427b-93c6-5657f3ff6eb1", + "alias": "First broker login - Conditional OTP", + "description": "Flow to determine if the OTP is required for the authentication", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "conditional-user-configured", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "auth-otp-form", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + }, + { + "id": "e4a832f6-bae3-41c6-8198-5c14c6ddf706", + "alias": "Handle Existing Account", + "description": "Handle what to do if there is existing account with same email/username like authenticated identity provider", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "idp-confirm-link", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + 
"autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": true, + "flowAlias": "Account verification options", + "userSetupAllowed": false + } + ] + }, + { + "id": "2bbaf432-1058-4ee4-a994-d87f1c224032", + "alias": "Reset - Conditional OTP", + "description": "Flow to determine if the OTP should be reset or not. Set to REQUIRED to force.", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "conditional-user-configured", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "reset-otp", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + }, + { + "id": "352782b8-ddae-4ddc-af19-86a2900ef1f9", + "alias": "User creation or linking", + "description": "Flow for the existing/non-existing user alternatives", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticatorConfig": "create unique user config", + "authenticator": "idp-create-user-if-unique", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "ALTERNATIVE", + "priority": 20, + "autheticatorFlow": true, + "flowAlias": "Handle Existing Account", + "userSetupAllowed": false + } + ] + }, + { + "id": "fdc0ecfb-67f8-4390-85a0-50ecfdc66800", + "alias": "Verify Existing Account by Re-authentication", + "description": "Reauthentication of existing account", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "idp-username-password-form", + "authenticatorFlow": false, + 
"requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "CONDITIONAL", + "priority": 20, + "autheticatorFlow": true, + "flowAlias": "First broker login - Conditional OTP", + "userSetupAllowed": false + } + ] + }, + { + "id": "a656206c-59b9-47cf-8880-c0f04f04a0c3", + "alias": "browser", + "description": "browser based authentication", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "auth-cookie", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "auth-spnego", + "authenticatorFlow": false, + "requirement": "DISABLED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "identity-provider-redirector", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 25, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "ALTERNATIVE", + "priority": 30, + "autheticatorFlow": true, + "flowAlias": "forms", + "userSetupAllowed": false + } + ] + }, + { + "id": "7616793a-19e4-4d97-b7ae-ab962acaf444", + "alias": "clients", + "description": "Base authentication for clients", + "providerId": "client-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "client-secret", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "client-jwt", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "client-secret-jwt", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 30, + 
"autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "client-x509", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 40, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + }, + { + "id": "1f5446d7-d5de-47fb-8e15-347105d3d062", + "alias": "direct grant", + "description": "OpenID Connect Resource Owner Grant", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "direct-grant-validate-username", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "direct-grant-validate-password", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "CONDITIONAL", + "priority": 30, + "autheticatorFlow": true, + "flowAlias": "Direct Grant - Conditional OTP", + "userSetupAllowed": false + } + ] + }, + { + "id": "a55463dd-3ced-4102-a263-c121db059379", + "alias": "docker auth", + "description": "Used by Docker clients to authenticate against the IDP", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "docker-http-basic-authenticator", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + }, + { + "id": "646a12ee-99e7-41cd-a1ea-3ed5e5a96dcf", + "alias": "first broker login", + "description": "Actions taken after first broker login with identity provider account, which is not yet linked to any Keycloak account", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticatorConfig": "review profile config", + "authenticator": "idp-review-profile", + "authenticatorFlow": false, + 
"requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": true, + "flowAlias": "User creation or linking", + "userSetupAllowed": false + } + ] + }, + { + "id": "03f283e4-7b80-4b38-b90d-33ba8b0a07c3", + "alias": "forms", + "description": "Username, password, otp and other auth forms.", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "auth-username-password-form", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "CONDITIONAL", + "priority": 20, + "autheticatorFlow": true, + "flowAlias": "Browser - Conditional OTP", + "userSetupAllowed": false + } + ] + }, + { + "id": "047f04f4-b2c9-4aa9-bc38-4ed2c17d3e2c", + "alias": "registration", + "description": "registration flow", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "registration-page-form", + "authenticatorFlow": true, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": true, + "flowAlias": "registration form", + "userSetupAllowed": false + } + ] + }, + { + "id": "51cfacd6-9ee8-4fb2-a3fe-9e00246d9877", + "alias": "registration form", + "description": "registration form", + "providerId": "form-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "registration-user-creation", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "registration-password-action", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 50, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + 
"authenticator": "registration-recaptcha-action", + "authenticatorFlow": false, + "requirement": "DISABLED", + "priority": 60, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + }, + { + "id": "28bb511d-c4ea-4bb8-805c-086eeaf7b239", + "alias": "reset credentials", + "description": "Reset credentials for a user if they forgot their password or something", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "reset-credentials-choose-user", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "reset-credential-email", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "reset-password", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 30, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "CONDITIONAL", + "priority": 40, + "autheticatorFlow": true, + "flowAlias": "Reset - Conditional OTP", + "userSetupAllowed": false + } + ] + }, + { + "id": "d0189a78-5979-47ce-8536-32c8f6dec1b6", + "alias": "saml ecp", + "description": "SAML ECP Profile Authentication Flow", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "http-basic-authenticator", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + } + ], + "authenticatorConfig": [ + { + "id": "09fd7502-4e05-437f-865a-221fa1297e67", + "alias": "create unique user config", + "config": { + "require.password.update.after.registration": "false" + } + }, + { + "id": "9abca294-1e03-418f-841c-18b00053f949", + "alias": "review profile config", + "config": { + 
"update.profile.on.first.login": "missing" + } + } + ], + "requiredActions": [ + { + "alias": "CONFIGURE_TOTP", + "name": "Configure OTP", + "providerId": "CONFIGURE_TOTP", + "enabled": true, + "defaultAction": false, + "priority": 10, + "config": {} + }, + { + "alias": "TERMS_AND_CONDITIONS", + "name": "Terms and Conditions", + "providerId": "TERMS_AND_CONDITIONS", + "enabled": false, + "defaultAction": false, + "priority": 20, + "config": {} + }, + { + "alias": "UPDATE_PASSWORD", + "name": "Update Password", + "providerId": "UPDATE_PASSWORD", + "enabled": true, + "defaultAction": false, + "priority": 30, + "config": {} + }, + { + "alias": "UPDATE_PROFILE", + "name": "Update Profile", + "providerId": "UPDATE_PROFILE", + "enabled": true, + "defaultAction": false, + "priority": 40, + "config": {} + }, + { + "alias": "VERIFY_EMAIL", + "name": "Verify Email", + "providerId": "VERIFY_EMAIL", + "enabled": true, + "defaultAction": false, + "priority": 50, + "config": {} + }, + { + "alias": "delete_account", + "name": "Delete Account", + "providerId": "delete_account", + "enabled": false, + "defaultAction": false, + "priority": 60, + "config": {} + }, + { + "alias": "webauthn-register", + "name": "Webauthn Register", + "providerId": "webauthn-register", + "enabled": true, + "defaultAction": false, + "priority": 70, + "config": {} + }, + { + "alias": "webauthn-register-passwordless", + "name": "Webauthn Register Passwordless", + "providerId": "webauthn-register-passwordless", + "enabled": true, + "defaultAction": false, + "priority": 80, + "config": {} + }, + { + "alias": "delete_credential", + "name": "Delete Credential", + "providerId": "delete_credential", + "enabled": true, + "defaultAction": false, + "priority": 100, + "config": {} + }, + { + "alias": "update_user_locale", + "name": "Update User Locale", + "providerId": "update_user_locale", + "enabled": true, + "defaultAction": false, + "priority": 1000, + "config": {} + } + ], + "browserFlow": "browser", + 
"registrationFlow": "registration", + "directGrantFlow": "direct grant", + "resetCredentialsFlow": "reset credentials", + "clientAuthenticationFlow": "clients", + "dockerAuthenticationFlow": "docker auth", + "firstBrokerLoginFlow": "first broker login", + "attributes": { + "cibaBackchannelTokenDeliveryMode": "poll", + "cibaExpiresIn": "120", + "cibaAuthRequestedUserHint": "login_hint", + "oauth2DeviceCodeLifespan": "600", + "clientOfflineSessionMaxLifespan": "0", + "oauth2DevicePollingInterval": "5", + "clientSessionIdleTimeout": "0", + "parRequestUriLifespan": "60", + "clientSessionMaxLifespan": "0", + "clientOfflineSessionIdleTimeout": "0", + "cibaInterval": "5", + "realmReusableOtpCode": "false" + }, + "keycloakVersion": "26.1.1", + "userManagedAccessAllowed": false, + "organizationsEnabled": false, + "verifiableCredentialsEnabled": false, + "adminPermissionsEnabled": false, + "clientProfiles": { + "profiles": [] + }, + "clientPolicies": { + "policies": [] + } +} diff --git a/applications/osmo/deploy/example/002-setup/shm_pod_template.json b/applications/osmo/deploy/example/002-setup/shm_pod_template.json new file mode 100644 index 000000000..c7876c5db --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/shm_pod_template.json @@ -0,0 +1,21 @@ +{ + "configs": { + "spec": { + "containers": [{ + "name": "{{USER_CONTAINER_NAME}}", + "volumeMounts": [{ + "name": "shm", + "mountPath": "/dev/shm" + }] + }], + "volumes": [{ + "name": "shm", + "emptyDir": { + "medium": "Memory", + "sizeLimit": "64Gi" + } + }] + } + }, + "description": "Add shared memory volume for IPC (PyTorch, vLLM, TensorRT, etc.)" +} diff --git a/applications/osmo/deploy/example/002-setup/values/gpu-operator.yaml b/applications/osmo/deploy/example/002-setup/values/gpu-operator.yaml new file mode 100755 index 000000000..11cc02fdf --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/values/gpu-operator.yaml @@ -0,0 +1,57 @@ +# GPU Operator Helm Values +# 
https://docs.nvidia.com/datacenter/cloud-native/gpu-operator +# https://docs.nebius.com/kubernetes/gpu/set-up + +operator: + defaultRuntime: containerd + +# Enable driver installation by GPU Operator +# Even though Nebius nodes may have pre-installed drivers, the GPU Operator +# needs to manage the driver lifecycle for proper integration with device-plugin, +# toolkit, and other components. +driver: + enabled: true + # Let GPU Operator choose the appropriate driver version + # version: auto-detected by operator + upgradePolicy: + autoUpgrade: false # Don't auto-upgrade to avoid conflicts + +toolkit: + enabled: true + +devicePlugin: + enabled: true + config: + default: "any" + +dcgm: + enabled: true + +dcgmExporter: + enabled: true + serviceMonitor: + enabled: true + +gfd: + enabled: true + +migManager: + enabled: false + +nodeStatusExporter: + enabled: true + +# Node selector for GPU operator pods +node-feature-discovery: + worker: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + +# Tolerations for GPU workloads +daemonsets: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/applications/osmo/deploy/example/002-setup/values/grafana.yaml b/applications/osmo/deploy/example/002-setup/values/grafana.yaml new file mode 100755 index 000000000..ab8dd6b6b --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/values/grafana.yaml @@ -0,0 +1,70 @@ +# Grafana Helm Values (standalone) +# https://github.com/grafana/helm-charts/tree/main/charts/grafana + +# Note: Grafana is typically deployed as part of kube-prometheus-stack +# This file is for standalone Grafana deployment if needed + +replicas: 1 + +adminUser: admin +# adminPassword should be set via --set or secret + +persistence: + enabled: true + size: 10Gi + storageClassName: "" + +resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + +# Datasources +datasources: + datasources.yaml: + apiVersion: 1 + 
datasources: + - name: Prometheus + type: prometheus + url: http://prometheus-kube-prometheus-prometheus:9090 + access: proxy + isDefault: true + - name: Loki + type: loki + url: http://loki:3100 + access: proxy + +# Dashboard providers +dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/default + +# Sidecar for dashboards +sidecar: + dashboards: + enabled: true + label: grafana_dashboard + datasources: + enabled: true + label: grafana_datasource + +# Service +service: + type: ClusterIP + port: 80 + +# Ingress (disabled by default) +ingress: + enabled: false diff --git a/applications/osmo/deploy/example/002-setup/values/kai-scheduler.yaml b/applications/osmo/deploy/example/002-setup/values/kai-scheduler.yaml new file mode 100755 index 000000000..320c867db --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/values/kai-scheduler.yaml @@ -0,0 +1,13 @@ +# KAI Scheduler Helm Values +# GPU-aware scheduler for OSMO +# https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/dependencies/dependencies.html + +global: + # Modify the node selectors and tolerations to match your cluster + nodeSelector: {} + tolerations: [] + +scheduler: + additionalArgs: + - --default-staleness-grace-period=-1s # Disable staleness eviction + - --update-pod-eviction-condition=true # Enable OSMO to read preemption conditions diff --git a/applications/osmo/deploy/example/002-setup/values/loki.yaml b/applications/osmo/deploy/example/002-setup/values/loki.yaml new file mode 100755 index 000000000..f4c277a22 --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/values/loki.yaml @@ -0,0 +1,68 @@ +# Loki Stack Helm Values +# https://github.com/grafana/helm-charts/tree/main/charts/loki-stack + +loki: + enabled: true + + persistence: + enabled: true + size: 50Gi + + config: + auth_enabled: false + + 
server: + http_listen_port: 3100 + + ingester: + lifecycler: + ring: + kvstore: + store: inmemory + replication_factor: 1 + chunk_idle_period: 15m + chunk_retain_period: 30s + + schema_config: + configs: + - from: 2020-01-01 + store: boltdb-shipper + object_store: filesystem + schema: v11 + index: + prefix: index_ + period: 24h + + storage_config: + boltdb_shipper: + active_index_directory: /data/loki/boltdb-shipper-active + cache_location: /data/loki/boltdb-shipper-cache + shared_store: filesystem + filesystem: + directory: /data/loki/chunks + + limits_config: + enforce_metric_name: false + reject_old_samples: true + reject_old_samples_max_age: 168h + max_entries_limit_per_query: 5000 + + table_manager: + retention_deletes_enabled: true + retention_period: 168h # 7 days + + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 1Gi + +# Promtail is deployed separately +promtail: + enabled: false + +# Grafana is deployed via kube-prometheus-stack +grafana: + enabled: false diff --git a/applications/osmo/deploy/example/002-setup/values/network-operator.yaml b/applications/osmo/deploy/example/002-setup/values/network-operator.yaml new file mode 100755 index 000000000..146a9daca --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/values/network-operator.yaml @@ -0,0 +1,62 @@ +# Network Operator Helm Values +# https://docs.nvidia.com/networking/display/cokan10/network+operator + +# Operator settings +operator: + nodeSelector: + node-role.kubernetes.io/control-plane: "" + tolerations: + - key: node-role.kubernetes.io/master + operator: Exists + effect: NoSchedule + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + +# RDMA shared device plugin (for InfiniBand) +rdmaSharedDevicePlugin: + deploy: true + resources: + - name: rdma_shared_device_a + vendors: [15b3] + deviceIDs: [101b, 101d, 1017, 1019] + ifNames: ["*"] + +# SR-IOV device plugin +sriovDevicePlugin: + deploy: false + +# NIC cluster 
policy +nicClusterPolicy: + deploy: true + + # RDMA + rdmaSharedDevicePlugin: + image: k8s-rdma-shared-dev-plugin + repository: ghcr.io/mellanox + version: sha-4f3eb55 + +# Secondary network +secondaryNetwork: + deploy: true + + # Multus CNI + multus: + deploy: true + image: multus-cni + repository: ghcr.io/k8snetworkplumbingwg + version: v3.9.3 + + # CNI plugins + cniPlugins: + deploy: true + image: plugins + repository: ghcr.io/k8snetworkplumbingwg + version: v1.3.0 + + # IPAM plugin + ipamPlugin: + deploy: true + image: whereabouts + repository: ghcr.io/k8snetworkplumbingwg + version: v0.6.2 diff --git a/applications/osmo/deploy/example/002-setup/values/osmo-backend-operator.yaml b/applications/osmo/deploy/example/002-setup/values/osmo-backend-operator.yaml new file mode 100755 index 000000000..b4781ae21 --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/values/osmo-backend-operator.yaml @@ -0,0 +1,37 @@ +# OSMO Backend Operator Values +# https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/deploy_backend.html + +global: + # REQUIRED: OSMO image tag (e.g., 6.0.0) + osmoImageTag: "6.0.0" + + # REQUIRED: Your OSMO service URL + serviceUrl: "https://osmo.example.com" + + # Namespaces + agentNamespace: "osmo-operator" + backendNamespace: "osmo-workflows" + + # REQUIRED: Unique name for this backend + backendName: "nebius-backend" + + # Authentication + accountTokenSecret: "osmo-operator-token" + loginMethod: "token" + + # Resource configuration + services: + backendListener: + resources: + requests: + cpu: "1" + memory: "1Gi" + limits: + memory: "1Gi" + backendWorker: + resources: + requests: + cpu: "1" + memory: "1Gi" + limits: + memory: "1Gi" diff --git a/applications/osmo/deploy/example/002-setup/values/prometheus.yaml b/applications/osmo/deploy/example/002-setup/values/prometheus.yaml new file mode 100755 index 000000000..12cc634d9 --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/values/prometheus.yaml @@ -0,0 +1,109 @@ 
+# Prometheus Stack Helm Values +# https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack + +# Prometheus +prometheus: + prometheusSpec: + # Some CRDs require this to be >= 60 + maximumStartupDurationSeconds: 60 + retention: 15d + + resources: + requests: + cpu: 500m + memory: 2Gi + limits: + cpu: 2000m + memory: 8Gi + + storageSpec: + volumeClaimTemplate: + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 50Gi + + # Service monitors + serviceMonitorSelectorNilUsesHelmValues: false + podMonitorSelectorNilUsesHelmValues: false + +# Grafana +grafana: + enabled: true + + adminUser: admin + # adminPassword is set via --set flag + + persistence: + enabled: true + size: 10Gi + + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + + # Additional datasources + additionalDataSources: + - name: Loki + type: loki + url: http://loki:3100 + access: proxy + isDefault: false + + # Dashboards + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/default + + # GPU dashboard + dashboards: + default: + nvidia-dcgm: + gnetId: 12239 + revision: 2 + datasource: Prometheus + +# Alertmanager +alertmanager: + enabled: true + + alertmanagerSpec: + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + +# Node exporter +nodeExporter: + enabled: true + +# Kube state metrics +kubeStateMetrics: + enabled: true + +# Prometheus operator +prometheusOperator: + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi diff --git a/applications/osmo/deploy/example/002-setup/values/promtail.yaml b/applications/osmo/deploy/example/002-setup/values/promtail.yaml new file mode 100755 index 000000000..601d29e57 --- /dev/null +++ 
b/applications/osmo/deploy/example/002-setup/values/promtail.yaml @@ -0,0 +1,46 @@ +# Promtail Helm Values +# https://github.com/grafana/helm-charts/tree/main/charts/promtail + +config: + clients: + - url: http://loki:3100/loki/api/v1/push + + snippets: + pipelineStages: + - cri: {} + - json: + expressions: + level: level + message: msg + - labels: + level: + - output: + source: message + +# Resources +resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + +# Tolerations to run on all nodes +tolerations: + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + +# Volume mounts (for containerd logs if needed) +# Note: The default chart already mounts /var/lib/docker and /var/log +# Only add extra volumes if you need additional paths +extraVolumes: [] +extraVolumeMounts: [] + +# Service monitor +serviceMonitor: + enabled: true diff --git a/applications/osmo/deploy/example/README.md b/applications/osmo/deploy/example/README.md new file mode 100755 index 000000000..aeacc9336 --- /dev/null +++ b/applications/osmo/deploy/example/README.md @@ -0,0 +1,176 @@ +# Deployment Guide + +This directory contains all deployment artifacts for OSMO on Nebius. + +## Deployment Phases + +### Phase 0: Prerequisites (`000-prerequisites/`) + +Install required tools and configure your Nebius environment. + +```bash +cd 000-prerequisites + +# Install required tools (Terraform, kubectl, Helm, Nebius CLI) +./install-tools.sh + +# Check if tools are installed +./install-tools.sh --check + +# Configure Nebius environment +source ./nebius-env-init.sh + +# (Recommended) Initialize secrets in MysteryBox +source ./secrets-init.sh +``` + +### Phase 1: Infrastructure (`001-iac/`) + +Deploy cloud infrastructure using Terraform. 
+ +```bash +cd 001-iac + +# Recommended: Cost-optimized with secure private access +cp terraform.tfvars.cost-optimized-secure.example terraform.tfvars + +# Other options: +# terraform.tfvars.cost-optimized.example - Cheapest (public endpoints) +# terraform.tfvars.production.example - Full production setup +# terraform.tfvars.secure.example - H100 with WireGuard + +# Edit configuration +vim terraform.tfvars + +# Deploy +terraform init +terraform plan +terraform apply +``` + +**Resources Created:** +- VPC Network and Subnet +- Managed Kubernetes Cluster +- CPU and GPU Node Groups +- Managed PostgreSQL +- Object Storage Buckets +- Filestore (Shared Filesystem) +- Container Registry +- Service Accounts +- WireGuard VPN (optional) + +### Phase 2: Kubernetes Setup (`002-setup/`) + +Configure Kubernetes with GPU infrastructure and OSMO. + +```bash +cd 002-setup + +# 1. Deploy GPU infrastructure +./01-deploy-gpu-infrastructure.sh + +# 2. Deploy observability stack +./02-deploy-observability.sh + +# 3. Deploy NGINX Ingress Controller +./03-deploy-nginx-ingress.sh + +# 4. Enable TLS (optional, recommended – set up DNS A record first) +./04-enable-tls.sh + +# 5. Deploy OSMO control plane +./05-deploy-osmo-control-plane.sh + +# 6. 
Deploy OSMO backend +./06-deploy-osmo-backend.sh +``` + +## Directory Structure + +``` +deploy/ +├── 000-prerequisites/ +│ ├── install-tools.sh # Tool installer +│ ├── nebius-env-init.sh # Environment setup +│ ├── secrets-init.sh # MysteryBox secrets setup +│ ├── wireguard-client-setup.sh # WireGuard client config +│ └── README.md +├── 001-iac/ +│ ├── modules/ +│ │ ├── platform/ # VPC, Storage, DB, Registry +│ │ ├── k8s/ # Kubernetes cluster +│ │ └── wireguard/ # VPN infrastructure +│ ├── main.tf # Root module +│ ├── variables.tf # Input variables +│ ├── outputs.tf # Output values +│ ├── versions.tf # Provider versions +│ ├── terraform.tfvars.*.example +│ └── README.md +└── 002-setup/ + ├── lib/ + │ └── common.sh # Shared functions + ├── values/ # Helm values files + ├── 01-deploy-gpu-infrastructure.sh + ├── 02-deploy-observability.sh + ├── 03-deploy-nginx-ingress.sh + ├── 04-enable-tls.sh + ├── 05-deploy-osmo-control-plane.sh + ├── 06-deploy-osmo-backend.sh + ├── cleanup/ # Uninstall scripts + └── README.md +``` + +## Configuration Files + +| File | Purpose | Recommended | +|------|---------|-------------| +| `terraform.tfvars.cost-optimized-secure.example` | Cheap + secure (L40S + VPN) | **Recommended** | +| `terraform.tfvars.cost-optimized.example` | Cheapest (L40S, public) | Dev only | +| `terraform.tfvars.production.example` | Full production (H200 + VPN) | Production | +| `terraform.tfvars.secure.example` | H100 + VPN | Staging | + +## Environment Variables + +After running `nebius-env-init.sh`, these variables are set: + +| Variable | Description | +|----------|-------------| +| `NEBIUS_TENANT_ID` | Your Nebius tenant ID | +| `NEBIUS_PROJECT_ID` | Your Nebius project ID | +| `NEBIUS_REGION` | Deployment region | +| `TF_VAR_tenant_id` | Terraform variable for tenant | +| `TF_VAR_parent_id` | Terraform variable for project | +| `TF_VAR_region` | Terraform variable for region | + +## Cleanup + +To remove all deployed resources: + +```bash +# 1. 
Remove Kubernetes components +cd 002-setup/cleanup +./uninstall-osmo-backend.sh +./uninstall-osmo-control-plane.sh +./uninstall-observability.sh +./uninstall-gpu-infrastructure.sh + +# 2. Destroy infrastructure +cd ../../001-iac +terraform destroy +``` + +## Troubleshooting + +### Terraform Errors + +1. **Authentication failed**: Run `source ../000-prerequisites/nebius-env-init.sh` +2. **Resource quota exceeded**: Check Nebius console for quota limits +3. **Invalid region**: Verify region supports required GPU types + +### Kubernetes Errors + +1. **Nodes not ready**: Check GPU operator pod logs +2. **Pods pending**: Verify node group scaling +3. **Network issues**: Check Cilium pod status + +See [Troubleshooting Guide](../docs/troubleshooting.md) for more details. diff --git a/applications/osmo/workflows/README.md b/applications/osmo/workflows/README.md new file mode 100755 index 000000000..c5a9c2c63 --- /dev/null +++ b/applications/osmo/workflows/README.md @@ -0,0 +1,156 @@ +# Workflow Templates + +OSMO workflow templates for training jobs on Nebius. + +## Available Workflows + +| File | Description | GPUs | +|------|-------------|------| +| `osmo/hello_nebius.yaml` | Hello World example with GPU | 1 | +| `osmo/gpu_test.yaml` | GPU validation test | 1 | +| `osmo/train.yaml` | Single GPU training | 1 | +| `osmo/train-multi-gpu.yaml` | Multi-GPU distributed training | 8 | + +## Quick Start + +### Test CPU Workflow + +```bash +osmo workflow submit osmo/hello_nebius.yaml +``` + +This workflow runs on a GPU node and prints "Hello Nebius!". + +### Test GPU Access + +```bash +osmo workflow submit osmo/gpu_test.yaml +``` + +This workflow validates GPU availability by running `nvidia-smi` on a Nebius L40S node. + +> **Note**: GPU workflows require the GPU platform to be configured. See [Configure OSMO GPU Platform](../deploy/002-setup/README.md#configure-osmo-gpu-platform). 
+ +## Usage + +### Submit via Script + +```bash +cd ../scripts +./submit-osmo-training.sh -w ../workflows/osmo/train.yaml +``` + +### Submit Directly + +```bash +# Single GPU +kubectl apply -f osmo/train.yaml + +# Multi-GPU +kubectl apply -f osmo/train-multi-gpu.yaml +``` + +## Workflow Structure + +### Single GPU (`train.yaml`) + +Best for: +- Development and debugging +- Small models +- Inference testing + +Resources: +- 1 GPU +- 64 GB memory +- 8 vCPUs + +### Multi-GPU (`train-multi-gpu.yaml`) + +Best for: +- Large model training +- Distributed training +- Production workloads + +Resources: +- 8 GPUs +- 1400 GB memory +- 120 vCPUs + +Features: +- InfiniBand for NCCL +- Shared memory for GPU communication +- Node affinity for GPU cluster + +## Customization + +### Change Training Image + +```yaml +containers: + - name: training + image: your-registry/your-image:tag +``` + +### Add Training Data + +```yaml +volumeMounts: + - name: shared-data + mountPath: /data +``` + +### Configure Environment + +```yaml +env: + - name: LEARNING_RATE + value: "0.001" + - name: BATCH_SIZE + value: "32" +``` + +### Add GPU Resources + +```yaml +resources: + limits: + nvidia.com/gpu: 8 +``` + +## Environment Variables + +### NCCL Configuration + +| Variable | Description | Default | +|----------|-------------|---------| +| `NCCL_DEBUG` | Debug level (INFO, WARN) | INFO | +| `NCCL_IB_DISABLE` | Disable InfiniBand (0/1) | 0 | +| `NCCL_NET_GDR_LEVEL` | GPUDirect RDMA level | 5 | + +### PyTorch Distributed + +| Variable | Description | +|----------|-------------| +| `MASTER_ADDR` | Master node address | +| `MASTER_PORT` | Master node port | +| `WORLD_SIZE` | Total number of processes | +| `RANK` | Process rank | + +## Monitoring + +### View Job Status + +```bash +kubectl get jobs -n osmo +kubectl get pods -n osmo -l app=osmo-training +``` + +### View Logs + +```bash +kubectl logs -n osmo -l job-name= -f +``` + +### GPU Metrics + +Access Grafana dashboard for GPU utilization metrics. 
diff --git a/applications/osmo/workflows/osmo/gpu_test.yaml b/applications/osmo/workflows/osmo/gpu_test.yaml new file mode 100755 index 000000000..1075d4a78 --- /dev/null +++ b/applications/osmo/workflows/osmo/gpu_test.yaml @@ -0,0 +1,56 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +# GPU Test Workflow for Nebius +# Validates GPU availability and CUDA functionality on Nebius L40S nodes +# +# Submit with: +# osmo workflow submit workflows/osmo/gpu_test.yaml +# +# Or via curl (with port-forward to osmo-service:8080): +# curl -X POST http://localhost:8080/api/workflow -H "Content-Type: application/yaml" --data-binary @workflows/osmo/gpu_test.yaml + +workflow: + name: gpu-test-nebius + resources: + gpu-resource: + platform: gpu + gpu: 1 + memory: 4Gi + cpu: 2 + tasks: + - name: check-gpu + image: nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04 + command: ["/bin/bash", "-c"] + args: + - | + echo "=== GPU Test on Nebius ===" + echo "" + echo "=== nvidia-smi output ===" + nvidia-smi + echo "" + echo "=== CUDA Version ===" + nvcc --version 2>/dev/null || echo "nvcc not available (base image)" + echo "" + echo "=== GPU Memory Info ===" + nvidia-smi --query-gpu=name,memory.total,memory.free,memory.used --format=csv + echo "" + echo "=== Environment ===" + echo "CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES:-not set}" + echo 
"NVIDIA_VISIBLE_DEVICES: ${NVIDIA_VISIBLE_DEVICES:-not set}" + echo "" + echo "=== GPU Test Complete ===" + resource: gpu-resource diff --git a/applications/osmo/workflows/osmo/hello_nebius.yaml b/applications/osmo/workflows/osmo/hello_nebius.yaml new file mode 100755 index 000000000..44a677d10 --- /dev/null +++ b/applications/osmo/workflows/osmo/hello_nebius.yaml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +workflow: + name: hello-nebius + resources: + gpu-resource: + platform: gpu + gpu: 1 + memory: 2Gi + storage: 2Gi + tasks: + - name: hello + image: ubuntu:24.04 + command: ["echo"] + args: ["Hello Nebius!"] + resource: gpu-resource diff --git a/applications/osmo/workflows/osmo/test_bucket_write.yaml b/applications/osmo/workflows/osmo/test_bucket_write.yaml new file mode 100755 index 000000000..05ff9456d --- /dev/null +++ b/applications/osmo/workflows/osmo/test_bucket_write.yaml @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Test workflow: writes test.txt and uploads it to the default dataset bucket. +# Use after configuring the Nebius bucket as the default (10-configure-dataset-bucket.sh). +# Submit: osmo workflow submit workflows/osmo/test_bucket_write.yaml + +workflow: + name: test-bucket-write4 + resources: + default: + platform: gpu + gpu: 1 + cpu: 2 + memory: 2Gi + storage: 1Gi + tasks: + - name: write-test-file + image: ubuntu:24.04 + command: ["bash", "-c"] + args: + - | + echo "OSMO default bucket test at MEOW $(date -Iseconds)" > {{output}}/test.txt + echo "Wrote test.txt to task output (will be uploaded to default bucket)" + cat {{output}}/test.txt + echo "Spinning for 10 seconds before stopping..." + sleep 10 + echo "Done." + outputs: + - dataset: + name: datasetv004 \ No newline at end of file diff --git a/applications/osmo/workflows/osmo/test_gpu_driver.yaml b/applications/osmo/workflows/osmo/test_gpu_driver.yaml new file mode 100644 index 000000000..a150e91a5 --- /dev/null +++ b/applications/osmo/workflows/osmo/test_gpu_driver.yaml @@ -0,0 +1,53 @@ +# GPU Driver Version Test Workflow +# Validates that the correct NVIDIA driver is installed (580.95.05) +# and that CUDA is functional on H100 GPUs. 
+# +# Submit with: +# osmo workflow submit workflows/osmo/test_gpu_driver.yaml + +workflow: + name: test-gpu-driver + resources: + gpu-resource: + platform: H100 + gpu: 1 + cpu: 2 + memory: 4Gi + tasks: + - name: check-driver + image: nvidia/cuda:13.0.2-base-ubuntu24.04 + command: ["bash", "-c"] + args: + - | + echo "=== GPU Driver Verification ===" + echo "" + EXPECTED="580.95.05" + + echo "--- Driver version ---" + DRIVER=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1 | tr -d '[:space:]') + echo "Installed: ${DRIVER}" + echo "Expected: ${EXPECTED}" + if [ "$DRIVER" = "$EXPECTED" ]; then + echo "PASS: driver version matches" + else + echo "FAIL: driver version mismatch" + exit 1 + fi + echo "" + + echo "--- GPU info ---" + nvidia-smi --query-gpu=name,memory.total,pci.bus_id --format=csv + echo "" + + echo "--- All GPUs visible ---" + GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) + echo "GPU count: ${GPU_COUNT}" + echo "" + + echo "--- CUDA compute test ---" + # Simple CUDA validation via nvidia-smi + nvidia-smi -q | grep -E "CUDA Version|Product Name|Driver Version" + echo "" + + echo "=== GPU Driver Verification Complete ===" + resource: gpu-resource diff --git a/applications/osmo/workflows/osmo/test_mnt_data.yaml b/applications/osmo/workflows/osmo/test_mnt_data.yaml new file mode 100644 index 000000000..a78e0fbb5 --- /dev/null +++ b/applications/osmo/workflows/osmo/test_mnt_data.yaml @@ -0,0 +1,88 @@ +# /mnt/data Filestore Test Workflow +# Validates that the Nebius Filestore is mounted at /mnt/data and measures I/O speed. +# Uses the H100 platform (with shm template). 
+# +# Submit with: +# osmo workflow submit workflows/osmo/test_mnt_data.yaml + +workflow: + name: test-mnt-data + resources: + gpu-resource: + platform: gpu + gpu: 1 + cpu: 4 + memory: 4Gi + storage: 1Gi + tasks: + - name: check-mnt-data + image: ubuntu:24.04 + command: ["bash", "-c"] + args: + - | + echo "=== /mnt/data Filestore Test ===" + echo "" + + # Check mount + echo "--- Mount check ---" + if mountpoint -q /mnt/data 2>/dev/null || df /mnt/data 2>/dev/null | grep -q /mnt/data; then + echo "PASS: /mnt/data is mounted" + else + echo "FAIL: /mnt/data is NOT mounted" + exit 1 + fi + echo "" + + echo "--- Filesystem info ---" + df -hT /mnt/data + echo "" + + echo "--- Mount type ---" + mount | grep /mnt/data || echo "(not visible in mount table -- may be host mount)" + echo "" + + # Install fio + apt-get update -qq && apt-get install -y -qq fio > /dev/null 2>&1 + echo "fio version: $(fio --version)" + echo "" + + TEST_DIR="/mnt/data/.osmo-bench-$$" + mkdir -p "${TEST_DIR}" + + # Sequential write test + echo "--- Sequential Write (direct I/O, 1M blocks, 8 jobs) ---" + fio --name=seq-write \ + --ioengine=libaio --direct=1 --time_based \ + --directory="${TEST_DIR}" \ + --rw=write --bs=1M --iodepth=32 \ + --thread --numjobs=8 --size=2G --runtime=30 \ + --group_reporting + echo "" + + # Sequential read test + echo "--- Sequential Read (direct I/O, 1M blocks, 8 jobs) ---" + fio --name=seq-read \ + --ioengine=libaio --direct=1 --time_based \ + --directory="${TEST_DIR}" \ + --rw=read --bs=1M --iodepth=32 \ + --thread --numjobs=8 --size=2G --runtime=30 \ + --group_reporting + echo "" + + # Random read/write (4K) for IOPS + echo "--- Random Read/Write 4K (IOPS test, 4 jobs) ---" + fio --name=rand-rw \ + --ioengine=libaio --direct=1 --time_based \ + --directory="${TEST_DIR}" \ + --rw=randrw --rwmixread=70 --bs=4k --iodepth=32 \ + --thread --numjobs=4 --size=1G --runtime=30 \ + --group_reporting + echo "" + + # Cleanup + rm -rf "${TEST_DIR}" + + echo "=== /mnt/data Filestore 
Test Complete ===" + resource: gpu-resource + volumeMounts: + - /mnt/data diff --git a/applications/osmo/workflows/osmo/test_multi_gpu.yaml b/applications/osmo/workflows/osmo/test_multi_gpu.yaml new file mode 100644 index 000000000..1cc7c9508 --- /dev/null +++ b/applications/osmo/workflows/osmo/test_multi_gpu.yaml @@ -0,0 +1,57 @@ +# Multi-GPU NCCL Bandwidth Test +# Requests 8 GPUs on a single H100 node and runs nccl-tests all_reduce_perf +# to validate GPU-to-GPU (NVLink/NVSwitch) communication. +# +# Uses CUDA 13.0 devel image (~4GB) to compile nccl-tests on the fly. +# +# Submit with: +# osmo workflow submit workflows/osmo/test_multi_gpu.yaml + +workflow: + name: test-multi-gpu + resources: + gpu-resource: + platform: gpu + gpu: 8 + cpu: 16 + memory: 64Gi + tasks: + - name: nccl-test + image: nvidia/cuda:13.0.2-devel-ubuntu24.04 + command: ["bash", "-c"] + args: + - | + echo "=== Multi-GPU NCCL Test (8x H100) ===" + echo "" + + echo "--- GPU topology ---" + nvidia-smi topo -m + echo "" + + echo "--- GPU summary ---" + nvidia-smi --query-gpu=index,name,memory.total --format=csv + GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) + echo "" + echo "GPUs visible: ${GPU_COUNT}" + if [ "$GPU_COUNT" -lt 8 ]; then + echo "FAIL: expected 8 GPUs, got ${GPU_COUNT}" + exit 1 + fi + echo "PASS: all 8 GPUs visible" + echo "" + + echo "--- Building nccl-tests ---" + apt-get update -qq && apt-get install -y -qq git build-essential &>/dev/null + cd /tmp + git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git &>/dev/null + cd nccl-tests + make MPI=0 -j$(nproc) &>/dev/null + echo "Built successfully" + echo "" + + echo "--- NCCL all_reduce bandwidth test ---" + ./build/all_reduce_perf -b 8M -e 1G -f 2 -g 8 + echo "" + + echo "=== Multi-GPU NCCL Test Complete ===" + resource: gpu-resource diff --git a/applications/osmo/workflows/osmo/test_shared_fs.yaml b/applications/osmo/workflows/osmo/test_shared_fs.yaml new file mode 100644 index 000000000..1de8fea84 
--- /dev/null +++ b/applications/osmo/workflows/osmo/test_shared_fs.yaml @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +# Shared Filesystem Test Workflow +# Validates that the Nebius Filestore mounted at /mnt/data is accessible from OSMO workflows. +# Requires: enable_filestore=true in terraform.tfvars and allowed_mounts configured on the gpu platform. 
+# +# Submit with: +# osmo workflow submit workflows/osmo/test_shared_fs.yaml + +workflow: + name: test-shared-fs + resources: + gpu-resource: + platform: gpu + gpu: 1 + cpu: 2 + memory: 2Gi + storage: 1Gi + tasks: + - name: test-filestore + image: ubuntu:24.04 + command: ["bash", "-c"] + args: + - | + echo "=== Shared Filesystem Test ===" + echo "" + echo "=== Mount info ===" + df -h /mnt/data + echo "" + echo "=== Writing test file ===" + echo "Hello from OSMO workflow at $(date -Iseconds)" > /mnt/data/osmo-test.txt + cat /mnt/data/osmo-test.txt + echo "" + echo "=== Directory listing ===" + ls -la /mnt/data/ + echo "" + echo "=== Shared Filesystem Test Complete ===" + resource: gpu-resource + volumeMounts: + - /mnt/data diff --git a/applications/osmo/workflows/osmo/test_shared_fs_cross_node.yaml b/applications/osmo/workflows/osmo/test_shared_fs_cross_node.yaml new file mode 100644 index 000000000..66b348a70 --- /dev/null +++ b/applications/osmo/workflows/osmo/test_shared_fs_cross_node.yaml @@ -0,0 +1,155 @@ +# Shared Filesystem Cross-Node Test +# Validates that /mnt/data is truly shared across different tasks and nodes. 
+# +# How it works: +# - 3 tasks run in parallel, each on a potentially different GPU node +# - Each writes a marker file with hostname + K8s node name to /mnt/data +# - After a sync delay, each reads ALL markers to verify cross-task visibility +# - The last task to finish cleans up the test directory +# +# Submit with: +# osmo workflow submit workflows/osmo/test_shared_fs_cross_node.yaml + +workflow: + name: test-shared-fs-cross-node + resources: + gpu-resource: + platform: gpu + gpu: 1 + cpu: 2 + memory: 2Gi + storage: 1Gi + tasks: + - name: node-a + image: ubuntu:24.04 + command: ["bash", "-c"] + args: + - | + set -e + TASK="node-a" + DIR="/mnt/data/.osmo-fs-test" + HOST=$(hostname) + K8S_NODE=$(cat /etc/hostname 2>/dev/null || echo "unknown") + + echo "=== Task ${TASK} ===" + echo "Pod hostname: ${HOST}" + echo "K8s node env: ${MY_NODE_NAME:-not-set}" + df -h /mnt/data + echo "" + + mkdir -p "${DIR}" + echo "${TASK}|${HOST}|$(date -Iseconds)" > "${DIR}/${TASK}.marker" + echo "Wrote: $(cat ${DIR}/${TASK}.marker)" + + # Wait for all tasks to write + echo "Waiting 30s for other tasks..." + sleep 30 + + echo "" + echo "=== All marker files ===" + FOUND=0 + for f in "${DIR}"/*.marker; do + [ -f "$f" ] || continue + FOUND=$((FOUND + 1)) + echo " $(basename $f): $(cat $f)" + done + echo "" + echo "Result: ${TASK} sees ${FOUND}/3 markers" + [ "${FOUND}" -ge 2 ] && echo "PASS" || echo "FAIL" + resource: gpu-resource + volumeMounts: + - /mnt/data + + - name: node-b + image: ubuntu:24.04 + command: ["bash", "-c"] + args: + - | + set -e + TASK="node-b" + DIR="/mnt/data/.osmo-fs-test" + HOST=$(hostname) + + echo "=== Task ${TASK} ===" + echo "Pod hostname: ${HOST}" + df -h /mnt/data + echo "" + + mkdir -p "${DIR}" + echo "${TASK}|${HOST}|$(date -Iseconds)" > "${DIR}/${TASK}.marker" + echo "Wrote: $(cat ${DIR}/${TASK}.marker)" + + echo "Waiting 30s for other tasks..." 
+ sleep 30 + + echo "" + echo "=== All marker files ===" + FOUND=0 + for f in "${DIR}"/*.marker; do + [ -f "$f" ] || continue + FOUND=$((FOUND + 1)) + echo " $(basename $f): $(cat $f)" + done + echo "" + echo "Result: ${TASK} sees ${FOUND}/3 markers" + [ "${FOUND}" -ge 2 ] && echo "PASS" || echo "FAIL" + resource: gpu-resource + volumeMounts: + - /mnt/data + + - name: node-c + image: ubuntu:24.04 + command: ["bash", "-c"] + args: + - | + set -e + TASK="node-c" + DIR="/mnt/data/.osmo-fs-test" + HOST=$(hostname) + + echo "=== Task ${TASK} ===" + echo "Pod hostname: ${HOST}" + df -h /mnt/data + echo "" + + mkdir -p "${DIR}" + echo "${TASK}|${HOST}|$(date -Iseconds)" > "${DIR}/${TASK}.marker" + echo "Wrote: $(cat ${DIR}/${TASK}.marker)" + + echo "Waiting 30s for other tasks..." + sleep 30 + + echo "" + echo "=== All marker files ===" + FOUND=0 + HOSTS="" + for f in "${DIR}"/*.marker; do + [ -f "$f" ] || continue + FOUND=$((FOUND + 1)) + CONTENT=$(cat "$f") + echo " $(basename $f): ${CONTENT}" + H=$(echo "$CONTENT" | cut -d'|' -f2) + HOSTS="${HOSTS} ${H}" + done + NUM_UNIQUE=$(echo ${HOSTS} | tr ' ' '\n' | sort -u | grep -c . 
|| true) + echo "" + echo "========================================" + echo "SUMMARY" + echo "========================================" + echo "Markers visible: ${FOUND}/3" + echo "Unique pods: ${NUM_UNIQUE}" + if [ "${FOUND}" -ge 3 ]; then + echo "STATUS: PASS - all tasks see all data" + elif [ "${FOUND}" -ge 2 ]; then + echo "STATUS: PASS - cross-task sharing works" + else + echo "STATUS: FAIL - data not shared" + fi + echo "========================================" + + # Cleanup (node-c runs last due to sleep timing) + sleep 5 + rm -rf "${DIR}" + resource: gpu-resource + volumeMounts: + - /mnt/data diff --git a/applications/osmo/workflows/osmo/test_shm.yaml b/applications/osmo/workflows/osmo/test_shm.yaml new file mode 100644 index 000000000..d5645ae58 --- /dev/null +++ b/applications/osmo/workflows/osmo/test_shm.yaml @@ -0,0 +1,75 @@ +# Shared Memory (/dev/shm) Test Workflow +# Validates that the shm pod template is applied correctly: +# - /dev/shm is mounted as tmpfs +# - Size is 64Gi (as configured in the shm pod template) +# - Measures sequential write/read bandwidth +# +# Requires: shm pod template configured, H100 platform with shm in override_pod_template +# +# Submit with: +# osmo workflow submit workflows/osmo/test_shm.yaml + +workflow: + name: test-shm + resources: + gpu-resource: + platform: H100 + gpu: 1 + cpu: 4 + memory: 8Gi + tasks: + - name: check-shm + image: ubuntu:24.04 + command: ["bash", "-c"] + args: + - | + echo "=== Shared Memory (/dev/shm) Test ===" + echo "" + + # Check mount + echo "--- Mount info ---" + if mount | grep -q "/dev/shm.*tmpfs"; then + echo "PASS: /dev/shm is mounted as tmpfs" + mount | grep "/dev/shm" + else + echo "FAIL: /dev/shm is NOT a tmpfs mount" + mount | grep shm || echo "(no shm mount found)" + exit 1 + fi + echo "" + + # Check size + echo "--- Size ---" + SHM_SIZE_KB=$(df -k /dev/shm | tail -1 | awk '{print $2}') + SHM_SIZE_GI=$((SHM_SIZE_KB / 1024 / 1024)) + echo "Total: ${SHM_SIZE_GI}Gi (${SHM_SIZE_KB} KB)" + 
if [ "$SHM_SIZE_GI" -ge 60 ]; then + echo "PASS: size >= 60Gi" + else + echo "FAIL: size ${SHM_SIZE_GI}Gi is less than expected 64Gi" + exit 1 + fi + echo "" + + # Bandwidth test - write + echo "--- Write bandwidth (1GB) ---" + WRITE_OUT=$(dd if=/dev/zero of=/dev/shm/bench.tmp bs=1M count=1024 conv=fdatasync 2>&1) + echo "$WRITE_OUT" + WRITE_SPEED=$(echo "$WRITE_OUT" | grep -oP '[\d.]+ [GM]B/s' | tail -1) + echo "Write speed: ${WRITE_SPEED:-see above}" + echo "" + + # Bandwidth test - read + echo "--- Read bandwidth (1GB) ---" + # Drop caches not possible without privileges, but tmpfs reads are from RAM anyway + READ_OUT=$(dd if=/dev/shm/bench.tmp of=/dev/null bs=1M count=1024 2>&1) + echo "$READ_OUT" + READ_SPEED=$(echo "$READ_OUT" | grep -oP '[\d.]+ [GM]B/s' | tail -1) + echo "Read speed: ${READ_SPEED:-see above}" + echo "" + + # Cleanup + rm -f /dev/shm/bench.tmp + + echo "=== Shared Memory Test Complete ===" + resource: gpu-resource diff --git a/modules/gpu-operator-custom/helm.tf b/modules/gpu-operator-custom/helm.tf old mode 100644 new mode 100755 diff --git a/modules/gpu-operator-custom/variables.tf b/modules/gpu-operator-custom/variables.tf old mode 100644 new mode 100755 diff --git a/modules/nims/bionemo.tf b/modules/nims/bionemo.tf old mode 100644 new mode 100755 diff --git a/modules/nims/provider.tf b/modules/nims/provider.tf old mode 100644 new mode 100755 diff --git a/modules/nims/qwen3-next-80b-a3b-instruct.tf b/modules/nims/qwen3-next-80b-a3b-instruct.tf old mode 100644 new mode 100755 diff --git a/modules/nims/variables.tf b/modules/nims/variables.tf old mode 100644 new mode 100755 index c968bdbce..629dcc330 --- a/modules/nims/variables.tf +++ b/modules/nims/variables.tf @@ -7,6 +7,7 @@ variable "parent_id" { variable "ngc_key" { description = "API key from Nvidia GPU cloud: catalog.ngc.nvidia.com" type = string + default = "" } variable "openfold3" {